Diffstat (limited to 'eigen/Eigen/src/Core/arch')
30 files changed, 9895 insertions, 388 deletions
diff --git a/eigen/Eigen/src/Core/arch/AVX/Complex.h b/eigen/Eigen/src/Core/arch/AVX/Complex.h new file mode 100644 index 0000000..99439c8 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/AVX/Complex.h @@ -0,0 +1,483 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_AVX_H +#define EIGEN_COMPLEX_AVX_H + +namespace Eigen { + +namespace internal { + +//---------- float ---------- +struct Packet4cf +{ + EIGEN_STRONG_INLINE Packet4cf() {} + EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {} + __m256 v; +}; + +template<> struct packet_traits<std::complex<float> > : default_packet_traits +{ + typedef Packet4cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; + +template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) +{ + return Packet4cf(pnegate(a.v)); +} +template<> EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) +{ + const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); + return Packet4cf(_mm256_xor_ps(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, const Packet4cf& b) +{ + __m256 tmp1 = _mm256_mul_ps(_mm256_moveldup_ps(a.v), b.v); + __m256 tmp2 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); + __m256 result = _mm256_addsub_ps(tmp1, tmp2); + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pand <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf por <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pxor <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); } +template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); } + + +template<> EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from) +{ + return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const 
void*)&from))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) +{ + // FIXME The following might be optimized using _mm256_movedup_pd + Packet2cf a = ploaddup<Packet2cf>(from); + Packet2cf b = ploaddup<Packet2cf>(from+1); + return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1)); +} + +template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from, Index stride) +{ + return Packet4cf(_mm256_set_ps(std::imag(from[3*stride]), std::real(from[3*stride]), + std::imag(from[2*stride]), std::real(from[2*stride]), + std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index stride) +{ + __m128 low = _mm256_extractf128_ps(from.v, 0); + to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1))); + to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3))); + + __m128 high = _mm256_extractf128_ps(from.v, 1); + to[stride*2] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1))); + to[stride*3] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3))); + +} + +template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a) +{ + return pfirst(Packet2cf(_mm256_castps256_ps128(a.v))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) { + __m128 low = _mm256_extractf128_ps(a.v, 0); + __m128 high = _mm256_extractf128_ps(a.v, 1); + __m128d lowd = _mm_castps_pd(low); + __m128d highd = _mm_castps_pd(high); + low = _mm_castpd_ps(_mm_shuffle_pd(lowd,lowd,0x1)); + high = _mm_castpd_ps(_mm_shuffle_pd(highd,highd,0x1)); + __m256 result = _mm256_setzero_ps(); + result = _mm256_insertf128_ps(result, low, 1); + result = _mm256_insertf128_ps(result, high, 0); + return Packet4cf(result); +} + +template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a) +{ + return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v,0)), + Packet2cf(_mm256_extractf128_ps(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs) +{ + Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t0 = _mm256_hadd_ps(t0,t1); + Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t2 = _mm256_hadd_ps(t2,t3); + + t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); + t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); + + return Packet4cf(_mm256_add_ps(t1,t3)); +} + +template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a) +{ + return 
predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), + Packet2cf(_mm256_extractf128_ps(a.v, 1)))); +} + +template<int Offset> +struct palign_impl<Offset,Packet4cf> +{ + static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) + { + if (Offset==0) return; + palign_impl<Offset*2,Packet8f>::run(first.v, second.v); + } +}; + +template<> struct conj_helper<Packet4cf, Packet4cf, false,true> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet4cf, Packet4cf, true,false> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet4cf, Packet4cf, true,true> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper<Packet8f, Packet4cf, false,false> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const + { return Packet4cf(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper<Packet4cf, Packet8f, false,false> +{ + EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const + { return Packet4cf(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b) +{ + Packet4cf num = pmul(a, pconj(b)); + __m256 tmp = _mm256_mul_ps(b.v, b.v); + __m256 tmp2 = _mm256_shuffle_ps(tmp,tmp,0xB1); + __m256 denom = _mm256_add_ps(tmp, tmp2); + return Packet4cf(_mm256_div_ps(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x) +{ + return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); +} + +//---------- double ---------- +struct Packet2cd +{ + EIGEN_STRONG_INLINE Packet2cd() {} + EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {} + __m256d v; +}; + +template<> struct packet_traits<std::complex<double> > : default_packet_traits +{ + typedef Packet2cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 2, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } 
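Editorial aside, not part of the committed diff: the complex pmul/pdiv kernels in this file evaluate, per lane, a*b = (ar*br - ai*bi, ar*bi + ai*br) and a/b = a*conj(b) / |b|^2. A minimal scalar sketch of that arithmetic (assuming only <complex>; the ref_* names are chosen here for illustration) can serve as a reference when auditing the shuffle/addsub sequences:

#include <complex>

// Scalar reference for the vectorized kernels: pmul forms the real/imaginary
// cross products and combines them with addsub; pdiv multiplies by conj(b)
// and divides both components by |b|^2 (no guard against overflow in |b|^2).
static std::complex<float> ref_cmul(std::complex<float> a, std::complex<float> b) {
  return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(),
                             a.real()*b.imag() + a.imag()*b.real());
}
static std::complex<float> ref_cdiv(std::complex<float> a, std::complex<float> b) {
  const float denom = b.real()*b.real() + b.imag()*b.imag();   // |b|^2
  const std::complex<float> num = ref_cmul(a, std::conj(b));   // a * conj(b)
  return std::complex<float>(num.real()/denom, num.imag()/denom);
}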
+template<> EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) { return Packet2cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) +{ + const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); + return Packet2cd(_mm256_xor_pd(a.v,mask)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, const Packet2cd& b) +{ + __m256d tmp1 = _mm256_shuffle_pd(a.v,a.v,0x0); + __m256d even = _mm256_mul_pd(tmp1, b.v); + __m256d tmp2 = _mm256_shuffle_pd(a.v,a.v,0xF); + __m256d tmp3 = _mm256_shuffle_pd(b.v,b.v,0x5); + __m256d odd = _mm256_mul_pd(tmp2, tmp3); + return Packet2cd(_mm256_addsub_pd(even, odd)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pand <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd por <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pxor <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from) +{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(ploadu<Packet4d>((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from) +{ + // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though) +// return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from)); + return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) { return pset1<Packet2cd>(*from); } + +template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet2cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from, Index stride) +{ + return Packet2cd(_mm256_set_pd(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from, Index stride) +{ + __m128d low = _mm256_extractf128_pd(from.v, 0); + to[stride*0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1))); + __m128d high = _mm256_extractf128_pd(from.v, 1); + to[stride*1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1))); +} + +template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a) +{ + __m128d low = _mm256_extractf128_pd(a.v, 0); + EIGEN_ALIGN16 double res[2]; + _mm_store_pd(res, low); + return std::complex<double>(res[0],res[1]); +} + 
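Editorial aside, not part of the committed diff: the pgather/pscatter specializations above implement a strided access contract in which lane i of a packet corresponds to element from[i*stride] (and symmetrically for stores). A small scalar model of that contract, with hypothetical names; here Scalar would be std::complex<float> with N = 4 (Packet4cf) or std::complex<double> with N = 2 (Packet2cd):

#include <cstddef>

// Semantics only, not the SIMD implementation: gather fills lane i from
// from[i*stride]; scatter writes lane i back to to[i*stride].
template <typename Scalar, int N>
void ref_gather(Scalar (&lanes)[N], const Scalar* from, std::ptrdiff_t stride) {
  for (int i = 0; i < N; ++i) lanes[i] = from[i * stride];
}
template <typename Scalar, int N>
void ref_scatter(Scalar* to, const Scalar (&lanes)[N], std::ptrdiff_t stride) {
  for (int i = 0; i < N; ++i) to[i * stride] = lanes[i];
}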
+template<> EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) { + __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1); + return Packet2cd(result); +} + +template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a) +{ + return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v,0)), + Packet1cd(_mm256_extractf128_pd(a.v,1)))); +} + +template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs) +{ + Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4)); + Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4)); + + return Packet2cd(_mm256_add_pd(t0,t1)); +} + +template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a) +{ + return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)), + Packet1cd(_mm256_extractf128_pd(a.v,1)))); +} + +template<int Offset> +struct palign_impl<Offset,Packet2cd> +{ + static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) + { + if (Offset==0) return; + palign_impl<Offset*2,Packet4d>::run(first.v, second.v); + } +}; + +template<> struct conj_helper<Packet2cd, Packet2cd, false,true> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet2cd, Packet2cd, true,false> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet2cd, Packet2cd, true,true> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper<Packet4d, Packet2cd, false,false> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const + { return Packet2cd(Eigen::internal::pmul(x, y.v)); } +}; + +template<> struct conj_helper<Packet2cd, Packet4d, false,false> +{ + EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const + { return Packet2cd(Eigen::internal::pmul(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b) +{ + Packet2cd num = pmul(a, pconj(b)); + __m256d tmp = _mm256_mul_pd(b.v, b.v); + __m256d denom = _mm256_hadd_pd(tmp, tmp); + return Packet2cd(_mm256_div_pd(num.v, denom)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x) +{ + return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4cf,4>& kernel) { + __m256d P0 = _mm256_castps_pd(kernel.packet[0].v); + __m256d P1 = _mm256_castps_pd(kernel.packet[1].v); + __m256d P2 = _mm256_castps_pd(kernel.packet[2].v); + __m256d P3 = _mm256_castps_pd(kernel.packet[3].v); + + __m256d T0 = 
_mm256_shuffle_pd(P0, P1, 15); + __m256d T1 = _mm256_shuffle_pd(P0, P1, 0); + __m256d T2 = _mm256_shuffle_pd(P2, P3, 15); + __m256d T3 = _mm256_shuffle_pd(P2, P3, 0); + + kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32)); + kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49)); + kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32)); + kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2cd,2>& kernel) { + __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0+(2<<4)); + kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1+(3<<4)); + kernel.packet[0].v = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b) +{ + return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b) +{ + return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b) +{ + return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6))); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b) +{ + return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2))); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_AVX_H diff --git a/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h b/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h new file mode 100644 index 0000000..6af67ce --- /dev/null +++ b/eigen/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -0,0 +1,439 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_AVX_H +#define EIGEN_MATH_FUNCTIONS_AVX_H + +/* The sin, cos, exp, and log functions of this file are loosely derived from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +namespace Eigen { + +namespace internal { + +inline Packet8i pshiftleft(Packet8i v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi32(v, n); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +inline Packet8f pshiftright(Packet8f v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); + return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); +#endif +} + +// Sine function +// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and +// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. 
The interpolants +// are (anti-)symmetric and thus have only odd/even coefficients +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +psin<Packet8f>(const Packet8f& _x) { + Packet8f x = _x; + + // Some useful values. + _EIGEN_DECLARE_CONST_Packet8i(one, 1); + _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f); + _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f); + _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f); + _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f); + + // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period. + Packet8f z = pmul(x, p8f_one_over_pi); + Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four)); + x = pmadd(shift, p8f_neg_pi_first, x); + x = pmadd(shift, p8f_neg_pi_second, x); + x = pmadd(shift, p8f_neg_pi_third, x); + z = pmul(x, p8f_four_over_pi); + + // Make a mask for the entries that need flipping, i.e. wherever the shift + // is odd. + Packet8i shift_ints = _mm256_cvtps_epi32(shift); + Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); + Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31); + + // Create a mask for which interpolant to use, i.e. if z > 1, then the mask + // is set to ones for that entry. + Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ); + + // Evaluate the polynomial for the interval [1,3] in z. + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f); + Packet8f z_minus_two = psub(z, p8f_two); + Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two); + Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4); + right = pmadd(right, z_minus_two2, p8f_coeff_right_2); + right = pmadd(right, z_minus_two2, p8f_coeff_right_0); + + // Evaluate the polynomial for the interval [-1,1] in z. + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f); + Packet8f z2 = pmul(z, z); + Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5); + left = pmadd(left, z2, p8f_coeff_left_3); + left = pmadd(left, z2, p8f_coeff_left_1); + left = pmul(left, z); + + // Assemble the results, i.e. select the left and right polynomials. + left = _mm256_andnot_ps(ival_mask, left); + right = _mm256_and_ps(ival_mask, right); + Packet8f res = _mm256_or_ps(left, right); + + // Flip the sign on the odd intervals and return the result. + res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask)); + return res; +} + +// Natural logarithm +// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) +// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can +// be easily approximated by a polynomial centered on m=1 for stability. +// TODO(gonnet): Further reduce the interval allowing for lower-degree +// polynomial interpolants -> ... 
-> profit! +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +plog<Packet8f>(const Packet8f& _x) { + Packet8f x = _x; + _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f); + + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000); + + // The smallest non denormalized float number. + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000); + + // Polynomial coefficients. + _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f); + + Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN + Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ); + + // Truncate input values to the minimum positive normal. + x = pmax(x, p8f_min_norm_pos); + + Packet8f emm0 = pshiftright(x,23); + Packet8f e = _mm256_sub_ps(emm0, p8f_126f); + + // Set the exponents to -1, i.e. x are in the range [0.5,1). + x = _mm256_and_ps(x, p8f_inv_mant_mask); + x = _mm256_or_ps(x, p8f_half); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ); + Packet8f tmp = _mm256_and_ps(x, mask); + x = psub(x, p8f_1); + e = psub(e, _mm256_and_ps(p8f_1, mask)); + x = padd(x, tmp); + + Packet8f x2 = pmul(x, x); + Packet8f x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. + Packet8f y, y1, y2; + y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1); + y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4); + y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7); + y = pmadd(y, x, p8f_cephes_log_p2); + y1 = pmadd(y1, x, p8f_cephes_log_p5); + y2 = pmadd(y2, x, p8f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + // Add the logarithm of the exponent back to the result of the interpolation. + y1 = pmul(e, p8f_cephes_log_q1); + tmp = pmul(x2, p8f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p8f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + + // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. + return _mm256_or_ps( + _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)), + _mm256_and_ps(iszero_mask, p8f_minus_inf)); +} + +// Exponential function. Works by writing "x = m*log(2) + r" where +// "m = floor(x/log(2)+1/2)" and "r" is the remainder. 
The result is then +// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +pexp<Packet8f>(const Packet8f& _x) { + _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f); + + _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f); + + _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f); + + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f); + + // Clamp x. + Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo); + + // Express exp(x) as exp(m*ln(2) + r), start by extracting + // m = floor(x/ln(2) + 0.5). + Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half)); + +// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is +// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating +// truncation errors. Note that we don't use the "pmadd" function here to +// ensure that a precision-preserving FMA instruction is used. +#ifdef EIGEN_VECTORIZE_FMA + _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f); + Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x); +#else + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f); + Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1)); + r = psub(r, pmul(m, p8f_cephes_exp_C2)); +#endif + + Packet8f r2 = pmul(r, r); + + // TODO(gonnet): Split into odd/even polynomials and try to exploit + // instruction-level parallelism. + Packet8f y = p8f_cephes_exp_p0; + y = pmadd(y, r, p8f_cephes_exp_p1); + y = pmadd(y, r, p8f_cephes_exp_p2); + y = pmadd(y, r, p8f_cephes_exp_p3); + y = pmadd(y, r, p8f_cephes_exp_p4); + y = pmadd(y, r, p8f_cephes_exp_p5); + y = pmadd(y, r2, r); + y = padd(y, p8f_1); + + // Build emm0 = 2^m. + Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); + emm0 = pshiftleft(emm0, 23); + + // Return 2^m * exp(r). + return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); +} + +// Hyperbolic Tangent function. 
+template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +ptanh<Packet8f>(const Packet8f& x) { + return internal::generic_fast_tanh_float(x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d +pexp<Packet4d>(const Packet4d& _x) { + Packet4d x = _x; + + _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); + _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); + _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); + + _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); + _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); + + Packet4d tmp, fx; + + // clamp x + x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); + // Express exp(x) as exp(g + n*log(2)). + fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); + + // Get the integer modulus of log(2), i.e. the "n" described above. + fx = _mm256_floor_pd(fx); + + // Get the remainder modulo log(2), i.e. the "g" described above. Subtract + // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last + // digits right. + tmp = pmul(fx, p4d_cephes_exp_C1); + Packet4d z = pmul(fx, p4d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet4d x2 = pmul(x, x); + + // Evaluate the numerator polynomial of the rational interpolant. + Packet4d px = p4d_cephes_exp_p0; + px = pmadd(px, x2, p4d_cephes_exp_p1); + px = pmadd(px, x2, p4d_cephes_exp_p2); + px = pmul(px, x); + + // Evaluate the denominator polynomial of the rational interpolant. + Packet4d qx = p4d_cephes_exp_q0; + qx = pmadd(qx, x2, p4d_cephes_exp_q1); + qx = pmadd(qx, x2, p4d_cephes_exp_q2); + qx = pmadd(qx, x2, p4d_cephes_exp_q3); + + // I don't really get this bit, copied from the SSE2 routines, so... + // TODO(gonnet): Figure out what is going on here, perhaps find a better + // rational interpolant? + x = _mm256_div_pd(px, psub(qx, px)); + x = pmadd(p4d_2, x, p4d_1); + + // Build e=2^n by constructing the exponents in a 128-bit vector and + // shifting them to where they belong in double-precision values. + __m128i emm0 = _mm256_cvtpd_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_1023); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i lo = _mm_slli_epi64(emm0, 52); + __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); + __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); + e = _mm256_insertf128_si256(e, hi, 1); + + // Construct the result 2^n * exp(g) = e * x. The max is used to catch + // non-finite values in the input. + return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); +} + +// Functions for sqrt. +// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step +// of Newton's method, at a cost of 1-2 bits of precision as opposed to the +// exact solution. It does not handle +inf, or denormalized numbers correctly. 
+// The main advantage of this approach is not just speed, but also the fact that +// it can be inlined and pipelined with other computations, further reducing its +// effective latency. This is similar to Quake3's fast inverse square root. +// For detail see here: http://www.beyond3d.com/content/articles/8/ +#if EIGEN_FAST_MATH +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f +psqrt<Packet8f>(const Packet8f& _x) { + Packet8f half = pmul(_x, pset1<Packet8f>(.5f)); + Packet8f denormal_mask = _mm256_and_ps( + _mm256_cmp_ps(_x, pset1<Packet8f>((std::numeric_limits<float>::min)()), + _CMP_LT_OQ), + _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_GE_OQ)); + + // Compute approximate reciprocal sqrt. + Packet8f x = _mm256_rsqrt_ps(_x); + // Do a single step of Newton's iteration. + x = pmul(x, psub(pset1<Packet8f>(1.5f), pmul(half, pmul(x,x)))); + // Flush results for denormals to zero. + return _mm256_andnot_ps(denormal_mask, pmul(_x,x)); +} +#else +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f psqrt<Packet8f>(const Packet8f& x) { + return _mm256_sqrt_ps(x); +} +#endif +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4d psqrt<Packet4d>(const Packet4d& x) { + return _mm256_sqrt_pd(x); +} +#if EIGEN_FAST_MATH + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f prsqrt<Packet8f>(const Packet8f& _x) { + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); + + Packet8f neg_half = pmul(_x, p8f_minus_half); + + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); + Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x)); + + // Fill in NaNs and Infs for the negative/zero entries. + Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ); + Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask); + Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan), + _mm256_and_ps(zero_mask, p8f_inf)); + + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); + + // Insert NaNs and Infs in all the right places. + return _mm256_or_ps(x, infs_and_nans); +} + +#else +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet8f prsqrt<Packet8f>(const Packet8f& x) { + _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); + return _mm256_div_ps(p8f_one, _mm256_sqrt_ps(x)); +} +#endif + +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4d prsqrt<Packet4d>(const Packet4d& x) { + _EIGEN_DECLARE_CONST_Packet4d(one, 1.0); + return _mm256_div_pd(p4d_one, _mm256_sqrt_pd(x)); +} + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_AVX_H diff --git a/eigen/Eigen/src/Core/arch/AVX/PacketMath.h b/eigen/Eigen/src/Core/arch/AVX/PacketMath.h new file mode 100644 index 0000000..6362309 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/AVX/PacketMath.h @@ -0,0 +1,643 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_AVX_H +#define EIGEN_PACKET_MATH_AVX_H + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#endif + +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif +#endif + +typedef __m256 Packet8f; +typedef __m256i Packet8i; +typedef __m256d Packet4d; + +template<> struct is_arithmetic<__m256> { enum { value = true }; }; +template<> struct is_arithmetic<__m256i> { enum { value = true }; }; +template<> struct is_arithmetic<__m256d> { enum { value = true }; }; + +#define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ + const Packet8f p8f_##NAME = pset1<Packet8f>(X) + +#define _EIGEN_DECLARE_CONST_Packet4d(NAME,X) \ + const Packet4d p4d_##NAME = pset1<Packet4d>(X) + +#define _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(NAME,X) \ + const Packet8f p8f_##NAME = _mm256_castsi256_ps(pset1<Packet8i>(X)) + +#define _EIGEN_DECLARE_CONST_Packet8i(NAME,X) \ + const Packet8i p8i_##NAME = pset1<Packet8i>(X) + +// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going +// to leverage AVX512 instructions. +#ifndef EIGEN_VECTORIZE_AVX512 +template<> struct packet_traits<float> : default_packet_traits +{ + typedef Packet8f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8, + HasHalfPacket = 1, + + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasBlend = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1 + }; +}; +template<> struct packet_traits<double> : default_packet_traits +{ + typedef Packet4d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 1, + + HasDiv = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasBlend = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1 + }; +}; +#endif + +template<> struct scalar_div_cost<float,true> { enum { value = 14 }; }; +template<> struct scalar_div_cost<double,true> { enum { value = 16 }; }; + +/* Proper support for integers is only provided by AVX2. In the meantime, we'll + use SSE instructions and packets to deal with integers. 
+template<> struct packet_traits<int> : default_packet_traits +{ + typedef Packet8i type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8 + }; +}; +*/ + +template<> struct unpacket_traits<Packet8f> { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; }; +template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; }; +template<> struct unpacket_traits<Packet8i> { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; + +template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) { return _mm256_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) { return _mm256_set1_epi32(from); } + +template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) { return _mm256_broadcast_ss(from); } +template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); } + +template<> EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) { return _mm256_add_ps(_mm256_set1_ps(a), _mm256_set_ps(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)); } +template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { return _mm256_add_pd(_mm256_set1_pd(a), _mm256_set_pd(3.0,2.0,1.0,0.0)); } + +template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) +{ + return _mm256_sub_ps(_mm256_set1_ps(0.0),a); +} +template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) +{ + return _mm256_sub_pd(_mm256_set1_pd(0.0),a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_mul_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_mul_pd(a,b); } + + +template<> EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_div_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_div_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, const Packet8i& /*b*/) +{ eigen_assert(false && "packet integer division are not supported by AVX"); + return pset1<Packet8i>(0); +} + +#ifdef __FMA__ +template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { +#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) + // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, + // and gcc stupidly generates a vfmadd132ps instruction, + // so let's enforce it to generate a vfmadd231ps 
instruction since the most common use case is to accumulate + // the result of the product. + Packet8f res = c; + __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + return _mm256_fmadd_ps(a,b,c); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { +#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) + // see above + Packet4d res = c; + __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + return _mm256_fmadd_pd(a,b,c); +#endif +} +#endif + +template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { + // Arguments are swapped to match NaN propagation behavior of std::min. + return _mm256_min_ps(b,a); +} +template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { + // Arguments are swapped to match NaN propagation behavior of std::min. + return _mm256_min_pd(b,a); +} +template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { + // Arguments are swapped to match NaN propagation behavior of std::max. + return _mm256_max_ps(b,a); +} +template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { + // Arguments are swapped to match NaN propagation behavior of std::max. + return _mm256_max_pd(b,a); +} +template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } + +template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); } +template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); } + +template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); } +template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); } + +template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } + +template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return 
_mm256_load_si256(reinterpret_cast<const __m256i*>(from)); } + +template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } +template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); } + +// Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} +template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) +{ + // TODO try to find a way to avoid the need of a temporary register +// Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); +// tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); +// return _mm256_unpacklo_ps(tmp,tmp); + + // _mm256_insertf128_ps is very slow on Haswell, thus: + Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); + // mimic an "inplace" permutation of the lower 128bits using a blend + tmp = _mm256_blend_ps(tmp,_mm256_castps128_ps256(_mm_permute_ps( _mm256_castps256_ps128(tmp), _MM_SHUFFLE(1,0,1,0))), 15); + // then we can perform a consistent permutation on the global register to get everything in shape: + return _mm256_permute_ps(tmp, _MM_SHUFFLE(3,3,2,2)); +} +// Loads 2 doubles from memory a returns the packet {a0, a0 a1, a1} +template<> EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) +{ + Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from); + return _mm256_permute_pd(tmp, 3<<2); +} + +// Loads 2 floats from memory a returns the packet {a0, a0 a0, a0, a1, a1, a1, a1} +template<> EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) +{ + Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from)); + return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from+1), 1); +} + +template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } + +template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } + +// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available +// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); +template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride) +{ + return _mm256_set_ps(from[7*stride], from[6*stride], from[5*stride], from[4*stride], + from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride) +{ + return 
_mm256_set_pd(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) +{ + __m128 low = _mm256_extractf128_ps(from, 0); + to[stride*0] = _mm_cvtss_f32(low); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)); + + __m128 high = _mm256_extractf128_ps(from, 1); + to[stride*4] = _mm_cvtss_f32(high); + to[stride*5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)); + to[stride*6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)); + to[stride*7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride) +{ + __m128d low = _mm256_extractf128_pd(from, 0); + to[stride*0] = _mm_cvtsd_f64(low); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)); + __m128d high = _mm256_extractf128_pd(from, 1); + to[stride*2] = _mm_cvtsd_f64(high); + to[stride*3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)); +} + +template<> EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) +{ + Packet8f pa = pset1<Packet8f>(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) +{ + Packet4d pa = pset1<Packet4d>(a); + pstore(to, pa); +} +template<> EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) +{ + Packet8i pa = pset1<Packet8i>(a); + pstore(to, pa); +} + +#ifndef EIGEN_VECTORIZE_AVX512 +template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +#endif + +template<> EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) { + return _mm_cvtss_f32(_mm256_castps256_ps128(a)); +} +template<> EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) { + return _mm_cvtsd_f64(_mm256_castpd256_pd128(a)); +} +template<> EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) { + return _mm_cvtsi128_si32(_mm256_castsi256_si128(a)); +} + + +template<> EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) +{ + __m256 tmp = _mm256_shuffle_ps(a,a,0x1b); + return _mm256_permute2f128_ps(tmp, tmp, 1); +} +template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) +{ + __m256d tmp = _mm256_shuffle_pd(a,a,5); + return _mm256_permute2f128_pd(tmp, tmp, 1); + + __m256d swap_halves = _mm256_permute2f128_pd(a,a,1); + return _mm256_permute_pd(swap_halves,5); +} + +// pabs should be ok +template<> EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) +{ + const Packet8f mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); + return _mm256_and_ps(a,mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) +{ + const Packet4d mask = _mm256_castsi256_pd(_mm256_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF)); + return _mm256_and_pd(a,mask); +} + +// preduxp should be ok +// FIXME: why is this ok? why isn't the simply implementation working as expected? 
+template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs) +{ + __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); + __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); + __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); + __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); + + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + return final; +} +template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs) +{ + Packet4d tmp0, tmp1; + + tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + return _mm256_blend_pd(tmp0, tmp1, 0xC); +} + +template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) +{ + return predux(Packet4f(_mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)))); +} +template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a) +{ + return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); +} + +template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a) +{ + return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); +} + +template<> EIGEN_STRONG_INLINE float predux_mul<Packet8f>(const Packet8f& a) +{ + Packet8f tmp; + tmp = _mm256_mul_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_mul_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_mul<Packet4d>(const Packet4d& a) +{ + Packet4d tmp; + tmp = _mm256_mul_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_mul_pd(tmp, _mm256_shuffle_pd(tmp,tmp,1))); +} + +template<> EIGEN_STRONG_INLINE float predux_min<Packet8f>(const Packet8f& a) +{ + Packet8f tmp = _mm256_min_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_min_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} +template<> EIGEN_STRONG_INLINE double predux_min<Packet4d>(const Packet4d& a) +{ + Packet4d tmp = _mm256_min_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_min_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); +} + +template<> EIGEN_STRONG_INLINE float predux_max<Packet8f>(const Packet8f& a) +{ + Packet8f tmp = _mm256_max_ps(a, _mm256_permute2f128_ps(a,a,1)); + tmp = _mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,0,3,2))); + return pfirst(_mm256_max_ps(tmp, _mm256_shuffle_ps(tmp,tmp,1))); +} + +template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a) +{ + Packet4d tmp = _mm256_max_pd(a, _mm256_permute2f128_pd(a,a,1)); + return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); +} + 
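Editorial aside, not part of the committed diff: the reduction kernels above share one contract: predux sums every lane of a packet, predux_mul/min/max fold with the corresponding operation, and preduxp takes an array of N packets and returns a packet whose lane i is the full reduction of vecs[i]. A scalar sketch of the summing variants (illustrative names; N is 8 for Packet8f, 4 for Packet4d):

// Lane-level model of predux / preduxp; the SIMD code above reaches the same
// result with hadd/permute/blend sequences instead of loops.
template <typename Scalar, int N>
Scalar ref_predux(const Scalar (&lanes)[N]) {
  Scalar acc = lanes[0];
  for (int i = 1; i < N; ++i) acc += lanes[i];
  return acc;
}
template <typename Scalar, int N>
void ref_preduxp(Scalar (&out)[N], const Scalar (&vecs)[N][N]) {
  for (int i = 0; i < N; ++i) out[i] = ref_predux(vecs[i]);   // sum of vecs[i]'s lanes
}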
+ +template<int Offset> +struct palign_impl<Offset,Packet8f> +{ + static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) + { + if (Offset==1) + { + first = _mm256_blend_ps(first, second, 1); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_blend_ps(tmp1, tmp2, 0x88); + } + else if (Offset==2) + { + first = _mm256_blend_ps(first, second, 3); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_blend_ps(tmp1, tmp2, 0xcc); + } + else if (Offset==3) + { + first = _mm256_blend_ps(first, second, 7); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_blend_ps(tmp1, tmp2, 0xee); + } + else if (Offset==4) + { + first = _mm256_blend_ps(first, second, 15); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); + } + else if (Offset==5) + { + first = _mm256_blend_ps(first, second, 31); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0x88); + } + else if (Offset==6) + { + first = _mm256_blend_ps(first, second, 63); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xcc); + } + else if (Offset==7) + { + first = _mm256_blend_ps(first, second, 127); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xee); + } + } +}; + +template<int Offset> +struct palign_impl<Offset,Packet4d> +{ + static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) + { + if (Offset==1) + { + first = _mm256_blend_pd(first, second, 1); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 0xA); + } + else if (Offset==2) + { + first = _mm256_blend_pd(first, second, 3); + first = _mm256_permute2f128_pd(first, first, 1); + } + else if (Offset==3) + { + first = _mm256_blend_pd(first, second, 7); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 5); + } + } +}; + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet8f,8>& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]); + __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]); + __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]); + __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]); + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = 
_mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + __m256 S4 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(1,0,1,0)); + __m256 S5 = _mm256_shuffle_ps(T4,T6,_MM_SHUFFLE(3,2,3,2)); + __m256 S6 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(1,0,1,0)); + __m256 S7 = _mm256_shuffle_ps(T5,T7,_MM_SHUFFLE(3,2,3,2)); + kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20); + kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20); + kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31); + kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31); + kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31); + kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet8f,4>& kernel) { + __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + + __m256 S0 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(1,0,1,0)); + __m256 S1 = _mm256_shuffle_ps(T0,T2,_MM_SHUFFLE(3,2,3,2)); + __m256 S2 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(1,0,1,0)); + __m256 S3 = _mm256_shuffle_ps(T1,T3,_MM_SHUFFLE(3,2,3,2)); + + kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20); + kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20); + kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31); + kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4d,4>& kernel) { + __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15); + __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); + __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15); + __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); + + kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32); + kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49); + kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32); + kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49); +} + +template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) { + const __m256 zero = _mm256_setzero_ps(); + const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_ps(thenPacket, elsePacket, false_mask); +} +template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) { + const __m256d zero = _mm256_setzero_pd(); + const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ); + return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); +} + +template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) +{ + return _mm256_blend_ps(a,pset1<Packet8f>(b),1); +} + +template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b) +{ + return _mm256_blend_pd(a,pset1<Packet4d>(b),1); +} + +template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) +{ + return 
_mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7)); +} + +template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) +{ + return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3)); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_AVX_H diff --git a/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h b/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h new file mode 100644 index 0000000..83bfdc6 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -0,0 +1,51 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_AVX_H +#define EIGEN_TYPE_CASTING_AVX_H + +namespace Eigen { + +namespace internal { + +// For now we use SSE to handle integers, so we can't use AVX instructions to cast +// from int to float +template <> +struct type_casting_traits<float, int> { + enum { + VectorizedCast = 0, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template <> +struct type_casting_traits<int, float> { + enum { + VectorizedCast = 0, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + + + +template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) { + return _mm256_cvtps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) { + return _mm256_cvtepi32_ps(a); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_AVX_H diff --git a/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h b/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h new file mode 100644 index 0000000..399be0e --- /dev/null +++ b/eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -0,0 +1,396 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ +#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ + +namespace Eigen { + +namespace internal { + +// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. +#if EIGEN_GNUC_AT_LEAST(5, 3) + +#define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ + const Packet16f p16f_##NAME = pset1<Packet16f>(X) + +#define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \ + const Packet16f p16f_##NAME = (__m512)pset1<Packet16i>(X) + +#define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \ + const Packet8d p8d_##NAME = pset1<Packet8d>(X) + +#define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ + const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) + +// Natural logarithm +// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) +// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can +// be easily approximated by a polynomial centered on m=1 for stability. 
+#if defined(EIGEN_VECTORIZE_AVX512DQ) +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +plog<Packet16f>(const Packet16f& _x) { + Packet16f x = _x; + _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet16f(126f, 126.0f); + + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inv_mant_mask, ~0x7f800000); + + // The smallest non denormalized float number. + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); + + // Polynomial coefficients. + _EIGEN_DECLARE_CONST_Packet16f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p1, -1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p3, -1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p4, +1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p5, -1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p6, +2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p7, -2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_p8, +3.3333331174E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f); + + // invalid_mask is set to true when x is NaN + __mmask16 invalid_mask = + _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); + __mmask16 iszero_mask = + _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ); + + // Truncate input values to the minimum positive normal. + x = pmax(x, p16f_min_norm_pos); + + // Extract the shifted exponents. + Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23)); + Packet16f e = _mm512_sub_ps(emm0, p16f_126f); + + // Set the exponents to -1, i.e. x are in the range [0.5,1). + x = _mm512_and_ps(x, p16f_inv_mant_mask); + x = _mm512_or_ps(x, p16f_half); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); + Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps()); + x = psub(x, p16f_1); + e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps())); + x = padd(x, tmp); + + Packet16f x2 = pmul(x, x); + Packet16f x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. + Packet16f y, y1, y2; + y = pmadd(p16f_cephes_log_p0, x, p16f_cephes_log_p1); + y1 = pmadd(p16f_cephes_log_p3, x, p16f_cephes_log_p4); + y2 = pmadd(p16f_cephes_log_p6, x, p16f_cephes_log_p7); + y = pmadd(y, x, p16f_cephes_log_p2); + y1 = pmadd(y1, x, p16f_cephes_log_p5); + y2 = pmadd(y2, x, p16f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + // Add the logarithm of the exponent back to the result of the interpolation. + y1 = pmul(e, p16f_cephes_log_q1); + tmp = pmul(x2, p16f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p16f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + + // Filter out invalid inputs, i.e. 
negative arg will be NAN, 0 will be -INF. + return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf, + _mm512_mask_blend_ps(invalid_mask, p16f_nan, x)); +} +#endif + +// Exponential function. Works by writing "x = m*log(2) + r" where +// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then +// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +pexp<Packet16f>(const Packet16f& _x) { + _EIGEN_DECLARE_CONST_Packet16f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet16f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet16f(127, 127.0f); + + _EIGEN_DECLARE_CONST_Packet16f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet16f(exp_lo, -88.3762626647949f); + + _EIGEN_DECLARE_CONST_Packet16f(cephes_LOG2EF, 1.44269504088896341f); + + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet16f(cephes_exp_p5, 5.0000001201E-1f); + + // Clamp x. + Packet16f x = pmax(pmin(_x, p16f_exp_hi), p16f_exp_lo); + + // Express exp(x) as exp(m*ln(2) + r), start by extracting + // m = floor(x/ln(2) + 0.5). + Packet16f m = _mm512_floor_ps(pmadd(x, p16f_cephes_LOG2EF, p16f_half)); + + // Get r = x - m*ln(2). Note that we can do this without losing more than one + // ulp precision due to the FMA instruction. + _EIGEN_DECLARE_CONST_Packet16f(nln2, -0.6931471805599453f); + Packet16f r = _mm512_fmadd_ps(m, p16f_nln2, x); + Packet16f r2 = pmul(r, r); + + // TODO(gonnet): Split into odd/even polynomials and try to exploit + // instruction-level parallelism. + Packet16f y = p16f_cephes_exp_p0; + y = pmadd(y, r, p16f_cephes_exp_p1); + y = pmadd(y, r, p16f_cephes_exp_p2); + y = pmadd(y, r, p16f_cephes_exp_p3); + y = pmadd(y, r, p16f_cephes_exp_p4); + y = pmadd(y, r, p16f_cephes_exp_p5); + y = pmadd(y, r2, r); + y = padd(y, p16f_1); + + // Build emm0 = 2^m. + Packet16i emm0 = _mm512_cvttps_epi32(padd(m, p16f_127)); + emm0 = _mm512_slli_epi32(emm0, 23); + + // Return 2^m * exp(r). + return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x); +} + +/*template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +pexp<Packet8d>(const Packet8d& _x) { + Packet8d x = _x; + + _EIGEN_DECLARE_CONST_Packet8d(1, 1.0); + _EIGEN_DECLARE_CONST_Packet8d(2, 2.0); + + _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303); + + _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599); + + _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1); + + _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0); + + _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125); + _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6); + + // clamp x + x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo); + + // Express exp(x) as exp(g + n*log(2)). 
+ const Packet8d n = + _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT); + + // Get the remainder modulo log(2), i.e. the "g" described above. Subtract + // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last + // digits right. + const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1); + const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2); + x = psub(x, nC1); + x = psub(x, nC2); + + const Packet8d x2 = pmul(x, x); + + // Evaluate the numerator polynomial of the rational interpolant. + Packet8d px = p8d_cephes_exp_p0; + px = pmadd(px, x2, p8d_cephes_exp_p1); + px = pmadd(px, x2, p8d_cephes_exp_p2); + px = pmul(px, x); + + // Evaluate the denominator polynomial of the rational interpolant. + Packet8d qx = p8d_cephes_exp_q0; + qx = pmadd(qx, x2, p8d_cephes_exp_q1); + qx = pmadd(qx, x2, p8d_cephes_exp_q2); + qx = pmadd(qx, x2, p8d_cephes_exp_q3); + + // I don't really get this bit, copied from the SSE2 routines, so... + // TODO(gonnet): Figure out what is going on here, perhaps find a better + // rational interpolant? + x = _mm512_div_pd(px, psub(qx, px)); + x = pmadd(p8d_2, x, p8d_1); + + // Build e=2^n. + const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64( + _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52)); + + // Construct the result 2^n * exp(g) = e * x. The max is used to catch + // non-finite values in the input. + return pmax(pmul(x, e), _x); + }*/ + +// Functions for sqrt. +// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step +// of Newton's method, at a cost of 1-2 bits of precision as opposed to the +// exact solution. The main advantage of this approach is not just speed, but +// also the fact that it can be inlined and pipelined with other computations, +// further reducing its effective latency. +#if EIGEN_FAST_MATH +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +psqrt<Packet16f>(const Packet16f& _x) { + _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); + + Packet16f neg_half = pmul(_x, p16f_minus_half); + + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); + Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x), + _mm512_setzero_ps()); + + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + + // Multiply the original _x by it's reciprocal square root to extract the + // square root. + return pmul(_x, x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +psqrt<Packet8d>(const Packet8d& _x) { + _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); + _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + + Packet8d neg_half = pmul(_x, p8d_minus_half); + + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); + Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x), + _mm512_setzero_pd()); + + // Do a first step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + + // Do a second step of Newton's iteration. 
+ x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + + // Multiply the original _x by it's reciprocal square root to extract the + // square root. + return pmul(_x, x); +} +#else +template <> +EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) { + return _mm512_sqrt_ps(x); +} +template <> +EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) { + return _mm512_sqrt_pd(x); +} +#endif + +// Functions for rsqrt. +// Almost identical to the sqrt routine, just leave out the last multiplication +// and fill in NaN/Inf where needed. Note that this function only exists as an +// iterative version for doubles since there is no instruction for diretly +// computing the reciprocal square root in AVX-512. +#ifdef EIGEN_FAST_MATH +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f +prsqrt<Packet16f>(const Packet16f& _x) { + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); + + Packet16f neg_half = pmul(_x, p16f_minus_half); + + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); + Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), + _mm512_rsqrt14_ps(_x)); + + // Fill in NaNs and Infs for the negative/zero entries. + __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); + Packet16f infs_and_nans = _mm512_mask_blend_ps( + neg_mask, p16f_nan, + _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps())); + + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + + // Insert NaNs and Infs in all the right places. + return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x); +} + +template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d +prsqrt<Packet8d>(const Packet8d& _x) { + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL); + _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); + _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + + Packet8d neg_half = pmul(_x, p8d_minus_half); + + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); + Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), + _mm512_rsqrt14_pd(_x)); + + // Fill in NaNs and Infs for the negative/zero entries. + __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); + Packet8d infs_and_nans = _mm512_mask_blend_pd( + neg_mask, p8d_nan, + _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd())); + + // Do a first step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + + // Do a second step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + + // Insert NaNs and Infs in all the right places. 
+ return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x); +} +#else +template <> +EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) { + return _mm512_rsqrt28_ps(x); +} +#endif +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_ diff --git a/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h b/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h new file mode 100644 index 0000000..12b8975 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -0,0 +1,1286 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_AVX512_H +#define EIGEN_PACKET_MATH_AVX512_H + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#endif + +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif +#endif + +typedef __m512 Packet16f; +typedef __m512i Packet16i; +typedef __m512d Packet8d; + +template <> +struct is_arithmetic<__m512> { + enum { value = true }; +}; +template <> +struct is_arithmetic<__m512i> { + enum { value = true }; +}; +template <> +struct is_arithmetic<__m512d> { + enum { value = true }; +}; + +template<> struct packet_traits<float> : default_packet_traits +{ + typedef Packet16f type; + typedef Packet8f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 1, +#if EIGEN_GNUC_AT_LEAST(5, 3) +#ifdef EIGEN_VECTORIZE_AVX512DQ + HasLog = 1, +#endif + HasExp = 1, + HasSqrt = EIGEN_FAST_MATH, + HasRsqrt = EIGEN_FAST_MATH, +#endif + HasDiv = 1 + }; + }; +template<> struct packet_traits<double> : default_packet_traits +{ + typedef Packet8d type; + typedef Packet4d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 1, +#if EIGEN_GNUC_AT_LEAST(5, 3) + HasSqrt = EIGEN_FAST_MATH, + HasRsqrt = EIGEN_FAST_MATH, +#endif + HasDiv = 1 + }; +}; + +/* TODO Implement AVX512 for integers +template<> struct packet_traits<int> : default_packet_traits +{ + typedef Packet16i type; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=8 + }; +}; +*/ + +template <> +struct unpacket_traits<Packet16f> { + typedef float type; + typedef Packet8f half; + enum { size = 16, alignment=Aligned64 }; +}; +template <> +struct unpacket_traits<Packet8d> { + typedef double type; + typedef Packet4d half; + enum { size = 8, alignment=Aligned64 }; +}; +template <> +struct unpacket_traits<Packet16i> { + typedef int type; + typedef Packet8i half; + enum { size = 16, alignment=Aligned64 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) { + return _mm512_set1_ps(from); +} +template <> +EIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(const double& from) { + return _mm512_set1_pd(from); +} +template <> +EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) { + return _mm512_set1_epi32(from); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) { + return 
_mm512_broadcastss_ps(_mm_load_ps1(from)); +} +template <> +EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) { + return _mm512_broadcastsd_pd(_mm_load_pd1(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) { + return _mm512_add_ps( + _mm512_set1_ps(a), + _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, + 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)); +} +template <> +EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) { + return _mm512_add_pd(_mm512_set1_pd(a), + _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0)); +} + +template <> +EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, + const Packet16f& b) { + return _mm512_add_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, + const Packet8d& b) { + return _mm512_add_pd(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, + const Packet16f& b) { + return _mm512_sub_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a, + const Packet8d& b) { + return _mm512_sub_pd(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { + return _mm512_sub_ps(_mm512_set1_ps(0.0), a); +} +template <> +EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) { + return _mm512_sub_pd(_mm512_set1_pd(0.0), a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) { + return a; +} +template <> +EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, + const Packet16f& b) { + return _mm512_mul_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a, + const Packet8d& b) { + return _mm512_mul_pd(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a, + const Packet16f& b) { + return _mm512_div_ps(a, b); +} +template <> +EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a, + const Packet8d& b) { + return _mm512_div_pd(a, b); +} + +#ifdef __FMA__ +template <> +EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, + const Packet16f& c) { + return _mm512_fmadd_ps(a, b, c); +} +template <> +EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, + const Packet8d& c) { + return _mm512_fmadd_pd(a, b, c); +} +#endif + +template <> +EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a, + const Packet16f& b) { + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm512_min_ps(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a, + const Packet8d& b) { + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm512_min_pd(b, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a, + const Packet16f& b) { + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm512_max_ps(b, a); +} +template <> +EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a, + const Packet8d& b) { + // Arguments are reversed to match NaN propagation behavior of std::max. 
+ return _mm512_max_pd(b, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, + const Packet16f& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_and_ps(a, b); +#else + Packet16f res = _mm512_undefined_ps(); + Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); + Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); + res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0); + + Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); + Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); + res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1); + + Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); + Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); + res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2); + + Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); + Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); + res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3); + + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a, + const Packet8d& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_and_pd(a, b); +#else + Packet8d res = _mm512_undefined_pd(); + Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); + Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); + res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0); + + Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); + Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); + res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); + + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, + const Packet16f& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_or_ps(a, b); +#else + Packet16f res = _mm512_undefined_ps(); + Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); + Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); + res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0); + + Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); + Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); + res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1); + + Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); + Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); + res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2); + + Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); + Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); + res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3); + + return res; +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a, + const Packet8d& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_or_pd(a, b); +#else + Packet8d res = _mm512_undefined_pd(); + Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); + Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); + res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0); + + Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); + Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); + res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1); + + return res; +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, + const Packet16f& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_xor_ps(a, b); +#else + Packet16f res = _mm512_undefined_ps(); + Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); + Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); + res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0); + + Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); + Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); + res = _mm512_insertf32x4(res, 
_mm_xor_ps(lane1_a, lane1_b), 1); + + Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); + Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); + res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2); + + Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); + Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); + res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3); + + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, + const Packet8d& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_xor_pd(a, b); +#else + Packet8d res = _mm512_undefined_pd(); + Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); + Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); + res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0); + + Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); + Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); + res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1); + + return res; +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, + const Packet16f& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_andnot_ps(a, b); +#else + Packet16f res = _mm512_undefined_ps(); + Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); + Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); + res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0); + + Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); + Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); + res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1); + + Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); + Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); + res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2); + + Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); + Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); + res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3); + + return res; +#endif +} +template <> +EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a, + const Packet8d& b) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + return _mm512_andnot_pd(a, b); +#else + Packet8d res = _mm512_undefined_pd(); + Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); + Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); + res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0); + + Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); + Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); + res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1); + + return res; +#endif +} + +template <> +EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from); +} +template <> +EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from); +} +template <> +EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) { + EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( + reinterpret_cast<const __m512i*>(from)); +} + +template <> +EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from); +} +template <> +EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from); +} +template <> +EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_si512( + reinterpret_cast<const __m512i*>(from)); +} + +// Loads 8 floats from memory a returns the packet +// {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, 
a5, a6, a6, a7, a7} +template <> +EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) { + __m256i low_half = _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); + __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half)); + __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0)); + return pairs; +} +// Loads 4 doubles from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, +// a3} +template <> +EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) { + __m512d x = _mm512_setzero_pd(); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3); + return x; +} + +// Loads 4 floats from memory a returns the packet +// {a0, a0 a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} +template <> +EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) { + Packet16f tmp = _mm512_undefined_ps(); + tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0); + tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1); + tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2); + tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3); + return tmp; +} +// Loads 2 doubles from memory a returns the packet +// {a0, a0 a0, a0, a1, a1, a1, a1} +template <> +EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) { + __m128d tmp0 = _mm_load_pd1(from); + __m256d lane0 = _mm256_broadcastsd_pd(tmp0); + __m128d tmp1 = _mm_load_pd1(from + 1); + __m256d lane1 = _mm256_broadcastsd_pd(tmp1); + __m512d tmp = _mm512_undefined_pd(); + tmp = _mm512_insertf64x4(tmp, lane0, 0); + return _mm512_insertf64x4(tmp, lane1, 1); +} + +template <> +EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from); +} +template <> +EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from); +} +template <> +EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) { + EIGEN_DEBUG_ALIGNED_STORE _mm512_storeu_si512(reinterpret_cast<__m512i*>(to), + from); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from); +} +template <> +EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from); +} +template <> +EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) { + EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( + reinterpret_cast<__m512i*>(to), from); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, + Index stride) { + Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_multiplier = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); + + return _mm512_i32gather_ps(indices, from, 4); +} +template <> +EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, + Index stride) { + Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); + + return _mm512_i32gather_pd(indices, from, 8); +} + +template <> 
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, + const Packet16f& from, + Index stride) { + Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_multiplier = + _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); + _mm512_i32scatter_ps(to, indices, from, 4); +} +template <> +EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, + const Packet8d& from, + Index stride) { + Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); + _mm512_i32scatter_pd(to, indices, from, 8); +} + +template <> +EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) { + Packet16f pa = pset1<Packet16f>(a); + pstore(to, pa); +} +template <> +EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) { + Packet8d pa = pset1<Packet8d>(a); + pstore(to, pa); +} +template <> +EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) { + Packet16i pa = pset1<Packet16i>(a); + pstore(to, pa); +} + +template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } + +template <> +EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) { + return _mm_cvtss_f32(_mm512_extractf32x4_ps(a, 0)); +} +template <> +EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) { + return _mm_cvtsd_f64(_mm256_extractf128_pd(_mm512_extractf64x4_pd(a, 0), 0)); +} +template <> +EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) { + return _mm_extract_epi32(_mm512_extracti32x4_epi32(a, 0), 0); +} + +template<> EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) +{ + return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a); +} + +template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) +{ + return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a); +} + +template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) +{ + // _mm512_abs_ps intrinsic not found, so hack around it + return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff)); +} +template <> +EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { + // _mm512_abs_ps intrinsic not found, so hack around it + return (__m512d)_mm512_and_si512((__m512i)a, + _mm512_set1_epi64(0x7fffffffffffffff)); +} + +#ifdef EIGEN_VECTORIZE_AVX512DQ +// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 +#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ + __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \ + __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1) +#else +#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ + __m256 OUTPUT##_0 = _mm256_insertf128_ps( \ + _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \ + _mm512_extractf32x4_ps(INPUT, 1), 1); \ + __m256 OUTPUT##_1 = _mm256_insertf128_ps( \ + _mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \ + _mm512_extractf32x4_ps(INPUT, 3), 1); +#endif + +#ifdef EIGEN_VECTORIZE_AVX512DQ +#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, 
INPUTB) \ + OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \ + OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1); +#else +#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ + OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ + OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \ + OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \ + OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3); +#endif +template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* +vecs) +{ + EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0); + EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1); + EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2); + EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3); + EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4); + EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5); + EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6); + EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7); + EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8); + EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9); + EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10); + EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11); + EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12); + EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13); + EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14); + EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15); + + __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0); + __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0); + __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0); + __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0); + + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + + hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1); + hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1); + hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1); + hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1); + + hsum5 = _mm256_hadd_ps(hsum1, hsum1); + hsum6 = _mm256_hadd_ps(hsum2, hsum2); + hsum7 = _mm256_hadd_ps(hsum3, hsum3); + hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + sum1 = _mm256_add_ps(perm1, hsum5); + sum2 = _mm256_add_ps(perm2, hsum6); + sum3 = _mm256_add_ps(perm3, hsum7); + sum4 = _mm256_add_ps(perm4, hsum8); + + blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + final = _mm256_add_ps(final, _mm256_blend_ps(blend1, blend2, 0xf0)); + + hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0); + hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0); + hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0); + hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0); + + hsum5 = _mm256_hadd_ps(hsum1, hsum1); + hsum6 = _mm256_hadd_ps(hsum2, hsum2); + hsum7 = _mm256_hadd_ps(hsum3, hsum3); + hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + perm1 = 
_mm256_permute2f128_ps(hsum5, hsum5, 0x23); + perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + sum1 = _mm256_add_ps(perm1, hsum5); + sum2 = _mm256_add_ps(perm2, hsum6); + sum3 = _mm256_add_ps(perm3, hsum7); + sum4 = _mm256_add_ps(perm4, hsum8); + + blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0); + + hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1); + hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1); + hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1); + hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1); + + hsum5 = _mm256_hadd_ps(hsum1, hsum1); + hsum6 = _mm256_hadd_ps(hsum2, hsum2); + hsum7 = _mm256_hadd_ps(hsum3, hsum3); + hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + sum1 = _mm256_add_ps(perm1, hsum5); + sum2 = _mm256_add_ps(perm2, hsum6); + sum3 = _mm256_add_ps(perm3, hsum7); + sum4 = _mm256_add_ps(perm4, hsum8); + + blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + final_1 = _mm256_add_ps(final_1, _mm256_blend_ps(blend1, blend2, 0xf0)); + + __m512 final_output; + + EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1); + return final_output; +} + +template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs) +{ + Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0); + Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1); + + Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0); + Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1); + + Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0); + Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1); + + Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0); + Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1); + + Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0); + Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1); + + Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0); + Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1); + + Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0); + Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1); + + Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0); + Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1); + + Packet4d tmp0, tmp1; + + tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC); + + tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + final_0 = _mm256_add_pd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC)); + + tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC); + + tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = 
_mm256_hadd_pd(vecs6_1, vecs7_1); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC)); + + __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0); + + return _mm512_insertf64x4(final_output, final_1, 1); +} + +template <> +EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + __m256 lane0 = _mm512_extractf32x8_ps(a, 0); + __m256 lane1 = _mm512_extractf32x8_ps(a, 1); + Packet8f x = _mm256_add_ps(lane0, lane1); + return predux<Packet8f>(x); +#else + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3)); + sum = _mm_hadd_ps(sum, sum); + sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1)); + return _mm_cvtss_f32(sum); +#endif +} +template <> +EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d sum = _mm256_add_pd(lane0, lane1); + __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); + return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0))); +} + +template <> +EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + __m256 lane0 = _mm512_extractf32x8_ps(a, 0); + __m256 lane1 = _mm512_extractf32x8_ps(a, 1); + return _mm256_add_ps(lane0, lane1); +#else + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 sum0 = _mm_add_ps(lane0, lane2); + __m128 sum1 = _mm_add_ps(lane1, lane3); + return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = _mm256_add_pd(lane0, lane1); + return res; +} + +template <> +EIGEN_STRONG_INLINE float predux_mul<Packet16f>(const Packet16f& a) { +//#ifdef EIGEN_VECTORIZE_AVX512DQ +#if 0 + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + Packet8f res = pmul(lane0, lane1); + res = pmul(res, _mm256_permute2f128_ps(res, res, 1)); + res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +#else + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); + res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +#endif +} +template <> +EIGEN_STRONG_INLINE double predux_mul<Packet8d>(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = pmul(lane0, lane1); + res = pmul(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1))); +} + +template <> +EIGEN_STRONG_INLINE float predux_min<Packet16f>(const Packet16f& a) { + __m128 lane0 = _mm512_extractf32x4_ps(a, 
0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); + res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} +template <> +EIGEN_STRONG_INLINE double predux_min<Packet8d>(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = _mm256_min_pd(lane0, lane1); + res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1))); +} + +template <> +EIGEN_STRONG_INLINE float predux_max<Packet16f>(const Packet16f& a) { + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); + res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} + +template <> +EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = _mm256_max_pd(lane0, lane1); + res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); +} + +template <int Offset> +struct palign_impl<Offset, Packet16f> { + static EIGEN_STRONG_INLINE void run(Packet16f& first, + const Packet16f& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, + Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, + Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); + + __m512i second_idx = + _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, + Offset - 5, Offset - 6, Offset - 7, Offset - 8, + Offset - 9, Offset - 10, Offset - 11, Offset - 12, + Offset - 13, Offset - 14, Offset - 15, Offset - 16); + + unsigned short mask = 0xFFFF; + mask <<= (16 - Offset); + + first = _mm512_permutexvar_ps(first_idx, first); + Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); + first = _mm512_mask_blend_ps(mask, first, tmp); + } + } +}; +template <int Offset> +struct palign_impl<Offset, Packet8d> { + static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, + Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); + + __m512i second_idx = _mm512_set_epi32( + 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, + Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); + + unsigned char mask = 0xFF; + mask <<= (8 - Offset); + + first = _mm512_permutexvar_pd(first_idx, first); + Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); + first = _mm512_mask_blend_pd(mask, first, tmp); + } + } +}; + + +#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ + EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]); + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) { + __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m512 T1 = 
_mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]); + __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]); + __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]); + __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]); + __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]); + __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]); + __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]); + __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]); + __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]); + __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]); + __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]); + __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]); + __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2)); + + EIGEN_EXTRACT_8f_FROM_16f(S0, S0); + EIGEN_EXTRACT_8f_FROM_16f(S1, S1); + EIGEN_EXTRACT_8f_FROM_16f(S2, S2); + EIGEN_EXTRACT_8f_FROM_16f(S3, S3); + EIGEN_EXTRACT_8f_FROM_16f(S4, S4); + EIGEN_EXTRACT_8f_FROM_16f(S5, S5); + EIGEN_EXTRACT_8f_FROM_16f(S6, S6); + EIGEN_EXTRACT_8f_FROM_16f(S7, S7); + EIGEN_EXTRACT_8f_FROM_16f(S8, S8); + EIGEN_EXTRACT_8f_FROM_16f(S9, S9); + EIGEN_EXTRACT_8f_FROM_16f(S10, S10); + EIGEN_EXTRACT_8f_FROM_16f(S11, S11); + EIGEN_EXTRACT_8f_FROM_16f(S12, S12); + EIGEN_EXTRACT_8f_FROM_16f(S13, S13); + EIGEN_EXTRACT_8f_FROM_16f(S14, S14); + EIGEN_EXTRACT_8f_FROM_16f(S15, S15); + + PacketBlock<Packet8f, 32> tmp; + + tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20); + tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20); + tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20); + tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20); + tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31); + tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31); + tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31); + tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31); + + tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20); + tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20); + tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20); + tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20); + tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 
0x31); + tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31); + tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31); + tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31); + + // Second set of _m256 outputs + tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20); + tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20); + tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20); + tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20); + tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31); + tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31); + tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31); + tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31); + + tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20); + tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20); + tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20); + tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20); + tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31); + tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31); + tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31); + tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31); + + // Pack them into the output + PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16); + + PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16); + + PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16); + + PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16); + PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16); +} +#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \ + EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], \ + INPUT[2 * INDEX + STRIDE]); + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) { + __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]); + __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]); + __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]); + __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]); + + __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2)); + + EIGEN_EXTRACT_8f_FROM_16f(S0, S0); + EIGEN_EXTRACT_8f_FROM_16f(S1, S1); + EIGEN_EXTRACT_8f_FROM_16f(S2, S2); + EIGEN_EXTRACT_8f_FROM_16f(S3, S3); + + PacketBlock<Packet8f, 8> tmp; + + tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20); + tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20); + tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31); + tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31); + + tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20); + tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20); + tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31); + tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31); + + PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1); + 
PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1); + PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1); + PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1); +} + +#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE) \ + OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \ + OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1); + +#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE) \ + OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \ + OUTPUT[INDEX] = \ + _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1); + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) { + __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0); + __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff); + __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0); + __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff); + + PacketBlock<Packet4d, 8> tmp; + + tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), + _mm512_extractf64x4_pd(T2, 0), 0x20); + tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), + _mm512_extractf64x4_pd(T3, 0), 0x20); + tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), + _mm512_extractf64x4_pd(T2, 0), 0x31); + tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), + _mm512_extractf64x4_pd(T3, 0), 0x31); + + tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), + _mm512_extractf64x4_pd(T2, 1), 0x20); + tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), + _mm512_extractf64x4_pd(T3, 1), 0x20); + tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), + _mm512_extractf64x4_pd(T2, 1), 0x31); + tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), + _mm512_extractf64x4_pd(T3, 1), 0x31); + + PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1); + PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1); + PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1); + PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1); +} + +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) { + __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]); + __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]); + __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]); + __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]); + __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]); + __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]); + __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]); + __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]); + + PacketBlock<Packet4d, 16> tmp; + + tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), + _mm512_extractf64x4_pd(T2, 0), 0x20); + tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), + _mm512_extractf64x4_pd(T3, 0), 0x20); + tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), + _mm512_extractf64x4_pd(T2, 0), 0x31); + tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), + _mm512_extractf64x4_pd(T3, 0), 0x31); + + tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), + _mm512_extractf64x4_pd(T2, 1), 0x20); + tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), + _mm512_extractf64x4_pd(T3, 1), 0x20); + tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), + 
_mm512_extractf64x4_pd(T2, 1), 0x31); + tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), + _mm512_extractf64x4_pd(T3, 1), 0x31); + + tmp.packet[8] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0), + _mm512_extractf64x4_pd(T6, 0), 0x20); + tmp.packet[9] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0), + _mm512_extractf64x4_pd(T7, 0), 0x20); + tmp.packet[10] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 0), + _mm512_extractf64x4_pd(T6, 0), 0x31); + tmp.packet[11] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 0), + _mm512_extractf64x4_pd(T7, 0), 0x31); + + tmp.packet[12] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1), + _mm512_extractf64x4_pd(T6, 1), 0x20); + tmp.packet[13] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1), + _mm512_extractf64x4_pd(T7, 1), 0x20); + tmp.packet[14] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T4, 1), + _mm512_extractf64x4_pd(T6, 1), 0x31); + tmp.packet[15] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T5, 1), + _mm512_extractf64x4_pd(T7, 1), 0x31); + + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 0, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 1, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 2, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 3, 8); + + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 4, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 5, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 6, 8); + PACK_OUTPUT_SQ_D(kernel.packet, tmp.packet, 7, 8); +} +template <> +EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/, + const Packet16f& /*thenPacket*/, + const Packet16f& /*elsePacket*/) { + assert(false && "To be implemented"); + return Packet16f(); +} +template <> +EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/, + const Packet8d& /*thenPacket*/, + const Packet8d& /*elsePacket*/) { + assert(false && "To be implemented"); + return Packet8d(); +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_AVX512_H diff --git a/eigen/Eigen/src/Core/arch/AltiVec/CMakeLists.txt b/eigen/Eigen/src/Core/arch/AltiVec/CMakeLists.txt deleted file mode 100644 index 9f8d2e9..0000000 --- a/eigen/Eigen/src/Core/arch/AltiVec/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Core_arch_AltiVec_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Core_arch_AltiVec_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/AltiVec COMPONENT Devel -) diff --git a/eigen/Eigen/src/Core/arch/AltiVec/Complex.h b/eigen/Eigen/src/Core/arch/AltiVec/Complex.h index 68d9a2b..67db2f8 100644 --- a/eigen/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/eigen/Eigen/src/Core/arch/AltiVec/Complex.h @@ -2,30 +2,34 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
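An aside on pblend, which the AVX512 hunk above leaves as assert(false) stubs and which the AltiVec changes below advertise via HasBlend: Eigen's pblend contract is an element-wise select driven by Selector<N>::select. A minimal scalar sketch of that contract, assuming the usual result[i] = select[i] ? then[i] : else[i] convention; blend_reference is an illustrative name, not an Eigen symbol:

// Scalar model of the pblend contract for an N-lane packet:
// lane i comes from thenPacket when select[i] is set, otherwise from elsePacket.
template <int N, typename Scalar>
void blend_reference(const bool (&select)[N],
                     const Scalar (&thenPacket)[N],
                     const Scalar (&elsePacket)[N],
                     Scalar (&result)[N])
{
  for (int i = 0; i < N; ++i)
    result[i] = select[i] ? thenPacket[i] : elsePacket[i];
}

An AVX-512 version would typically build a __mmask16/__mmask8 from the selector and use _mm512_mask_blend_ps/_mm512_mask_blend_pd, much as the palign_impl code above already does for its tail blend.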
-#ifndef EIGEN_COMPLEX_ALTIVEC_H -#define EIGEN_COMPLEX_ALTIVEC_H +#ifndef EIGEN_COMPLEX32_ALTIVEC_H +#define EIGEN_COMPLEX32_ALTIVEC_H namespace Eigen { namespace internal { -static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; -static Packet16uc p16uc_COMPLEX_RE = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; -static Packet16uc p16uc_COMPLEX_IM = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; -static Packet16uc p16uc_COMPLEX_REV = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; -static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; -static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; -static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; +static Packet4ui p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; +#ifdef __VSX__ +#if defined(_BIG_ENDIAN) +static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +#else +static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_MZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2d_MZERO, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +#endif +#endif //---------- float ---------- struct Packet2cf { - EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf() : v(p4f_ZERO) {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} Packet4f v; }; @@ -33,10 +37,12 @@ struct Packet2cf template<> struct packet_traits<std::complex<float> > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -47,65 +53,78 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, +#ifdef __VSX__ + HasBlend = 1, +#endif HasSetLinear = 0 }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) { Packet2cf res; - /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */ - if((ptrdiff_t(&from) % 16) == 0) + if((std::ptrdiff_t(&from) % 16) == 0) res.v = pload<Packet4f>((const float *)&from); else res.v = ploadu<Packet4f>((const float *)&from); - res.v = vec_perm(res.v, res.v, 
p16uc_PSET_HI); + res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI); return res; } -template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { return Packet2cf(pload<Packet4f>((const float *) from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { return Packet2cf(ploadu<Packet4f>((const float*) from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); } + +template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride) +{ + std::complex<float> EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload<Packet2cf>(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride) +{ + std::complex<float> EIGEN_ALIGN16 af[2]; + pstore<std::complex<float> >((std::complex<float> *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); } +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); } template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { Packet4f v1, v2; // Permute and multiply the real parts of a and b - v1 = vec_perm(a.v, a.v, p16uc_COMPLEX_RE); + v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD); // Get the imaginary parts of a - v2 = vec_perm(a.v, a.v, p16uc_COMPLEX_IM); + v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); // multiply a_re * b v1 = vec_madd(v1, b.v, p4f_ZERO); // multiply a_im * b and get the conjugate result v2 = vec_madd(v2, b.v, p4f_ZERO); - v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR); + v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); // permute back to a proper order - v2 = vec_perm(v2, v2, p16uc_COMPLEX_REV); + v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV); - return Packet2cf(vec_add(v1, v2)); + return Packet2cf(padd<Packet4f>(v1, v2)); } -template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const 
Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); } - -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) -{ - return pset1<Packet2cf>(*from); -} - -template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } - -template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); } +template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) { @@ -118,26 +137,30 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { Packet4f rev_a; - rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX_REV2); + rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); return Packet2cf(rev_a); } template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) { Packet4f b; - b = (Packet4f) vec_sld(a.v, a.v, 8); - b = padd(a.v, b); - return pfirst(Packet2cf(b)); + b = vec_sld(a.v, a.v, 8); + b = padd<Packet4f>(a.v, b); + return pfirst<Packet2cf>(Packet2cf(b)); } template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) { Packet4f b1, b2; - - b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8); - b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8); - b2 = (Packet4f) vec_sld(b2, b2, 8); - b2 = padd(b1, b2); +#ifdef _BIG_ENDIAN + b1 = vec_sld(vecs[0].v, vecs[1].v, 8); + b2 = vec_sld(vecs[1].v, vecs[0].v, 8); +#else + b1 = vec_sld(vecs[1].v, vecs[0].v, 8); + b2 = vec_sld(vecs[0].v, vecs[1].v, 8); +#endif + b2 = vec_sld(b2, b2, 8); + b2 = padd<Packet4f>(b1, b2); return Packet2cf(b2); } @@ -146,10 +169,10 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P { Packet4f b; Packet2cf prod; - b = (Packet4f) vec_sld(a.v, a.v, 8); - prod = pmul(a, Packet2cf(b)); + b = vec_sld(a.v, a.v, 8); + prod = pmul<Packet2cf>(a, Packet2cf(b)); - return pfirst(prod); + return pfirst<Packet2cf>(prod); } 
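For readers following the permute/xor sequence in pmul<Packet2cf> above: v1 broadcasts the real parts of a and v2 the imaginary parts, each is multiplied by b, the imaginary slot of v2 is sign-flipped via p4ui_CONJ_XOR, the pair is swapped back into place with p16uc_COMPLEX32_REV, and the two are added, which is the textbook product (ar*br - ai*bi) + i(ar*bi + ai*br) done two lanes at a time. A scalar sketch of the same arithmetic, for reference only (the function name is illustrative, not part of the patch):

#include <complex>
#include <cstddef>

// Reference for the vectorized complex multiply: a Packet2cf holds two
// std::complex<float> lanes; each output lane is the ordinary complex product.
static void packet2cf_mul_reference(const std::complex<float> a[2],
                                    const std::complex<float> b[2],
                                    std::complex<float> out[2])
{
  for (std::size_t lane = 0; lane < 2; ++lane) {
    const float ar = a[lane].real(), ai = a[lane].imag();
    const float br = b[lane].real(), bi = b[lane].imag();
    // v1 ~ ar*(br,bi); v2 ~ ai*(br,bi) with one slot sign-flipped and the
    // pair swapped, so v1 + v2 gives (ar*br - ai*bi, ar*bi + ai*br).
    out[lane] = std::complex<float>(ar * br - ai * bi, ar * bi + ai * br);
  }
}

The pdiv<Packet2cf> further down follows the same textbook recipe a * conj(b) / |b|^2, with |b|^2 formed per lane as s + swap(s) where s = b.v * b.v.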
template<int Offset> @@ -159,7 +182,11 @@ struct palign_impl<Offset,Packet2cf> { if (Offset==1) { +#ifdef _BIG_ENDIAN first.v = vec_sld(first.v, second.v, 8); +#else + first.v = vec_sld(second.v, first.v, 8); +#endif } } }; @@ -197,21 +224,238 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true> } }; +template<> struct conj_helper<Packet4f, Packet2cf, false,false> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const + { return Packet2cf(internal::pmul<Packet4f>(x, y.v)); } +}; + +template<> struct conj_helper<Packet2cf, Packet4f, false,false> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const + { return Packet2cf(internal::pmul<Packet4f>(x.v, y)); } +}; + template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec - Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b); - Packet4f s = vec_madd(b.v, b.v, p4f_ZERO); - return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX_REV)))); + Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b); + Packet4f s = pmul<Packet4f>(b.v, b.v); + return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); } template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x) { - return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV)); + return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV)); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel) +{ + Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +#ifdef __VSX__ +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v))); + return result; +} +#endif + +//---------- double ---------- +#ifdef __VSX__ +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; + +template<> struct packet_traits<std::complex<double> > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const 
Packet1cd& from) { pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride) +{ + std::complex<double> EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload<Packet1cd>(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride) +{ + std::complex<double> EIGEN_ALIGN16 af[2]; + pstore<std::complex<double> >(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d a_re, a_im, v1, v2; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); + // multiply a_re * b + v1 = vec_madd(a_re, b.v, p2d_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(a_im, b.v, p2d_ZERO); + v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8)); + v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1)); + + return Packet1cd(padd<Packet2d>(v1, v2)); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(pandnot(a.v, b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); } + +template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_PPC_PREFETCH(addr); } + +template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) +{ + std::complex<double> EIGEN_ALIGN16 res[2]; + pstore<std::complex<double> >(res, a); + + return res[0]; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } + +template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; } + +template<> EIGEN_STRONG_INLINE std::complex<double> 
predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); } + +template<int Offset> +struct palign_impl<Offset,Packet1cd> +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, false,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; +template<> struct conj_helper<Packet2d, Packet1cd, false,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const + { return Packet1cd(internal::pmul<Packet2d>(x, y.v)); } +}; + +template<> struct conj_helper<Packet1cd, Packet2d, false,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const + { return padd(c, pmul(x,y)); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const + { return Packet1cd(internal::pmul<Packet2d>(x.v, y)); } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for AltiVec + Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); + Packet2d s = pmul<Packet2d>(b.v, b.v); + return Packet1cd(pdiv(res.v, padd<Packet2d>(s, vec_perm(s, s, p16uc_REVERSE64)))); } +EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} +#endif // __VSX__ } // end namespace internal } // end namespace Eigen -#endif // EIGEN_COMPLEX_ALTIVEC_H +#endif // EIGEN_COMPLEX32_ALTIVEC_H diff --git a/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h new file mode 100644 index 0000000..c5e4bed --- /dev/null +++ b/eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -0,0 +1,322 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H + +namespace Eigen { + +namespace internal { + +static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static _EIGEN_DECLARE_CONST_Packet4i(23, 23); + +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); + +/* the smallest non denormalized float number */ +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + +static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + +#ifdef __VSX__ +static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); +static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); +static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + +static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); +static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 
9.99999999999999999910e-1); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + +#ifdef __POWER8_VECTOR__ +static Packet2l p2l_1023 = { 1023, 1023 }; +static Packet2ul p2ul_52 = { 52, 52 }; +#endif + +#endif + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f plog<Packet4f>(const Packet4f& _x) +{ + Packet4f x = _x; + + Packet4i emm0; + + /* isvalid_mask is 0 if x < 0 or x is NaN. */ + Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO)); + Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO)); + + x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ + emm0 = vec_sr(reinterpret_cast<Packet4i>(x), + reinterpret_cast<Packet4ui>(p4i_23)); + + /* keep only the fractional part */ + x = pand(x, p4f_inv_mant_mask); + x = por(x, p4f_half); + + emm0 = psub(emm0, p4i_0x7f); + Packet4f e = padd(vec_ctf(emm0, 0), p4f_1); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF)); + Packet4f tmp = pand(x, mask); + x = psub(x, p4f_1); + e = psub(e, pand(p4f_1, mask)); + x = padd(x, tmp); + + Packet4f x2 = pmul(x,x); + Packet4f x3 = pmul(x2,x); + + Packet4f y, y1, y2; + y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); + y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); + y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); + y = pmadd(y , x, p4f_cephes_log_p2); + y1 = pmadd(y1, x, p4f_cephes_log_p5); + y2 = pmadd(y2, x, p4f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y1 = pmul(e, p4f_cephes_log_q1); + tmp = pmul(x2, p4f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p4f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + // negative arg will be NAN, 0 will be -INF + x = vec_sel(x, p4f_minus_inf, iszero_mask); + x = vec_sel(p4f_minus_nan, x, isvalid_mask); + return x; +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp<Packet4f>(const Packet4f& _x) +{ + Packet4f x = _x; + + Packet4f tmp, fx; + Packet4i emm0; + + // clamp x + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + + // express exp(x) as exp(g + n*log(2)) + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); + + fx = pfloor(fx); + + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + z = pmul(x,x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // build 2^n + emm0 = vec_cts(fx, 0); + emm0 = vec_add(emm0, p4i_0x7f); + emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23)); + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. 
+ Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x), + isnumber_mask); +} + +#ifndef EIGEN_COMP_CLANG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt<Packet4f>(const Packet4f& x) +{ + return vec_rsqrt(x); +} +#endif + +#ifdef __VSX__ +#ifndef EIGEN_COMP_CLANG +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt<Packet2d>(const Packet2d& x) +{ + return vec_rsqrt(x); +} +#endif + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt<Packet4f>(const Packet4f& x) +{ + return vec_sqrt(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt<Packet2d>(const Packet2d& x) +{ + return vec_sqrt(x); +} + +// VSX support varies between different compilers and even different +// versions of the same compiler. For gcc version >= 4.9.3, we can use +// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use +// a slow version that works with older compilers. +// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles +// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 +static inline Packet2l ConvertToPacket2l(const Packet2d& x) { +#if EIGEN_GNUC_AT_LEAST(5, 4) || \ + (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) + return vec_cts(x, 0); // TODO: check clang version. +#else + double tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2l l = { static_cast<long long>(tmp[0]), + static_cast<long long>(tmp[1]) }; + return l; +#endif +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp<Packet2d>(const Packet2d& _x) +{ + Packet2d x = _x; + + Packet2d tmp, fx; + Packet2l emm0; + + // clamp x + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half); + + fx = pfloor(fx); + + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet2d x2 = pmul(x,x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + + // build 2^n + emm0 = ConvertToPacket2l(fx); + +#ifdef __POWER8_VECTOR__ + emm0 = vec_add(emm0, p2l_1023); + emm0 = vec_sl(emm0, p2ul_52); +#else + // Code is a bit complex for POWER7. There is actually a + // vec_xxsldi intrinsic but it is not supported by some gcc versions. + // So we shift (52-32) bits and do a word swap with zeros. + _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); + _EIGEN_DECLARE_CONST_Packet4i(20, 20); // 52 - 32 + + Packet4i emm04i = reinterpret_cast<Packet4i>(emm0); + emm04i = vec_add(emm04i, p4i_1023); + emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20)); + static const Packet16uc perm = { + 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; +#ifdef _BIG_ENDIAN + emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm)); +#else + emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm)); +#endif + +#endif + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. 
+ Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x), + isnumber_mask); +} +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_ALTIVEC_H diff --git a/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h b/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h index e408996..b3f1ea1 100644 --- a/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr> +// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -18,13 +18,17 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif -#ifndef EIGEN_HAS_FUSE_CJMADD -#define EIGEN_HAS_FUSE_CJMADD 1 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD #endif // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif typedef __vector float Packet4f; @@ -38,7 +42,7 @@ typedef __vector unsigned char Packet16uc; // and it doesn't really work to declare them global, so we define macros instead #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X) + Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X)) #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = vec_splat_s32(X) @@ -46,60 +50,158 @@ typedef __vector unsigned char Packet16uc; #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = pset1<Packet4f>(X) -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) - #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = pset1<Packet4i>(X) +#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = pset1<Packet2d>(X) + +#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = pset1<Packet2l>(X) + +#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ + const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X)) + #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) + +// These constants are endian-agnostic +static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} +static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} +#ifndef __VSX__ +static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} +#endif + +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; +static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; + +static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 
4,5,6,7, 0,1,2,3 }; +static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; + +// Mask alignment +#ifdef __PPC64__ +#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 +#else +#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 +#endif + +#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) + +// Handle endianness properly while loading constants // Define global static constants: -static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 }; -static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 }; -static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; +#ifdef _BIG_ENDIAN static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); -static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7}; - -static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); -static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); -static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); +#ifdef __VSX__ +static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +#endif +static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; +static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; +static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; +#else +static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; +static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; +static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; +static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; +#endif // _BIG_ENDIAN + +static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; +static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; + +static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; + +#ifdef _BIG_ENDIAN +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +#else +static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, 
p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +#endif // _BIG_ENDIAN + +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); +#else + #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#endif template<> struct packet_traits<float> : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, size=4, - - // FIXME check the Has* + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, HasSin = 0, HasCos = 0, HasLog = 0, - HasExp = 0, - HasSqrt = 0 + HasExp = 1, +#ifdef __VSX__ + HasSqrt = 1, +#if !EIGEN_COMP_CLANG + HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif +#else + HasSqrt = 0, + HasRsqrt = 0, +#endif + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 }; }; template<> struct packet_traits<int> : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { - // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size = 4, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 0, + HasBlend = 1 }; }; -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; -/* + +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; + +inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) +{ + union { + Packet16uc v; + unsigned char n[16]; + } vt; + vt.v = v; + for (int i=0; i< 16; i++) + s << (int)vt.n[i] << ", "; + return s; +} + inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) { union { @@ -133,89 +235,136 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) return s; } -inline std::ostream & operator <<(std::ostream & s, const Packetbi & v) +// Need to define them first or we get specialization after instantiation errors +template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { - union { - Packet4bi v; - unsigned int n[4]; - } vt; - vt.v = v; - s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; - return s; -} -*/ -template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - float EIGEN_ALIGN16 af[4]; - af[0] = from; - Packet4f vc = vec_ld(0, af); - vc = vec_splat(vc, 0); - return vc; + EIGEN_DEBUG_ALIGNED_LOAD +#ifdef __VSX__ + return vec_vsx_ld(0, from); +#else + return vec_ld(0, from); +#endif } -template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { - int EIGEN_ALIGN16 ai[4]; - ai[0] = from; - Packet4i vc = vec_ld(0, ai); - vc = vec_splat(vc, 0); - return vc; +template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD +#ifdef __VSX__ + return vec_vsx_ld(0, from); +#else + return vec_ld(0, from); +#endif } -template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); } - -template<> EIGEN_STRONG_INLINE Packet4f 
padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); } +template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) +{ + EIGEN_DEBUG_ALIGNED_STORE +#ifdef __VSX__ + vec_vsx_st(from, 0, to); +#else + vec_st(from, 0, to); +#endif +} -template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); } +template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) +{ + EIGEN_DEBUG_ALIGNED_STORE +#ifdef __VSX__ + vec_vsx_st(from, 0, to); +#else + vec_st(from, 0, to); +#endif +} -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); } +template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { + Packet4f v = {from, from, from, from}; + return v; +} -template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { + Packet4i v = {from, from, from, from}; + return v; +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet4f>(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload<Packet4f>(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet4i>(const int *a, + Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) +{ + a3 = pload<Packet4i>(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} -template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); } -/* Commented out: it's actually slower than processing it scalar - * -template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) +{ + float EIGEN_ALIGN16 af[4]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + af[2] = from[2*stride]; + af[3] = from[3*stride]; + return pload<Packet4f>(af); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload<Packet4i>(ai); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) { - // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec - //Set up constants, variables - Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel; + float EIGEN_ALIGN16 af[4]; + pstore<float>(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; + to[2*stride] = af[2]; + to[3*stride] = af[3]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + pstore<int>((int *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + 
to[3*stride] = ai[3]; +} - // Get the absolute values - a1 = vec_abs(a); - b1 = vec_abs(b); +template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; } - // Get the signs using xor - Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO); +template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; } - // Do the multiplication for the asbolute values. - bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 ); - low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1); - high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO); - high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16); - prod = vec_add( low_prod, high_prod ); +template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; } - // NOR the product and select only the negative elements according to the sign mask - prod_ = vec_nor(prod, prod); - prod_ = vec_sel(p4i_ZERO, prod_, sgn); +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } - // Add 1 to the result to get the negative numbers - v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn); - prod_ = vec_add(prod_, v1sel); +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } - // Merge the results back to the final vector. 
- prod = vec_sel(prod, prod_, sgn); +template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } +template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; } - return prod; -} -*/ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { - Packet4f t, y_0, y_1, res; +#ifndef __VSX__ // VSX actually provides a div instruction + Packet4f t, y_0, y_1; // Altivec does not offer a divide instruction, we have to do a reciprocal approximation y_0 = vec_re(b); @@ -224,8 +373,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const t = vec_nmsub(y_0, b, p4f_ONE); y_1 = vec_madd(y_0, t, y_0); - res = vec_madd(a, y_1, p4f_ZERO); - return res; + return vec_madd(a, y_1, p4f_MZERO); +#else + return vec_div(a, b); +#endif } template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) @@ -234,8 +385,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co } // for some weird raisons, it has to be overloaded for packet of integers -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } @@ -243,7 +394,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } -// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } @@ -256,13 +406,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } -template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); } +template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return 
vec_floor(a); } +#ifdef _BIG_ENDIAN template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html Packet16uc MSQ, LSQ; Packet16uc mask; MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword @@ -282,25 +433,36 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) mask = vec_lvsl(0, from); // create the permute mask return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data } +#else +// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); +} +template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) +{ + EIGEN_DEBUG_UNALIGNED_LOAD + return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); +} +#endif template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) { Packet4f p; - if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4f>(from); - else p = ploadu<Packet4f>(from); - return vec_perm(p, p, p16uc_DUPLICATE); + if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from); + else p = ploadu<Packet4f>(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); } template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) { Packet4i p; - if((ptrdiff_t(&from) % 16) == 0) p = pload<Packet4i>(from); - else p = ploadu<Packet4i>(from); - return vec_perm(p, p, p16uc_DUPLICATE); + if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from); + else p = ploadu<Packet4i>(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); } -template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } -template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); } - +#ifdef _BIG_ENDIAN template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE @@ -337,15 +499,33 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& f vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } +#else +// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX +template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); +} +template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); +} +#endif -template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } -template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); } +template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; } -template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 
x[4]; vec_st(a, 0, x); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) +{ + return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); +} +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); } template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } @@ -353,10 +533,10 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) { Packet4f b, sum; - b = (Packet4f) vec_sld(a, a, 8); - sum = vec_add(a, b); - b = (Packet4f) vec_sld(sum, sum, 4); - sum = vec_add(sum, b); + b = vec_sld(a, a, 8); + sum = a + b; + b = vec_sld(sum, sum, 4); + sum += b; return pfirst(sum); } @@ -379,11 +559,11 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) // Now do the summation: // Lines 0+1 - sum[0] = vec_add(sum[0], sum[1]); + sum[0] = sum[0] + sum[1]; // Lines 2+3 - sum[1] = vec_add(sum[2], sum[3]); + sum[1] = sum[2] + sum[3]; // Add the results - sum[0] = vec_add(sum[0], sum[1]); + sum[0] = sum[0] + sum[1]; return sum[0]; } @@ -392,7 +572,11 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) { Packet4i sum; sum = vec_sums(a, p4i_ZERO); +#ifdef _BIG_ENDIAN sum = vec_sld(sum, p4i_ZERO, 12); +#else + sum = vec_sld(p4i_ZERO, sum, 4); +#endif return pfirst(sum); } @@ -415,11 +599,11 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) // Now do the summation: // Lines 0+1 - sum[0] = vec_add(sum[0], sum[1]); + sum[0] = sum[0] + sum[1]; // Lines 2+3 - sum[1] = vec_add(sum[2], sum[3]); + sum[1] = sum[2] + sum[3]; // Add the results - sum[0] = vec_add(sum[0], sum[1]); + sum[0] = sum[0] + sum[1]; return sum[0]; } @@ -429,8 +613,8 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) { Packet4f prod; - prod = pmul(a, (Packet4f)vec_sld(a, a, 8)); - return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4))); + prod = pmul(a, vec_sld(a, a, 8)); + return pfirst(pmul(prod, vec_sld(prod, prod, 4))); } template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) @@ -479,8 +663,25 @@ struct palign_impl<Offset,Packet4f> { static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) { - if (Offset!=0) - first = vec_sld(first, second, Offset*4); +#ifdef _BIG_ENDIAN + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } +#else + switch (Offset % 4) { + case 1: + first = vec_sld(second, 
first, 12); break; + case 2: + first = vec_sld(second, first, 8); break; + case 3: + first = vec_sld(second, first, 4); break; + } +#endif } }; @@ -489,11 +690,342 @@ struct palign_impl<Offset,Packet4i> { static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) { - if (Offset!=0) - first = vec_sld(first, second, Offset*4); +#ifdef _BIG_ENDIAN + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } +#else + switch (Offset % 4) { + case 1: + first = vec_sld(second, first, 12); break; + case 2: + first = vec_sld(second, first, 8); break; + case 3: + first = vec_sld(second, first, 4); break; + } +#endif } }; +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4f,4>& kernel) { + Packet4f t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4i,4>& kernel) { + Packet4i t0, t1, t2, t3; + t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); + return vec_sel(elsePacket, thenPacket, mask); +} + +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); + return vec_sel(elsePacket, thenPacket, mask); +} + + +//---------- double ---------- +#ifdef __VSX__ +typedef __vector double Packet2d; +typedef __vector unsigned long long Packet2ul; +typedef __vector long long Packet2l; +#if EIGEN_COMP_CLANG +typedef Packet2ul Packet2bl; +#else +typedef __vector __bool long Packet2bl; +#endif + +static Packet2l p2l_ONE = { 1, 1 }; +static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); +static Packet2d p2d_MZERO = { -0.0, -0.0 }; + +#ifdef _BIG_ENDIAN +static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8)); +#else +static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8)); +#endif + +template<int index> Packet2d vec_splat_dbl(Packet2d& a); + +template<> 
EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) +{ + return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI)); +} + +template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) +{ + return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO)); +} + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; + +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; + +inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) +{ + union { + Packet2l v; + int64_t n[2]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1]; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) +{ + union { + Packet2d v; + double n[2]; + } vt; + vt.v = v; + s << vt.n[0] << ", " << vt.n[1]; + return s; +} + +// Need to define them first or we get specialization after instantiation errors +template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD +#ifdef __VSX__ + return vec_vsx_ld(0, from); +#else + return vec_ld(0, from); +#endif +} + +template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) +{ + EIGEN_DEBUG_ALIGNED_STORE +#ifdef __VSX__ + vec_vsx_st(from, 0, to); +#else + vec_st(from, 0, to); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { + Packet2d v = {from, from}; + return v; +} + +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet2d>(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload<Packet2d>(a); + a0 = vec_splat_dbl<0>(a1); + a1 = vec_splat_dbl<1>(a1); + a3 = pload<Packet2d>(a+2); + a2 = vec_splat_dbl<0>(a3); + a3 = vec_splat_dbl<1>(a3); +} + +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload<Packet2d>(af); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + pstore<double>(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; } + +template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; } + +template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; } + +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } + +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); } +template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } + +// for some weird raisons, it has to be overloaded for packet of integers +template<> EIGEN_STRONG_INLINE Packet2d 
pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } + +template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } + +template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } + +template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); } + +template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) +{ + EIGEN_DEBUG_ALIGNED_LOAD + return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); +} + +template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) +{ + Packet2d p; + if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from); + else p = ploadu<Packet2d>(from); + return vec_splat_dbl<0>(p); +} + +template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) +{ + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); +} + +template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); } + +template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ + return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64)); +} +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } + +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) +{ + Packet2d b, sum; + b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8)); + sum = a + b; + return pfirst<Packet2d>(sum); +} + +template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) +{ + Packet2d v[2], sum; + v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8)); + v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8)); + +#ifdef _BIG_ENDIAN + sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8)); +#else + sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8)); +#endif + + return sum; +} +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) +{ + return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); +} + +// min +template<> EIGEN_STRONG_INLINE double 
predux_min<Packet2d>(const Packet2d& a) +{ + return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); +} + +// max +template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) +{ + return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); +} + +template<int Offset> +struct palign_impl<Offset,Packet2d> +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset == 1) +#ifdef _BIG_ENDIAN + first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8)); +#else + first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8)); +#endif + } +}; + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2d,2>& kernel) { + Packet2d t0, t1; + t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); + t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); + kernel.packet[0] = t0; + kernel.packet[1] = t1; +} + +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} +#endif // __VSX__ } // end namespace internal } // end namespace Eigen diff --git a/eigen/Eigen/src/Core/arch/CMakeLists.txt b/eigen/Eigen/src/Core/arch/CMakeLists.txt deleted file mode 100644 index 8456dec..0000000 --- a/eigen/Eigen/src/Core/arch/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -ADD_SUBDIRECTORY(SSE) -ADD_SUBDIRECTORY(AltiVec) -ADD_SUBDIRECTORY(NEON) -ADD_SUBDIRECTORY(Default) diff --git a/eigen/Eigen/src/Core/arch/CUDA/Complex.h b/eigen/Eigen/src/Core/arch/CUDA/Complex.h new file mode 100644 index 0000000..9c25365 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/CUDA/Complex.h @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_CUDA_H +#define EIGEN_COMPLEX_CUDA_H + +// clang-format off + +namespace Eigen { + +namespace internal { + +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) + +// Many std::complex methods such as operator+, operator-, operator* and +// operator/ are not constexpr. Due to this, clang does not treat them as device +// functions and thus Eigen functors making use of these operators fail to +// compile. Here, we manually specialize these functors for complex types when +// building for CUDA to avoid non-constexpr methods. 
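The quotient specialization that follows relies on the usual identity (a+bi)/(c+di) = ((a*c+b*d) + (b*c-a*d)i) / (c*c+d*d), written out on real/imaginary parts so no non-constexpr std::complex operator is needed. A small host-side sketch (illustration only, not part of the patch) checking that expansion against std::complex division:

#include <cassert>
#include <complex>

int main() {
  const std::complex<float> a(3.0f, 4.0f), b(1.0f, -2.0f);
  // Same expansion as scalar_quotient_op below: scale by 1/|b|^2.
  const float inv_norm = 1.0f / (b.real()*b.real() + b.imag()*b.imag());
  const std::complex<float> manual((a.real()*b.real() + a.imag()*b.imag()) * inv_norm,
                                   (a.imag()*b.real() - a.real()*b.imag()) * inv_norm);
  assert(std::abs(manual - a / b) < 1e-6f);   // both give -1 + 2i
  return 0;
}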
+ +// Sum +template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > { + typedef typename std::complex<T> result_type; + + EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const { + return std::complex<T>(numext::real(a) + numext::real(b), + numext::imag(a) + numext::imag(b)); + } +}; + +template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {}; + + +// Difference +template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > { + typedef typename std::complex<T> result_type; + + EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const { + return std::complex<T>(numext::real(a) - numext::real(b), + numext::imag(a) - numext::imag(b)); + } +}; + +template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {}; + + +// Product +template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > { + enum { + Vectorizable = packet_traits<std::complex<T>>::HasMul + }; + typedef typename std::complex<T> result_type; + + EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); + const T b_real = numext::real(b); + const T b_imag = numext::imag(b); + return std::complex<T>(a_real * b_real - a_imag * b_imag, + a_real * b_imag + a_imag * b_real); + } +}; + +template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {}; + + +// Quotient +template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > { + enum { + Vectorizable = packet_traits<std::complex<T>>::HasDiv + }; + typedef typename std::complex<T> result_type; + + EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); + const T b_real = numext::real(b); + const T b_imag = numext::imag(b); + const T norm = T(1) / (b_real * b_real + b_imag * b_imag); + return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm, + (a_imag * b_real - a_real * b_imag) * norm); + } +}; + +template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {}; + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_CUDA_H diff --git a/eigen/Eigen/src/Core/arch/CUDA/Half.h b/eigen/Eigen/src/Core/arch/CUDA/Half.h new file mode 100644 index 0000000..67518da --- /dev/null +++ b/eigen/Eigen/src/Core/arch/CUDA/Half.h @@ -0,0 +1,604 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// type Eigen::half (inheriting from CUDA's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. + + +#ifndef EIGEN_HALF_CUDA_H +#define EIGEN_HALF_CUDA_H + +#if __cplusplus > 199711L +#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() +#else +#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() +#endif + + +namespace Eigen { + +struct half; + +namespace half_impl { + +#if !defined(EIGEN_HAS_CUDA_FP16) + +// Make our own __half definition that is similar to CUDA's. +struct __half { + EIGEN_DEVICE_FUNC __half() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {} + unsigned short x; +}; + +#endif + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); + +struct half_base : public __half { + EIGEN_DEVICE_FUNC half_base() {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} +}; + +} // namespace half_impl + +// Class definition. +struct half : public half_impl::half_base { + #if !defined(EIGEN_HAS_CUDA_FP16) + typedef half_impl::__half __half; + #endif + + EIGEN_DEVICE_FUNC half() {} + + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} + + explicit EIGEN_DEVICE_FUNC half(bool b) + : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} + template<class T> + explicit EIGEN_DEVICE_FUNC half(const T& val) + : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {} + explicit EIGEN_DEVICE_FUNC half(float f) + : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} + + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { + // +0.0 and -0.0 become false, everything else becomes true. 
+ return (x & 0x7fff) != 0; + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { + return static_cast<signed char>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const { + return static_cast<unsigned char>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const { + return static_cast<short>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const { + return static_cast<unsigned short>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const { + return static_cast<int>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const { + return static_cast<unsigned int>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const { + return static_cast<long>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const { + return static_cast<unsigned long>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const { + return static_cast<long long>(half_impl::half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const { + return static_cast<unsigned long long>(half_to_float(*this)); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { + return half_impl::half_to_float(*this); + } + EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { + return static_cast<double>(half_impl::half_to_float(*this)); + } + + EIGEN_DEVICE_FUNC half& operator=(const half& other) { + x = other.x; + return *this; + } +}; + +namespace half_impl { + +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + +// Intrinsics for native fp16 support. Note that on current hardware, +// these are no faster than fp32 arithmetic (you need to use the half2 +// versions to get the ALU speed increased), but you do save the +// conversion steps back and forth. + +__device__ half operator + (const half& a, const half& b) { + return __hadd(a, b); +} +__device__ half operator * (const half& a, const half& b) { + return __hmul(a, b); +} +__device__ half operator - (const half& a, const half& b) { + return __hsub(a, b); +} +__device__ half operator / (const half& a, const half& b) { + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +} +__device__ half operator - (const half& a) { + return __hneg(a); +} +__device__ half& operator += (half& a, const half& b) { + a = a + b; + return a; +} +__device__ half& operator *= (half& a, const half& b) { + a = a * b; + return a; +} +__device__ half& operator -= (half& a, const half& b) { + a = a - b; + return a; +} +__device__ half& operator /= (half& a, const half& b) { + a = a / b; + return a; +} +__device__ bool operator == (const half& a, const half& b) { + return __heq(a, b); +} +__device__ bool operator != (const half& a, const half& b) { + return __hne(a, b); +} +__device__ bool operator < (const half& a, const half& b) { + return __hlt(a, b); +} +__device__ bool operator <= (const half& a, const half& b) { + return __hle(a, b); +} +__device__ bool operator > (const half& a, const half& b) { + return __hgt(a, b); +} +__device__ bool operator >= (const half& a, const half& b) { + return __hge(a, b); +} + +#else // Emulate support for half floats + +// Definitions for CPUs and older CUDA, mostly working through conversion +// to/from fp32. 
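As a rough usage sketch (host side, illustration only; assumes this header is reachable through Eigen/Core as in Eigen 3.3), the emulated operators let Eigen::half be used like an ordinary scalar, with every operation round-tripping through float on the CPU:

#include <Eigen/Core>
#include <iostream>

int main() {
  Eigen::half a(1.5f), b(0.25f);
  Eigen::half c = a * b + b;            // each op converts to float and back
  std::cout << static_cast<float>(c)    // explicit cast back to float
            << std::endl;               // prints 0.625
  return 0;
}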
+ +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(float(a) + float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(float(a) * float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(float(a) - float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(float(a) / float(b)); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return float(a) == float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return float(a) != float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return float(a) < float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) { + return float(a) <= float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return float(a) > float(b); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) { + return float(a) >= float(b); +} + +#endif // Emulate support for half floats + +// Division by an index. Do it in full float precision to avoid accuracy +// issues in converting the denominator to half. +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { + return half(static_cast<float>(a) / static_cast<float>(b)); +} + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { + __half h; + h.x = x; + return h; +} + +union FP32 { + unsigned int u; + float f; +}; + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(ff); + +#elif defined(EIGEN_HAS_FP16_C) + __half h; + h.x = _cvtss_sh(ff, 0); + return h; + +#else + FP32 f; f.f = ff; + + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + __half o; + o.x = static_cast<unsigned short>(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). 
+ + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = static_cast<unsigned short>(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.x = static_cast<unsigned short>(f.u >> 13); + } + } + + o.x |= static_cast<unsigned short>(sign >> 16); + return o; +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __half2float(h); + +#elif defined(EIGEN_HAS_FP16_C) + return _cvtsh_ss(h.x); + +#else + const FP32 magic = { 113 << 23 }; + const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + FP32 o; + + o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // renormalize + } + + o.u |= (h.x & 0x8000) << 16; // sign bit + return o.f; +#endif +} + +// --- standard functions --- + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { + return (a.x & 0x7fff) == 0x7c00; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hisnan(a); +#else + return (a.x & 0x7fff) > 0x7c00; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) { + return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a)); +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { + half result; + result.x = a.x & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { + return half(numext::expm1(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return half(::hlog(a)); +#else + return half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { + return half(numext::log1p(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { + return half(::log10f(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif +} 
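To make the bit-level conversions above concrete, here is a tiny host-side check (illustration only; assumes the CPU fallback path in which __half is the plain struct with a public x member defined earlier in this header): 1.0f maps to the half bit pattern 0x3c00, anything above the largest finite half overflows to +inf (0x7c00), and exactly representable values round-trip losslessly.

#include <cassert>
#include <Eigen/Core>

int main() {
  using Eigen::half_impl::float_to_half_rtne;
  using Eigen::half_impl::half_to_float;

  assert(float_to_half_rtne(1.0f).x == 0x3c00);    // sign 0, exponent 127 -> 15, zero mantissa
  assert(float_to_half_rtne(1.0e6f).x == 0x7c00);  // above 65504, clamped to +inf
  assert(half_to_float(float_to_half_rtne(0.5f)) == 0.5f);  // lossless round trip
  return 0;
}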
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { + return half(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { + return half(::sinf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { + return half(::cosf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { + return half(::tanf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { + return half(::tanhf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 + return half(hfloor(a)); +#else + return half(::floorf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 + return half(hceil(a)); +#else + return half(::ceilf(float(a))); +#endif +} + +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(b, a) ? b : a; +#else + const float f1 = static_cast<float>(a); + const float f2 = static_cast<float>(b); + return f2 < f1 ? b : a; +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(a, b) ? b : a; +#else + const float f1 = static_cast<float>(a); + const float f2 = static_cast<float>(b); + return f1 < f2 ? b : a; +#endif +} + +EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) { + os << static_cast<float>(v); + return os; +} + +} // end namespace half_impl + +// import Eigen::half_impl::half into Eigen namespace +// using half_impl::half; + +namespace internal { + +template<> +struct random_default_impl<half, false, false> +{ + static inline half run(const half& x, const half& y) + { + return x + (y-x) * half(float(std::rand()) / float(RAND_MAX)); + } + static inline half run() + { + return run(half(-1.f), half(1.f)); + } +}; + +template<> struct is_arithmetic<half> { enum { value = true }; }; + +} // end namespace internal + +template<> struct NumTraits<Eigen::half> + : GenericNumTraits<Eigen::half> +{ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { + return half_impl::raw_uint16_to_half(0x0800); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { + return half_impl::raw_uint16_to_half(0x7bff); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() { + return half_impl::raw_uint16_to_half(0xfbff); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() { + return half_impl::raw_uint16_to_half(0x7c00); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { + return half_impl::raw_uint16_to_half(0x7c01); + } +}; + +} // end namespace Eigen + +// C-like standard mathematical functions and trancendentals. 
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) { + Eigen::half result; + result.x = a.x & 0x7FFF; + return result; +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { + return Eigen::half(::expf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return Eigen::half(::hlog(a)); +#else + return Eigen::half(::logf(float(a))); +#endif +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { + return Eigen::half(::sqrtf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) { + return Eigen::half(::powf(float(a), float(b))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { + return Eigen::half(::floorf(float(a))); +} +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) { + return Eigen::half(::ceilf(float(a))); +} + +namespace std { + +#if __cplusplus > 199711L +template <> +struct hash<Eigen::half> { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const { + return static_cast<std::size_t>(a.x); + } +}; +#endif + +} // end namespace std + + +// Add the missing shfl_xor intrinsic +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width)); +} +#endif + +// ldg() has an overload for __half, but we also need one for Eigen::half. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { + return Eigen::half_impl::raw_uint16_to_half( + __ldg(reinterpret_cast<const unsigned short*>(ptr))); +} +#endif + + +#if defined(__CUDA_ARCH__) +namespace Eigen { +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isnan)(const Eigen::half& h) { + return (half_impl::isnan)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isinf)(const Eigen::half& h) { + return (half_impl::isinf)(h); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +bool (isfinite)(const Eigen::half& h) { + return (half_impl::isfinite)(h); +} + +} // namespace Eigen +} // namespace numext +#endif + +#endif // EIGEN_HALF_CUDA_H diff --git a/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h b/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h new file mode 100644 index 0000000..987a529 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H +#define EIGEN_MATH_FUNCTIONS_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 plog<float4>(const float4& a) +{ + return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 plog<double2>(const double2& a) +{ + using ::log; + return make_double2(log(a.x), log(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 plog1p<float4>(const float4& a) +{ + return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 plog1p<double2>(const double2& a) +{ + return make_double2(log1p(a.x), log1p(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pexp<float4>(const float4& a) +{ + return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pexp<double2>(const double2& a) +{ + using ::exp; + return make_double2(exp(a.x), exp(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pexpm1<float4>(const float4& a) +{ + return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pexpm1<double2>(const double2& a) +{ + return make_double2(expm1(a.x), expm1(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 psqrt<float4>(const float4& a) +{ + return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 psqrt<double2>(const double2& a) +{ + using ::sqrt; + return make_double2(sqrt(a.x), sqrt(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 prsqrt<float4>(const float4& a) +{ + return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 prsqrt<double2>(const double2& a) +{ + return make_double2(rsqrt(a.x), rsqrt(a.y)); +} + + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h b/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h new file mode 100644 index 0000000..8c46af0 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -0,0 +1,333 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) 
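A minimal sketch of how this packet layer is meant to be consumed from device code (hypothetical kernel, bounds handling omitted; assumes a .cu file compiled by nvcc with EIGEN_USE_GPU defined so the specializations below are visible):

#include <Eigen/Core>

// Computes out = a*x + y four floats at a time with the float4 packet ops.
__global__ void axpy4(const float* x, const float* y, float* out, float a) {
  using namespace Eigen::internal;
  const int i = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
  float4 xp = ploadu<float4>(x + i);   // per-lane unaligned load
  float4 yp = ploadu<float4>(y + i);
  pstoreu(out + i, padd(pmul(pset1<float4>(a), xp), yp));
}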
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic<float4> { enum { value = true }; }; +template<> struct is_arithmetic<double2> { enum { value = true }; }; + +template<> struct packet_traits<float> : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, + HasBetaInc = 1, + + HasBlend = 0, + }; +}; + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, + HasBetaInc = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; }; +template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) { + return make_double2(from, from); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) { + return *reinterpret_cast<const float4*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) { + return *reinterpret_cast<const double2*>(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) { + *reinterpret_cast<float4*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) { + *reinterpret_cast<double2*>(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return __ldg((const float4*)from); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return __ldg((const double2*)from); +#else + return make_double2(from[0], from[1]); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return make_double2(__ldg(from+0), __ldg(from+1)); +#else + return make_double2(from[0], from[1]); +#endif +} + +template<> EIGEN_DEVICE_FUNC inline float4 
pgather<float, float4>(const float* from, Index stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) { + return fmin(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) { + return a.x * a.y * a.z * a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) { + return a.x * a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) { + return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) { + return make_double2(fabs(a.x), fabs(a.y)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<float4,4>& kernel) { + float tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + + tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<double2,2>& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_CUDA_H diff --git a/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h new file mode 100644 index 0000000..b9a125b --- /dev/null +++ b/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -0,0 +1,1132 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H +#define EIGEN_PACKET_MATH_HALF_CUDA_H + + +namespace Eigen { +namespace internal { + +// Most of the following operations require arch >= 3.0 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + +template<> struct is_arithmetic<half2> { enum { value = true }; }; + +template<> struct packet_traits<Eigen::half> : default_packet_traits +{ + typedef half2 type; + typedef half2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + HasAdd = 1, + HasMul = 1, + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasExpm1 = 1, + HasLog = 1, + HasLog1p = 1 + }; +}; + +template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; + +template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) { + return __half2half2(from); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) { + return *reinterpret_cast<const half2*>(from); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) { + *reinterpret_cast<half2*>(to) = from; +} + +template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) { + to[0] = __low2half(from); + to[1] = __high2half(from); +} + +template<> + __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) { +#if __CUDA_ARCH__ >= 350 + return __ldg((const half2*)from); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif +} + +template<> +__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) { +#if __CUDA_ARCH__ >= 350 + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) { + return __low2half(a); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) { + half2 result; + result.x = a.x & 0x7FFF7FFF; + return result; +} + + +__device__ EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<half2,2>& kernel) { + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) { +#if 
__CUDA_ARCH__ >= 530 + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +#else + float f = __half2float(a) + 1.0f; + return __halves2half2(a, __float2half(f)); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 + return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 + return __hsub2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 - b1; + float r2 = a2 - b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if __CUDA_ARCH__ >= 530 + return __hneg2(a); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return __floats2half2_rn(-a1, -a2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 + return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) { +#if __CUDA_ARCH__ >= 530 + return __hfma2(a, b, c); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float c1 = __low2float(c); + float c2 = __high2float(c); + float r1 = a1 * b1 + c1; + float r2 = a2 * b2 + c2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? 
__high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) { +#if __CUDA_ARCH__ >= 530 + return __hadd(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) { +#if __CUDA_ARCH__ >= 530 + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 > a2 ? __low2half(a) : __high2half(a); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) { +#if __CUDA_ARCH__ >= 530 + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 < a2 ? __low2half(a) : __high2half(a); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) { +#if __CUDA_ARCH__ >= 530 + return __hmul(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = log1pf(a1); + float r2 = log1pf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expm1f(a1); + float r2 = expm1f(a2); + return __floats2half2_rn(r1, r2); +} + +#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 + +template<> __device__ EIGEN_STRONG_INLINE +half2 plog<half2>(const half2& a) { + return h2log(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 pexp<half2>(const half2& a) { + return h2exp(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 psqrt<half2>(const half2& a) { + return h2sqrt(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 prsqrt<half2>(const half2& a) { + return h2rsqrt(a); +} + +#else + +template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = logf(a1); + float r2 = logf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expf(a1); + float r2 = expf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = sqrtf(a1); + float r2 = sqrtf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = rsqrtf(a1); + float r2 = rsqrtf(a2); + return __floats2half2_rn(r1, r2); +} + +#endif + +#elif defined EIGEN_VECTORIZE_AVX512 + +typedef struct { + __m256i x; +} Packet16h; + + +template<> struct is_arithmetic<Packet16h> { enum { value = true }; }; + +template <> +struct 
packet_traits<half> : default_packet_traits { + typedef Packet16h type; + // There is no half-size packet for Packet16h. + typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; + +template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) { + Packet16h result; + result.x = _mm256_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) { + return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) { + _mm256_store_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) { + _mm256_storeu_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploadquad(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtph_ps(a.x); +#else + EIGEN_ALIGN64 half aux[16]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + float f8(aux[8]); + float f9(aux[9]); + float fa(aux[10]); + float fb(aux[11]); + float fc(aux[12]); + float fd(aux[13]); + float fe(aux[14]); + float ff(aux[15]); + + return _mm512_set_ps( + ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet16h result; + result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN64 float aux[16]; + pstore(aux, a); + half h0(aux[0]); + half h1(aux[1]); + half h2(aux[2]); + half h3(aux[3]); + half h4(aux[4]); + half h5(aux[5]); + half h6(aux[6]); + half h7(aux[7]); + half h8(aux[8]); + half h9(aux[9]); + half ha(aux[10]); + half hb(aux[11]); + half hc(aux[12]); + half hd(aux[13]); + half he(aux[14]); + half hf(aux[15]); + + Packet16h result; + result.x = _mm256_set_epi16( + hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, + h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = padd(af, 
bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) +{ + Packet16h result; + result.x = _mm256_set_epi16( + from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, + from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, + from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, + from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) +{ + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; + to[stride*8].x = aux[8].x; + to[stride*9].x = aux[9].x; + to[stride*10].x = aux[10].x; + to[stride*11].x = aux[11].x; + to[stride*12].x = aux[12].x; + to[stride*13].x = aux[13].x; + to[stride*14].x = aux[14].x; + to[stride*15].x = aux[15].x; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet16h,16>& kernel) { + __m256i a = kernel.packet[0].x; + __m256i b = kernel.packet[1].x; + __m256i c = kernel.packet[2].x; + __m256i d = kernel.packet[3].x; + __m256i e = kernel.packet[4].x; + __m256i f = kernel.packet[5].x; + __m256i g = kernel.packet[6].x; + __m256i h = kernel.packet[7].x; + __m256i i = kernel.packet[8].x; + __m256i j = kernel.packet[9].x; + __m256i k = kernel.packet[10].x; + __m256i l = kernel.packet[11].x; + __m256i m = kernel.packet[12].x; + __m256i n = kernel.packet[13].x; + __m256i o = kernel.packet[14].x; + __m256i p = kernel.packet[15].x; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = 
_mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. + __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0].x = a_p_0; + kernel.packet[1].x = a_p_1; + kernel.packet[2].x = a_p_2; + kernel.packet[3].x = a_p_3; + kernel.packet[4].x = a_p_4; + kernel.packet[5].x = a_p_5; + kernel.packet[6].x = a_p_6; + kernel.packet[7].x = a_p_7; + kernel.packet[8].x = a_p_8; + kernel.packet[9].x = a_p_9; + kernel.packet[10].x = a_p_a; + kernel.packet[11].x = a_p_b; + kernel.packet[12].x = a_p_c; + kernel.packet[13].x = a_p_d; + kernel.packet[14].x = a_p_e; + kernel.packet[15].x = a_p_f; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet16h,8>& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore<half>(in[0], kernel.packet[0]); + pstore<half>(in[1], kernel.packet[1]); + pstore<half>(in[2], kernel.packet[2]); + pstore<half>(in[3], kernel.packet[3]); + pstore<half>(in[4], kernel.packet[4]); + pstore<half>(in[5], kernel.packet[5]); + pstore<half>(in[6], kernel.packet[6]); 
+ pstore<half>(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j+8] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload<Packet16h>(out[0]); + kernel.packet[1] = pload<Packet16h>(out[1]); + kernel.packet[2] = pload<Packet16h>(out[2]); + kernel.packet[3] = pload<Packet16h>(out[3]); + kernel.packet[4] = pload<Packet16h>(out[4]); + kernel.packet[5] = pload<Packet16h>(out[5]); + kernel.packet[6] = pload<Packet16h>(out[6]); + kernel.packet[7] = pload<Packet16h>(out[7]); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet16h,4>& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore<half>(in[0], kernel.packet[0]); + pstore<half>(in[1], kernel.packet[1]); + pstore<half>(in[2], kernel.packet[2]); + pstore<half>(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][4*i+1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+8] = in[j][4*i+2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+12] = in[j][4*i+3]; + } + } + + kernel.packet[0] = pload<Packet16h>(out[0]); + kernel.packet[1] = pload<Packet16h>(out[1]); + kernel.packet[2] = pload<Packet16h>(out[2]); + kernel.packet[3] = pload<Packet16h>(out[3]); +} + + +#elif defined EIGEN_VECTORIZE_AVX + +typedef struct { + __m128i x; +} Packet8h; + + +template<> struct is_arithmetic<Packet8h> { enum { value = true }; }; + +template <> +struct packet_traits<Eigen::half> : default_packet_traits { + typedef Packet8h type; + // There is no half-size packet for Packet8h. + typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) { + Packet8h result; + result.x = _mm_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) { + return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) { + Packet8h result; + result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) { + Packet8h result; + result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad<Packet8h>(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); + return result; +} + 
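+// Arithmetic on Packet8h goes through single precision: half2float()/float2half()
+// below widen a Packet8h to a Packet8f and narrow it back (via F16C when
+// EIGEN_HAS_FP16_C is defined, element by element otherwise), and the padd/pmul
+// specializations further down are simply that round trip around the matching
+// Packet8f operation. Other element-wise ops could be composed the same way;
+// a minimal sketch of the pattern (psub is not actually provided for Packet8h,
+// the traits above set HasSub = 0; this is only an illustration):
+//
+//   template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+//     return float2half(psub(half2float(a), half2float(b)));
+//   }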
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a.x); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet8h result; + result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + Eigen::half h4(aux[4]); + Eigen::half h5(aux[5]); + Eigen::half h6(aux[6]); + Eigen::half h7(aux[7]); + + Packet8h result; + result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) +{ + Packet8h result; + result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) +{ + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux<Packet8f>(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max<Packet8f>(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min<Packet8f>(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul<Packet8f>(af); + return Eigen::half(reduced); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet8h,8>& kernel) { + __m128i a = kernel.packet[0].x; + __m128i b = kernel.packet[1].x; + __m128i c = kernel.packet[2].x; + __m128i d = kernel.packet[3].x; + __m128i e = kernel.packet[4].x; + __m128i f = kernel.packet[5].x; + __m128i g = kernel.packet[6].x; + __m128i h = kernel.packet[7].x; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = 
_mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0].x = a0b0c0d0e0f0g0h0; + kernel.packet[1].x = a1b1c1d1e1f1g1h1; + kernel.packet[2].x = a2b2c2d2e2f2g2h2; + kernel.packet[3].x = a3b3c3d3e3f3g3h3; + kernel.packet[4].x = a4b4c4d4e4f4g4h4; + kernel.packet[5].x = a5b5c5d5e5f5g5h5; + kernel.packet[6].x = a6b6c6d6e6f6g6h6; + kernel.packet[7].x = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet8h,4>& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore<Eigen::half>(in[0], kernel.packet[0]); + pstore<Eigen::half>(in[1], kernel.packet[1]); + pstore<Eigen::half>(in[2], kernel.packet[2]); + pstore<Eigen::half>(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload<Packet8h>(out[0]); + kernel.packet[1] = pload<Packet8h>(out[1]); + kernel.packet[2] = pload<Packet8h>(out[2]); + kernel.packet[3] = pload<Packet8h>(out[3]); +} + + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic<Packet4h> { enum { value = true }; }; + +template <> +struct packet_traits<Eigen::half> : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. 
+ typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x))); +} + +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16)); + h[1] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48)); + h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> 
EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad<Packet4h>(const Eigen::half* from) { + return pset1<Packet4h>(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride) +{ + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast<unsigned short>(a); + to[stride*1].x = static_cast<unsigned short>(a >> 16); + to[stride*2].x = static_cast<unsigned short>(a >> 32); + to[stride*3].x = static_cast<unsigned short>(a >> 48); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock<Packet4h,4>& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); +} + +#endif + +} +} + +#endif // EIGEN_PACKET_MATH_HALF_CUDA_H diff --git a/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h b/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h new file mode 100644 index 0000000..aa5fbce --- /dev/null +++ b/eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
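+
+// This header collects the Eigen::half conversion machinery: scalar_cast_op
+// specializations for half <-> float and int -> half (with PacketAccess = false),
+// plus type_casting_traits and pcast overloads for whichever half packet type is
+// active (half2 on CUDA devices, Packet16h with AVX512, Packet8h with AVX). The
+// pcast overloads are the building block that vectorized conversion paths (for
+// instance the Tensor module's packet converters) are expected to use, e.g. on
+// the AVX path:
+//
+//   Packet8f f = pcast<Packet8h, Packet8f>(h);   // widen 8 half values to 8 floats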
+ +#ifndef EIGEN_TYPE_CASTING_CUDA_H +#define EIGEN_TYPE_CASTING_CUDA_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op<float, Eigen::half> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<float, Eigen::half> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op<int, Eigen::half> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(static_cast<float>(a)); + #else + return Eigen::half(static_cast<float>(a)); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<int, Eigen::half> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op<Eigen::half, float> { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __half2float(a); + #else + return static_cast<float>(a); + #endif + } +}; + +template<> +struct functor_traits<scalar_cast_op<Eigen::half, float> > +{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; }; + + + +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + +template <> +struct type_casting_traits<Eigen::half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) { + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +} + +template <> +struct type_casting_traits<float, Eigen::half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) { + // Simply discard the second half of the input + return __floats2half2_rn(a.x, a.y); +} + +#elif defined EIGEN_VECTORIZE_AVX512 +template <> +struct type_casting_traits<half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits<float, half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) { + return float2half(a); +} + +#elif defined EIGEN_VECTORIZE_AVX + +template <> +struct type_casting_traits<Eigen::half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits<float, Eigen::half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + 
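+// The Src/TgtCoeffRatio pairs above describe how many packets a pcast call
+// consumes and produces: the CUDA half2 -> float4 cast declares 2:1 and takes
+// two half2 arguments (four half values) to fill one float4, the float4 -> half2
+// direction declares 1:2 and each call converts only the lower two floats (as
+// noted above), while the AVX and AVX512 casts are 1:1 since Packet8h/Packet16h
+// carry exactly as many coefficients as Packet8f/Packet16f.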
+template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) { + return float2half(a); +} + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +template <> +struct type_casting_traits<Eigen::half, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64)); + float f1 = static_cast<float>(h); + h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16)); + float f2 = static_cast<float>(h); + h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32)); + float f3 = static_cast<float>(h); + h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48)); + float f4 = static_cast<float>(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits<float, Eigen::half> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_CUDA_H diff --git a/eigen/Eigen/src/Core/arch/Default/CMakeLists.txt b/eigen/Eigen/src/Core/arch/Default/CMakeLists.txt deleted file mode 100644 index 339c091..0000000 --- a/eigen/Eigen/src/Core/arch/Default/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Core_arch_Default_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Core_arch_Default_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/Default COMPONENT Devel -) diff --git a/eigen/Eigen/src/Core/arch/NEON/CMakeLists.txt b/eigen/Eigen/src/Core/arch/NEON/CMakeLists.txt deleted file mode 100644 index fd4d4af..0000000 --- a/eigen/Eigen/src/Core/arch/NEON/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Core_arch_NEON_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Core_arch_NEON_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/NEON COMPONENT Devel -) diff --git a/eigen/Eigen/src/Core/arch/NEON/Complex.h b/eigen/Eigen/src/Core/arch/NEON/Complex.h index 8d9255e..57e9b43 100644 --- a/eigen/Eigen/src/Core/arch/NEON/Complex.h +++ b/eigen/Eigen/src/Core/arch/NEON/Complex.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org> // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -14,8 +15,21 @@ namespace Eigen { namespace internal { -static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000); -static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000); +inline uint32x4_t p4ui_CONJ_XOR() { +// See bug 1325, clang fails to call vld1q_u64. 
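+// The mask below sets only the sign bit of the two imaginary lanes, so XOR-ing a
+// Packet2cf (laid out as re0, im0, re1, im1) with it negates the imaginary parts,
+// i.e. computes the complex conjugate.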
+#if EIGEN_COMP_CLANG + uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; + return ret; +#else + static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; + return vld1q_u32( conj_XOR_DATA ); +#endif +} + +inline uint32x2_t p2ui_CONJ_XOR() { + static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 }; + return vld1_u32( conj_XOR_DATA ); +} //---------- float ---------- struct Packet2cf @@ -28,10 +42,12 @@ struct Packet2cf template<> struct packet_traits<std::complex<float> > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -46,7 +62,7 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits }; }; -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) { @@ -62,7 +78,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { Packet4ui b = vreinterpretq_u32_f32(a.v); - return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR))); + return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) @@ -71,14 +87,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con // Get the real values of a | a1_re | a1_re | a2_re | a2_re | v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0)); - // Get the real values of a | a1_im | a1_im | a2_im | a2_im | + // Get the imag values of a | a1_im | a1_im | a2_im | a2_im | v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1)); // Multiply the real a with b v1 = vmulq_f32(v1, b.v); // Multiply the imag a with b v2 = vmulq_f32(v2, b.v); // Conjugate v2 - v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR)); + v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR())); // Swap real/imag elements in v2. 
v2 = vrev64q_f32(v2); // Add and return the result @@ -87,7 +103,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { - return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); + return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); } template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { @@ -110,6 +126,22 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex< template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride) +{ + Packet4f res = pset1<Packet4f>(0.f); + res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); + res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); + res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); + res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); + return Packet2cf(res); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride) +{ + to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); + to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); +} + template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ARM_PREFETCH((float *)addr); } template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) @@ -177,7 +209,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P // Multiply the imag a with b v2 = vmul_f32(v2, a2); // Conjugate v2 - v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR)); + v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); // Swap real/imag elements in v2. v2 = vrev64_f32(v2); // Add v1, v2 @@ -235,7 +267,7 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true> template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for AltiVec + // TODO optimize it for NEON Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b); Packet4f s, rev_s; @@ -246,6 +278,207 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); } +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2cf,2>& kernel) { + Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); + kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); + kernel.packet[1].v = tmp; +} + +//---------- double ---------- +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG + +// See bug 1325, clang fails to call vld1q_u64. 
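+// Like p4ui_CONJ_XOR() above, this mask holds the sign bit of the imaginary lane:
+// pconj below XORs it in directly to negate the imaginary part, and pmul uses it
+// to flip the sign of the ai*bi lane before swapping lanes and adding, giving
+// (ar*br - ai*bi, ar*bi + ai*br), the usual complex product.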
+#if EIGEN_COMP_CLANG + static uint64x2_t p2ul_CONJ_XOR = {0x0, 0x8000000000000000}; +#else + const uint64_t p2ul_conj_XOR_DATA[] = { 0x0, 0x8000000000000000 }; + static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA ); +#endif + +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; + +template<> struct packet_traits<std::complex<double> > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; + +template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) +{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); } + +template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } + +template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d v1, v2; + + // Get the real values of a + v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); + // Get the imag values of a + v2 = vdupq_lane_f64(vget_high_f64(a.v), 0); + // Multiply the real a with b + v1 = vmulq_f64(v1, b.v); + // Multiply the imag a with b + v2 = vmulq_f64(v2, b.v); + // Conjugate v2 + v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); + // Swap real/imag elements in v2. 
+ v2 = preverse<Packet2d>(v2); + // Add and return the result + return Packet1cd(vaddq_f64(v1, v2)); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pand <Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); } + +template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ARM_PREFETCH((double *)addr); } + +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride) +{ + Packet2d res = pset1<Packet2d>(0.0); + res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); + res = vsetq_lane_f64(std::imag(from[0*stride]), res, 1); + return Packet1cd(res); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride) +{ + to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); +} + + +template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) +{ + std::complex<double> EIGEN_ALIGN16 res; + pstore<std::complex<double> >(&res, a); + + return res; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } + +template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); } + +template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; } + +template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); } + +template<int Offset> +struct palign_impl<Offset,Packet1cd> +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... 
+ } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, false,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for NEON + Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); + Packet2d s = pmul<Packet2d>(b.v, b.v); + Packet2d rev_s = preverse<Packet2d>(s); + + return Packet1cd(pdiv(res.v, padd<Packet2d>(s,rev_s))); +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel) +{ + Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v)); + kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v)); + kernel.packet[1].v = tmp; +} +#endif // EIGEN_ARCH_ARM64 + } // end namespace internal } // end namespace Eigen diff --git a/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h b/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h new file mode 100644 index 0000000..6bb05bb --- /dev/null +++ b/eigen/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -0,0 +1,91 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
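Note: the three conj_helper<Packet1cd,Packet1cd,...> specializations above differ only in which operand is conjugated before the multiply; the (true,true) case uses conj(a)*conj(b) == conj(a*b). A scalar sketch of the intended semantics (illustration only, not Eigen source):

#include <complex>

// Scalar meaning of conj_helper<P,P,ConjLhs,ConjRhs>::pmul (sketch):
// conjugate whichever operands the flags name, then multiply.
template<bool ConjLhs, bool ConjRhs>
std::complex<double> conj_mul_reference(std::complex<double> a, std::complex<double> b)
{
  if (ConjLhs) a = std::conj(a);
  if (ConjRhs) b = std::conj(b);
  return a * b;   // for (true,true) this equals conj(a*b), which is what the code above computes
}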
+ +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_NEON_H +#define EIGEN_MATH_FUNCTIONS_NEON_H + +namespace Eigen { + +namespace internal { + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp<Packet4f>(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4f tmp, fx; + + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + + x = vminq_f32(x, p4f_exp_hi); + x = vmaxq_f32(x, p4f_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); + + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + Packet4ui mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, p4f_cephes_exp_C1); + Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, p4f_cephes_exp_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, p4f_1); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, p4i_0x7f); + mm = vshlq_n_s32(mm, 23); + Packet4f pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_NEON_H diff --git a/eigen/Eigen/src/Core/arch/NEON/PacketMath.h b/eigen/Eigen/src/Core/arch/NEON/PacketMath.h index d49670e..aede4a6 100644 --- a/eigen/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/eigen/Eigen/src/Core/arch/NEON/PacketMath.h @@ -2,7 +2,7 @@ // for linear algebra. // // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> -// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr> +// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org> // Heavily based on Gael's SSE version. 
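Note: the pexp kernel above follows the Cephes recipe: clamp x, split x = m*ln(2) + g with m roughly round(x*log2(e)), evaluate a degree-5 polynomial for exp(g), and rebuild 2^m by writing m+127 into the exponent field of an IEEE float. A scalar sketch of the same steps (the clamp and the explicit floor correction of the vector code are omitted; constants copied from the kernel above):

#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar outline of the Cephes-style exp used above (sketch, not the vector code).
static float exp_sketch(float x)
{
  const float LOG2E = 1.44269504088896341f;
  const float C1 = 0.693359375f, C2 = -2.12194440e-4f;   // ln(2) split in two parts for accuracy

  float m = std::floor(x * LOG2E + 0.5f);                // m = floor(x*log2(e) + 0.5)
  x -= m * C1;                                           // range reduction: x <- x - m*ln(2)
  x -= m * C2;                                           // second, small correction term

  // Degree-5 polynomial approximation of exp on the reduced range.
  float y = 1.9875691500e-4f;
  y = y * x + 1.3981999507e-3f;
  y = y * x + 8.3334519073e-3f;
  y = y * x + 4.1665795894e-2f;
  y = y * x + 1.6666665459e-1f;
  y = y * x + 5.0000001201e-1f;
  y = y * x * x + x + 1.0f;                              // poly*x^2 + x + 1

  // Build 2^m by placing (m + 127) in the exponent bits of an IEEE float.
  int32_t e = static_cast<int32_t>(m) + 0x7f;
  uint32_t bits = static_cast<uint32_t>(e) << 23;
  float pow2m;
  std::memcpy(&pow2m, &bits, sizeof(pow2m));
  return y * pow2m;
}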
// // This Source Code Form is subject to the terms of the Mozilla @@ -20,43 +20,48 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -// FIXME NEON has 16 quad registers, but since the current register allocator -// is so bad, it is much better to reduce it to 8 +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#endif + #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8 +#if EIGEN_ARCH_ARM64 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#else +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#endif #endif +typedef float32x2_t Packet2f; typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; +typedef int32x2_t Packet2i; typedef uint32x4_t Packet4ui; #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1<Packet4f>(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X)) + const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1<Packet4i>(X) -#if defined(__llvm__) && !defined(__clang__) - //Special treatment for Apple's llvm-gcc, its NEON packet types are unions - #define EIGEN_INIT_NEON_PACKET2(X, Y) {{X, Y}} - #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}} -#else - //Default initializer for packets - #define EIGEN_INIT_NEON_PACKET2(X, Y) {X, Y} - #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W} -#endif - -// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function -// which available on LLVM and GCC (at least) -#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || defined(__GNUC__) +#if EIGEN_ARCH_ARM64 + // __builtin_prefetch tends to do nothing on ARM64 compilers because the + // prefetch instructions there are too detailed for __builtin_prefetch to map + // meaningfully to them. 
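Note: _EIGEN_DECLARE_CONST_Packet4f_FROM_INT above now splats an int32_t and reinterprets the lanes; that is how bit-pattern constants (sign masks, exponent patterns such as minus_inf) are materialized as float packets. The pattern in isolation (sketch, not the macro itself):

#include <arm_neon.h>

// Materializing a float packet from a raw 32-bit pattern (sketch):
// splat the pattern, then view the same bits as four floats.
static float32x4_t make_float_from_bits(uint32_t pattern)
{
  uint32x4_t bits = vdupq_n_u32(pattern);      // e.g. 0x80000000 for a sign mask
  return vreinterpretq_f32_u32(bits);
}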
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : ); +#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #elif defined __pld #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif !defined(__aarch64__) - #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#elif EIGEN_ARCH_ARM32 + #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); #else // by default no explicit prefetching #define EIGEN_ARM_PREFETCH(ADDR) @@ -65,53 +70,60 @@ typedef uint32x4_t Packet4ui; template<> struct packet_traits<float> : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; // Packet2f intrinsics not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, size = 4, + HasHalfPacket=0, // Packet2f intrinsics not implemented yet HasDiv = 1, // FIXME check the Has* HasSin = 0, HasCos = 0, HasLog = 0, - HasExp = 0, + HasExp = 1, HasSqrt = 0 }; }; -template<> struct packet_traits<int> : default_packet_traits +template<> struct packet_traits<int32_t> : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; // Packet2i intrinsics not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + HasHalfPacket=0 // Packet2i intrinsics not implemented yet // FIXME check the Has* }; }; -#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__) +#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM // workaround gcc 4.2, 4.3 and 4.4 compilatin issue EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) +template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { - Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); + const float32_t f[] = {0, 1, 2, 3}; + Packet4f countdown = vld1q_f32(f); return vaddq_f32(pset1<Packet4f>(a), countdown); } -template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) +template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) { - Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3); + const int32_t i[] = {0, 1, 2, 3}; + Packet4i countdown = vld1q_s32(i); return vaddq_s32(pset1<Packet4i>(a), countdown); } @@ -132,6 
+144,9 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { +#if EIGEN_ARCH_ARM64 + return vdivq_f32(a,b); +#else Packet4f inv, restep, div; // NEON does not offer a divide instruction, we have to do a reciprocal approximation @@ -150,14 +165,51 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const div = vmulq_f32(a, inv); return div; +#endif } + template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) { eigen_assert(false && "packet integer division are not supported by NEON"); return pset1<Packet4i>(0); } -// for some weird raisons, it has to be overloaded for packet of integers -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); } +// Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, +// then implements a slow software scalar fallback calling fmaf()! +// Filed LLVM bug: +// https://llvm.org/bugs/show_bug.cgi?id=27216 +#if (defined __ARM_FEATURE_FMA) && !(EIGEN_COMP_CLANG && EIGEN_ARCH_ARM) +// See bug 936. +// FMA is available on VFPv4 i.e. when compiling with -mfpu=neon-vfpv4. +// FMA is a true fused multiply-add i.e. only 1 rounding at the end, no intermediate rounding. +// MLA is not fused i.e. does 2 roundings. +// In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): +// MLA: 10 GFlop/s ; FMA: 12 GFlops/s. +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } +#else +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { +#if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM + // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, + // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on + // -march=armv7-a, that is a very common case. + // See e.g. this thread: + // http://lists.llvm.org/pipermail/llvm-dev/2013-December/068806.html + // Filed LLVM bug: + // https://llvm.org/bugs/show_bug.cgi?id=27219 + Packet4f r = c; + asm volatile( + "vmla.f32 %q[r], %q[a], %q[b]" + : [r] "+w" (r) + : [a] "w" (a), + [b] "w" (b) + : ); + return r; +#else + return vmlaq_f32(c,a,b); +#endif +} +#endif + +// No FMA instruction for int, so use MLA unconditionally. 
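Note: on 32-bit ARM there is no vector divide, so the non-ARM64 branch of pdiv above starts from vrecpeq_f32's low-precision reciprocal estimate and refines it with Newton-Raphson steps (vrecpsq_f32(b, x) computes 2 - b*x) before multiplying by a; the refinement lines are elided from the hunk above. The standard recipe looks like this (a sketch, not the exact Eigen code):

#include <arm_neon.h>

// NEON division via reciprocal estimate plus two Newton-Raphson refinements (sketch).
static float32x4_t pdiv_sketch(float32x4_t a, float32x4_t b)
{
  float32x4_t inv = vrecpeq_f32(b);              // coarse estimate of 1/b
  inv = vmulq_f32(vrecpsq_f32(b, inv), inv);     // refine: inv *= (2 - b*inv)
  inv = vmulq_f32(vrecpsq_f32(b, inv), inv);     // second refinement step
  return vmulq_f32(a, inv);                      // a/b ~= a * (1/b)
}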
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } @@ -191,20 +243,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) +template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) { float32x2_t lo, hi; lo = vld1_dup_f32(from); hi = vld1_dup_f32(from+1); return vcombine_f32(lo, hi); } -template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) { int32x2_t lo, hi; lo = vld1_dup_s32(from); @@ -212,18 +264,52 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) return vcombine_s32(lo, hi); } -template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } + +template<> EIGEN_STRONG_INLINE void pstoreu<float> (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) +{ + Packet4f res = pset1<Packet4f>(0.f); + res = vsetq_lane_f32(from[0*stride], res, 0); + res = vsetq_lane_f32(from[1*stride], res, 1); + res = vsetq_lane_f32(from[2*stride], res, 2); + res = vsetq_lane_f32(from[3*stride], res, 3); + 
return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) +{ + Packet4i res = pset1<Packet4i>(0); + res = vsetq_lane_s32(from[0*stride], res, 0); + res = vsetq_lane_s32(from[1*stride], res, 1); + res = vsetq_lane_s32(from[2*stride], res, 2); + res = vsetq_lane_s32(from[3*stride], res, 3); + return res; +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) +{ + to[stride*0] = vgetq_lane_f32(from, 0); + to[stride*1] = vgetq_lane_f32(from, 1); + to[stride*2] = vgetq_lane_f32(from, 2); + to[stride*3] = vgetq_lane_f32(from, 3); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride) +{ + to[stride*0] = vgetq_lane_s32(from, 0); + to[stride*1] = vgetq_lane_s32(from, 1); + to[stride*2] = vgetq_lane_s32(from, 2); + to[stride*3] = vgetq_lane_s32(from, 3); +} -template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch<float> (const float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { float32x2_t a_lo, a_hi; @@ -243,6 +329,7 @@ template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { a_hi = vget_high_s32(a_r64); return vcombine_s32(a_hi, a_lo); } + template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } @@ -277,7 +364,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) return sum; } -template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) { int32x2_t a_lo, a_hi, sum; @@ -324,7 +411,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) return vget_lane_f32(prod, 0); } -template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) { int32x2_t a_lo, a_hi, prod; @@ -352,7 +439,7 @@ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) return vget_lane_f32(min, 0); } -template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) { int32x2_t a_lo, a_hi, min; @@ -377,7 +464,7 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) return vget_lane_f32(max, 0); } -template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t 
predux_max<Packet4i>(const Packet4i& a) { int32x2_t a_lo, a_hi, max; @@ -410,9 +497,231 @@ PALIGN_NEON(0,Packet4i,vextq_s32) PALIGN_NEON(1,Packet4i,vextq_s32) PALIGN_NEON(2,Packet4i,vextq_s32) PALIGN_NEON(3,Packet4i,vextq_s32) - + #undef PALIGN_NEON +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4f,4>& kernel) { + float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); + float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); + + kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); + kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); + kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); + kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4i,4>& kernel) { + int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); + int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); + kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); + kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); + kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); + kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +} + +//---------- double ---------- + +// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrisics for double. +// Confirmed at least with __apple_build_version__ = 6000054. +#ifdef __apple_build_version__ +// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed. +// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with +// major toolchain updates. +#define EIGEN_APPLE_DOUBLE_NEON_BUG (__apple_build_version__ < 6010000) +#else +#define EIGEN_APPLE_DOUBLE_NEON_BUG 0 +#endif + +#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG + +// Bug 907: workaround missing declarations of the following two functions in the ADK +// Defining these functions as templates ensures that if these intrinsics are +// already defined in arm_neon.h, then our workaround doesn't cause a conflict +// and has lower priority in overload resolution. 
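Note: the Packet4f/Packet4i ptranspose above performs a 4x4 transpose with two rounds of interleaving: vzipq pairs rows 0/1 and 2/3, then the low/high halves of the zipped results are recombined into columns. Spelled out for one concrete block (sketch, not part of the patch):

#include <arm_neon.h>

// Zip-based 4x4 transpose: rows {0..3},{4..7},{8..11},{12..15}
// become columns {0,4,8,12}, {1,5,9,13}, {2,6,10,14}, {3,7,11,15}.
static void transpose4x4_sketch(const float rows[16], float cols[16])
{
  float32x4_t r0 = vld1q_f32(rows + 0), r1 = vld1q_f32(rows + 4);
  float32x4_t r2 = vld1q_f32(rows + 8), r3 = vld1q_f32(rows + 12);

  float32x4x2_t t01 = vzipq_f32(r0, r1);   // val[0] = {r0[0],r1[0],r0[1],r1[1]}, val[1] = {r0[2],r1[2],r0[3],r1[3]}
  float32x4x2_t t23 = vzipq_f32(r2, r3);

  vst1q_f32(cols + 0,  vcombine_f32(vget_low_f32 (t01.val[0]), vget_low_f32 (t23.val[0])));
  vst1q_f32(cols + 4,  vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0])));
  vst1q_f32(cols + 8,  vcombine_f32(vget_low_f32 (t01.val[1]), vget_low_f32 (t23.val[1])));
  vst1q_f32(cols + 12, vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1])));
}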
+template <typename T> +uint64x2_t vreinterpretq_u64_f64(T a) +{ + return (uint64x2_t) a; +} + +template <typename T> +float64x2_t vreinterpretq_f64_u64(T a) +{ + return (float64x2_t) a; +} + +typedef float64x2_t Packet2d; +typedef float64x1_t Packet1d; + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket=0, + + HasDiv = 1, + // FIXME check the Has* + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 0, + HasSqrt = 0 + }; +}; + +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; + +template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) +{ + const double countdown_raw[] = {0.0,1.0}; + const Packet2d countdown = vld1q_f64(countdown_raw); + return vaddq_f64(pset1<Packet2d>(a), countdown); +} +template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); } + +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } + +#ifdef __ARM_FEATURE_FMA +// See bug 936. See above comment about FMA for float. 
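Note: the vreinterpretq_u64_f64 / vreinterpretq_f64_u64 templates above rely on overload resolution preferring a non-template function over a function template, so when arm_neon.h already provides the real intrinsics the templated fallback is never selected. The same mechanism in miniature (illustration only):

// Overload resolution prefers a non-template function over a function template,
// so a header-provided intrinsic (when present) wins over the templated fallback.
inline long pick(long x) { return x; }                    // stands in for the real intrinsic
template <typename T> inline long pick(T) { return -1; }  // stands in for the low-priority fallback

inline long pick_demo() { return pick(1L); }              // returns 1: the non-template overload is chosen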
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } +#else +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } +#endif + +template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } + +template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); } + +// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics +template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) +{ + return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} + +template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) +{ + return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} + +template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) +{ + return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} + +template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) +{ + return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} + +template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } + +template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) +{ + return vld1q_dup_f64(from); +} +template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); } + +template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); } + +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) +{ + Packet2d res = pset1<Packet2d>(0.0); + res = vsetq_lane_f64(from[0*stride], res, 0); + res = vsetq_lane_f64(from[1*stride], res, 1); + return res; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) +{ + to[stride*0] = vgetq_lane_f64(from, 0); + to[stride*1] = vgetq_lane_f64(from, 1); +} +template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); } + +// FIXME only store the 2 first elements ? 
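Note: NEON has no bitwise instructions on float64x2_t, so the Packet2d pand/por/pxor/pandnot above round-trip through uint64x2_t. The same reinterpret pattern gives cheap sign manipulation, for example an absolute value by clearing the IEEE sign bits (sketch, not part of the patch):

#include <arm_neon.h>

// abs(x) for two doubles by masking off the sign bits, using the same
// reinterpret-to-integer pattern as the bitwise packet ops above (sketch).
static float64x2_t abs_via_bitmask(float64x2_t x)
{
  uint64x2_t sign = vdupq_n_u64(0x8000000000000000ull);
  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(x), sign));   // x & ~sign
}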
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); } + +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } + +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } + +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +// workaround ICE, see bug 907 +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +#endif + +template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) +{ + float64x2_t trn1, trn2; + + // NEON zip performs interleaving of the supplied vectors. + // We perform two interleaves in a row to acquire the transposed vector + trn1 = vzip1q_f64(vecs[0], vecs[1]); + trn2 = vzip2q_f64(vecs[0], vecs[1]); + + // Do the addition of the resulting vectors + return vaddq_f64(trn1, trn2); +} +// Other reduction functions: +// mul +#if EIGEN_COMP_CLANG && defined(__apple_build_version__) +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } +#else +template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +#endif + +// min +template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } + +// max +template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); } + +// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, +// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 +#define PALIGN_NEON(Offset,Type,Command) \ +template<>\ +struct palign_impl<Offset,Type>\ +{\ + EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ + {\ + if (Offset!=0)\ + first = Command(first, second, Offset);\ + }\ +};\ + +PALIGN_NEON(0,Packet2d,vextq_f64) +PALIGN_NEON(1,Packet2d,vextq_f64) +#undef PALIGN_NEON + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2d,2>& kernel) { + float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); + float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); + + kernel.packet[0] = trn1; + kernel.packet[1] = trn2; +} +#endif // EIGEN_ARCH_ARM64 + } // end namespace internal } // end namespace Eigen diff --git a/eigen/Eigen/src/Core/arch/SSE/CMakeLists.txt b/eigen/Eigen/src/Core/arch/SSE/CMakeLists.txt deleted file mode 100644 index 46ea7cc..0000000 --- a/eigen/Eigen/src/Core/arch/SSE/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Core_arch_SSE_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Core_arch_SSE_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/Eigen/src/Core/arch/SSE COMPONENT Devel -) diff --git a/eigen/Eigen/src/Core/arch/SSE/Complex.h b/eigen/Eigen/src/Core/arch/SSE/Complex.h index 91bba5e..5607fe0 100644 --- a/eigen/Eigen/src/Core/arch/SSE/Complex.h +++ b/eigen/Eigen/src/Core/arch/SSE/Complex.h @@ -22,13 +22,18 @@ struct Packet2cf __m128 v; }; +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. 
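Note: PALIGN_NEON maps palign_impl onto vext, which returns an Offset-shifted window taken from the concatenation of its two arguments; this is the primitive Eigen uses to realign packets read from a misaligned operand. For the Packet2d case above (sketch):

#include <arm_neon.h>

// What palign_impl<1,Packet2d>::run produces via vextq_f64 (sketch):
// the last element of `first` followed by the first element of `second`.
static float64x2_t palign1_sketch(float64x2_t first, float64x2_t second)
{
  return vextq_f64(first, second, 1);   // {first[1], second[0]}
}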
+#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits<std::complex<float> > : default_packet_traits { typedef Packet2cf type; + typedef Packet2cf half; enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -39,11 +44,13 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0 + HasSetLinear = 0, + HasBlend = 1 }; }; +#endif -template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; }; +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -60,7 +67,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { - // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v), _mm_mul_ps(_mm_movehdup_ps(a.v), @@ -104,8 +110,23 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<flo template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); } -template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } +template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v)); } + + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride) +{ + return Packet2cf(_mm_set_ps(std::imag(from[1*stride]), std::real(from[1*stride]), + std::imag(from[0*stride]), std::real(from[0*stride]))); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride) +{ + to[stride*0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1))); + to[stride*1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)), + _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); +} template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -124,7 +145,7 @@ template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Pack #endif } -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return 
Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v))))); } template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) { @@ -214,7 +235,7 @@ template<> struct conj_helper<Packet4f, Packet2cf, false,false> { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } + { return Packet2cf(Eigen::internal::pmul<Packet4f>(x, y.v)); } }; template<> struct conj_helper<Packet2cf, Packet4f, false,false> @@ -223,7 +244,7 @@ template<> struct conj_helper<Packet2cf, Packet4f, false,false> { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } + { return Packet2cf(Eigen::internal::pmul<Packet4f>(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) @@ -234,7 +255,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); } -EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x) +EIGEN_STRONG_INLINE Packet2cf pcplxflip/* <Packet2cf> */(const Packet2cf& x) { return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); } @@ -248,13 +269,18 @@ struct Packet1cd __m128d v; }; +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. +#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits<std::complex<double> > : default_packet_traits { typedef Packet1cd type; + typedef Packet1cd half; enum { Vectorizable = 1, AlignedOnScalar = 0, size = 1, + HasHalfPacket = 0, HasAdd = 1, HasSub = 1, @@ -268,12 +294,13 @@ template<> struct packet_traits<std::complex<double> > : default_packet_traits HasSetLinear = 0 }; }; +#endif -template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); @@ -282,9 +309,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { - // TODO optimize it for SSE3 and 4 #ifdef EIGEN_VECTORIZE_SSE3 - return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), + return Packet1cd(_mm_addsub_pd(_mm_mul_pd(_mm_movedup_pd(a.v), b.v), _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), vec2d_swizzle1(b.v, 1, 0)))); #else @@ -311,8 +337,8 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); } // FIXME force 
unaligned store, this is a temporary fix -template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } @@ -410,7 +436,7 @@ template<> struct conj_helper<Packet2d, Packet1cd, false,false> { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } + { return Packet1cd(Eigen::internal::pmul<Packet2d>(x, y.v)); } }; template<> struct conj_helper<Packet1cd, Packet2d, false,false> @@ -419,7 +445,7 @@ template<> struct conj_helper<Packet1cd, Packet2d, false,false> { return padd(c, pmul(x,y)); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul(x.v, y)); } + { return Packet1cd(Eigen::internal::pmul<Packet2d>(x.v, y)); } }; template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) @@ -430,9 +456,44 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } -EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +EIGEN_STRONG_INLINE Packet1cd pcplxflip/* <Packet1cd> */(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2cf,2>& kernel) { + __m128d w1 = _mm_castps_pd(kernel.packet[0].v); + __m128d w2 = _mm_castps_pd(kernel.packet[1].v); + + __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2)); + kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2)); + kernel.packet[1].v = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); + return Packet2cf(_mm_castpd_ps(result)); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex<float> b) +{ + return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast<const __m64*>(&b))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex<double> b) +{ + return pset1<Packet1cd>(b); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex<float> b) +{ + return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast<const __m64*>(&b))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex<double> b) { - return Packet1cd(preverse(x.v)); + return pset1<Packet1cd>(b); } } // end namespace internal diff --git a/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h b/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h index 
2b07168..7b5f948 100644 --- a/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/eigen/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -32,7 +32,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x) /* the smallest non denormalized float number */ _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); - + /* natural logarithm computed for 4 simultaneous float return NaN for x <= 0 */ @@ -63,7 +63,7 @@ Packet4f plog<Packet4f>(const Packet4f& _x) x = _mm_or_ps(x, p4f_half); emm0 = _mm_sub_epi32(emm0, p4i_0x7f); - Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1); + Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); /* part2: if( x < SQRTHF ) { @@ -72,9 +72,9 @@ Packet4f plog<Packet4f>(const Packet4f& _x) } else { x = x - 1.0; } */ Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); - Packet4f tmp = _mm_and_ps(x, mask); + Packet4f tmp = pand(x, mask); x = psub(x, p4f_1); - e = psub(e, _mm_and_ps(p4f_1, mask)); + e = psub(e, pand(p4f_1, mask)); x = padd(x, tmp); Packet4f x2 = pmul(x,x); @@ -444,32 +444,119 @@ Packet4f pcos<Packet4f>(const Packet4f& _x) #if EIGEN_FAST_MATH -// This is based on Quake3's fast inverse square root. +// Functions for sqrt. +// The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step +// of Newton's method, at a cost of 1-2 bits of precision as opposed to the +// exact solution. It does not handle +inf, or denormalized numbers correctly. +// The main advantage of this approach is not just speed, but also the fact that +// it can be inlined and pipelined with other computations, further reducing its +// effective latency. This is similar to Quake3's fast inverse square root. // For detail see here: http://www.beyond3d.com/content/articles/8/ -// It lacks 1 (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly. template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt<Packet4f>(const Packet4f& _x) { Packet4f half = pmul(_x, pset1<Packet4f>(.5f)); + Packet4f denormal_mask = _mm_and_ps( + _mm_cmpge_ps(_x, _mm_setzero_ps()), + _mm_cmplt_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()))); - /* select only the inverse sqrt of non-zero inputs */ - Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)())); - Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x)); - + // Compute approximate reciprocal sqrt. + Packet4f x = _mm_rsqrt_ps(_x); + // Do a single step of Newton's iteration. x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x)))); - return pmul(_x,x); + // Flush results for denormals to zero. 
+ return _mm_andnot_ps(denormal_mask, pmul(_x,x)); } #else -template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); } +template<>EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); } + +#endif + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); } + +#if EIGEN_FAST_MATH + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt<Packet4f>(const Packet4f& _x) { + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); + _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); + _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); + + Packet4f neg_half = pmul(_x, p4f_minus_half); + + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); + Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); + + // Fill in NaNs and Infs for the negative/zero entries. + Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); + Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); + Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), + _mm_and_ps(zero_mask, p4f_inf)); + + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); + + // Insert NaNs and Infs in all the right places. + return _mm_or_ps(x, infs_and_nans); +} + +#else + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt<Packet4f>(const Packet4f& x) { + // Unfortunately we can't use the much faster mm_rqsrt_ps since it only provides an approximation. + return _mm_div_ps(pset1<Packet4f>(1.0f), _mm_sqrt_ps(x)); +} #endif -template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt<Packet2d>(const Packet2d& x) { + // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. + return _mm_div_pd(pset1<Packet2d>(1.0), _mm_sqrt_pd(x)); +} + +// Hyperbolic Tangent function. 
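Note: both the EIGEN_FAST_MATH psqrt and prsqrt above start from _mm_rsqrt_ps (roughly 12 bits of precision) and apply one Newton step for 1/sqrt(a), x <- x*(1.5 - 0.5*a*x*x); psqrt then recovers sqrt(a) as a*x. The scalar shape of the computation (sketch; the real kernels additionally mask denormal, zero and negative inputs as shown above):

#include <immintrin.h>

// One Newton refinement of the hardware reciprocal-sqrt estimate (sketch).
static float fast_sqrt_sketch(float a)
{
  float x = _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(a)));   // ~1/sqrt(a), low-precision estimate
  x = x * (1.5f - 0.5f * a * x * x);                      // Newton step for f(x) = 1/x^2 - a
  return a * x;                                           // sqrt(a) = a * (1/sqrt(a))
}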
+template <> +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f +ptanh<Packet4f>(const Packet4f& x) { + return internal::generic_fast_tanh_float(x); +} } // end namespace internal +namespace numext { + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +float sqrt(const float &x) +{ + return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x)))); +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +double sqrt(const double &x) +{ +#if EIGEN_COMP_GNUC_STRICT + // This works around a GCC bug generating poor code for _mm_sqrt_pd + // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b + return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); +#else + return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); +#endif +} + +} // end namespace numex + } // end namespace Eigen #endif // EIGEN_MATH_FUNCTIONS_SSE_H diff --git a/eigen/Eigen/src/Core/arch/SSE/PacketMath.h b/eigen/Eigen/src/Core/arch/SSE/PacketMath.h index bef898b..03c8a2c 100644 --- a/eigen/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/eigen/Eigen/src/Core/arch/SSE/PacketMath.h @@ -22,9 +22,40 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif +#ifdef __FMA__ +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 +#endif +#endif + +#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004) +// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot +// have overloads for both types without linking error. +// One solution is to increase ABI version using -fabi-version=4 (or greater). +// Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper +// structure: +template<typename T> +struct eigen_packet_wrapper +{ + EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; +typedef eigen_packet_wrapper<__m128> Packet4f; +typedef eigen_packet_wrapper<__m128i> Packet4i; +typedef eigen_packet_wrapper<__m128d> Packet2d; +#else typedef __m128 Packet4f; typedef __m128i Packet4i; typedef __m128d Packet2d; +#endif template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; @@ -38,7 +69,7 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; #define vec2d_swizzle1(v,p,q) \ (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) - + #define vec4f_swizzle2(a,b,p,q,r,s) \ (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) @@ -58,51 +89,85 @@ template<> struct is_arithmetic<__m128d> { enum { value = true }; }; const Packet4i p4i_##NAME = pset1<Packet4i>(X) +// Use the packet_traits defined in AVX/PacketMath.h instead if we're going +// to leverage AVX instructions. 
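Note: eigen_packet_wrapper above works around the old-ABI overload clash by giving each SIMD register its own distinct class type, while the implicit conversions keep intrinsic call sites unchanged. A miniature of the idea (illustration only; the Tag parameter below is not part of Eigen's wrapper, which simply wraps each __m128/__m128i/__m128d type):

// Distinct wrapper types give distinct overloads even when the wrapped types
// would otherwise collide, and the conversion operators keep usage transparent.
template<typename T, int Tag>
struct packet_wrapper_sketch
{
  packet_wrapper_sketch() {}
  packet_wrapper_sketch(const T& v) : m_val(v) {}
  operator T&() { return m_val; }
  operator const T&() const { return m_val; }
  T m_val;
};

typedef packet_wrapper_sketch<float, 0> PacketA;   // stand-ins for the real wrapped packet types
typedef packet_wrapper_sketch<float, 1> PacketB;

inline float first_elem(const PacketA& p) { return p; }   // these two overloads no longer clash
inline float first_elem(const PacketB& p) { return p; }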
+#ifndef EIGEN_VECTORIZE_AVX template<> struct packet_traits<float> : default_packet_traits { typedef Packet4f type; + typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, size=4, + HasHalfPacket = 0, HasDiv = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasRsqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasBlend = 1 + +#ifdef EIGEN_VECTORIZE_SSE4_1 + , + HasRound = 1, + HasFloor = 1, + HasCeil = 1 +#endif }; }; template<> struct packet_traits<double> : default_packet_traits { typedef Packet2d type; + typedef Packet2d half; enum { Vectorizable = 1, AlignedOnScalar = 1, size=2, + HasHalfPacket = 0, HasDiv = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasRsqrt = 1, + HasBlend = 1 + +#ifdef EIGEN_VECTORIZE_SSE4_1 + , + HasRound = 1, + HasFloor = 1, + HasCeil = 1 +#endif }; }; +#endif template<> struct packet_traits<int> : default_packet_traits { typedef Packet4i type; + typedef Packet4i half; enum { - // FIXME check the Has* Vectorizable = 1, AlignedOnScalar = 1, - size=4 + size=4, + + HasBlend = 1 }; }; -template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4}; }; -template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; }; -template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4}; }; +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; + +#ifndef EIGEN_VECTORIZE_AVX +template<> struct scalar_div_cost<float,true> { enum { value = 7 }; }; +template<> struct scalar_div_cost<double,true> { enum { value = 8 }; }; +#endif -#if defined(_MSC_VER) && (_MSC_VER==1500) +#if EIGEN_COMP_MSVC==1500 // Workaround MSVC 9 internal compiler error. // TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode // TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)). 
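Note: the Has* enums added above (HasRsqrt, HasTanh, HasBlend, HasRound, ...) are the compile-time switches Eigen's kernels test before selecting a packet path; they can be inspected directly, keeping in mind that packet_traits lives in the internal namespace and may change between releases. For example (sketch):

#include <Eigen/Core>
#include <iostream>

// Inspecting the SSE float traits declared above (internal API, for illustration only).
int main()
{
  typedef Eigen::internal::packet_traits<float> Traits;
  std::cout << "packet size " << int(Traits::size)     << "\n"   // 4 floats per __m128
            << "HasSqrt     " << int(Traits::HasSqrt)  << "\n"
            << "HasRsqrt    " << int(Traits::HasRsqrt) << "\n"
            << "HasBlend    " << int(Traits::HasBlend) << "\n";
  return 0;
}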
@@ -110,14 +175,25 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); } #else -template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set1_ps(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); } template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); } #endif -template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } -template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); } -template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } +// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. +// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) +// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions. +// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply. +// Also note that with AVX, we want it to generate a vbroadcastss. +#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__) +template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) { + return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); +} +#endif + +template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); } +template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); } +template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } @@ -139,7 +215,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { - return psub(_mm_setr_epi32(0,0,0,0), a); + return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a); } template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } @@ -166,16 +242,42 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ eigen_assert(false && "packet integer division are not supported by SSE"); - return pset1<Packet4i>(0); -} // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, 
const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } +#ifdef __FMA__ +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } +#endif -template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { +#if EIGEN_COMP_GNUC + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_min_ps, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + Packet4f res = b; + asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm_min_ps(b, a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { +#if EIGEN_COMP_GNUC + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_min_pd, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + Packet2d res = b; + asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm_min_pd(b, a); +#endif +} template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -187,8 +289,34 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const #endif } -template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { +#if EIGEN_COMP_GNUC + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_max_ps, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + Packet4f res = b; + asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm_max_ps(b, a); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { +#if EIGEN_COMP_GNUC + // There appears to be a bug in GCC, by which the optimizer may + // flip the argument order in calls to _mm_max_pd, so we have to + // resort to inline ASM here. This is supposed to be fixed in gcc6.3, + // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + Packet2d res = b; + asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); + return res; +#else + // Arguments are reversed to match NaN propagation behavior of std::max. 
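// --- Editorial aside (not part of the patch) --------------------------------
// The pmin/pmax overloads in this hunk reverse the operand order because the
// minps/maxps instructions return their second operand whenever the comparison
// is unordered, so _mm_min_ps(b, a) matches std::min(a, b), which returns its
// first argument when a NaN is involved; the inline asm variants only pin that
// operand order against GCC bug 72867 and compute the same result.  A minimal
// standalone check of this assumption (illustrative only, not an Eigen API):
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <xmmintrin.h>

static float pmin_like(float a, float b)
{
  // Same operand swap as pmin<Packet4f>: ask the hardware for min(b, a).
  return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(b), _mm_set_ss(a)));
}

int main()
{
  const float nan = std::nanf("");
  // Each line should print the same value twice.
  std::printf("%g %g\n", pmin_like(nan, 1.0f), std::min(nan, 1.0f)); // nan nan
  std::printf("%g %g\n", pmin_like(1.0f, nan), std::min(1.0f, nan)); // 1 1
}
// -----------------------------------------------------------------------------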
+ return _mm_max_pd(b, a); +#endif +} template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -200,6 +328,17 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const #endif } +#ifdef EIGEN_VECTORIZE_SSE4_1 +template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); } +template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); } + +template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); } +template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); } + +template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); } +template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); } +#endif + template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } @@ -218,16 +357,14 @@ template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, con template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); } +template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); } -#if defined(_MSC_VER) +#if EIGEN_COMP_MSVC template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD - #if (_MSC_VER==1600) + #if (EIGEN_COMP_MSVC==1600) // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps // (i.e., it does not generate an unaligned load!! - // TODO On most architectures this version should also be faster than a single _mm_loadu_ps - // so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so... 
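// --- Editorial aside (not part of the patch) --------------------------------
// The MSVC 2010 branch that follows assembles an unaligned 4-float load from
// two 8-byte half loads, because that compiler could turn _mm_loadu_ps into an
// aligned load.  A standalone sketch of the same trick (helper name is
// illustrative only, not an Eigen API):
#include <xmmintrin.h>

static __m128 loadu_via_halves(const float* p)
{
  __m128 lo = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<const __m64*>(p));     // p[0], p[1]
  return      _mm_loadh_pi(lo,               reinterpret_cast<const __m64*>(p + 2)); // p[2], p[3]
}
// On a well-behaved compiler this is equivalent to _mm_loadu_ps(p).
// -----------------------------------------------------------------------------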
__m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from)); res = _mm_loadh_pi(res, (const __m64*)(from+2)); return res; @@ -266,46 +403,77 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) { Packet4i tmp; - tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from)); + tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from)); return vec4i_swizzle1(tmp, 0, 0, 1, 1); } template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } -template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); } +template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } + +template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } + +template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) +{ + return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) +{ + return _mm_set_pd(from[1*stride], from[0*stride]); +} +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) +{ + return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]); + } -template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { - EIGEN_DEBUG_UNALIGNED_STORE - _mm_storel_pd((to), from); - _mm_storeh_pd((to+1), from); +template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) +{ + to[stride*0] = _mm_cvtss_f32(from); + to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1)); + to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2)); + to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) +{ + to[stride*0] = _mm_cvtsd_f64(from); + to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1)); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) +{ + to[stride*0] = _mm_cvtsi128_si32(from); + to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)); + to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)); + to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)); } -template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); } -template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), 
_mm_castsi128_pd(from)); } // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) { Packet4f pa = _mm_set_ss(a); - pstore(to, vec4f_swizzle1(pa,0,0,0,0)); + pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0))); } // some compilers might be tempted to perform multiple moves instead of using a vector path. template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) { Packet2d pa = _mm_set_sd(a); - pstore(to, vec2d_swizzle1(pa,0,0)); + pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } +#ifndef EIGEN_VECTORIZE_AVX template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +#endif -#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER) +#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 // Direct of the struct members fixed bug #62. template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; } template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; } template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; } -#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#elif EIGEN_COMP_MSVC_STRICT // The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; } template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; } @@ -323,7 +491,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); } - template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF)); @@ -344,6 +511,38 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } +// with AVX, the default implementations based on pload1 are faster +#ifndef __AVX__ +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet4f>(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) +{ + a3 = pload<Packet4f>(a); + a0 = vec4f_swizzle1(a3, 0,0,0,0); + a1 = vec4f_swizzle1(a3, 1,1,1,1); + a2 = vec4f_swizzle1(a3, 2,2,2,2); + a3 = vec4f_swizzle1(a3, 3,3,3,3); +} +template<> EIGEN_STRONG_INLINE void +pbroadcast4<Packet2d>(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ +#ifdef EIGEN_VECTORIZE_SSE3 + a0 = _mm_loaddup_pd(a+0); + a1 = _mm_loaddup_pd(a+1); + a2 = _mm_loaddup_pd(a+2); + a3 = _mm_loaddup_pd(a+3); +#else + a1 = pload<Packet2d>(a); + a0 = vec2d_swizzle1(a1, 0,0); + a1 = vec2d_swizzle1(a1, 1,1); + a3 = pload<Packet2d>(a+2); + a2 = vec2d_swizzle1(a3, 0,0); + a3 = vec2d_swizzle1(a3, 1,1); +#endif +} +#endif + EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) { vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55)); @@ -353,47 +552,17 @@ 
EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) } #ifdef EIGEN_VECTORIZE_SSE3 -// TODO implement SSE2 versions as well as integer versions template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) { return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); } + template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) { return _mm_hadd_pd(vecs[0], vecs[1]); } -// SSSE3 version: -// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) -// { -// return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); -// } -template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) -{ - Packet4f tmp0 = _mm_hadd_ps(a,a); - return pfirst(_mm_hadd_ps(tmp0, tmp0)); -} - -template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); } - -// SSSE3 version: -// EIGEN_STRONG_INLINE float predux(const Packet4i& a) -// { -// Packet4i tmp0 = _mm_hadd_epi32(a,a); -// return pfirst(_mm_hadd_epi32(tmp0, tmp0)); -// } #else -// SSE2 versions -template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) -{ - Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); -} -template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) -{ - return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); -} - template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) { Packet4f tmp0, tmp1, tmp2; @@ -414,10 +583,45 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) } #endif // SSE3 +template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) +{ + // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures + // (from Nehalem to Haswell) +// #ifdef EIGEN_VECTORIZE_SSE3 +// Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3)); +// return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp)); +// #else + Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a)); + return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); +// #endif +} + +template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) +{ + // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures + // (from Nehalem to Haswell) +// #ifdef EIGEN_VECTORIZE_SSE3 +// return pfirst<Packet2d>(_mm_hadd_pd(a, a)); +// #else + return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a))); +// #endif +} + +#ifdef EIGEN_VECTORIZE_SSSE3 +template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) +{ + return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); +} +template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) +{ + Packet4i tmp0 = _mm_hadd_epi32(a,a); + return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0)); +} +#else template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); - return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); + return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)); } template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) @@ -433,18 +637,18 @@ template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) tmp0 = _mm_unpackhi_epi64(tmp0, tmp1); return _mm_add_epi32(tmp0, tmp2); } - +#endif // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) { Packet4f 
tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { - return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) { @@ -460,14 +664,18 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) { Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { - return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) { +#ifdef EIGEN_VECTORIZE_SSE4_1 + Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); + return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); +#else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) EIGEN_ALIGN16 int aux[4]; @@ -475,20 +683,25 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) int aux0 = aux[0]<aux[1] ? aux[0] : aux[1]; int aux2 = aux[2]<aux[3] ? aux[2] : aux[3]; return aux0<aux2 ? aux0 : aux2; +#endif // EIGEN_VECTORIZE_SSE4_1 } // max template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) { Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a)); - return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); + return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1))); } template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { - return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); + return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a))); } template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) { +#ifdef EIGEN_VECTORIZE_SSE4_1 + Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2))); + return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1))); +#else // after some experiments, it is seems this is the fastest way to implement it // for GCC (eg., it does not like using std::min after the pstore !!) EIGEN_ALIGN16 int aux[4]; @@ -496,9 +709,10 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) int aux0 = aux[0]>aux[1] ? aux[0] : aux[1]; int aux2 = aux[2]>aux[3] ? aux[2] : aux[3]; return aux0>aux2 ? 
aux0 : aux2; +#endif // EIGEN_VECTORIZE_SSE4_1 } -#if (defined __GNUC__) +#if EIGEN_COMP_GNUC // template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) // { // Packet4f res = b; @@ -606,6 +820,110 @@ struct palign_impl<Offset,Packet2d> }; #endif +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4f,4>& kernel) { + _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet2d,2>& kernel) { + __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]); + kernel.packet[1] = tmp; +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock<Packet4i,4>& kernel) { + __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]); + __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]); + __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]); + + kernel.packet[0] = _mm_unpacklo_epi64(T0, T1); + kernel.packet[1] = _mm_unpackhi_epi64(T0, T1); + kernel.packet[2] = _mm_unpacklo_epi64(T2, T3); + kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); +} + +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + const __m128i zero = _mm_setzero_si128(); + const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128i false_mask = _mm_cmpeq_epi32(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_epi8(thenPacket, elsePacket, false_mask); +#else + return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { + const __m128 zero = _mm_setzero_ps(); + const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); + __m128 false_mask = _mm_cmpeq_ps(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_ps(thenPacket, elsePacket, false_mask); +#else + return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket)); +#endif +} +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + const __m128d zero = _mm_setzero_pd(); + const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]); + __m128d false_mask = _mm_cmpeq_pd(select, zero); +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blendv_pd(thenPacket, elsePacket, false_mask); +#else + return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_ps(a,pset1<Packet4f>(b),1); +#else + return _mm_move_ss(a, _mm_load_ss(&b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_pd(a,pset1<Packet2d>(b),1); +#else + return _mm_move_sd(a, _mm_load_sd(&b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3)); +#else + const Packet4f 
mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF)); + return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b))); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1)); +#else + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF)); + return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b))); +#endif +} + +// Scalar path for pmadd with FMA to ensure consistency with vectorized path. +#ifdef __FMA__ +template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { + return ::fmaf(a,b,c); +} +template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) { + return ::fma(a,b,c); +} +#endif + } // end namespace internal } // end namespace Eigen diff --git a/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h b/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h new file mode 100644 index 0000000..c848932 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_SSE_H +#define EIGEN_TYPE_CASTING_SSE_H + +namespace Eigen { + +namespace internal { + +template <> +struct type_casting_traits<float, int> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) { + return _mm_cvttps_epi32(a); +} + + +template <> +struct type_casting_traits<int, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) { + return _mm_cvtepi32_ps(a); +} + + +template <> +struct type_casting_traits<double, float> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) { + return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); +} + +template <> +struct type_casting_traits<float, double> { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) { + // Simply discard the second half of the input + return _mm_cvtps_pd(a); +} + + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_SSE_H diff --git a/eigen/Eigen/src/Core/arch/ZVector/Complex.h b/eigen/Eigen/src/Core/arch/ZVector/Complex.h new file mode 100644 index 0000000..d39d2d1 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/ZVector/Complex.h @@ -0,0 +1,394 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX32_ALTIVEC_H +#define EIGEN_COMPLEX32_ALTIVEC_H + +namespace Eigen { + +namespace internal { + +static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; +static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; + +struct Packet1cd +{ + EIGEN_STRONG_INLINE Packet1cd() {} + EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {} + Packet2d v; +}; + +struct Packet2cf +{ + EIGEN_STRONG_INLINE Packet2cf() {} + EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} + union { + Packet4f v; + Packet1cd cd[2]; + }; +}; + +template<> struct packet_traits<std::complex<float> > : default_packet_traits +{ + typedef Packet2cf type; + typedef Packet2cf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 2, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasBlend = 1, + HasSetLinear = 0 + }; +}; + + +template<> struct packet_traits<std::complex<double> > : default_packet_traits +{ + typedef Packet1cd type; + typedef Packet1cd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 1, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasSetLinear = 0 + }; +}; + +template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; + +/* Forward declaration */ +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel); + +template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } + +template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& 
from) +{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); } + +template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) +{ + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; + return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride) +{ + std::complex<float> EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload<Packet2cf>(af); +} +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED) +{ + return pload<Packet1cd>(from); +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride) +{ + std::complex<float> EIGEN_ALIGN16 af[2]; + pstore<std::complex<float> >((std::complex<float> *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED) +{ + pstore<std::complex<double> >(to, from); +} + +template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) +{ + Packet2cf res; + res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v; + res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v; + return res; +} + +template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + Packet2d a_re, a_im, v1, v2; + + // Permute and multiply the real parts of a and b + a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI); + // Get the imaginary parts of a + a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO); + // multiply a_re * b + v1 = vec_madd(a_re, b.v, p2d_ZERO); + // multiply a_im * b and get the conjugate result + v2 = vec_madd(a_im, b.v, p2d_ZERO); + v2 = (Packet2d) vec_sld((Packet4ui)v2, (Packet4ui)v2, 8); + v2 = (Packet2d) vec_xor((Packet2d)v2, (Packet2d) p2ul_CONJ_XOR1); + + return Packet1cd(v1 + v2); +} +template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) +{ + Packet2cf res; + res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v; + res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v; + return res; +} + +template<> EIGEN_STRONG_INLINE Packet1cd pand 
<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pand <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); } +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); } + +template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } + +template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) +{ + std::complex<double> EIGEN_ALIGN16 res; + pstore<std::complex<double> >(&res, a); + + return res; +} +template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) +{ + std::complex<float> EIGEN_ALIGN16 res[2]; + pstore<std::complex<float> >(res, a); + + return res[0]; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} + +template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) +{ + return pfirst(a); +} +template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) +{ + std::complex<float> res; + Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) +{ + return vecs[0]; +} +template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs) +{ + PacketBlock<Packet2cf,2> transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + ptranspose(transpose); + + return padd<Packet2cf>(transpose.packet[0], transpose.packet[1]); +} + +template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) +{ + return pfirst(a); +} +template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) +{ + std::complex<float> res; + Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template<int Offset> +struct palign_impl<Offset,Packet1cd> +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure 
we never have to align a Packet1cd? + // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + +template<int Offset> +struct palign_impl<Offset,Packet2cf> +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset == 1) { + first.cd[0] = first.cd[1]; + first.cd[1] = second.cd[0]; + } + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, false,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,false> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet1cd, Packet1cd, true,true> +{ + EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> struct conj_helper<Packet2cf, Packet2cf, false,true> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(a, pconj(b)); + } +}; + +template<> struct conj_helper<Packet2cf, Packet2cf, true,false> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return internal::pmul(pconj(a), b); + } +}; + +template<> struct conj_helper<Packet2cf, Packet2cf, true,true> +{ + EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const + { return padd(pmul(x,y),c); } + + EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const + { + return pconj(internal::pmul(a, b)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for AltiVec + Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) +{ + // TODO optimize it for AltiVec + Packet2cf res; + res.cd[0] = pdiv<Packet1cd>(a.cd[0], b.cd[0]); + res.cd[1] = pdiv<Packet1cd>(a.cd[1], b.cd[1]); + return res; +} + +EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + +EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x) +{ + Packet2cf res; + res.cd[0] = pcplxflip(x.cd[0]); + res.cd[1] = pcplxflip(x.cd[1]); + return res; +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = 
vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel) +{ + Packet1cd tmp = kernel.packet[0].cd[1]; + kernel.packet[0].cd[1] = kernel.packet[1].cd[0]; + kernel.packet[1].cd[0] = tmp; +} + +template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { + Packet2cf result; + const Selector<4> ifPacket4 = { ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1] }; + result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v); + return result; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX32_ALTIVEC_H diff --git a/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h b/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h new file mode 100644 index 0000000..5c7aa72 --- /dev/null +++ b/eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -0,0 +1,137 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2007 Julien Pommier +// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + +#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H +#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H + +namespace Eigen { + +namespace internal { + +static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); +static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); +static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + +static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); +static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp<Packet2d>(const Packet2d& _x) +{ + Packet2d x = _x; + + Packet2d tmp, fx; + Packet2l emm0; + + // clamp x + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + /* express exp(x) as exp(g + n*log(2)) */ + fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); + + fx = vec_floor(fx); + + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet2d x2 = pmul(x,x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, 
p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + + // build 2^n + emm0 = vec_ctsl(fx, 0); + + static const Packet2l p2l_1023 = { 1023, 1023 }; + static const Packet2ul p2ul_52 = { 52, 52 }; + + emm0 = emm0 + p2l_1023; + emm0 = emm0 << reinterpret_cast<Packet2l>(p2ul_52); + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. + Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x), + isnumber_mask); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp<Packet4f>(const Packet4f& x) +{ + Packet4f res; + res.v4f[0] = pexp<Packet2d>(x.v4f[0]); + res.v4f[1] = pexp<Packet2d>(x.v4f[1]); + return res; +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d psqrt<Packet2d>(const Packet2d& x) +{ + return __builtin_s390_vfsqdb(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f psqrt<Packet4f>(const Packet4f& x) +{ + Packet4f res; + res.v4f[0] = psqrt<Packet2d>(x.v4f[0]); + res.v4f[1] = psqrt<Packet2d>(x.v4f[1]); + return res; +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d prsqrt<Packet2d>(const Packet2d& x) { + // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. + return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f prsqrt<Packet4f>(const Packet4f& x) { + Packet4f res; + res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]); + res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]); + return res; +} + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_ALTIVEC_H diff --git a/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h b/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h new file mode 100644 index 0000000..57b01fc --- /dev/null +++ b/eigen/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -0,0 +1,945 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
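// --- Editorial aside (not part of the patch) --------------------------------
// In the ZVector port defined below, Packet4f is not a native vector type: it
// is declared as a struct holding two Packet2d halves (the v4f[2] member), and
// every float packet operation is forwarded to the corresponding double
// operation on both halves.  A portable scalar model of that layout and of
// padd<Packet4f>, using plain arrays instead of the __vector types (all names
// here are illustrative only):
struct Packet2dModel { double d[2]; };
struct Packet4fModel { Packet2dModel v4f[2]; };  // four floats carried as 2+2 doubles

static Packet2dModel padd_model(const Packet2dModel& a, const Packet2dModel& b)
{
  Packet2dModel c = { { a.d[0] + b.d[0], a.d[1] + b.d[1] } };
  return c;
}

static Packet4fModel padd_model(const Packet4fModel& a, const Packet4fModel& b)
{
  Packet4fModel c;
  c.v4f[0] = padd_model(a.v4f[0], b.v4f[0]);  // lanes 0,1
  c.v4f[1] = padd_model(a.v4f[1], b.v4f[1]);  // lanes 2,3
  return c;
}
// -----------------------------------------------------------------------------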
+ +#ifndef EIGEN_PACKET_MATH_ZVECTOR_H +#define EIGEN_PACKET_MATH_ZVECTOR_H + +#include <stdint.h> + +namespace Eigen { + +namespace internal { + +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD +#endif + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#endif + +typedef __vector int Packet4i; +typedef __vector unsigned int Packet4ui; +typedef __vector __bool int Packet4bi; +typedef __vector short int Packet8i; +typedef __vector unsigned char Packet16uc; +typedef __vector double Packet2d; +typedef __vector unsigned long long Packet2ul; +typedef __vector long long Packet2l; + +typedef struct { + Packet2d v4f[2]; +} Packet4f; + +typedef union { + int32_t i[4]; + uint32_t ui[4]; + int64_t l[2]; + uint64_t ul[2]; + double d[2]; + Packet4i v4i; + Packet4ui v4ui; + Packet2l v2l; + Packet2ul v2ul; + Packet2d v2d; +} Packet; + +// We don't want to write the same code all the time, but we need to reuse the constants +// and it doesn't really work to declare them global, so we define macros instead + +#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ + Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X)) + +#define _EIGEN_DECLARE_CONST_FAST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X)) + +#define _EIGEN_DECLARE_CONST_FAST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X)) + +#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ + Packet4i p4i_##NAME = pset1<Packet4i>(X) + +#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ + Packet2d p2d_##NAME = pset1<Packet2d>(X) + +#define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ + Packet2l p2l_##NAME = pset1<Packet2l>(X) + +// These constants are endian-agnostic +//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} + +static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); +static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); + +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; + +static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; +static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8)); + +static Packet16uc p16uc_PSET64_HI = { 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; +static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; + +// Mask alignment +#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 + +#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) + +// Handle endianness properly while loading constants +// Define global static constants: + +static Packet16uc p16uc_FORWARD = { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }; +static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; +static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; + +static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; +static Packet16uc p16uc_PSET32_WEVEN = 
vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; +/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; + +static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/ +static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; +/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/ +static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; +static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; + +//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; + +//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; + + +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + #define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR); +#else + #define EIGEN_ZVECTOR_PREFETCH(ADDR) asm( " pfd [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#endif + +template<> struct packet_traits<int> : default_packet_traits +{ + typedef Packet4i type; + typedef Packet4i half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasBlend = 1 + }; +}; + +template<> struct packet_traits<float> : default_packet_traits +{ + typedef Packet4f type; + typedef Packet4f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; + +template<> struct packet_traits<double> : default_packet_traits +{ + typedef Packet2d type; + typedef Packet2d half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 1, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasRound = 1, + HasFloor = 1, + HasCeil = 1, + HasNegate = 1, + HasBlend = 1 + }; +}; + +template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; + +/* Forward declaration */ +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel); + +inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) +{ + Packet vt; + vt.v4i = v; + s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3]; + return 
s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) +{ + Packet vt; + vt.v4ui = v; + s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3]; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) +{ + Packet vt; + vt.v2l = v; + s << vt.l[0] << ", " << vt.l[1]; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet2ul & v) +{ + Packet vt; + vt.v2ul = v; + s << vt.ul[0] << ", " << vt.ul[1] ; + return s; +} + +inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) +{ + Packet vt; + vt.v2d = v; + s << vt.d[0] << ", " << vt.d[1]; + return s; +} + +/* Helper function to simulate a vec_splat_packet4f + */ +template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) +{ + Packet4f splat; + switch (element) { + case 0: + splat.v4f[0] = vec_splat(from.v4f[0], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 1: + splat.v4f[0] = vec_splat(from.v4f[0], 1); + splat.v4f[1] = splat.v4f[0]; + break; + case 2: + splat.v4f[0] = vec_splat(from.v4f[1], 0); + splat.v4f[1] = splat.v4f[0]; + break; + case 3: + splat.v4f[0] = vec_splat(from.v4f[1], 1); + splat.v4f[1] = splat.v4f[0]; + break; + } + return splat; +} + +template<int Offset> +struct palign_impl<Offset,Packet4i> +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } + } +}; + +/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double + */ +template<int Offset> +struct palign_impl<Offset,Packet4f> +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); + first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); + break; + case 2: + first.v4f[0] = first.v4f[1]; + first.v4f[1] = second.v4f[0]; + break; + case 3: + first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); + first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); + break; + } + } +}; + + +template<int Offset> +struct palign_impl<Offset,Packet2d> +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset == 1) + first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4i; +} + +template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet4f vfrom; + vfrom.v4f[0] = vec_ld2f(&from[0]); + vfrom.v4f[1] = vec_ld2f(&from[2]); + return vfrom; +} + +template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v2d; +} + +template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4i = from; +} + +template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) +{ + // FIXME: No intrinsic yet + 
+
+template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_st2f(from.v4f[0], &to[0]);
+  vec_st2f(from.v4f[1], &to[2]);
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
+{
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;
+  vto->v2d = from;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from)
+{
+  return vec_splats(from);
+}
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return vec_splats(from);
+}
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
+{
+  Packet4f to;
+  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+  to.v4f[1] = to.v4f[0];
+  return to;
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  a3 = pload<Packet4i>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4f>(const float *a,
+                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
+{
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat_packet4f<0>(a3);
+  a1 = vec_splat_packet4f<1>(a3);
+  a2 = vec_splat_packet4f<2>(a3);
+  a3 = vec_splat_packet4f<3>(a3);
+}
+
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  a1 = pload<Packet2d>(a);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+  return pload<Packet4i>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+  return pload<Packet4f>(ai);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2d>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  pstore<int>((int *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
+{
+  float EIGEN_ALIGN16 ai[4];
+  pstore<float>((float *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  pstore<double>(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] + b.v4f[0];
+  c.v4f[1] = a.v4f[1] + b.v4f[1];
+  return c;
+}
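The gather/scatter kernels above just stage the strided elements through a small aligned buffer and then reuse the aligned load/store; the access pattern is the usual lane*stride one. The same pattern written on plain arrays (illustrative helpers gather4/scatter4, not Eigen API):

    #include <cstdio>

    // What pgather<float, Packet4f>(from, stride) reads, on plain arrays.
    static void gather4(const float* from, long stride, float out[4])
    {
      for (int i = 0; i < 4; ++i) out[i] = from[i * stride];
    }

    // What pscatter<float, Packet4f>(to, from, stride) writes.
    static void scatter4(float* to, const float in[4], long stride)
    {
      for (int i = 0; i < 4; ++i) to[i * stride] = in[i];
    }

    int main()
    {
      const float src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
      float packed[4];
      gather4(src, 2, packed);            // picks src[0], src[2], src[4], src[6]
      float dst[8] = { 0 };
      scatter4(dst, packed, 2);           // writes them back to positions 0, 2, 4, 6
      std::printf("%g %g %g %g\n", packed[0], packed[1], packed[2], packed[3]);
      return 0;
    }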
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] - b.v4f[0];
+  c.v4f[1] = a.v4f[1] - b.v4f[1];
+  return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] * b.v4f[0];
+  c.v4f[1] = a.v4f[1] * b.v4f[1];
+  return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
+template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] / b.v4f[0];
+  c.v4f[1] = a.v4f[1] / b.v4f[1];
+  return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
+template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
+{
+  Packet4f c;
+  c.v4f[0] = -a.v4f[0];
+  c.v4f[1] = -a.v4f[1];
+  return c;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
+{
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
+
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
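plset builds the sequence a, a+1, ..., by adding a COUNTDOWN constant to pset1(a), and pmadd is a lane-wise a*b + c (vec_madd on the double path, applied per half on the emulated float path). A scalar sketch of both, assuming the COUNTDOWN constant is {0,1,2,3} as on Eigen's other vector back-ends:

    #include <cstdio>

    // Scalar model of plset<Packet4f>(a): pset1(a) + COUNTDOWN,
    // assuming p4f_COUNTDOWN == {0,1,2,3}.
    static void plset4(float a, float out[4])
    {
      for (int i = 0; i < 4; ++i) out[i] = a + float(i);
    }

    // Scalar model of pmadd(a, b, c): lane-wise a*b + c.
    static void pmadd4(const float a[4], const float b[4], const float c[4], float out[4])
    {
      for (int i = 0; i < 4; ++i) out[i] = a[i] * b[i] + c[i];
    }

    int main()
    {
      float s[4];
      plset4(10.f, s);                                  // 10 11 12 13
      const float b[4] = { 2, 2, 2, 2 }, c[4] = { 1, 1, 1, 1 };
      float r[4];
      pmadd4(s, b, c, r);                               // 21 23 25 27
      std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
      return 0;
    }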
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
+{
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_round(a.v4f[0]);
+  res.v4f[1] = vec_round(a.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) { return pload<Packet4i>(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { return pload<Packet4f>(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { return pload<Packet2d>(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
+{
+  Packet4i p = pload<Packet4i>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
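There is no dedicated and-not operation used here: pandnot(a,b) = a & ~b is composed from AND and NOR, with vec_nor(b,b) standing in for ~b. The same identity on scalar integers:

    #include <cstdint>
    #include <cstdio>

    // pandnot(a, b) == a & ~b, with ~b written as nor(b, b) exactly as above.
    static std::uint32_t nor32(std::uint32_t x, std::uint32_t y) { return ~(x | y); }
    static std::uint32_t pandnot32(std::uint32_t a, std::uint32_t b) { return a & nor32(b, b); }

    int main()
    {
      const std::uint32_t a = 0xF0F0F0F0u, b = 0xFF00FF00u;
      std::printf("0x%08X\n", unsigned(pandnot32(a, b)));  // prints 0x00F000F0
      return 0;
    }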
+
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
+{
+  Packet4f p = pload<Packet4f>(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
+{
+  Packet2d p = pload<Packet2d>(from);
+  return vec_perm(p, p, p16uc_PSET64_HI);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { pstore<int>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { pstore<double>(to, from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
+{
+  Packet4f rev;
+  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+  return rev;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
+{
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, sum;
+  b = vec_sld(a, a, 8);
+  sum = padd<Packet4i>(a, b);
+  b = vec_sld(sum, sum, 4);
+  sum = padd<Packet4i>(sum, b);
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  Packet2d b, sum;
+  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
+  sum = padd<Packet2d>(a, b);
+  return pfirst(sum);
+}
+template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
+{
+  Packet2d sum;
+  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+  double first = predux<Packet2d>(sum);
+  return static_cast<float>(first);
+}
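predux sums a packet with log2(n) rotate-and-add steps: vec_sld by 8 bytes rotates the four int lanes by two, by 4 bytes rotates by one, and lane 0 then holds the total. A scalar rendering of the Packet4i reduction:

    #include <cstdio>

    // Stand-in for vec_sld on whole lanes: rotate a 4-lane packet left by n lanes.
    static void rot4(const int in[4], int n, int out[4])
    {
      for (int i = 0; i < 4; ++i) out[i] = in[(i + n) % 4];
    }

    // Same reduction tree as predux<Packet4i>: add the rotation by two lanes,
    // then the rotation by one lane, and read lane 0.
    static int predux4(const int a[4])
    {
      int b[4], s1[4], c[4], s2[4];
      rot4(a, 2, b);
      for (int i = 0; i < 4; ++i) s1[i] = a[i] + b[i];
      rot4(s1, 1, c);
      for (int i = 0; i < 4; ++i) s2[i] = s1[i] + c[i];
      return s2[0];
    }

    int main()
    {
      const int v[4] = { 1, 2, 3, 4 };
      std::printf("%d\n", predux4(v));  // prints 10
      return 0;
    }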
+
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  Packet4i v[4], sum[4];
+
+  // It's easier and faster to transpose then add as columns
+  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+  // Do the transpose, first set of moves
+  v[0] = vec_mergeh(vecs[0], vecs[2]);
+  v[1] = vec_mergel(vecs[0], vecs[2]);
+  v[2] = vec_mergeh(vecs[1], vecs[3]);
+  v[3] = vec_mergel(vecs[1], vecs[3]);
+  // Get the resulting vectors
+  sum[0] = vec_mergeh(v[0], v[2]);
+  sum[1] = vec_mergel(v[0], v[2]);
+  sum[2] = vec_mergeh(v[1], v[3]);
+  sum[3] = vec_mergel(v[1], v[3]);
+
+  // Now do the summation:
+  // Lines 0+1
+  sum[0] = padd<Packet4i>(sum[0], sum[1]);
+  // Lines 2+3
+  sum[1] = padd<Packet4i>(sum[2], sum[3]);
+  // Add the results
+  sum[0] = padd<Packet4i>(sum[0], sum[1]);
+
+  return sum[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  Packet2d v[2], sum;
+  v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8)));
+  v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8)));
+
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8));
+
+  return sum;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  PacketBlock<Packet4f,4> transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  transpose.packet[2] = vecs[2];
+  transpose.packet[3] = vecs[3];
+  ptranspose(transpose);
+
+  Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
+  sum = padd(sum, transpose.packet[2]);
+  sum = padd(sum, transpose.packet[3]);
+  return sum;
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
+{
+  // Return predux_mul<Packet2d> of the subvectors product
+  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
+// min
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b = pmin<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmin<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+// max
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b = pmax<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
+{
+  Packet2d b, res;
+  b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmax<Packet2d>(b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
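preduxp reduces four packets at once by transposing the 4x4 block so each input vector becomes a column and then adding the rows; lane i of the result is the sum of input vector i. A scalar check of that identity:

    #include <cstdio>

    // Scalar model of preduxp<Packet4i>: transpose the 4x4 block formed by the
    // four input packets, then add the rows; out[i] is the sum of input vector i.
    static void preduxp4(const int v[4][4], int out[4])
    {
      int t[4][4];
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
          t[r][c] = v[c][r];  // input vector c becomes column c
      for (int c = 0; c < 4; ++c)
        out[c] = t[0][c] + t[1][c] + t[2][c] + t[3][c];
    }

    int main()
    {
      const int v[4][4] = { {1,1,1,1}, {1,2,3,4}, {0,0,0,5}, {2,2,2,2} };
      int r[4];
      preduxp4(v, r);
      std::printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);  // prints 4 10 5 8
      return 0;
    }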
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
+  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
+  kernel.packet[0] = t0;
+  kernel.packet[1] = t1;
+}
+
+/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
+ */
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  PacketBlock<Packet2d,2> t0,t1,t2,t3;
+  // copy top-left 2x2 Packet2d block
+  t0.packet[0] = kernel.packet[0].v4f[0];
+  t0.packet[1] = kernel.packet[1].v4f[0];
+
+  // copy top-right 2x2 Packet2d block
+  t1.packet[0] = kernel.packet[0].v4f[1];
+  t1.packet[1] = kernel.packet[1].v4f[1];
+
+  // copy bottom-left 2x2 Packet2d block
+  t2.packet[0] = kernel.packet[2].v4f[0];
+  t2.packet[1] = kernel.packet[3].v4f[0];
+
+  // copy bottom-right 2x2 Packet2d block
+  t3.packet[0] = kernel.packet[2].v4f[1];
+  t3.packet[1] = kernel.packet[3].v4f[1];
+
+  // Transpose all 2x2 blocks
+  ptranspose(t0);
+  ptranspose(t1);
+  ptranspose(t2);
+  ptranspose(t3);
+
+  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
+  kernel.packet[0].v4f[0] = t0.packet[0];
+  kernel.packet[0].v4f[1] = t2.packet[0];
+  kernel.packet[1].v4f[0] = t0.packet[1];
+  kernel.packet[1].v4f[1] = t2.packet[1];
+  kernel.packet[2].v4f[0] = t1.packet[0];
+  kernel.packet[2].v4f[1] = t3.packet[0];
+  kernel.packet[3].v4f[0] = t1.packet[1];
+  kernel.packet[3].v4f[1] = t3.packet[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
+  Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
+  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet4f result;
+  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
+  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_ZVECTOR_H
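pblend selects, lane by lane, the 'then' value where the selector equals 1 and the 'else' value otherwise; the Packet4f variant merely splits the selector across the two double halves. A scalar equivalent (illustration only):

    #include <cstdio>

    // Scalar model of pblend: per lane, take thenP where the selector is 1,
    // otherwise take elseP.
    static void pblend4(const unsigned sel[4], const float thenP[4], const float elseP[4], float out[4])
    {
      for (int i = 0; i < 4; ++i) out[i] = (sel[i] == 1u) ? thenP[i] : elseP[i];
    }

    int main()
    {
      const unsigned sel[4] = { 1, 0, 0, 1 };
      const float t[4] = { 10, 20, 30, 40 };
      const float e[4] = { -1, -2, -3, -4 };
      float r[4];
      pblend4(sel, t, e, r);
      std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);  // prints 10 -2 -3 40
      return 0;
    }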