1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H 11 #define EIGEN_PACKET_MATH_ALTIVEC_H 12 13 namespace Eigen { 14 15 namespace internal { 16 17 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 18 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 19 #endif 20 21 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 22 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 23 #endif 24 25 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD 26 #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD 27 #endif 28 29 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 30 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 31 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 32 #endif 33 34 typedef __vector float Packet4f; 35 typedef __vector int Packet4i; 36 typedef __vector unsigned int Packet4ui; 37 typedef __vector __bool int Packet4bi; 38 typedef __vector short int Packet8i; 39 typedef __vector unsigned char Packet16uc; 40 41 // We don't want to write the same code all the time, but we need to reuse the constants 42 // and it doesn't really work to declare them global, so we define macros instead 43 44 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ 45 Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X)) 46 47 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ 48 Packet4i p4i_##NAME = vec_splat_s32(X) 49 50 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ 51 Packet4f p4f_##NAME = pset1<Packet4f>(X) 52 53 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 54 Packet4i p4i_##NAME = pset1<Packet4i>(X) 55 56 #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ 57 Packet2d p2d_##NAME = pset1<Packet2d>(X) 58 59 #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ 60 Packet2l p2l_##NAME = pset1<Packet2l>(X) 61 62 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 63 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X)) 64 65 #define DST_CHAN 1 66 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) 67 68 69 // These constants are endian-agnostic 70 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} 71 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} 72 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} 73 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} 74 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} 75 static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} 76 #ifndef __VSX__ 77 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} 78 #endif 79 80 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; 81 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; 82 83 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; 84 static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; 85 86 // Mask alignment 87 #ifdef __PPC64__ 88 #define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 89 #else 90 #define _EIGEN_MASK_ALIGNMENT 0xfffffff0 91 #endif 92 93 #define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) 94 95 // Handle endianness properly while loading constants 96 // Define global static constants: 97 #ifdef _BIG_ENDIAN 98 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 99 #ifdef __VSX__ 100 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 101 #endif 102 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 103 static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 104 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 105 #else 106 static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 107 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 108 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 109 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 110 static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 111 #endif // _BIG_ENDIAN 112 113 static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; 114 static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; 115 static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; 116 static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; 117 118 static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; 119 120 #ifdef _BIG_ENDIAN 121 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 122 #else 123 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 124 #endif // _BIG_ENDIAN 125 126 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC 127 #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); 128 #else 129 #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); 130 #endif 131 132 template<> struct packet_traits<float> : default_packet_traits 133 { 134 typedef Packet4f type; 135 typedef Packet4f half; 136 enum { 137 Vectorizable = 1, 138 AlignedOnScalar = 1, 139 size=4, 140 HasHalfPacket = 1, 141 142 HasAdd = 1, 143 HasSub = 1, 144 HasMul = 1, 145 HasDiv = 1, 146 HasMin = 1, 147 HasMax = 1, 148 HasAbs = 1, 149 HasSin = 0, 150 HasCos = 0, 151 HasLog = 0, 152 HasExp = 1, 153 #ifdef __VSX__ 154 HasSqrt = 1, 155 #if !EIGEN_COMP_CLANG 156 HasRsqrt = 1, 157 #else 158 HasRsqrt = 0, 159 #endif 160 #else 161 HasSqrt = 0, 162 HasRsqrt = 0, 163 #endif 164 HasRound = 1, 165 HasFloor = 1, 166 HasCeil = 1, 167 HasNegate = 1, 168 HasBlend = 1 169 }; 170 }; 171 template<> struct packet_traits<int> : default_packet_traits 172 { 173 typedef Packet4i type; 174 typedef Packet4i half; 175 enum { 176 Vectorizable = 1, 177 AlignedOnScalar = 1, 178 size = 4, 179 HasHalfPacket = 0, 180 181 HasAdd = 1, 182 HasSub = 1, 183 HasMul = 1, 184 HasDiv = 0, 185 HasBlend = 1 186 }; 187 }; 188 189 190 template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; 191 template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; 192 193 inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) 194 { 195 union { 196 Packet16uc v; 197 unsigned char n[16]; 198 } vt; 199 vt.v = v; 200 for (int i=0; i< 16; i++) 201 s << (int)vt.n[i] << ", "; 202 return s; 203 } 204 205 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) 206 { 207 union { 208 Packet4f v; 209 float n[4]; 210 } vt; 211 vt.v = v; 212 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 213 return s; 214 } 215 216 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) 217 { 218 union { 219 Packet4i v; 220 int n[4]; 221 } vt; 222 vt.v = v; 223 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 224 return s; 225 } 226 227 inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) 228 { 229 union { 230 Packet4ui v; 231 unsigned int n[4]; 232 } vt; 233 vt.v = v; 234 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 235 return s; 236 } 237 238 // Need to define them first or we get specialization after instantiation errors 239 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) 240 { 241 EIGEN_DEBUG_ALIGNED_LOAD 242 #ifdef __VSX__ 243 return vec_vsx_ld(0, from); 244 #else 245 return vec_ld(0, from); 246 #endif 247 } 248 249 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) 250 { 251 EIGEN_DEBUG_ALIGNED_LOAD 252 #ifdef __VSX__ 253 return vec_vsx_ld(0, from); 254 #else 255 return vec_ld(0, from); 256 #endif 257 } 258 259 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) 260 { 261 EIGEN_DEBUG_ALIGNED_STORE 262 #ifdef __VSX__ 263 vec_vsx_st(from, 0, to); 264 #else 265 vec_st(from, 0, to); 266 #endif 267 } 268 269 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) 270 { 271 EIGEN_DEBUG_ALIGNED_STORE 272 #ifdef __VSX__ 273 vec_vsx_st(from, 0, to); 274 #else 275 vec_st(from, 0, to); 276 #endif 277 } 278 279 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { 280 Packet4f v = {from, from, from, from}; 281 return v; 282 } 283 284 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { 285 Packet4i v = {from, from, from, from}; 286 return v; 287 } 288 template<> EIGEN_STRONG_INLINE void 289 pbroadcast4<Packet4f>(const float *a, 290 Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) 291 { 292 a3 = pload<Packet4f>(a); 293 a0 = vec_splat(a3, 0); 294 a1 = vec_splat(a3, 1); 295 a2 = vec_splat(a3, 2); 296 a3 = vec_splat(a3, 3); 297 } 298 template<> EIGEN_STRONG_INLINE void 299 pbroadcast4<Packet4i>(const int *a, 300 Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) 301 { 302 a3 = pload<Packet4i>(a); 303 a0 = vec_splat(a3, 0); 304 a1 = vec_splat(a3, 1); 305 a2 = vec_splat(a3, 2); 306 a3 = vec_splat(a3, 3); 307 } 308 309 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) 310 { 311 float EIGEN_ALIGN16 af[4]; 312 af[0] = from[0*stride]; 313 af[1] = from[1*stride]; 314 af[2] = from[2*stride]; 315 af[3] = from[3*stride]; 316 return pload<Packet4f>(af); 317 } 318 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) 319 { 320 int EIGEN_ALIGN16 ai[4]; 321 ai[0] = from[0*stride]; 322 ai[1] = from[1*stride]; 323 ai[2] = from[2*stride]; 324 ai[3] = from[3*stride]; 325 return pload<Packet4i>(ai); 326 } 327 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) 328 { 329 float EIGEN_ALIGN16 af[4]; 330 pstore<float>(af, from); 331 to[0*stride] = af[0]; 332 to[1*stride] = af[1]; 333 to[2*stride] = af[2]; 334 to[3*stride] = af[3]; 335 } 336 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) 337 { 338 int EIGEN_ALIGN16 ai[4]; 339 pstore<int>((int *)ai, from); 340 to[0*stride] = ai[0]; 341 to[1*stride] = ai[1]; 342 to[2*stride] = ai[2]; 343 to[3*stride] = ai[3]; 344 } 345 346 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; } 347 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; } 348 349 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; } 350 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; } 351 352 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; } 353 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; } 354 355 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } 356 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } 357 358 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } 359 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } 360 361 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } 362 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; } 363 364 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 365 { 366 #ifndef __VSX__ // VSX actually provides a div instruction 367 Packet4f t, y_0, y_1; 368 369 // Altivec does not offer a divide instruction, we have to do a reciprocal approximation 370 y_0 = vec_re(b); 371 372 // Do one Newton-Raphson iteration to get the needed accuracy 373 t = vec_nmsub(y_0, b, p4f_ONE); 374 y_1 = vec_madd(y_0, t, y_0); 375 376 return vec_madd(a, y_1, p4f_MZERO); 377 #else 378 return vec_div(a, b); 379 #endif 380 } 381 382 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 383 { eigen_assert(false && "packet integer division are not supported by AltiVec"); 384 return pset1<Packet4i>(0); 385 } 386 387 // for some weird raisons, it has to be overloaded for packet of integers 388 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } 389 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } 390 391 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) 392 { 393 #ifdef __VSX__ 394 Packet4f ret; 395 __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 396 return ret; 397 #else 398 return vec_min(a, b); 399 #endif 400 } 401 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } 402 403 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) 404 { 405 #ifdef __VSX__ 406 Packet4f ret; 407 __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 408 return ret; 409 #else 410 return vec_max(a, b); 411 #endif 412 } 413 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } 414 415 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } 416 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } 417 418 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } 419 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } 420 421 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } 422 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } 423 424 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } 425 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } 426 427 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); } 428 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); } 429 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); } 430 431 #ifdef _BIG_ENDIAN 432 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 433 { 434 EIGEN_DEBUG_ALIGNED_LOAD 435 Packet16uc MSQ, LSQ; 436 Packet16uc mask; 437 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 438 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 439 mask = vec_lvsl(0, from); // create the permute mask 440 return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data 441 442 } 443 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 444 { 445 EIGEN_DEBUG_ALIGNED_LOAD 446 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 447 Packet16uc MSQ, LSQ; 448 Packet16uc mask; 449 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 450 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 451 mask = vec_lvsl(0, from); // create the permute mask 452 return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data 453 } 454 #else 455 // We also need ot redefine little endian loading of Packet4i/Packet4f using VSX 456 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 457 { 458 EIGEN_DEBUG_UNALIGNED_LOAD 459 return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); 460 } 461 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 462 { 463 EIGEN_DEBUG_UNALIGNED_LOAD 464 return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); 465 } 466 #endif 467 468 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 469 { 470 Packet4f p; 471 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from); 472 else p = ploadu<Packet4f>(from); 473 return vec_perm(p, p, p16uc_DUPLICATE32_HI); 474 } 475 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 476 { 477 Packet4i p; 478 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from); 479 else p = ploadu<Packet4i>(from); 480 return vec_perm(p, p, p16uc_DUPLICATE32_HI); 481 } 482 483 #ifdef _BIG_ENDIAN 484 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 485 { 486 EIGEN_DEBUG_UNALIGNED_STORE 487 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 488 // Warning: not thread safe! 489 Packet16uc MSQ, LSQ, edges; 490 Packet16uc edgeAlign, align; 491 492 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 493 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 494 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 495 edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges 496 align = vec_lvsr( 0, to ); // permute map to misalign data 497 MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) 498 LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) 499 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 500 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 501 } 502 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 503 { 504 EIGEN_DEBUG_UNALIGNED_STORE 505 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 506 // Warning: not thread safe! 507 Packet16uc MSQ, LSQ, edges; 508 Packet16uc edgeAlign, align; 509 510 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 511 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 512 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 513 edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges 514 align = vec_lvsr( 0, to ); // permute map to misalign data 515 MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) 516 LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) 517 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 518 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 519 } 520 #else 521 // We also need ot redefine little endian loading of Packet4i/Packet4f using VSX 522 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 523 { 524 EIGEN_DEBUG_ALIGNED_STORE 525 vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); 526 } 527 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 528 { 529 EIGEN_DEBUG_ALIGNED_STORE 530 vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); 531 } 532 #endif 533 534 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); } 535 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); } 536 537 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } 538 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } 539 540 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) 541 { 542 return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); 543 } 544 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) 545 { 546 return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); } 547 548 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } 549 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } 550 551 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 552 { 553 Packet4f b, sum; 554 b = vec_sld(a, a, 8); 555 sum = a + b; 556 b = vec_sld(sum, sum, 4); 557 sum += b; 558 return pfirst(sum); 559 } 560 561 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) 562 { 563 Packet4f v[4], sum[4]; 564 565 // It's easier and faster to transpose then add as columns 566 // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 567 // Do the transpose, first set of moves 568 v[0] = vec_mergeh(vecs[0], vecs[2]); 569 v[1] = vec_mergel(vecs[0], vecs[2]); 570 v[2] = vec_mergeh(vecs[1], vecs[3]); 571 v[3] = vec_mergel(vecs[1], vecs[3]); 572 // Get the resulting vectors 573 sum[0] = vec_mergeh(v[0], v[2]); 574 sum[1] = vec_mergel(v[0], v[2]); 575 sum[2] = vec_mergeh(v[1], v[3]); 576 sum[3] = vec_mergel(v[1], v[3]); 577 578 // Now do the summation: 579 // Lines 0+1 580 sum[0] = sum[0] + sum[1]; 581 // Lines 2+3 582 sum[1] = sum[2] + sum[3]; 583 // Add the results 584 sum[0] = sum[0] + sum[1]; 585 586 return sum[0]; 587 } 588 589 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 590 { 591 Packet4i sum; 592 sum = vec_sums(a, p4i_ZERO); 593 #ifdef _BIG_ENDIAN 594 sum = vec_sld(sum, p4i_ZERO, 12); 595 #else 596 sum = vec_sld(p4i_ZERO, sum, 4); 597 #endif 598 return pfirst(sum); 599 } 600 601 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) 602 { 603 Packet4i v[4], sum[4]; 604 605 // It's easier and faster to transpose then add as columns 606 // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 607 // Do the transpose, first set of moves 608 v[0] = vec_mergeh(vecs[0], vecs[2]); 609 v[1] = vec_mergel(vecs[0], vecs[2]); 610 v[2] = vec_mergeh(vecs[1], vecs[3]); 611 v[3] = vec_mergel(vecs[1], vecs[3]); 612 // Get the resulting vectors 613 sum[0] = vec_mergeh(v[0], v[2]); 614 sum[1] = vec_mergel(v[0], v[2]); 615 sum[2] = vec_mergeh(v[1], v[3]); 616 sum[3] = vec_mergel(v[1], v[3]); 617 618 // Now do the summation: 619 // Lines 0+1 620 sum[0] = sum[0] + sum[1]; 621 // Lines 2+3 622 sum[1] = sum[2] + sum[3]; 623 // Add the results 624 sum[0] = sum[0] + sum[1]; 625 626 return sum[0]; 627 } 628 629 // Other reduction functions: 630 // mul 631 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 632 { 633 Packet4f prod; 634 prod = pmul(a, vec_sld(a, a, 8)); 635 return pfirst(pmul(prod, vec_sld(prod, prod, 4))); 636 } 637 638 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) 639 { 640 EIGEN_ALIGN16 int aux[4]; 641 pstore(aux, a); 642 return aux[0] * aux[1] * aux[2] * aux[3]; 643 } 644 645 // min 646 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) 647 { 648 Packet4f b, res; 649 b = vec_min(a, vec_sld(a, a, 8)); 650 res = vec_min(b, vec_sld(b, b, 4)); 651 return pfirst(res); 652 } 653 654 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) 655 { 656 Packet4i b, res; 657 b = vec_min(a, vec_sld(a, a, 8)); 658 res = vec_min(b, vec_sld(b, b, 4)); 659 return pfirst(res); 660 } 661 662 // max 663 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) 664 { 665 Packet4f b, res; 666 b = vec_max(a, vec_sld(a, a, 8)); 667 res = vec_max(b, vec_sld(b, b, 4)); 668 return pfirst(res); 669 } 670 671 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) 672 { 673 Packet4i b, res; 674 b = vec_max(a, vec_sld(a, a, 8)); 675 res = vec_max(b, vec_sld(b, b, 4)); 676 return pfirst(res); 677 } 678 679 template<int Offset> 680 struct palign_impl<Offset,Packet4f> 681 { 682 static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) 683 { 684 #ifdef _BIG_ENDIAN 685 switch (Offset % 4) { 686 case 1: 687 first = vec_sld(first, second, 4); break; 688 case 2: 689 first = vec_sld(first, second, 8); break; 690 case 3: 691 first = vec_sld(first, second, 12); break; 692 } 693 #else 694 switch (Offset % 4) { 695 case 1: 696 first = vec_sld(second, first, 12); break; 697 case 2: 698 first = vec_sld(second, first, 8); break; 699 case 3: 700 first = vec_sld(second, first, 4); break; 701 } 702 #endif 703 } 704 }; 705 706 template<int Offset> 707 struct palign_impl<Offset,Packet4i> 708 { 709 static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) 710 { 711 #ifdef _BIG_ENDIAN 712 switch (Offset % 4) { 713 case 1: 714 first = vec_sld(first, second, 4); break; 715 case 2: 716 first = vec_sld(first, second, 8); break; 717 case 3: 718 first = vec_sld(first, second, 12); break; 719 } 720 #else 721 switch (Offset % 4) { 722 case 1: 723 first = vec_sld(second, first, 12); break; 724 case 2: 725 first = vec_sld(second, first, 8); break; 726 case 3: 727 first = vec_sld(second, first, 4); break; 728 } 729 #endif 730 } 731 }; 732 733 EIGEN_DEVICE_FUNC inline void 734 ptranspose(PacketBlock<Packet4f,4>& kernel) { 735 Packet4f t0, t1, t2, t3; 736 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 737 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 738 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 739 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 740 kernel.packet[0] = vec_mergeh(t0, t2); 741 kernel.packet[1] = vec_mergel(t0, t2); 742 kernel.packet[2] = vec_mergeh(t1, t3); 743 kernel.packet[3] = vec_mergel(t1, t3); 744 } 745 746 EIGEN_DEVICE_FUNC inline void 747 ptranspose(PacketBlock<Packet4i,4>& kernel) { 748 Packet4i t0, t1, t2, t3; 749 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 750 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 751 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 752 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 753 kernel.packet[0] = vec_mergeh(t0, t2); 754 kernel.packet[1] = vec_mergel(t0, t2); 755 kernel.packet[2] = vec_mergeh(t1, t3); 756 kernel.packet[3] = vec_mergel(t1, t3); 757 } 758 759 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { 760 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; 761 Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); 762 return vec_sel(elsePacket, thenPacket, mask); 763 } 764 765 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { 766 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; 767 Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); 768 return vec_sel(elsePacket, thenPacket, mask); 769 } 770 771 772 //---------- double ---------- 773 #ifdef __VSX__ 774 typedef __vector double Packet2d; 775 typedef __vector unsigned long long Packet2ul; 776 typedef __vector long long Packet2l; 777 #if EIGEN_COMP_CLANG 778 typedef Packet2ul Packet2bl; 779 #else 780 typedef __vector __bool long Packet2bl; 781 #endif 782 783 static Packet2l p2l_ONE = { 1, 1 }; 784 static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); 785 static Packet2d p2d_ONE = { 1.0, 1.0 }; 786 static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); 787 static Packet2d p2d_MZERO = { -0.0, -0.0 }; 788 789 #ifdef _BIG_ENDIAN 790 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8)); 791 #else 792 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8)); 793 #endif 794 795 template<int index> Packet2d vec_splat_dbl(Packet2d& a); 796 797 template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) 798 { 799 return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI)); 800 } 801 802 template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) 803 { 804 return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO)); 805 } 806 807 template<> struct packet_traits<double> : default_packet_traits 808 { 809 typedef Packet2d type; 810 typedef Packet2d half; 811 enum { 812 Vectorizable = 1, 813 AlignedOnScalar = 1, 814 size=2, 815 HasHalfPacket = 1, 816 817 HasAdd = 1, 818 HasSub = 1, 819 HasMul = 1, 820 HasDiv = 1, 821 HasMin = 1, 822 HasMax = 1, 823 HasAbs = 1, 824 HasSin = 0, 825 HasCos = 0, 826 HasLog = 0, 827 HasExp = 1, 828 HasSqrt = 1, 829 HasRsqrt = 1, 830 HasRound = 1, 831 HasFloor = 1, 832 HasCeil = 1, 833 HasNegate = 1, 834 HasBlend = 1 835 }; 836 }; 837 838 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; 839 840 inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) 841 { 842 union { 843 Packet2l v; 844 int64_t n[2]; 845 } vt; 846 vt.v = v; 847 s << vt.n[0] << ", " << vt.n[1]; 848 return s; 849 } 850 851 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) 852 { 853 union { 854 Packet2d v; 855 double n[2]; 856 } vt; 857 vt.v = v; 858 s << vt.n[0] << ", " << vt.n[1]; 859 return s; 860 } 861 862 // Need to define them first or we get specialization after instantiation errors 863 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) 864 { 865 EIGEN_DEBUG_ALIGNED_LOAD 866 #ifdef __VSX__ 867 return vec_vsx_ld(0, from); 868 #else 869 return vec_ld(0, from); 870 #endif 871 } 872 873 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) 874 { 875 EIGEN_DEBUG_ALIGNED_STORE 876 #ifdef __VSX__ 877 vec_vsx_st(from, 0, to); 878 #else 879 vec_st(from, 0, to); 880 #endif 881 } 882 883 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { 884 Packet2d v = {from, from}; 885 return v; 886 } 887 888 template<> EIGEN_STRONG_INLINE void 889 pbroadcast4<Packet2d>(const double *a, 890 Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) 891 { 892 a1 = pload<Packet2d>(a); 893 a0 = vec_splat_dbl<0>(a1); 894 a1 = vec_splat_dbl<1>(a1); 895 a3 = pload<Packet2d>(a+2); 896 a2 = vec_splat_dbl<0>(a3); 897 a3 = vec_splat_dbl<1>(a3); 898 } 899 900 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) 901 { 902 double EIGEN_ALIGN16 af[2]; 903 af[0] = from[0*stride]; 904 af[1] = from[1*stride]; 905 return pload<Packet2d>(af); 906 } 907 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) 908 { 909 double EIGEN_ALIGN16 af[2]; 910 pstore<double>(af, from); 911 to[0*stride] = af[0]; 912 to[1*stride] = af[1]; 913 } 914 915 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; } 916 917 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; } 918 919 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; } 920 921 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } 922 923 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } 924 925 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); } 926 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } 927 928 // for some weird raisons, it has to be overloaded for packet of integers 929 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } 930 931 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) 932 { 933 Packet2d ret; 934 __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 935 return ret; 936 } 937 938 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) 939 { 940 Packet2d ret; 941 __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 942 return ret; 943 } 944 945 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } 946 947 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } 948 949 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } 950 951 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } 952 953 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); } 954 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); } 955 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); } 956 957 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) 958 { 959 EIGEN_DEBUG_ALIGNED_LOAD 960 return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); 961 } 962 963 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) 964 { 965 Packet2d p; 966 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from); 967 else p = ploadu<Packet2d>(from); 968 return vec_splat_dbl<0>(p); 969 } 970 971 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) 972 { 973 EIGEN_DEBUG_ALIGNED_STORE 974 vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); 975 } 976 977 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); } 978 979 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; } 980 981 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) 982 { 983 return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64)); 984 } 985 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } 986 987 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) 988 { 989 Packet2d b, sum; 990 b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8)); 991 sum = a + b; 992 return pfirst<Packet2d>(sum); 993 } 994 995 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) 996 { 997 Packet2d v[2], sum; 998 v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8)); 999 v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8)); 1000 1001 #ifdef _BIG_ENDIAN 1002 sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8)); 1003 #else 1004 sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8)); 1005 #endif 1006 1007 return sum; 1008 } 1009 // Other reduction functions: 1010 // mul 1011 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) 1012 { 1013 return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 1014 } 1015 1016 // min 1017 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) 1018 { 1019 return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 1020 } 1021 1022 // max 1023 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) 1024 { 1025 return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 1026 } 1027 1028 template<int Offset> 1029 struct palign_impl<Offset,Packet2d> 1030 { 1031 static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) 1032 { 1033 if (Offset == 1) 1034 #ifdef _BIG_ENDIAN 1035 first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8)); 1036 #else 1037 first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8)); 1038 #endif 1039 } 1040 }; 1041 1042 EIGEN_DEVICE_FUNC inline void 1043 ptranspose(PacketBlock<Packet2d,2>& kernel) { 1044 Packet2d t0, t1; 1045 t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); 1046 t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); 1047 kernel.packet[0] = t0; 1048 kernel.packet[1] = t1; 1049 } 1050 1051 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { 1052 Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; 1053 Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) ); 1054 return vec_sel(elsePacket, thenPacket, mask); 1055 } 1056 #endif // __VSX__ 1057 } // end namespace internal 1058 1059 } // end namespace Eigen 1060 1061 #endif // EIGEN_PACKET_MATH_ALTIVEC_H 1062