1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H 11 #define EIGEN_PACKET_MATH_ALTIVEC_H 12 13 namespace Eigen { 14 15 namespace internal { 16 17 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 18 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 19 #endif 20 21 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 22 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 23 #endif 24 25 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD 26 #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD 27 #endif 28 29 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 30 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 31 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 32 #endif 33 34 typedef __vector float Packet4f; 35 typedef __vector int Packet4i; 36 typedef __vector unsigned int Packet4ui; 37 typedef __vector __bool int Packet4bi; 38 typedef __vector short int Packet8s; 39 typedef __vector unsigned short int Packet8us; 40 typedef __vector signed char Packet16c; 41 typedef __vector unsigned char Packet16uc; 42 typedef eigen_packet_wrapper<__vector unsigned short int,0> Packet8bf; 43 44 // We don't want to write the same code all the time, but we need to reuse the constants 45 // and it doesn't really work to declare them global, so we define macros instead 46 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ 47 Packet4f p4f_##NAME = {X, X, X, X} 48 49 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ 50 Packet4i p4i_##NAME = vec_splat_s32(X) 51 52 #define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \ 53 Packet4ui p4ui_##NAME = {X, X, X, X} 54 55 #define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \ 56 Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X} 57 58 #define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \ 59 Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X} 60 61 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ 62 Packet4f p4f_##NAME = pset1<Packet4f>(X) 63 64 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 65 Packet4i p4i_##NAME = pset1<Packet4i>(X) 66 67 #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ 68 Packet2d p2d_##NAME = pset1<Packet2d>(X) 69 70 #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ 71 Packet2l p2l_##NAME = pset1<Packet2l>(X) 72 73 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 74 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X)) 75 76 #define DST_CHAN 1 77 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) 78 #define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type 79 80 // These constants are endian-agnostic 81 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} 82 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} 83 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} 84 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} 85 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} 86 static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u); 87 static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); 88 static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1} 89 static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); 90 static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} 91 #ifndef __VSX__ 92 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} 93 #endif 94 95 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; 96 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; 97 static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; 98 static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; 99 100 static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, 101 8, 9, 10, 11, 12, 13, 14, 15}; 102 static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, 103 8, 9, 10, 11, 12, 13, 14, 15}; 104 105 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; 106 static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 }; 107 static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 }; 108 109 static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; 110 static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 }; 111 static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 }; 112 static const Packet16uc p16uc_DUPLICATE16_EVEN= { 0,1 ,0,1, 4,5, 4,5, 8,9, 8,9, 12,13, 12,13 }; 113 static const Packet16uc p16uc_DUPLICATE16_ODD = { 2,3 ,2,3, 6,7, 6,7, 10,11, 10,11, 14,15, 14,15 }; 114 115 static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 }; 116 117 // Handle endianness properly while loading constants 118 // Define global static constants: 119 #ifdef _BIG_ENDIAN 120 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 121 #ifdef __VSX__ 122 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 123 #endif 124 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 125 static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 126 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 127 #else 128 static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 129 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 130 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 131 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 132 static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 133 #endif // _BIG_ENDIAN 134 135 static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; 136 static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; 137 static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; 138 static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; 139 140 static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; 141 142 #ifdef _BIG_ENDIAN 143 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 144 #else 145 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 146 #endif // _BIG_ENDIAN 147 148 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC 149 #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); 150 #else 151 #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); 152 #endif 153 154 template <> 155 struct packet_traits<float> : default_packet_traits { 156 typedef Packet4f type; 157 typedef Packet4f half; 158 enum { 159 Vectorizable = 1, 160 AlignedOnScalar = 1, 161 size = 4, 162 HasHalfPacket = 1, 163 164 HasAdd = 1, 165 HasSub = 1, 166 HasMul = 1, 167 HasDiv = 1, 168 HasMin = 1, 169 HasMax = 1, 170 HasAbs = 1, 171 HasSin = EIGEN_FAST_MATH, 172 HasCos = EIGEN_FAST_MATH, 173 HasLog = 1, 174 HasExp = 1, 175 #ifdef __VSX__ 176 HasSqrt = 1, 177 #if !EIGEN_COMP_CLANG 178 HasRsqrt = 1, 179 #else 180 HasRsqrt = 0, 181 #endif 182 #else 183 HasSqrt = 0, 184 HasRsqrt = 0, 185 HasTanh = EIGEN_FAST_MATH, 186 HasErf = EIGEN_FAST_MATH, 187 #endif 188 HasRound = 1, 189 HasFloor = 1, 190 HasCeil = 1, 191 HasNegate = 1, 192 HasBlend = 1 193 }; 194 }; 195 template <> 196 struct packet_traits<bfloat16> : default_packet_traits { 197 typedef Packet8bf type; 198 typedef Packet8bf half; 199 enum { 200 Vectorizable = 1, 201 AlignedOnScalar = 1, 202 size = 8, 203 HasHalfPacket = 0, 204 205 HasAdd = 1, 206 HasSub = 1, 207 HasMul = 1, 208 HasDiv = 1, 209 HasMin = 1, 210 HasMax = 1, 211 HasAbs = 1, 212 HasSin = EIGEN_FAST_MATH, 213 HasCos = EIGEN_FAST_MATH, 214 HasLog = 1, 215 HasExp = 1, 216 #ifdef __VSX__ 217 HasSqrt = 1, 218 #if !EIGEN_COMP_CLANG 219 HasRsqrt = 1, 220 #else 221 HasRsqrt = 0, 222 #endif 223 #else 224 HasSqrt = 0, 225 HasRsqrt = 0, 226 HasTanh = EIGEN_FAST_MATH, 227 HasErf = EIGEN_FAST_MATH, 228 #endif 229 HasRound = 1, 230 HasFloor = 1, 231 HasCeil = 1, 232 HasNegate = 1, 233 HasBlend = 1 234 }; 235 }; 236 237 template <> 238 struct packet_traits<int> : default_packet_traits { 239 typedef Packet4i type; 240 typedef Packet4i half; 241 enum { 242 Vectorizable = 1, 243 AlignedOnScalar = 1, 244 size = 4, 245 HasHalfPacket = 0, 246 247 HasAdd = 1, 248 HasSub = 1, 249 HasShift = 1, 250 HasMul = 1, 251 HasDiv = 0, 252 HasBlend = 1 253 }; 254 }; 255 256 template <> 257 struct packet_traits<short int> : default_packet_traits { 258 typedef Packet8s type; 259 typedef Packet8s half; 260 enum { 261 Vectorizable = 1, 262 AlignedOnScalar = 1, 263 size = 8, 264 HasHalfPacket = 0, 265 266 HasAdd = 1, 267 HasSub = 1, 268 HasMul = 1, 269 HasDiv = 0, 270 HasBlend = 1 271 }; 272 }; 273 274 template <> 275 struct packet_traits<unsigned short int> : default_packet_traits { 276 typedef Packet8us type; 277 typedef Packet8us half; 278 enum { 279 Vectorizable = 1, 280 AlignedOnScalar = 1, 281 size = 8, 282 HasHalfPacket = 0, 283 284 HasAdd = 1, 285 HasSub = 1, 286 HasMul = 1, 287 HasDiv = 0, 288 HasBlend = 1 289 }; 290 }; 291 292 template <> 293 struct packet_traits<signed char> : default_packet_traits { 294 typedef Packet16c type; 295 typedef Packet16c half; 296 enum { 297 Vectorizable = 1, 298 AlignedOnScalar = 1, 299 size = 16, 300 HasHalfPacket = 0, 301 302 HasAdd = 1, 303 HasSub = 1, 304 HasMul = 1, 305 HasDiv = 0, 306 HasBlend = 1 307 }; 308 }; 309 310 template <> 311 struct packet_traits<unsigned char> : default_packet_traits { 312 typedef Packet16uc type; 313 typedef Packet16uc half; 314 enum { 315 Vectorizable = 1, 316 AlignedOnScalar = 1, 317 size = 16, 318 HasHalfPacket = 0, 319 320 HasAdd = 1, 321 HasSub = 1, 322 HasMul = 1, 323 HasDiv = 0, 324 HasBlend = 1 325 }; 326 }; 327 328 template<> struct unpacket_traits<Packet4f> 329 { 330 typedef float type; 331 typedef Packet4f half; 332 typedef Packet4i integer_packet; 333 enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 334 }; 335 template<> struct unpacket_traits<Packet4i> 336 { 337 typedef int type; 338 typedef Packet4i half; 339 enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 340 }; 341 template<> struct unpacket_traits<Packet8s> 342 { 343 typedef short int type; 344 typedef Packet8s half; 345 enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 346 }; 347 template<> struct unpacket_traits<Packet8us> 348 { 349 typedef unsigned short int type; 350 typedef Packet8us half; 351 enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 352 }; 353 354 template<> struct unpacket_traits<Packet16c> 355 { 356 typedef signed char type; 357 typedef Packet16c half; 358 enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 359 }; 360 template<> struct unpacket_traits<Packet16uc> 361 { 362 typedef unsigned char type; 363 typedef Packet16uc half; 364 enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 365 }; 366 367 template<> struct unpacket_traits<Packet8bf> 368 { 369 typedef bfloat16 type; 370 typedef Packet8bf half; 371 enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; 372 }; 373 inline std::ostream & operator <<(std::ostream & s, const Packet16c & v) 374 { 375 union { 376 Packet16c v; 377 signed char n[16]; 378 } vt; 379 vt.v = v; 380 for (int i=0; i< 16; i++) 381 s << vt.n[i] << ", "; 382 return s; 383 } 384 385 inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) 386 { 387 union { 388 Packet16uc v; 389 unsigned char n[16]; 390 } vt; 391 vt.v = v; 392 for (int i=0; i< 16; i++) 393 s << vt.n[i] << ", "; 394 return s; 395 } 396 397 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) 398 { 399 union { 400 Packet4f v; 401 float n[4]; 402 } vt; 403 vt.v = v; 404 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 405 return s; 406 } 407 408 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) 409 { 410 union { 411 Packet4i v; 412 int n[4]; 413 } vt; 414 vt.v = v; 415 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 416 return s; 417 } 418 419 inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) 420 { 421 union { 422 Packet4ui v; 423 unsigned int n[4]; 424 } vt; 425 vt.v = v; 426 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 427 return s; 428 } 429 430 template <typename Packet> 431 EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) 432 { 433 // some versions of GCC throw "unused-but-set-parameter". 434 // ignoring these warnings for now. 435 EIGEN_UNUSED_VARIABLE(from); 436 EIGEN_DEBUG_ALIGNED_LOAD 437 #ifdef __VSX__ 438 return vec_xl(0, from); 439 #else 440 return vec_ld(0, from); 441 #endif 442 } 443 444 // Need to define them first or we get specialization after instantiation errors 445 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) 446 { 447 return pload_common<Packet4f>(from); 448 } 449 450 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) 451 { 452 return pload_common<Packet4i>(from); 453 } 454 455 template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) 456 { 457 return pload_common<Packet8s>(from); 458 } 459 460 template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) 461 { 462 return pload_common<Packet8us>(from); 463 } 464 465 template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) 466 { 467 return pload_common<Packet16c>(from); 468 } 469 470 template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) 471 { 472 return pload_common<Packet16uc>(from); 473 } 474 475 template<> EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) 476 { 477 return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from)); 478 } 479 480 template <typename Packet> 481 EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet)* to, const Packet& from){ 482 // some versions of GCC throw "unused-but-set-parameter" (float *to). 483 // ignoring these warnings for now. 484 EIGEN_UNUSED_VARIABLE(to); 485 EIGEN_DEBUG_ALIGNED_STORE 486 #ifdef __VSX__ 487 vec_xst(from, 0, to); 488 #else 489 vec_st(from, 0, to); 490 #endif 491 } 492 493 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) 494 { 495 pstore_common<Packet4f>(to, from); 496 } 497 498 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) 499 { 500 pstore_common<Packet4i>(to, from); 501 } 502 503 template<> EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) 504 { 505 pstore_common<Packet8s>(to, from); 506 } 507 508 template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) 509 { 510 pstore_common<Packet8us>(to, from); 511 } 512 513 template<> EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) 514 { 515 pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from); 516 } 517 518 template<> EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) 519 { 520 pstore_common<Packet16c>(to, from); 521 } 522 523 template<> EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) 524 { 525 pstore_common<Packet16uc>(to, from); 526 } 527 528 template<typename Packet> 529 EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet)& from) 530 { 531 Packet v = {from, from, from, from}; 532 return v; 533 } 534 535 template<typename Packet> 536 EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet)& from) 537 { 538 Packet v = {from, from, from, from, from, from, from, from}; 539 return v; 540 } 541 542 template<typename Packet> 543 EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet)& from) 544 { 545 Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from}; 546 return v; 547 } 548 549 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { 550 return pset1_size4<Packet4f>(from); 551 } 552 553 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { 554 return pset1_size4<Packet4i>(from); 555 } 556 557 template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) { 558 return pset1_size8<Packet8s>(from); 559 } 560 561 template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) { 562 return pset1_size8<Packet8us>(from); 563 } 564 565 template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) { 566 return pset1_size16<Packet16c>(from); 567 } 568 569 template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) { 570 return pset1_size16<Packet16uc>(from); 571 } 572 573 template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { 574 return reinterpret_cast<Packet4f>(pset1<Packet4i>(from)); 575 } 576 577 template<> EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) { 578 return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from)); 579 } 580 581 template<typename Packet> EIGEN_STRONG_INLINE void 582 pbroadcast4_common(const __UNPACK_TYPE__(Packet) *a, 583 Packet& a0, Packet& a1, Packet& a2, Packet& a3) 584 { 585 a3 = pload<Packet>(a); 586 a0 = vec_splat(a3, 0); 587 a1 = vec_splat(a3, 1); 588 a2 = vec_splat(a3, 2); 589 a3 = vec_splat(a3, 3); 590 } 591 592 template<> EIGEN_STRONG_INLINE void 593 pbroadcast4<Packet4f>(const float *a, 594 Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) 595 { 596 pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3); 597 } 598 template<> EIGEN_STRONG_INLINE void 599 pbroadcast4<Packet4i>(const int *a, 600 Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) 601 { 602 pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3); 603 } 604 605 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_common(const __UNPACK_TYPE__(Packet)* from, Index stride) 606 { 607 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; 608 a[0] = from[0*stride]; 609 a[1] = from[1*stride]; 610 a[2] = from[2*stride]; 611 a[3] = from[3*stride]; 612 return pload<Packet>(a); 613 } 614 615 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) 616 { 617 return pgather_common<Packet4f>(from, stride); 618 } 619 620 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) 621 { 622 return pgather_common<Packet4i>(from, stride); 623 } 624 625 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size8(const __UNPACK_TYPE__(Packet)* from, Index stride) 626 { 627 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; 628 a[0] = from[0*stride]; 629 a[1] = from[1*stride]; 630 a[2] = from[2*stride]; 631 a[3] = from[3*stride]; 632 a[4] = from[4*stride]; 633 a[5] = from[5*stride]; 634 a[6] = from[6*stride]; 635 a[7] = from[7*stride]; 636 return pload<Packet>(a); 637 } 638 639 template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) 640 { 641 return pgather_size8<Packet8s>(from, stride); 642 } 643 644 template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride) 645 { 646 return pgather_size8<Packet8us>(from, stride); 647 } 648 649 template<> EIGEN_DEVICE_FUNC inline Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) 650 { 651 return pgather_size8<Packet8bf>(from, stride); 652 } 653 654 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather_size16(const __UNPACK_TYPE__(Packet)* from, Index stride) 655 { 656 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; 657 a[0] = from[0*stride]; 658 a[1] = from[1*stride]; 659 a[2] = from[2*stride]; 660 a[3] = from[3*stride]; 661 a[4] = from[4*stride]; 662 a[5] = from[5*stride]; 663 a[6] = from[6*stride]; 664 a[7] = from[7*stride]; 665 a[8] = from[8*stride]; 666 a[9] = from[9*stride]; 667 a[10] = from[10*stride]; 668 a[11] = from[11*stride]; 669 a[12] = from[12*stride]; 670 a[13] = from[13*stride]; 671 a[14] = from[14*stride]; 672 a[15] = from[15*stride]; 673 return pload<Packet>(a); 674 } 675 676 677 template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) 678 { 679 return pgather_size16<Packet16c>(from, stride); 680 } 681 682 template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from, Index stride) 683 { 684 return pgather_size16<Packet16uc>(from, stride); 685 } 686 687 template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size4(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) 688 { 689 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[4]; 690 pstore<__UNPACK_TYPE__(Packet)>(a, from); 691 to[0*stride] = a[0]; 692 to[1*stride] = a[1]; 693 to[2*stride] = a[2]; 694 to[3*stride] = a[3]; 695 } 696 697 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) 698 { 699 pscatter_size4<Packet4f>(to, from, stride); 700 } 701 702 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) 703 { 704 pscatter_size4<Packet4i>(to, from, stride); 705 } 706 707 template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size8(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) 708 { 709 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[8]; 710 pstore<__UNPACK_TYPE__(Packet)>(a, from); 711 to[0*stride] = a[0]; 712 to[1*stride] = a[1]; 713 to[2*stride] = a[2]; 714 to[3*stride] = a[3]; 715 to[4*stride] = a[4]; 716 to[5*stride] = a[5]; 717 to[6*stride] = a[6]; 718 to[7*stride] = a[7]; 719 } 720 721 722 template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride) 723 { 724 pscatter_size8<Packet8s>(to, from, stride); 725 } 726 727 template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride) 728 { 729 pscatter_size8<Packet8us>(to, from, stride); 730 } 731 732 template<> EIGEN_DEVICE_FUNC inline void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride) 733 { 734 pscatter_size8<Packet8bf>(to, from, stride); 735 } 736 737 template<typename Packet> EIGEN_DEVICE_FUNC inline void pscatter_size16(__UNPACK_TYPE__(Packet)* to, const Packet& from, Index stride) 738 { 739 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[16]; 740 pstore<__UNPACK_TYPE__(Packet)>(a, from); 741 to[0*stride] = a[0]; 742 to[1*stride] = a[1]; 743 to[2*stride] = a[2]; 744 to[3*stride] = a[3]; 745 to[4*stride] = a[4]; 746 to[5*stride] = a[5]; 747 to[6*stride] = a[6]; 748 to[7*stride] = a[7]; 749 to[8*stride] = a[8]; 750 to[9*stride] = a[9]; 751 to[10*stride] = a[10]; 752 to[11*stride] = a[11]; 753 to[12*stride] = a[12]; 754 to[13*stride] = a[13]; 755 to[14*stride] = a[14]; 756 to[15*stride] = a[15]; 757 } 758 759 template<> EIGEN_DEVICE_FUNC inline void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from, Index stride) 760 { 761 pscatter_size16<Packet16c>(to, from, stride); 762 } 763 764 template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned char, Packet16uc>(unsigned char* to, const Packet16uc& from, Index stride) 765 { 766 pscatter_size16<Packet16uc>(to, from, stride); 767 } 768 769 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; } 770 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; } 771 template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; } 772 template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; } 773 template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) { return pset1<Packet16c>(a) + p16c_COUNTDOWN; } 774 template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; } 775 776 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f> (const Packet4f& a, const Packet4f& b) { return a + b; } 777 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i> (const Packet4i& a, const Packet4i& b) { return a + b; } 778 template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui> (const Packet4ui& a, const Packet4ui& b) { return a + b; } 779 template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s> (const Packet8s& a, const Packet8s& b) { return a + b; } 780 template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us> (const Packet8us& a, const Packet8us& b) { return a + b; } 781 template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c> (const Packet16c& a, const Packet16c& b) { return a + b; } 782 template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; } 783 784 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f> (const Packet4f& a, const Packet4f& b) { return a - b; } 785 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i> (const Packet4i& a, const Packet4i& b) { return a - b; } 786 template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s> (const Packet8s& a, const Packet8s& b) { return a - b; } 787 template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us> (const Packet8us& a, const Packet8us& b) { return a - b; } 788 template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c> (const Packet16c& a, const Packet16c& b) { return a - b; } 789 template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; } 790 791 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } 792 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } 793 794 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } 795 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } 796 797 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f> (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } 798 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i> (const Packet4i& a, const Packet4i& b) { return a * b; } 799 template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s> (const Packet8s& a, const Packet8s& b) { return vec_mul(a,b); } 800 template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us> (const Packet8us& a, const Packet8us& b) { return vec_mul(a,b); } 801 template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c> (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); } 802 template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); } 803 804 805 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 806 { 807 #ifndef __VSX__ // VSX actually provides a div instruction 808 Packet4f t, y_0, y_1; 809 810 // Altivec does not offer a divide instruction, we have to do a reciprocal approximation 811 y_0 = vec_re(b); 812 813 // Do one Newton-Raphson iteration to get the needed accuracy 814 t = vec_nmsub(y_0, b, p4f_ONE); 815 y_1 = vec_madd(y_0, t, y_0); 816 817 return vec_madd(a, y_1, p4f_MZERO); 818 #else 819 return vec_div(a, b); 820 #endif 821 } 822 823 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 824 { eigen_assert(false && "packet integer division are not supported by AltiVec"); 825 return pset1<Packet4i>(0); 826 } 827 828 // for some weird raisons, it has to be overloaded for packet of integers 829 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } 830 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } 831 template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) { return vec_madd(a,b,c); } 832 template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) { return vec_madd(a,b,c); } 833 834 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) 835 { 836 #ifdef __VSX__ 837 // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN 838 Packet4f ret; 839 __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 840 return ret; 841 #else 842 return vec_min(a, b); 843 #endif 844 } 845 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } 846 template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); } 847 template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); } 848 template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); } 849 template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); } 850 851 852 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) 853 { 854 #ifdef __VSX__ 855 // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN 856 Packet4f ret; 857 __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 858 return ret; 859 #else 860 return vec_max(a, b); 861 #endif 862 } 863 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } 864 template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); } 865 template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); } 866 template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); } 867 template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); } 868 869 template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); } 870 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); } 871 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); } 872 template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast<Packet16c>(vec_cmpeq(a,b)); } 873 template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast<Packet16uc>(vec_cmpeq(a,b)); } 874 875 template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast<Packet8s>(vec_cmpeq(a,b)); } 876 template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast<Packet8us>(vec_cmpeq(a,b)); } 877 878 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { 879 Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b)); 880 return vec_nor(c,c); 881 } 882 template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); } 883 884 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } 885 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } 886 template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vec_and(a, b); } 887 template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_and(a, b); } 888 template<> EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 889 return pand<Packet8us>(a, b); 890 } 891 892 893 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } 894 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } 895 template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_or(a, b); } 896 template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_or(a, b); } 897 template<> EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 898 return por<Packet8us>(a, b); 899 } 900 901 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } 902 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } 903 template<> EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 904 return pxor<Packet8us>(a, b); 905 } 906 907 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } 908 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } 909 910 template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { 911 return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask)); 912 } 913 914 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { 915 Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a); 916 Packet4f res; 917 918 __asm__("vrfiz %0, %1\n\t" 919 : "=v" (res) 920 : "v" (t)); 921 922 return res; 923 } 924 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); } 925 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); } 926 927 template<typename Packet> EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet)* from) 928 { 929 EIGEN_DEBUG_ALIGNED_LOAD 930 #ifdef _BIG_ENDIAN 931 Packet16uc MSQ, LSQ; 932 Packet16uc mask; 933 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 934 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 935 mask = vec_lvsl(0, from); // create the permute mask 936 //TODO: Add static_cast here 937 return (Packet) vec_perm(MSQ, LSQ, mask); // align the data 938 #else 939 EIGEN_DEBUG_UNALIGNED_LOAD 940 return vec_xl(0, from); 941 #endif 942 } 943 944 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 945 { 946 return ploadu_common<Packet4f>(from); 947 } 948 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 949 { 950 return ploadu_common<Packet4i>(from); 951 } 952 template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) 953 { 954 return ploadu_common<Packet8s>(from); 955 } 956 template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) 957 { 958 return ploadu_common<Packet8us>(from); 959 } 960 template<> EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) 961 { 962 return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from)); 963 } 964 template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) 965 { 966 return ploadu_common<Packet16c>(from); 967 } 968 template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) 969 { 970 return ploadu_common<Packet16uc>(from); 971 } 972 973 template<typename Packet> EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet)* from) 974 { 975 Packet p; 976 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet>(from); 977 else p = ploadu<Packet>(from); 978 return vec_perm(p, p, p16uc_DUPLICATE32_HI); 979 } 980 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 981 { 982 return ploaddup_common<Packet4f>(from); 983 } 984 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 985 { 986 return ploaddup_common<Packet4i>(from); 987 } 988 989 template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) 990 { 991 Packet8s p; 992 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from); 993 else p = ploadu<Packet8s>(from); 994 return vec_perm(p, p, p16uc_DUPLICATE16_HI); 995 } 996 997 template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) 998 { 999 Packet8us p; 1000 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from); 1001 else p = ploadu<Packet8us>(from); 1002 return vec_perm(p, p, p16uc_DUPLICATE16_HI); 1003 } 1004 1005 template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) 1006 { 1007 Packet8s p; 1008 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8s>(from); 1009 else p = ploadu<Packet8s>(from); 1010 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); 1011 } 1012 1013 template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) 1014 { 1015 Packet8us p; 1016 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet8us>(from); 1017 else p = ploadu<Packet8us>(from); 1018 return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); 1019 } 1020 1021 template<> EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) 1022 { 1023 return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from)); 1024 } 1025 1026 template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) 1027 { 1028 Packet16c p; 1029 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16c>(from); 1030 else p = ploadu<Packet16c>(from); 1031 return vec_perm(p, p, p16uc_DUPLICATE8_HI); 1032 } 1033 1034 template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) 1035 { 1036 Packet16uc p; 1037 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet16uc>(from); 1038 else p = ploadu<Packet16uc>(from); 1039 return vec_perm(p, p, p16uc_DUPLICATE8_HI); 1040 } 1041 1042 template<typename Packet> EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet)* to, const Packet& from) 1043 { 1044 EIGEN_DEBUG_UNALIGNED_STORE 1045 #ifdef _BIG_ENDIAN 1046 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 1047 // Warning: not thread safe! 1048 Packet16uc MSQ, LSQ, edges; 1049 Packet16uc edgeAlign, align; 1050 1051 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 1052 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 1053 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 1054 edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges 1055 align = vec_lvsr( 0, to ); // permute map to misalign data 1056 MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) 1057 LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) 1058 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 1059 #else 1060 vec_xst(from, 0, to); 1061 #endif 1062 } 1063 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 1064 { 1065 pstoreu_common<Packet4f>(to, from); 1066 } 1067 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 1068 { 1069 pstoreu_common<Packet4i>(to, from); 1070 } 1071 template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) 1072 { 1073 pstoreu_common<Packet8s>(to, from); 1074 } 1075 template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) 1076 { 1077 pstoreu_common<Packet8us>(to, from); 1078 } 1079 template<> EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) 1080 { 1081 pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from); 1082 } 1083 template<> EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) 1084 { 1085 pstoreu_common<Packet16c>(to, from); 1086 } 1087 template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) 1088 { 1089 pstoreu_common<Packet16uc>(to, from); 1090 } 1091 1092 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); } 1093 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); } 1094 1095 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; } 1096 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; } 1097 1098 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) { 1099 EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x; 1100 vec_ste(a, 0, &x); 1101 return x; 1102 } 1103 1104 template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) { 1105 return pfirst_common<Packet8s>(a); 1106 } 1107 1108 template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) { 1109 return pfirst_common<Packet8us>(a); 1110 } 1111 1112 template<> EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) 1113 { 1114 return pfirst_common<Packet16c>(a); 1115 } 1116 1117 template<> EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) 1118 { 1119 return pfirst_common<Packet16uc>(a); 1120 } 1121 1122 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) 1123 { 1124 return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); 1125 } 1126 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) 1127 { 1128 return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); 1129 } 1130 template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) 1131 { 1132 return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16)); 1133 } 1134 template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) 1135 { 1136 return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16)); 1137 } 1138 template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) 1139 { 1140 return vec_perm(a, a, p16uc_REVERSE8); 1141 } 1142 template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) 1143 { 1144 return vec_perm(a, a, p16uc_REVERSE8); 1145 } 1146 template<> EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) 1147 { 1148 return preverse<Packet8us>(a); 1149 } 1150 1151 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } 1152 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } 1153 template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); } 1154 template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } 1155 template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); } 1156 template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; } 1157 template<> EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) { 1158 _EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask,0x7FFF); 1159 return pand<Packet8us>(p8us_abs_mask, a); 1160 } 1161 1162 template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) 1163 { return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); } 1164 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) 1165 { return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); } 1166 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) 1167 { return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); } 1168 template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_left(Packet4f a) 1169 { 1170 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); 1171 Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask); 1172 return reinterpret_cast<Packet4f>(r); 1173 } 1174 1175 template<int N> EIGEN_STRONG_INLINE Packet4f plogical_shift_right(Packet4f a) 1176 { 1177 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); 1178 Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask); 1179 return reinterpret_cast<Packet4f>(r); 1180 } 1181 1182 template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) 1183 { 1184 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); 1185 return vec_sr(a, p4ui_mask); 1186 } 1187 1188 template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) 1189 { 1190 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N); 1191 return vec_sl(a, p4ui_mask); 1192 } 1193 1194 template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) 1195 { 1196 const _EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N); 1197 return vec_sl(a, p8us_mask); 1198 } 1199 1200 EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf){ 1201 return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val)); 1202 } 1203 1204 EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf){ 1205 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000); 1206 return pand<Packet4f>( 1207 reinterpret_cast<Packet4f>(bf.m_val), 1208 reinterpret_cast<Packet4f>(p4ui_high_mask) 1209 ); 1210 } 1211 1212 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){ 1213 Packet4ui input = reinterpret_cast<Packet4ui>(p4f); 1214 Packet4ui lsb = plogical_shift_right<16>(input); 1215 lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE)); 1216 1217 _EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS,0x7FFFu); 1218 Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS); 1219 input = padd<Packet4ui>(input, rounding_bias); 1220 1221 //Test NaN and Subnormal - Begin 1222 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000); 1223 Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f)); 1224 1225 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF); 1226 Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f)); 1227 1228 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(max_exp, 0x7F800000); 1229 Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp); 1230 Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO)); 1231 1232 Packet4bi is_mant_not_zero = vec_cmpne(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO)); 1233 Packet4ui nan_selector = pand<Packet4ui>( 1234 reinterpret_cast<Packet4ui>(is_max_exp), 1235 reinterpret_cast<Packet4ui>(is_mant_not_zero) 1236 ); 1237 1238 Packet4ui subnormal_selector = pand<Packet4ui>( 1239 reinterpret_cast<Packet4ui>(is_zero_exp), 1240 reinterpret_cast<Packet4ui>(is_mant_not_zero) 1241 ); 1242 1243 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000); 1244 input = vec_sel(input, p4ui_nan, nan_selector); 1245 input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector); 1246 //Test NaN and Subnormal - End 1247 1248 input = plogical_shift_right<16>(input); 1249 return reinterpret_cast<Packet8us>(input); 1250 } 1251 1252 EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd){ 1253 Packet4f bf_odd, bf_even; 1254 bf_odd = reinterpret_cast<Packet4f>(F32ToBf16(odd).m_val); 1255 bf_odd = plogical_shift_left<16>(bf_odd); 1256 bf_even = reinterpret_cast<Packet4f>(F32ToBf16(even).m_val); 1257 return reinterpret_cast<Packet8us>(por<Packet4f>(bf_even, bf_odd)); 1258 } 1259 #define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \ 1260 Packet4f a_even = Bf16ToF32Even(A);\ 1261 Packet4f a_odd = Bf16ToF32Odd(A);\ 1262 Packet4f op_even = OP(a_even);\ 1263 Packet4f op_odd = OP(a_odd);\ 1264 return F32ToBf16(op_even, op_odd);\ 1265 1266 #define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \ 1267 Packet4f a_even = Bf16ToF32Even(A);\ 1268 Packet4f a_odd = Bf16ToF32Odd(A);\ 1269 Packet4f b_even = Bf16ToF32Even(B);\ 1270 Packet4f b_odd = Bf16ToF32Odd(B);\ 1271 Packet4f op_even = OP(a_even, b_even);\ 1272 Packet4f op_odd = OP(a_odd, b_odd);\ 1273 return F32ToBf16(op_even, op_odd);\ 1274 1275 template<> EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1276 BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b); 1277 } 1278 1279 template<> EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1280 BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b); 1281 } 1282 1283 template<> EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1284 BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b); 1285 } 1286 1287 template<> EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) { 1288 BF16_TO_F32_UNARY_OP_WRAPPER(pnegate<Packet4f>, a); 1289 } 1290 1291 template<> EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1292 BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b); 1293 } 1294 1295 template<> EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf> (const Packet8bf& a){ 1296 BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); 1297 } 1298 template<> EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf> (const Packet8bf& a){ 1299 BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a); 1300 } 1301 template<> EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf> (const Packet8bf& a){ 1302 BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); 1303 } 1304 template<> EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf> (const Packet8bf& a){ 1305 BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a); 1306 } 1307 template<> EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf> (const Packet8bf& a){ 1308 BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a); 1309 } 1310 template<> EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf> (const Packet8bf& a){ 1311 BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a); 1312 } 1313 template<> EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf> (const Packet8bf& a){ 1314 BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a); 1315 } 1316 template<> EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf> (const Packet8bf& a){ 1317 BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a); 1318 } 1319 template<> EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf> (const Packet8bf& a){ 1320 BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a); 1321 } 1322 template<> EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) { 1323 Packet4f a_even = Bf16ToF32Even(a); 1324 Packet4f a_odd = Bf16ToF32Odd(a); 1325 Packet4f b_even = Bf16ToF32Even(b); 1326 Packet4f b_odd = Bf16ToF32Odd(b); 1327 Packet4f c_even = Bf16ToF32Even(c); 1328 Packet4f c_odd = Bf16ToF32Odd(c); 1329 Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even); 1330 Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd); 1331 return F32ToBf16(pmadd_even, pmadd_odd); 1332 } 1333 1334 template<> EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1335 BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b); 1336 } 1337 1338 template<> EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) { 1339 BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b); 1340 } 1341 1342 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) { 1343 BF16_TO_F32_BINARY_OP_WRAPPER(pcmp_lt<Packet4f>, a, b); 1344 } 1345 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) { 1346 BF16_TO_F32_BINARY_OP_WRAPPER(pcmp_le<Packet4f>, a, b); 1347 } 1348 template<> EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) { 1349 BF16_TO_F32_BINARY_OP_WRAPPER(pcmp_eq<Packet4f>, a, b); 1350 } 1351 1352 template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) { 1353 return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a))); 1354 } 1355 1356 template<> EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) 1357 { 1358 return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from)); 1359 } 1360 1361 template<> EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) { 1362 bfloat16 countdown[8] = { bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3), 1363 bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7) }; 1364 return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown)); 1365 } 1366 1367 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) { 1368 return pfrexp_float(a,exponent); 1369 } 1370 1371 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) { 1372 return pldexp_float(a,exponent); 1373 } 1374 1375 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 1376 { 1377 Packet4f b, sum; 1378 b = vec_sld(a, a, 8); 1379 sum = a + b; 1380 b = vec_sld(sum, sum, 4); 1381 sum += b; 1382 return pfirst(sum); 1383 } 1384 1385 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 1386 { 1387 Packet4i sum; 1388 sum = vec_sums(a, p4i_ZERO); 1389 #ifdef _BIG_ENDIAN 1390 sum = vec_sld(sum, p4i_ZERO, 12); 1391 #else 1392 sum = vec_sld(p4i_ZERO, sum, 4); 1393 #endif 1394 return pfirst(sum); 1395 } 1396 1397 template<> EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) 1398 { 1399 float redux_even = predux<Packet4f>(Bf16ToF32Even(a)); 1400 float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a)); 1401 float f32_result = redux_even + redux_odd; 1402 return bfloat16(f32_result); 1403 } 1404 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) 1405 { 1406 union{ 1407 Packet v; 1408 __UNPACK_TYPE__(Packet) n[8]; 1409 } vt; 1410 vt.v = a; 1411 1412 EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; 1413 EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; 1414 Packet4i first_half = pload<Packet4i>(first_loader); 1415 Packet4i second_half = pload<Packet4i>(second_loader); 1416 1417 return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half)); 1418 } 1419 1420 template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) 1421 { 1422 return predux_size8<Packet8s>(a); 1423 } 1424 1425 template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) 1426 { 1427 return predux_size8<Packet8us>(a); 1428 } 1429 1430 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) 1431 { 1432 union{ 1433 Packet v; 1434 __UNPACK_TYPE__(Packet) n[16]; 1435 } vt; 1436 vt.v = a; 1437 1438 EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; 1439 EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; 1440 EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] }; 1441 EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] }; 1442 1443 Packet4i first_quarter = pload<Packet4i>(first_loader); 1444 Packet4i second_quarter = pload<Packet4i>(second_loader); 1445 Packet4i third_quarter = pload<Packet4i>(third_loader); 1446 Packet4i fourth_quarter = pload<Packet4i>(fourth_loader); 1447 1448 return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) 1449 + predux(third_quarter) + predux(fourth_quarter)); 1450 } 1451 1452 template<> EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) 1453 { 1454 return predux_size16<Packet16c>(a); 1455 } 1456 1457 template<> EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) 1458 { 1459 return predux_size16<Packet16uc>(a); 1460 } 1461 1462 // Other reduction functions: 1463 // mul 1464 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 1465 { 1466 Packet4f prod; 1467 prod = pmul(a, vec_sld(a, a, 8)); 1468 return pfirst(pmul(prod, vec_sld(prod, prod, 4))); 1469 } 1470 1471 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) 1472 { 1473 EIGEN_ALIGN16 int aux[4]; 1474 pstore(aux, a); 1475 return aux[0] * aux[1] * aux[2] * aux[3]; 1476 } 1477 1478 template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) 1479 { 1480 Packet8s pair, quad, octo; 1481 1482 pair = vec_mul(a, vec_sld(a, a, 8)); 1483 quad = vec_mul(pair, vec_sld(pair, pair, 4)); 1484 octo = vec_mul(quad, vec_sld(quad, quad, 2)); 1485 1486 return pfirst(octo); 1487 } 1488 1489 template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) 1490 { 1491 Packet8us pair, quad, octo; 1492 1493 pair = vec_mul(a, vec_sld(a, a, 8)); 1494 quad = vec_mul(pair, vec_sld(pair, pair, 4)); 1495 octo = vec_mul(quad, vec_sld(quad, quad, 2)); 1496 1497 return pfirst(octo); 1498 } 1499 1500 template<> EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) 1501 { 1502 float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a)); 1503 float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a)); 1504 float f32_result = redux_even * redux_odd; 1505 return bfloat16(f32_result); 1506 } 1507 1508 1509 template<> EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) 1510 { 1511 Packet16c pair, quad, octo, result; 1512 1513 pair = vec_mul(a, vec_sld(a, a, 8)); 1514 quad = vec_mul(pair, vec_sld(pair, pair, 4)); 1515 octo = vec_mul(quad, vec_sld(quad, quad, 2)); 1516 result = vec_mul(octo, vec_sld(octo, octo, 1)); 1517 1518 return pfirst(result); 1519 } 1520 1521 template<> EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) 1522 { 1523 Packet16uc pair, quad, octo, result; 1524 1525 pair = vec_mul(a, vec_sld(a, a, 8)); 1526 quad = vec_mul(pair, vec_sld(pair, pair, 4)); 1527 octo = vec_mul(quad, vec_sld(quad, quad, 2)); 1528 result = vec_mul(octo, vec_sld(octo, octo, 1)); 1529 1530 return pfirst(result); 1531 } 1532 1533 // min 1534 template<typename Packet> EIGEN_STRONG_INLINE 1535 __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) 1536 { 1537 Packet b, res; 1538 b = vec_min(a, vec_sld(a, a, 8)); 1539 res = vec_min(b, vec_sld(b, b, 4)); 1540 return pfirst(res); 1541 } 1542 1543 1544 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) 1545 { 1546 return predux_min4<Packet4f>(a); 1547 } 1548 1549 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) 1550 { 1551 return predux_min4<Packet4i>(a); 1552 } 1553 1554 template<> EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) 1555 { 1556 float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a)); 1557 float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a)); 1558 float f32_result = (std::min)(redux_even, redux_odd); 1559 return bfloat16(f32_result); 1560 } 1561 1562 template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) 1563 { 1564 Packet8s pair, quad, octo; 1565 1566 //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } 1567 pair = vec_min(a, vec_sld(a, a, 8)); 1568 1569 //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } 1570 quad = vec_min(pair, vec_sld(pair, pair, 4)); 1571 1572 //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } 1573 octo = vec_min(quad, vec_sld(quad, quad, 2)); 1574 return pfirst(octo); 1575 } 1576 1577 template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) 1578 { 1579 Packet8us pair, quad, octo; 1580 1581 //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } 1582 pair = vec_min(a, vec_sld(a, a, 8)); 1583 1584 //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } 1585 quad = vec_min(pair, vec_sld(pair, pair, 4)); 1586 1587 //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } 1588 octo = vec_min(quad, vec_sld(quad, quad, 2)); 1589 return pfirst(octo); 1590 } 1591 1592 template<> EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) 1593 { 1594 Packet16c pair, quad, octo, result; 1595 1596 pair = vec_min(a, vec_sld(a, a, 8)); 1597 quad = vec_min(pair, vec_sld(pair, pair, 4)); 1598 octo = vec_min(quad, vec_sld(quad, quad, 2)); 1599 result = vec_min(octo, vec_sld(octo, octo, 1)); 1600 1601 return pfirst(result); 1602 } 1603 1604 template<> EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) 1605 { 1606 Packet16uc pair, quad, octo, result; 1607 1608 pair = vec_min(a, vec_sld(a, a, 8)); 1609 quad = vec_min(pair, vec_sld(pair, pair, 4)); 1610 octo = vec_min(quad, vec_sld(quad, quad, 2)); 1611 result = vec_min(octo, vec_sld(octo, octo, 1)); 1612 1613 return pfirst(result); 1614 } 1615 // max 1616 template<typename Packet> EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) 1617 { 1618 Packet b, res; 1619 b = vec_max(a, vec_sld(a, a, 8)); 1620 res = vec_max(b, vec_sld(b, b, 4)); 1621 return pfirst(res); 1622 } 1623 1624 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) 1625 { 1626 return predux_max4<Packet4f>(a); 1627 } 1628 1629 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) 1630 { 1631 return predux_max4<Packet4i>(a); 1632 } 1633 1634 template<> EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) 1635 { 1636 float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a)); 1637 float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a)); 1638 float f32_result = (std::max)(redux_even, redux_odd); 1639 return bfloat16(f32_result); 1640 } 1641 1642 template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) 1643 { 1644 Packet8s pair, quad, octo; 1645 1646 //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } 1647 pair = vec_max(a, vec_sld(a, a, 8)); 1648 1649 //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } 1650 quad = vec_max(pair, vec_sld(pair, pair, 4)); 1651 1652 //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } 1653 octo = vec_max(quad, vec_sld(quad, quad, 2)); 1654 return pfirst(octo); 1655 } 1656 1657 template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) 1658 { 1659 Packet8us pair, quad, octo; 1660 1661 //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } 1662 pair = vec_max(a, vec_sld(a, a, 8)); 1663 1664 //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } 1665 quad = vec_max(pair, vec_sld(pair, pair, 4)); 1666 1667 //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } 1668 octo = vec_max(quad, vec_sld(quad, quad, 2)); 1669 return pfirst(octo); 1670 } 1671 1672 template<> EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) 1673 { 1674 Packet16c pair, quad, octo, result; 1675 1676 pair = vec_max(a, vec_sld(a, a, 8)); 1677 quad = vec_max(pair, vec_sld(pair, pair, 4)); 1678 octo = vec_max(quad, vec_sld(quad, quad, 2)); 1679 result = vec_max(octo, vec_sld(octo, octo, 1)); 1680 1681 return pfirst(result); 1682 } 1683 1684 template<> EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) 1685 { 1686 Packet16uc pair, quad, octo, result; 1687 1688 pair = vec_max(a, vec_sld(a, a, 8)); 1689 quad = vec_max(pair, vec_sld(pair, pair, 4)); 1690 octo = vec_max(quad, vec_sld(quad, quad, 2)); 1691 result = vec_max(octo, vec_sld(octo, octo, 1)); 1692 1693 return pfirst(result); 1694 } 1695 1696 template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) 1697 { 1698 return vec_any_ne(x, pzero(x)); 1699 } 1700 1701 template <typename T> EIGEN_DEVICE_FUNC inline void 1702 ptranpose_common(PacketBlock<T,4>& kernel){ 1703 T t0, t1, t2, t3; 1704 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1705 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1706 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1707 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1708 kernel.packet[0] = vec_mergeh(t0, t2); 1709 kernel.packet[1] = vec_mergel(t0, t2); 1710 kernel.packet[2] = vec_mergeh(t1, t3); 1711 kernel.packet[3] = vec_mergel(t1, t3); 1712 } 1713 1714 EIGEN_DEVICE_FUNC inline void 1715 ptranspose(PacketBlock<Packet4f,4>& kernel) { 1716 ptranpose_common<Packet4f>(kernel); 1717 } 1718 1719 EIGEN_DEVICE_FUNC inline void 1720 ptranspose(PacketBlock<Packet4i,4>& kernel) { 1721 ptranpose_common<Packet4i>(kernel); 1722 } 1723 1724 EIGEN_DEVICE_FUNC inline void 1725 ptranspose(PacketBlock<Packet8s,4>& kernel) { 1726 Packet8s t0, t1, t2, t3; 1727 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1728 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1729 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1730 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1731 kernel.packet[0] = vec_mergeh(t0, t2); 1732 kernel.packet[1] = vec_mergel(t0, t2); 1733 kernel.packet[2] = vec_mergeh(t1, t3); 1734 kernel.packet[3] = vec_mergel(t1, t3); 1735 } 1736 1737 EIGEN_DEVICE_FUNC inline void 1738 ptranspose(PacketBlock<Packet8us,4>& kernel) { 1739 Packet8us t0, t1, t2, t3; 1740 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1741 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1742 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1743 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1744 kernel.packet[0] = vec_mergeh(t0, t2); 1745 kernel.packet[1] = vec_mergel(t0, t2); 1746 kernel.packet[2] = vec_mergeh(t1, t3); 1747 kernel.packet[3] = vec_mergel(t1, t3); 1748 } 1749 1750 1751 EIGEN_DEVICE_FUNC inline void 1752 ptranspose(PacketBlock<Packet8bf,4>& kernel) { 1753 Packet8us t0, t1, t2, t3; 1754 1755 t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val); 1756 t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val); 1757 t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val); 1758 t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val); 1759 kernel.packet[0] = vec_mergeh(t0, t2); 1760 kernel.packet[1] = vec_mergel(t0, t2); 1761 kernel.packet[2] = vec_mergeh(t1, t3); 1762 kernel.packet[3] = vec_mergel(t1, t3); 1763 } 1764 1765 EIGEN_DEVICE_FUNC inline void 1766 ptranspose(PacketBlock<Packet16c,4>& kernel) { 1767 Packet16c t0, t1, t2, t3; 1768 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1769 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1770 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1771 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1772 kernel.packet[0] = vec_mergeh(t0, t2); 1773 kernel.packet[1] = vec_mergel(t0, t2); 1774 kernel.packet[2] = vec_mergeh(t1, t3); 1775 kernel.packet[3] = vec_mergel(t1, t3); 1776 } 1777 1778 1779 EIGEN_DEVICE_FUNC inline void 1780 ptranspose(PacketBlock<Packet16uc,4>& kernel) { 1781 Packet16uc t0, t1, t2, t3; 1782 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 1783 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 1784 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 1785 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 1786 kernel.packet[0] = vec_mergeh(t0, t2); 1787 kernel.packet[1] = vec_mergel(t0, t2); 1788 kernel.packet[2] = vec_mergeh(t1, t3); 1789 kernel.packet[3] = vec_mergel(t1, t3); 1790 } 1791 1792 EIGEN_DEVICE_FUNC inline void 1793 ptranspose(PacketBlock<Packet8s,8>& kernel) { 1794 Packet8s v[8], sum[8]; 1795 1796 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); 1797 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); 1798 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); 1799 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); 1800 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]); 1801 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); 1802 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); 1803 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); 1804 sum[0] = vec_mergeh(v[0], v[4]); 1805 sum[1] = vec_mergel(v[0], v[4]); 1806 sum[2] = vec_mergeh(v[1], v[5]); 1807 sum[3] = vec_mergel(v[1], v[5]); 1808 sum[4] = vec_mergeh(v[2], v[6]); 1809 sum[5] = vec_mergel(v[2], v[6]); 1810 sum[6] = vec_mergeh(v[3], v[7]); 1811 sum[7] = vec_mergel(v[3], v[7]); 1812 1813 kernel.packet[0] = vec_mergeh(sum[0], sum[4]); 1814 kernel.packet[1] = vec_mergel(sum[0], sum[4]); 1815 kernel.packet[2] = vec_mergeh(sum[1], sum[5]); 1816 kernel.packet[3] = vec_mergel(sum[1], sum[5]); 1817 kernel.packet[4] = vec_mergeh(sum[2], sum[6]); 1818 kernel.packet[5] = vec_mergel(sum[2], sum[6]); 1819 kernel.packet[6] = vec_mergeh(sum[3], sum[7]); 1820 kernel.packet[7] = vec_mergel(sum[3], sum[7]); 1821 } 1822 1823 EIGEN_DEVICE_FUNC inline void 1824 ptranspose(PacketBlock<Packet8us,8>& kernel) { 1825 Packet8us v[8], sum[8]; 1826 1827 v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); 1828 v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); 1829 v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); 1830 v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); 1831 v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]); 1832 v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); 1833 v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); 1834 v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); 1835 sum[0] = vec_mergeh(v[0], v[4]); 1836 sum[1] = vec_mergel(v[0], v[4]); 1837 sum[2] = vec_mergeh(v[1], v[5]); 1838 sum[3] = vec_mergel(v[1], v[5]); 1839 sum[4] = vec_mergeh(v[2], v[6]); 1840 sum[5] = vec_mergel(v[2], v[6]); 1841 sum[6] = vec_mergeh(v[3], v[7]); 1842 sum[7] = vec_mergel(v[3], v[7]); 1843 1844 kernel.packet[0] = vec_mergeh(sum[0], sum[4]); 1845 kernel.packet[1] = vec_mergel(sum[0], sum[4]); 1846 kernel.packet[2] = vec_mergeh(sum[1], sum[5]); 1847 kernel.packet[3] = vec_mergel(sum[1], sum[5]); 1848 kernel.packet[4] = vec_mergeh(sum[2], sum[6]); 1849 kernel.packet[5] = vec_mergel(sum[2], sum[6]); 1850 kernel.packet[6] = vec_mergeh(sum[3], sum[7]); 1851 kernel.packet[7] = vec_mergel(sum[3], sum[7]); 1852 } 1853 1854 EIGEN_DEVICE_FUNC inline void 1855 ptranspose(PacketBlock<Packet8bf,8>& kernel) { 1856 Packet8bf v[8], sum[8]; 1857 1858 v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val); 1859 v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val); 1860 v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val); 1861 v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val); 1862 v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val); 1863 v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val); 1864 v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val); 1865 v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val); 1866 sum[0] = vec_mergeh(v[0].m_val, v[4].m_val); 1867 sum[1] = vec_mergel(v[0].m_val, v[4].m_val); 1868 sum[2] = vec_mergeh(v[1].m_val, v[5].m_val); 1869 sum[3] = vec_mergel(v[1].m_val, v[5].m_val); 1870 sum[4] = vec_mergeh(v[2].m_val, v[6].m_val); 1871 sum[5] = vec_mergel(v[2].m_val, v[6].m_val); 1872 sum[6] = vec_mergeh(v[3].m_val, v[7].m_val); 1873 sum[7] = vec_mergel(v[3].m_val, v[7].m_val); 1874 1875 kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val); 1876 kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val); 1877 kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val); 1878 kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val); 1879 kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val); 1880 kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val); 1881 kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val); 1882 kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val); 1883 } 1884 1885 EIGEN_DEVICE_FUNC inline void 1886 ptranspose(PacketBlock<Packet16c,16>& kernel) { 1887 Packet16c step1[16], step2[16], step3[16]; 1888 1889 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); 1890 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); 1891 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); 1892 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); 1893 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]); 1894 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); 1895 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); 1896 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); 1897 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]); 1898 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); 1899 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); 1900 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); 1901 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]); 1902 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); 1903 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); 1904 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); 1905 1906 step2[0] = vec_mergeh(step1[0], step1[8]); 1907 step2[1] = vec_mergel(step1[0], step1[8]); 1908 step2[2] = vec_mergeh(step1[1], step1[9]); 1909 step2[3] = vec_mergel(step1[1], step1[9]); 1910 step2[4] = vec_mergeh(step1[2], step1[10]); 1911 step2[5] = vec_mergel(step1[2], step1[10]); 1912 step2[6] = vec_mergeh(step1[3], step1[11]); 1913 step2[7] = vec_mergel(step1[3], step1[11]); 1914 step2[8] = vec_mergeh(step1[4], step1[12]); 1915 step2[9] = vec_mergel(step1[4], step1[12]); 1916 step2[10] = vec_mergeh(step1[5], step1[13]); 1917 step2[11] = vec_mergel(step1[5], step1[13]); 1918 step2[12] = vec_mergeh(step1[6], step1[14]); 1919 step2[13] = vec_mergel(step1[6], step1[14]); 1920 step2[14] = vec_mergeh(step1[7], step1[15]); 1921 step2[15] = vec_mergel(step1[7], step1[15]); 1922 1923 step3[0] = vec_mergeh(step2[0], step2[8]); 1924 step3[1] = vec_mergel(step2[0], step2[8]); 1925 step3[2] = vec_mergeh(step2[1], step2[9]); 1926 step3[3] = vec_mergel(step2[1], step2[9]); 1927 step3[4] = vec_mergeh(step2[2], step2[10]); 1928 step3[5] = vec_mergel(step2[2], step2[10]); 1929 step3[6] = vec_mergeh(step2[3], step2[11]); 1930 step3[7] = vec_mergel(step2[3], step2[11]); 1931 step3[8] = vec_mergeh(step2[4], step2[12]); 1932 step3[9] = vec_mergel(step2[4], step2[12]); 1933 step3[10] = vec_mergeh(step2[5], step2[13]); 1934 step3[11] = vec_mergel(step2[5], step2[13]); 1935 step3[12] = vec_mergeh(step2[6], step2[14]); 1936 step3[13] = vec_mergel(step2[6], step2[14]); 1937 step3[14] = vec_mergeh(step2[7], step2[15]); 1938 step3[15] = vec_mergel(step2[7], step2[15]); 1939 1940 kernel.packet[0] = vec_mergeh(step3[0], step3[8]); 1941 kernel.packet[1] = vec_mergel(step3[0], step3[8]); 1942 kernel.packet[2] = vec_mergeh(step3[1], step3[9]); 1943 kernel.packet[3] = vec_mergel(step3[1], step3[9]); 1944 kernel.packet[4] = vec_mergeh(step3[2], step3[10]); 1945 kernel.packet[5] = vec_mergel(step3[2], step3[10]); 1946 kernel.packet[6] = vec_mergeh(step3[3], step3[11]); 1947 kernel.packet[7] = vec_mergel(step3[3], step3[11]); 1948 kernel.packet[8] = vec_mergeh(step3[4], step3[12]); 1949 kernel.packet[9] = vec_mergel(step3[4], step3[12]); 1950 kernel.packet[10] = vec_mergeh(step3[5], step3[13]); 1951 kernel.packet[11] = vec_mergel(step3[5], step3[13]); 1952 kernel.packet[12] = vec_mergeh(step3[6], step3[14]); 1953 kernel.packet[13] = vec_mergel(step3[6], step3[14]); 1954 kernel.packet[14] = vec_mergeh(step3[7], step3[15]); 1955 kernel.packet[15] = vec_mergel(step3[7], step3[15]); 1956 } 1957 1958 EIGEN_DEVICE_FUNC inline void 1959 ptranspose(PacketBlock<Packet16uc,16>& kernel) { 1960 Packet16uc step1[16], step2[16], step3[16]; 1961 1962 step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); 1963 step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); 1964 step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); 1965 step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); 1966 step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]); 1967 step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); 1968 step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); 1969 step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); 1970 step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]); 1971 step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); 1972 step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); 1973 step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); 1974 step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]); 1975 step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); 1976 step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); 1977 step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); 1978 1979 step2[0] = vec_mergeh(step1[0], step1[8]); 1980 step2[1] = vec_mergel(step1[0], step1[8]); 1981 step2[2] = vec_mergeh(step1[1], step1[9]); 1982 step2[3] = vec_mergel(step1[1], step1[9]); 1983 step2[4] = vec_mergeh(step1[2], step1[10]); 1984 step2[5] = vec_mergel(step1[2], step1[10]); 1985 step2[6] = vec_mergeh(step1[3], step1[11]); 1986 step2[7] = vec_mergel(step1[3], step1[11]); 1987 step2[8] = vec_mergeh(step1[4], step1[12]); 1988 step2[9] = vec_mergel(step1[4], step1[12]); 1989 step2[10] = vec_mergeh(step1[5], step1[13]); 1990 step2[11] = vec_mergel(step1[5], step1[13]); 1991 step2[12] = vec_mergeh(step1[6], step1[14]); 1992 step2[13] = vec_mergel(step1[6], step1[14]); 1993 step2[14] = vec_mergeh(step1[7], step1[15]); 1994 step2[15] = vec_mergel(step1[7], step1[15]); 1995 1996 step3[0] = vec_mergeh(step2[0], step2[8]); 1997 step3[1] = vec_mergel(step2[0], step2[8]); 1998 step3[2] = vec_mergeh(step2[1], step2[9]); 1999 step3[3] = vec_mergel(step2[1], step2[9]); 2000 step3[4] = vec_mergeh(step2[2], step2[10]); 2001 step3[5] = vec_mergel(step2[2], step2[10]); 2002 step3[6] = vec_mergeh(step2[3], step2[11]); 2003 step3[7] = vec_mergel(step2[3], step2[11]); 2004 step3[8] = vec_mergeh(step2[4], step2[12]); 2005 step3[9] = vec_mergel(step2[4], step2[12]); 2006 step3[10] = vec_mergeh(step2[5], step2[13]); 2007 step3[11] = vec_mergel(step2[5], step2[13]); 2008 step3[12] = vec_mergeh(step2[6], step2[14]); 2009 step3[13] = vec_mergel(step2[6], step2[14]); 2010 step3[14] = vec_mergeh(step2[7], step2[15]); 2011 step3[15] = vec_mergel(step2[7], step2[15]); 2012 2013 kernel.packet[0] = vec_mergeh(step3[0], step3[8]); 2014 kernel.packet[1] = vec_mergel(step3[0], step3[8]); 2015 kernel.packet[2] = vec_mergeh(step3[1], step3[9]); 2016 kernel.packet[3] = vec_mergel(step3[1], step3[9]); 2017 kernel.packet[4] = vec_mergeh(step3[2], step3[10]); 2018 kernel.packet[5] = vec_mergel(step3[2], step3[10]); 2019 kernel.packet[6] = vec_mergeh(step3[3], step3[11]); 2020 kernel.packet[7] = vec_mergel(step3[3], step3[11]); 2021 kernel.packet[8] = vec_mergeh(step3[4], step3[12]); 2022 kernel.packet[9] = vec_mergel(step3[4], step3[12]); 2023 kernel.packet[10] = vec_mergeh(step3[5], step3[13]); 2024 kernel.packet[11] = vec_mergel(step3[5], step3[13]); 2025 kernel.packet[12] = vec_mergeh(step3[6], step3[14]); 2026 kernel.packet[13] = vec_mergel(step3[6], step3[14]); 2027 kernel.packet[14] = vec_mergeh(step3[7], step3[15]); 2028 kernel.packet[15] = vec_mergel(step3[7], step3[15]); 2029 } 2030 2031 template<typename Packet> EIGEN_STRONG_INLINE 2032 Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) { 2033 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; 2034 Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); 2035 return vec_sel(elsePacket, thenPacket, mask); 2036 } 2037 2038 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { 2039 return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket); 2040 } 2041 2042 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { 2043 return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket); 2044 } 2045 2046 template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) { 2047 Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], 2048 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; 2049 Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE)); 2050 Packet8s result = vec_sel(elsePacket, thenPacket, mask); 2051 return result; 2052 } 2053 2054 template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) { 2055 Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], 2056 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; 2057 Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE)); 2058 return vec_sel(elsePacket, thenPacket, mask); 2059 } 2060 2061 template<> EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket, const Packet8bf& elsePacket) { 2062 return pblend<Packet8us>(ifPacket, thenPacket, elsePacket); 2063 } 2064 2065 template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) { 2066 Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], 2067 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], 2068 ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], 2069 ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; 2070 2071 Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE)); 2072 return vec_sel(elsePacket, thenPacket, mask); 2073 } 2074 2075 template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) { 2076 Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], 2077 ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], 2078 ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], 2079 ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; 2080 2081 Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE)); 2082 return vec_sel(elsePacket, thenPacket, mask); 2083 } 2084 2085 template <> 2086 struct type_casting_traits<float, int> { 2087 enum { 2088 VectorizedCast = 1, 2089 SrcCoeffRatio = 1, 2090 TgtCoeffRatio = 1 2091 }; 2092 }; 2093 2094 template <> 2095 struct type_casting_traits<int, float> { 2096 enum { 2097 VectorizedCast = 1, 2098 SrcCoeffRatio = 1, 2099 TgtCoeffRatio = 1 2100 }; 2101 }; 2102 2103 template <> 2104 struct type_casting_traits<bfloat16, unsigned short int> { 2105 enum { 2106 VectorizedCast = 1, 2107 SrcCoeffRatio = 1, 2108 TgtCoeffRatio = 1 2109 }; 2110 }; 2111 2112 template <> 2113 struct type_casting_traits<unsigned short int, bfloat16> { 2114 enum { 2115 VectorizedCast = 1, 2116 SrcCoeffRatio = 1, 2117 TgtCoeffRatio = 1 2118 }; 2119 }; 2120 2121 template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) { 2122 return vec_cts(a,0); 2123 } 2124 2125 template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) { 2126 return vec_ctu(a,0); 2127 } 2128 2129 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) { 2130 return vec_ctf(a,0); 2131 } 2132 2133 template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) { 2134 return vec_ctf(a,0); 2135 } 2136 2137 template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) { 2138 Packet4f float_even = Bf16ToF32Even(a); 2139 Packet4f float_odd = Bf16ToF32Odd(a); 2140 Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even); 2141 Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd); 2142 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); 2143 Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask); 2144 Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask); 2145 2146 //Check values that are bigger than USHRT_MAX (0xFFFF) 2147 Packet4bi overflow_selector; 2148 if(vec_any_gt(int_even, p4ui_low_mask)){ 2149 overflow_selector = vec_cmpgt(int_even, p4ui_low_mask); 2150 low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector); 2151 } 2152 if(vec_any_gt(int_odd, p4ui_low_mask)){ 2153 overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask); 2154 low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector); 2155 } 2156 2157 low_odd = plogical_shift_left<16>(low_odd); 2158 2159 Packet4ui int_final = por<Packet4ui>(low_even, low_odd); 2160 return reinterpret_cast<Packet8us>(int_final); 2161 } 2162 2163 template<> EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) { 2164 //short -> int -> float -> bfloat16 2165 const _EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF); 2166 Packet4ui int_cast = reinterpret_cast<Packet4ui>(a); 2167 Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask); 2168 Packet4ui int_odd = plogical_shift_right<16>(int_cast); 2169 Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even); 2170 Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd); 2171 return F32ToBf16(float_even, float_odd); 2172 } 2173 2174 2175 template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) { 2176 return reinterpret_cast<Packet4i>(a); 2177 } 2178 2179 template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) { 2180 return reinterpret_cast<Packet4f>(a); 2181 } 2182 2183 2184 2185 //---------- double ---------- 2186 #ifdef __VSX__ 2187 typedef __vector double Packet2d; 2188 typedef __vector unsigned long long Packet2ul; 2189 typedef __vector long long Packet2l; 2190 #if EIGEN_COMP_CLANG 2191 typedef Packet2ul Packet2bl; 2192 #else 2193 typedef __vector __bool long Packet2bl; 2194 #endif 2195 2196 static Packet2l p2l_ONE = { 1, 1 }; 2197 static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); 2198 static Packet2d p2d_ONE = { 1.0, 1.0 }; 2199 static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); 2200 static Packet2d p2d_MZERO = { -0.0, -0.0 }; 2201 2202 #ifdef _BIG_ENDIAN 2203 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8)); 2204 #else 2205 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8)); 2206 #endif 2207 2208 template<int index> Packet2d vec_splat_dbl(Packet2d& a); 2209 2210 template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) 2211 { 2212 return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI)); 2213 } 2214 2215 template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) 2216 { 2217 return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO)); 2218 } 2219 2220 template<> struct packet_traits<double> : default_packet_traits 2221 { 2222 typedef Packet2d type; 2223 typedef Packet2d half; 2224 enum { 2225 Vectorizable = 1, 2226 AlignedOnScalar = 1, 2227 size=2, 2228 HasHalfPacket = 1, 2229 2230 HasAdd = 1, 2231 HasSub = 1, 2232 HasMul = 1, 2233 HasDiv = 1, 2234 HasMin = 1, 2235 HasMax = 1, 2236 HasAbs = 1, 2237 HasSin = 0, 2238 HasCos = 0, 2239 HasLog = 0, 2240 HasExp = 1, 2241 HasSqrt = 1, 2242 HasRsqrt = 1, 2243 HasRound = 1, 2244 HasFloor = 1, 2245 HasCeil = 1, 2246 HasNegate = 1, 2247 HasBlend = 1 2248 }; 2249 }; 2250 2251 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; }; 2252 2253 inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) 2254 { 2255 union { 2256 Packet2l v; 2257 int64_t n[2]; 2258 } vt; 2259 vt.v = v; 2260 s << vt.n[0] << ", " << vt.n[1]; 2261 return s; 2262 } 2263 2264 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) 2265 { 2266 union { 2267 Packet2d v; 2268 double n[2]; 2269 } vt; 2270 vt.v = v; 2271 s << vt.n[0] << ", " << vt.n[1]; 2272 return s; 2273 } 2274 2275 // Need to define them first or we get specialization after instantiation errors 2276 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) 2277 { 2278 EIGEN_DEBUG_ALIGNED_LOAD 2279 return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang 2280 } 2281 2282 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) 2283 { 2284 EIGEN_DEBUG_ALIGNED_STORE 2285 vec_xst(from, 0, to); 2286 } 2287 2288 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { 2289 Packet2d v = {from, from}; 2290 return v; 2291 } 2292 2293 template<> EIGEN_STRONG_INLINE void 2294 pbroadcast4<Packet2d>(const double *a, 2295 Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) 2296 { 2297 a1 = pload<Packet2d>(a); 2298 a0 = vec_splat_dbl<0>(a1); 2299 a1 = vec_splat_dbl<1>(a1); 2300 a3 = pload<Packet2d>(a+2); 2301 a2 = vec_splat_dbl<0>(a3); 2302 a3 = vec_splat_dbl<1>(a3); 2303 } 2304 2305 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) 2306 { 2307 EIGEN_ALIGN16 double af[2]; 2308 af[0] = from[0*stride]; 2309 af[1] = from[1*stride]; 2310 return pload<Packet2d>(af); 2311 } 2312 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) 2313 { 2314 EIGEN_ALIGN16 double af[2]; 2315 pstore<double>(af, from); 2316 to[0*stride] = af[0]; 2317 to[1*stride] = af[1]; 2318 } 2319 2320 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; } 2321 2322 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; } 2323 2324 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; } 2325 2326 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } 2327 2328 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } 2329 2330 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); } 2331 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } 2332 2333 // for some weird raisons, it has to be overloaded for packet of integers 2334 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } 2335 2336 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) 2337 { 2338 // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN 2339 Packet2d ret; 2340 __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 2341 return ret; 2342 } 2343 2344 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) 2345 { 2346 // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN 2347 Packet2d ret; 2348 __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); 2349 return ret; 2350 } 2351 2352 template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); } 2353 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); } 2354 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); } 2355 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { 2356 Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b)); 2357 return vec_nor(c,c); 2358 } 2359 2360 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } 2361 2362 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } 2363 2364 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } 2365 2366 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } 2367 2368 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); } 2369 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); } 2370 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); } 2371 2372 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) 2373 { 2374 EIGEN_DEBUG_UNALIGNED_LOAD 2375 return vec_xl(0, from); 2376 } 2377 2378 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) 2379 { 2380 Packet2d p; 2381 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from); 2382 else p = ploadu<Packet2d>(from); 2383 return vec_splat_dbl<0>(p); 2384 } 2385 2386 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) 2387 { 2388 EIGEN_DEBUG_UNALIGNED_STORE 2389 vec_xst(from, 0, to); 2390 } 2391 2392 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); } 2393 2394 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; } 2395 2396 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) 2397 { 2398 return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64)); 2399 } 2400 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } 2401 2402 // VSX support varies between different compilers and even different 2403 // versions of the same compiler. For gcc version >= 4.9.3, we can use 2404 // vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use 2405 // a slow version that works with older compilers. 2406 // Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles 2407 // are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 2408 static inline Packet2l ConvertToPacket2l(const Packet2d& x) { 2409 #if EIGEN_GNUC_AT_LEAST(5, 4) || \ 2410 (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) 2411 return vec_cts(x, 0); // TODO: check clang version. 2412 #else 2413 double tmp[2]; 2414 memcpy(tmp, &x, sizeof(tmp)); 2415 Packet2l l = { static_cast<long long>(tmp[0]), 2416 static_cast<long long>(tmp[1]) }; 2417 return l; 2418 #endif 2419 } 2420 2421 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) { 2422 2423 // build 2^n 2424 Packet2l emm0 = ConvertToPacket2l(exponent); 2425 2426 #ifdef __POWER8_VECTOR__ 2427 const Packet2l p2l_1023 = { 1023, 1023 }; 2428 const Packet2ul p2ul_52 = { 52, 52 }; 2429 emm0 = vec_add(emm0, p2l_1023); 2430 emm0 = vec_sl(emm0, p2ul_52); 2431 #else 2432 // Code is a bit complex for POWER7. There is actually a 2433 // vec_xxsldi intrinsic but it is not supported by some gcc versions. 2434 // So we shift (52-32) bits and do a word swap with zeros. 2435 const Packet4i p4i_1023 = pset1<Packet4i>(1023); 2436 const Packet4i p4i_20 = pset1<Packet4i>(20); // 52 - 32 2437 2438 Packet4i emm04i = reinterpret_cast<Packet4i>(emm0); 2439 emm04i = vec_add(emm04i, p4i_1023); 2440 emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20)); 2441 static const Packet16uc perm = { 2442 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 2443 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; 2444 #ifdef _BIG_ENDIAN 2445 emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm)); 2446 #else 2447 emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm)); 2448 #endif 2449 2450 #endif 2451 2452 return pmul(a, reinterpret_cast<Packet2d>(emm0)); 2453 } 2454 2455 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) 2456 { 2457 Packet2d b, sum; 2458 b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8)); 2459 sum = a + b; 2460 return pfirst<Packet2d>(sum); 2461 } 2462 2463 // Other reduction functions: 2464 // mul 2465 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) 2466 { 2467 return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 2468 } 2469 2470 // min 2471 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) 2472 { 2473 return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 2474 } 2475 2476 // max 2477 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) 2478 { 2479 return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 2480 } 2481 2482 EIGEN_DEVICE_FUNC inline void 2483 ptranspose(PacketBlock<Packet2d,2>& kernel) { 2484 Packet2d t0, t1; 2485 t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); 2486 t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); 2487 kernel.packet[0] = t0; 2488 kernel.packet[1] = t1; 2489 } 2490 2491 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { 2492 Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; 2493 Packet2bl mask = reinterpret_cast<Packet2bl>( vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)) ); 2494 return vec_sel(elsePacket, thenPacket, mask); 2495 } 2496 2497 2498 #endif // __VSX__ 2499 } // end namespace internal 2500 2501 } // end namespace Eigen 2502 2503 #endif // EIGEN_PACKET_MATH_ALTIVEC_H 2504