1 // ppc_simd.h - written and placed in public domain by Jeffrey Walton
2 
3 /// \file ppc_simd.h
4 /// \brief Support functions for PowerPC and vector operations
/// \details This header provides an agnostic interface into Clang, GCC
///  and IBM XL C/C++ compilers modulo their different built-in functions
///  for accessing vector instructions.
/// \details The abstractions are necessary to support back to GCC 4.8 and
///  XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
///  default compiler for GCC112, GCC119 and others on the compile farm.
///  Older IBM XL C/C++ compilers also have the need due to lack of
///  <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
///  compilers provide best support and don't need many of the hacks
///  below.
15 /// \details The library is tested with the following PowerPC machines and
16 ///  compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
17 ///  the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
18 ///  - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
19 ///  - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
20 ///  - GCC110, Linux, POWER7, GCC 4.8.5
21 ///  - GCC110, Linux, POWER7, XLC 12.01
22 ///  - GCC111, AIX, POWER7, GCC 4.8.1
23 ///  - GCC111, AIX, POWER7, XLC 12.01
24 ///  - GCC112, Linux, POWER8, GCC 4.8.5
25 ///  - GCC112, Linux, POWER8, XLC 13.01
26 ///  - GCC112, Linux, POWER8, Clang 7.0
27 ///  - GCC119, AIX, POWER8, GCC 7.2.0
28 ///  - GCC119, AIX, POWER8, XLC 13.01
29 ///  - GCC135, Linux, POWER9, GCC 7.0
30 /// \details 12 machines are used for testing because the three compilers form
31 ///  five or six profiles. The profiles are listed below.
32 ///  - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
33 ///  - XLC 13.0 and earlier (all IBM components)
34 ///  - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
35 ///  - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
36 ///  - early LLVM Clang (traditional Clang compiler)
37 ///  - late LLVM Clang (traditional Clang compiler)
/// \details The LLVM front-end makes it tricky to write portable code because
///  LLVM pretends to be other compilers but cannot consume other compilers'
///  builtins. When using XLC with -qxlcompatmacros the compiler pretends to
///  be GCC, Clang and XLC all at once but it can only consume its own variety
///  of builtins.
43 /// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were
44 ///  renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was
45 ///  changed to <tt>VecAnd</tt>. The name change helped consolidate two
46 ///  slightly different implementations.
/// \details At Crypto++ 8.3 the library added select 64-bit functions for
///  32-bit Altivec. For example, <tt>VecAdd64</tt> and <tt>VecSub64</tt>
///  take 32-bit vectors and add or subtract them as if they were vectors
///  with two 64-bit elements. The functions dramatically improve performance
///  for some algorithms on some platforms, like SIMON128 and SPECK128 on
///  Power6 and earlier. For example, SPECK128 improved from 70 cpb to
///  10 cpb on an old PowerMac. Use the functions as shown below.
54 ///  <pre>
55 ///    \#if defined(_ARCH_PWR8)
56 ///    \#  define speck128_t uint64x2_p
57 ///    \#else
58 ///    \#  define speck128_t uint32x4_p
59 ///    \#endif
60 ///
61 ///    speck128_t rk, x1, x2, y1, y2;
62 ///    rk = (speck128_t)VecLoadAligned(ptr);
63 ///    x1 = VecRotateRight64<8>(x1);
64 ///    x1 = VecAdd64(x1, y1);
65 ///    ...</pre>
66 /// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0
67 
// Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting
// actual availability of the feature for the source file being compiled.
// The preprocessor macros depend on compiler options like -maltivec, and
// not on compiler versions.
72 
73 // For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html
74 // For XLC see the Compiler Reference manual. For Clang you have to experiment.
75 // Clang does not document the compiler options, does not reject options it does
76 // not understand, and pretends to be other compilers even though it cannot
77 // process the builtins and intrinsics. Clang will waste hours of your time.
78 
79 // DO NOT USE this pattern in VecLoad and VecStore. We have to use the
80 // code paths guarded by preprocessor macros because XLC 12 generates
81 // bad code in some places. To verify the bad code generation test on
82 // GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
83 //
84 //   inline uint32x4_p VecLoad(const byte src[16])
85 //   {
86 //   #if defined(__VSX__) || defined(_ARCH_PWR8)
87 //       return (uint32x4_p) *(uint8x16_p*)((byte*)src);
88 //   #else
89 //       return VecLoad_ALTIVEC(src);
90 //   #endif
91 //   }
92 
93 // We should be able to perform the load using inline asm on Power7 with
94 // VSX or Power8. The inline asm will avoid C undefined behavior due to
95 // casting from byte* to word32*. We are safe because our byte* are
96 // 16-byte aligned for Altivec. Below is the big endian load. Little
97 // endian would need to follow with xxpermdi for the reversal.
98 //
99 //   __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );
100 
101 // GCC and XLC use integer math for the address (D-form or byte-offset
102 // in the ISA manual). LLVM uses pointer math for the address (DS-form
103 // or indexed in the ISA manual). To keep them consistent we calculate
104 // the address from the offset and pass to a load or store function
105 // using a 0 offset.
106 
107 #ifndef CRYPTOPP_PPC_CRYPTO_H
108 #define CRYPTOPP_PPC_CRYPTO_H
109 
110 #include "config.h"
111 #include "misc.h"
112 
113 #if defined(__ALTIVEC__)
114 # include <altivec.h>
115 # undef vector
116 # undef pixel
117 # undef bool
118 #endif
119 
120 // XL C++ on AIX does not define VSX and does not
121 // provide an option to set it. We have to set it
122 // for the code below. This define must stay in
123 // sync with the define in test_ppc_power7.cxx.
124 #ifndef CRYPTOPP_DISABLE_POWER7
125 # if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
126 #  define __VSX__ 1
127 # endif
128 #endif
129 
130 // XL C++ on AIX does not define CRYPTO and does not
131 // provide an option to set it. We have to set it
132 // for the code below. This define must stay in
133 // sync with the define in test_ppc_power8.cxx
134 #ifndef CRYPTOPP_DISABLE_POWER8
135 # if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
136 #  define __CRYPTO__ 1
137 # endif
138 #endif
139 
/// \brief Cast a const array to a byte pointer
/// \details CONST_V8_CAST applies a C-style cast to
///  <tt>unsigned char*</tt>, which also drops a const qualifier
///  on the pointed-to data. The Power ABI says source arrays
///  are non-const, and XLC++ will fail the compile if the
///  source array is const.
#define CONST_V8_CAST(ptr)  ((unsigned char*)(ptr))
/// \brief Cast a const array to a word pointer
/// \details CONST_V32_CAST applies a C-style cast to
///  <tt>unsigned int*</tt>, which also drops a const qualifier
///  on the pointed-to data. The Power ABI says source arrays
///  are non-const, and XLC++ will fail the compile if the
///  source array is const.
#define CONST_V32_CAST(ptr) ((unsigned int*)(ptr))
/// \brief Cast a const array to a double word pointer
/// \details CONST_V64_CAST applies a C-style cast to
///  <tt>unsigned long long*</tt>, which also drops a const
///  qualifier on the pointed-to data. The Power ABI says source
///  arrays are non-const, and XLC++ will fail the compile if the
///  source array is const.
#define CONST_V64_CAST(ptr) ((unsigned long long*)(ptr))
/// \brief Cast an array to a byte pointer
/// \details NCONST_V8_CAST applies a C-style cast to
///  <tt>unsigned char*</tt>. The expansion is the same as
///  CONST_V8_CAST; the separate name is presumably meant for
///  arrays that are already non-const.
#define NCONST_V8_CAST(ptr)  ((unsigned char*)(ptr))
/// \brief Cast an array to a word pointer
/// \details NCONST_V32_CAST applies a C-style cast to
///  <tt>unsigned int*</tt>. The expansion is the same as
///  CONST_V32_CAST; the separate name is presumably meant for
///  arrays that are already non-const.
#define NCONST_V32_CAST(ptr) ((unsigned int*)(ptr))
/// \brief Cast an array to a double word pointer
/// \details NCONST_V64_CAST applies a C-style cast to
///  <tt>unsigned long long*</tt>. The expansion is the same as
///  CONST_V64_CAST; the separate name is presumably meant for
///  arrays that are already non-const.
#define NCONST_V64_CAST(ptr) ((unsigned long long*)(ptr))
176 
177 // VecLoad_ALTIVEC and VecStore_ALTIVEC are
178 // too noisy on modern compilers
179 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
180 # pragma GCC diagnostic push
181 # pragma GCC diagnostic ignored "-Wdeprecated"
182 #endif
183 
184 NAMESPACE_BEGIN(CryptoPP)
185 
186 #if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
187 
188 /// \brief Vector of 8-bit elements
189 /// \par Wraps
190 ///  __vector unsigned char
191 /// \since Crypto++ 6.0
192 typedef __vector unsigned char   uint8x16_p;
193 /// \brief Vector of 16-bit elements
194 /// \par Wraps
195 ///  __vector unsigned short
196 /// \since Crypto++ 6.0
197 typedef __vector unsigned short  uint16x8_p;
198 /// \brief Vector of 32-bit elements
199 /// \par Wraps
200 ///  __vector unsigned int
201 /// \since Crypto++ 6.0
202 typedef __vector unsigned int    uint32x4_p;
203 
#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Vector of unsigned 64-bit elements
/// \details uint64x2_p is only declared when <tt>__VSX__</tt> or
///  <tt>_ARCH_PWR8</tt> is defined, that is POWER7 with VSX and above.
///  Most supporting functions, like 64-bit <tt>vec_add</tt>
///  (<tt>vaddudm</tt>) and <tt>vec_sub</tt> (<tt>vsubudm</tt>), did not
///  arrive until POWER8.
/// \par Wraps
///  __vector unsigned long long
/// \since Crypto++ 6.0
typedef __vector unsigned long long uint64x2_p;
#endif  // __VSX__ or _ARCH_PWR8
214 
215 /// \brief The 0 vector
216 /// \return a 32-bit vector of 0's
217 /// \since Crypto++ 8.0
VecZero()218 inline uint32x4_p VecZero()
219 {
220     const uint32x4_p v = {0,0,0,0};
221     return v;
222 }
223 
224 /// \brief The 1 vector
225 /// \return a 32-bit vector of 1's
226 /// \since Crypto++ 8.0
VecOne()227 inline uint32x4_p VecOne()
228 {
229     const uint32x4_p v = {1,1,1,1};
230     return v;
231 }
232 
233 /// \brief Reverse bytes in a vector
234 /// \tparam T vector type
235 /// \param data the vector
236 /// \return vector
237 /// \details VecReverse() reverses the bytes in a vector
238 /// \par Wraps
239 ///  vec_perm
240 /// \since Crypto++ 6.0
241 template <class T>
VecReverse(const T data)242 inline T VecReverse(const T data)
243 {
244 #if defined(CRYPTOPP_BIG_ENDIAN)
245     const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
246     return (T)vec_perm(data, data, mask);
247 #else
248     const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
249     return (T)vec_perm(data, data, mask);
250 #endif
251 }
252 
/// \brief Reverse bytes in a vector on little-endian systems
/// \tparam T vector type
/// \param data the vector
/// \return the byte-reversed vector when <tt>CRYPTOPP_LITTLE_ENDIAN</tt>
///  is defined, otherwise <tt>data</tt> unchanged
/// \details VecReverseLE() permutes the 16 bytes into reverse order on
///  little-endian builds and is a pass-through everywhere else.
/// \par Wraps
///  vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseLE(const T data)
{
#if !defined(CRYPTOPP_LITTLE_ENDIAN)
    // Nothing to do on big-endian builds
    return data;
#else
    const uint8x16_p reversal = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, reversal);
#endif
}
272 
/// \brief Reverse bytes in a vector on big-endian systems
/// \tparam T vector type
/// \param data the vector
/// \return the byte-reversed vector when <tt>CRYPTOPP_BIG_ENDIAN</tt>
///  is defined, otherwise <tt>data</tt> unchanged
/// \details VecReverseBE() permutes the 16 bytes into reverse order on
///  big-endian builds and is a pass-through everywhere else.
/// \par Wraps
///  vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseBE(const T data)
{
#if !defined(CRYPTOPP_BIG_ENDIAN)
    // Nothing to do on little-endian builds
    return data;
#else
    const uint8x16_p reversal = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, reversal);
#endif
}
292 
293 /// \name LOAD OPERATIONS
294 //@{
295 
296 /// \brief Loads a vector from a byte array
297 /// \param src the byte array
298 /// \details Loads a vector in native endian format from a byte array.
299 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
300 ///  of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
301 ///  <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
302 ///  <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so
303 ///  you should provide aligned memory adresses.
304 /// \par Wraps
305 ///  vec_ld, vec_lvsl, vec_perm
306 /// \sa VecLoad, VecLoadAligned
307 /// \since Crypto++ 6.0
VecLoad_ALTIVEC(const byte src[16])308 inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
309 {
310     // Avoid IsAlignedOn for convenience.
311     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
312     if (addr % 16 == 0)
313     {
314         return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
315     }
316     else
317     {
318         // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
319         const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
320         const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
321         const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
322         return (uint32x4_p)vec_perm(low, high, perm);
323     }
324 }
325 
326 /// \brief Loads a vector from a byte array
327 /// \param src the byte array
328 /// \param off offset into the src byte array
329 /// \details Loads a vector in native endian format from a byte array.
330 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
331 ///  of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
332 ///  <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
333 /// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
334 ///  relatively expensive so you should provide aligned memory adresses.
335 /// \par Wraps
336 ///  vec_ld, vec_lvsl, vec_perm
337 /// \sa VecLoad, VecLoadAligned
338 /// \since Crypto++ 6.0
VecLoad_ALTIVEC(int off,const byte src[16])339 inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
340 {
341     // Avoid IsAlignedOn for convenience.
342     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
343     if (addr % 16 == 0)
344     {
345         return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
346     }
347     else
348     {
349         // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
350         const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
351         const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
352         const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
353         return (uint32x4_p)vec_perm(low, high, perm);
354     }
355 }
356 
357 /// \brief Loads a vector from a byte array
358 /// \param src the byte array
359 /// \details VecLoad() loads a vector from a byte array.
360 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
361 ///  The instruction does not require aligned effective memory addresses.
362 ///  VecLoad_ALTIVEC() is used if POWER9 is not available.
363 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
364 ///  are required to fix up unaligned memory addresses.
365 /// \par Wraps
366 ///  vec_xl on POWER9 and above, Altivec load on POWER8 and below
367 /// \sa VecLoad_ALTIVEC, VecLoadAligned
368 /// \since Crypto++ 6.0
VecLoad(const byte src[16])369 inline uint32x4_p VecLoad(const byte src[16])
370 {
371     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
372     // word pointers. The ISA lacks loads for short* and char*.
373     // Power9/ISA 3.0 provides vec_xl for all datatypes.
374 
375     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
376     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
377     CRYPTOPP_UNUSED(addr);
378 
379 #if defined(_ARCH_PWR9)
380     return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
381 #else
382     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
383 #endif
384 }
385 
386 /// \brief Loads a vector from a byte array
387 /// \param src the byte array
388 /// \param off offset into the src byte array
389 /// \details VecLoad() loads a vector from a byte array.
390 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
391 ///  The instruction does not require aligned effective memory addresses.
392 ///  VecLoad_ALTIVEC() is used if POWER9 is not available.
393 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
394 ///  are required to fix up unaligned memory addresses.
395 /// \par Wraps
396 ///  vec_xl on POWER9 and above, Altivec load on POWER8 and below
397 /// \sa VecLoad_ALTIVEC, VecLoadAligned
398 /// \since Crypto++ 6.0
VecLoad(int off,const byte src[16])399 inline uint32x4_p VecLoad(int off, const byte src[16])
400 {
401     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
402     // word pointers. The ISA lacks loads for short* and char*.
403     // Power9/ISA 3.0 provides vec_xl for all datatypes.
404 
405     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
406     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
407     CRYPTOPP_UNUSED(addr);
408 
409 #if defined(_ARCH_PWR9)
410     return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
411 #else
412     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
413 #endif
414 }
415 
416 /// \brief Loads a vector from a word array
417 /// \param src the word array
418 /// \details VecLoad() loads a vector from a word array.
419 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
420 ///  The instruction does not require aligned effective memory addresses.
421 ///  VecLoad_ALTIVEC() is used if POWER7 is not available.
422 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
423 ///  are required to fix up unaligned memory addresses.
424 /// \par Wraps
425 ///  vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
426 /// \sa VecLoad_ALTIVEC, VecLoadAligned
427 /// \since Crypto++ 8.0
VecLoad(const word32 src[4])428 inline uint32x4_p VecLoad(const word32 src[4])
429 {
430     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
431     // word pointers. The ISA lacks loads for short* and char*.
432     // Power9/ISA 3.0 provides vec_xl for all datatypes.
433 
434     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
435     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
436     CRYPTOPP_UNUSED(addr);
437 
438 #if defined(_ARCH_PWR9)
439     return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
440 #elif defined(__VSX__) || defined(_ARCH_PWR8)
441     return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
442 #else
443     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
444 #endif
445 }
446 
447 /// \brief Loads a vector from a word array
448 /// \param src the word array
449 /// \param off offset into the word array
450 /// \details VecLoad() loads a vector from a word array.
451 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
452 ///  The instruction does not require aligned effective memory addresses.
453 ///  VecLoad_ALTIVEC() is used if POWER7 is not available.
454 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
455 ///  are required to fix up unaligned memory addresses.
456 /// \par Wraps
457 ///  vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
458 /// \sa VecLoad_ALTIVEC, VecLoadAligned
459 /// \since Crypto++ 8.0
VecLoad(int off,const word32 src[4])460 inline uint32x4_p VecLoad(int off, const word32 src[4])
461 {
462     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
463     // word pointers. The ISA lacks loads for short* and char*.
464     // Power9/ISA 3.0 provides vec_xl for all datatypes.
465 
466     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
467     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
468     CRYPTOPP_UNUSED(addr);
469 
470 #if defined(_ARCH_PWR9)
471     return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
472 #elif defined(__VSX__) || defined(_ARCH_PWR8)
473     return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
474 #else
475     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
476 #endif
477 }
478 
479 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
480 
481 /// \brief Loads a vector from a double word array
482 /// \param src the double word array
483 /// \details VecLoad() loads a vector from a double word array.
484 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
485 ///  The instruction does not require aligned effective memory addresses.
486 ///  VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
487 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
488 ///  are required to fix up unaligned memory addresses.
489 /// \details VecLoad() with 64-bit elements is available on POWER7 and above.
490 /// \par Wraps
491 ///  vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
492 /// \sa VecLoad_ALTIVEC, VecLoadAligned
493 /// \since Crypto++ 8.0
VecLoad(const word64 src[2])494 inline uint64x2_p VecLoad(const word64 src[2])
495 {
496     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
497     // word pointers. The ISA lacks loads for short* and char*.
498     // Power9/ISA 3.0 provides vec_xl for all datatypes.
499 
500     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
501     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
502     CRYPTOPP_UNUSED(addr);
503 
504 #if defined(_ARCH_PWR9)
505     return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
506 #elif defined(__VSX__) || defined(_ARCH_PWR8)
507     // The 32-bit cast is not a typo. Compiler workaround.
508     return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
509 #else
510     return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
511 #endif
512 }
513 
514 /// \brief Loads a vector from a double word array
515 /// \param src the double word array
516 /// \param off offset into the double word array
517 /// \details VecLoad() loads a vector from a double word array.
518 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
519 ///  The instruction does not require aligned effective memory addresses.
520 ///  VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
521 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
522 ///  are required to fix up unaligned memory addresses.
523 /// \details VecLoad() with 64-bit elements is available on POWER8 and above.
524 /// \par Wraps
525 ///  vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
526 /// \sa VecLoad_ALTIVEC, VecLoadAligned
527 /// \since Crypto++ 8.0
VecLoad(int off,const word64 src[2])528 inline uint64x2_p VecLoad(int off, const word64 src[2])
529 {
530     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
531     // word pointers. The ISA lacks loads for short* and char*.
532     // Power9/ISA 3.0 provides vec_xl for all datatypes.
533 
534     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
535     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
536     CRYPTOPP_UNUSED(addr);
537 
538 #if defined(_ARCH_PWR9)
539     return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
540 #elif defined(__VSX__) || defined(_ARCH_PWR8)
541     // The 32-bit cast is not a typo. Compiler workaround.
542     return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
543 #else
544     return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
545 #endif
546 }
547 
548 #endif  // VSX or ARCH_PWR8
549 
550 /// \brief Loads a vector from an aligned byte array
551 /// \param src the byte array
552 /// \details VecLoadAligned() loads a vector from an aligned byte array.
553 /// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
554 ///  <tt>vec_ld</tt> is used if POWER9 is not available. The effective
555 ///  address of <tt>src</tt> must be 16-byte aligned for Altivec.
556 /// \par Wraps
557 ///  vec_xl on POWER9, vec_ld on POWER8 and below
558 /// \sa VecLoad_ALTIVEC, VecLoad
559 /// \since Crypto++ 8.0
VecLoadAligned(const byte src[16])560 inline uint32x4_p VecLoadAligned(const byte src[16])
561 {
562     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
563     // word pointers. The ISA lacks loads for short* and char*.
564     // Power9/ISA 3.0 provides vec_xl for all datatypes.
565 
566     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
567     CRYPTOPP_ASSERT(addr % 16 == 0);
568     CRYPTOPP_UNUSED(addr);
569 
570 #if defined(_ARCH_PWR9)
571     return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
572 #else
573     return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
574 #endif
575 }
576 
577 /// \brief Loads a vector from an aligned byte array
578 /// \param src the byte array
579 /// \param off offset into the src byte array
580 /// \details VecLoadAligned() loads a vector from an aligned byte array.
581 /// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
582 ///  <tt>vec_ld</tt> is used if POWER9 is not available. The effective
583 ///  address of <tt>src</tt> must be 16-byte aligned for Altivec.
584 /// \par Wraps
585 ///  vec_xl on POWER9, vec_ld on POWER8 and below
586 /// \sa VecLoad_ALTIVEC, VecLoad
587 /// \since Crypto++ 8.0
VecLoadAligned(int off,const byte src[16])588 inline uint32x4_p VecLoadAligned(int off, const byte src[16])
589 {
590     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
591     // word pointers. The ISA lacks loads for short* and char*.
592     // Power9/ISA 3.0 provides vec_xl for all datatypes.
593 
594     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
595     CRYPTOPP_ASSERT(addr % 16 == 0);
596     CRYPTOPP_UNUSED(addr);
597 
598 #if defined(_ARCH_PWR9)
599     return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
600 #else
601     return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
602 #endif
603 }
604 
605 /// \brief Loads a vector from an aligned word array
606 /// \param src the word array
607 /// \details VecLoadAligned() loads a vector from an aligned word array.
608 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
609 ///  available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
610 ///  The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
611 /// \par Wraps
612 ///  vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
613 /// \sa VecLoad_ALTIVEC, VecLoad
614 /// \since Crypto++ 8.0
VecLoadAligned(const word32 src[4])615 inline uint32x4_p VecLoadAligned(const word32 src[4])
616 {
617     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
618     // word pointers. The ISA lacks loads for short* and char*.
619     // Power9/ISA 3.0 provides vec_xl for all datatypes.
620 
621     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
622     CRYPTOPP_ASSERT(addr % 16 == 0);
623     CRYPTOPP_UNUSED(addr);
624 
625 #if defined(_ARCH_PWR9)
626     return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
627 #elif defined(__VSX__) || defined(_ARCH_PWR8)
628     return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
629 #else
630     return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
631 #endif
632 }
633 
634 /// \brief Loads a vector from an aligned word array
635 /// \param src the word array
636 /// \param off offset into the src word array
637 /// \details VecLoadAligned() loads a vector from an aligned word array.
638 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
639 ///  available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
640 ///  The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
641 /// \par Wraps
642 ///  vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
643 /// \sa VecLoad_ALTIVEC, VecLoad
644 /// \since Crypto++ 8.0
VecLoadAligned(int off,const word32 src[4])645 inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
646 {
647     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
648     // word pointers. The ISA lacks loads for short* and char*.
649     // Power9/ISA 3.0 provides vec_xl for all datatypes.
650 
651     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
652     CRYPTOPP_ASSERT(addr % 16 == 0);
653     CRYPTOPP_UNUSED(addr);
654 
655 #if defined(_ARCH_PWR9)
656     return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
657 #elif defined(__VSX__) || defined(_ARCH_PWR8)
658     return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
659 #else
660     return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
661 #endif
662 }
663 
664 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
665 
666 /// \brief Loads a vector from an aligned double word array
667 /// \param src the double word array
668 /// \details VecLoadAligned() loads a vector from an aligned double word array.
669 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
670 ///  available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
671 ///  The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
672 /// \par Wraps
673 ///  vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
674 /// \sa VecLoad_ALTIVEC, VecLoad
675 /// \since Crypto++ 8.0
VecLoadAligned(const word64 src[4])676 inline uint64x2_p VecLoadAligned(const word64 src[4])
677 {
678     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
679     // word pointers. The ISA lacks loads for short* and char*.
680     // Power9/ISA 3.0 provides vec_xl for all datatypes.
681 
682     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
683     CRYPTOPP_ASSERT(addr % 16 == 0);
684     CRYPTOPP_UNUSED(addr);
685 
686 #if defined(_ARCH_PWR9)
687     return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
688 #elif defined(__VSX__) || defined(_ARCH_PWR8)
689     // The 32-bit cast is not a typo. Compiler workaround.
690     return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
691 #else
692     return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
693 #endif
694 }
695 
696 /// \brief Loads a vector from an aligned double word array
697 /// \param src the double word array
698 /// \param off offset into the src double word array
699 /// \details VecLoadAligned() loads a vector from an aligned double word array.
700 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
701 ///  available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
702 ///  The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
703 /// \par Wraps
704 ///  vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
705 /// \sa VecLoad_ALTIVEC, VecLoad
706 /// \since Crypto++ 8.0
VecLoadAligned(int off,const word64 src[4])707 inline uint64x2_p VecLoadAligned(int off, const word64 src[4])
708 {
709     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
710     // word pointers. The ISA lacks loads for short* and char*.
711     // Power9/ISA 3.0 provides vec_xl for all datatypes.
712 
713     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
714     CRYPTOPP_ASSERT(addr % 16 == 0);
715     CRYPTOPP_UNUSED(addr);
716 
717 #if defined(_ARCH_PWR9)
718     return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
719 #elif defined(__VSX__) || defined(_ARCH_PWR8)
720     // The 32-bit cast is not a typo. Compiler workaround.
721     return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
722 #else
723     return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
724 #endif
725 }
726 
727 #endif
728 
729 /// \brief Loads a vector from a byte array
730 /// \param src the byte array
731 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
732 ///  will reverse all bytes in the array on a little endian system.
733 /// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
734 ///  The instruction does not require aligned effective memory addresses.
735 ///  VecLoad_ALTIVEC() is used if POWER7 or VSX are not available.
736 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
737 ///  are required to fix up unaligned memory addresses.
738 /// \par Wraps
739 ///  vec_xl on POWER8, Altivec load on POWER7 and below
740 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
741 /// \since Crypto++ 6.0
VecLoadBE(const byte src[16])742 inline uint32x4_p VecLoadBE(const byte src[16])
743 {
744     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
745     // word pointers. The ISA lacks loads for short* and char*.
746     // Power9/ISA 3.0 provides vec_xl for all datatypes.
747 
748     const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
749     // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
750     CRYPTOPP_UNUSED(addr);
751 
752 #if defined(_ARCH_PWR9)
753     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
754     return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
755 #elif defined(CRYPTOPP_BIG_ENDIAN)
756     return (uint32x4_p)VecLoad_ALTIVEC(0, CONST_V8_CAST(src));
757 #else
758     return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(src)));
759 #endif
760 }
761 
762 /// \brief Loads a vector from a byte array
763 /// \param src the byte array
764 /// \param off offset into the src byte array
765 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
766 ///  will reverse all bytes in the array on a little endian system.
767 /// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
768 ///  The instruction does not require aligned effective memory addresses.
769 ///  VecLoad_ALTIVEC() is used if POWER7 is not available.
770 ///  VecLoad_ALTIVEC() can be relatively expensive if extra instructions
771 ///  are required to fix up unaligned memory addresses.
772 /// \par Wraps
773 ///  vec_xl on POWER8, Altivec load on POWER7 and below
774 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
775 /// \since Crypto++ 6.0
VecLoadBE(int off,const byte src[16])776 inline uint32x4_p VecLoadBE(int off, const byte src[16])
777 {
778     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
779     // word pointers. The ISA lacks loads for short* and char*.
780     // Power9/ISA 3.0 provides vec_xl for all datatypes.
781 
782     const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
783     // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
784     CRYPTOPP_UNUSED(addr);
785 
786 #if defined(_ARCH_PWR9)
787     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
788     return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
789 #elif defined(CRYPTOPP_BIG_ENDIAN)
790     return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
791 #else
792     return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(addr)));
793 #endif
794 }
795 
796 //@}
797 
798 /// \name STORE OPERATIONS
799 //@{
800 
801 /// \brief Stores a vector to a byte array
802 /// \tparam T vector type
803 /// \param data the vector
804 /// \param dest the byte array
805 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
806 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
807 ///  of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
808 ///  <tt>vec_ste</tt> is relatively expensive so you should provide aligned
809 ///  memory adresses.
810 /// \details VecStore_ALTIVEC() is used when POWER7 or above
811 ///  and unaligned loads is not available.
812 /// \par Wraps
813 ///  vec_st, vec_ste, vec_lvsr, vec_perm
814 /// \sa VecStore, VecStoreAligned
815 /// \since Crypto++ 8.0
816 template<class T>
VecStore_ALTIVEC(const T data,byte dest[16])817 inline void VecStore_ALTIVEC(const T data, byte dest[16])
818 {
819     // Avoid IsAlignedOn for convenience.
820     uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
821     if (addr % 16 == 0)
822     {
823         vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
824     }
825     else
826     {
827         // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
828         uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
829         vec_ste((uint8x16_p) perm,  0, (unsigned char*) NCONST_V8_CAST(addr));
830         vec_ste((uint16x8_p) perm,  1, (unsigned short*)NCONST_V8_CAST(addr));
831         vec_ste((uint32x4_p) perm,  3, (unsigned int*)  NCONST_V8_CAST(addr));
832         vec_ste((uint32x4_p) perm,  4, (unsigned int*)  NCONST_V8_CAST(addr));
833         vec_ste((uint32x4_p) perm,  8, (unsigned int*)  NCONST_V8_CAST(addr));
834         vec_ste((uint32x4_p) perm, 12, (unsigned int*)  NCONST_V8_CAST(addr));
835         vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
836         vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
837     }
838 }
839 
840 /// \brief Stores a vector to a byte array
841 /// \tparam T vector type
842 /// \param data the vector
843 /// \param off offset into the dest byte array
844 /// \param dest the byte array
845 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
846 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
847 ///  of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
848 ///  <tt>vec_ste</tt> is relatively expensive so you should provide aligned
849 ///  memory adresses.
850 /// \details VecStore_ALTIVEC() is used when POWER7 or above
851 ///  and unaligned loads is not available.
852 /// \par Wraps
853 ///  vec_st, vec_ste, vec_lvsr, vec_perm
854 /// \sa VecStore, VecStoreAligned
855 /// \since Crypto++ 8.0
856 template<class T>
VecStore_ALTIVEC(const T data,int off,byte dest[16])857 inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
858 {
859     // Avoid IsAlignedOn for convenience.
860     uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
861     if (addr % 16 == 0)
862     {
863         vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
864     }
865     else
866     {
867         // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
868         uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
869         vec_ste((uint8x16_p) perm,  0, (unsigned char*) NCONST_V8_CAST(addr));
870         vec_ste((uint16x8_p) perm,  1, (unsigned short*)NCONST_V8_CAST(addr));
871         vec_ste((uint32x4_p) perm,  3, (unsigned int*)  NCONST_V8_CAST(addr));
872         vec_ste((uint32x4_p) perm,  4, (unsigned int*)  NCONST_V8_CAST(addr));
873         vec_ste((uint32x4_p) perm,  8, (unsigned int*)  NCONST_V8_CAST(addr));
874         vec_ste((uint32x4_p) perm, 12, (unsigned int*)  NCONST_V8_CAST(addr));
875         vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
876         vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
877     }
878 }
879 
880 /// \brief Stores a vector to a byte array
881 /// \tparam T vector type
882 /// \param data the vector
883 /// \param dest the byte array
884 /// \details VecStore() stores a vector to a byte array.
885 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
886 ///  The instruction does not require aligned effective memory addresses.
887 ///  VecStore_ALTIVEC() is used if POWER9 is not available.
888 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
889 ///  are required to fix up unaligned memory addresses.
890 /// \par Wraps
891 ///  vec_xst on POWER9 and above, Altivec store on POWER8 and below
892 /// \sa VecStore_ALTIVEC, VecStoreAligned
893 /// \since Crypto++ 6.0
894 template<class T>
VecStore(const T data,byte dest[16])895 inline void VecStore(const T data, byte dest[16])
896 {
897     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
898     // word pointers. The ISA lacks loads for short* and char*.
899     // Power9/ISA 3.0 provides vec_xl for all datatypes.
900 
901     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
902     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
903     CRYPTOPP_UNUSED(addr);
904 
905 #if defined(_ARCH_PWR9)
906     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
907 #else
908     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(dest));
909 #endif
910 }
911 
912 /// \brief Stores a vector to a byte array
913 /// \tparam T vector type
914 /// \param data the vector
915 /// \param off offset into the dest byte array
916 /// \param dest the byte array
917 /// \details VecStore() stores a vector to a byte array.
918 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
919 ///  The instruction does not require aligned effective memory addresses.
920 ///  VecStore_ALTIVEC() is used if POWER9 is not available.
921 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
922 ///  are required to fix up unaligned memory addresses.
923 /// \par Wraps
924 ///  vec_xst on POWER9 and above, Altivec store on POWER8 and below
925 /// \sa VecStore_ALTIVEC, VecStoreAligned
926 /// \since Crypto++ 6.0
927 template<class T>
VecStore(const T data,int off,byte dest[16])928 inline void VecStore(const T data, int off, byte dest[16])
929 {
930     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
931     // word pointers. The ISA lacks loads for short* and char*.
932     // Power9/ISA 3.0 provides vec_xl for all datatypes.
933 
934     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
935     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
936     CRYPTOPP_UNUSED(addr);
937 
938 #if defined(_ARCH_PWR9)
939     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
940 #else
941     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
942 #endif
943 }
944 
945 /// \brief Stores a vector to a word array
946 /// \tparam T vector type
947 /// \param data the vector
948 /// \param dest the word array
949 /// \details VecStore() stores a vector to a word array.
950 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
951 ///  The instruction does not require aligned effective memory addresses.
952 ///  VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
953 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
954 ///  are required to fix up unaligned memory addresses.
955 /// \par Wraps
956 ///  vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
957 /// \sa VecStore_ALTIVEC, VecStoreAligned
958 /// \since Crypto++ 8.0
959 template<class T>
VecStore(const T data,word32 dest[4])960 inline void VecStore(const T data, word32 dest[4])
961 {
962     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
963     // word pointers. The ISA lacks stores for short* and char*.
964     // Power9/ISA 3.0 provides vec_xst for all datatypes.
965 
966     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
967     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
968     CRYPTOPP_UNUSED(addr);
969 
970 #if defined(_ARCH_PWR9)
971     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
972 #elif defined(__VSX__) || defined(_ARCH_PWR8)
973     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
974 #else
975     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
976 #endif
977 }
978 
979 /// \brief Stores a vector to a word array
980 /// \tparam T vector type
981 /// \param data the vector
982 /// \param off offset into the dest word array
983 /// \param dest the word array
984 /// \details VecStore() stores a vector to a word array.
985 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
986 ///  The instruction does not require aligned effective memory addresses.
987 ///  VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
988 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
989 ///  are required to fix up unaligned memory addresses.
990 /// \par Wraps
991 ///  vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
992 /// \sa VecStore_ALTIVEC, VecStoreAligned
993 /// \since Crypto++ 8.0
994 template<class T>
VecStore(const T data,int off,word32 dest[4])995 inline void VecStore(const T data, int off, word32 dest[4])
996 {
997     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
998     // word pointers. The ISA lacks stores for short* and char*.
999     // Power9/ISA 3.0 provides vec_xst for all datatypes.
1000 
1001     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1002     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1003     CRYPTOPP_UNUSED(addr);
1004 
1005 #if defined(_ARCH_PWR9)
1006     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1007 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1008     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1009 #else
1010     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1011 #endif
1012 }
1013 
1014 /// \brief Stores a vector to a word array
1015 /// \tparam T vector type
1016 /// \param data the vector
1017 /// \param dest the word array
1018 /// \details VecStore() stores a vector to a word array.
1019 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1020 ///  The instruction does not require aligned effective memory addresses.
1021 ///  VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1022 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
1023 ///  are required to fix up unaligned memory addresses.
1024 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
1025 /// \par Wraps
1026 ///  vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1027 /// \sa VecStore_ALTIVEC, VecStoreAligned
1028 /// \since Crypto++ 8.0
1029 template<class T>
VecStore(const T data,word64 dest[2])1030 inline void VecStore(const T data, word64 dest[2])
1031 {
1032     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1033     // word pointers. The ISA lacks stores for short* and char*.
1034     // Power9/ISA 3.0 provides vec_xst for all datatypes.
1035 
1036     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1037     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1038     CRYPTOPP_UNUSED(addr);
1039 
1040 #if defined(_ARCH_PWR9)
1041     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1042 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1043     // 32-bit cast is not a typo. Compiler workaround.
1044     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1045 #else
1046     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1047 #endif
1048 }
1049 
1050 /// \brief Stores a vector to a word array
1051 /// \tparam T vector type
1052 /// \param data the vector
1053 /// \param off offset into the dest word array
1054 /// \param dest the word array
1055 /// \details VecStore() stores a vector to a word array.
1056 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1057 ///  The instruction does not require aligned effective memory addresses.
1058 ///  VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1059 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
1060 ///  are required to fix up unaligned memory addresses.
1061 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
1062 /// \par Wraps
1063 ///  vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1064 /// \sa VecStore_ALTIVEC, VecStoreAligned
1065 /// \since Crypto++ 8.0
1066 template<class T>
VecStore(const T data,int off,word64 dest[2])1067 inline void VecStore(const T data, int off, word64 dest[2])
1068 {
1069     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1070     // word pointers. The ISA lacks stores for short* and char*.
1071     // Power9/ISA 3.0 provides vec_xst for all datatypes.
1072 
1073     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1074     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1075     CRYPTOPP_UNUSED(addr);
1076 
1077 #if defined(_ARCH_PWR9)
1078     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1079 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1080     // 32-bit cast is not a typo. Compiler workaround.
1081     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1082 #else
1083     VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1084 #endif
1085 }
1086 
1087 /// \brief Stores a vector to a byte array
1088 /// \tparam T vector type
1089 /// \param data the vector
1090 /// \param dest the byte array
1091 /// \details VecStoreAligned() stores a vector from an aligned byte array.
1092 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1093 ///  <tt>vec_st</tt> is used if POWER9 is not available. The effective
1094 ///  address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1095 /// \par Wraps
1096 ///  vec_xst on POWER9 or above, vec_st on POWER8 and below
1097 /// \sa VecStore_ALTIVEC, VecStore
1098 /// \since Crypto++ 8.0
1099 template<class T>
VecStoreAligned(const T data,byte dest[16])1100 inline void VecStoreAligned(const T data, byte dest[16])
1101 {
1102     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1103     // word pointers. The ISA lacks loads for short* and char*.
1104     // Power9/ISA 3.0 provides vec_xl for all datatypes.
1105 
1106     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1107     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1108     CRYPTOPP_UNUSED(addr);
1109 
1110 #if defined(_ARCH_PWR9)
1111     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1112 #else
1113     vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1114 #endif
1115 }
1116 
1117 /// \brief Stores a vector to a byte array
1118 /// \tparam T vector type
1119 /// \param data the vector
1120 /// \param off offset into the dest byte array
1121 /// \param dest the byte array
1122 /// \details VecStoreAligned() stores a vector from an aligned byte array.
1123 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1124 ///  <tt>vec_st</tt> is used if POWER9 is not available. The effective
1125 ///  address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1126 /// \par Wraps
1127 ///  vec_xst on POWER9 or above, vec_st on POWER8 and below
1128 /// \sa VecStore_ALTIVEC, VecStore
1129 /// \since Crypto++ 8.0
1130 template<class T>
VecStoreAligned(const T data,int off,byte dest[16])1131 inline void VecStoreAligned(const T data, int off, byte dest[16])
1132 {
1133     // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1134     // word pointers. The ISA lacks loads for short* and char*.
1135     // Power9/ISA 3.0 provides vec_xl for all datatypes.
1136 
1137     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1138     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1139     CRYPTOPP_UNUSED(addr);
1140 
1141 #if defined(_ARCH_PWR9)
1142     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1143 #else
1144     vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1145 #endif
1146 }
1147 
1148 /// \brief Stores a vector to a word array
1149 /// \tparam T vector type
1150 /// \param data the vector
1151 /// \param dest the word array
1152 /// \details VecStoreAligned() stores a vector from an aligned word array.
1153 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1154 ///  POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1155 ///  is used if POWER7 is not available. The effective address of <tt>dest</tt>
1156 ///  must be 16-byte aligned for Altivec.
1157 /// \par Wraps
1158 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1159 /// \sa VecStore_ALTIVEC, VecStore
1160 /// \since Crypto++ 8.0
1161 template<class T>
VecStoreAligned(const T data,word32 dest[4])1162 inline void VecStoreAligned(const T data, word32 dest[4])
1163 {
1164     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1165     // word pointers. The ISA lacks stores for short* and char*.
1166     // Power9/ISA 3.0 provides vec_xst for all datatypes.
1167 
1168     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1169     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1170     CRYPTOPP_UNUSED(addr);
1171 
1172 #if defined(_ARCH_PWR9)
1173     vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1174 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1175     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1176 #else
1177     vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1178 #endif
1179 }
1180 
1181 /// \brief Stores a vector to a word array
1182 /// \tparam T vector type
1183 /// \param data the vector
1184 /// \param off offset into the dest word array
1185 /// \param dest the word array
1186 /// \details VecStoreAligned() stores a vector from an aligned word array.
1187 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1188 ///  POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1189 ///  is used if POWER7 is not available. The effective address of <tt>dest</tt>
1190 ///  must be 16-byte aligned for Altivec.
1191 /// \par Wraps
1192 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1193 /// \sa VecStore_ALTIVEC, VecStore
1194 /// \since Crypto++ 8.0
1195 template<class T>
VecStoreAligned(const T data,int off,word32 dest[4])1196 inline void VecStoreAligned(const T data, int off, word32 dest[4])
1197 {
1198     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1199     // word pointers. The ISA lacks stores for short* and char*.
1200     // Power9/ISA 3.0 provides vec_xst for all datatypes.
1201 
1202     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1203     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1204     CRYPTOPP_UNUSED(addr);
1205 
1206 #if defined(_ARCH_PWR9)
1207     vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1208 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1209     vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1210 #else
1211     vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1212 #endif
1213 }
1214 
1215 /// \brief Stores a vector to a byte array
1216 /// \tparam T vector type
1217 /// \param data the vector
1218 /// \param dest the byte array
1219 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1220 ///  will reverse all bytes in the array on a little endian system.
1221 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1222 ///  The instruction does not require aligned effective memory addresses.
1223 ///  VecStore_ALTIVEC() is used if POWER7 is not available.
1224 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
1225 ///  are required to fix up unaligned memory addresses.
1226 /// \par Wraps
1227 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1228 /// \sa VecStore_ALTIVEC, VecStoreAligned
1229 /// \since Crypto++ 6.0
1230 template <class T>
VecStoreBE(const T data,byte dest[16])1231 inline void VecStoreBE(const T data, byte dest[16])
1232 {
1233     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1234     // word pointers. The ISA lacks stores for short* and char*.
1235     // Power9/ISA 3.0 provides vec_xst for all datatypes.
1236 
1237     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1238     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1239     CRYPTOPP_UNUSED(addr);
1240 
1241 #if defined(_ARCH_PWR9)
1242     vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1243 #elif defined(CRYPTOPP_BIG_ENDIAN)
1244     VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1245 #else
1246     VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1247 #endif
1248 }
1249 
1250 /// \brief Stores a vector to a byte array
1251 /// \tparam T vector type
1252 /// \param data the vector
1253 /// \param off offset into the dest byte array
1254 /// \param dest the byte array
1255 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1256 ///  will reverse all bytes in the array on a little endian system.
1257 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1258 ///  The instruction does not require aligned effective memory addresses.
1259 ///  VecStore_ALTIVEC() is used if POWER7 is not available.
1260 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
1261 ///  are required to fix up unaligned memory addresses.
1262 /// \par Wraps
1263 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1264 /// \sa VecStore_ALTIVEC, VecStoreAligned
1265 /// \since Crypto++ 6.0
1266 template <class T>
VecStoreBE(const T data,int off,byte dest[16])1267 inline void VecStoreBE(const T data, int off, byte dest[16])
1268 {
1269     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1270     // word pointers. The ISA lacks stores for short* and char*.
1271     // Power9/ISA 3.0 provides vec_xst for all datatypes.
1272 
1273     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1274     CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1275     CRYPTOPP_UNUSED(addr);
1276 
1277 #if defined(_ARCH_PWR9)
1278     vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1279 #elif defined(CRYPTOPP_BIG_ENDIAN)
1280     VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1281 #else
1282     VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1283 #endif
1284 }
1285 
1286 /// \brief Stores a vector to a word array
1287 /// \tparam T vector type
1288 /// \param data the vector
1289 /// \param dest the word array
1290 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1291 ///  will reverse all bytes in the array on a little endian system.
1292 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1293 ///  The instruction does not require aligned effective memory addresses.
1294 ///  VecStore_ALTIVEC() is used if POWER7 is not available.
1295 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
1296 ///  are required to fix up unaligned memory addresses.
1297 /// \par Wraps
1298 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1299 /// \sa VecStore_ALTIVEC, VecStoreAligned
1300 /// \since Crypto++ 8.0
1301 template <class T>
VecStoreBE(const T data,word32 dest[4])1302 inline void VecStoreBE(const T data, word32 dest[4])
1303 {
1304     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1305     // word pointers. The ISA lacks stores for short* and char*.
1306     // Power9/ISA 3.0 provides vec_xst for all datatypes.
1307 
1308     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1309     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1310     CRYPTOPP_UNUSED(addr);
1311 
1312 #if defined(_ARCH_PWR9)
1313     vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1314 #elif defined(CRYPTOPP_BIG_ENDIAN)
1315     VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1316 #else
1317     VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1318 #endif
1319 }
1320 
1321 /// \brief Stores a vector to a word array
1322 /// \tparam T vector type
1323 /// \param data the vector
1324 /// \param off offset into the dest word array
1325 /// \param dest the word array
1326 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1327 ///  will reverse all words in the array on a little endian system.
1328 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1329 ///  The instruction does not require aligned effective memory addresses.
1330 ///  VecStore_ALTIVEC() is used if POWER7 is not available.
1331 ///  VecStore_ALTIVEC() can be relatively expensive if extra instructions
1332 ///  are required to fix up unaligned memory addresses.
1333 /// \par Wraps
1334 ///  vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1335 /// \sa VecStore_ALTIVEC, VecStoreAligned
1336 /// \since Crypto++ 8.0
1337 template <class T>
VecStoreBE(const T data,int off,word32 dest[4])1338 inline void VecStoreBE(const T data, int off, word32 dest[4])
1339 {
1340     // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1341     // word pointers. The ISA lacks stores for short* and char*.
1342     // Power9/ISA 3.0 provides vec_xst for all datatypes.
1343 
1344     const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1345     CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1346     CRYPTOPP_UNUSED(addr);
1347 
1348 #if defined(_ARCH_PWR9)
1349     vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1350 #elif defined(CRYPTOPP_BIG_ENDIAN)
1351     VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1352 #else
1353     VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1354 #endif
1355 }
1356 
1357 //@}
1358 
1359 /// \name LOGICAL OPERATIONS
1360 //@{
1361 
/// \brief Bitwise AND of two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd() performs <tt>vec1 & vec2</tt>. vec2 is first cast
///  to the type of vec1, and the result is returned as that type too.
/// \par Wraps
///  vec_and
/// \sa VecAnd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the type of the first.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
1380 
/// \brief Bitwise OR of two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr() performs <tt>vec1 | vec2</tt>. vec2 is first cast
///  to the type of vec1, and the result is returned as that type too.
/// \par Wraps
///  vec_or
/// \sa VecOr64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the type of the first.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
1399 
/// \brief Bitwise XOR of two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor() performs <tt>vec1 ^ vec2</tt>. vec2 is first cast
///  to the type of vec1, and the result is returned as that type too.
/// \par Wraps
///  vec_xor
/// \sa VecXor64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the type of the first.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
1418 
1419 //@}
1420 
1421 /// \name ARITHMETIC OPERATIONS
1422 //@{
1423 
1424 /// \brief Add two vectors
1425 /// \tparam T1 vector type
1426 /// \tparam T2 vector type
1427 /// \param vec1 the first vector
1428 /// \param vec2 the second vector
1429 /// \return vector
1430 /// \details VecAdd() performs <tt>vec1 + vec2</tt>.
1431 ///  vec2 is cast to the same type as vec1. The return vector
1432 ///  is the same type as vec1.
1433 /// \par Wraps
1434 ///  vec_add
1435 /// \sa VecAdd64
1436 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
    // The addend is viewed as T1 so the element width of vec1 is used.
    const T1 addend = (T1)vec2;
    return (T1)vec_add(vec1, addend);
}
1442 
/// \brief Subtract two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub() performs <tt>vec1 - vec2</tt>.
///  vec2 is cast to the same type as vec1 before the subtraction.
///  The return vector is the same type as vec1.
/// \par Wraps
///  vec_sub
/// \sa VecSub64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
    // The subtrahend is viewed as T1 so the element width of vec1 is used.
    const T1 subtrahend = (T1)vec2;
    return (T1)vec_sub(vec1, subtrahend);
}
1460 
1461 //@}
1462 
1463 /// \name PERMUTE OPERATIONS
1464 //@{
1465 
1466 /// \brief Permutes a vector
1467 /// \tparam T1 vector type
1468 /// \tparam T2 vector type
1469 /// \param vec the vector
1470 /// \param mask vector mask
1471 /// \return vector
1472 /// \details VecPermute() creates a new vector from vec according to mask.
1473 ///  mask is an uint8x16_p vector. The return vector is the same type as vec.
1474 /// \par Wraps
1475 ///  vec_perm
1476 /// \since Crypto++ 6.0
1477 template <class T1, class T2>
VecPermute(const T1 vec,const T2 mask)1478 inline T1 VecPermute(const T1 vec, const T2 mask)
1479 {
1480     return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1481 }
1482 
1483 /// \brief Permutes two vectors
1484 /// \tparam T1 vector type
1485 /// \tparam T2 vector type
1486 /// \param vec1 the first vector
1487 /// \param vec2 the second vector
1488 /// \param mask vector mask
1489 /// \return vector
1490 /// \details VecPermute() creates a new vector from vec1 and vec2 according to mask.
1491 ///  mask is an uint8x16_p vector. The return vector is the same type as vec.
1492 /// \par Wraps
1493 ///  vec_perm
1494 /// \since Crypto++ 6.0
1495 template <class T1, class T2>
VecPermute(const T1 vec1,const T1 vec2,const T2 mask)1496 inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1497 {
1498     return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1499 }
1500 
1501 //@}
1502 
1503 /// \name SHIFT AND ROTATE OPERATIONS
1504 //@{
1505 
1506 /// \brief Shift a vector left
1507 /// \tparam C shift byte count
1508 /// \tparam T vector type
1509 /// \param vec the vector
1510 /// \return vector
1511 /// \details VecShiftLeftOctet() returns a new vector after shifting the
1512 ///  concatenation of the zero vector and the source vector by the specified
1513 ///  number of bytes. The return vector is the same type as vec.
1514 /// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
1515 ///  c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
1516 ///  <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1517 ///  if on a big endian machine as shown below.
1518 /// <pre>
1519 ///   uint8x16_p x = VecLoad(ptr);
1520 ///   uint8x16_p y = VecShiftLeftOctet<12>(x);
1521 /// </pre>
1522 /// \par Wraps
1523 ///  vec_sld
1524 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1525 ///  endian sensitive?</A> on Stack Overflow
1526 /// \since Crypto++ 6.0
1527 template <unsigned int C, class T>
VecShiftLeftOctet(const T vec)1528 inline T VecShiftLeftOctet(const T vec)
1529 {
1530     const T zero = {0};
1531     if (C >= 16)
1532     {
1533         // Out of range
1534         return zero;
1535     }
1536     else if (C == 0)
1537     {
1538         // Noop
1539         return vec;
1540     }
1541     else
1542     {
1543 #if defined(CRYPTOPP_BIG_ENDIAN)
1544     enum { R=C&0xf };
1545     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1546 #else
1547     enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds
1548     return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1549 #endif
1550     }
1551 }
1552 
1553 /// \brief Shift a vector right
1554 /// \tparam C shift byte count
1555 /// \tparam T vector type
1556 /// \param vec the vector
1557 /// \return vector
1558 /// \details VecShiftRightOctet() returns a new vector after shifting the
1559 ///  concatenation of the zero vector and the source vector by the specified
1560 ///  number of bytes. The return vector is the same type as vec.
1561 /// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,
1562 ///  c)</tt>. On little endian machines VecShiftRightOctet() is translated to
1563 ///  <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1564 ///  if on a big endian machine as shown below.
1565 /// <pre>
1566 ///   uint8x16_p x = VecLoad(ptr);
1567 ///   uint8x16_p y = VecShiftRightOctet<12>(y);
1568 /// </pre>
1569 /// \par Wraps
1570 ///  vec_sld
1571 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1572 ///  endian sensitive?</A> on Stack Overflow
1573 /// \since Crypto++ 6.0
1574 template <unsigned int C, class T>
VecShiftRightOctet(const T vec)1575 inline T VecShiftRightOctet(const T vec)
1576 {
1577     const T zero = {0};
1578     if (C >= 16)
1579     {
1580         // Out of range
1581         return zero;
1582     }
1583     else if (C == 0)
1584     {
1585         // Noop
1586         return vec;
1587     }
1588     else
1589     {
1590 #if defined(CRYPTOPP_BIG_ENDIAN)
1591     enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds
1592     return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1593 #else
1594     enum { R=C&0xf };
1595     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1596 #endif
1597     }
1598 }
1599 
1600 /// \brief Rotate a vector left
1601 /// \tparam C shift byte count
1602 /// \tparam T vector type
1603 /// \param vec the vector
1604 /// \return vector
1605 /// \details VecRotateLeftOctet() returns a new vector after rotating the
1606 ///  concatenation of the source vector with itself by the specified
1607 ///  number of bytes. The return vector is the same type as vec.
1608 /// \par Wraps
1609 ///  vec_sld
1610 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1611 ///  endian sensitive?</A> on Stack Overflow
1612 /// \since Crypto++ 6.0
1613 template <unsigned int C, class T>
VecRotateLeftOctet(const T vec)1614 inline T VecRotateLeftOctet(const T vec)
1615 {
1616 #if defined(CRYPTOPP_BIG_ENDIAN)
1617     enum { R = C&0xf };
1618     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1619 #else
1620     enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds
1621     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1622 #endif
1623 }
1624 
1625 /// \brief Rotate a vector right
1626 /// \tparam C shift byte count
1627 /// \tparam T vector type
1628 /// \param vec the vector
1629 /// \return vector
1630 /// \details VecRotateRightOctet() returns a new vector after rotating the
1631 ///  concatenation of the source vector with itself by the specified
1632 ///  number of bytes. The return vector is the same type as vec.
1633 /// \par Wraps
1634 ///  vec_sld
1635 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1636 ///  endian sensitive?</A> on Stack Overflow
1637 /// \since Crypto++ 6.0
1638 template <unsigned int C, class T>
VecRotateRightOctet(const T vec)1639 inline T VecRotateRightOctet(const T vec)
1640 {
1641 #if defined(CRYPTOPP_BIG_ENDIAN)
1642     enum { R=(16-C)&0xf };  // Linux xlC 13.1 workaround in Debug builds
1643     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1644 #else
1645     enum { R = C&0xf };
1646     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1647 #endif
1648 }
1649 
1650 /// \brief Rotate a vector left
1651 /// \tparam C rotate bit count
1652 /// \param vec the vector
1653 /// \return vector
1654 /// \details VecRotateLeft() rotates each element in a vector by
1655 ///  bit count. The return vector is the same type as vec.
1656 /// \par Wraps
1657 ///  vec_rl
1658 /// \since Crypto++ 7.0
1659 template<unsigned int C>
VecRotateLeft(const uint32x4_p vec)1660 inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
1661 {
1662     const uint32x4_p m = {C, C, C, C};
1663     return vec_rl(vec, m);
1664 }
1665 
1666 /// \brief Rotate a vector right
1667 /// \tparam C rotate bit count
1668 /// \param vec the vector
1669 /// \return vector
1670 /// \details VecRotateRight() rotates each element in a vector
1671 ///  by bit count. The return vector is the same type as vec.
1672 /// \par Wraps
1673 ///  vec_rl
1674 /// \since Crypto++ 7.0
1675 template<unsigned int C>
VecRotateRight(const uint32x4_p vec)1676 inline uint32x4_p VecRotateRight(const uint32x4_p vec)
1677 {
1678     const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1679     return vec_rl(vec, m);
1680 }
1681 
1682 /// \brief Shift a vector left
1683 /// \tparam C shift bit count
1684 /// \param vec the vector
1685 /// \return vector
1686 /// \details VecShiftLeft() rotates each element in a vector
1687 ///  by bit count. The return vector is the same type as vec.
1688 /// \par Wraps
1689 ///  vec_sl
1690 /// \since Crypto++ 8.1
1691 template<unsigned int C>
VecShiftLeft(const uint32x4_p vec)1692 inline uint32x4_p VecShiftLeft(const uint32x4_p vec)
1693 {
1694     const uint32x4_p m = {C, C, C, C};
1695     return vec_sl(vec, m);
1696 }
1697 
1698 /// \brief Shift a vector right
1699 /// \tparam C shift bit count
1700 /// \param vec the vector
1701 /// \return vector
1702 /// \details VecShiftRight() rotates each element in a vector
1703 ///  by bit count. The return vector is the same type as vec.
1704 /// \par Wraps
1705 ///  vec_rl
1706 /// \since Crypto++ 8.1
1707 template<unsigned int C>
VecShiftRight(const uint32x4_p vec)1708 inline uint32x4_p VecShiftRight(const uint32x4_p vec)
1709 {
1710     const uint32x4_p m = {C, C, C, C};
1711     return vec_sr(vec, m);
1712 }
1713 
1714 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
1715 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1716 
1717 /// \brief Rotate a vector left
1718 /// \tparam C rotate bit count
1719 /// \param vec the vector
1720 /// \return vector
1721 /// \details VecRotateLeft() rotates each element in a vector
1722 ///  by bit count. The return vector is the same type as vec.
1723 /// \details VecRotateLeft() with 64-bit elements is available on
1724 ///  POWER8 and above.
1725 /// \par Wraps
1726 ///  vec_rl
1727 /// \since Crypto++ 8.0
1728 template<unsigned int C>
VecRotateLeft(const uint64x2_p vec)1729 inline uint64x2_p VecRotateLeft(const uint64x2_p vec)
1730 {
1731     const uint64x2_p m = {C, C};
1732     return vec_rl(vec, m);
1733 }
1734 
1735 /// \brief Shift a vector left
1736 /// \tparam C shift bit count
1737 /// \param vec the vector
1738 /// \return vector
1739 /// \details VecShiftLeft() rotates each element in a vector
1740 ///  by bit count. The return vector is the same type as vec.
1741 /// \details VecShiftLeft() with 64-bit elements is available on
1742 ///  POWER8 and above.
1743 /// \par Wraps
1744 ///  vec_sl
1745 /// \since Crypto++ 8.1
1746 template<unsigned int C>
VecShiftLeft(const uint64x2_p vec)1747 inline uint64x2_p VecShiftLeft(const uint64x2_p vec)
1748 {
1749     const uint64x2_p m = {C, C};
1750     return vec_sl(vec, m);
1751 }
1752 
1753 /// \brief Rotate a vector right
1754 /// \tparam C rotate bit count
1755 /// \param vec the vector
1756 /// \return vector
1757 /// \details VecRotateRight() rotates each element in a vector
1758 ///  by bit count. The return vector is the same type as vec.
1759 /// \details VecRotateRight() with 64-bit elements is available on
1760 ///  POWER8 and above.
1761 /// \par Wraps
1762 ///  vec_rl
1763 /// \since Crypto++ 8.0
1764 template<unsigned int C>
VecRotateRight(const uint64x2_p vec)1765 inline uint64x2_p VecRotateRight(const uint64x2_p vec)
1766 {
1767     const uint64x2_p m = {64-C, 64-C};
1768     return vec_rl(vec, m);
1769 }
1770 
1771 /// \brief Shift a vector right
1772 /// \tparam C shift bit count
1773 /// \param vec the vector
1774 /// \return vector
1775 /// \details VecShiftRight() rotates each element in a vector
1776 ///  by bit count. The return vector is the same type as vec.
1777 /// \details VecShiftRight() with 64-bit elements is available on
1778 ///  POWER8 and above.
1779 /// \par Wraps
1780 ///  vec_sr
1781 /// \since Crypto++ 8.1
1782 template<unsigned int C>
VecShiftRight(const uint64x2_p vec)1783 inline uint64x2_p VecShiftRight(const uint64x2_p vec)
1784 {
1785     const uint64x2_p m = {C, C};
1786     return vec_sr(vec, m);
1787 }
1788 
1789 #endif  // ARCH_PWR8
1790 
1791 //@}
1792 
1793 /// \name OTHER OPERATIONS
1794 //@{
1795 
1796 /// \brief Merge two vectors
1797 /// \tparam T vector type
1798 /// \param vec1 the first vector
1799 /// \param vec2 the second vector
1800 /// \return vector
1801 /// \par Wraps
1802 ///  vec_mergel
1803 /// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
    // Thin wrapper: the merge itself is done by vec_mergel.
    const T merged = vec_mergel(vec1, vec2);
    return merged;
}
1809 
1810 /// \brief Merge two vectors
1811 /// \tparam T vector type
1812 /// \param vec1 the first vector
1813 /// \param vec2 the second vector
1814 /// \return vector
1815 /// \par Wraps
1816 ///  vec_mergeh
1817 /// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
    // Thin wrapper: the merge itself is done by vec_mergeh.
    const T merged = vec_mergeh(vec1, vec2);
    return merged;
}
1823 
1824 /// \brief Broadcast 32-bit word to a vector
1825 /// \param val the 32-bit value
1826 /// \return vector
1827 /// \par Wraps
1828 ///  vec_splats
1829 /// \since Crypto++ 8.3
VecSplatWord(word32 val)1830 inline uint32x4_p VecSplatWord(word32 val)
1831 {
1832     // Fix spurious GCC warning???
1833     CRYPTOPP_UNUSED(val);
1834 
1835     // Apple Altivec and XL C++ do not offer vec_splats.
1836     // GCC offers vec_splats back to -mcpu=power4.
1837 #if defined(_ARCH_PWR4) && defined(__GNUC__)
1838     return vec_splats(val);
1839 #else
1840     //const word32 x[4] = {val,val,val,val};
1841     //return VecLoad(x);
1842     const word32 x[4] = {val};
1843     return vec_splat(VecLoad(x),0);
1844 #endif
1845 }
1846 
1847 /// \brief Broadcast 32-bit element to a vector
1848 /// \tparam the element number
1849 /// \param val the 32-bit value
1850 /// \return vector
1851 /// \par Wraps
1852 ///  vec_splat
1853 /// \since Crypto++ 8.3
1854 template <unsigned int N>
VecSplatElement(const uint32x4_p val)1855 inline uint32x4_p VecSplatElement(const uint32x4_p val)
1856 {
1857     return vec_splat(val, N);
1858 }
1859 
1860 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1861 /// \brief Broadcast 64-bit double word to a vector
1862 /// \param val the 64-bit value
1863 /// \return vector
1864 /// \par Wraps
1865 ///  vec_splats
1866 /// \since Crypto++ 8.3
VecSplatWord(word64 val)1867 inline uint64x2_p VecSplatWord(word64 val)
1868 {
1869     // The PPC64 ABI says so.
1870     return vec_splats((unsigned long long)val);
1871 }
1872 
1873 /// \brief Broadcast 64-bit element to a vector
1874 /// \tparam the element number
1875 /// \param val the 64-bit value
1876 /// \return vector
1877 /// \par Wraps
1878 ///  vec_splat
1879 /// \since Crypto++ 8.3
1880 template <unsigned int N>
VecSplatElement(const uint64x2_p val)1881 inline uint64x2_p VecSplatElement(const uint64x2_p val)
1882 {
1883 #if defined(__VSX__) || defined(_ARCH_PWR8)
1884     return vec_splat(val, N);
1885 #else
1886     enum {E=N&1};
1887     if (E == 0)
1888     {
1889         const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
1890         return vec_perm(val, val, m);
1891     }
1892     else // (E == 1)
1893     {
1894         const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
1895         return vec_perm(val, val, m);
1896     }
1897 #endif
1898 }
1899 #endif
1900 
1901 /// \brief Extract a dword from a vector
1902 /// \tparam T vector type
1903 /// \param val the vector
1904 /// \return vector created from low dword
1905 /// \details VecGetLow() extracts the low dword from a vector. The low dword
1906 ///  is composed of the least significant bits and occupies bytes 8 through 15
1907 ///  when viewed as a big endian array. The return vector is the same type as
1908 ///  the original vector and padded with 0's in the most significant bit positions.
1909 /// \par Wraps
1910 ///  vec_sld
1911 /// \since Crypto++ 7.0
1912 template <class T>
VecGetLow(const T val)1913 inline T VecGetLow(const T val)
1914 {
1915 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1916     const T zero = {0};
1917     return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
1918 #else
1919     return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1920 #endif
1921 }
1922 
1923 /// \brief Extract a dword from a vector
1924 /// \tparam T vector type
1925 /// \param val the vector
1926 /// \return vector created from high dword
1927 /// \details VecGetHigh() extracts the high dword from a vector. The high dword
1928 ///  is composed of the most significant bits and occupies bytes 0 through 7
1929 ///  when viewed as a big endian array. The return vector is the same type as
1930 ///  the original vector and padded with 0's in the most significant bit positions.
1931 /// \par Wraps
1932 ///  vec_sld
1933 /// \since Crypto++ 7.0
1934 template <class T>
VecGetHigh(const T val)1935 inline T VecGetHigh(const T val)
1936 {
1937 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1938     const T zero = {0};
1939     return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
1940 #else
1941     return VecShiftRightOctet<8>(val);
1942 #endif
1943 }
1944 
1945 /// \brief Exchange high and low double words
1946 /// \tparam T vector type
1947 /// \param vec the vector
1948 /// \return vector
1949 /// \par Wraps
1950 ///  vec_sld
1951 /// \since Crypto++ 7.0
1952 template <class T>
VecSwapWords(const T vec)1953 inline T VecSwapWords(const T vec)
1954 {
1955     return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1956 }
1957 
1958 //@}
1959 
1960 /// \name COMPARISON
1961 //@{
1962 
1963 /// \brief Compare two vectors
1964 /// \tparam T1 vector type
1965 /// \tparam T2 vector type
1966 /// \param vec1 the first vector
1967 /// \param vec2 the second vector
1968 /// \return true if vec1 equals vec2, false otherwise
1969 /// \details VecEqual() performs a bitwise compare. The vector element types do
1970 ///  not matter.
1971 /// \par Wraps
1972 ///  vec_all_eq
1973 /// \since Crypto++ 8.0
1974 template <class T1, class T2>
VecEqual(const T1 vec1,const T2 vec2)1975 inline bool VecEqual(const T1 vec1, const T2 vec2)
1976 {
1977     return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1978 }
1979 
1980 /// \brief Compare two vectors
1981 /// \tparam T1 vector type
1982 /// \tparam T2 vector type
1983 /// \param vec1 the first vector
1984 /// \param vec2 the second vector
1985 /// \return true if vec1 does not equal vec2, false otherwise
1986 /// \details VecNotEqual() performs a bitwise compare. The vector element types do
1987 ///  not matter.
1988 /// \par Wraps
1989 ///  vec_all_eq
1990 /// \since Crypto++ 8.0
1991 template <class T1, class T2>
VecNotEqual(const T1 vec1,const T2 vec2)1992 inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1993 {
1994     return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1995 }
1996 
1997 //@}
1998 
1999 ////////////////// 32-bit Altivec /////////////////
2000 
2001 /// \name 32-BIT ALTIVEC
2002 //@{
2003 
2004 /// \brief Add two vectors as if uint64x2_p
2005 /// \param vec1 the first vector
2006 /// \param vec2 the second vector
2007 /// \return vector
2008 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2009 ///  if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2010 ///  the carries from the elements.
2011 /// \par Wraps
2012 ///  vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
2013 /// \since Crypto++ 8.3
inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
{
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
    // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
    const uint64x2_p lhs = (uint64x2_p)vec1;
    const uint64x2_p rhs = (uint64x2_p)vec2;
    return (uint32x4_p)vec_add(lhs, rhs);
#else
    // The carry mask selects the carries for elements 1 and 3 and sets
    // the remaining elements to 0. The result is then shifted so the
    // carried values are added to elements 0 and 2.
    const uint32x4_p zero = {0, 0, 0, 0};
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint32x4_p mask = {0, 1, 0, 1};
#else
    const uint32x4_p mask = {1, 0, 1, 0};
#endif

    const uint32x4_p sum = vec_add(vec1, vec2);
    const uint32x4_p carries = vec_addc(vec1, vec2);
    const uint32x4_p selected = vec_and(mask, carries);
    const uint32x4_p moved = vec_sld(selected, zero, 4);
    return vec_add(sum, moved);
#endif
}
2038 
2039 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2040 /// \brief Add two vectors as if uint64x2_p
2041 /// \param vec1 the first vector
2042 /// \param vec2 the second vector
2043 /// \return vector
2044 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2045 ///  if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2046 ///  the carries from the elements.
2047 /// \par Wraps
2048 ///  vec_add for POWER8
2049 /// \since Crypto++ 8.3
inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
{
    // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
    const uint64x2_p sum = vec_add(vec1, vec2);

#if defined(CRYPTOPP_DEBUG)
    // Debug builds also run the uint32x4_p overload and compare results.
    const uint32x4_p lhs32 = (uint32x4_p)vec1;
    const uint32x4_p rhs32 = (uint32x4_p)vec2;
    const uint32x4_p expected = VecAdd64(lhs32, rhs32);

    CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)sum, expected) == 1);
#endif

    return sum;
}
2066 #endif
2067 
2068 /// \brief Subtract two vectors as if uint64x2_p
2069 /// \param vec1 the first vector
2070 /// \param vec2 the second vector
2071 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2072 ///  if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2073 ///  manages the borrows from the elements.
2074 /// \par Wraps
2075 ///  vec_sub for POWER8, vec_subc, vec_andc, vec_perm, vec_sub for Altivec
2076 /// \since Crypto++ 8.3
inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
{
#if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
    // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
    const uint64x2_p lhs = (uint64x2_p)vec1;
    const uint64x2_p rhs = (uint64x2_p)vec2;
    return (uint32x4_p)vec_sub(lhs, rhs);
#else
    // The borrow mask selects the borrows for elements 1 and 3 and sets
    // the remaining elements to 0. The result is then shifted so the
    // borrowed values are subtracted from elements 0 and 2.
    const uint32x4_p zero = {0, 0, 0, 0};
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint32x4_p mask = {0, 1, 0, 1};
#else
    const uint32x4_p mask = {1, 0, 1, 0};
#endif

    const uint32x4_p difference = vec_sub(vec1, vec2);
    // subc sets the complement of borrow, so we have to
    // un-complement it using andc.
    const uint32x4_p complement = vec_subc(vec1, vec2);
    const uint32x4_p borrows = vec_andc(mask, complement);
    const uint32x4_p moved = vec_sld(borrows, zero, 4);
    return vec_sub(difference, moved);
#endif
}
2103 
2104 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2105 /// \brief Subtract two vectors as if uint64x2_p
2106 /// \param vec1 the first vector
2107 /// \param vec2 the second vector
2108 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2109 ///  if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2110 ///  manages the borrows from the elements.
2111 /// \par Wraps
2112 ///  vec_sub for POWER8
2113 /// \since Crypto++ 8.3
inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
{
    // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
    const uint64x2_p difference = vec_sub(vec1, vec2);

#if defined(CRYPTOPP_DEBUG)
    // Debug builds also run the uint32x4_p overload and compare results.
    const uint32x4_p lhs32 = (uint32x4_p)vec1;
    const uint32x4_p rhs32 = (uint32x4_p)vec2;
    const uint32x4_p expected = VecSub64(lhs32, rhs32);

    CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)difference, expected) == 1);
#endif

    return difference;
}
2130 #endif
2131 
2132 /// \brief Rotate a vector left as if uint64x2_p
2133 /// \tparam C rotate bit count
2134 /// \param vec the vector
2135 /// \return vector
2136 /// \details VecRotateLeft() rotates each element in a vector by bit count.
2137 ///  vec is rotated as if uint64x2_p.
2138 /// \par Wraps
2139 ///  vec_rl
2140 /// \since Crypto++ 8.3
2141 template<unsigned int C>
VecRotateLeft64(const uint32x4_p vec)2142 inline uint32x4_p VecRotateLeft64(const uint32x4_p vec)
2143 {
2144 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2145     // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2146     return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
2147 #else
2148     // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2149     enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2150 
2151     // Get the low bits, shift them to high bits
2152     uint32x4_p t1 = VecShiftLeft<S32>(vec);
2153     // Get the high bits, shift them to low bits
2154     uint32x4_p t2 = VecShiftRight<32-S32>(vec);
2155 
2156     if (S64 == 0)
2157     {
2158         const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2159         return VecPermute(vec, m);
2160     }
2161     else if (S64 == 32)
2162     {
2163         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2164         return VecPermute(vec, m);
2165     }
2166     else if (BR)  // Big rotate amount?
2167     {
2168         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2169         t1 = VecPermute(t1, m);
2170     }
2171     else
2172     {
2173         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2174         t2 = VecPermute(t2, m);
2175     }
2176 
2177     return vec_or(t1, t2);
2178 #endif
2179 }
2180 
2181 /// \brief Rotate a vector left as if uint64x2_p
2182 /// \param vec the vector
2183 /// \return vector
/// \details VecRotateLeft64<8>() rotates each element in a vector
2185 ///  by 8-bits. vec is rotated as if uint64x2_p. This specialization
2186 ///  is used by algorithms like Speck128.
2187 /// \par Wraps
2188 ///  vec_rl
2189 /// \since Crypto++ 8.3
2190 template<>
2191 inline uint32x4_p VecRotateLeft64<8>(const uint32x4_p vec)
2192 {
2193 #if (CRYPTOPP_BIG_ENDIAN)
2194     const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2195     return VecPermute(vec, m);
2196 #else
2197     const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2198     return VecPermute(vec, m);
2199 #endif
2200 }
2201 
2202 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2203 /// \brief Rotate a vector left as if uint64x2_p
2204 /// \tparam C rotate bit count
2205 /// \param vec the vector
2206 /// \return vector
2207 /// \details VecRotateLeft64() rotates each element in a vector by
2208 ///  bit count. vec is rotated as if uint64x2_p.
2209 /// \par Wraps
2210 ///  vec_rl
2211 /// \since Crypto++ 8.3
2212 template<unsigned int C>
VecRotateLeft64(const uint64x2_p vec)2213 inline uint64x2_p VecRotateLeft64(const uint64x2_p vec)
2214 {
2215     // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2216     const uint64x2_p res = VecRotateLeft<C>(vec);
2217 
2218 #if defined(CRYPTOPP_DEBUG)
2219     // Test 32-bit rotate in debug builds while we are here.
2220     const uint32x4_p x = (uint32x4_p)vec;
2221     const uint32x4_p r = VecRotateLeft64<C>(x);
2222 
2223     CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2224 #endif
2225 
2226     return res;
2227 }
2228 #endif
2229 
2230 /// \brief Rotate a vector right as if uint64x2_p
2231 /// \tparam C rotate bit count
2232 /// \param vec the vector
2233 /// \return vector
2234 /// \details VecRotateRight64() rotates each element in a vector by
2235 ///  bit count. vec is rotated as if uint64x2_p.
2236 /// \par Wraps
2237 ///  vec_rl
2238 /// \since Crypto++ 8.3
2239 template<unsigned int C>
VecRotateRight64(const uint32x4_p vec)2240 inline uint32x4_p VecRotateRight64(const uint32x4_p vec)
2241 {
2242 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2243     // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2244     return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
2245 #else
2246     // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2247     enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2248 
2249     // Get the low bits, shift them to high bits
2250     uint32x4_p t1 = VecShiftRight<S32>(vec);
2251     // Get the high bits, shift them to low bits
2252     uint32x4_p t2 = VecShiftLeft<32-S32>(vec);
2253 
2254     if (S64 == 0)
2255     {
2256         const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2257         return VecPermute(vec, m);
2258     }
2259     else if (S64 == 32)
2260     {
2261         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2262         return VecPermute(vec, m);
2263     }
2264     else if (BR)  // Big rotate amount?
2265     {
2266         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2267         t1 = VecPermute(t1, m);
2268     }
2269     else
2270     {
2271         const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2272         t2 = VecPermute(t2, m);
2273     }
2274 
2275     return vec_or(t1, t2);
2276 #endif
2277 }
2278 
2279 /// \brief Rotate a vector right as if uint64x2_p
2280 /// \param vec the vector
2281 /// \return vector
2282 /// \details VecRotateRight64<8>() rotates each element in a vector
2283 ///  by 8-bits. vec is rotated as if uint64x2_p. This specialization
2284 ///  is used by algorithms like Speck128.
2285 /// \details vec is rotated as if uint64x2_p.
2286 /// \par Wraps
2287 ///  vec_rl
2288 /// \since Crypto++ 8.3
2289 template<>
2290 inline uint32x4_p VecRotateRight64<8>(const uint32x4_p vec)
2291 {
2292 #if (CRYPTOPP_BIG_ENDIAN)
2293     const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2294     return VecPermute(vec, m);
2295 #else
2296     const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2297     return VecPermute(vec, m);
2298 #endif
2299 }
2300 
2301 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2302 /// \brief Rotate a vector right as if uint64x2_p
2303 /// \tparam C rotate bit count
2304 /// \param vec the vector
2305 /// \return vector
2306 /// \details VecRotateRight64() rotates each element in a vector by
2307 ///  bit count. vec is rotated as if uint64x2_p.
2308 /// \par Wraps
2309 ///  vec_rl
2310 /// \since Crypto++ 8.3
2311 template<unsigned int C>
VecRotateRight64(const uint64x2_p vec)2312 inline uint64x2_p VecRotateRight64(const uint64x2_p vec)
2313 {
2314     // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2315     const uint64x2_p res = VecRotateRight<C>(vec);
2316 
2317 #if defined(CRYPTOPP_DEBUG)
2318     // Test 32-bit rotate in debug builds while we are here.
2319     const uint32x4_p x = (uint32x4_p)vec;
2320     const uint32x4_p r = VecRotateRight64<C>(x);
2321 
2322     CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2323 #endif
2324 
2325     return res;
2326 }
2327 #endif
2328 
/// \brief AND two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd64() performs <tt>vec1 & vec2</tt>. vec2 is cast to
///  the same type as vec1, and the result is returned as the type
///  of vec1.
/// \details VecAnd64() is a convenience function. It applies
///  <tt>vec_and</tt> directly to the two operands.
/// \par Wraps
///  vec_and
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecAnd64(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's type
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
2347 
/// \brief OR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr64() performs <tt>vec1 | vec2</tt>. vec2 is cast to
///  the same type as vec1, and the result is returned as the type
///  of vec1.
/// \details VecOr64() is a convenience function. It applies
///  <tt>vec_or</tt> directly to the two operands.
/// \par Wraps
///  vec_or
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecOr64(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's type
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
2366 
/// \brief XOR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor64() performs <tt>vec1 ^ vec2</tt>. vec2 is cast to
///  the same type as vec1, and the result is returned as the type
///  of vec1.
/// \details VecXor64() is a convenience function. It applies
///  <tt>vec_xor</tt> directly to the two operands.
/// \par Wraps
///  vec_xor
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecXor64(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to the first operand's type
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
2385 
2386 /// \brief Broadcast 64-bit double word to a vector
2387 /// \param val the 64-bit value
2388 /// \return vector
2389 /// \par Wraps
2390 ///  vec_splats
2391 /// \since Crypto++ 8.3
VecSplatWord64(word64 val)2392 inline uint32x4_p VecSplatWord64(word64 val)
2393 {
2394 #if defined(_ARCH_PWR8)
2395     // The PPC64 ABI says so.
2396     return (uint32x4_p)vec_splats((unsigned long long)val);
2397 #else
2398     const word64 x[2] = {val,val};
2399     return (uint32x4_p)VecLoad((const word32*)x);
2400 #endif
2401 }
2402 
2403 /// \brief Broadcast 64-bit element to a vector as if uint64x2_p
2404 /// \tparam the element number
2405 /// \param val the 64-bit value
2406 /// \return vector
2407 /// \par Wraps
2408 ///  vec_splat
2409 /// \since Crypto++ 8.3
2410 template <unsigned int N>
VecSplatElement64(const uint32x4_p val)2411 inline uint32x4_p VecSplatElement64(const uint32x4_p val)
2412 {
2413 #if defined(__VSX__) || defined(_ARCH_PWR8)
2414     return (uint32x4_p)vec_splat((uint64x2_p)val, N);
2415 #else
2416     enum {E=N&1};
2417     if (E == 0)
2418     {
2419         const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
2420         return (uint32x4_p)vec_perm(val, val, m);
2421     }
2422     else // (E == 1)
2423     {
2424         const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
2425         return (uint32x4_p)vec_perm(val, val, m);
2426     }
2427 #endif
2428 }
2429 
2430 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2431 /// \brief Broadcast 64-bit element to a vector
2432 /// \tparam the element number
2433 /// \param val the 64-bit value
2434 /// \return vector
2435 /// \since Crypto++ 8.3
2436 template <unsigned int N>
VecSplatElement64(const uint64x2_p val)2437 inline uint64x2_p VecSplatElement64(const uint64x2_p val)
2438 {
2439     return vec_splat(val, N);
2440 }
2441 #endif
2442 
2443 //@}
2444 
2445 //////////////////////// Power8 Crypto ////////////////////////
2446 
2447 // __CRYPTO__ alone is not enough. Clang will define __CRYPTO__
2448 // when it is not available, like with Power7. Sigh...
2449 #if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2450 
2451 /// \name POLYNOMIAL MULTIPLICATION
2452 //@{
2453 
2454 /// \brief Polynomial multiplication
2455 /// \param a the first term
2456 /// \param b the second term
2457 /// \return vector product
2458 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2459 ///  polynomial multiplication multiplies the high and low terms, and then
2460 ///  XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2461 ///  al*bl</tt>. It is different behavior than Intel polynomial
2462 ///  multiplication. To obtain a single product without the XOR, then set
2463 ///  one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2464 ///  results in <tt>0*bh XOR al*bl = al*bl</tt>.
2465 /// \par Wraps
2466 ///  __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.
2467 /// \since Crypto++ 8.1
VecPolyMultiply(const uint32x4_p & a,const uint32x4_p & b)2468 inline uint32x4_p VecPolyMultiply(const uint32x4_p& a, const uint32x4_p& b)
2469 {
2470 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2471     return __vpmsumw (a, b);
2472 #elif defined(__clang__)
2473     return __builtin_altivec_crypto_vpmsumw (a, b);
2474 #else
2475     return __builtin_crypto_vpmsumw (a, b);
2476 #endif
2477 }
2478 
2479 /// \brief Polynomial multiplication
2480 /// \param a the first term
2481 /// \param b the second term
2482 /// \return vector product
2483 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2484 ///  polynomial multiplication multiplies the high and low terms, and then
2485 ///  XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2486 ///  al*bl</tt>. It is different behavior than Intel polynomial
2487 ///  multiplication. To obtain a single product without the XOR, then set
2488 ///  one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2489 ///  results in <tt>0*bh XOR al*bl = al*bl</tt>.
2490 /// \par Wraps
2491 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2492 /// \since Crypto++ 8.1
VecPolyMultiply(const uint64x2_p & a,const uint64x2_p & b)2493 inline uint64x2_p VecPolyMultiply(const uint64x2_p& a, const uint64x2_p& b)
2494 {
2495 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2496     return __vpmsumd (a, b);
2497 #elif defined(__clang__)
2498     return __builtin_altivec_crypto_vpmsumd (a, b);
2499 #else
2500     return __builtin_crypto_vpmsumd (a, b);
2501 #endif
2502 }
2503 
2504 /// \brief Polynomial multiplication
2505 /// \param a the first term
2506 /// \param b the second term
2507 /// \return vector product
2508 /// \details VecIntelMultiply00() performs polynomial multiplication and presents
2509 ///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
2510 ///  The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
2511 ///  are multiplied.
2512 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2513 ///  is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2514 /// \par Wraps
2515 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2516 /// \since Crypto++ 8.0
VecIntelMultiply00(const uint64x2_p & a,const uint64x2_p & b)2517 inline uint64x2_p VecIntelMultiply00(const uint64x2_p& a, const uint64x2_p& b)
2518 {
2519 #if defined(CRYPTOPP_BIG_ENDIAN)
2520     return VecSwapWords(VecPolyMultiply(VecGetHigh(a), VecGetHigh(b)));
2521 #else
2522     return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
2523 #endif
2524 }
2525 
2526 /// \brief Polynomial multiplication
2527 /// \param a the first term
2528 /// \param b the second term
2529 /// \return vector product
2530 /// \details VecIntelMultiply01 performs() polynomial multiplication and presents
2531 ///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
2532 ///  The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
2533 ///  64-bits of <tt>b</tt> are multiplied.
2534 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2535 ///  is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2536 /// \par Wraps
2537 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2538 /// \since Crypto++ 8.0
VecIntelMultiply01(const uint64x2_p & a,const uint64x2_p & b)2539 inline uint64x2_p VecIntelMultiply01(const uint64x2_p& a, const uint64x2_p& b)
2540 {
2541 #if defined(CRYPTOPP_BIG_ENDIAN)
2542     return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
2543 #else
2544     return VecPolyMultiply(a, VecGetHigh(b));
2545 #endif
2546 }
2547 
2548 /// \brief Polynomial multiplication
2549 /// \param a the first term
2550 /// \param b the second term
2551 /// \return vector product
2552 /// \details VecIntelMultiply10() performs polynomial multiplication and presents
2553 ///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
2554 ///  The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
2555 ///  64-bits of <tt>b</tt> are multiplied.
2556 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2557 ///  is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2558 /// \par Wraps
2559 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2560 /// \since Crypto++ 8.0
VecIntelMultiply10(const uint64x2_p & a,const uint64x2_p & b)2561 inline uint64x2_p VecIntelMultiply10(const uint64x2_p& a, const uint64x2_p& b)
2562 {
2563 #if defined(CRYPTOPP_BIG_ENDIAN)
2564     return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
2565 #else
2566     return VecPolyMultiply(VecGetHigh(a), b);
2567 #endif
2568 }
2569 
2570 /// \brief Polynomial multiplication
2571 /// \param a the first term
2572 /// \param b the second term
2573 /// \return vector product
2574 /// \details VecIntelMultiply11() performs polynomial multiplication and presents
2575 ///  the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
2576 ///  The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
2577 ///  are multiplied.
2578 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2579 ///  is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2580 /// \par Wraps
2581 ///  __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2582 /// \since Crypto++ 8.0
VecIntelMultiply11(const uint64x2_p & a,const uint64x2_p & b)2583 inline uint64x2_p VecIntelMultiply11(const uint64x2_p& a, const uint64x2_p& b)
2584 {
2585 #if defined(CRYPTOPP_BIG_ENDIAN)
2586     return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
2587 #else
2588     return VecPolyMultiply(VecGetLow(a), b);
2589 #endif
2590 }
2591 
2592 //@}
2593 
2594 /// \name AES ENCRYPTION
2595 //@{
2596 
2597 /// \brief One round of AES encryption
2598 /// \tparam T1 vector type
2599 /// \tparam T2 vector type
2600 /// \param state the state vector
2601 /// \param key the subkey vector
2602 /// \details VecEncrypt() performs one round of AES encryption of state
2603 ///  using subkey key. The return vector is the same type as state.
2604 /// \details VecEncrypt() is available on POWER8 and above.
2605 /// \par Wraps
2606 ///  __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
2607 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2608 template <class T1, class T2>
VecEncrypt(const T1 state,const T2 key)2609 inline T1 VecEncrypt(const T1 state, const T2 key)
2610 {
2611 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2612     return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
2613 #elif defined(__clang__)
2614     return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2615 #elif defined(__GNUC__)
2616     return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2617 #else
2618     CRYPTOPP_ASSERT(0);
2619 #endif
2620 }
2621 
2622 /// \brief Final round of AES encryption
2623 /// \tparam T1 vector type
2624 /// \tparam T2 vector type
2625 /// \param state the state vector
2626 /// \param key the subkey vector
2627 /// \details VecEncryptLast() performs the final round of AES encryption
2628 ///  of state using subkey key. The return vector is the same type as state.
2629 /// \details VecEncryptLast() is available on POWER8 and above.
2630 /// \par Wraps
2631 ///  __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
2632 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2633 template <class T1, class T2>
VecEncryptLast(const T1 state,const T2 key)2634 inline T1 VecEncryptLast(const T1 state, const T2 key)
2635 {
2636 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2637     return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
2638 #elif defined(__clang__)
2639     return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2640 #elif defined(__GNUC__)
2641     return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2642 #else
2643     CRYPTOPP_ASSERT(0);
2644 #endif
2645 }
2646 
2647 /// \brief One round of AES decryption
2648 /// \tparam T1 vector type
2649 /// \tparam T2 vector type
2650 /// \param state the state vector
2651 /// \param key the subkey vector
2652 /// \details VecDecrypt() performs one round of AES decryption of state
2653 ///  using subkey key. The return vector is the same type as state.
2654 /// \details VecDecrypt() is available on POWER8 and above.
2655 /// \par Wraps
2656 ///  __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
2657 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2658 template <class T1, class T2>
VecDecrypt(const T1 state,const T2 key)2659 inline T1 VecDecrypt(const T1 state, const T2 key)
2660 {
2661 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2662     return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
2663 #elif defined(__clang__)
2664     return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2665 #elif defined(__GNUC__)
2666     return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2667 #else
2668     CRYPTOPP_ASSERT(0);
2669 #endif
2670 }
2671 
2672 /// \brief Final round of AES decryption
2673 /// \tparam T1 vector type
2674 /// \tparam T2 vector type
2675 /// \param state the state vector
2676 /// \param key the subkey vector
2677 /// \details VecDecryptLast() performs the final round of AES decryption
2678 ///  of state using subkey key. The return vector is the same type as state.
2679 /// \details VecDecryptLast() is available on POWER8 and above.
2680 /// \par Wraps
2681 ///  __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
2682 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2683 template <class T1, class T2>
VecDecryptLast(const T1 state,const T2 key)2684 inline T1 VecDecryptLast(const T1 state, const T2 key)
2685 {
2686 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2687     return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
2688 #elif defined(__clang__)
2689     return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2690 #elif defined(__GNUC__)
2691     return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2692 #else
2693     CRYPTOPP_ASSERT(0);
2694 #endif
2695 }
2696 
2697 //@}
2698 
2699 /// \name SHA DIGESTS
2700 //@{
2701 
2702 /// \brief SHA256 Sigma functions
2703 /// \tparam func function
2704 /// \tparam fmask function mask
2705 /// \tparam T vector type
2706 /// \param data the block to transform
2707 /// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
2708 ///  func and fmask. The return vector is the same type as data.
2709 /// \details VecSHA256() is available on POWER8 and above.
2710 /// \par Wraps
2711 ///  __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
2712 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2713 template <int func, int fmask, class T>
VecSHA256(const T data)2714 inline T VecSHA256(const T data)
2715 {
2716 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2717     return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
2718 #elif defined(__clang__)
2719     return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2720 #elif defined(__GNUC__)
2721     return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2722 #else
2723     CRYPTOPP_ASSERT(0);
2724 #endif
2725 }
2726 
2727 /// \brief SHA512 Sigma functions
2728 /// \tparam func function
2729 /// \tparam fmask function mask
2730 /// \tparam T vector type
2731 /// \param data the block to transform
2732 /// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
2733 ///  func and fmask. The return vector is the same type as data.
2734 /// \details VecSHA512() is available on POWER8 and above.
2735 /// \par Wraps
2736 ///  __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
2737 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2738 template <int func, int fmask, class T>
VecSHA512(const T data)2739 inline T VecSHA512(const T data)
2740 {
2741 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2742     return (T)__vshasigmad((uint64x2_p)data, func, fmask);
2743 #elif defined(__clang__)
2744     return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2745 #elif defined(__GNUC__)
2746     return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2747 #else
2748     CRYPTOPP_ASSERT(0);
2749 #endif
2750 }
2751 
2752 //@}
2753 
2754 #endif  // __CRYPTO__
2755 
2756 #endif  // _ALTIVEC_
2757 
2758 NAMESPACE_END
2759 
2760 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
2761 # pragma GCC diagnostic pop
2762 #endif
2763 
2764 #endif  // CRYPTOPP_PPC_CRYPTO_H
2765