1 // ppc_simd.h - written and placed in public domain by Jeffrey Walton
2
3 /// \file ppc_simd.h
4 /// \brief Support functions for PowerPC and vector operations
5 /// \details This header provides an agnostic interface into Clang, GCC
6 /// and IBM XL C/C++ compilers modulo their different built-in functions
/// for accessing vector instructions.
/// \details The abstractions are necessary to support back to GCC 4.8 and
9 /// XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
10 /// default compiler for GCC112, GCC119 and others on the compile farm.
11 /// Older IBM XL C/C++ compilers also have the need due to lack of
12 /// <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
13 /// compilers provide best support and don't need many of the hacks
14 /// below.
15 /// \details The library is tested with the following PowerPC machines and
16 /// compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
17 /// the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
18 /// - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
19 /// - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
20 /// - GCC110, Linux, POWER7, GCC 4.8.5
21 /// - GCC110, Linux, POWER7, XLC 12.01
22 /// - GCC111, AIX, POWER7, GCC 4.8.1
23 /// - GCC111, AIX, POWER7, XLC 12.01
24 /// - GCC112, Linux, POWER8, GCC 4.8.5
25 /// - GCC112, Linux, POWER8, XLC 13.01
26 /// - GCC112, Linux, POWER8, Clang 7.0
27 /// - GCC119, AIX, POWER8, GCC 7.2.0
28 /// - GCC119, AIX, POWER8, XLC 13.01
29 /// - GCC135, Linux, POWER9, GCC 7.0
30 /// \details 12 machines are used for testing because the three compilers form
31 /// five or six profiles. The profiles are listed below.
32 /// - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
33 /// - XLC 13.0 and earlier (all IBM components)
34 /// - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
35 /// - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
36 /// - early LLVM Clang (traditional Clang compiler)
37 /// - late LLVM Clang (traditional Clang compiler)
/// \details The LLVM front-end makes it tricky to write portable code because
/// LLVM pretends to be other compilers but cannot consume other compilers'
/// builtins. When using XLC with -qxlcompatmacros the compiler pretends to
/// be GCC, Clang and XLC all at once but it can only consume its own variety
/// of builtins.
43 /// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were
44 /// renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was
45 /// changed to <tt>VecAnd</tt>. The name change helped consolidate two
46 /// slightly different implementations.
/// \details At Crypto++ 8.3 the library added select 64-bit functions for
/// 32-bit Altivec. For example, <tt>VecAdd64</tt> and <tt>VecSub64</tt>
/// take 32-bit vectors and add or subtract them as if they were vectors
/// with two 64-bit elements. The functions dramatically improve performance
/// for some algorithms on some platforms, like SIMON128 and SPECK128 on
/// Power6 and earlier. For example, SPECK128 improved from 70 cpb to
/// 10 cpb on an old PowerMac. Use the functions as shown below.
54 /// <pre>
55 /// \#if defined(_ARCH_PWR8)
56 /// \# define speck128_t uint64x2_p
57 /// \#else
58 /// \# define speck128_t uint32x4_p
59 /// \#endif
60 ///
61 /// speck128_t rk, x1, x2, y1, y2;
62 /// rk = (speck128_t)VecLoadAligned(ptr);
63 /// x1 = VecRotateRight64<8>(x1);
64 /// x1 = VecAdd64(x1, y1);
65 /// ...</pre>
66 /// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0
67
// Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting
// actual availability of the feature for the source file being compiled.
// The preprocessor macros depend on compiler options like -maltivec, and
// not on compiler versions.
72
73 // For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html
74 // For XLC see the Compiler Reference manual. For Clang you have to experiment.
75 // Clang does not document the compiler options, does not reject options it does
76 // not understand, and pretends to be other compilers even though it cannot
77 // process the builtins and intrinsics. Clang will waste hours of your time.
78
79 // DO NOT USE this pattern in VecLoad and VecStore. We have to use the
80 // code paths guarded by preprocessor macros because XLC 12 generates
81 // bad code in some places. To verify the bad code generation test on
82 // GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
83 //
84 // inline uint32x4_p VecLoad(const byte src[16])
85 // {
86 // #if defined(__VSX__) || defined(_ARCH_PWR8)
87 // return (uint32x4_p) *(uint8x16_p*)((byte*)src);
88 // #else
89 // return VecLoad_ALTIVEC(src);
90 // #endif
91 // }
92
93 // We should be able to perform the load using inline asm on Power7 with
94 // VSX or Power8. The inline asm will avoid C undefined behavior due to
95 // casting from byte* to word32*. We are safe because our byte* are
96 // 16-byte aligned for Altivec. Below is the big endian load. Little
97 // endian would need to follow with xxpermdi for the reversal.
98 //
99 // __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );
100
101 // GCC and XLC use integer math for the address (D-form or byte-offset
102 // in the ISA manual). LLVM uses pointer math for the address (DS-form
103 // or indexed in the ISA manual). To keep them consistent we calculate
104 // the address from the offset and pass to a load or store function
105 // using a 0 offset.
106
107 #ifndef CRYPTOPP_PPC_CRYPTO_H
108 #define CRYPTOPP_PPC_CRYPTO_H
109
110 #include "config.h"
111 #include "misc.h"
112
113 #if defined(__ALTIVEC__)
114 # include <altivec.h>
115 # undef vector
116 # undef pixel
117 # undef bool
118 #endif
119
// XL C++ on AIX does not define __VSX__ and does not provide an
// option to set it. It is defined here when all of _AIX,
// _ARCH_PWR7 and __xlC__ are defined, so the VSX code paths
// below are enabled. Defining CRYPTOPP_DISABLE_POWER7 skips
// the fallback. This define must stay in sync with the define
// in test_ppc_power7.cxx.
#ifndef CRYPTOPP_DISABLE_POWER7
# if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
#  define __VSX__ 1
# endif
#endif
129
// XL C++ on AIX does not define __CRYPTO__ and does not provide
// an option to set it. It is defined here when all of _AIX,
// _ARCH_PWR8 and __xlC__ are defined. Defining
// CRYPTOPP_DISABLE_POWER8 skips the fallback. This define must
// stay in sync with the define in test_ppc_power8.cxx.
#ifndef CRYPTOPP_DISABLE_POWER8
# if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
#  define __CRYPTO__ 1
# endif
#endif
139
/// \brief Cast array to vector pointer
/// \details CONST_V8_CAST casts a const array to a vector
///  pointer for a byte array. The Power ABI says source arrays
///  are non-const, so this define removes the const. XLC++ will
///  fail the compile if the source array is const.
/// \details The expansion is a C-style cast because the macro is
///  applied to both pointers and <tt>uintptr_t</tt> addresses
///  in this file.
#define CONST_V8_CAST(x)  ((unsigned char*)(x))
/// \brief Cast array to vector pointer
/// \details CONST_V32_CAST casts a const array to a vector
///  pointer for a word array. The Power ABI says source arrays
///  are non-const, so this define removes the const. XLC++ will
///  fail the compile if the source array is const.
/// \details The expansion is a C-style cast because the macro is
///  applied to both pointers and <tt>uintptr_t</tt> addresses
///  in this file.
#define CONST_V32_CAST(x) ((unsigned int*)(x))
/// \brief Cast array to vector pointer
/// \details CONST_V64_CAST casts a const array to a vector
///  pointer for a double word array. The Power ABI says source arrays
///  are non-const, so this define removes the const. XLC++ will
///  fail the compile if the source array is const.
/// \details The expansion is a C-style cast to
///  <tt>unsigned long long*</tt>.
#define CONST_V64_CAST(x) ((unsigned long long*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V8_CAST casts an array to a vector
///  pointer for a byte array. The expansion is identical to
///  CONST_V8_CAST, a C-style cast to <tt>unsigned char*</tt>.
///  NOTE(review): the name suggests it is meant for non-const
///  arrays; its uses are not visible in this part of the file.
#define NCONST_V8_CAST(x)  ((unsigned char*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V32_CAST casts an array to a vector
///  pointer for a word array. The expansion is identical to
///  CONST_V32_CAST, a C-style cast to <tt>unsigned int*</tt>.
///  NOTE(review): the name suggests it is meant for non-const
///  arrays; its uses are not visible in this part of the file.
#define NCONST_V32_CAST(x) ((unsigned int*)(x))
/// \brief Cast array to vector pointer
/// \details NCONST_V64_CAST casts an array to a vector
///  pointer for a double word array. The expansion is identical to
///  CONST_V64_CAST, a C-style cast to <tt>unsigned long long*</tt>.
///  NOTE(review): the name suggests it is meant for non-const
///  arrays; its uses are not visible in this part of the file.
#define NCONST_V64_CAST(x) ((unsigned long long*)(x))
176
// VecLoad_ALTIVEC and VecStore_ALTIVEC are too noisy on modern
// compilers, so -Wdeprecated is silenced for the code that follows
// when CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE is set.
// NOTE(review): the matching "diagnostic pop" is expected later in
// the file; it is not visible in this section.
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wdeprecated"
#endif
183
184 NAMESPACE_BEGIN(CryptoPP)
185
186 #if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
187
/// \brief Vector of 8-bit elements
/// \details uint8x16_p is the library's name for the compiler's
///  <tt>__vector unsigned char</tt> type.
/// \par Wraps
///  __vector unsigned char
/// \since Crypto++ 6.0
typedef __vector unsigned char uint8x16_p;
/// \brief Vector of 16-bit elements
/// \details uint16x8_p is the library's name for the compiler's
///  <tt>__vector unsigned short</tt> type.
/// \par Wraps
///  __vector unsigned short
/// \since Crypto++ 6.0
typedef __vector unsigned short uint16x8_p;
/// \brief Vector of 32-bit elements
/// \details uint32x4_p is the library's name for the compiler's
///  <tt>__vector unsigned int</tt> type. It is the return type
///  of most load functions in this file.
/// \par Wraps
///  __vector unsigned int
/// \since Crypto++ 6.0
typedef __vector unsigned int uint32x4_p;
203
#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Vector of 64-bit elements
/// \details uint64x2_p is available on POWER7 with VSX and above. Most
///  supporting functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>)
///  and <tt>vec_sub</tt> (<tt>vsubudm</tt>), did not arrive until POWER8.
/// \details The typedef is only declared when <tt>__VSX__</tt> or
///  <tt>_ARCH_PWR8</tt> is defined (or when Doxygen is processing the file).
/// \par Wraps
///  __vector unsigned long long
/// \since Crypto++ 6.0
typedef __vector unsigned long long uint64x2_p;
#endif  // VSX or ARCH_PWR8
214
215 /// \brief The 0 vector
216 /// \return a 32-bit vector of 0's
217 /// \since Crypto++ 8.0
VecZero()218 inline uint32x4_p VecZero()
219 {
220 const uint32x4_p v = {0,0,0,0};
221 return v;
222 }
223
224 /// \brief The 1 vector
225 /// \return a 32-bit vector of 1's
226 /// \since Crypto++ 8.0
VecOne()227 inline uint32x4_p VecOne()
228 {
229 const uint32x4_p v = {1,1,1,1};
230 return v;
231 }
232
233 /// \brief Reverse bytes in a vector
234 /// \tparam T vector type
235 /// \param data the vector
236 /// \return vector
237 /// \details VecReverse() reverses the bytes in a vector
238 /// \par Wraps
239 /// vec_perm
240 /// \since Crypto++ 6.0
241 template <class T>
VecReverse(const T data)242 inline T VecReverse(const T data)
243 {
244 #if defined(CRYPTOPP_BIG_ENDIAN)
245 const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
246 return (T)vec_perm(data, data, mask);
247 #else
248 const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
249 return (T)vec_perm(data, data, mask);
250 #endif
251 }
252
/// \brief Reverse bytes in a vector on little-endian systems
/// \tparam T vector type
/// \param data the vector
/// \return the byte-reversed vector when <tt>CRYPTOPP_LITTLE_ENDIAN</tt>
///  is defined, otherwise <tt>data</tt> unchanged
/// \details VecReverseLE() reverses the bytes in a vector on
///  little-endian systems. It is a no-op elsewhere.
/// \par Wraps
///  vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseLE(const T data)
{
#if !defined(CRYPTOPP_LITTLE_ENDIAN)
    // Nothing to do when the little-endian macro is absent.
    return data;
#else
    const uint8x16_p reverse = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, reverse);
#endif
}
272
/// \brief Reverse bytes in a vector on big-endian systems
/// \tparam T vector type
/// \param data the vector
/// \return the byte-reversed vector when <tt>CRYPTOPP_BIG_ENDIAN</tt>
///  is defined, otherwise <tt>data</tt> unchanged
/// \details VecReverseBE() reverses the bytes in a vector on
///  big-endian systems. It is a no-op elsewhere.
/// \par Wraps
///  vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseBE(const T data)
{
#if !defined(CRYPTOPP_BIG_ENDIAN)
    // Nothing to do when the big-endian macro is absent.
    return data;
#else
    const uint8x16_p reverse = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, reverse);
#endif
}
292
293 /// \name LOAD OPERATIONS
294 //@{
295
296 /// \brief Loads a vector from a byte array
297 /// \param src the byte array
298 /// \details Loads a vector in native endian format from a byte array.
299 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
300 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
301 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
302 /// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so
303 /// you should provide aligned memory adresses.
304 /// \par Wraps
305 /// vec_ld, vec_lvsl, vec_perm
306 /// \sa VecLoad, VecLoadAligned
307 /// \since Crypto++ 6.0
VecLoad_ALTIVEC(const byte src[16])308 inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
309 {
310 // Avoid IsAlignedOn for convenience.
311 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
312 if (addr % 16 == 0)
313 {
314 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
315 }
316 else
317 {
318 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
319 const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
320 const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
321 const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
322 return (uint32x4_p)vec_perm(low, high, perm);
323 }
324 }
325
326 /// \brief Loads a vector from a byte array
327 /// \param src the byte array
328 /// \param off offset into the src byte array
329 /// \details Loads a vector in native endian format from a byte array.
330 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
331 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
332 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
333 /// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
334 /// relatively expensive so you should provide aligned memory adresses.
335 /// \par Wraps
336 /// vec_ld, vec_lvsl, vec_perm
337 /// \sa VecLoad, VecLoadAligned
338 /// \since Crypto++ 6.0
VecLoad_ALTIVEC(int off,const byte src[16])339 inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
340 {
341 // Avoid IsAlignedOn for convenience.
342 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
343 if (addr % 16 == 0)
344 {
345 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
346 }
347 else
348 {
349 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
350 const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
351 const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
352 const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
353 return (uint32x4_p)vec_perm(low, high, perm);
354 }
355 }
356
357 /// \brief Loads a vector from a byte array
358 /// \param src the byte array
359 /// \details VecLoad() loads a vector from a byte array.
360 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
361 /// The instruction does not require aligned effective memory addresses.
362 /// VecLoad_ALTIVEC() is used if POWER9 is not available.
363 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
364 /// are required to fix up unaligned memory addresses.
365 /// \par Wraps
366 /// vec_xl on POWER9 and above, Altivec load on POWER8 and below
367 /// \sa VecLoad_ALTIVEC, VecLoadAligned
368 /// \since Crypto++ 6.0
VecLoad(const byte src[16])369 inline uint32x4_p VecLoad(const byte src[16])
370 {
371 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
372 // word pointers. The ISA lacks loads for short* and char*.
373 // Power9/ISA 3.0 provides vec_xl for all datatypes.
374
375 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
376 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
377 CRYPTOPP_UNUSED(addr);
378
379 #if defined(_ARCH_PWR9)
380 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
381 #else
382 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
383 #endif
384 }
385
386 /// \brief Loads a vector from a byte array
387 /// \param src the byte array
388 /// \param off offset into the src byte array
389 /// \details VecLoad() loads a vector from a byte array.
390 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
391 /// The instruction does not require aligned effective memory addresses.
392 /// VecLoad_ALTIVEC() is used if POWER9 is not available.
393 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
394 /// are required to fix up unaligned memory addresses.
395 /// \par Wraps
396 /// vec_xl on POWER9 and above, Altivec load on POWER8 and below
397 /// \sa VecLoad_ALTIVEC, VecLoadAligned
398 /// \since Crypto++ 6.0
VecLoad(int off,const byte src[16])399 inline uint32x4_p VecLoad(int off, const byte src[16])
400 {
401 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
402 // word pointers. The ISA lacks loads for short* and char*.
403 // Power9/ISA 3.0 provides vec_xl for all datatypes.
404
405 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
406 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
407 CRYPTOPP_UNUSED(addr);
408
409 #if defined(_ARCH_PWR9)
410 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
411 #else
412 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
413 #endif
414 }
415
416 /// \brief Loads a vector from a word array
417 /// \param src the word array
418 /// \details VecLoad() loads a vector from a word array.
419 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
420 /// The instruction does not require aligned effective memory addresses.
421 /// VecLoad_ALTIVEC() is used if POWER7 is not available.
422 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
423 /// are required to fix up unaligned memory addresses.
424 /// \par Wraps
425 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
426 /// \sa VecLoad_ALTIVEC, VecLoadAligned
427 /// \since Crypto++ 8.0
VecLoad(const word32 src[4])428 inline uint32x4_p VecLoad(const word32 src[4])
429 {
430 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
431 // word pointers. The ISA lacks loads for short* and char*.
432 // Power9/ISA 3.0 provides vec_xl for all datatypes.
433
434 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
435 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
436 CRYPTOPP_UNUSED(addr);
437
438 #if defined(_ARCH_PWR9)
439 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
440 #elif defined(__VSX__) || defined(_ARCH_PWR8)
441 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
442 #else
443 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
444 #endif
445 }
446
447 /// \brief Loads a vector from a word array
448 /// \param src the word array
449 /// \param off offset into the word array
450 /// \details VecLoad() loads a vector from a word array.
451 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
452 /// The instruction does not require aligned effective memory addresses.
453 /// VecLoad_ALTIVEC() is used if POWER7 is not available.
454 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
455 /// are required to fix up unaligned memory addresses.
456 /// \par Wraps
457 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
458 /// \sa VecLoad_ALTIVEC, VecLoadAligned
459 /// \since Crypto++ 8.0
VecLoad(int off,const word32 src[4])460 inline uint32x4_p VecLoad(int off, const word32 src[4])
461 {
462 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
463 // word pointers. The ISA lacks loads for short* and char*.
464 // Power9/ISA 3.0 provides vec_xl for all datatypes.
465
466 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
467 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
468 CRYPTOPP_UNUSED(addr);
469
470 #if defined(_ARCH_PWR9)
471 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
472 #elif defined(__VSX__) || defined(_ARCH_PWR8)
473 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
474 #else
475 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
476 #endif
477 }
478
479 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
480
481 /// \brief Loads a vector from a double word array
482 /// \param src the double word array
483 /// \details VecLoad() loads a vector from a double word array.
484 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
485 /// The instruction does not require aligned effective memory addresses.
486 /// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
487 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
488 /// are required to fix up unaligned memory addresses.
489 /// \details VecLoad() with 64-bit elements is available on POWER7 and above.
490 /// \par Wraps
491 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
492 /// \sa VecLoad_ALTIVEC, VecLoadAligned
493 /// \since Crypto++ 8.0
VecLoad(const word64 src[2])494 inline uint64x2_p VecLoad(const word64 src[2])
495 {
496 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
497 // word pointers. The ISA lacks loads for short* and char*.
498 // Power9/ISA 3.0 provides vec_xl for all datatypes.
499
500 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
501 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
502 CRYPTOPP_UNUSED(addr);
503
504 #if defined(_ARCH_PWR9)
505 return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
506 #elif defined(__VSX__) || defined(_ARCH_PWR8)
507 // The 32-bit cast is not a typo. Compiler workaround.
508 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
509 #else
510 return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
511 #endif
512 }
513
514 /// \brief Loads a vector from a double word array
515 /// \param src the double word array
516 /// \param off offset into the double word array
517 /// \details VecLoad() loads a vector from a double word array.
518 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
519 /// The instruction does not require aligned effective memory addresses.
520 /// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
521 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
522 /// are required to fix up unaligned memory addresses.
523 /// \details VecLoad() with 64-bit elements is available on POWER8 and above.
524 /// \par Wraps
525 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
526 /// \sa VecLoad_ALTIVEC, VecLoadAligned
527 /// \since Crypto++ 8.0
VecLoad(int off,const word64 src[2])528 inline uint64x2_p VecLoad(int off, const word64 src[2])
529 {
530 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
531 // word pointers. The ISA lacks loads for short* and char*.
532 // Power9/ISA 3.0 provides vec_xl for all datatypes.
533
534 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
535 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
536 CRYPTOPP_UNUSED(addr);
537
538 #if defined(_ARCH_PWR9)
539 return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
540 #elif defined(__VSX__) || defined(_ARCH_PWR8)
541 // The 32-bit cast is not a typo. Compiler workaround.
542 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
543 #else
544 return (uint64x2_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
545 #endif
546 }
547
548 #endif // VSX or ARCH_PWR8
549
550 /// \brief Loads a vector from an aligned byte array
551 /// \param src the byte array
552 /// \details VecLoadAligned() loads a vector from an aligned byte array.
553 /// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
554 /// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
555 /// address of <tt>src</tt> must be 16-byte aligned for Altivec.
556 /// \par Wraps
557 /// vec_xl on POWER9, vec_ld on POWER8 and below
558 /// \sa VecLoad_ALTIVEC, VecLoad
559 /// \since Crypto++ 8.0
VecLoadAligned(const byte src[16])560 inline uint32x4_p VecLoadAligned(const byte src[16])
561 {
562 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
563 // word pointers. The ISA lacks loads for short* and char*.
564 // Power9/ISA 3.0 provides vec_xl for all datatypes.
565
566 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
567 CRYPTOPP_ASSERT(addr % 16 == 0);
568 CRYPTOPP_UNUSED(addr);
569
570 #if defined(_ARCH_PWR9)
571 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
572 #else
573 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
574 #endif
575 }
576
577 /// \brief Loads a vector from an aligned byte array
578 /// \param src the byte array
579 /// \param off offset into the src byte array
580 /// \details VecLoadAligned() loads a vector from an aligned byte array.
581 /// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
582 /// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
583 /// address of <tt>src</tt> must be 16-byte aligned for Altivec.
584 /// \par Wraps
585 /// vec_xl on POWER9, vec_ld on POWER8 and below
586 /// \sa VecLoad_ALTIVEC, VecLoad
587 /// \since Crypto++ 8.0
VecLoadAligned(int off,const byte src[16])588 inline uint32x4_p VecLoadAligned(int off, const byte src[16])
589 {
590 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
591 // word pointers. The ISA lacks loads for short* and char*.
592 // Power9/ISA 3.0 provides vec_xl for all datatypes.
593
594 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
595 CRYPTOPP_ASSERT(addr % 16 == 0);
596 CRYPTOPP_UNUSED(addr);
597
598 #if defined(_ARCH_PWR9)
599 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
600 #else
601 return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
602 #endif
603 }
604
605 /// \brief Loads a vector from an aligned word array
606 /// \param src the word array
607 /// \details VecLoadAligned() loads a vector from an aligned word array.
608 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
609 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
610 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
611 /// \par Wraps
612 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
613 /// \sa VecLoad_ALTIVEC, VecLoad
614 /// \since Crypto++ 8.0
VecLoadAligned(const word32 src[4])615 inline uint32x4_p VecLoadAligned(const word32 src[4])
616 {
617 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
618 // word pointers. The ISA lacks loads for short* and char*.
619 // Power9/ISA 3.0 provides vec_xl for all datatypes.
620
621 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
622 CRYPTOPP_ASSERT(addr % 16 == 0);
623 CRYPTOPP_UNUSED(addr);
624
625 #if defined(_ARCH_PWR9)
626 return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
627 #elif defined(__VSX__) || defined(_ARCH_PWR8)
628 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
629 #else
630 return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
631 #endif
632 }
633
634 /// \brief Loads a vector from an aligned word array
635 /// \param src the word array
636 /// \param off offset into the src word array
637 /// \details VecLoadAligned() loads a vector from an aligned word array.
638 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
639 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
640 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
641 /// \par Wraps
642 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
643 /// \sa VecLoad_ALTIVEC, VecLoad
644 /// \since Crypto++ 8.0
VecLoadAligned(int off,const word32 src[4])645 inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
646 {
647 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
648 // word pointers. The ISA lacks loads for short* and char*.
649 // Power9/ISA 3.0 provides vec_xl for all datatypes.
650
651 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
652 CRYPTOPP_ASSERT(addr % 16 == 0);
653 CRYPTOPP_UNUSED(addr);
654
655 #if defined(_ARCH_PWR9)
656 return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
657 #elif defined(__VSX__) || defined(_ARCH_PWR8)
658 return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
659 #else
660 return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
661 #endif
662 }
663
664 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
665
666 /// \brief Loads a vector from an aligned double word array
667 /// \param src the double word array
668 /// \details VecLoadAligned() loads a vector from an aligned double word array.
669 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
670 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
671 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
672 /// \par Wraps
673 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
674 /// \sa VecLoad_ALTIVEC, VecLoad
675 /// \since Crypto++ 8.0
VecLoadAligned(const word64 src[4])676 inline uint64x2_p VecLoadAligned(const word64 src[4])
677 {
678 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
679 // word pointers. The ISA lacks loads for short* and char*.
680 // Power9/ISA 3.0 provides vec_xl for all datatypes.
681
682 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
683 CRYPTOPP_ASSERT(addr % 16 == 0);
684 CRYPTOPP_UNUSED(addr);
685
686 #if defined(_ARCH_PWR9)
687 return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
688 #elif defined(__VSX__) || defined(_ARCH_PWR8)
689 // The 32-bit cast is not a typo. Compiler workaround.
690 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
691 #else
692 return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
693 #endif
694 }
695
696 /// \brief Loads a vector from an aligned double word array
697 /// \param src the double word array
698 /// \param off offset into the src double word array
699 /// \details VecLoadAligned() loads a vector from an aligned double word array.
700 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
701 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
702 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
703 /// \par Wraps
704 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
705 /// \sa VecLoad_ALTIVEC, VecLoad
706 /// \since Crypto++ 8.0
VecLoadAligned(int off,const word64 src[4])707 inline uint64x2_p VecLoadAligned(int off, const word64 src[4])
708 {
709 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
710 // word pointers. The ISA lacks loads for short* and char*.
711 // Power9/ISA 3.0 provides vec_xl for all datatypes.
712
713 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
714 CRYPTOPP_ASSERT(addr % 16 == 0);
715 CRYPTOPP_UNUSED(addr);
716
717 #if defined(_ARCH_PWR9)
718 return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
719 #elif defined(__VSX__) || defined(_ARCH_PWR8)
720 // The 32-bit cast is not a typo. Compiler workaround.
721 return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
722 #else
723 return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
724 #endif
725 }
726
727 #endif
728
729 /// \brief Loads a vector from a byte array
730 /// \param src the byte array
731 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
732 /// will reverse all bytes in the array on a little endian system.
733 /// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
734 /// The instruction does not require aligned effective memory addresses.
735 /// VecLoad_ALTIVEC() is used if POWER7 or VSX are not available.
736 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
737 /// are required to fix up unaligned memory addresses.
738 /// \par Wraps
739 /// vec_xl on POWER8, Altivec load on POWER7 and below
740 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
741 /// \since Crypto++ 6.0
VecLoadBE(const byte src[16])742 inline uint32x4_p VecLoadBE(const byte src[16])
743 {
744 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
745 // word pointers. The ISA lacks loads for short* and char*.
746 // Power9/ISA 3.0 provides vec_xl for all datatypes.
747
748 const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
749 // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
750 CRYPTOPP_UNUSED(addr);
751
752 #if defined(_ARCH_PWR9)
753 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
754 return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
755 #elif defined(CRYPTOPP_BIG_ENDIAN)
756 return (uint32x4_p)VecLoad_ALTIVEC(0, CONST_V8_CAST(src));
757 #else
758 return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(src)));
759 #endif
760 }
761
762 /// \brief Loads a vector from a byte array
763 /// \param src the byte array
764 /// \param off offset into the src byte array
765 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
766 /// will reverse all bytes in the array on a little endian system.
767 /// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
768 /// The instruction does not require aligned effective memory addresses.
769 /// VecLoad_ALTIVEC() is used if POWER7 is not available.
770 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
771 /// are required to fix up unaligned memory addresses.
772 /// \par Wraps
773 /// vec_xl on POWER8, Altivec load on POWER7 and below
774 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
775 /// \since Crypto++ 6.0
VecLoadBE(int off,const byte src[16])776 inline uint32x4_p VecLoadBE(int off, const byte src[16])
777 {
778 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
779 // word pointers. The ISA lacks loads for short* and char*.
780 // Power9/ISA 3.0 provides vec_xl for all datatypes.
781
782 const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
783 // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
784 CRYPTOPP_UNUSED(addr);
785
786 #if defined(_ARCH_PWR9)
787 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
788 return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
789 #elif defined(CRYPTOPP_BIG_ENDIAN)
790 return (uint32x4_p)VecLoad_ALTIVEC(CONST_V8_CAST(addr));
791 #else
792 return (uint32x4_p)VecReverseLE(VecLoad_ALTIVEC(CONST_V8_CAST(addr)));
793 #endif
794 }
795
796 //@}
797
798 /// \name STORE OPERATIONS
799 //@{
800
801 /// \brief Stores a vector to a byte array
802 /// \tparam T vector type
803 /// \param data the vector
804 /// \param dest the byte array
805 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
806 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
807 /// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
808 /// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
809 /// memory adresses.
810 /// \details VecStore_ALTIVEC() is used when POWER7 or above
811 /// and unaligned loads is not available.
812 /// \par Wraps
813 /// vec_st, vec_ste, vec_lvsr, vec_perm
814 /// \sa VecStore, VecStoreAligned
815 /// \since Crypto++ 8.0
816 template<class T>
VecStore_ALTIVEC(const T data,byte dest[16])817 inline void VecStore_ALTIVEC(const T data, byte dest[16])
818 {
819 // Avoid IsAlignedOn for convenience.
820 uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
821 if (addr % 16 == 0)
822 {
823 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
824 }
825 else
826 {
827 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
828 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
829 vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
830 vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
831 vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
832 vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
833 vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
834 vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
835 vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
836 vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
837 }
838 }
839
840 /// \brief Stores a vector to a byte array
841 /// \tparam T vector type
842 /// \param data the vector
843 /// \param off offset into the dest byte array
844 /// \param dest the byte array
845 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
846 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
847 /// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
848 /// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
849 /// memory adresses.
850 /// \details VecStore_ALTIVEC() is used when POWER7 or above
851 /// and unaligned loads is not available.
852 /// \par Wraps
853 /// vec_st, vec_ste, vec_lvsr, vec_perm
854 /// \sa VecStore, VecStoreAligned
855 /// \since Crypto++ 8.0
856 template<class T>
VecStore_ALTIVEC(const T data,int off,byte dest[16])857 inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
858 {
859 // Avoid IsAlignedOn for convenience.
860 uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
861 if (addr % 16 == 0)
862 {
863 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
864 }
865 else
866 {
867 // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
868 uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
869 vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
870 vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
871 vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
872 vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
873 vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
874 vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
875 vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
876 vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
877 }
878 }
879
880 /// \brief Stores a vector to a byte array
881 /// \tparam T vector type
882 /// \param data the vector
883 /// \param dest the byte array
884 /// \details VecStore() stores a vector to a byte array.
885 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
886 /// The instruction does not require aligned effective memory addresses.
887 /// VecStore_ALTIVEC() is used if POWER9 is not available.
888 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
889 /// are required to fix up unaligned memory addresses.
890 /// \par Wraps
891 /// vec_xst on POWER9 and above, Altivec store on POWER8 and below
892 /// \sa VecStore_ALTIVEC, VecStoreAligned
893 /// \since Crypto++ 6.0
894 template<class T>
VecStore(const T data,byte dest[16])895 inline void VecStore(const T data, byte dest[16])
896 {
897 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
898 // word pointers. The ISA lacks loads for short* and char*.
899 // Power9/ISA 3.0 provides vec_xl for all datatypes.
900
901 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
902 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
903 CRYPTOPP_UNUSED(addr);
904
905 #if defined(_ARCH_PWR9)
906 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
907 #else
908 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(dest));
909 #endif
910 }
911
912 /// \brief Stores a vector to a byte array
913 /// \tparam T vector type
914 /// \param data the vector
915 /// \param off offset into the dest byte array
916 /// \param dest the byte array
917 /// \details VecStore() stores a vector to a byte array.
918 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
919 /// The instruction does not require aligned effective memory addresses.
920 /// VecStore_ALTIVEC() is used if POWER9 is not available.
921 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
922 /// are required to fix up unaligned memory addresses.
923 /// \par Wraps
924 /// vec_xst on POWER9 and above, Altivec store on POWER8 and below
925 /// \sa VecStore_ALTIVEC, VecStoreAligned
926 /// \since Crypto++ 6.0
927 template<class T>
VecStore(const T data,int off,byte dest[16])928 inline void VecStore(const T data, int off, byte dest[16])
929 {
930 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
931 // word pointers. The ISA lacks loads for short* and char*.
932 // Power9/ISA 3.0 provides vec_xl for all datatypes.
933
934 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
935 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
936 CRYPTOPP_UNUSED(addr);
937
938 #if defined(_ARCH_PWR9)
939 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
940 #else
941 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
942 #endif
943 }
944
945 /// \brief Stores a vector to a word array
946 /// \tparam T vector type
947 /// \param data the vector
948 /// \param dest the word array
949 /// \details VecStore() stores a vector to a word array.
950 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
951 /// The instruction does not require aligned effective memory addresses.
952 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
953 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
954 /// are required to fix up unaligned memory addresses.
955 /// \par Wraps
956 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
957 /// \sa VecStore_ALTIVEC, VecStoreAligned
958 /// \since Crypto++ 8.0
959 template<class T>
VecStore(const T data,word32 dest[4])960 inline void VecStore(const T data, word32 dest[4])
961 {
962 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
963 // word pointers. The ISA lacks stores for short* and char*.
964 // Power9/ISA 3.0 provides vec_xst for all datatypes.
965
966 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
967 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
968 CRYPTOPP_UNUSED(addr);
969
970 #if defined(_ARCH_PWR9)
971 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
972 #elif defined(__VSX__) || defined(_ARCH_PWR8)
973 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
974 #else
975 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
976 #endif
977 }
978
979 /// \brief Stores a vector to a word array
980 /// \tparam T vector type
981 /// \param data the vector
982 /// \param off offset into the dest word array
983 /// \param dest the word array
984 /// \details VecStore() stores a vector to a word array.
985 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
986 /// The instruction does not require aligned effective memory addresses.
987 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
988 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
989 /// are required to fix up unaligned memory addresses.
990 /// \par Wraps
991 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
992 /// \sa VecStore_ALTIVEC, VecStoreAligned
993 /// \since Crypto++ 8.0
994 template<class T>
VecStore(const T data,int off,word32 dest[4])995 inline void VecStore(const T data, int off, word32 dest[4])
996 {
997 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
998 // word pointers. The ISA lacks stores for short* and char*.
999 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1000
1001 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1002 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1003 CRYPTOPP_UNUSED(addr);
1004
1005 #if defined(_ARCH_PWR9)
1006 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1007 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1008 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1009 #else
1010 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1011 #endif
1012 }
1013
1014 /// \brief Stores a vector to a word array
1015 /// \tparam T vector type
1016 /// \param data the vector
1017 /// \param dest the word array
1018 /// \details VecStore() stores a vector to a word array.
1019 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1020 /// The instruction does not require aligned effective memory addresses.
1021 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1022 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1023 /// are required to fix up unaligned memory addresses.
1024 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
1025 /// \par Wraps
1026 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1027 /// \sa VecStore_ALTIVEC, VecStoreAligned
1028 /// \since Crypto++ 8.0
1029 template<class T>
VecStore(const T data,word64 dest[2])1030 inline void VecStore(const T data, word64 dest[2])
1031 {
1032 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1033 // word pointers. The ISA lacks stores for short* and char*.
1034 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1035
1036 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1037 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1038 CRYPTOPP_UNUSED(addr);
1039
1040 #if defined(_ARCH_PWR9)
1041 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1042 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1043 // 32-bit cast is not a typo. Compiler workaround.
1044 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1045 #else
1046 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1047 #endif
1048 }
1049
1050 /// \brief Stores a vector to a word array
1051 /// \tparam T vector type
1052 /// \param data the vector
1053 /// \param off offset into the dest word array
1054 /// \param dest the word array
1055 /// \details VecStore() stores a vector to a word array.
1056 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1057 /// The instruction does not require aligned effective memory addresses.
1058 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1059 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1060 /// are required to fix up unaligned memory addresses.
1061 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
1062 /// \par Wraps
1063 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1064 /// \sa VecStore_ALTIVEC, VecStoreAligned
1065 /// \since Crypto++ 8.0
1066 template<class T>
VecStore(const T data,int off,word64 dest[2])1067 inline void VecStore(const T data, int off, word64 dest[2])
1068 {
1069 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1070 // word pointers. The ISA lacks stores for short* and char*.
1071 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1072
1073 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1074 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1075 CRYPTOPP_UNUSED(addr);
1076
1077 #if defined(_ARCH_PWR9)
1078 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1079 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1080 // 32-bit cast is not a typo. Compiler workaround.
1081 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1082 #else
1083 VecStore_ALTIVEC((uint8x16_p)data, NCONST_V8_CAST(addr));
1084 #endif
1085 }
1086
1087 /// \brief Stores a vector to a byte array
1088 /// \tparam T vector type
1089 /// \param data the vector
1090 /// \param dest the byte array
1091 /// \details VecStoreAligned() stores a vector from an aligned byte array.
1092 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1093 /// <tt>vec_st</tt> is used if POWER9 is not available. The effective
1094 /// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1095 /// \par Wraps
1096 /// vec_xst on POWER9 or above, vec_st on POWER8 and below
1097 /// \sa VecStore_ALTIVEC, VecStore
1098 /// \since Crypto++ 8.0
1099 template<class T>
VecStoreAligned(const T data,byte dest[16])1100 inline void VecStoreAligned(const T data, byte dest[16])
1101 {
1102 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1103 // word pointers. The ISA lacks loads for short* and char*.
1104 // Power9/ISA 3.0 provides vec_xl for all datatypes.
1105
1106 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1107 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1108 CRYPTOPP_UNUSED(addr);
1109
1110 #if defined(_ARCH_PWR9)
1111 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1112 #else
1113 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1114 #endif
1115 }
1116
1117 /// \brief Stores a vector to a byte array
1118 /// \tparam T vector type
1119 /// \param data the vector
1120 /// \param off offset into the dest byte array
1121 /// \param dest the byte array
1122 /// \details VecStoreAligned() stores a vector from an aligned byte array.
1123 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1124 /// <tt>vec_st</tt> is used if POWER9 is not available. The effective
1125 /// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1126 /// \par Wraps
1127 /// vec_xst on POWER9 or above, vec_st on POWER8 and below
1128 /// \sa VecStore_ALTIVEC, VecStore
1129 /// \since Crypto++ 8.0
1130 template<class T>
VecStoreAligned(const T data,int off,byte dest[16])1131 inline void VecStoreAligned(const T data, int off, byte dest[16])
1132 {
1133 // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
1134 // word pointers. The ISA lacks loads for short* and char*.
1135 // Power9/ISA 3.0 provides vec_xl for all datatypes.
1136
1137 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1138 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1139 CRYPTOPP_UNUSED(addr);
1140
1141 #if defined(_ARCH_PWR9)
1142 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1143 #else
1144 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1145 #endif
1146 }
1147
1148 /// \brief Stores a vector to a word array
1149 /// \tparam T vector type
1150 /// \param data the vector
1151 /// \param dest the word array
1152 /// \details VecStoreAligned() stores a vector from an aligned word array.
1153 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1154 /// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1155 /// is used if POWER7 is not available. The effective address of <tt>dest</tt>
1156 /// must be 16-byte aligned for Altivec.
1157 /// \par Wraps
1158 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1159 /// \sa VecStore_ALTIVEC, VecStore
1160 /// \since Crypto++ 8.0
1161 template<class T>
VecStoreAligned(const T data,word32 dest[4])1162 inline void VecStoreAligned(const T data, word32 dest[4])
1163 {
1164 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1165 // word pointers. The ISA lacks stores for short* and char*.
1166 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1167
1168 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1169 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1170 CRYPTOPP_UNUSED(addr);
1171
1172 #if defined(_ARCH_PWR9)
1173 vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1174 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1175 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1176 #else
1177 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1178 #endif
1179 }
1180
1181 /// \brief Stores a vector to a word array
1182 /// \tparam T vector type
1183 /// \param data the vector
1184 /// \param off offset into the dest word array
1185 /// \param dest the word array
1186 /// \details VecStoreAligned() stores a vector from an aligned word array.
1187 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1188 /// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1189 /// is used if POWER7 is not available. The effective address of <tt>dest</tt>
1190 /// must be 16-byte aligned for Altivec.
1191 /// \par Wraps
1192 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1193 /// \sa VecStore_ALTIVEC, VecStore
1194 /// \since Crypto++ 8.0
1195 template<class T>
VecStoreAligned(const T data,int off,word32 dest[4])1196 inline void VecStoreAligned(const T data, int off, word32 dest[4])
1197 {
1198 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1199 // word pointers. The ISA lacks stores for short* and char*.
1200 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1201
1202 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1203 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1204 CRYPTOPP_UNUSED(addr);
1205
1206 #if defined(_ARCH_PWR9)
1207 vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1208 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1209 vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1210 #else
1211 vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
1212 #endif
1213 }
1214
1215 /// \brief Stores a vector to a byte array
1216 /// \tparam T vector type
1217 /// \param data the vector
1218 /// \param dest the byte array
1219 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1220 /// will reverse all bytes in the array on a little endian system.
1221 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1222 /// The instruction does not require aligned effective memory addresses.
1223 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1224 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1225 /// are required to fix up unaligned memory addresses.
1226 /// \par Wraps
1227 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1228 /// \sa VecStore_ALTIVEC, VecStoreAligned
1229 /// \since Crypto++ 6.0
1230 template <class T>
VecStoreBE(const T data,byte dest[16])1231 inline void VecStoreBE(const T data, byte dest[16])
1232 {
1233 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1234 // word pointers. The ISA lacks stores for short* and char*.
1235 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1236
1237 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1238 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1239 CRYPTOPP_UNUSED(addr);
1240
1241 #if defined(_ARCH_PWR9)
1242 vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1243 #elif defined(CRYPTOPP_BIG_ENDIAN)
1244 VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1245 #else
1246 VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1247 #endif
1248 }
1249
1250 /// \brief Stores a vector to a byte array
1251 /// \tparam T vector type
1252 /// \param data the vector
1253 /// \param off offset into the dest byte array
1254 /// \param dest the byte array
1255 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1256 /// will reverse all bytes in the array on a little endian system.
1257 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1258 /// The instruction does not require aligned effective memory addresses.
1259 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1260 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1261 /// are required to fix up unaligned memory addresses.
1262 /// \par Wraps
1263 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1264 /// \sa VecStore_ALTIVEC, VecStoreAligned
1265 /// \since Crypto++ 6.0
1266 template <class T>
VecStoreBE(const T data,int off,byte dest[16])1267 inline void VecStoreBE(const T data, int off, byte dest[16])
1268 {
1269 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1270 // word pointers. The ISA lacks stores for short* and char*.
1271 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1272
1273 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1274 CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1275 CRYPTOPP_UNUSED(addr);
1276
1277 #if defined(_ARCH_PWR9)
1278 vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1279 #elif defined(CRYPTOPP_BIG_ENDIAN)
1280 VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1281 #else
1282 VecStore((uint8x16_p)VecReverseLE(data), NCONST_V8_CAST(addr));
1283 #endif
1284 }
1285
1286 /// \brief Stores a vector to a word array
1287 /// \tparam T vector type
1288 /// \param data the vector
1289 /// \param dest the word array
1290 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1291 /// will reverse all bytes in the array on a little endian system.
1292 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1293 /// The instruction does not require aligned effective memory addresses.
1294 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1295 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1296 /// are required to fix up unaligned memory addresses.
1297 /// \par Wraps
1298 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1299 /// \sa VecStore_ALTIVEC, VecStoreAligned
1300 /// \since Crypto++ 8.0
1301 template <class T>
VecStoreBE(const T data,word32 dest[4])1302 inline void VecStoreBE(const T data, word32 dest[4])
1303 {
1304 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1305 // word pointers. The ISA lacks stores for short* and char*.
1306 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1307
1308 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1309 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1310 CRYPTOPP_UNUSED(addr);
1311
1312 #if defined(_ARCH_PWR9)
1313 vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1314 #elif defined(CRYPTOPP_BIG_ENDIAN)
1315 VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1316 #else
1317 VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1318 #endif
1319 }
1320
1321 /// \brief Stores a vector to a word array
1322 /// \tparam T vector type
1323 /// \param data the vector
1324 /// \param off offset into the dest word array
1325 /// \param dest the word array
1326 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1327 /// will reverse all words in the array on a little endian system.
1328 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1329 /// The instruction does not require aligned effective memory addresses.
1330 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1331 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1332 /// are required to fix up unaligned memory addresses.
1333 /// \par Wraps
1334 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1335 /// \sa VecStore_ALTIVEC, VecStoreAligned
1336 /// \since Crypto++ 8.0
1337 template <class T>
VecStoreBE(const T data,int off,word32 dest[4])1338 inline void VecStoreBE(const T data, int off, word32 dest[4])
1339 {
1340 // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1341 // word pointers. The ISA lacks stores for short* and char*.
1342 // Power9/ISA 3.0 provides vec_xst for all datatypes.
1343
1344 const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1345 CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1346 CRYPTOPP_UNUSED(addr);
1347
1348 #if defined(_ARCH_PWR9)
1349 vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1350 #elif defined(CRYPTOPP_BIG_ENDIAN)
1351 VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1352 #else
1353 VecStore((uint32x4_p)VecReverseLE(data), NCONST_V32_CAST(addr));
1354 #endif
1355 }
1356
1357 //@}
1358
1359 /// \name LOGICAL OPERATIONS
1360 //@{
1361
/// \brief AND two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd() computes <tt>vec1 & vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_and
/// \sa VecAnd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    // Bring the right-hand operand to T1 so both inputs agree.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
1380
/// \brief OR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr() computes <tt>vec1 | vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_or
/// \sa VecOr64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
    // Bring the right-hand operand to T1 so both inputs agree.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
1399
/// \brief XOR two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor() computes <tt>vec1 ^ vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_xor
/// \sa VecXor64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    // Bring the right-hand operand to T1 so both inputs agree.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
1418
1419 //@}
1420
1421 /// \name ARITHMETIC OPERATIONS
1422 //@{
1423
/// \brief Add two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAdd() computes <tt>vec1 + vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_add
/// \sa VecAdd64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
    // Bring the right-hand operand to T1 so both inputs agree.
    const T1 rhs = (T1)vec2;
    return (T1)vec_add(vec1, rhs);
}
1442
/// \brief Subtract two vectors
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecSub() computes <tt>vec1 - vec2</tt>. vec2 is first
///  cast to the type of vec1, and the result has the type of vec1.
/// \par Wraps
///  vec_sub
/// \sa VecSub64
/// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
    // Bring the right-hand operand to T1 so both inputs agree.
    const T1 rhs = (T1)vec2;
    return (T1)vec_sub(vec1, rhs);
}
1460
1461 //@}
1462
1463 /// \name PERMUTE OPERATIONS
1464 //@{
1465
1466 /// \brief Permutes a vector
1467 /// \tparam T1 vector type
1468 /// \tparam T2 vector type
1469 /// \param vec the vector
1470 /// \param mask vector mask
1471 /// \return vector
1472 /// \details VecPermute() creates a new vector from vec according to mask.
1473 /// mask is an uint8x16_p vector. The return vector is the same type as vec.
1474 /// \par Wraps
1475 /// vec_perm
1476 /// \since Crypto++ 6.0
1477 template <class T1, class T2>
VecPermute(const T1 vec,const T2 mask)1478 inline T1 VecPermute(const T1 vec, const T2 mask)
1479 {
1480 return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1481 }
1482
1483 /// \brief Permutes two vectors
1484 /// \tparam T1 vector type
1485 /// \tparam T2 vector type
1486 /// \param vec1 the first vector
1487 /// \param vec2 the second vector
1488 /// \param mask vector mask
1489 /// \return vector
1490 /// \details VecPermute() creates a new vector from vec1 and vec2 according to mask.
1491 /// mask is an uint8x16_p vector. The return vector is the same type as vec.
1492 /// \par Wraps
1493 /// vec_perm
1494 /// \since Crypto++ 6.0
1495 template <class T1, class T2>
VecPermute(const T1 vec1,const T1 vec2,const T2 mask)1496 inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1497 {
1498 return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1499 }
1500
1501 //@}
1502
1503 /// \name SHIFT AND ROTATE OPERATIONS
1504 //@{
1505
1506 /// \brief Shift a vector left
1507 /// \tparam C shift byte count
1508 /// \tparam T vector type
1509 /// \param vec the vector
1510 /// \return vector
1511 /// \details VecShiftLeftOctet() returns a new vector after shifting the
1512 /// concatenation of the zero vector and the source vector by the specified
1513 /// number of bytes. The return vector is the same type as vec.
1514 /// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
1515 /// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
1516 /// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1517 /// if on a big endian machine as shown below.
1518 /// <pre>
1519 /// uint8x16_p x = VecLoad(ptr);
1520 /// uint8x16_p y = VecShiftLeftOctet<12>(x);
1521 /// </pre>
1522 /// \par Wraps
1523 /// vec_sld
1524 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1525 /// endian sensitive?</A> on Stack Overflow
1526 /// \since Crypto++ 6.0
1527 template <unsigned int C, class T>
VecShiftLeftOctet(const T vec)1528 inline T VecShiftLeftOctet(const T vec)
1529 {
1530 const T zero = {0};
1531 if (C >= 16)
1532 {
1533 // Out of range
1534 return zero;
1535 }
1536 else if (C == 0)
1537 {
1538 // Noop
1539 return vec;
1540 }
1541 else
1542 {
1543 #if defined(CRYPTOPP_BIG_ENDIAN)
1544 enum { R=C&0xf };
1545 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1546 #else
1547 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1548 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1549 #endif
1550 }
1551 }
1552
1553 /// \brief Shift a vector right
1554 /// \tparam C shift byte count
1555 /// \tparam T vector type
1556 /// \param vec the vector
1557 /// \return vector
1558 /// \details VecShiftRightOctet() returns a new vector after shifting the
1559 /// concatenation of the zero vector and the source vector by the specified
1560 /// number of bytes. The return vector is the same type as vec.
1561 /// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,
1562 /// c)</tt>. On little endian machines VecShiftRightOctet() is translated to
1563 /// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1564 /// if on a big endian machine as shown below.
1565 /// <pre>
1566 /// uint8x16_p x = VecLoad(ptr);
1567 /// uint8x16_p y = VecShiftRightOctet<12>(y);
1568 /// </pre>
1569 /// \par Wraps
1570 /// vec_sld
1571 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1572 /// endian sensitive?</A> on Stack Overflow
1573 /// \since Crypto++ 6.0
1574 template <unsigned int C, class T>
VecShiftRightOctet(const T vec)1575 inline T VecShiftRightOctet(const T vec)
1576 {
1577 const T zero = {0};
1578 if (C >= 16)
1579 {
1580 // Out of range
1581 return zero;
1582 }
1583 else if (C == 0)
1584 {
1585 // Noop
1586 return vec;
1587 }
1588 else
1589 {
1590 #if defined(CRYPTOPP_BIG_ENDIAN)
1591 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1592 return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1593 #else
1594 enum { R=C&0xf };
1595 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1596 #endif
1597 }
1598 }
1599
1600 /// \brief Rotate a vector left
1601 /// \tparam C shift byte count
1602 /// \tparam T vector type
1603 /// \param vec the vector
1604 /// \return vector
1605 /// \details VecRotateLeftOctet() returns a new vector after rotating the
1606 /// concatenation of the source vector with itself by the specified
1607 /// number of bytes. The return vector is the same type as vec.
1608 /// \par Wraps
1609 /// vec_sld
1610 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1611 /// endian sensitive?</A> on Stack Overflow
1612 /// \since Crypto++ 6.0
1613 template <unsigned int C, class T>
VecRotateLeftOctet(const T vec)1614 inline T VecRotateLeftOctet(const T vec)
1615 {
1616 #if defined(CRYPTOPP_BIG_ENDIAN)
1617 enum { R = C&0xf };
1618 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1619 #else
1620 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1621 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1622 #endif
1623 }
1624
1625 /// \brief Rotate a vector right
1626 /// \tparam C shift byte count
1627 /// \tparam T vector type
1628 /// \param vec the vector
1629 /// \return vector
1630 /// \details VecRotateRightOctet() returns a new vector after rotating the
1631 /// concatenation of the source vector with itself by the specified
1632 /// number of bytes. The return vector is the same type as vec.
1633 /// \par Wraps
1634 /// vec_sld
1635 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1636 /// endian sensitive?</A> on Stack Overflow
1637 /// \since Crypto++ 6.0
1638 template <unsigned int C, class T>
VecRotateRightOctet(const T vec)1639 inline T VecRotateRightOctet(const T vec)
1640 {
1641 #if defined(CRYPTOPP_BIG_ENDIAN)
1642 enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1643 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1644 #else
1645 enum { R = C&0xf };
1646 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1647 #endif
1648 }
1649
1650 /// \brief Rotate a vector left
1651 /// \tparam C rotate bit count
1652 /// \param vec the vector
1653 /// \return vector
1654 /// \details VecRotateLeft() rotates each element in a vector by
1655 /// bit count. The return vector is the same type as vec.
1656 /// \par Wraps
1657 /// vec_rl
1658 /// \since Crypto++ 7.0
1659 template<unsigned int C>
VecRotateLeft(const uint32x4_p vec)1660 inline uint32x4_p VecRotateLeft(const uint32x4_p vec)
1661 {
1662 const uint32x4_p m = {C, C, C, C};
1663 return vec_rl(vec, m);
1664 }
1665
1666 /// \brief Rotate a vector right
1667 /// \tparam C rotate bit count
1668 /// \param vec the vector
1669 /// \return vector
1670 /// \details VecRotateRight() rotates each element in a vector
1671 /// by bit count. The return vector is the same type as vec.
1672 /// \par Wraps
1673 /// vec_rl
1674 /// \since Crypto++ 7.0
1675 template<unsigned int C>
VecRotateRight(const uint32x4_p vec)1676 inline uint32x4_p VecRotateRight(const uint32x4_p vec)
1677 {
1678 const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1679 return vec_rl(vec, m);
1680 }
1681
1682 /// \brief Shift a vector left
1683 /// \tparam C shift bit count
1684 /// \param vec the vector
1685 /// \return vector
1686 /// \details VecShiftLeft() rotates each element in a vector
1687 /// by bit count. The return vector is the same type as vec.
1688 /// \par Wraps
1689 /// vec_sl
1690 /// \since Crypto++ 8.1
1691 template<unsigned int C>
VecShiftLeft(const uint32x4_p vec)1692 inline uint32x4_p VecShiftLeft(const uint32x4_p vec)
1693 {
1694 const uint32x4_p m = {C, C, C, C};
1695 return vec_sl(vec, m);
1696 }
1697
1698 /// \brief Shift a vector right
1699 /// \tparam C shift bit count
1700 /// \param vec the vector
1701 /// \return vector
1702 /// \details VecShiftRight() rotates each element in a vector
1703 /// by bit count. The return vector is the same type as vec.
1704 /// \par Wraps
1705 /// vec_rl
1706 /// \since Crypto++ 8.1
1707 template<unsigned int C>
VecShiftRight(const uint32x4_p vec)1708 inline uint32x4_p VecShiftRight(const uint32x4_p vec)
1709 {
1710 const uint32x4_p m = {C, C, C, C};
1711 return vec_sr(vec, m);
1712 }
1713
1714 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
1715 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1716
1717 /// \brief Rotate a vector left
1718 /// \tparam C rotate bit count
1719 /// \param vec the vector
1720 /// \return vector
1721 /// \details VecRotateLeft() rotates each element in a vector
1722 /// by bit count. The return vector is the same type as vec.
1723 /// \details VecRotateLeft() with 64-bit elements is available on
1724 /// POWER8 and above.
1725 /// \par Wraps
1726 /// vec_rl
1727 /// \since Crypto++ 8.0
1728 template<unsigned int C>
VecRotateLeft(const uint64x2_p vec)1729 inline uint64x2_p VecRotateLeft(const uint64x2_p vec)
1730 {
1731 const uint64x2_p m = {C, C};
1732 return vec_rl(vec, m);
1733 }
1734
1735 /// \brief Shift a vector left
1736 /// \tparam C shift bit count
1737 /// \param vec the vector
1738 /// \return vector
1739 /// \details VecShiftLeft() rotates each element in a vector
1740 /// by bit count. The return vector is the same type as vec.
1741 /// \details VecShiftLeft() with 64-bit elements is available on
1742 /// POWER8 and above.
1743 /// \par Wraps
1744 /// vec_sl
1745 /// \since Crypto++ 8.1
1746 template<unsigned int C>
VecShiftLeft(const uint64x2_p vec)1747 inline uint64x2_p VecShiftLeft(const uint64x2_p vec)
1748 {
1749 const uint64x2_p m = {C, C};
1750 return vec_sl(vec, m);
1751 }
1752
1753 /// \brief Rotate a vector right
1754 /// \tparam C rotate bit count
1755 /// \param vec the vector
1756 /// \return vector
1757 /// \details VecRotateRight() rotates each element in a vector
1758 /// by bit count. The return vector is the same type as vec.
1759 /// \details VecRotateRight() with 64-bit elements is available on
1760 /// POWER8 and above.
1761 /// \par Wraps
1762 /// vec_rl
1763 /// \since Crypto++ 8.0
1764 template<unsigned int C>
VecRotateRight(const uint64x2_p vec)1765 inline uint64x2_p VecRotateRight(const uint64x2_p vec)
1766 {
1767 const uint64x2_p m = {64-C, 64-C};
1768 return vec_rl(vec, m);
1769 }
1770
1771 /// \brief Shift a vector right
1772 /// \tparam C shift bit count
1773 /// \param vec the vector
1774 /// \return vector
1775 /// \details VecShiftRight() rotates each element in a vector
1776 /// by bit count. The return vector is the same type as vec.
1777 /// \details VecShiftRight() with 64-bit elements is available on
1778 /// POWER8 and above.
1779 /// \par Wraps
1780 /// vec_sr
1781 /// \since Crypto++ 8.1
1782 template<unsigned int C>
VecShiftRight(const uint64x2_p vec)1783 inline uint64x2_p VecShiftRight(const uint64x2_p vec)
1784 {
1785 const uint64x2_p m = {C, C};
1786 return vec_sr(vec, m);
1787 }
1788
1789 #endif // ARCH_PWR8
1790
1791 //@}
1792
1793 /// \name OTHER OPERATIONS
1794 //@{
1795
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeLow() forwards both vectors to <tt>vec_mergel</tt>
///  and returns its result.
/// \par Wraps
///  vec_mergel
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
    const T merged = vec_mergel(vec1, vec2);
    return merged;
}
1809
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecMergeHigh() forwards both vectors to <tt>vec_mergeh</tt>
///  and returns its result.
/// \par Wraps
///  vec_mergeh
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
    const T merged = vec_mergeh(vec1, vec2);
    return merged;
}
1823
1824 /// \brief Broadcast 32-bit word to a vector
1825 /// \param val the 32-bit value
1826 /// \return vector
1827 /// \par Wraps
1828 /// vec_splats
1829 /// \since Crypto++ 8.3
VecSplatWord(word32 val)1830 inline uint32x4_p VecSplatWord(word32 val)
1831 {
1832 // Fix spurious GCC warning???
1833 CRYPTOPP_UNUSED(val);
1834
1835 // Apple Altivec and XL C++ do not offer vec_splats.
1836 // GCC offers vec_splats back to -mcpu=power4.
1837 #if defined(_ARCH_PWR4) && defined(__GNUC__)
1838 return vec_splats(val);
1839 #else
1840 //const word32 x[4] = {val,val,val,val};
1841 //return VecLoad(x);
1842 const word32 x[4] = {val};
1843 return vec_splat(VecLoad(x),0);
1844 #endif
1845 }
1846
1847 /// \brief Broadcast 32-bit element to a vector
1848 /// \tparam the element number
1849 /// \param val the 32-bit value
1850 /// \return vector
1851 /// \par Wraps
1852 /// vec_splat
1853 /// \since Crypto++ 8.3
1854 template <unsigned int N>
VecSplatElement(const uint32x4_p val)1855 inline uint32x4_p VecSplatElement(const uint32x4_p val)
1856 {
1857 return vec_splat(val, N);
1858 }
1859
1860 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1861 /// \brief Broadcast 64-bit double word to a vector
1862 /// \param val the 64-bit value
1863 /// \return vector
1864 /// \par Wraps
1865 /// vec_splats
1866 /// \since Crypto++ 8.3
VecSplatWord(word64 val)1867 inline uint64x2_p VecSplatWord(word64 val)
1868 {
1869 // The PPC64 ABI says so.
1870 return vec_splats((unsigned long long)val);
1871 }
1872
1873 /// \brief Broadcast 64-bit element to a vector
1874 /// \tparam the element number
1875 /// \param val the 64-bit value
1876 /// \return vector
1877 /// \par Wraps
1878 /// vec_splat
1879 /// \since Crypto++ 8.3
1880 template <unsigned int N>
VecSplatElement(const uint64x2_p val)1881 inline uint64x2_p VecSplatElement(const uint64x2_p val)
1882 {
1883 #if defined(__VSX__) || defined(_ARCH_PWR8)
1884 return vec_splat(val, N);
1885 #else
1886 enum {E=N&1};
1887 if (E == 0)
1888 {
1889 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
1890 return vec_perm(val, val, m);
1891 }
1892 else // (E == 1)
1893 {
1894 const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
1895 return vec_perm(val, val, m);
1896 }
1897 #endif
1898 }
1899 #endif
1900
1901 /// \brief Extract a dword from a vector
1902 /// \tparam T vector type
1903 /// \param val the vector
1904 /// \return vector created from low dword
1905 /// \details VecGetLow() extracts the low dword from a vector. The low dword
1906 /// is composed of the least significant bits and occupies bytes 8 through 15
1907 /// when viewed as a big endian array. The return vector is the same type as
1908 /// the original vector and padded with 0's in the most significant bit positions.
1909 /// \par Wraps
1910 /// vec_sld
1911 /// \since Crypto++ 7.0
1912 template <class T>
VecGetLow(const T val)1913 inline T VecGetLow(const T val)
1914 {
1915 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1916 const T zero = {0};
1917 return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
1918 #else
1919 return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1920 #endif
1921 }
1922
1923 /// \brief Extract a dword from a vector
1924 /// \tparam T vector type
1925 /// \param val the vector
1926 /// \return vector created from high dword
1927 /// \details VecGetHigh() extracts the high dword from a vector. The high dword
1928 /// is composed of the most significant bits and occupies bytes 0 through 7
1929 /// when viewed as a big endian array. The return vector is the same type as
1930 /// the original vector and padded with 0's in the most significant bit positions.
1931 /// \par Wraps
1932 /// vec_sld
1933 /// \since Crypto++ 7.0
1934 template <class T>
VecGetHigh(const T val)1935 inline T VecGetHigh(const T val)
1936 {
1937 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1938 const T zero = {0};
1939 return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
1940 #else
1941 return VecShiftRightOctet<8>(val);
1942 #endif
1943 }
1944
1945 /// \brief Exchange high and low double words
1946 /// \tparam T vector type
1947 /// \param vec the vector
1948 /// \return vector
1949 /// \par Wraps
1950 /// vec_sld
1951 /// \since Crypto++ 7.0
1952 template <class T>
VecSwapWords(const T vec)1953 inline T VecSwapWords(const T vec)
1954 {
1955 return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1956 }
1957
1958 //@}
1959
1960 /// \name COMPARISON
1961 //@{
1962
1963 /// \brief Compare two vectors
1964 /// \tparam T1 vector type
1965 /// \tparam T2 vector type
1966 /// \param vec1 the first vector
1967 /// \param vec2 the second vector
1968 /// \return true if vec1 equals vec2, false otherwise
1969 /// \details VecEqual() performs a bitwise compare. The vector element types do
1970 /// not matter.
1971 /// \par Wraps
1972 /// vec_all_eq
1973 /// \since Crypto++ 8.0
1974 template <class T1, class T2>
VecEqual(const T1 vec1,const T2 vec2)1975 inline bool VecEqual(const T1 vec1, const T2 vec2)
1976 {
1977 return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1978 }
1979
1980 /// \brief Compare two vectors
1981 /// \tparam T1 vector type
1982 /// \tparam T2 vector type
1983 /// \param vec1 the first vector
1984 /// \param vec2 the second vector
1985 /// \return true if vec1 does not equal vec2, false otherwise
1986 /// \details VecNotEqual() performs a bitwise compare. The vector element types do
1987 /// not matter.
1988 /// \par Wraps
1989 /// vec_all_eq
1990 /// \since Crypto++ 8.0
1991 template <class T1, class T2>
VecNotEqual(const T1 vec1,const T2 vec2)1992 inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1993 {
1994 return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1995 }
1996
1997 //@}
1998
1999 ////////////////// 32-bit Altivec /////////////////
2000
2001 /// \name 32-BIT ALTIVEC
2002 //@{
2003
2004 /// \brief Add two vectors as if uint64x2_p
2005 /// \param vec1 the first vector
2006 /// \param vec2 the second vector
2007 /// \return vector
2008 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2009 /// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2010 /// the carries from the elements.
2011 /// \par Wraps
2012 /// vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
2013 /// \since Crypto++ 8.3
VecAdd64(const uint32x4_p & vec1,const uint32x4_p & vec2)2014 inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2015 {
2016 // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
2017 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2018 return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
2019 #else
2020 // The carry mask selects carrys for elements 1 and 3 and sets
2021 // remaining elements to 0. The results is then shifted so the
2022 // carried values are added to elements 0 and 2.
2023 #if defined(CRYPTOPP_BIG_ENDIAN)
2024 const uint32x4_p zero = {0, 0, 0, 0};
2025 const uint32x4_p mask = {0, 1, 0, 1};
2026 #else
2027 const uint32x4_p zero = {0, 0, 0, 0};
2028 const uint32x4_p mask = {1, 0, 1, 0};
2029 #endif
2030
2031 uint32x4_p cy = vec_addc(vec1, vec2);
2032 uint32x4_p res = vec_add(vec1, vec2);
2033 cy = vec_and(mask, cy);
2034 cy = vec_sld (cy, zero, 4);
2035 return vec_add(res, cy);
2036 #endif
2037 }
2038
2039 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2040 /// \brief Add two vectors as if uint64x2_p
2041 /// \param vec1 the first vector
2042 /// \param vec2 the second vector
2043 /// \return vector
2044 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2045 /// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2046 /// the carries from the elements.
2047 /// \par Wraps
2048 /// vec_add for POWER8
2049 /// \since Crypto++ 8.3
VecAdd64(const uint64x2_p & vec1,const uint64x2_p & vec2)2050 inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2051 {
2052 // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
2053 const uint64x2_p res = vec_add(vec1, vec2);
2054
2055 #if defined(CRYPTOPP_DEBUG)
2056 // Test 32-bit add in debug builds while we are here.
2057 const uint32x4_p x = (uint32x4_p)vec1;
2058 const uint32x4_p y = (uint32x4_p)vec2;
2059 const uint32x4_p r = VecAdd64(x, y);
2060
2061 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2062 #endif
2063
2064 return res;
2065 }
2066 #endif
2067
2068 /// \brief Subtract two vectors as if uint64x2_p
2069 /// \param vec1 the first vector
2070 /// \param vec2 the second vector
2071 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2072 /// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2073 /// manages the borrows from the elements.
2074 /// \par Wraps
2075 /// vec_sub for POWER8, vec_subc, vec_andc, vec_perm, vec_sub for Altivec
2076 /// \since Crypto++ 8.3
VecSub64(const uint32x4_p & vec1,const uint32x4_p & vec2)2077 inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2078 {
2079 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2080 // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
2081 return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
2082 #else
2083 // The borrow mask selects borrows for elements 1 and 3 and sets
2084 // remaining elements to 0. The results is then shifted so the
2085 // borrowed values are subtracted from elements 0 and 2.
2086 #if defined(CRYPTOPP_BIG_ENDIAN)
2087 const uint32x4_p zero = {0, 0, 0, 0};
2088 const uint32x4_p mask = {0, 1, 0, 1};
2089 #else
2090 const uint32x4_p zero = {0, 0, 0, 0};
2091 const uint32x4_p mask = {1, 0, 1, 0};
2092 #endif
2093
2094 // subc sets the complement of borrow, so we have to
2095 // un-complement it using andc.
2096 uint32x4_p bw = vec_subc(vec1, vec2);
2097 uint32x4_p res = vec_sub(vec1, vec2);
2098 bw = vec_andc(mask, bw);
2099 bw = vec_sld (bw, zero, 4);
2100 return vec_sub(res, bw);
2101 #endif
2102 }
2103
2104 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2105 /// \brief Subtract two vectors as if uint64x2_p
2106 /// \param vec1 the first vector
2107 /// \param vec2 the second vector
2108 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2109 /// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2110 /// manages the borrows from the elements.
2111 /// \par Wraps
2112 /// vec_sub for POWER8
2113 /// \since Crypto++ 8.3
VecSub64(const uint64x2_p & vec1,const uint64x2_p & vec2)2114 inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2115 {
2116 // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
2117 const uint64x2_p res = vec_sub(vec1, vec2);
2118
2119 #if defined(CRYPTOPP_DEBUG)
2120 // Test 32-bit sub in debug builds while we are here.
2121 const uint32x4_p x = (uint32x4_p)vec1;
2122 const uint32x4_p y = (uint32x4_p)vec2;
2123 const uint32x4_p r = VecSub64(x, y);
2124
2125 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2126 #endif
2127
2128 return res;
2129 }
2130 #endif
2131
2132 /// \brief Rotate a vector left as if uint64x2_p
2133 /// \tparam C rotate bit count
2134 /// \param vec the vector
2135 /// \return vector
2136 /// \details VecRotateLeft() rotates each element in a vector by bit count.
2137 /// vec is rotated as if uint64x2_p.
2138 /// \par Wraps
2139 /// vec_rl
2140 /// \since Crypto++ 8.3
2141 template<unsigned int C>
VecRotateLeft64(const uint32x4_p vec)2142 inline uint32x4_p VecRotateLeft64(const uint32x4_p vec)
2143 {
2144 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2145 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2146 return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
2147 #else
2148 // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2149 enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2150
2151 // Get the low bits, shift them to high bits
2152 uint32x4_p t1 = VecShiftLeft<S32>(vec);
2153 // Get the high bits, shift them to low bits
2154 uint32x4_p t2 = VecShiftRight<32-S32>(vec);
2155
2156 if (S64 == 0)
2157 {
2158 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2159 return VecPermute(vec, m);
2160 }
2161 else if (S64 == 32)
2162 {
2163 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2164 return VecPermute(vec, m);
2165 }
2166 else if (BR) // Big rotate amount?
2167 {
2168 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2169 t1 = VecPermute(t1, m);
2170 }
2171 else
2172 {
2173 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2174 t2 = VecPermute(t2, m);
2175 }
2176
2177 return vec_or(t1, t2);
2178 #endif
2179 }
2180
2181 /// \brief Rotate a vector left as if uint64x2_p
2182 /// \param vec the vector
2183 /// \return vector
2184 /// \details VecRotateLeft<8>() rotates each element in a vector
2185 /// by 8-bits. vec is rotated as if uint64x2_p. This specialization
2186 /// is used by algorithms like Speck128.
2187 /// \par Wraps
2188 /// vec_rl
2189 /// \since Crypto++ 8.3
2190 template<>
2191 inline uint32x4_p VecRotateLeft64<8>(const uint32x4_p vec)
2192 {
2193 #if (CRYPTOPP_BIG_ENDIAN)
2194 const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2195 return VecPermute(vec, m);
2196 #else
2197 const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2198 return VecPermute(vec, m);
2199 #endif
2200 }
2201
2202 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2203 /// \brief Rotate a vector left as if uint64x2_p
2204 /// \tparam C rotate bit count
2205 /// \param vec the vector
2206 /// \return vector
2207 /// \details VecRotateLeft64() rotates each element in a vector by
2208 /// bit count. vec is rotated as if uint64x2_p.
2209 /// \par Wraps
2210 /// vec_rl
2211 /// \since Crypto++ 8.3
2212 template<unsigned int C>
VecRotateLeft64(const uint64x2_p vec)2213 inline uint64x2_p VecRotateLeft64(const uint64x2_p vec)
2214 {
2215 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2216 const uint64x2_p res = VecRotateLeft<C>(vec);
2217
2218 #if defined(CRYPTOPP_DEBUG)
2219 // Test 32-bit rotate in debug builds while we are here.
2220 const uint32x4_p x = (uint32x4_p)vec;
2221 const uint32x4_p r = VecRotateLeft64<C>(x);
2222
2223 CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2224 #endif
2225
2226 return res;
2227 }
2228 #endif
2229
2230 /// \brief Rotate a vector right as if uint64x2_p
2231 /// \tparam C rotate bit count
2232 /// \param vec the vector
2233 /// \return vector
2234 /// \details VecRotateRight64() rotates each element in a vector by
2235 /// bit count. vec is rotated as if uint64x2_p.
2236 /// \par Wraps
2237 /// vec_rl
2238 /// \since Crypto++ 8.3
2239 template<unsigned int C>
VecRotateRight64(const uint32x4_p vec)2240 inline uint32x4_p VecRotateRight64(const uint32x4_p vec)
2241 {
2242 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2243 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2244 return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
2245 #else
2246 // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2247 enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2248
2249 // Get the low bits, shift them to high bits
2250 uint32x4_p t1 = VecShiftRight<S32>(vec);
2251 // Get the high bits, shift them to low bits
2252 uint32x4_p t2 = VecShiftLeft<32-S32>(vec);
2253
2254 if (S64 == 0)
2255 {
2256 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2257 return VecPermute(vec, m);
2258 }
2259 else if (S64 == 32)
2260 {
2261 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2262 return VecPermute(vec, m);
2263 }
2264 else if (BR) // Big rotate amount?
2265 {
2266 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2267 t1 = VecPermute(t1, m);
2268 }
2269 else
2270 {
2271 const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2272 t2 = VecPermute(t2, m);
2273 }
2274
2275 return vec_or(t1, t2);
2276 #endif
2277 }
2278
2279 /// \brief Rotate a vector right as if uint64x2_p
2280 /// \param vec the vector
2281 /// \return vector
2282 /// \details VecRotateRight64<8>() rotates each element in a vector
2283 /// by 8-bits. vec is rotated as if uint64x2_p. This specialization
2284 /// is used by algorithms like Speck128.
2285 /// \details vec is rotated as if uint64x2_p.
2286 /// \par Wraps
2287 /// vec_rl
2288 /// \since Crypto++ 8.3
2289 template<>
2290 inline uint32x4_p VecRotateRight64<8>(const uint32x4_p vec)
2291 {
2292 #if (CRYPTOPP_BIG_ENDIAN)
2293 const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2294 return VecPermute(vec, m);
2295 #else
2296 const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2297 return VecPermute(vec, m);
2298 #endif
2299 }
2300
// The body relies on the uint64x2_p overload of VecRotateRight(), which is
// only declared when _ARCH_PWR8 is defined, so the guard matches the
// uint64x2_p overload of VecRotateLeft64().
#if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Rotate a vector right as if uint64x2_p
/// \tparam C rotate bit count
/// \param vec the vector
/// \return vector
/// \details VecRotateRight64() rotates each element in a vector by
///  bit count. vec is rotated as if uint64x2_p. In debug builds the
///  result is also computed with the uint32x4_p overload of
///  VecRotateRight64() and the two results are asserted to be equal.
/// \details VecRotateRight64() with 64-bit elements is available on
///  POWER8 and above.
/// \par Wraps
///  vec_rl
/// \since Crypto++ 8.3
template<unsigned int C>
inline uint64x2_p VecRotateRight64(const uint64x2_p vec)
{
    // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
    const uint64x2_p res = VecRotateRight<C>(vec);

#if defined(CRYPTOPP_DEBUG)
    // Test 32-bit rotate in debug builds while we are here.
    const uint32x4_p x = (uint32x4_p)vec;
    const uint32x4_p r = VecRotateRight64<C>(x);

    CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
#endif

    return res;
}
#endif
2328
/// \brief AND two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd64() performs <tt>vec1 & vec2</tt>.
///  vec2 is cast to the same type as vec1. The return vector
///  is the same type as vec1.
/// \details VecAnd64() is a convenience function; a bitwise AND does not
///  depend on the element width.
/// \par Wraps
///  vec_and
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecAnd64(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to T1 so vec_and sees matching types.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
2347
/// \brief OR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr64() performs <tt>vec1 | vec2</tt>.
///  vec2 is cast to the same type as vec1. The return vector
///  is the same type as vec1.
/// \details VecOr64() is a convenience function; a bitwise OR does not
///  depend on the element width.
/// \par Wraps
///  vec_or
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecOr64(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to T1 so vec_or sees matching types.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
2366
/// \brief XOR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor64() performs <tt>vec1 ^ vec2</tt>.
///  vec2 is cast to the same type as vec1. The return vector
///  is the same type as vec1.
/// \details VecXor64() is a convenience function; a bitwise XOR does not
///  depend on the element width.
/// \par Wraps
///  vec_xor
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecXor64(const T1 vec1, const T2 vec2)
{
    // Bring the second operand to T1 so vec_xor sees matching types.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
2385
2386 /// \brief Broadcast 64-bit double word to a vector
2387 /// \param val the 64-bit value
2388 /// \return vector
2389 /// \par Wraps
2390 /// vec_splats
2391 /// \since Crypto++ 8.3
VecSplatWord64(word64 val)2392 inline uint32x4_p VecSplatWord64(word64 val)
2393 {
2394 #if defined(_ARCH_PWR8)
2395 // The PPC64 ABI says so.
2396 return (uint32x4_p)vec_splats((unsigned long long)val);
2397 #else
2398 const word64 x[2] = {val,val};
2399 return (uint32x4_p)VecLoad((const word32*)x);
2400 #endif
2401 }
2402
2403 /// \brief Broadcast 64-bit element to a vector as if uint64x2_p
2404 /// \tparam the element number
2405 /// \param val the 64-bit value
2406 /// \return vector
2407 /// \par Wraps
2408 /// vec_splat
2409 /// \since Crypto++ 8.3
2410 template <unsigned int N>
VecSplatElement64(const uint32x4_p val)2411 inline uint32x4_p VecSplatElement64(const uint32x4_p val)
2412 {
2413 #if defined(__VSX__) || defined(_ARCH_PWR8)
2414 return (uint32x4_p)vec_splat((uint64x2_p)val, N);
2415 #else
2416 enum {E=N&1};
2417 if (E == 0)
2418 {
2419 const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
2420 return (uint32x4_p)vec_perm(val, val, m);
2421 }
2422 else // (E == 1)
2423 {
2424 const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
2425 return (uint32x4_p)vec_perm(val, val, m);
2426 }
2427 #endif
2428 }
2429
2430 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2431 /// \brief Broadcast 64-bit element to a vector
2432 /// \tparam the element number
2433 /// \param val the 64-bit value
2434 /// \return vector
2435 /// \since Crypto++ 8.3
2436 template <unsigned int N>
VecSplatElement64(const uint64x2_p val)2437 inline uint64x2_p VecSplatElement64(const uint64x2_p val)
2438 {
2439 return vec_splat(val, N);
2440 }
2441 #endif
2442
2443 //@}
2444
2445 //////////////////////// Power8 Crypto ////////////////////////
2446
2447 // __CRYPTO__ alone is not enough. Clang will define __CRYPTO__
2448 // when it is not available, like with Power7. Sigh...
2449 #if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2450
2451 /// \name POLYNOMIAL MULTIPLICATION
2452 //@{
2453
2454 /// \brief Polynomial multiplication
2455 /// \param a the first term
2456 /// \param b the second term
2457 /// \return vector product
2458 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2459 /// polynomial multiplication multiplies the high and low terms, and then
2460 /// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2461 /// al*bl</tt>. It is different behavior than Intel polynomial
2462 /// multiplication. To obtain a single product without the XOR, then set
2463 /// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2464 /// results in <tt>0*bh XOR al*bl = al*bl</tt>.
2465 /// \par Wraps
2466 /// __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.
2467 /// \since Crypto++ 8.1
VecPolyMultiply(const uint32x4_p & a,const uint32x4_p & b)2468 inline uint32x4_p VecPolyMultiply(const uint32x4_p& a, const uint32x4_p& b)
2469 {
2470 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2471 return __vpmsumw (a, b);
2472 #elif defined(__clang__)
2473 return __builtin_altivec_crypto_vpmsumw (a, b);
2474 #else
2475 return __builtin_crypto_vpmsumw (a, b);
2476 #endif
2477 }
2478
2479 /// \brief Polynomial multiplication
2480 /// \param a the first term
2481 /// \param b the second term
2482 /// \return vector product
2483 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2484 /// polynomial multiplication multiplies the high and low terms, and then
2485 /// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2486 /// al*bl</tt>. It is different behavior than Intel polynomial
2487 /// multiplication. To obtain a single product without the XOR, then set
2488 /// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2489 /// results in <tt>0*bh XOR al*bl = al*bl</tt>.
2490 /// \par Wraps
2491 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2492 /// \since Crypto++ 8.1
VecPolyMultiply(const uint64x2_p & a,const uint64x2_p & b)2493 inline uint64x2_p VecPolyMultiply(const uint64x2_p& a, const uint64x2_p& b)
2494 {
2495 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2496 return __vpmsumd (a, b);
2497 #elif defined(__clang__)
2498 return __builtin_altivec_crypto_vpmsumd (a, b);
2499 #else
2500 return __builtin_crypto_vpmsumd (a, b);
2501 #endif
2502 }
2503
2504 /// \brief Polynomial multiplication
2505 /// \param a the first term
2506 /// \param b the second term
2507 /// \return vector product
2508 /// \details VecIntelMultiply00() performs polynomial multiplication and presents
2509 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
2510 /// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
2511 /// are multiplied.
2512 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2513 /// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2514 /// \par Wraps
2515 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2516 /// \since Crypto++ 8.0
VecIntelMultiply00(const uint64x2_p & a,const uint64x2_p & b)2517 inline uint64x2_p VecIntelMultiply00(const uint64x2_p& a, const uint64x2_p& b)
2518 {
2519 #if defined(CRYPTOPP_BIG_ENDIAN)
2520 return VecSwapWords(VecPolyMultiply(VecGetHigh(a), VecGetHigh(b)));
2521 #else
2522 return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
2523 #endif
2524 }
2525
2526 /// \brief Polynomial multiplication
2527 /// \param a the first term
2528 /// \param b the second term
2529 /// \return vector product
2530 /// \details VecIntelMultiply01 performs() polynomial multiplication and presents
2531 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
2532 /// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
2533 /// 64-bits of <tt>b</tt> are multiplied.
2534 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2535 /// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2536 /// \par Wraps
2537 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2538 /// \since Crypto++ 8.0
VecIntelMultiply01(const uint64x2_p & a,const uint64x2_p & b)2539 inline uint64x2_p VecIntelMultiply01(const uint64x2_p& a, const uint64x2_p& b)
2540 {
2541 #if defined(CRYPTOPP_BIG_ENDIAN)
2542 return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
2543 #else
2544 return VecPolyMultiply(a, VecGetHigh(b));
2545 #endif
2546 }
2547
2548 /// \brief Polynomial multiplication
2549 /// \param a the first term
2550 /// \param b the second term
2551 /// \return vector product
2552 /// \details VecIntelMultiply10() performs polynomial multiplication and presents
2553 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
2554 /// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
2555 /// 64-bits of <tt>b</tt> are multiplied.
2556 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2557 /// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2558 /// \par Wraps
2559 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2560 /// \since Crypto++ 8.0
VecIntelMultiply10(const uint64x2_p & a,const uint64x2_p & b)2561 inline uint64x2_p VecIntelMultiply10(const uint64x2_p& a, const uint64x2_p& b)
2562 {
2563 #if defined(CRYPTOPP_BIG_ENDIAN)
2564 return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
2565 #else
2566 return VecPolyMultiply(VecGetHigh(a), b);
2567 #endif
2568 }
2569
2570 /// \brief Polynomial multiplication
2571 /// \param a the first term
2572 /// \param b the second term
2573 /// \return vector product
2574 /// \details VecIntelMultiply11() performs polynomial multiplication and presents
2575 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
2576 /// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
2577 /// are multiplied.
2578 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2579 /// is MSB and numbered 127, while the rightmost bit is LSB and numbered 0.
2580 /// \par Wraps
2581 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2582 /// \since Crypto++ 8.0
VecIntelMultiply11(const uint64x2_p & a,const uint64x2_p & b)2583 inline uint64x2_p VecIntelMultiply11(const uint64x2_p& a, const uint64x2_p& b)
2584 {
2585 #if defined(CRYPTOPP_BIG_ENDIAN)
2586 return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
2587 #else
2588 return VecPolyMultiply(VecGetLow(a), b);
2589 #endif
2590 }
2591
2592 //@}
2593
2594 /// \name AES ENCRYPTION
2595 //@{
2596
2597 /// \brief One round of AES encryption
2598 /// \tparam T1 vector type
2599 /// \tparam T2 vector type
2600 /// \param state the state vector
2601 /// \param key the subkey vector
2602 /// \details VecEncrypt() performs one round of AES encryption of state
2603 /// using subkey key. The return vector is the same type as state.
2604 /// \details VecEncrypt() is available on POWER8 and above.
2605 /// \par Wraps
2606 /// __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
2607 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2608 template <class T1, class T2>
VecEncrypt(const T1 state,const T2 key)2609 inline T1 VecEncrypt(const T1 state, const T2 key)
2610 {
2611 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2612 return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
2613 #elif defined(__clang__)
2614 return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2615 #elif defined(__GNUC__)
2616 return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2617 #else
2618 CRYPTOPP_ASSERT(0);
2619 #endif
2620 }
2621
2622 /// \brief Final round of AES encryption
2623 /// \tparam T1 vector type
2624 /// \tparam T2 vector type
2625 /// \param state the state vector
2626 /// \param key the subkey vector
2627 /// \details VecEncryptLast() performs the final round of AES encryption
2628 /// of state using subkey key. The return vector is the same type as state.
2629 /// \details VecEncryptLast() is available on POWER8 and above.
2630 /// \par Wraps
2631 /// __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
2632 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2633 template <class T1, class T2>
VecEncryptLast(const T1 state,const T2 key)2634 inline T1 VecEncryptLast(const T1 state, const T2 key)
2635 {
2636 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2637 return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
2638 #elif defined(__clang__)
2639 return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2640 #elif defined(__GNUC__)
2641 return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2642 #else
2643 CRYPTOPP_ASSERT(0);
2644 #endif
2645 }
2646
2647 /// \brief One round of AES decryption
2648 /// \tparam T1 vector type
2649 /// \tparam T2 vector type
2650 /// \param state the state vector
2651 /// \param key the subkey vector
2652 /// \details VecDecrypt() performs one round of AES decryption of state
2653 /// using subkey key. The return vector is the same type as state.
2654 /// \details VecDecrypt() is available on POWER8 and above.
2655 /// \par Wraps
2656 /// __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
2657 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2658 template <class T1, class T2>
VecDecrypt(const T1 state,const T2 key)2659 inline T1 VecDecrypt(const T1 state, const T2 key)
2660 {
2661 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2662 return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
2663 #elif defined(__clang__)
2664 return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2665 #elif defined(__GNUC__)
2666 return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2667 #else
2668 CRYPTOPP_ASSERT(0);
2669 #endif
2670 }
2671
2672 /// \brief Final round of AES decryption
2673 /// \tparam T1 vector type
2674 /// \tparam T2 vector type
2675 /// \param state the state vector
2676 /// \param key the subkey vector
2677 /// \details VecDecryptLast() performs the final round of AES decryption
2678 /// of state using subkey key. The return vector is the same type as state.
2679 /// \details VecDecryptLast() is available on POWER8 and above.
2680 /// \par Wraps
2681 /// __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
2682 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2683 template <class T1, class T2>
VecDecryptLast(const T1 state,const T2 key)2684 inline T1 VecDecryptLast(const T1 state, const T2 key)
2685 {
2686 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2687 return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
2688 #elif defined(__clang__)
2689 return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2690 #elif defined(__GNUC__)
2691 return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2692 #else
2693 CRYPTOPP_ASSERT(0);
2694 #endif
2695 }
2696
2697 //@}
2698
2699 /// \name SHA DIGESTS
2700 //@{
2701
2702 /// \brief SHA256 Sigma functions
2703 /// \tparam func function
2704 /// \tparam fmask function mask
2705 /// \tparam T vector type
2706 /// \param data the block to transform
2707 /// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
2708 /// func and fmask. The return vector is the same type as data.
2709 /// \details VecSHA256() is available on POWER8 and above.
2710 /// \par Wraps
2711 /// __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
2712 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2713 template <int func, int fmask, class T>
VecSHA256(const T data)2714 inline T VecSHA256(const T data)
2715 {
2716 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2717 return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
2718 #elif defined(__clang__)
2719 return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2720 #elif defined(__GNUC__)
2721 return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2722 #else
2723 CRYPTOPP_ASSERT(0);
2724 #endif
2725 }
2726
2727 /// \brief SHA512 Sigma functions
2728 /// \tparam func function
2729 /// \tparam fmask function mask
2730 /// \tparam T vector type
2731 /// \param data the block to transform
2732 /// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
2733 /// func and fmask. The return vector is the same type as data.
2734 /// \details VecSHA512() is available on POWER8 and above.
2735 /// \par Wraps
2736 /// __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
2737 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2738 template <int func, int fmask, class T>
VecSHA512(const T data)2739 inline T VecSHA512(const T data)
2740 {
2741 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2742 return (T)__vshasigmad((uint64x2_p)data, func, fmask);
2743 #elif defined(__clang__)
2744 return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2745 #elif defined(__GNUC__)
2746 return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2747 #else
2748 CRYPTOPP_ASSERT(0);
2749 #endif
2750 }
2751
2752 //@}
2753
2754 #endif // __CRYPTO__
2755
2756 #endif // _ALTIVEC_
2757
2758 NAMESPACE_END
2759
2760 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
2761 # pragma GCC diagnostic pop
2762 #endif
2763
2764 #endif // CRYPTOPP_PPC_CRYPTO_H
2765