1 // lea_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 //    This source file uses intrinsics and built-ins to gain access to
4 //    SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
5 //    source file is needed because additional CXXFLAGS are required to enable
//    the appropriate instruction sets in some build configurations.
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "lea.h"
12 #include "misc.h"
13 
14 // Uncomment for benchmarking C++ against SSE or NEON.
// Do so in both lea.cpp and lea_simd.cpp.
16 // #undef CRYPTOPP_SSSE3_AVAILABLE
17 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
18 
19 #if (CRYPTOPP_SSSE3_AVAILABLE)
20 # include "adv_simd.h"
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if defined(__XOP__)
26 # include <ammintrin.h>
27 # if defined(__GNUC__)
28 #  include <x86intrin.h>
29 # endif
30 #endif
31 
32 #if (CRYPTOPP_ARM_NEON_HEADER)
33 # include "adv_simd.h"
34 # include <arm_neon.h>
35 #endif
36 
37 #if (CRYPTOPP_ARM_ACLE_HEADER)
38 # include <stdint.h>
39 # include <arm_acle.h>
40 #endif
41 
42 #if defined(_M_ARM64)
43 # include "adv_simd.h"
44 #endif
45 
46 // Do not port this to POWER architecture. Naively we hoped
47 // for a 2x to 3x speedup. The result was a 5x slow down.
48 // The table below shows MiB/s and cpb.
49 //
50 // C++:
51 // <TD>LEA-128(128)/CTR (128-bit key)<TD>C++<TD>207<TD>15.64
52 // <TD>LEA-128(192)/CTR (192-bit key)<TD>C++<TD>186<TD>17.48
53 // <TD>LEA-128(256)/CTR (256-bit key)<TD>C++<TD>124<TD>26.2
54 //
55 // Power8:
56 // <TD>LEA-128(128)/CTR (128-bit key)<TD>Power8<TD>37<TD>88.7
57 // <TD>LEA-128(192)/CTR (192-bit key)<TD>Power8<TD>40<TD>82.1
58 // <TD>LEA-128(256)/CTR (256-bit key)<TD>Power8<TD>28<TD>116.0
59 
60 #undef CRYPTOPP_POWER8_AVAILABLE
61 #if defined(CRYPTOPP_POWER8_AVAILABLE)
62 # include "adv_simd.h"
63 # include "ppc_simd.h"
64 #endif
65 
// Export one symbol from this translation unit so that MSVC (LNK4221)
// and libtool do not warn when every SIMD section is compiled out.
extern const char LEA_SIMD_FNAME[] = __FILE__;
68 
69 ANONYMOUS_NAMESPACE_BEGIN
70 
71 using CryptoPP::word32;
72 
73 // *************************** ARM NEON ***************************//
74 
75 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
76 
// Bitwise XOR of two vectors of four 32-bit lanes (NEON).
inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b)
{
    const uint32x4_t mixed = veorq_u32(a, b);
    return mixed;
}
81 
// Lane-wise 32-bit addition of two vectors (NEON).
inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b)
{
    const uint32x4_t sum = vaddq_u32(a, b);
    return sum;
}
86 
// Lane-wise 32-bit subtraction a - b (NEON).
inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b)
{
    const uint32x4_t diff = vsubq_u32(a, b);
    return diff;
}
91 
92 template <unsigned int R>
RotateLeft(const uint32x4_t & val)93 inline uint32x4_t RotateLeft(const uint32x4_t& val)
94 {
95     const uint32x4_t a(vshlq_n_u32(val, R));
96     const uint32x4_t b(vshrq_n_u32(val, 32 - R));
97     return vorrq_u32(a, b);
98 }
99 
100 template <unsigned int R>
RotateRight(const uint32x4_t & val)101 inline uint32x4_t RotateRight(const uint32x4_t& val)
102 {
103     const uint32x4_t a(vshlq_n_u32(val, 32 - R));
104     const uint32x4_t b(vshrq_n_u32(val, R));
105     return vorrq_u32(a, b);
106 }
107 
108 #if defined(__aarch32__) || defined(__aarch64__)
109 template <>
RotateLeft(const uint32x4_t & val)110 inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
111 {
112 #if (CRYPTOPP_BIG_ENDIAN)
113     const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
114     const uint8x16_t mask = vld1q_u8(maskb);
115 #else
116     const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
117     const uint8x16_t mask = vld1q_u8(maskb);
118 #endif
119 
120     return vreinterpretq_u32_u8(
121         vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
122 }
123 
124 template <>
RotateRight(const uint32x4_t & val)125 inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
126 {
127 #if (CRYPTOPP_BIG_ENDIAN)
128     const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
129     const uint8x16_t mask = vld1q_u8(maskb);
130 #else
131     const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,14,12 };
132     const uint8x16_t mask = vld1q_u8(maskb);
133 #endif
134 
135     return vreinterpretq_u32_u8(
136         vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
137 }
138 #endif
139 
UnpackLow32(uint32x4_t a,uint32x4_t b)140 uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b)
141 {
142     uint32x2_t a1 = vget_low_u32(a);
143     uint32x2_t b1 = vget_low_u32(b);
144     uint32x2x2_t result = vzip_u32(a1, b1);
145     return vcombine_u32(result.val[0], result.val[1]);
146 }
147 
UnpackHigh32(uint32x4_t a,uint32x4_t b)148 uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b)
149 {
150     uint32x2_t a1 = vget_high_u32(a);
151     uint32x2_t b1 = vget_high_u32(b);
152     uint32x2x2_t result = vzip_u32(a1, b1);
153     return vcombine_u32(result.val[0], result.val[1]);
154 }
155 
UnpackLow64(uint32x4_t a,uint32x4_t b)156 uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b)
157 {
158     uint64x1_t a1 = vget_low_u64((uint64x2_t)a);
159     uint64x1_t b1 = vget_low_u64((uint64x2_t)b);
160     return (uint32x4_t)vcombine_u64(a1, b1);
161 }
162 
UnpackHigh64(uint32x4_t a,uint32x4_t b)163 uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
164 {
165     uint64x1_t a1 = vget_high_u64((uint64x2_t)a);
166     uint64x1_t b1 = vget_high_u64((uint64x2_t)b);
167     return (uint32x4_t)vcombine_u64(a1, b1);
168 }
169 
170 template <unsigned int IDX>
LoadKey(const word32 rkey[])171 inline uint32x4_t LoadKey(const word32 rkey[])
172 {
173     return vdupq_n_u32(rkey[IDX]);
174 }
175 
176 template <unsigned int IDX>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)177 inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
178 {
179     // Should not be instantiated
180     CRYPTOPP_ASSERT(0);
181 
182     CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
183     CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
184     return vmovq_n_u32(0);
185 }
186 
187 template <>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)188 inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
189 {
190     const uint32x4_t r1 = UnpackLow32(a, b);
191     const uint32x4_t r2 = UnpackLow32(c, d);
192     return UnpackLow64(r1, r2);
193 }
194 
195 template <>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)196 inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
197 {
198     const uint32x4_t r1 = UnpackLow32(a, b);
199     const uint32x4_t r2 = UnpackLow32(c, d);
200     return UnpackHigh64(r1, r2);
201 }
202 
203 template <>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)204 inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
205 {
206     const uint32x4_t r1 = UnpackHigh32(a, b);
207     const uint32x4_t r2 = UnpackHigh32(c, d);
208     return UnpackLow64(r1, r2);
209 }
210 
211 template <>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)212 inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
213 {
214     const uint32x4_t r1 = UnpackHigh32(a, b);
215     const uint32x4_t r2 = UnpackHigh32(c, d);
216     return UnpackHigh64(r1, r2);
217 }
218 
219 template <unsigned int IDX>
UnpackNEON(const uint32x4_t & v)220 inline uint32x4_t UnpackNEON(const uint32x4_t& v)
221 {
222     // Should not be instantiated
223     CRYPTOPP_ASSERT(0);
224 
225     CRYPTOPP_UNUSED(v);
226     return vmovq_n_u32(0);
227 }
228 
229 template <>
UnpackNEON(const uint32x4_t & v)230 inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v)
231 {
232     // Splat to all lanes
233     return vdupq_n_u32(vgetq_lane_u32(v, 0));
234 }
235 
236 template <>
UnpackNEON(const uint32x4_t & v)237 inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v)
238 {
239     // Splat to all lanes
240     return vdupq_n_u32(vgetq_lane_u32(v, 1));
241 }
242 
243 template <>
UnpackNEON(const uint32x4_t & v)244 inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v)
245 {
246     // Splat to all lanes
247     return vdupq_n_u32(vgetq_lane_u32(v, 2));
248 }
249 
250 template <>
UnpackNEON(const uint32x4_t & v)251 inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v)
252 {
253     // Splat to all lanes
254     return vdupq_n_u32(vgetq_lane_u32(v, 3));
255 }
256 
257 template <unsigned int IDX>
RepackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)258 inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
259 {
260     return UnpackNEON<IDX>(a, b, c, d);
261 }
262 
263 template <unsigned int IDX>
RepackNEON(const uint32x4_t & v)264 inline uint32x4_t RepackNEON(const uint32x4_t& v)
265 {
266     return UnpackNEON<IDX>(v);
267 }
268 
269 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
270 
271 // *************************** IA-32 ***************************//
272 
273 #if (CRYPTOPP_SSSE3_AVAILABLE)
274 
// Bitwise XOR of two 128-bit vectors (SSE2).
inline __m128i Xor(const __m128i& a, const __m128i& b)
{
    const __m128i mixed = _mm_xor_si128(a, b);
    return mixed;
}
279 
// Lane-wise 32-bit addition of two 128-bit vectors (SSE2).
inline __m128i Add(const __m128i& a, const __m128i& b)
{
    const __m128i sum = _mm_add_epi32(a, b);
    return sum;
}
284 
// Lane-wise 32-bit subtraction a - b of two 128-bit vectors (SSE2).
inline __m128i Sub(const __m128i& a, const __m128i& b)
{
    const __m128i diff = _mm_sub_epi32(a, b);
    return diff;
}
289 
290 template <unsigned int R>
RotateLeft(const __m128i & val)291 inline __m128i RotateLeft(const __m128i& val)
292 {
293 #if defined(__XOP__)
294     return _mm_roti_epi32(val, R);
295 #else
296     return _mm_or_si128(
297         _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
298 #endif
299 }
300 
301 template <unsigned int R>
RotateRight(const __m128i & val)302 inline __m128i RotateRight(const __m128i& val)
303 {
304 #if defined(__XOP__)
305     return _mm_roti_epi32(val, 32-R);
306 #else
307     return _mm_or_si128(
308         _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
309 #endif
310 }
311 
312 // Faster than two Shifts and an Or.
313 template <>
RotateLeft(const __m128i & val)314 inline __m128i RotateLeft<8>(const __m128i& val)
315 {
316 #if defined(__XOP__)
317     return _mm_roti_epi32(val, 8);
318 #else
319     const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
320     return _mm_shuffle_epi8(val, mask);
321 #endif
322 }
323 
324 // Faster than two Shifts and an Or.
325 template <>
RotateRight(const __m128i & val)326 inline __m128i RotateRight<8>(const __m128i& val)
327 {
328 #if defined(__XOP__)
329     return _mm_roti_epi32(val, 32-8);
330 #else
331     const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
332     return _mm_shuffle_epi8(val, mask);
333 #endif
334 }
335 
336 template <unsigned int IDX>
LoadKey(const word32 rkey[])337 inline __m128i LoadKey(const word32 rkey[])
338 {
339     float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk));
340     return _mm_castps_si128(_mm_load_ps1(&rk));
341 }
342 
343 template <unsigned int IDX>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)344 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
345 {
346     // Should not be instantiated
347     CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
348     CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
349     CRYPTOPP_ASSERT(0);
350     return _mm_setzero_si128();
351 }
352 
353 template <>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)354 inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
355 {
356     // LEA is little-endian oriented, so there is no need for a separate shuffle.
357     const __m128i r1 = _mm_unpacklo_epi32(a, b);
358     const __m128i r2 = _mm_unpacklo_epi32(c, d);
359     return _mm_unpacklo_epi64(r1, r2);
360 }
361 
362 template <>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)363 inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
364 {
365     // LEA is little-endian oriented, so there is no need for a separate shuffle.
366     const __m128i r1 = _mm_unpacklo_epi32(a, b);
367     const __m128i r2 = _mm_unpacklo_epi32(c, d);
368     return _mm_unpackhi_epi64(r1, r2);
369 }
370 
371 template <>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)372 inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
373 {
374     // LEA is little-endian oriented, so there is no need for a separate shuffle.
375     const __m128i r1 = _mm_unpackhi_epi32(a, b);
376     const __m128i r2 = _mm_unpackhi_epi32(c, d);
377     return _mm_unpacklo_epi64(r1, r2);
378 }
379 
380 template <>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)381 inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
382 {
383     // LEA is little-endian oriented, so there is no need for a separate shuffle.
384     const __m128i r1 = _mm_unpackhi_epi32(a, b);
385     const __m128i r2 = _mm_unpackhi_epi32(c, d);
386     return _mm_unpackhi_epi64(r1, r2);
387 }
388 
389 template <unsigned int IDX>
UnpackXMM(const __m128i & v)390 inline __m128i UnpackXMM(const __m128i& v)
391 {
392     // Should not be instantiated
393     CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
394     return _mm_setzero_si128();
395 }
396 
397 template <>
UnpackXMM(const __m128i & v)398 inline __m128i UnpackXMM<0>(const __m128i& v)
399 {
400     // Splat to all lanes
401     return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
402 }
403 
404 template <>
UnpackXMM(const __m128i & v)405 inline __m128i UnpackXMM<1>(const __m128i& v)
406 {
407     // Splat to all lanes
408     return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
409 }
410 
411 template <>
UnpackXMM(const __m128i & v)412 inline __m128i UnpackXMM<2>(const __m128i& v)
413 {
414     // Splat to all lanes
415     return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
416 }
417 
418 template <>
UnpackXMM(const __m128i & v)419 inline __m128i UnpackXMM<3>(const __m128i& v)
420 {
421     // Splat to all lanes
422     return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
423 }
424 
425 template <unsigned int IDX>
RepackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)426 inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
427 {
428     return UnpackXMM<IDX>(a, b, c, d);
429 }
430 
431 template <unsigned int IDX>
RepackXMM(const __m128i & v)432 inline __m128i RepackXMM(const __m128i& v)
433 {
434     return UnpackXMM<IDX>(v);
435 }
436 
437 #endif  // CRYPTOPP_SSSE3_AVAILABLE
438 
439 // *************************** Power8 ***************************//
440 
441 #if (CRYPTOPP_POWER8_AVAILABLE)
442 
443 using CryptoPP::uint8x16_p;
444 using CryptoPP::uint32x4_p;
445 using CryptoPP::uint64x2_p;
446 
// XOR of two Power vectors; thin wrapper over VecXor.
inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b)
{
    const uint32x4_p mixed = VecXor(a, b);
    return mixed;
}
451 
// Addition of two Power vectors; thin wrapper over VecAdd.
inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b)
{
    const uint32x4_p sum = VecAdd(a, b);
    return sum;
}
456 
// Subtraction a - b of two Power vectors; thin wrapper over VecSub.
inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b)
{
    const uint32x4_p diff = VecSub(a, b);
    return diff;
}
461 
462 template <unsigned int R>
RotateLeft(const uint32x4_p & val)463 inline uint32x4_p RotateLeft(const uint32x4_p& val)
464 {
465     const uint32x4_p m = {R, R, R, R};
466     return vec_rl(val, m);
467 }
468 
469 template <unsigned int R>
RotateRight(const uint32x4_p & val)470 inline uint32x4_p RotateRight(const uint32x4_p& val)
471 {
472     const uint32x4_p m = {32-R, 32-R, 32-R, 32-R};
473     return vec_rl(val, m);
474 }
475 
476 template <unsigned int IDX>
LoadKey(const word32 rkey[])477 inline uint32x4_p LoadKey(const word32 rkey[])
478 {
479     return vec_splats(rkey[IDX]);
480 }
481 
482 template <unsigned int IDX>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)483 inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
484 {
485     // Should not be instantiated
486     CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
487     CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
488     CRYPTOPP_ASSERT(0);
489     return VecXor(a, a);
490 }
491 
492 template <>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)493 inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
494 {
495     const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
496     const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
497     return (uint32x4_p)vec_mergel(r1, r2);
498 }
499 
500 template <>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)501 inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
502 {
503     const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
504     const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
505     return (uint32x4_p)vec_mergeh(r1, r2);
506 }
507 
508 template <>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)509 inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
510 {
511     const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
512     const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
513     return (uint32x4_p)vec_mergel(r1, r2);
514 }
515 
516 template <>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)517 inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
518 {
519     const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
520     const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
521     return (uint32x4_p)vec_mergeh(r1, r2);
522 }
523 
524 template <unsigned int IDX>
UnpackSIMD(const uint32x4_p & v)525 inline uint32x4_p UnpackSIMD(const uint32x4_p& v)
526 {
527     // Should not be instantiated
528     CRYPTOPP_ASSERT(0);
529     return VecXor(v, v);
530 }
531 
532 template <>
UnpackSIMD(const uint32x4_p & v)533 inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v)
534 {
535     // Splat to all lanes
536     const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0};
537     return (uint32x4_p)VecPermute(v, v, m);
538 }
539 
540 template <>
UnpackSIMD(const uint32x4_p & v)541 inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v)
542 {
543     // Splat to all lanes
544     const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4};
545     return (uint32x4_p)VecPermute(v, v, m);
546 }
547 
548 template <>
UnpackSIMD(const uint32x4_p & v)549 inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v)
550 {
551     // Splat to all lanes
552     const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8};
553     return (uint32x4_p)VecPermute(v, v, m);
554 }
555 
556 template <>
UnpackSIMD(const uint32x4_p & v)557 inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v)
558 {
559     // Splat to all lanes
560     const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12};
561     return (uint32x4_p)VecPermute(v, v, m);
562 }
563 
564 template <unsigned int IDX>
RepackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)565 inline uint32x4_p RepackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
566 {
567     return UnpackSIMD<IDX>(a, b, c, d);
568 }
569 
570 template <unsigned int IDX>
RepackSIMD(const uint32x4_p & v)571 inline uint32x4_p RepackSIMD(const uint32x4_p& v)
572 {
573     return UnpackSIMD<IDX>(v);
574 }
575 
576 #endif  // CRYPTOPP_POWER8_AVAILABLE
577 
578 // *************************** LEA Encryption ***************************//
579 
580 #if (CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_SSSE3_AVAILABLE)
581 
582 template <class W>
LEA_Encryption(W temp[4],const word32 * subkeys,unsigned int rounds)583 inline void LEA_Encryption(W temp[4], const word32 *subkeys, unsigned int rounds)
584 {
585     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
586     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
587     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
588     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
589     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
590     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
591     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
592     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
593     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
594     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
595     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
596     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
597 
598     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
599     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
600     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
601     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
602     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
603     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
604     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
605     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
606     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
607     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
608     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
609     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
610 
611     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
612     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
613     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
614     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
615     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
616     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
617     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
618     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
619     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
620     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
621     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
622     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
623 
624     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
625     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
626     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
627     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
628     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
629     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
630     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
631     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
632     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
633     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
634     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
635     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
636 
637     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
638     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
639     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
640     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
641     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
642     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
643     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
644     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
645     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
646     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
647     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
648     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
649 
650     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
651     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
652     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
653     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
654     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
655     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
656     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
657     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
658     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
659     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
660     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
661     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
662 
663     if(rounds > 24)
664     {
665         temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
666         temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
667         temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
668         temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
669         temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
670         temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
671         temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
672         temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
673         temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
674         temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
675         temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
676         temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
677     }
678 
679     if(rounds > 28)
680     {
681         temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
682         temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
683         temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
684         temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
685         temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
686         temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
687         temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
688         temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
689         temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
690         temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
691         temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
692         temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
693     }
694 }
695 
696 // *************************** LEA Decryption ***************************//
697 
698 template <class W>
LEA_Decryption(W temp[4],const word32 * subkeys,unsigned int rounds)699 inline void LEA_Decryption(W temp[4], const word32 *subkeys, unsigned int rounds)
700 {
701     if(rounds > 28)
702     {
703         temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
704         temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
705         temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys));
706         temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys));
707         temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys));
708         temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys));
709         temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys));
710         temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys));
711         temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys));
712         temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys));
713         temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys));
714         temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys));
715     }
716 
717     if(rounds > 24)
718     {
719         temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys));
720         temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys));
721         temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys));
722         temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys));
723         temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys));
724         temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys));
725         temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys));
726         temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys));
727         temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys));
728         temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys));
729         temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys));
730         temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys));
731     }
732 
733     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys));
734     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys));
735     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys));
736     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys));
737     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys));
738     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys));
739     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys));
740     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys));
741     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys));
742     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys));
743     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys));
744     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys));
745 
746     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys));
747     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys));
748     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys));
749     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys));
750     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys));
751     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys));
752     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys));
753     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys));
754     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys));
755     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys));
756     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys));
757     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys));
758 
759     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys));
760     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys));
761     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys));
762     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys));
763     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys));
764     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys));
765     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys));
766     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys));
767     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys));
768     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys));
769     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys));
770     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys));
771 
772     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys));
773     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys));
774     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys));
775     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys));
776     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys));
777     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys));
778     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys));
779     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys));
780     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys));
781     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys));
782     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys));
783     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys));
784 
785     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys));
786     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys));
787     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys));
788     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys));
789     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys));
790     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys));
791     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys));
792     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys));
793     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys));
794     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys));
795     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys));
796     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys));
797 
798     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys));
799     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys));
800     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys));
801     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys));
802     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys));
803     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys));
804     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys));
805     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys));
806     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys));
807     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys));
808     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys));
809     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys));
810 }
811 
812 #endif  // LEA Encryption and Decryption
813 
814 // *************************** ARM NEON ***************************//
815 
816 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
817 
LEA_Enc_Block(uint32x4_t & block0,const word32 * subkeys,unsigned int rounds)818 inline void LEA_Enc_Block(uint32x4_t &block0,
819     const word32 *subkeys, unsigned int rounds)
820 {
821     uint32x4_t temp[4];
822     temp[0] = UnpackNEON<0>(block0);
823     temp[1] = UnpackNEON<1>(block0);
824     temp[2] = UnpackNEON<2>(block0);
825     temp[3] = UnpackNEON<3>(block0);
826 
827     LEA_Encryption(temp, subkeys, rounds);
828 
829     block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
830 }
831 
LEA_Dec_Block(uint32x4_t & block0,const word32 * subkeys,unsigned int rounds)832 inline void LEA_Dec_Block(uint32x4_t &block0,
833     const word32 *subkeys, unsigned int rounds)
834 {
835     uint32x4_t temp[4];
836     temp[0] = UnpackNEON<0>(block0);
837     temp[1] = UnpackNEON<1>(block0);
838     temp[2] = UnpackNEON<2>(block0);
839     temp[3] = UnpackNEON<3>(block0);
840 
841     LEA_Decryption(temp, subkeys, rounds);
842 
843     block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
844 }
845 
LEA_Enc_4_Blocks(uint32x4_t & block0,uint32x4_t & block1,uint32x4_t & block2,uint32x4_t & block3,const word32 * subkeys,unsigned int rounds)846 inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
847     uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
848 {
849     uint32x4_t temp[4];
850     temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
851     temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
852     temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
853     temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
854 
855     LEA_Encryption(temp, subkeys, rounds);
856 
857     block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
858     block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
859     block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
860     block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
861 }
862 
LEA_Dec_4_Blocks(uint32x4_t & block0,uint32x4_t & block1,uint32x4_t & block2,uint32x4_t & block3,const word32 * subkeys,unsigned int rounds)863 inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
864     uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
865 {
866     uint32x4_t temp[4];
867     temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
868     temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
869     temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
870     temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
871 
872     LEA_Decryption(temp, subkeys, rounds);
873 
874     block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
875     block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
876     block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
877     block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
878 }
879 
880 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
881 
882 // *************************** IA-32 ***************************//
883 
884 #if (CRYPTOPP_SSSE3_AVAILABLE)
885 
LEA_Enc_Block(__m128i & block0,const word32 * subkeys,unsigned int rounds)886 inline void LEA_Enc_Block(__m128i &block0,
887     const word32 *subkeys, unsigned int rounds)
888 {
889     __m128i temp[4];
890     temp[0] = UnpackXMM<0>(block0);
891     temp[1] = UnpackXMM<1>(block0);
892     temp[2] = UnpackXMM<2>(block0);
893     temp[3] = UnpackXMM<3>(block0);
894 
895     LEA_Encryption(temp, subkeys, rounds);
896 
897     block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
898 }
899 
LEA_Dec_Block(__m128i & block0,const word32 * subkeys,unsigned int rounds)900 inline void LEA_Dec_Block(__m128i &block0,
901     const word32 *subkeys, unsigned int rounds)
902 {
903     __m128i temp[4];
904     temp[0] = UnpackXMM<0>(block0);
905     temp[1] = UnpackXMM<1>(block0);
906     temp[2] = UnpackXMM<2>(block0);
907     temp[3] = UnpackXMM<3>(block0);
908 
909     LEA_Decryption(temp, subkeys, rounds);
910 
911     block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
912 }
913 
LEA_Enc_4_Blocks(__m128i & block0,__m128i & block1,__m128i & block2,__m128i & block3,const word32 * subkeys,unsigned int rounds)914 inline void LEA_Enc_4_Blocks(__m128i &block0, __m128i &block1,
915     __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
916 {
917     __m128i temp[4];
918     temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
919     temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
920     temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
921     temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
922 
923     LEA_Encryption(temp, subkeys, rounds);
924 
925     block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
926     block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
927     block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
928     block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
929 }
930 
LEA_Dec_4_Blocks(__m128i & block0,__m128i & block1,__m128i & block2,__m128i & block3,const word32 * subkeys,unsigned int rounds)931 inline void LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1,
932     __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
933 {
934     __m128i temp[4];
935     temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
936     temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
937     temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
938     temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
939 
940     LEA_Decryption(temp, subkeys, rounds);
941 
942     block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
943     block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
944     block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
945     block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
946 }
947 
948 #endif  // CRYPTOPP_SSSE3_AVAILABLE
949 
950 // *************************** Power8 ***************************//
951 
952 #if (CRYPTOPP_POWER8_AVAILABLE)
953 
LEA_Enc_Block(uint32x4_p & block0,const word32 * subkeys,unsigned int rounds)954 inline void LEA_Enc_Block(uint32x4_p &block0,
955     const word32 *subkeys, unsigned int rounds)
956 {
957     uint32x4_p temp[4];
958     temp[0] = UnpackSIMD<0>(block0);
959     temp[1] = UnpackSIMD<1>(block0);
960     temp[2] = UnpackSIMD<2>(block0);
961     temp[3] = UnpackSIMD<3>(block0);
962 
963     LEA_Encryption(temp, subkeys, rounds);
964 
965     block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
966 }
967 
LEA_Dec_Block(uint32x4_p & block0,const word32 * subkeys,unsigned int rounds)968 inline void LEA_Dec_Block(uint32x4_p &block0,
969     const word32 *subkeys, unsigned int rounds)
970 {
971     uint32x4_p temp[4];
972     temp[0] = UnpackSIMD<0>(block0);
973     temp[1] = UnpackSIMD<1>(block0);
974     temp[2] = UnpackSIMD<2>(block0);
975     temp[3] = UnpackSIMD<3>(block0);
976 
977     LEA_Decryption(temp, subkeys, rounds);
978 
979     block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
980 }
981 
LEA_Enc_4_Blocks(uint32x4_p & block0,uint32x4_p & block1,uint32x4_p & block2,uint32x4_p & block3,const word32 * subkeys,unsigned int rounds)982 inline void LEA_Enc_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
983     uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
984 {
985     uint32x4_p temp[4];
986     temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
987     temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
988     temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
989     temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
990 
991     LEA_Encryption(temp, subkeys, rounds);
992 
993     block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
994     block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
995     block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
996     block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
997 }
998 
LEA_Dec_4_Blocks(uint32x4_p & block0,uint32x4_p & block1,uint32x4_p & block2,uint32x4_p & block3,const word32 * subkeys,unsigned int rounds)999 inline void LEA_Dec_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
1000     uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
1001 {
1002     uint32x4_p temp[4];
1003     temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
1004     temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
1005     temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
1006     temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
1007 
1008     LEA_Decryption(temp, subkeys, rounds);
1009 
1010     block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
1011     block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
1012     block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
1013     block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
1014 }
1015 
1016 #endif  // CRYPTOPP_POWER8_AVAILABLE
1017 
1018 ANONYMOUS_NAMESPACE_END
1019 
1020 // *************************** SIMD Templates ***************************//
1021 
NAMESPACE_BEGIN(CryptoPP)1022 NAMESPACE_BEGIN(CryptoPP)
1023 
1024 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
1025 size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1026     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1027 {
1028     return AdvancedProcessBlocks128_4x1_SSE(LEA_Enc_Block, LEA_Enc_4_Blocks,
1029         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1030 }
1031 
LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1032 size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1033     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1034 {
1035     return AdvancedProcessBlocks128_4x1_SSE(LEA_Dec_Block, LEA_Dec_4_Blocks,
1036         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1037 }
1038 #endif // CRYPTOPP_SSSE3_AVAILABLE
1039 
1040 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
LEA_Enc_AdvancedProcessBlocks_NEON(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1041 size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1042     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1043 {
1044     return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks,
1045         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1046 }
1047 
LEA_Dec_AdvancedProcessBlocks_NEON(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1048 size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1049     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1050 {
1051     return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks,
1052         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1053 }
1054 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
1055 
1056 #if defined(CRYPTOPP_POWER8_AVAILABLE)
LEA_Enc_AdvancedProcessBlocks_POWER8(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1057 size_t LEA_Enc_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1058     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1059 {
1060     return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Enc_Block, LEA_Enc_4_Blocks,
1061         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1062 }
1063 
LEA_Dec_AdvancedProcessBlocks_POWER8(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1064 size_t LEA_Dec_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1065     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1066 {
1067     return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Dec_Block, LEA_Dec_4_Blocks,
1068         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1069 }
1070 #endif // CRYPTOPP_POWER8_AVAILABLE
1071 
1072 NAMESPACE_END
1073