1 // lea_simd.cpp - written and placed in the public domain by Jeffrey Walton
2 //
3 //    This source file uses intrinsics and built-ins to gain access to
4 //    SSSE3, ARM NEON and ARMv8a, and Power8 Altivec instructions. A separate
5 //    source file is needed because additional CXXFLAGS are required to enable
6 //    the appropriate instruction sets in some build configurations.
7 
8 #include "pch.h"
9 #include "config.h"
10 
11 #include "lea.h"
12 #include "misc.h"
13 
14 // Uncomment for benchmarking C++ against SSE or NEON.
15 // Do so in both lea.cpp and lea_simd.cpp.
16 // #undef CRYPTOPP_SSSE3_AVAILABLE
17 // #undef CRYPTOPP_ARM_NEON_AVAILABLE
18 
19 #if (CRYPTOPP_SSSE3_AVAILABLE)
20 # include "adv_simd.h"
21 # include <pmmintrin.h>
22 # include <tmmintrin.h>
23 #endif
24 
25 #if defined(__XOP__)
26 # include <ammintrin.h>
27 # if defined(__GNUC__)
28 #  include <x86intrin.h>
29 # endif
30 #endif
31 
32 #if defined(__AVX512F__)
33 # define CRYPTOPP_AVX512_ROTATE 1
34 # include <immintrin.h>
35 #endif
36 
37 #if (CRYPTOPP_ARM_NEON_HEADER)
38 # include "adv_simd.h"
39 # include <arm_neon.h>
40 #endif
41 
42 #if (CRYPTOPP_ARM_ACLE_HEADER)
43 # include <stdint.h>
44 # include <arm_acle.h>
45 #endif
46 
47 #if defined(_M_ARM64)
48 # include "adv_simd.h"
49 #endif
50 
51 // Do not port this to the POWER architecture. Naively we hoped
52 // for a 2x to 3x speedup. The result was a 5x slowdown.
53 // The tables below show MiB/s and cpb (cycles per byte).
54 //
55 // C++:
56 // <TD>LEA-128(128)/CTR (128-bit key)<TD>C++<TD>207<TD>15.64
57 // <TD>LEA-128(192)/CTR (192-bit key)<TD>C++<TD>186<TD>17.48
58 // <TD>LEA-128(256)/CTR (256-bit key)<TD>C++<TD>124<TD>26.2
59 //
60 // Power8:
61 // <TD>LEA-128(128)/CTR (128-bit key)<TD>Power8<TD>37<TD>88.7
62 // <TD>LEA-128(192)/CTR (192-bit key)<TD>Power8<TD>40<TD>82.1
63 // <TD>LEA-128(256)/CTR (256-bit key)<TD>Power8<TD>28<TD>116.0
64 
65 #undef CRYPTOPP_POWER8_AVAILABLE
66 #if defined(CRYPTOPP_POWER8_AVAILABLE)
67 # include "adv_simd.h"
68 # include "ppc_simd.h"
69 #endif
70 
71 // Squash MS LNK4221 and libtool warnings
72 extern const char LEA_SIMD_FNAME[] = __FILE__;
73 
74 ANONYMOUS_NAMESPACE_BEGIN
75 
76 using CryptoPP::word32;
77 
78 // *************************** ARM NEON ***************************//
79 
80 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
81 
// Bitwise exclusive-or of two NEON vectors holding four 32-bit lanes.
inline uint32x4_t Xor(const uint32x4_t& a, const uint32x4_t& b)
{
    const uint32x4_t result = veorq_u32(a, b);
    return result;
}
86 
// Lane-wise 32-bit addition of two NEON vectors (vaddq_u32).
inline uint32x4_t Add(const uint32x4_t& a, const uint32x4_t& b)
{
    const uint32x4_t sum = vaddq_u32(a, b);
    return sum;
}
91 
// Lane-wise 32-bit subtraction a - b of two NEON vectors (vsubq_u32).
inline uint32x4_t Sub(const uint32x4_t& a, const uint32x4_t& b)
{
    const uint32x4_t difference = vsubq_u32(a, b);
    return difference;
}
96 
97 template <unsigned int R>
RotateLeft(const uint32x4_t & val)98 inline uint32x4_t RotateLeft(const uint32x4_t& val)
99 {
100     const uint32x4_t a(vshlq_n_u32(val, R));
101     const uint32x4_t b(vshrq_n_u32(val, 32 - R));
102     return vorrq_u32(a, b);
103 }
104 
105 template <unsigned int R>
RotateRight(const uint32x4_t & val)106 inline uint32x4_t RotateRight(const uint32x4_t& val)
107 {
108     const uint32x4_t a(vshlq_n_u32(val, 32 - R));
109     const uint32x4_t b(vshrq_n_u32(val, R));
110     return vorrq_u32(a, b);
111 }
112 
113 #if defined(__aarch32__) || defined(__aarch64__)
114 template <>
RotateLeft(const uint32x4_t & val)115 inline uint32x4_t RotateLeft<8>(const uint32x4_t& val)
116 {
117 #if (CRYPTOPP_BIG_ENDIAN)
118     const uint8_t maskb[16] = { 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3 };
119     const uint8x16_t mask = vld1q_u8(maskb);
120 #else
121     const uint8_t maskb[16] = { 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 };
122     const uint8x16_t mask = vld1q_u8(maskb);
123 #endif
124 
125     return vreinterpretq_u32_u8(
126         vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
127 }
128 
129 template <>
RotateRight(const uint32x4_t & val)130 inline uint32x4_t RotateRight<8>(const uint32x4_t& val)
131 {
132 #if (CRYPTOPP_BIG_ENDIAN)
133     const uint8_t maskb[16] = { 12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1 };
134     const uint8x16_t mask = vld1q_u8(maskb);
135 #else
136     const uint8_t maskb[16] = { 1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,14,12 };
137     const uint8x16_t mask = vld1q_u8(maskb);
138 #endif
139 
140     return vreinterpretq_u32_u8(
141         vqtbl1q_u8(vreinterpretq_u8_u32(val), mask));
142 }
143 #endif
144 
UnpackLow32(uint32x4_t a,uint32x4_t b)145 uint32x4_t UnpackLow32(uint32x4_t a, uint32x4_t b)
146 {
147     uint32x2_t a1 = vget_low_u32(a);
148     uint32x2_t b1 = vget_low_u32(b);
149     uint32x2x2_t result = vzip_u32(a1, b1);
150     return vcombine_u32(result.val[0], result.val[1]);
151 }
152 
UnpackHigh32(uint32x4_t a,uint32x4_t b)153 uint32x4_t UnpackHigh32(uint32x4_t a, uint32x4_t b)
154 {
155     uint32x2_t a1 = vget_high_u32(a);
156     uint32x2_t b1 = vget_high_u32(b);
157     uint32x2x2_t result = vzip_u32(a1, b1);
158     return vcombine_u32(result.val[0], result.val[1]);
159 }
160 
UnpackLow64(uint32x4_t a,uint32x4_t b)161 uint32x4_t UnpackLow64(uint32x4_t a, uint32x4_t b)
162 {
163     uint64x1_t a1 = vget_low_u64((uint64x2_t)a);
164     uint64x1_t b1 = vget_low_u64((uint64x2_t)b);
165     return (uint32x4_t)vcombine_u64(a1, b1);
166 }
167 
UnpackHigh64(uint32x4_t a,uint32x4_t b)168 uint32x4_t UnpackHigh64(uint32x4_t a, uint32x4_t b)
169 {
170     uint64x1_t a1 = vget_high_u64((uint64x2_t)a);
171     uint64x1_t b1 = vget_high_u64((uint64x2_t)b);
172     return (uint32x4_t)vcombine_u64(a1, b1);
173 }
174 
175 template <unsigned int IDX>
LoadKey(const word32 rkey[])176 inline uint32x4_t LoadKey(const word32 rkey[])
177 {
178     return vdupq_n_u32(rkey[IDX]);
179 }
180 
181 template <unsigned int IDX>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)182 inline uint32x4_t UnpackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
183 {
184     // Should not be instantiated
185     CRYPTOPP_ASSERT(0);
186 
187     CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
188     CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
189     return vmovq_n_u32(0);
190 }
191 
192 template <>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)193 inline uint32x4_t UnpackNEON<0>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
194 {
195     const uint32x4_t r1 = UnpackLow32(a, b);
196     const uint32x4_t r2 = UnpackLow32(c, d);
197     return UnpackLow64(r1, r2);
198 }
199 
200 template <>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)201 inline uint32x4_t UnpackNEON<1>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
202 {
203     const uint32x4_t r1 = UnpackLow32(a, b);
204     const uint32x4_t r2 = UnpackLow32(c, d);
205     return UnpackHigh64(r1, r2);
206 }
207 
208 template <>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)209 inline uint32x4_t UnpackNEON<2>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
210 {
211     const uint32x4_t r1 = UnpackHigh32(a, b);
212     const uint32x4_t r2 = UnpackHigh32(c, d);
213     return UnpackLow64(r1, r2);
214 }
215 
216 template <>
UnpackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)217 inline uint32x4_t UnpackNEON<3>(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
218 {
219     const uint32x4_t r1 = UnpackHigh32(a, b);
220     const uint32x4_t r2 = UnpackHigh32(c, d);
221     return UnpackHigh64(r1, r2);
222 }
223 
224 template <unsigned int IDX>
UnpackNEON(const uint32x4_t & v)225 inline uint32x4_t UnpackNEON(const uint32x4_t& v)
226 {
227     // Should not be instantiated
228     CRYPTOPP_ASSERT(0);
229 
230     CRYPTOPP_UNUSED(v);
231     return vmovq_n_u32(0);
232 }
233 
234 template <>
UnpackNEON(const uint32x4_t & v)235 inline uint32x4_t UnpackNEON<0>(const uint32x4_t& v)
236 {
237     // Splat to all lanes
238     return vdupq_n_u32(vgetq_lane_u32(v, 0));
239 }
240 
241 template <>
UnpackNEON(const uint32x4_t & v)242 inline uint32x4_t UnpackNEON<1>(const uint32x4_t& v)
243 {
244     // Splat to all lanes
245     return vdupq_n_u32(vgetq_lane_u32(v, 1));
246 }
247 
248 template <>
UnpackNEON(const uint32x4_t & v)249 inline uint32x4_t UnpackNEON<2>(const uint32x4_t& v)
250 {
251     // Splat to all lanes
252     return vdupq_n_u32(vgetq_lane_u32(v, 2));
253 }
254 
255 template <>
UnpackNEON(const uint32x4_t & v)256 inline uint32x4_t UnpackNEON<3>(const uint32x4_t& v)
257 {
258     // Splat to all lanes
259     return vdupq_n_u32(vgetq_lane_u32(v, 3));
260 }
261 
262 template <unsigned int IDX>
RepackNEON(const uint32x4_t & a,const uint32x4_t & b,const uint32x4_t & c,const uint32x4_t & d)263 inline uint32x4_t RepackNEON(const uint32x4_t& a, const uint32x4_t& b, const uint32x4_t& c, const uint32x4_t& d)
264 {
265     return UnpackNEON<IDX>(a, b, c, d);
266 }
267 
268 template <unsigned int IDX>
RepackNEON(const uint32x4_t & v)269 inline uint32x4_t RepackNEON(const uint32x4_t& v)
270 {
271     return UnpackNEON<IDX>(v);
272 }
273 
274 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
275 
276 // *************************** IA-32 ***************************//
277 
278 #if (CRYPTOPP_SSSE3_AVAILABLE)
279 
// Bitwise exclusive-or of two 128-bit integer vectors.
inline __m128i Xor(const __m128i& a, const __m128i& b)
{
    const __m128i result = _mm_xor_si128(a, b);
    return result;
}
284 
// Lane-wise 32-bit addition of two 128-bit vectors (_mm_add_epi32).
inline __m128i Add(const __m128i& a, const __m128i& b)
{
    const __m128i sum = _mm_add_epi32(a, b);
    return sum;
}
289 
// Lane-wise 32-bit subtraction a - b of two 128-bit vectors (_mm_sub_epi32).
inline __m128i Sub(const __m128i& a, const __m128i& b)
{
    const __m128i difference = _mm_sub_epi32(a, b);
    return difference;
}
294 
295 template <unsigned int R>
RotateLeft(const __m128i & val)296 inline __m128i RotateLeft(const __m128i& val)
297 {
298 #if defined(__XOP__)
299     return _mm_roti_epi32(val, R);
300 #else
301     return _mm_or_si128(
302         _mm_slli_epi32(val, R), _mm_srli_epi32(val, 32-R));
303 #endif
304 }
305 
306 template <unsigned int R>
RotateRight(const __m128i & val)307 inline __m128i RotateRight(const __m128i& val)
308 {
309 #if defined(__XOP__)
310     return _mm_roti_epi32(val, 32-R);
311 #else
312     return _mm_or_si128(
313         _mm_slli_epi32(val, 32-R), _mm_srli_epi32(val, R));
314 #endif
315 }
316 
317 // Faster than two Shifts and an Or.
318 template <>
RotateLeft(const __m128i & val)319 inline __m128i RotateLeft<8>(const __m128i& val)
320 {
321 #if defined(__XOP__)
322     return _mm_roti_epi32(val, 8);
323 #else
324     const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
325     return _mm_shuffle_epi8(val, mask);
326 #endif
327 }
328 
329 // Faster than two Shifts and an Or.
330 template <>
RotateRight(const __m128i & val)331 inline __m128i RotateRight<8>(const __m128i& val)
332 {
333 #if defined(__XOP__)
334     return _mm_roti_epi32(val, 32-8);
335 #else
336     const __m128i mask = _mm_set_epi8(12,15,14,13, 8,11,10,9, 4,7,6,5, 0,3,2,1);
337     return _mm_shuffle_epi8(val, mask);
338 #endif
339 }
340 
341 template <unsigned int IDX>
LoadKey(const word32 rkey[])342 inline __m128i LoadKey(const word32 rkey[])
343 {
344     float rk; std::memcpy(&rk, rkey+IDX, sizeof(rk));
345     return _mm_castps_si128(_mm_load_ps1(&rk));
346 }
347 
348 template <unsigned int IDX>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)349 inline __m128i UnpackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
350 {
351     // Should not be instantiated
352     CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
353     CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
354     CRYPTOPP_ASSERT(0);
355     return _mm_setzero_si128();
356 }
357 
358 template <>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)359 inline __m128i UnpackXMM<0>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
360 {
361     // LEA is little-endian oriented, so there is no need for a separate shuffle.
362     const __m128i r1 = _mm_unpacklo_epi32(a, b);
363     const __m128i r2 = _mm_unpacklo_epi32(c, d);
364     return _mm_unpacklo_epi64(r1, r2);
365 }
366 
367 template <>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)368 inline __m128i UnpackXMM<1>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
369 {
370     // LEA is little-endian oriented, so there is no need for a separate shuffle.
371     const __m128i r1 = _mm_unpacklo_epi32(a, b);
372     const __m128i r2 = _mm_unpacklo_epi32(c, d);
373     return _mm_unpackhi_epi64(r1, r2);
374 }
375 
376 template <>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)377 inline __m128i UnpackXMM<2>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
378 {
379     // LEA is little-endian oriented, so there is no need for a separate shuffle.
380     const __m128i r1 = _mm_unpackhi_epi32(a, b);
381     const __m128i r2 = _mm_unpackhi_epi32(c, d);
382     return _mm_unpacklo_epi64(r1, r2);
383 }
384 
385 template <>
UnpackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)386 inline __m128i UnpackXMM<3>(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
387 {
388     // LEA is little-endian oriented, so there is no need for a separate shuffle.
389     const __m128i r1 = _mm_unpackhi_epi32(a, b);
390     const __m128i r2 = _mm_unpackhi_epi32(c, d);
391     return _mm_unpackhi_epi64(r1, r2);
392 }
393 
394 template <unsigned int IDX>
UnpackXMM(const __m128i & v)395 inline __m128i UnpackXMM(const __m128i& v)
396 {
397     // Should not be instantiated
398     CRYPTOPP_UNUSED(v); CRYPTOPP_ASSERT(0);
399     return _mm_setzero_si128();
400 }
401 
402 template <>
UnpackXMM(const __m128i & v)403 inline __m128i UnpackXMM<0>(const __m128i& v)
404 {
405     // Splat to all lanes
406     return _mm_shuffle_epi8(v, _mm_set_epi8(3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0));
407 }
408 
409 template <>
UnpackXMM(const __m128i & v)410 inline __m128i UnpackXMM<1>(const __m128i& v)
411 {
412     // Splat to all lanes
413     return _mm_shuffle_epi8(v, _mm_set_epi8(7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4));
414 }
415 
416 template <>
UnpackXMM(const __m128i & v)417 inline __m128i UnpackXMM<2>(const __m128i& v)
418 {
419     // Splat to all lanes
420     return _mm_shuffle_epi8(v, _mm_set_epi8(11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8));
421 }
422 
423 template <>
UnpackXMM(const __m128i & v)424 inline __m128i UnpackXMM<3>(const __m128i& v)
425 {
426     // Splat to all lanes
427     return _mm_shuffle_epi8(v, _mm_set_epi8(15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12));
428 }
429 
430 template <unsigned int IDX>
RepackXMM(const __m128i & a,const __m128i & b,const __m128i & c,const __m128i & d)431 inline __m128i RepackXMM(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
432 {
433     return UnpackXMM<IDX>(a, b, c, d);
434 }
435 
436 template <unsigned int IDX>
RepackXMM(const __m128i & v)437 inline __m128i RepackXMM(const __m128i& v)
438 {
439     return UnpackXMM<IDX>(v);
440 }
441 
442 #endif  // CRYPTOPP_SSSE3_AVAILABLE
443 
444 // *************************** Power8 ***************************//
445 
446 #if (CRYPTOPP_POWER8_AVAILABLE)
447 
448 using CryptoPP::uint8x16_p;
449 using CryptoPP::uint32x4_p;
450 using CryptoPP::uint64x2_p;
451 
// Exclusive-or of two Power vectors via the library's VecXor helper.
// The helper is qualified with CryptoPP:: because this section only
// imports the vector typedefs with using-declarations, so an
// unqualified VecXor is not visible here.
inline uint32x4_p Xor(const uint32x4_p& a, const uint32x4_p& b)
{
    return CryptoPP::VecXor(a, b);
}
456 
// Addition of two Power vectors via the library's VecAdd helper.
// The helper is qualified with CryptoPP:: because this section only
// imports the vector typedefs with using-declarations, so an
// unqualified VecAdd is not visible here.
inline uint32x4_p Add(const uint32x4_p& a, const uint32x4_p& b)
{
    return CryptoPP::VecAdd(a, b);
}
461 
// Subtraction a - b of two Power vectors via the library's VecSub helper.
// The helper is qualified with CryptoPP:: because this section only
// imports the vector typedefs with using-declarations, so an
// unqualified VecSub is not visible here.
inline uint32x4_p Sub(const uint32x4_p& a, const uint32x4_p& b)
{
    return CryptoPP::VecSub(a, b);
}
466 
467 template <unsigned int R>
RotateLeft(const uint32x4_p & val)468 inline uint32x4_p RotateLeft(const uint32x4_p& val)
469 {
470     const uint32x4_p m = {R, R, R, R};
471     return vec_rl(val, m);
472 }
473 
474 template <unsigned int R>
RotateRight(const uint32x4_p & val)475 inline uint32x4_p RotateRight(const uint32x4_p& val)
476 {
477     const uint32x4_p m = {32-R, 32-R, 32-R, 32-R};
478     return vec_rl(val, m);
479 }
480 
481 template <unsigned int IDX>
LoadKey(const word32 rkey[])482 inline uint32x4_p LoadKey(const word32 rkey[])
483 {
484     return vec_splats(rkey[IDX]);
485 }
486 
487 template <unsigned int IDX>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)488 inline uint32x4_p UnpackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
489 {
490     // Should not be instantiated
491     CRYPTOPP_UNUSED(a); CRYPTOPP_UNUSED(b);
492     CRYPTOPP_UNUSED(c); CRYPTOPP_UNUSED(d);
493     CRYPTOPP_ASSERT(0);
494     return VecXor(a, a);
495 }
496 
497 template <>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)498 inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
499 {
500     const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
501     const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
502     return (uint32x4_p)vec_mergel(r1, r2);
503 }
504 
505 template <>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)506 inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
507 {
508     const uint64x2_p r1 = (uint64x2_p)vec_mergel(a, b);
509     const uint64x2_p r2 = (uint64x2_p)vec_mergel(c, d);
510     return (uint32x4_p)vec_mergeh(r1, r2);
511 }
512 
513 template <>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)514 inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
515 {
516     const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
517     const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
518     return (uint32x4_p)vec_mergel(r1, r2);
519 }
520 
521 template <>
UnpackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)522 inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
523 {
524     const uint64x2_p r1 = (uint64x2_p)vec_mergeh(a, b);
525     const uint64x2_p r2 = (uint64x2_p)vec_mergeh(c, d);
526     return (uint32x4_p)vec_mergeh(r1, r2);
527 }
528 
529 template <unsigned int IDX>
UnpackSIMD(const uint32x4_p & v)530 inline uint32x4_p UnpackSIMD(const uint32x4_p& v)
531 {
532     // Should not be instantiated
533     CRYPTOPP_ASSERT(0);
534     return VecXor(v, v);
535 }
536 
537 template <>
UnpackSIMD(const uint32x4_p & v)538 inline uint32x4_p UnpackSIMD<0>(const uint32x4_p& v)
539 {
540     // Splat to all lanes
541     const uint8x16_p m = {3,2,1,0, 3,2,1,0, 3,2,1,0, 3,2,1,0};
542     return (uint32x4_p)VecPermute(v, v, m);
543 }
544 
545 template <>
UnpackSIMD(const uint32x4_p & v)546 inline uint32x4_p UnpackSIMD<1>(const uint32x4_p& v)
547 {
548     // Splat to all lanes
549     const uint8x16_p m = {7,6,5,4, 7,6,5,4, 7,6,5,4, 7,6,5,4};
550     return (uint32x4_p)VecPermute(v, v, m);
551 }
552 
553 template <>
UnpackSIMD(const uint32x4_p & v)554 inline uint32x4_p UnpackSIMD<2>(const uint32x4_p& v)
555 {
556     // Splat to all lanes
557     const uint8x16_p m = {11,10,9,8, 11,10,9,8, 11,10,9,8, 11,10,9,8};
558     return (uint32x4_p)VecPermute(v, v, m);
559 }
560 
561 template <>
UnpackSIMD(const uint32x4_p & v)562 inline uint32x4_p UnpackSIMD<3>(const uint32x4_p& v)
563 {
564     // Splat to all lanes
565     const uint8x16_p m = {15,14,13,12, 15,14,13,12, 15,14,13,12, 15,14,13,12};
566     return (uint32x4_p)VecPermute(v, v, m);
567 }
568 
569 template <unsigned int IDX>
RepackSIMD(const uint32x4_p & a,const uint32x4_p & b,const uint32x4_p & c,const uint32x4_p & d)570 inline uint32x4_p RepackSIMD(const uint32x4_p& a, const uint32x4_p& b, const uint32x4_p& c, const uint32x4_p& d)
571 {
572     return UnpackSIMD<IDX>(a, b, c, d);
573 }
574 
575 template <unsigned int IDX>
RepackSIMD(const uint32x4_p & v)576 inline uint32x4_p RepackSIMD(const uint32x4_p& v)
577 {
578     return UnpackSIMD<IDX>(v);
579 }
580 
581 #endif  // CRYPTOPP_POWER8_AVAILABLE
582 
583 // *************************** LEA Encryption ***************************//
584 
585 #if (CRYPTOPP_ARM_NEON_AVAILABLE || CRYPTOPP_SSSE3_AVAILABLE)
586 
587 template <class W>
LEA_Encryption(W temp[4],const word32 * subkeys,unsigned int rounds)588 inline void LEA_Encryption(W temp[4], const word32 *subkeys, unsigned int rounds)
589 {
590     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<4>(subkeys)), Xor(temp[3], LoadKey<5>(subkeys))));
591     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<2>(subkeys)), Xor(temp[2], LoadKey<3>(subkeys))));
592     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<0>(subkeys)), Xor(temp[1], LoadKey<1>(subkeys))));
593     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<10>(subkeys)), Xor(temp[0], LoadKey<11>(subkeys))));
594     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<8>(subkeys)), Xor(temp[3], LoadKey<9>(subkeys))));
595     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<6>(subkeys)), Xor(temp[2], LoadKey<7>(subkeys))));
596     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<16>(subkeys)), Xor(temp[1], LoadKey<17>(subkeys))));
597     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<14>(subkeys)), Xor(temp[0], LoadKey<15>(subkeys))));
598     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<12>(subkeys)), Xor(temp[3], LoadKey<13>(subkeys))));
599     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<22>(subkeys)), Xor(temp[2], LoadKey<23>(subkeys))));
600     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<20>(subkeys)), Xor(temp[1], LoadKey<21>(subkeys))));
601     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<18>(subkeys)), Xor(temp[0], LoadKey<19>(subkeys))));
602 
603     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<28>(subkeys)), Xor(temp[3], LoadKey<29>(subkeys))));
604     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<26>(subkeys)), Xor(temp[2], LoadKey<27>(subkeys))));
605     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<24>(subkeys)), Xor(temp[1], LoadKey<25>(subkeys))));
606     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<34>(subkeys)), Xor(temp[0], LoadKey<35>(subkeys))));
607     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<32>(subkeys)), Xor(temp[3], LoadKey<33>(subkeys))));
608     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<30>(subkeys)), Xor(temp[2], LoadKey<31>(subkeys))));
609     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<40>(subkeys)), Xor(temp[1], LoadKey<41>(subkeys))));
610     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<38>(subkeys)), Xor(temp[0], LoadKey<39>(subkeys))));
611     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<36>(subkeys)), Xor(temp[3], LoadKey<37>(subkeys))));
612     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<46>(subkeys)), Xor(temp[2], LoadKey<47>(subkeys))));
613     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<44>(subkeys)), Xor(temp[1], LoadKey<45>(subkeys))));
614     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<42>(subkeys)), Xor(temp[0], LoadKey<43>(subkeys))));
615 
616     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<52>(subkeys)), Xor(temp[3], LoadKey<53>(subkeys))));
617     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<50>(subkeys)), Xor(temp[2], LoadKey<51>(subkeys))));
618     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<48>(subkeys)), Xor(temp[1], LoadKey<49>(subkeys))));
619     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<58>(subkeys)), Xor(temp[0], LoadKey<59>(subkeys))));
620     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<56>(subkeys)), Xor(temp[3], LoadKey<57>(subkeys))));
621     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<54>(subkeys)), Xor(temp[2], LoadKey<55>(subkeys))));
622     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<64>(subkeys)), Xor(temp[1], LoadKey<65>(subkeys))));
623     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<62>(subkeys)), Xor(temp[0], LoadKey<63>(subkeys))));
624     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<60>(subkeys)), Xor(temp[3], LoadKey<61>(subkeys))));
625     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<70>(subkeys)), Xor(temp[2], LoadKey<71>(subkeys))));
626     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<68>(subkeys)), Xor(temp[1], LoadKey<69>(subkeys))));
627     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<66>(subkeys)), Xor(temp[0], LoadKey<67>(subkeys))));
628 
629     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<76>(subkeys)), Xor(temp[3], LoadKey<77>(subkeys))));
630     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<74>(subkeys)), Xor(temp[2], LoadKey<75>(subkeys))));
631     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<72>(subkeys)), Xor(temp[1], LoadKey<73>(subkeys))));
632     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<82>(subkeys)), Xor(temp[0], LoadKey<83>(subkeys))));
633     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<80>(subkeys)), Xor(temp[3], LoadKey<81>(subkeys))));
634     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<78>(subkeys)), Xor(temp[2], LoadKey<79>(subkeys))));
635     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<88>(subkeys)), Xor(temp[1], LoadKey<89>(subkeys))));
636     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<86>(subkeys)), Xor(temp[0], LoadKey<87>(subkeys))));
637     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<84>(subkeys)), Xor(temp[3], LoadKey<85>(subkeys))));
638     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<94>(subkeys)), Xor(temp[2], LoadKey<95>(subkeys))));
639     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<92>(subkeys)), Xor(temp[1], LoadKey<93>(subkeys))));
640     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<90>(subkeys)), Xor(temp[0], LoadKey<91>(subkeys))));
641 
642     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<100>(subkeys)), Xor(temp[3], LoadKey<101>(subkeys))));
643     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<98>(subkeys)), Xor(temp[2], LoadKey<99>(subkeys))));
644     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<96>(subkeys)), Xor(temp[1], LoadKey<97>(subkeys))));
645     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<106>(subkeys)), Xor(temp[0], LoadKey<107>(subkeys))));
646     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<104>(subkeys)), Xor(temp[3], LoadKey<105>(subkeys))));
647     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<102>(subkeys)), Xor(temp[2], LoadKey<103>(subkeys))));
648     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<112>(subkeys)), Xor(temp[1], LoadKey<113>(subkeys))));
649     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<110>(subkeys)), Xor(temp[0], LoadKey<111>(subkeys))));
650     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<108>(subkeys)), Xor(temp[3], LoadKey<109>(subkeys))));
651     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<118>(subkeys)), Xor(temp[2], LoadKey<119>(subkeys))));
652     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<116>(subkeys)), Xor(temp[1], LoadKey<117>(subkeys))));
653     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<114>(subkeys)), Xor(temp[0], LoadKey<115>(subkeys))));
654 
655     temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<124>(subkeys)), Xor(temp[3], LoadKey<125>(subkeys))));
656     temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<122>(subkeys)), Xor(temp[2], LoadKey<123>(subkeys))));
657     temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<120>(subkeys)), Xor(temp[1], LoadKey<121>(subkeys))));
658     temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<130>(subkeys)), Xor(temp[0], LoadKey<131>(subkeys))));
659     temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<128>(subkeys)), Xor(temp[3], LoadKey<129>(subkeys))));
660     temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<126>(subkeys)), Xor(temp[2], LoadKey<127>(subkeys))));
661     temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<136>(subkeys)), Xor(temp[1], LoadKey<137>(subkeys))));
662     temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<134>(subkeys)), Xor(temp[0], LoadKey<135>(subkeys))));
663     temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<132>(subkeys)), Xor(temp[3], LoadKey<133>(subkeys))));
664     temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<142>(subkeys)), Xor(temp[2], LoadKey<143>(subkeys))));
665     temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<140>(subkeys)), Xor(temp[1], LoadKey<141>(subkeys))));
666     temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<138>(subkeys)), Xor(temp[0], LoadKey<139>(subkeys))));
667 
668     if(rounds > 24)
669     {
670         temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<148>(subkeys)), Xor(temp[3], LoadKey<149>(subkeys))));
671         temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<146>(subkeys)), Xor(temp[2], LoadKey<147>(subkeys))));
672         temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<144>(subkeys)), Xor(temp[1], LoadKey<145>(subkeys))));
673         temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<154>(subkeys)), Xor(temp[0], LoadKey<155>(subkeys))));
674         temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<152>(subkeys)), Xor(temp[3], LoadKey<153>(subkeys))));
675         temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<150>(subkeys)), Xor(temp[2], LoadKey<151>(subkeys))));
676         temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<160>(subkeys)), Xor(temp[1], LoadKey<161>(subkeys))));
677         temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<158>(subkeys)), Xor(temp[0], LoadKey<159>(subkeys))));
678         temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<156>(subkeys)), Xor(temp[3], LoadKey<157>(subkeys))));
679         temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<166>(subkeys)), Xor(temp[2], LoadKey<167>(subkeys))));
680         temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<164>(subkeys)), Xor(temp[1], LoadKey<165>(subkeys))));
681         temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<162>(subkeys)), Xor(temp[0], LoadKey<163>(subkeys))));
682     }
683 
684     if(rounds > 28)
685     {
686         temp[3] = RotateRight<3>(Add(Xor(temp[2], LoadKey<172>(subkeys)), Xor(temp[3], LoadKey<173>(subkeys))));
687         temp[2] = RotateRight<5>(Add(Xor(temp[1], LoadKey<170>(subkeys)), Xor(temp[2], LoadKey<171>(subkeys))));
688         temp[1] = RotateLeft<9>(Add(Xor(temp[0], LoadKey<168>(subkeys)), Xor(temp[1], LoadKey<169>(subkeys))));
689         temp[0] = RotateRight<3>(Add(Xor(temp[3], LoadKey<178>(subkeys)), Xor(temp[0], LoadKey<179>(subkeys))));
690         temp[3] = RotateRight<5>(Add(Xor(temp[2], LoadKey<176>(subkeys)), Xor(temp[3], LoadKey<177>(subkeys))));
691         temp[2] = RotateLeft<9>(Add(Xor(temp[1], LoadKey<174>(subkeys)), Xor(temp[2], LoadKey<175>(subkeys))));
692         temp[1] = RotateRight<3>(Add(Xor(temp[0], LoadKey<184>(subkeys)), Xor(temp[1], LoadKey<185>(subkeys))));
693         temp[0] = RotateRight<5>(Add(Xor(temp[3], LoadKey<182>(subkeys)), Xor(temp[0], LoadKey<183>(subkeys))));
694         temp[3] = RotateLeft<9>(Add(Xor(temp[2], LoadKey<180>(subkeys)), Xor(temp[3], LoadKey<181>(subkeys))));
695         temp[2] = RotateRight<3>(Add(Xor(temp[1], LoadKey<190>(subkeys)), Xor(temp[2], LoadKey<191>(subkeys))));
696         temp[1] = RotateRight<5>(Add(Xor(temp[0], LoadKey<188>(subkeys)), Xor(temp[1], LoadKey<189>(subkeys))));
697         temp[0] = RotateLeft<9>(Add(Xor(temp[3], LoadKey<186>(subkeys)), Xor(temp[0], LoadKey<187>(subkeys))));
698     }
699 }
700 
701 // *************************** LEA Decryption ***************************//
702 
703 template <class W>
LEA_Decryption(W temp[4],const word32 * subkeys,unsigned int rounds)704 inline void LEA_Decryption(W temp[4], const word32 *subkeys, unsigned int rounds)
705 {
706     if(rounds > 28)
707     {
708         temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<186>(subkeys))), LoadKey<187>(subkeys));
709         temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<188>(subkeys))), LoadKey<189>(subkeys));
710         temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<190>(subkeys))), LoadKey<191>(subkeys));
711         temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<180>(subkeys))), LoadKey<181>(subkeys));
712         temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<182>(subkeys))), LoadKey<183>(subkeys));
713         temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<184>(subkeys))), LoadKey<185>(subkeys));
714         temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<174>(subkeys))), LoadKey<175>(subkeys));
715         temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<176>(subkeys))), LoadKey<177>(subkeys));
716         temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<178>(subkeys))), LoadKey<179>(subkeys));
717         temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<168>(subkeys))), LoadKey<169>(subkeys));
718         temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<170>(subkeys))), LoadKey<171>(subkeys));
719         temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<172>(subkeys))), LoadKey<173>(subkeys));
720     }
721 
722     if(rounds > 24)
723     {
724         temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<162>(subkeys))), LoadKey<163>(subkeys));
725         temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<164>(subkeys))), LoadKey<165>(subkeys));
726         temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<166>(subkeys))), LoadKey<167>(subkeys));
727         temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<156>(subkeys))), LoadKey<157>(subkeys));
728         temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<158>(subkeys))), LoadKey<159>(subkeys));
729         temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<160>(subkeys))), LoadKey<161>(subkeys));
730         temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<150>(subkeys))), LoadKey<151>(subkeys));
731         temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<152>(subkeys))), LoadKey<153>(subkeys));
732         temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<154>(subkeys))), LoadKey<155>(subkeys));
733         temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<144>(subkeys))), LoadKey<145>(subkeys));
734         temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<146>(subkeys))), LoadKey<147>(subkeys));
735         temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<148>(subkeys))), LoadKey<149>(subkeys));
736     }
737 
738     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<138>(subkeys))), LoadKey<139>(subkeys));
739     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<140>(subkeys))), LoadKey<141>(subkeys));
740     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<142>(subkeys))), LoadKey<143>(subkeys));
741     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<132>(subkeys))), LoadKey<133>(subkeys));
742     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<134>(subkeys))), LoadKey<135>(subkeys));
743     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<136>(subkeys))), LoadKey<137>(subkeys));
744     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<126>(subkeys))), LoadKey<127>(subkeys));
745     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<128>(subkeys))), LoadKey<129>(subkeys));
746     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<130>(subkeys))), LoadKey<131>(subkeys));
747     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<120>(subkeys))), LoadKey<121>(subkeys));
748     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<122>(subkeys))), LoadKey<123>(subkeys));
749     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<124>(subkeys))), LoadKey<125>(subkeys));
750 
751     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<114>(subkeys))), LoadKey<115>(subkeys));
752     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<116>(subkeys))), LoadKey<117>(subkeys));
753     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<118>(subkeys))), LoadKey<119>(subkeys));
754     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<108>(subkeys))), LoadKey<109>(subkeys));
755     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<110>(subkeys))), LoadKey<111>(subkeys));
756     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<112>(subkeys))), LoadKey<113>(subkeys));
757     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<102>(subkeys))), LoadKey<103>(subkeys));
758     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<104>(subkeys))), LoadKey<105>(subkeys));
759     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<106>(subkeys))), LoadKey<107>(subkeys));
760     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<96>(subkeys))), LoadKey<97>(subkeys));
761     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<98>(subkeys))), LoadKey<99>(subkeys));
762     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<100>(subkeys))), LoadKey<101>(subkeys));
763 
764     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<90>(subkeys))), LoadKey<91>(subkeys));
765     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<92>(subkeys))), LoadKey<93>(subkeys));
766     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<94>(subkeys))), LoadKey<95>(subkeys));
767     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<84>(subkeys))), LoadKey<85>(subkeys));
768     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<86>(subkeys))), LoadKey<87>(subkeys));
769     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<88>(subkeys))), LoadKey<89>(subkeys));
770     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<78>(subkeys))), LoadKey<79>(subkeys));
771     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<80>(subkeys))), LoadKey<81>(subkeys));
772     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<82>(subkeys))), LoadKey<83>(subkeys));
773     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<72>(subkeys))), LoadKey<73>(subkeys));
774     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<74>(subkeys))), LoadKey<75>(subkeys));
775     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<76>(subkeys))), LoadKey<77>(subkeys));
776 
777     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<66>(subkeys))), LoadKey<67>(subkeys));
778     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<68>(subkeys))), LoadKey<69>(subkeys));
779     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<70>(subkeys))), LoadKey<71>(subkeys));
780     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<60>(subkeys))), LoadKey<61>(subkeys));
781     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<62>(subkeys))), LoadKey<63>(subkeys));
782     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<64>(subkeys))), LoadKey<65>(subkeys));
783     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<54>(subkeys))), LoadKey<55>(subkeys));
784     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<56>(subkeys))), LoadKey<57>(subkeys));
785     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<58>(subkeys))), LoadKey<59>(subkeys));
786     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<48>(subkeys))), LoadKey<49>(subkeys));
787     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<50>(subkeys))), LoadKey<51>(subkeys));
788     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<52>(subkeys))), LoadKey<53>(subkeys));
789 
790     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<42>(subkeys))), LoadKey<43>(subkeys));
791     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<44>(subkeys))), LoadKey<45>(subkeys));
792     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<46>(subkeys))), LoadKey<47>(subkeys));
793     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<36>(subkeys))), LoadKey<37>(subkeys));
794     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<38>(subkeys))), LoadKey<39>(subkeys));
795     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<40>(subkeys))), LoadKey<41>(subkeys));
796     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<30>(subkeys))), LoadKey<31>(subkeys));
797     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<32>(subkeys))), LoadKey<33>(subkeys));
798     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<34>(subkeys))), LoadKey<35>(subkeys));
799     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<24>(subkeys))), LoadKey<25>(subkeys));
800     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<26>(subkeys))), LoadKey<27>(subkeys));
801     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<28>(subkeys))), LoadKey<29>(subkeys));
802 
803     temp[0] = Xor(Sub(RotateRight<9>(temp[0]), Xor(temp[3], LoadKey<18>(subkeys))), LoadKey<19>(subkeys));
804     temp[1] = Xor(Sub(RotateLeft<5>(temp[1]), Xor(temp[0], LoadKey<20>(subkeys))), LoadKey<21>(subkeys));
805     temp[2] = Xor(Sub(RotateLeft<3>(temp[2]), Xor(temp[1], LoadKey<22>(subkeys))), LoadKey<23>(subkeys));
806     temp[3] = Xor(Sub(RotateRight<9>(temp[3]), Xor(temp[2], LoadKey<12>(subkeys))), LoadKey<13>(subkeys));
807     temp[0] = Xor(Sub(RotateLeft<5>(temp[0]), Xor(temp[3], LoadKey<14>(subkeys))), LoadKey<15>(subkeys));
808     temp[1] = Xor(Sub(RotateLeft<3>(temp[1]), Xor(temp[0], LoadKey<16>(subkeys))), LoadKey<17>(subkeys));
809     temp[2] = Xor(Sub(RotateRight<9>(temp[2]), Xor(temp[1], LoadKey<6>(subkeys))), LoadKey<7>(subkeys));
810     temp[3] = Xor(Sub(RotateLeft<5>(temp[3]), Xor(temp[2], LoadKey<8>(subkeys))), LoadKey<9>(subkeys));
811     temp[0] = Xor(Sub(RotateLeft<3>(temp[0]), Xor(temp[3], LoadKey<10>(subkeys))), LoadKey<11>(subkeys));
812     temp[1] = Xor(Sub(RotateRight<9>(temp[1]), Xor(temp[0], LoadKey<0>(subkeys))), LoadKey<1>(subkeys));
813     temp[2] = Xor(Sub(RotateLeft<5>(temp[2]), Xor(temp[1], LoadKey<2>(subkeys))), LoadKey<3>(subkeys));
814     temp[3] = Xor(Sub(RotateLeft<3>(temp[3]), Xor(temp[2], LoadKey<4>(subkeys))), LoadKey<5>(subkeys));
815 }
816 
817 #endif  // LEA Encryption and Decryption
818 
819 // *************************** ARM NEON ***************************//
820 
821 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
822 
LEA_Enc_Block(uint32x4_t & block0,const word32 * subkeys,unsigned int rounds)823 inline void LEA_Enc_Block(uint32x4_t &block0,
824     const word32 *subkeys, unsigned int rounds)
825 {
826     uint32x4_t temp[4];
827     temp[0] = UnpackNEON<0>(block0);
828     temp[1] = UnpackNEON<1>(block0);
829     temp[2] = UnpackNEON<2>(block0);
830     temp[3] = UnpackNEON<3>(block0);
831 
832     LEA_Encryption(temp, subkeys, rounds);
833 
834     block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
835 }
836 
LEA_Dec_Block(uint32x4_t & block0,const word32 * subkeys,unsigned int rounds)837 inline void LEA_Dec_Block(uint32x4_t &block0,
838     const word32 *subkeys, unsigned int rounds)
839 {
840     uint32x4_t temp[4];
841     temp[0] = UnpackNEON<0>(block0);
842     temp[1] = UnpackNEON<1>(block0);
843     temp[2] = UnpackNEON<2>(block0);
844     temp[3] = UnpackNEON<3>(block0);
845 
846     LEA_Decryption(temp, subkeys, rounds);
847 
848     block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
849 }
850 
LEA_Enc_4_Blocks(uint32x4_t & block0,uint32x4_t & block1,uint32x4_t & block2,uint32x4_t & block3,const word32 * subkeys,unsigned int rounds)851 inline void LEA_Enc_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
852     uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
853 {
854     uint32x4_t temp[4];
855     temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
856     temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
857     temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
858     temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
859 
860     LEA_Encryption(temp, subkeys, rounds);
861 
862     block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
863     block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
864     block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
865     block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
866 }
867 
LEA_Dec_4_Blocks(uint32x4_t & block0,uint32x4_t & block1,uint32x4_t & block2,uint32x4_t & block3,const word32 * subkeys,unsigned int rounds)868 inline void LEA_Dec_4_Blocks(uint32x4_t &block0, uint32x4_t &block1,
869     uint32x4_t &block2, uint32x4_t &block3, const word32 *subkeys, unsigned int rounds)
870 {
871     uint32x4_t temp[4];
872     temp[0] = UnpackNEON<0>(block0, block1, block2, block3);
873     temp[1] = UnpackNEON<1>(block0, block1, block2, block3);
874     temp[2] = UnpackNEON<2>(block0, block1, block2, block3);
875     temp[3] = UnpackNEON<3>(block0, block1, block2, block3);
876 
877     LEA_Decryption(temp, subkeys, rounds);
878 
879     block0 = RepackNEON<0>(temp[0], temp[1], temp[2], temp[3]);
880     block1 = RepackNEON<1>(temp[0], temp[1], temp[2], temp[3]);
881     block2 = RepackNEON<2>(temp[0], temp[1], temp[2], temp[3]);
882     block3 = RepackNEON<3>(temp[0], temp[1], temp[2], temp[3]);
883 }
884 
885 #endif  // CRYPTOPP_ARM_NEON_AVAILABLE
886 
887 // *************************** IA-32 ***************************//
888 
889 #if (CRYPTOPP_SSSE3_AVAILABLE)
890 
LEA_Enc_Block(__m128i & block0,const word32 * subkeys,unsigned int rounds)891 inline void LEA_Enc_Block(__m128i &block0,
892     const word32 *subkeys, unsigned int rounds)
893 {
894     __m128i temp[4];
895     temp[0] = UnpackXMM<0>(block0);
896     temp[1] = UnpackXMM<1>(block0);
897     temp[2] = UnpackXMM<2>(block0);
898     temp[3] = UnpackXMM<3>(block0);
899 
900     LEA_Encryption(temp, subkeys, rounds);
901 
902     block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
903 }
904 
LEA_Dec_Block(__m128i & block0,const word32 * subkeys,unsigned int rounds)905 inline void LEA_Dec_Block(__m128i &block0,
906     const word32 *subkeys, unsigned int rounds)
907 {
908     __m128i temp[4];
909     temp[0] = UnpackXMM<0>(block0);
910     temp[1] = UnpackXMM<1>(block0);
911     temp[2] = UnpackXMM<2>(block0);
912     temp[3] = UnpackXMM<3>(block0);
913 
914     LEA_Decryption(temp, subkeys, rounds);
915 
916     block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
917 }
918 
LEA_Enc_4_Blocks(__m128i & block0,__m128i & block1,__m128i & block2,__m128i & block3,const word32 * subkeys,unsigned int rounds)919 inline void LEA_Enc_4_Blocks(__m128i &block0, __m128i &block1,
920     __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
921 {
922     __m128i temp[4];
923     temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
924     temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
925     temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
926     temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
927 
928     LEA_Encryption(temp, subkeys, rounds);
929 
930     block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
931     block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
932     block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
933     block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
934 }
935 
LEA_Dec_4_Blocks(__m128i & block0,__m128i & block1,__m128i & block2,__m128i & block3,const word32 * subkeys,unsigned int rounds)936 inline void LEA_Dec_4_Blocks(__m128i &block0, __m128i &block1,
937     __m128i &block2, __m128i &block3, const word32 *subkeys, unsigned int rounds)
938 {
939     __m128i temp[4];
940     temp[0] = UnpackXMM<0>(block0, block1, block2, block3);
941     temp[1] = UnpackXMM<1>(block0, block1, block2, block3);
942     temp[2] = UnpackXMM<2>(block0, block1, block2, block3);
943     temp[3] = UnpackXMM<3>(block0, block1, block2, block3);
944 
945     LEA_Decryption(temp, subkeys, rounds);
946 
947     block0 = RepackXMM<0>(temp[0], temp[1], temp[2], temp[3]);
948     block1 = RepackXMM<1>(temp[0], temp[1], temp[2], temp[3]);
949     block2 = RepackXMM<2>(temp[0], temp[1], temp[2], temp[3]);
950     block3 = RepackXMM<3>(temp[0], temp[1], temp[2], temp[3]);
951 }
952 
953 #endif  // CRYPTOPP_SSSE3_AVAILABLE
954 
955 // *************************** Power8 ***************************//
956 
957 #if (CRYPTOPP_POWER8_AVAILABLE)
958 
LEA_Enc_Block(uint32x4_p & block0,const word32 * subkeys,unsigned int rounds)959 inline void LEA_Enc_Block(uint32x4_p &block0,
960     const word32 *subkeys, unsigned int rounds)
961 {
962     uint32x4_p temp[4];
963     temp[0] = UnpackSIMD<0>(block0);
964     temp[1] = UnpackSIMD<1>(block0);
965     temp[2] = UnpackSIMD<2>(block0);
966     temp[3] = UnpackSIMD<3>(block0);
967 
968     LEA_Encryption(temp, subkeys, rounds);
969 
970     block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
971 }
972 
LEA_Dec_Block(uint32x4_p & block0,const word32 * subkeys,unsigned int rounds)973 inline void LEA_Dec_Block(uint32x4_p &block0,
974     const word32 *subkeys, unsigned int rounds)
975 {
976     uint32x4_p temp[4];
977     temp[0] = UnpackSIMD<0>(block0);
978     temp[1] = UnpackSIMD<1>(block0);
979     temp[2] = UnpackSIMD<2>(block0);
980     temp[3] = UnpackSIMD<3>(block0);
981 
982     LEA_Decryption(temp, subkeys, rounds);
983 
984     block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
985 }
986 
LEA_Enc_4_Blocks(uint32x4_p & block0,uint32x4_p & block1,uint32x4_p & block2,uint32x4_p & block3,const word32 * subkeys,unsigned int rounds)987 inline void LEA_Enc_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
988     uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
989 {
990     uint32x4_p temp[4];
991     temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
992     temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
993     temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
994     temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
995 
996     LEA_Encryption(temp, subkeys, rounds);
997 
998     block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
999     block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
1000     block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
1001     block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
1002 }
1003 
LEA_Dec_4_Blocks(uint32x4_p & block0,uint32x4_p & block1,uint32x4_p & block2,uint32x4_p & block3,const word32 * subkeys,unsigned int rounds)1004 inline void LEA_Dec_4_Blocks(uint32x4_p &block0, uint32x4_p &block1,
1005     uint32x4_p &block2, uint32x4_p &block3, const word32 *subkeys, unsigned int rounds)
1006 {
1007     uint32x4_p temp[4];
1008     temp[0] = UnpackSIMD<0>(block0, block1, block2, block3);
1009     temp[1] = UnpackSIMD<1>(block0, block1, block2, block3);
1010     temp[2] = UnpackSIMD<2>(block0, block1, block2, block3);
1011     temp[3] = UnpackSIMD<3>(block0, block1, block2, block3);
1012 
1013     LEA_Decryption(temp, subkeys, rounds);
1014 
1015     block0 = RepackSIMD<0>(temp[0], temp[1], temp[2], temp[3]);
1016     block1 = RepackSIMD<1>(temp[0], temp[1], temp[2], temp[3]);
1017     block2 = RepackSIMD<2>(temp[0], temp[1], temp[2], temp[3]);
1018     block3 = RepackSIMD<3>(temp[0], temp[1], temp[2], temp[3]);
1019 }
1020 
1021 #endif  // CRYPTOPP_POWER8_AVAILABLE
1022 
1023 ANONYMOUS_NAMESPACE_END
1024 
1025 // *************************** SIMD Templates ***************************//
1026 
NAMESPACE_BEGIN(CryptoPP)1027 NAMESPACE_BEGIN(CryptoPP)
1028 
1029 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
1030 size_t LEA_Enc_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1031     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1032 {
1033     return AdvancedProcessBlocks128_4x1_SSE(LEA_Enc_Block, LEA_Enc_4_Blocks,
1034         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1035 }
1036 
LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1037 size_t LEA_Dec_AdvancedProcessBlocks_SSSE3(const word32* subKeys, size_t rounds,
1038     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1039 {
1040     return AdvancedProcessBlocks128_4x1_SSE(LEA_Dec_Block, LEA_Dec_4_Blocks,
1041         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1042 }
1043 #endif // CRYPTOPP_SSSE3_AVAILABLE
1044 
1045 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
LEA_Enc_AdvancedProcessBlocks_NEON(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1046 size_t LEA_Enc_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1047     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1048 {
1049     return AdvancedProcessBlocks128_4x1_NEON(LEA_Enc_Block, LEA_Enc_4_Blocks,
1050         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1051 }
1052 
LEA_Dec_AdvancedProcessBlocks_NEON(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1053 size_t LEA_Dec_AdvancedProcessBlocks_NEON(const word32* subKeys, size_t rounds,
1054     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1055 {
1056     return AdvancedProcessBlocks128_4x1_NEON(LEA_Dec_Block, LEA_Dec_4_Blocks,
1057         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1058 }
1059 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
1060 
1061 #if defined(CRYPTOPP_POWER8_AVAILABLE)
LEA_Enc_AdvancedProcessBlocks_POWER8(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1062 size_t LEA_Enc_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1063     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1064 {
1065     return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Enc_Block, LEA_Enc_4_Blocks,
1066         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1067 }
1068 
LEA_Dec_AdvancedProcessBlocks_POWER8(const word32 * subKeys,size_t rounds,const byte * inBlocks,const byte * xorBlocks,byte * outBlocks,size_t length,word32 flags)1069 size_t LEA_Dec_AdvancedProcessBlocks_POWER8(const word32* subKeys, size_t rounds,
1070     const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1071 {
1072     return AdvancedProcessBlocks128_4x1_ALTIVEC(LEA_Dec_Block, LEA_Dec_4_Blocks,
1073         subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
1074 }
1075 #endif // CRYPTOPP_POWER8_AVAILABLE
1076 
1077 NAMESPACE_END
1078