1 // sha_simd.cpp - written and placed in the public domain by
2 //                Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3 //
4 //    This source file uses intrinsics to gain access to SHA-NI and
5 //    ARMv8a SHA instructions. A separate source file is needed
6 //    because additional CXXFLAGS are required to enable the
//    appropriate instruction sets in some build configurations.
8 
9 #include "pch.h"
10 #include "config.h"
11 #include "sha.h"
12 #include "misc.h"
13 
14 #if defined(CRYPTOPP_DISABLE_SHA_ASM)
15 # undef CRYPTOPP_X86_ASM_AVAILABLE
16 # undef CRYPTOPP_X32_ASM_AVAILABLE
17 # undef CRYPTOPP_X64_ASM_AVAILABLE
18 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
19 #endif
20 
21 #if (CRYPTOPP_SHANI_AVAILABLE)
22 # include <nmmintrin.h>
23 # include <immintrin.h>
24 #endif
25 
26 // Android makes <arm_acle.h> available with ARMv7-a
27 #if (CRYPTOPP_BOOL_ARMV8)
28 # if (CRYPTOPP_ARM_NEON_HEADER)
29 #  include <arm_neon.h>
30 # endif
31 # if (CRYPTOPP_ARM_ACLE_HEADER)
32 #  include <stdint.h>
33 #  include <arm_acle.h>
34 # endif
35 #endif
36 
37 #if CRYPTOPP_POWER8_SHA_AVAILABLE
38 # include "ppc_simd.h"
39 #endif
40 
41 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
42 # include <signal.h>
43 # include <setjmp.h>
44 #endif
45 
46 #ifndef EXCEPTION_EXECUTE_HANDLER
47 # define EXCEPTION_EXECUTE_HANDLER 1
48 #endif
49 
50 // Squash MS LNK4221 and libtool warnings
51 extern const char SHA_SIMD_FNAME[] = __FILE__;
52 
53 NAMESPACE_BEGIN(CryptoPP)
54 
55 // ***************** SHA key tables ********************
56 
57 extern const word32 SHA256_K[64];
58 extern const word64 SHA512_K[80];
59 
60 // ***************** SIGILL probes ********************
61 
62 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
63 extern "C" {
64     typedef void (*SigHandler)(int);
65 
66     static jmp_buf s_jmpSIGILL;
    static void SigIllHandler(int)
68     {
69         longjmp(s_jmpSIGILL, 1);
70     }
71 }
#endif  // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
73 
74 #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
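// Each probe executes a handful of SHA instructions under a guard:
// SEH (__try/__except) with MS-style inline assembly, otherwise a
// SIGILL handler plus setjmp/longjmp. If the instructions fault,
// the feature is reported as unavailable.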
bool CPU_ProbeSHA1()
76 {
77 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
78     return false;
79 #elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
80 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
81     volatile bool result = true;
82     __try
83     {
84         unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
85         uint32x4_t data1 = vld1q_u32(w+0);
86         uint32x4_t data2 = vld1q_u32(w+4);
87         uint32x4_t data3 = vld1q_u32(w+8);
88 
89         uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
90         uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
91         uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
92         uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
93         uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
94 
95         result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
96     }
97     __except (EXCEPTION_EXECUTE_HANDLER)
98     {
99         return false;
100     }
101     return result;
102 # else
103 
104     // longjmp and clobber warnings. Volatile is required.
105     // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
106     volatile bool result = true;
107 
108     volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
109     if (oldHandler == SIG_ERR)
110         return false;
111 
112     volatile sigset_t oldMask;
113     if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
114     {
115         signal(SIGILL, oldHandler);
116         return false;
117     }
118 
119     if (setjmp(s_jmpSIGILL))
120         result = false;
121     else
122     {
123         unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
124         uint32x4_t data1 = vld1q_u32(w+0);
125         uint32x4_t data2 = vld1q_u32(w+4);
126         uint32x4_t data3 = vld1q_u32(w+8);
127 
128         uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
129         uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
130         uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
131         uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
132         uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
133 
134         result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
135     }
136 
137     sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
138     signal(SIGILL, oldHandler);
139     return result;
140 # endif
141 #else
142     return false;
143 #endif  // CRYPTOPP_ARM_SHA1_AVAILABLE
144 }
145 
bool CPU_ProbeSHA256()
147 {
148 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
149     return false;
150 #elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
151 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
152     volatile bool result = true;
153     __try
154     {
155         unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
156         uint32x4_t data1 = vld1q_u32(w+0);
157         uint32x4_t data2 = vld1q_u32(w+4);
158         uint32x4_t data3 = vld1q_u32(w+8);
159 
160         uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
161         uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
162         uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
163         uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
164 
165         result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
166     }
167     __except (EXCEPTION_EXECUTE_HANDLER)
168     {
169         return false;
170     }
171     return result;
# else
173 
174     // longjmp and clobber warnings. Volatile is required.
175     // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
176     volatile bool result = true;
177 
178     volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
179     if (oldHandler == SIG_ERR)
180         return false;
181 
182     volatile sigset_t oldMask;
183     if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
184     {
185         signal(SIGILL, oldHandler);
186         return false;
187     }
188 
189     if (setjmp(s_jmpSIGILL))
190         result = false;
191     else
192     {
193         unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
194         uint32x4_t data1 = vld1q_u32(w+0);
195         uint32x4_t data2 = vld1q_u32(w+4);
196         uint32x4_t data3 = vld1q_u32(w+8);
197 
198         uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
199         uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
200         uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
201         uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
202 
203         result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
204     }
205 
206     sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
207     signal(SIGILL, oldHandler);
208     return result;
209 # endif
210 #else
211     return false;
212 #endif  // CRYPTOPP_ARM_SHA2_AVAILABLE
213 }
214 #endif  // ARM32 or ARM64
215 
216 // ***************** Intel x86 SHA ********************
217 
218 /////////////////////////////////////
219 // start of Walton and Gulley code //
220 /////////////////////////////////////
221 
222 #if CRYPTOPP_SHANI_AVAILABLE
223 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
225 {
226     CRYPTOPP_ASSERT(state);
227     CRYPTOPP_ASSERT(data);
228     CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
229 
230     __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
231     __m128i MASK, MSG0, MSG1, MSG2, MSG3;
232 
233     // Load initial values
234     ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
235     E0 = _mm_set_epi32(state[4], 0, 0, 0);
236     ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
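    // The 0x1B shuffle places A in the most significant dword of ABCD,
    // the layout the SHA-NI instructions expect; E rides in the top dword of E0.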
237 
238     // IA-32 SHA is little endian, SHA::Transform is big endian,
239     // and SHA::HashMultipleBlocks can be either. ByteOrder
240     // allows us to avoid extra endian reversals. It saves 1.0 cpb.
241     MASK = order == BIG_ENDIAN_ORDER ?  // Data arrangement
242            _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
243            _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
244 
245     while (length >= SHA1::BLOCKSIZE)
246     {
247         // Save current hash
248         ABCD_SAVE = ABCD;
249         E0_SAVE = E0;
250 
251         // Rounds 0-3
252         MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
253         MSG0 = _mm_shuffle_epi8(MSG0, MASK);
254         E0 = _mm_add_epi32(E0, MSG0);
255         E1 = ABCD;
256         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
257 
258         // Rounds 4-7
259         MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
260         MSG1 = _mm_shuffle_epi8(MSG1, MASK);
261         E1 = _mm_sha1nexte_epu32(E1, MSG1);
262         E0 = ABCD;
263         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
264         MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
265 
266         // Rounds 8-11
267         MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
268         MSG2 = _mm_shuffle_epi8(MSG2, MASK);
269         E0 = _mm_sha1nexte_epu32(E0, MSG2);
270         E1 = ABCD;
271         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
272         MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
273         MSG0 = _mm_xor_si128(MSG0, MSG2);
274 
275         // Rounds 12-15
276         MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
277         MSG3 = _mm_shuffle_epi8(MSG3, MASK);
278         E1 = _mm_sha1nexte_epu32(E1, MSG3);
279         E0 = ABCD;
280         MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
281         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
282         MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
283         MSG1 = _mm_xor_si128(MSG1, MSG3);
284 
285         // Rounds 16-19
286         E0 = _mm_sha1nexte_epu32(E0, MSG0);
287         E1 = ABCD;
288         MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
289         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
290         MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
291         MSG2 = _mm_xor_si128(MSG2, MSG0);
292 
293         // Rounds 20-23
294         E1 = _mm_sha1nexte_epu32(E1, MSG1);
295         E0 = ABCD;
296         MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
297         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
298         MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
299         MSG3 = _mm_xor_si128(MSG3, MSG1);
300 
301         // Rounds 24-27
302         E0 = _mm_sha1nexte_epu32(E0, MSG2);
303         E1 = ABCD;
304         MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
305         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
306         MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
307         MSG0 = _mm_xor_si128(MSG0, MSG2);
308 
309         // Rounds 28-31
310         E1 = _mm_sha1nexte_epu32(E1, MSG3);
311         E0 = ABCD;
312         MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
313         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
314         MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
315         MSG1 = _mm_xor_si128(MSG1, MSG3);
316 
317         // Rounds 32-35
318         E0 = _mm_sha1nexte_epu32(E0, MSG0);
319         E1 = ABCD;
320         MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
321         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
322         MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
323         MSG2 = _mm_xor_si128(MSG2, MSG0);
324 
325         // Rounds 36-39
326         E1 = _mm_sha1nexte_epu32(E1, MSG1);
327         E0 = ABCD;
328         MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
329         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
330         MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
331         MSG3 = _mm_xor_si128(MSG3, MSG1);
332 
333         // Rounds 40-43
334         E0 = _mm_sha1nexte_epu32(E0, MSG2);
335         E1 = ABCD;
336         MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
337         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
338         MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
339         MSG0 = _mm_xor_si128(MSG0, MSG2);
340 
341         // Rounds 44-47
342         E1 = _mm_sha1nexte_epu32(E1, MSG3);
343         E0 = ABCD;
344         MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
345         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
346         MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
347         MSG1 = _mm_xor_si128(MSG1, MSG3);
348 
349         // Rounds 48-51
350         E0 = _mm_sha1nexte_epu32(E0, MSG0);
351         E1 = ABCD;
352         MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
353         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
354         MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
355         MSG2 = _mm_xor_si128(MSG2, MSG0);
356 
357         // Rounds 52-55
358         E1 = _mm_sha1nexte_epu32(E1, MSG1);
359         E0 = ABCD;
360         MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
361         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
362         MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
363         MSG3 = _mm_xor_si128(MSG3, MSG1);
364 
365         // Rounds 56-59
366         E0 = _mm_sha1nexte_epu32(E0, MSG2);
367         E1 = ABCD;
368         MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
369         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
370         MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
371         MSG0 = _mm_xor_si128(MSG0, MSG2);
372 
373         // Rounds 60-63
374         E1 = _mm_sha1nexte_epu32(E1, MSG3);
375         E0 = ABCD;
376         MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
377         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
378         MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
379         MSG1 = _mm_xor_si128(MSG1, MSG3);
380 
381         // Rounds 64-67
382         E0 = _mm_sha1nexte_epu32(E0, MSG0);
383         E1 = ABCD;
384         MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
385         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
386         MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
387         MSG2 = _mm_xor_si128(MSG2, MSG0);
388 
389         // Rounds 68-71
390         E1 = _mm_sha1nexte_epu32(E1, MSG1);
391         E0 = ABCD;
392         MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
393         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
394         MSG3 = _mm_xor_si128(MSG3, MSG1);
395 
396         // Rounds 72-75
397         E0 = _mm_sha1nexte_epu32(E0, MSG2);
398         E1 = ABCD;
399         MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
400         ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
401 
402         // Rounds 76-79
403         E1 = _mm_sha1nexte_epu32(E1, MSG3);
404         E0 = ABCD;
405         ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
406 
407         // Add values back to state
408         E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
409         ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
410 
411         data += SHA1::BLOCKSIZE/sizeof(word32);
412         length -= SHA1::BLOCKSIZE;
413     }
414 
415     // Save state
416     ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
417     _mm_storeu_si128(M128_CAST(state), ABCD);
418     state[4] = _mm_extract_epi32(E0, 3);
419 }
420 
421 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
423 {
424     CRYPTOPP_ASSERT(state);
425     CRYPTOPP_ASSERT(data);
426     CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
427 
428     __m128i STATE0, STATE1;
429     __m128i MSG, TMP, MASK;
430     __m128i TMSG0, TMSG1, TMSG2, TMSG3;
431     __m128i ABEF_SAVE, CDGH_SAVE;
432 
433     // Load initial values
434     TMP    = _mm_loadu_si128(M128_CAST(&state[0]));
435     STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
436 
437     // IA-32 SHA is little endian, SHA::Transform is big endian,
438     // and SHA::HashMultipleBlocks can be either. ByteOrder
439     // allows us to avoid extra endian reversals. It saves 1.0 cpb.
440     MASK = order == BIG_ENDIAN_ORDER ?  // Data arrangement
441            _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
442            _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
443 
444     TMP = _mm_shuffle_epi32(TMP, 0xB1);          // CDAB
445     STATE1 = _mm_shuffle_epi32(STATE1, 0x1B);    // EFGH
446     STATE0 = _mm_alignr_epi8(TMP, STATE1, 8);    // ABEF
447     STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
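    // The SHA-NI SHA-256 rounds consume the state packed as (A,B,E,F) and
    // (C,D,G,H); the shuffle/alignr/blend sequence above builds that packing
    // from the linear state[0..7], and the epilogue below reverses it.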
448 
449     while (length >= SHA256::BLOCKSIZE)
450     {
451         // Save current hash
452         ABEF_SAVE = STATE0;
453         CDGH_SAVE = STATE1;
454 
455         // Rounds 0-3
456         MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
457         TMSG0 = _mm_shuffle_epi8(MSG, MASK);
458         MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
459         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
460         MSG = _mm_shuffle_epi32(MSG, 0x0E);
461         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
462 
463         // Rounds 4-7
464         TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
465         TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
466         MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
467         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
468         MSG = _mm_shuffle_epi32(MSG, 0x0E);
469         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
470         TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
471 
472         // Rounds 8-11
473         TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
474         TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
475         MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
476         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
477         MSG = _mm_shuffle_epi32(MSG, 0x0E);
478         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
479         TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
480 
481         // Rounds 12-15
482         TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
483         TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
484         MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
485         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
486         TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
487         TMSG0 = _mm_add_epi32(TMSG0, TMP);
488         TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
489         MSG = _mm_shuffle_epi32(MSG, 0x0E);
490         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
491         TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
492 
493         // Rounds 16-19
494         MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
495         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
496         TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
497         TMSG1 = _mm_add_epi32(TMSG1, TMP);
498         TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
499         MSG = _mm_shuffle_epi32(MSG, 0x0E);
500         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
501         TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
502 
503         // Rounds 20-23
504         MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
505         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
506         TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
507         TMSG2 = _mm_add_epi32(TMSG2, TMP);
508         TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
509         MSG = _mm_shuffle_epi32(MSG, 0x0E);
510         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
511         TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
512 
513         // Rounds 24-27
514         MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
515         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
516         TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
517         TMSG3 = _mm_add_epi32(TMSG3, TMP);
518         TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
519         MSG = _mm_shuffle_epi32(MSG, 0x0E);
520         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
521         TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
522 
523         // Rounds 28-31
524         MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
525         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
526         TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
527         TMSG0 = _mm_add_epi32(TMSG0, TMP);
528         TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
529         MSG = _mm_shuffle_epi32(MSG, 0x0E);
530         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
531         TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
532 
533         // Rounds 32-35
534         MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
535         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
536         TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
537         TMSG1 = _mm_add_epi32(TMSG1, TMP);
538         TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
539         MSG = _mm_shuffle_epi32(MSG, 0x0E);
540         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
541         TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
542 
543         // Rounds 36-39
544         MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
545         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
546         TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
547         TMSG2 = _mm_add_epi32(TMSG2, TMP);
548         TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
549         MSG = _mm_shuffle_epi32(MSG, 0x0E);
550         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
551         TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
552 
553         // Rounds 40-43
554         MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
555         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
556         TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
557         TMSG3 = _mm_add_epi32(TMSG3, TMP);
558         TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
559         MSG = _mm_shuffle_epi32(MSG, 0x0E);
560         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
561         TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
562 
563         // Rounds 44-47
564         MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
565         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
566         TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
567         TMSG0 = _mm_add_epi32(TMSG0, TMP);
568         TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
569         MSG = _mm_shuffle_epi32(MSG, 0x0E);
570         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
571         TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
572 
573         // Rounds 48-51
574         MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
575         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
576         TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
577         TMSG1 = _mm_add_epi32(TMSG1, TMP);
578         TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
579         MSG = _mm_shuffle_epi32(MSG, 0x0E);
580         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
581         TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
582 
583         // Rounds 52-55
584         MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
585         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
586         TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
587         TMSG2 = _mm_add_epi32(TMSG2, TMP);
588         TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
589         MSG = _mm_shuffle_epi32(MSG, 0x0E);
590         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
591 
592         // Rounds 56-59
593         MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
594         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
595         TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
596         TMSG3 = _mm_add_epi32(TMSG3, TMP);
597         TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
598         MSG = _mm_shuffle_epi32(MSG, 0x0E);
599         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
600 
601         // Rounds 60-63
602         MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
603         STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
604         MSG = _mm_shuffle_epi32(MSG, 0x0E);
605         STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
606 
607         // Add values back to state
608         STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
609         STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
610 
611         data += SHA256::BLOCKSIZE/sizeof(word32);
612         length -= SHA256::BLOCKSIZE;
613     }
614 
615     TMP = _mm_shuffle_epi32(STATE0, 0x1B);       // FEBA
616     STATE1 = _mm_shuffle_epi32(STATE1, 0xB1);    // DCHG
617     STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8);    // HGFE
619 
620     // Save state
621     _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
622     _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
623 }
624 #endif  // CRYPTOPP_SHANI_AVAILABLE
625 
626 ///////////////////////////////////
627 // end of Walton and Gulley code //
628 ///////////////////////////////////
629 
630 // ***************** ARMV8 SHA ********************
631 
632 /////////////////////////////////////////////////////////////
633 // start of Walton, Schneiders, O'Rourke and Hovsmith code //
634 /////////////////////////////////////////////////////////////
635 
636 #if CRYPTOPP_ARM_SHA1_AVAILABLE
void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
638 {
639     CRYPTOPP_ASSERT(state);
640     CRYPTOPP_ASSERT(data);
641     CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
642 
643     uint32x4_t C0, C1, C2, C3;
644     uint32x4_t ABCD, ABCD_SAVED;
645     uint32x4_t MSG0, MSG1, MSG2, MSG3;
646     uint32x4_t TMP0, TMP1;
647     uint32_t   E0, E0_SAVED, E1;
648 
649     // Load initial values
650     C0 = vdupq_n_u32(0x5A827999);
651     C1 = vdupq_n_u32(0x6ED9EBA1);
652     C2 = vdupq_n_u32(0x8F1BBCDC);
653     C3 = vdupq_n_u32(0xCA62C1D6);
654 
655     ABCD = vld1q_u32(&state[0]);
656     E0 = state[4];
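    // ABCD stays packed in a single NEON register; E is carried as a scalar,
    // and vsha1h of lane 0 supplies the E value for the following four rounds.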
657 
658     while (length >= SHA1::BLOCKSIZE)
659     {
660         // Save current hash
661         ABCD_SAVED = ABCD;
662         E0_SAVED = E0;
663 
664         MSG0 = vld1q_u32(data +  0);
665         MSG1 = vld1q_u32(data +  4);
666         MSG2 = vld1q_u32(data +  8);
667         MSG3 = vld1q_u32(data + 12);
668 
669         if (order == BIG_ENDIAN_ORDER)  // Data arrangement
670         {
671             MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
672             MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
673             MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
674             MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
675         }
676 
677         TMP0 = vaddq_u32(MSG0, C0);
678         TMP1 = vaddq_u32(MSG1, C0);
679 
680         // Rounds 0-3
681         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
682         ABCD = vsha1cq_u32(ABCD, E0, TMP0);
683         TMP0 = vaddq_u32(MSG2, C0);
684         MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
685 
686         // Rounds 4-7
687         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
688         ABCD = vsha1cq_u32(ABCD, E1, TMP1);
689         TMP1 = vaddq_u32(MSG3, C0);
690         MSG0 = vsha1su1q_u32(MSG0, MSG3);
691         MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
692 
693         // Rounds 8-11
694         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
695         ABCD = vsha1cq_u32(ABCD, E0, TMP0);
696         TMP0 = vaddq_u32(MSG0, C0);
697         MSG1 = vsha1su1q_u32(MSG1, MSG0);
698         MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
699 
700         // Rounds 12-15
701         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
702         ABCD = vsha1cq_u32(ABCD, E1, TMP1);
703         TMP1 = vaddq_u32(MSG1, C1);
704         MSG2 = vsha1su1q_u32(MSG2, MSG1);
705         MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
706 
707         // Rounds 16-19
708         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
709         ABCD = vsha1cq_u32(ABCD, E0, TMP0);
710         TMP0 = vaddq_u32(MSG2, C1);
711         MSG3 = vsha1su1q_u32(MSG3, MSG2);
712         MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
713 
714         // Rounds 20-23
715         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
716         ABCD = vsha1pq_u32(ABCD, E1, TMP1);
717         TMP1 = vaddq_u32(MSG3, C1);
718         MSG0 = vsha1su1q_u32(MSG0, MSG3);
719         MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
720 
721         // Rounds 24-27
722         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
723         ABCD = vsha1pq_u32(ABCD, E0, TMP0);
724         TMP0 = vaddq_u32(MSG0, C1);
725         MSG1 = vsha1su1q_u32(MSG1, MSG0);
726         MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
727 
728         // Rounds 28-31
729         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
730         ABCD = vsha1pq_u32(ABCD, E1, TMP1);
731         TMP1 = vaddq_u32(MSG1, C1);
732         MSG2 = vsha1su1q_u32(MSG2, MSG1);
733         MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
734 
735         // Rounds 32-35
736         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
737         ABCD = vsha1pq_u32(ABCD, E0, TMP0);
738         TMP0 = vaddq_u32(MSG2, C2);
739         MSG3 = vsha1su1q_u32(MSG3, MSG2);
740         MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
741 
742         // Rounds 36-39
743         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
744         ABCD = vsha1pq_u32(ABCD, E1, TMP1);
745         TMP1 = vaddq_u32(MSG3, C2);
746         MSG0 = vsha1su1q_u32(MSG0, MSG3);
747         MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
748 
749         // Rounds 40-43
750         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
751         ABCD = vsha1mq_u32(ABCD, E0, TMP0);
752         TMP0 = vaddq_u32(MSG0, C2);
753         MSG1 = vsha1su1q_u32(MSG1, MSG0);
754         MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
755 
756         // Rounds 44-47
757         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
758         ABCD = vsha1mq_u32(ABCD, E1, TMP1);
759         TMP1 = vaddq_u32(MSG1, C2);
760         MSG2 = vsha1su1q_u32(MSG2, MSG1);
761         MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
762 
763         // Rounds 48-51
764         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
765         ABCD = vsha1mq_u32(ABCD, E0, TMP0);
766         TMP0 = vaddq_u32(MSG2, C2);
767         MSG3 = vsha1su1q_u32(MSG3, MSG2);
768         MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
769 
770         // Rounds 52-55
771         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
772         ABCD = vsha1mq_u32(ABCD, E1, TMP1);
773         TMP1 = vaddq_u32(MSG3, C3);
774         MSG0 = vsha1su1q_u32(MSG0, MSG3);
775         MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
776 
777         // Rounds 56-59
778         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
779         ABCD = vsha1mq_u32(ABCD, E0, TMP0);
780         TMP0 = vaddq_u32(MSG0, C3);
781         MSG1 = vsha1su1q_u32(MSG1, MSG0);
782         MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
783 
784         // Rounds 60-63
785         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
786         ABCD = vsha1pq_u32(ABCD, E1, TMP1);
787         TMP1 = vaddq_u32(MSG1, C3);
788         MSG2 = vsha1su1q_u32(MSG2, MSG1);
789         MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
790 
791         // Rounds 64-67
792         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
793         ABCD = vsha1pq_u32(ABCD, E0, TMP0);
794         TMP0 = vaddq_u32(MSG2, C3);
795         MSG3 = vsha1su1q_u32(MSG3, MSG2);
796         MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
797 
798         // Rounds 68-71
799         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
800         ABCD = vsha1pq_u32(ABCD, E1, TMP1);
801         TMP1 = vaddq_u32(MSG3, C3);
802         MSG0 = vsha1su1q_u32(MSG0, MSG3);
803 
804         // Rounds 72-75
805         E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
806         ABCD = vsha1pq_u32(ABCD, E0, TMP0);
807 
808         // Rounds 76-79
809         E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
810         ABCD = vsha1pq_u32(ABCD, E1, TMP1);
811 
812         E0 += E0_SAVED;
813         ABCD = vaddq_u32(ABCD_SAVED, ABCD);
814 
815         data += SHA1::BLOCKSIZE/sizeof(word32);
816         length -= SHA1::BLOCKSIZE;
817     }
818 
819     // Save state
820     vst1q_u32(&state[0], ABCD);
821     state[4] = E0;
822 }
823 #endif  // CRYPTOPP_ARM_SHA1_AVAILABLE
824 
825 #if CRYPTOPP_ARM_SHA2_AVAILABLE
void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
827 {
828     CRYPTOPP_ASSERT(state);
829     CRYPTOPP_ASSERT(data);
830     CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
831 
832     uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
833     uint32x4_t MSG0, MSG1, MSG2, MSG3;
834     uint32x4_t TMP0, TMP1, TMP2;
835 
836     // Load initial values
837     STATE0 = vld1q_u32(&state[0]);
838     STATE1 = vld1q_u32(&state[4]);
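    // STATE0 = {A,B,C,D} and STATE1 = {E,F,G,H}; the ARMv8 SHA-256
    // instructions take the hash state in these two packed registers.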
839 
840     while (length >= SHA256::BLOCKSIZE)
841     {
842         // Save current hash
843         ABEF_SAVE = STATE0;
844         CDGH_SAVE = STATE1;
845 
846         // Load message
847         MSG0 = vld1q_u32(data +  0);
848         MSG1 = vld1q_u32(data +  4);
849         MSG2 = vld1q_u32(data +  8);
850         MSG3 = vld1q_u32(data + 12);
851 
852         if (order == BIG_ENDIAN_ORDER)  // Data arrangement
853         {
854             MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
855             MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
856             MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
857             MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
858         }
859 
860         TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
861 
862         // Rounds 0-3
863         MSG0 = vsha256su0q_u32(MSG0, MSG1);
864         TMP2 = STATE0;
865         TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
866         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
867         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
868         MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
869 
870         // Rounds 4-7
871         MSG1 = vsha256su0q_u32(MSG1, MSG2);
872         TMP2 = STATE0;
873         TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
874         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
875         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
876         MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
877 
878         // Rounds 8-11
879         MSG2 = vsha256su0q_u32(MSG2, MSG3);
880         TMP2 = STATE0;
881         TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
882         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
883         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
884         MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
885 
886         // Rounds 12-15
887         MSG3 = vsha256su0q_u32(MSG3, MSG0);
888         TMP2 = STATE0;
889         TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
890         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
891         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
892         MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
893 
894         // Rounds 16-19
895         MSG0 = vsha256su0q_u32(MSG0, MSG1);
896         TMP2 = STATE0;
897         TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
898         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
899         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
900         MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
901 
902         // Rounds 20-23
903         MSG1 = vsha256su0q_u32(MSG1, MSG2);
904         TMP2 = STATE0;
905         TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
906         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
907         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
908         MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
909 
910         // Rounds 24-27
911         MSG2 = vsha256su0q_u32(MSG2, MSG3);
912         TMP2 = STATE0;
913         TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
914         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
915         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
916         MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
917 
918         // Rounds 28-31
919         MSG3 = vsha256su0q_u32(MSG3, MSG0);
920         TMP2 = STATE0;
921         TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
922         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
923         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
924         MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
925 
926         // Rounds 32-35
927         MSG0 = vsha256su0q_u32(MSG0, MSG1);
928         TMP2 = STATE0;
929         TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
930         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
931         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
932         MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
933 
934         // Rounds 36-39
935         MSG1 = vsha256su0q_u32(MSG1, MSG2);
936         TMP2 = STATE0;
937         TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
938         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
939         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
940         MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
941 
942         // Rounds 40-43
943         MSG2 = vsha256su0q_u32(MSG2, MSG3);
944         TMP2 = STATE0;
945         TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
946         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
947         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
948         MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
949 
950         // Rounds 44-47
951         MSG3 = vsha256su0q_u32(MSG3, MSG0);
952         TMP2 = STATE0;
953         TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
954         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
955         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
956         MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
957 
958         // Rounds 48-51
959         TMP2 = STATE0;
960         TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
961         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
962         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
963 
964         // Rounds 52-55
965         TMP2 = STATE0;
966         TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
967         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
968         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
969 
970         // Rounds 56-59
971         TMP2 = STATE0;
972         TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
973         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
974         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
975 
976         // Rounds 60-63
977         TMP2 = STATE0;
978         STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
979         STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
980 
981         // Add back to state
982         STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
983         STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
984 
985         data += SHA256::BLOCKSIZE/sizeof(word32);
986         length -= SHA256::BLOCKSIZE;
987     }
988 
989     // Save state
990     vst1q_u32(&state[0], STATE0);
991     vst1q_u32(&state[4], STATE1);
992 }
993 #endif  // CRYPTOPP_ARM_SHA2_AVAILABLE
994 
995 ///////////////////////////////////////////////////////////
996 // end of Walton, Schneiders, O'Rourke and Hovsmith code //
997 ///////////////////////////////////////////////////////////
998 
999 // ***************** Power8 SHA ********************
1000 
1001 //////////////////////////////////////////////////
1002 // start Gustavo, Serra, Scalet and Walton code //
1003 //////////////////////////////////////////////////
1004 
1005 #if CRYPTOPP_POWER8_SHA_AVAILABLE
1006 
1007 // Indexes into the S[] array
1008 enum {A=0, B=1, C, D, E, F, G, H};
1009 
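// VecLoad32 loads four message words and, on little-endian targets,
// reverses the bytes within each 32-bit lane so the big-endian SHA-256
// message words arrive in the expected order.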
1010 inline
uint32x4_p VecLoad32(const word32* data, int offset)
1012 {
1013 #if (CRYPTOPP_LITTLE_ENDIAN)
1014     const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
1015     const uint32x4_p val = VecLoad(offset, data);
1016     return (uint32x4_p)VecPermute(val, val, mask);
1017 #else
1018     return VecLoad(offset, data);
1019 #endif
1020 }
1021 
1022 template<class T> inline
void VecStore32(const T data, word32 dest[4])
1024 {
1025     VecStore(data, dest);
1026 }
1027 
1028 inline
uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1030 {
1031     // The trick below is due to Andy Polyakov and Jack Lloyd
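    // Ch(x,y,z) = (x & y) ^ (~x & z), which is exactly vec_sel(z, y, x)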
1032     return vec_sel(z,y,x);
1033 }
1034 
1035 inline
uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1037 {
1038     // The trick below is due to Andy Polyakov and Jack Lloyd
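    // Maj(x,y,z) equals z where x and y differ, and x (== y) elsewhere,
    // hence vec_sel(y, z, x ^ y)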
1039     return vec_sel(y, z, VecXor(x, y));
1040 }
1041 
1042 inline
uint32x4_p Vector_sigma0(const uint32x4_p val)
1044 {
1045     return VecSHA256<0,0>(val);
1046 }
1047 
1048 inline
uint32x4_p Vector_sigma1(const uint32x4_p val)
1050 {
1051     return VecSHA256<0,0xf>(val);
1052 }
1053 
1054 inline
uint32x4_p VectorSigma0(const uint32x4_p val)
1056 {
1057     return VecSHA256<1,0>(val);
1058 }
1059 
1060 inline
uint32x4_p VectorSigma1(const uint32x4_p val)
1062 {
1063     return VecSHA256<1,0xf>(val);
1064 }
1065 
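// Repack the four working variables (one per vector in S[]) into a single
// vector for the final state additions.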
1066 inline
uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
1068                        const uint32x4_p c, const uint32x4_p d)
1069 {
1070     const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1071     const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1072     return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
1073 }
1074 
1075 template <unsigned int R> inline
void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
1077 {
1078     uint32x4_p T1, T2;
1079 
1080     W[R] = M;
1081     T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1082     T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1083 
1084     S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1085     S[E] = S[D] + T1;
1086     S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1087     S[A] = T1 + T2;
1088 }
1089 
1090 template <unsigned int R> inline
void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
1092 {
1093     // Indexes into the W[] array
1094     enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
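    // W[] is used as a 16-entry circular message schedule: IDX0, IDX1, IDX9
    // and IDX14 address W[t-16], W[t-15], W[t-7] and W[t-2].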
1095 
1096     const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
1097     const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
1098 
1099     uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1100     T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1101     uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1102 
1103     S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1104     S[E] = S[D] + T1;
1105     S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1106     S[A] = T1 + T2;
1107 }
1108 
void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1110 {
1111     CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1112     CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1113     CRYPTOPP_UNUSED(order);
1114 
1115     const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1116     const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1117 
1118     uint32x4_p abcd = VecLoad(state+0);
1119     uint32x4_p efgh = VecLoad(state+4);
1120     uint32x4_p W[16], S[8], vm, vk;
1121 
1122     size_t blocks = length / SHA256::BLOCKSIZE;
1123     while (blocks--)
1124     {
1125         unsigned int offset=0;
1126 
1127         S[A] = abcd; S[E] = efgh;
1128         S[B] = VecShiftLeftOctet<4>(S[A]);
1129         S[F] = VecShiftLeftOctet<4>(S[E]);
1130         S[C] = VecShiftLeftOctet<4>(S[B]);
1131         S[G] = VecShiftLeftOctet<4>(S[F]);
1132         S[D] = VecShiftLeftOctet<4>(S[C]);
1133         S[H] = VecShiftLeftOctet<4>(S[G]);
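        // Each S[x] now carries one working variable; the octet shifts peel
        // B..D and F..H out of the packed abcd and efgh vectors.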
1134 
        // Rounds 0-15
1136         vk = VecLoad(offset, k);
1137         vm = VecLoad32(m, offset);
1138         SHA256_ROUND1<0>(W,S, vk,vm);
1139         offset+=16;
1140 
1141         vk = VecShiftLeftOctet<4>(vk);
1142         vm = VecShiftLeftOctet<4>(vm);
1143         SHA256_ROUND1<1>(W,S, vk,vm);
1144 
1145         vk = VecShiftLeftOctet<4>(vk);
1146         vm = VecShiftLeftOctet<4>(vm);
1147         SHA256_ROUND1<2>(W,S, vk,vm);
1148 
1149         vk = VecShiftLeftOctet<4>(vk);
1150         vm = VecShiftLeftOctet<4>(vm);
1151         SHA256_ROUND1<3>(W,S, vk,vm);
1152 
1153         vk = VecLoad(offset, k);
1154         vm = VecLoad32(m, offset);
1155         SHA256_ROUND1<4>(W,S, vk,vm);
1156         offset+=16;
1157 
1158         vk = VecShiftLeftOctet<4>(vk);
1159         vm = VecShiftLeftOctet<4>(vm);
1160         SHA256_ROUND1<5>(W,S, vk,vm);
1161 
1162         vk = VecShiftLeftOctet<4>(vk);
1163         vm = VecShiftLeftOctet<4>(vm);
1164         SHA256_ROUND1<6>(W,S, vk,vm);
1165 
1166         vk = VecShiftLeftOctet<4>(vk);
1167         vm = VecShiftLeftOctet<4>(vm);
1168         SHA256_ROUND1<7>(W,S, vk,vm);
1169 
1170         vk = VecLoad(offset, k);
1171         vm = VecLoad32(m, offset);
1172         SHA256_ROUND1<8>(W,S, vk,vm);
1173         offset+=16;
1174 
1175         vk = VecShiftLeftOctet<4>(vk);
1176         vm = VecShiftLeftOctet<4>(vm);
1177         SHA256_ROUND1<9>(W,S, vk,vm);
1178 
1179         vk = VecShiftLeftOctet<4>(vk);
1180         vm = VecShiftLeftOctet<4>(vm);
1181         SHA256_ROUND1<10>(W,S, vk,vm);
1182 
1183         vk = VecShiftLeftOctet<4>(vk);
1184         vm = VecShiftLeftOctet<4>(vm);
1185         SHA256_ROUND1<11>(W,S, vk,vm);
1186 
1187         vk = VecLoad(offset, k);
1188         vm = VecLoad32(m, offset);
1189         SHA256_ROUND1<12>(W,S, vk,vm);
1190         offset+=16;
1191 
1192         vk = VecShiftLeftOctet<4>(vk);
1193         vm = VecShiftLeftOctet<4>(vm);
1194         SHA256_ROUND1<13>(W,S, vk,vm);
1195 
1196         vk = VecShiftLeftOctet<4>(vk);
1197         vm = VecShiftLeftOctet<4>(vm);
1198         SHA256_ROUND1<14>(W,S, vk,vm);
1199 
1200         vk = VecShiftLeftOctet<4>(vk);
1201         vm = VecShiftLeftOctet<4>(vm);
1202         SHA256_ROUND1<15>(W,S, vk,vm);
1203 
1204         m += 16; // 32-bit words, not bytes
1205 
        // Rounds 16-63
1207         for (unsigned int i=16; i<64; i+=16)
1208         {
1209             vk = VecLoad(offset, k);
1210             SHA256_ROUND2<0>(W,S, vk);
1211             SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
1212             SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
1213             SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
1214             offset+=16;
1215 
1216             vk = VecLoad(offset, k);
1217             SHA256_ROUND2<4>(W,S, vk);
1218             SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
1219             SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
1220             SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
1221             offset+=16;
1222 
1223             vk = VecLoad(offset, k);
1224             SHA256_ROUND2<8>(W,S, vk);
1225             SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
1226             SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
1227             SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
1228             offset+=16;
1229 
1230             vk = VecLoad(offset, k);
1231             SHA256_ROUND2<12>(W,S, vk);
1232             SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
1233             SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
1234             SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
1235             offset+=16;
1236         }
1237 
1238         abcd += VectorPack(S[A],S[B],S[C],S[D]);
1239         efgh += VectorPack(S[E],S[F],S[G],S[H]);
1240     }
1241 
1242     VecStore32(abcd, state+0);
1243     VecStore32(efgh, state+4);
1244 }
1245 
1246 inline
void VecStore64(const uint64x2_p val, word64* data)
1248 {
1249     VecStore(val, data);
1250 }
1251 
1252 inline
uint64x2_p VecLoad64(const word64* data, int offset)
1254 {
1255 #if (CRYPTOPP_LITTLE_ENDIAN)
1256     const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1257     return VecPermute(VecLoad(offset, data), mask);
1258 #else
1259     return VecLoad(offset, data);
1260 #endif
1261 }
1262 
1263 inline
uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1265 {
1266     // The trick below is due to Andy Polyakov and Jack Lloyd
1267     return vec_sel(z,y,x);
1268 }
1269 
1270 inline
uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1272 {
1273     // The trick below is due to Andy Polyakov and Jack Lloyd
1274     return vec_sel(y, z, VecXor(x, y));
1275 }
1276 
1277 inline
uint64x2_p Vector_sigma0(const uint64x2_p val)
1279 {
1280     return VecSHA512<0,0>(val);
1281 }
1282 
1283 inline
uint64x2_p Vector_sigma1(const uint64x2_p val)
1285 {
1286     return VecSHA512<0,0xf>(val);
1287 }
1288 
1289 inline
uint64x2_p VectorSigma0(const uint64x2_p val)
1291 {
1292     return VecSHA512<1,0>(val);
1293 }
1294 
1295 inline
uint64x2_p VectorSigma1(const uint64x2_p val)
1297 {
1298     return VecSHA512<1,0xf>(val);
1299 }
1300 
1301 inline
uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
1303 {
1304     const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1305     return VecPermute(x,y,m);
1306 }
1307 
1308 template <unsigned int R> inline
void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
1310 {
1311     uint64x2_p T1, T2;
1312 
1313     W[R] = M;
1314     T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1315     T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1316 
1317     S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1318     S[E] = S[D] + T1;
1319     S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1320     S[A] = T1 + T2;
1321 }
1322 
1323 template <unsigned int R> inline
void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
1325 {
1326     // Indexes into the W[] array
1327     enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
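    // As in SHA256_ROUND2, W[] is a 16-entry circular schedule addressing
    // W[t-16], W[t-15], W[t-7] and W[t-2].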
1328 
1329     const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
1330     const uint64x2_p s1 = Vector_sigma1(W[IDX14]);
1331 
1332     uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1333     T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1334     uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1335 
1336     S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1337     S[E] = S[D] + T1;
1338     S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1339     S[A] = T1 + T2;
1340 }
1341 
void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1343 {
1344     CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1345     CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1346     CRYPTOPP_UNUSED(order);
1347 
1348     const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1349     const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1350 
1351     uint64x2_p ab = VecLoad(state+0);
1352     uint64x2_p cd = VecLoad(state+2);
1353     uint64x2_p ef = VecLoad(state+4);
1354     uint64x2_p gh = VecLoad(state+6);
1355     uint64x2_p W[16], S[8], vm, vk;
1356 
1357     size_t blocks = length / SHA512::BLOCKSIZE;
1358     while (blocks--)
1359     {
1360         unsigned int offset=0;
1361 
1362         S[A] = ab; S[C] = cd;
1363         S[E] = ef; S[G] = gh;
1364         S[B] = VecShiftLeftOctet<8>(S[A]);
1365         S[D] = VecShiftLeftOctet<8>(S[C]);
1366         S[F] = VecShiftLeftOctet<8>(S[E]);
1367         S[H] = VecShiftLeftOctet<8>(S[G]);
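        // Each S[x] carries one 64-bit working variable, peeled out of the
        // packed ab/cd/ef/gh vectors with 8-octet shifts.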
1368 
        // Rounds 0-15
1370         vk = VecLoad(offset, k);
1371         vm = VecLoad64(m, offset);
1372         SHA512_ROUND1<0>(W,S, vk,vm);
1373         offset+=16;
1374 
1375         vk = VecShiftLeftOctet<8>(vk);
1376         vm = VecShiftLeftOctet<8>(vm);
1377         SHA512_ROUND1<1>(W,S, vk,vm);
1378 
1379         vk = VecLoad(offset, k);
1380         vm = VecLoad64(m, offset);
1381         SHA512_ROUND1<2>(W,S, vk,vm);
1382         offset+=16;
1383 
1384         vk = VecShiftLeftOctet<8>(vk);
1385         vm = VecShiftLeftOctet<8>(vm);
1386         SHA512_ROUND1<3>(W,S, vk,vm);
1387 
1388         vk = VecLoad(offset, k);
1389         vm = VecLoad64(m, offset);
1390         SHA512_ROUND1<4>(W,S, vk,vm);
1391         offset+=16;
1392 
1393         vk = VecShiftLeftOctet<8>(vk);
1394         vm = VecShiftLeftOctet<8>(vm);
1395         SHA512_ROUND1<5>(W,S, vk,vm);
1396 
1397         vk = VecLoad(offset, k);
1398         vm = VecLoad64(m, offset);
1399         SHA512_ROUND1<6>(W,S, vk,vm);
1400         offset+=16;
1401 
1402         vk = VecShiftLeftOctet<8>(vk);
1403         vm = VecShiftLeftOctet<8>(vm);
1404         SHA512_ROUND1<7>(W,S, vk,vm);
1405 
1406         vk = VecLoad(offset, k);
1407         vm = VecLoad64(m, offset);
1408         SHA512_ROUND1<8>(W,S, vk,vm);
1409         offset+=16;
1410 
1411         vk = VecShiftLeftOctet<8>(vk);
1412         vm = VecShiftLeftOctet<8>(vm);
1413         SHA512_ROUND1<9>(W,S, vk,vm);
1414 
1415         vk = VecLoad(offset, k);
1416         vm = VecLoad64(m, offset);
1417         SHA512_ROUND1<10>(W,S, vk,vm);
1418         offset+=16;
1419 
1420         vk = VecShiftLeftOctet<8>(vk);
1421         vm = VecShiftLeftOctet<8>(vm);
1422         SHA512_ROUND1<11>(W,S, vk,vm);
1423 
1424         vk = VecLoad(offset, k);
1425         vm = VecLoad64(m, offset);
1426         SHA512_ROUND1<12>(W,S, vk,vm);
1427         offset+=16;
1428 
1429         vk = VecShiftLeftOctet<8>(vk);
1430         vm = VecShiftLeftOctet<8>(vm);
1431         SHA512_ROUND1<13>(W,S, vk,vm);
1432 
1433         vk = VecLoad(offset, k);
1434         vm = VecLoad64(m, offset);
1435         SHA512_ROUND1<14>(W,S, vk,vm);
1436         offset+=16;
1437 
1438         vk = VecShiftLeftOctet<8>(vk);
1439         vm = VecShiftLeftOctet<8>(vm);
1440         SHA512_ROUND1<15>(W,S, vk,vm);
1441 
1442         m += 16; // 64-bit words, not bytes
1443 
        // Rounds 16-79
1445         for (unsigned int i=16; i<80; i+=16)
1446         {
1447             vk = VecLoad(offset, k);
1448             SHA512_ROUND2<0>(W,S, vk);
1449             SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
1450             offset+=16;
1451 
1452             vk = VecLoad(offset, k);
1453             SHA512_ROUND2<2>(W,S, vk);
1454             SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
1455             offset+=16;
1456 
1457             vk = VecLoad(offset, k);
1458             SHA512_ROUND2<4>(W,S, vk);
1459             SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
1460             offset+=16;
1461 
1462             vk = VecLoad(offset, k);
1463             SHA512_ROUND2<6>(W,S, vk);
1464             SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
1465             offset+=16;
1466 
1467             vk = VecLoad(offset, k);
1468             SHA512_ROUND2<8>(W,S, vk);
1469             SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
1470             offset+=16;
1471 
1472             vk = VecLoad(offset, k);
1473             SHA512_ROUND2<10>(W,S, vk);
1474             SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
1475             offset+=16;
1476 
1477             vk = VecLoad(offset, k);
1478             SHA512_ROUND2<12>(W,S, vk);
1479             SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
1480             offset+=16;
1481 
1482             vk = VecLoad(offset, k);
1483             SHA512_ROUND2<14>(W,S, vk);
1484             SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
1485             offset+=16;
1486         }
1487 
1488         ab += VectorPack(S[A],S[B]);
1489         cd += VectorPack(S[C],S[D]);
1490         ef += VectorPack(S[E],S[F]);
1491         gh += VectorPack(S[G],S[H]);
1492     }
1493 
1494     VecStore64(ab, state+0);
1495     VecStore64(cd, state+2);
1496     VecStore64(ef, state+4);
1497     VecStore64(gh, state+6);
1498 }
1499 
1500 #endif  // CRYPTOPP_POWER8_SHA_AVAILABLE
1501 
1502 ////////////////////////////////////////////////
1503 // end Gustavo, Serra, Scalet and Walton code //
1504 ////////////////////////////////////////////////
1505 
1506 NAMESPACE_END
1507