// shacal2_simd.cpp - written and placed in the public domain by
2 //                    Jeffrey Walton and Jack Lloyd
3 //
4 //    Jack Lloyd and the Botan team allowed Crypto++ to use parts of
5 //    Botan's implementation under the same license as Crypto++
6 //    is released. The code for SHACAL2_Enc_ProcessAndXorBlock_SHANI
7 //    below is Botan's x86_encrypt_blocks with minor tweaks. Many thanks
8 //    to the Botan team. Also see http://github.com/randombit/botan/.
9 //
10 //    This source file uses intrinsics to gain access to SHA-NI and
11 //    ARMv8a SHA instructions. A separate source file is needed because
12 //    additional CXXFLAGS are required to enable the appropriate instruction
13 //    sets in some build configurations.
14 
15 #include "pch.h"
16 #include "config.h"
17 #include "sha.h"
18 #include "misc.h"
19 
20 #if (CRYPTOPP_SHANI_AVAILABLE)
21 # include <nmmintrin.h>
22 # include <immintrin.h>
23 #endif
24 
// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
// Both macros convert an arbitrary pointer to a (const) __m128i pointer by
// going through (const) void* first. They are used below to form the pointer
// arguments of the SSE load and store intrinsics.
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
28 
// Squash MS LNK4221 and libtool warnings
// This symbol is defined outside the CRYPTOPP_SHANI_AVAILABLE guard, so the
// translation unit still exports something when the SHA-NI code is compiled out.
extern const char SHACAL2_SIMD_FNAME[] = __FILE__;
31 
32 NAMESPACE_BEGIN(CryptoPP)
33 
34 #if CRYPTOPP_SHANI_AVAILABLE
35 void SHACAL2_Enc_ProcessAndXorBlock_SHANI(const word32* subKeys, const byte *inBlock, const byte *xorBlock, byte *outBlock)
36 {
37 	CRYPTOPP_ASSERT(subKeys);
38 	CRYPTOPP_ASSERT(inBlock);
39 	CRYPTOPP_ASSERT(outBlock);
40 
41 	const __m128i MASK1 = _mm_set_epi8(8,9,10,11,  12,13,14,15,  0,1,2,3,  4,5,6,7);
42 	const __m128i MASK2 = _mm_set_epi8(0,1,2,3,  4,5,6,7,  8,9,10,11,  12,13,14,15);
43 
44 	__m128i B0 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 0)), MASK1);
45 	__m128i B1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 16)), MASK2);
46 
47 	__m128i TMP = _mm_alignr_epi8(B0, B1, 8);
48 	B1 = _mm_blend_epi16(B1, B0, 0xF0);
49 	B0 = TMP;
50 
51 #if 0
52 	// SSE2 + SSSE3, but 0.2 cpb slower on a Celeraon J3455
53 	const __m128i MASK1 = _mm_set_epi8(8,9,10,11,  12,13,14,15,  0,1,2,3,  4,5,6,7);
54 	const __m128i MASK2 = _mm_set_epi8(0,1,2,3,  4,5,6,7,  8,9,10,11,  12,13,14,15);
55 
56 	__m128i B0 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 0));
57 	__m128i B1 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 16));
58 
59 	__m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2);
60 	B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2);
61 	B0 = TMP;
62 #endif
63 
64 	const byte* keys = reinterpret_cast<const byte*>(subKeys);
65 	for (size_t i = 0; i != 8; ++i)
66 	{
67 		const __m128i RK0 = _mm_load_si128(CONST_M128_CAST(keys + 32*i));
68 		const __m128i RK2 = _mm_load_si128(CONST_M128_CAST(keys + 32*i+16));
69 		const __m128i RK1 = _mm_srli_si128(RK0, 8);
70 		const __m128i RK3 = _mm_srli_si128(RK2, 8);
71 
72 		B1 = _mm_sha256rnds2_epu32(B1, B0, RK0);
73 		B0 = _mm_sha256rnds2_epu32(B0, B1, RK1);
74 		B1 = _mm_sha256rnds2_epu32(B1, B0, RK2);
75 		B0 = _mm_sha256rnds2_epu32(B0, B1, RK3);
76 	}
77 
78 	TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1);
79 	B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1);
80 	B0 = TMP;
81 
82 	if (xorBlock)
83 	{
84 		_mm_storeu_si128(M128_CAST(outBlock + 0),
85 			_mm_xor_si128(B0, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 0))));
86 
87 		_mm_storeu_si128(M128_CAST(outBlock + 16),
88 			_mm_xor_si128(B1, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 16))));
89 	}
90 	else
91 	{
92 		_mm_storeu_si128(M128_CAST(outBlock + 0), B0);
93 		_mm_storeu_si128(M128_CAST(outBlock + 16), B1);
94 	}
95 }
96 #endif
97 
98 NAMESPACE_END
99