// shacal2_simd.cpp - written and placed in the public domain by
// Jeffrey Walton and Jack Lloyd
//
// Jack Lloyd and the Botan team allowed Crypto++ to use parts of
// Botan's implementation under the same license as Crypto++
// is released. The code for SHACAL2_Enc_ProcessAndXorBlock_SHANI
// below is Botan's x86_encrypt_blocks with minor tweaks. Many thanks
// to the Botan team. Also see http://github.com/randombit/botan/.
//
// This source file uses intrinsics to gain access to SHA-NI and
// ARMv8a SHA instructions. A separate source file is needed because
// additional CXXFLAGS are required to enable the appropriate instruction
// sets in some build configurations.

#include "pch.h"
#include "config.h"
#include "sha.h"
#include "misc.h"

#if (CRYPTOPP_SHANI_AVAILABLE)
# include <nmmintrin.h>
# include <immintrin.h>
#endif

// Clang intrinsic casts, http://bugs.llvm.org/show_bug.cgi?id=20670
#define M128_CAST(x) ((__m128i *)(void *)(x))
#define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))

// Squash MS LNK4221 and libtool warnings
extern const char SHACAL2_SIMD_FNAME[] = __FILE__;

NAMESPACE_BEGIN(CryptoPP)

#if CRYPTOPP_SHANI_AVAILABLE
// Encrypt one 32-byte SHACAL-2 block using the x86 SHA-NI instructions
// (the SHACAL-2 round function is the SHA-256 round function, so the
// _mm_sha256rnds2_epu32 instruction performs the cipher rounds directly).
//
//   subKeys  - the schedule of 32-bit round keys. Read below with aligned
//              loads (_mm_load_si128), so the buffer must be 16-byte
//              aligned -- NOTE(review): alignment is assumed here; confirm
//              the caller allocates the schedule aligned.
//   inBlock  - 32-byte input block (big-endian words).
//   xorBlock - optional 32-byte mask XORed into the output; may be NULL
//              (only asserted for subKeys/inBlock/outBlock below).
//   outBlock - receives the 32-byte result; unaligned stores are used.
void SHACAL2_Enc_ProcessAndXorBlock_SHANI(const word32* subKeys, const byte *inBlock, const byte *xorBlock, byte *outBlock)
{
    CRYPTOPP_ASSERT(subKeys);
    CRYPTOPP_ASSERT(inBlock);
    CRYPTOPP_ASSERT(outBlock);

    // Shuffle masks for _mm_shuffle_epi8. Note _mm_set_epi8 lists bytes
    // most-significant first. They byte-swap the big-endian input words
    // and swap 64-bit halves to build the two-lane state layout that
    // sha256rnds2 operates on.
    const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
    const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);

    __m128i B0 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 0)), MASK1);
    __m128i B1 = _mm_shuffle_epi8(_mm_loadu_si128(CONST_M128_CAST(inBlock + 16)), MASK2);

    // Interleave the 64-bit halves of B0/B1 into the state ordering that
    // _mm_sha256rnds2_epu32 expects; the inverse interleave is applied
    // after the round loop before storing.
    __m128i TMP = _mm_alignr_epi8(B0, B1, 8);
    B1 = _mm_blend_epi16(B1, B0, 0xF0);
    B0 = TMP;

#if 0
    // Disabled alternative input transform:
    // SSE2 + SSSE3, but 0.2 cpb slower on a Celeron J3455
    const __m128i MASK1 = _mm_set_epi8(8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7);
    const __m128i MASK2 = _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15);

    __m128i B0 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 0));
    __m128i B1 = _mm_loadu_si128(CONST_M128_CAST(inBlock + 16));

    __m128i TMP = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK2);
    B1 = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK2);
    B0 = TMP;
#endif

    // 64 rounds total: 8 iterations, four sha256rnds2 per iteration, two
    // rounds per sha256rnds2 (per the Intel SHA extensions). Each call
    // consumes two round keys from the low 64 bits of its third operand,
    // so the high key pair of each aligned load is shifted down with
    // _mm_srli_si128. The two state halves ping-pong through B0/B1.
    const byte* keys = reinterpret_cast<const byte*>(subKeys);
    for (size_t i = 0; i != 8; ++i)
    {
        const __m128i RK0 = _mm_load_si128(CONST_M128_CAST(keys + 32*i));
        const __m128i RK2 = _mm_load_si128(CONST_M128_CAST(keys + 32*i+16));
        const __m128i RK1 = _mm_srli_si128(RK0, 8);
        const __m128i RK3 = _mm_srli_si128(RK2, 8);

        B1 = _mm_sha256rnds2_epu32(B1, B0, RK0);
        B0 = _mm_sha256rnds2_epu32(B0, B1, RK1);
        B1 = _mm_sha256rnds2_epu32(B1, B0, RK2);
        B0 = _mm_sha256rnds2_epu32(B0, B1, RK3);
    }

    // Undo the state interleave and restore the original byte order
    // before writing the result out.
    TMP = _mm_shuffle_epi8(_mm_unpackhi_epi64(B0, B1), MASK1);
    B1 = _mm_shuffle_epi8(_mm_unpacklo_epi64(B0, B1), MASK1);
    B0 = TMP;

    // A non-NULL xorBlock is XORed into the ciphertext before storing
    // (used by chaining cipher modes); otherwise store the raw result.
    if (xorBlock)
    {
        _mm_storeu_si128(M128_CAST(outBlock + 0),
            _mm_xor_si128(B0, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 0))));

        _mm_storeu_si128(M128_CAST(outBlock + 16),
            _mm_xor_si128(B1, _mm_loadu_si128(CONST_M128_CAST(xorBlock + 16))));
    }
    else
    {
        _mm_storeu_si128(M128_CAST(outBlock + 0), B0);
        _mm_storeu_si128(M128_CAST(outBlock + 16), B1);
    }
}
#endif

NAMESPACE_END