1 2 #define BLAKE2_USE_SSSE3 3 #define BLAKE2_USE_SSE41 4 5 #include <stdint.h> 6 #include <string.h> 7 8 #include "blake2.h" 9 #include "private/common.h" 10 #include "private/sse2_64_32.h" 11 12 #if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && \ 13 defined(HAVE_SMMINTRIN_H) 14 15 # ifdef __GNUC__ 16 # pragma GCC target("sse2") 17 # pragma GCC target("ssse3") 18 # pragma GCC target("sse4.1") 19 # endif 20 21 # include <emmintrin.h> 22 # include <smmintrin.h> 23 # include <tmmintrin.h> 24 25 # include "blake2b-compress-sse41.h" 26 27 CRYPTO_ALIGN(64) 28 static const uint64_t blake2b_IV[8] = { 29 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, 30 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 31 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL 32 }; 33 34 int 35 blake2b_compress_sse41(blake2b_state *S, 36 const uint8_t block[BLAKE2B_BLOCKBYTES]) 37 { 38 __m128i row1l, row1h; 39 __m128i row2l, row2h; 40 __m128i row3l, row3h; 41 __m128i row4l, row4h; 42 __m128i b0, b1; 43 __m128i t0, t1; 44 const __m128i r16 = 45 _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); 46 const __m128i r24 = 47 _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); 48 const __m128i m0 = LOADU(block + 00); 49 const __m128i m1 = LOADU(block + 16); 50 const __m128i m2 = LOADU(block + 32); 51 const __m128i m3 = LOADU(block + 48); 52 const __m128i m4 = LOADU(block + 64); 53 const __m128i m5 = LOADU(block + 80); 54 const __m128i m6 = LOADU(block + 96); 55 const __m128i m7 = LOADU(block + 112); 56 row1l = LOADU(&S->h[0]); 57 row1h = LOADU(&S->h[2]); 58 row2l = LOADU(&S->h[4]); 59 row2h = LOADU(&S->h[6]); 60 row3l = LOADU(&blake2b_IV[0]); 61 row3h = LOADU(&blake2b_IV[2]); 62 row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); 63 row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); 64 ROUND(0); 65 ROUND(1); 66 ROUND(2); 67 ROUND(3); 68 ROUND(4); 69 ROUND(5); 70 ROUND(6); 71 ROUND(7); 72 ROUND(8); 73 ROUND(9); 74 ROUND(10); 75 ROUND(11); 76 row1l = _mm_xor_si128(row3l, row1l); 77 row1h = _mm_xor_si128(row3h, row1h); 78 STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); 79 STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); 80 row2l = _mm_xor_si128(row4l, row2l); 81 row2h = _mm_xor_si128(row4h, row2h); 82 STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); 83 STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); 84 return 0; 85 } 86 87 #endif 88