10ac341f1SConrad Meyer 
20ac341f1SConrad Meyer #define BLAKE2_USE_SSSE3
30ac341f1SConrad Meyer #define BLAKE2_USE_SSE41
40ac341f1SConrad Meyer #define BLAKE2_USE_AVX2
50ac341f1SConrad Meyer 
60ac341f1SConrad Meyer #include <stdint.h>
70ac341f1SConrad Meyer #include <string.h>
80ac341f1SConrad Meyer 
90ac341f1SConrad Meyer #include "blake2.h"
100ac341f1SConrad Meyer #include "private/common.h"
110ac341f1SConrad Meyer #include "private/sse2_64_32.h"
120ac341f1SConrad Meyer 
/*
 * Everything below is compiled only when the build has detected all four
 * intrinsic headers (SSE2, SSSE3, SSE4.1 and AVX2).
 */
#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
    defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)

/* Enable the required instruction sets for the rest of this file on GCC-compatible compilers. */
# ifdef __GNUC__
#  pragma GCC target("sse2")
#  pragma GCC target("ssse3")
#  pragma GCC target("sse4.1")
#  pragma GCC target("avx2")
# endif

# include <emmintrin.h>
# include <immintrin.h>
# include <smmintrin.h>
# include <tmmintrin.h>

# include "blake2b-compress-avx2.h"

/*
 * BLAKE2b initialization vector (RFC 7693, section 2.6), 64-byte aligned.
 *
 * NOTE(review): the table is not referenced by name in this file; presumably
 * the BLAKE2B_COMPRESS_V1 macro from blake2b-compress-avx2.h reads it at its
 * expansion site below - confirm against that header.
 */
CRYPTO_ALIGN(64)
static const uint64_t blake2b_IV[8] = {
    0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL,
    0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
    0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};

/*
 * Run one BLAKE2b compression round over `block` and fold it into `S`.
 *
 * S     - hash state; S->h[0..7] is read and overwritten, while S->t[0],
 *         S->t[1], S->f[0] and S->f[1] are passed to the compression macro.
 * block - one input block of BLAKE2B_BLOCKBYTES bytes.
 *
 * Always returns 0.
 */
int
blake2b_compress_avx2(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES])
{
    /* The eight 64-bit chaining words are handled as two 256-bit vectors. */
    __m256i a = LOADU(&S->h[0]);
    __m256i b = LOADU(&S->h[4]);

    /*
     * NOTE(review): `a` and `b` are stored back right after this statement
     * without any other assignment, so the macro is expected to update them
     * in place - confirm against blake2b-compress-avx2.h.
     */
    BLAKE2B_COMPRESS_V1(a, b, block, S->t[0], S->t[1], S->f[0], S->f[1]);

    STOREU(&S->h[0], a);
    STOREU(&S->h[4], b);

    return 0;
}

#endif