1 #ifndef blamka_round_avx512f_H 2 #define blamka_round_avx512f_H 3 4 #include "private/common.h" 5 #include "private/sse2_64_32.h" 6 7 #define ror64(x, n) _mm512_ror_epi64((x), (n)) 8 9 static inline __m512i 10 muladd(__m512i x, __m512i y) 11 { 12 __m512i z = _mm512_mul_epu32(x, y); 13 14 return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z)); 15 } 16 17 #define G1_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1) \ 18 do { \ 19 A0 = muladd(A0, B0); \ 20 A1 = muladd(A1, B1); \ 21 \ 22 D0 = _mm512_xor_si512(D0, A0); \ 23 D1 = _mm512_xor_si512(D1, A1); \ 24 \ 25 D0 = ror64(D0, 32); \ 26 D1 = ror64(D1, 32); \ 27 \ 28 C0 = muladd(C0, D0); \ 29 C1 = muladd(C1, D1); \ 30 \ 31 B0 = _mm512_xor_si512(B0, C0); \ 32 B1 = _mm512_xor_si512(B1, C1); \ 33 \ 34 B0 = ror64(B0, 24); \ 35 B1 = ror64(B1, 24); \ 36 } while ((void)0, 0) 37 38 #define G2_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1) \ 39 do { \ 40 A0 = muladd(A0, B0); \ 41 A1 = muladd(A1, B1); \ 42 \ 43 D0 = _mm512_xor_si512(D0, A0); \ 44 D1 = _mm512_xor_si512(D1, A1); \ 45 \ 46 D0 = ror64(D0, 16); \ 47 D1 = ror64(D1, 16); \ 48 \ 49 C0 = muladd(C0, D0); \ 50 C1 = muladd(C1, D1); \ 51 \ 52 B0 = _mm512_xor_si512(B0, C0); \ 53 B1 = _mm512_xor_si512(B1, C1); \ 54 \ 55 B0 = ror64(B0, 63); \ 56 B1 = ror64(B1, 63); \ 57 } while ((void)0, 0) 58 59 #define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ 60 do { \ 61 B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \ 62 B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \ 63 \ 64 C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ 65 C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ 66 \ 67 D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \ 68 D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \ 69 } while ((void)0, 0) 70 71 #define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \ 72 do { \ 73 B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \ 74 B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \ 75 \ 76 C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \ 77 C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \ 78 \ 79 D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \ 80 D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \ 81 } while ((void)0, 0) 82 83 #define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \ 84 do { \ 85 G1_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1); \ 86 G2_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1); \ 87 \ 88 DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ 89 \ 90 G1_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1); \ 91 G2_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1); \ 92 \ 93 UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \ 94 } while ((void)0, 0) 95 96 #define SWAP_HALVES(A0, A1) \ 97 do { \ 98 __m512i t0, t1; \ 99 t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \ 100 t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \ 101 A0 = t0; \ 102 A1 = t1; \ 103 } while((void)0, 0) 104 105 #define SWAP_QUARTERS(A0, A1) \ 106 do { \ 107 SWAP_HALVES(A0, A1); \ 108 A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \ 109 A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \ 110 } while((void)0, 0) 111 112 #define UNSWAP_QUARTERS(A0, A1) \ 113 do { \ 114 A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \ 115 A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \ 116 SWAP_HALVES(A0, A1); \ 117 } while((void)0, 0) 118 119 #define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \ 120 do { \ 121 SWAP_HALVES(A0, B0); \ 122 SWAP_HALVES(C0, D0); \ 123 SWAP_HALVES(A1, B1); \ 124 SWAP_HALVES(C1, D1); \ 125 BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \ 126 SWAP_HALVES(A0, B0); \ 127 SWAP_HALVES(C0, D0); \ 128 SWAP_HALVES(A1, B1); \ 129 SWAP_HALVES(C1, D1); \ 130 } while ((void)0, 0) 131 132 #define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \ 133 do { \ 134 SWAP_QUARTERS(A0, A1); \ 135 SWAP_QUARTERS(B0, B1); \ 136 SWAP_QUARTERS(C0, C1); \ 137 SWAP_QUARTERS(D0, D1); \ 138 BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \ 139 UNSWAP_QUARTERS(A0, A1); \ 140 UNSWAP_QUARTERS(B0, B1); \ 141 UNSWAP_QUARTERS(C0, C1); \ 142 UNSWAP_QUARTERS(D0, D1); \ 143 } while ((void)0, 0) 144 145 #endif 146