1 #ifndef blamka_round_avx512f_H
2 #define blamka_round_avx512f_H
3
4 #include "private/common.h"
5 #include "private/sse2_64_32.h"
6
/* Lane-wise rotate right of each 64-bit lane by n bits, using the
 * native AVX-512F rotate instruction. */
#define ror64(x, n) _mm512_ror_epi64((x), (n))
8
/*
 * BlaMka mixing primitive: for every 64-bit lane, computes
 *     x + y + 2 * (low32(x) * low32(y)).
 * _mm512_mul_epu32 multiplies only the low 32 bits of each 64-bit
 * lane, so the product term sees just the lower halves of x and y;
 * the doubling is done with a self-add rather than a shift.
 */
static inline __m512i
muladd(__m512i x, __m512i y)
{
    const __m512i xy  = _mm512_mul_epu32(x, y);
    const __m512i sum = _mm512_add_epi64(x, y);

    return _mm512_add_epi64(sum, _mm512_add_epi64(xy, xy));
}
16
/*
 * First half of the BLAKE2b-style quarter-round, run on two state
 * vectors (the "0" and "1" sets) in parallel.  Same structure as
 * BLAKE2b's G, except plain addition is replaced by the muladd()
 * mixer; this half performs the 32- and 24-bit rotations.
 */
#define G1_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1) \
    do {                                           \
        A0 = muladd(A0, B0);                       \
        A1 = muladd(A1, B1);                       \
                                                   \
        D0 = _mm512_xor_si512(D0, A0);             \
        D1 = _mm512_xor_si512(D1, A1);             \
                                                   \
        D0 = ror64(D0, 32);                        \
        D1 = ror64(D1, 32);                        \
                                                   \
        C0 = muladd(C0, D0);                       \
        C1 = muladd(C1, D1);                       \
                                                   \
        B0 = _mm512_xor_si512(B0, C0);             \
        B1 = _mm512_xor_si512(B1, C1);             \
                                                   \
        B0 = ror64(B0, 24);                        \
        B1 = ror64(B1, 24);                        \
    } while ((void)0, 0)
37
/*
 * Second half of the quarter-round: identical shape to G1_AVX512F but
 * with the 16- and 63-bit rotations.  G1 followed by G2 forms one
 * complete G application.
 */
#define G2_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1) \
    do {                                           \
        A0 = muladd(A0, B0);                       \
        A1 = muladd(A1, B1);                       \
                                                   \
        D0 = _mm512_xor_si512(D0, A0);             \
        D1 = _mm512_xor_si512(D1, A1);             \
                                                   \
        D0 = ror64(D0, 16);                        \
        D1 = ror64(D1, 16);                        \
                                                   \
        C0 = muladd(C0, D0);                       \
        C1 = muladd(C1, D1);                       \
                                                   \
        B0 = _mm512_xor_si512(B0, C0);             \
        B1 = _mm512_xor_si512(B1, C1);             \
                                                   \
        B0 = ror64(B0, 63);                        \
        B1 = ror64(B1, 63);                        \
    } while ((void)0, 0)
58
/*
 * Rotate the B, C and D rows so the next G1/G2 pass mixes diagonals
 * instead of columns.  _mm512_permutex_epi64 permutes 64-bit lanes
 * independently within each 256-bit half: with these shuffle controls,
 * lane i of B receives lane (i+1)&3, C receives (i+2)&3, and D
 * receives (i+3)&3 of its half.
 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)             \
    do {                                                        \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
                                                                \
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
                                                                \
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while ((void)0, 0)
70
/*
 * Exact inverse of DIAGONALIZE: the shuffle controls for B and D are
 * swapped relative to that macro (C's control is self-inverse), so
 * each row returns to its column-aligned position.
 */
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1)           \
    do {                                                        \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
                                                                \
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
                                                                \
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while ((void)0, 0)
82
/*
 * One full BLAKE2b-style round over two parallel 4x4 states:
 * a column step (G1 + G2), a diagonalization, a diagonal step
 * (G1 + G2 again), then restoration of the column layout.
 */
#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1)  \
    do {                                              \
        G1_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1);   \
        G2_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1);   \
                                                      \
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1);  \
                                                      \
        G1_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1);   \
        G2_AVX512F(A0, B0, C0, D0, A1, B1, C1, D1);   \
                                                      \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
    } while ((void)0, 0)
95
/*
 * Exchange 256-bit halves between A0 and A1.  _mm512_shuffle_i64x2
 * selects 128-bit blocks: control (1, 0, 1, 0) gathers blocks 0-1 of
 * each source (the low halves) into t0, and (3, 2, 3, 2) gathers
 * blocks 2-3 (the high halves) into t1.  Applying the macro twice is
 * the identity.
 */
#define SWAP_HALVES(A0, A1)                                          \
    do {                                                             \
        __m512i t0, t1;                                              \
        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0));  \
        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2));  \
        A0 = t0;                                                     \
        A1 = t1;                                                     \
    } while((void)0, 0)
104
/*
 * Regroup 128-bit quarters across A0/A1: after the half swap, the
 * permutation reorders each vector's 64-bit lanes to
 * (0, 1, 4, 5, 2, 3, 6, 7), interleaving the 128-bit quarters of the
 * two original inputs.  Reversed exactly by UNSWAP_QUARTERS.
 */
#define SWAP_QUARTERS(A0, A1)                                                     \
    do {                                                                          \
        SWAP_HALVES(A0, A1);                                                      \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
    } while((void)0, 0)
111
/*
 * Inverse of SWAP_QUARTERS: the lane permutation (0, 1, 4, 5, 2, 3,
 * 6, 7) is self-inverse, so it is applied first, then the half swap
 * restores the original A0/A1 layout.
 */
#define UNSWAP_QUARTERS(A0, A1)                                                   \
    do {                                                                          \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
        SWAP_HALVES(A0, A1);                                                      \
    } while((void)0, 0)
118
/*
 * Round variant that first swaps 256-bit halves between each vector
 * pair, runs BLAKE2_ROUND, then swaps back.  NOTE(review): the
 * parameters deliberately arrive as (A0, C0, B0, D0, ...) — the
 * caller's B and C operands are exchanged relative to BLAKE2_ROUND,
 * presumably to match the in-memory block layout; confirm against the
 * call sites.
 */
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1)  \
    do {                                                \
        SWAP_HALVES(A0, B0);                            \
        SWAP_HALVES(C0, D0);                            \
        SWAP_HALVES(A1, B1);                            \
        SWAP_HALVES(C1, D1);                            \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1);   \
        SWAP_HALVES(A0, B0);                            \
        SWAP_HALVES(C0, D0);                            \
        SWAP_HALVES(A1, B1);                            \
        SWAP_HALVES(C1, D1);                            \
    } while ((void)0, 0)
131
/*
 * Round variant that regroups 128-bit quarters across each vector
 * pair before the round and undoes the regrouping afterwards, so the
 * round operates on a different interleaving of the state than
 * BLAKE2_ROUND_1.
 */
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1)  \
    do {                                                \
        SWAP_QUARTERS(A0, A1);                          \
        SWAP_QUARTERS(B0, B1);                          \
        SWAP_QUARTERS(C0, C1);                          \
        SWAP_QUARTERS(D0, D1);                          \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1);   \
        UNSWAP_QUARTERS(A0, A1);                        \
        UNSWAP_QUARTERS(B0, B1);                        \
        UNSWAP_QUARTERS(C0, C1);                        \
        UNSWAP_QUARTERS(D0, D1);                        \
    } while ((void)0, 0)
144
145 #endif
146