1
2 #ifndef blake2b_compress_avx2_H
3 #define blake2b_compress_avx2_H
4
/* 128-bit (SSE2) aligned/unaligned load and store helpers.  The load
 * macros cast through `const void *` so they accept read-only input
 * buffers without discarding qualifiers (the original `(__m128i *)`
 * cast silently stripped `const`); the intermediate `void *` also
 * avoids -Wcast-align noise on byte pointers. */
#define LOAD128(p) _mm_load_si128((const __m128i *) (const void *) (p))
#define STORE128(p, r) _mm_store_si128((__m128i *) (void *) (p), r)

#define LOADU128(p) _mm_loadu_si128((const __m128i *) (const void *) (p))
#define STOREU128(p, r) _mm_storeu_si128((__m128i *) (void *) (p), r)

/* 256-bit (AVX2) counterparts, same const-correctness treatment. */
#define LOAD(p) _mm256_load_si256((const __m256i *) (const void *) (p))
#define STORE(p, r) _mm256_store_si256((__m256i *) (void *) (p), r)

#define LOADU(p) _mm256_loadu_si256((const __m256i *) (const void *) (p))
#define STOREU(p, r) _mm256_storeu_si256((__m256i *) (void *) (p), r)
16
/* Read one 64-bit word (host byte order) from a possibly unaligned
 * address.  Going through memcpy avoids the strict-aliasing and
 * misaligned-access undefined behavior of a direct pointer cast. */
static inline uint64_t
LOADU64(const void *p)
{
    uint64_t w = 0U;

    memcpy(&w, p, sizeof w);

    return w;
}
24
/* Byte-shuffle masks for _mm256_shuffle_epi8.  Within each 64-bit lane,
 * result byte i is taken from source byte (i + k) mod 8, i.e. every
 * 64-bit word is rotated right by k*8 bits (k = 2 for ROTATE16, k = 3
 * for ROTATE24).  The 16-byte pattern is repeated so both 128-bit
 * halves of the 256-bit register are treated alike. */
#define ROTATE16 \
    _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, \
                     3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)

#define ROTATE24 \
    _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, \
                     4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)

/* Lane-wise 64-bit arithmetic/logic on four words at a time. */
#define ADD(a, b) _mm256_add_epi64(a, b)
#define SUB(a, b) _mm256_sub_epi64(a, b)

#define XOR(a, b) _mm256_xor_si256(a, b)
#define AND(a, b) _mm256_and_si256(a, b)
#define OR(a, b) _mm256_or_si256(a, b)

/* Rotate each 64-bit lane right by the given bit count.
 * ROT32 swaps the two 32-bit halves of every lane; ROT24/ROT16 use the
 * byte shuffles defined above; ROT63 cannot be expressed as a byte
 * shuffle, so it is built as (x >> 63) | (x << 1), with ADD(x, x)
 * standing in for the left shift by one. */
#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
44
/* First half of the BLAKE2b G function, applied to four columns (or
 * diagonals) at once:
 *   a += m; a += b; d = rotr64(d ^ a, 32); c += d; b = rotr64(b ^ c, 24)
 * `m` holds the four message words selected for this half-step. */
#define BLAKE2B_G1_V1(a, b, c, d, m) \
    do {                             \
        a = ADD(a, m);               \
        a = ADD(a, b);               \
        d = XOR(d, a);               \
        d = ROT32(d);                \
        c = ADD(c, d);               \
        b = XOR(b, c);               \
        b = ROT24(b);                \
    } while (0)
55
/* Second half of the BLAKE2b G function; identical shape to G1 but with
 * rotation counts 16 and 63:
 *   a += m; a += b; d = rotr64(d ^ a, 16); c += d; b = rotr64(b ^ c, 63) */
#define BLAKE2B_G2_V1(a, b, c, d, m) \
    do {                             \
        a = ADD(a, m);               \
        a = ADD(a, b);               \
        d = XOR(d, a);               \
        d = ROT16(d);                \
        c = ADD(c, d);               \
        b = XOR(b, c);               \
        b = ROT63(b);                \
    } while (0)
66
/* Rotate the b, c, d rows (by one, two and three 64-bit lanes
 * respectively) so that the next column-wise G application operates on
 * the state's diagonals.  `a` is untouched; it is accepted only to keep
 * the argument list uniform with the G macros. */
#define BLAKE2B_DIAG_V1(a, b, c, d)                               \
    do {                                                          \
        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while (0)

/* Inverse of BLAKE2B_DIAG_V1: each lane permutation is the exact
 * reverse rotation, restoring column order.  `a` again unused. */
#define BLAKE2B_UNDIAG_V1(a, b, c, d)                             \
    do {                                                          \
        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while (0)
80
81 #include "blake2b-load-avx2.h"
82
/* One full BLAKE2b round: column step (G1 + G2), diagonalize, diagonal
 * step (G1 + G2), un-diagonalize.  The BLAKE2B_LOAD_MSG_<r>_<i> macros
 * from blake2b-load-avx2.h gather the sigma-permuted message words for
 * round r into b0; presumably they reference the m0..m7/t0/t1 locals
 * declared by DECLARE_MESSAGE_WORDS, since the `m` parameter itself is
 * never expanded in this body -- TODO(review): confirm against
 * blake2b-load-avx2.h. */
#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) \
    do {                                   \
        __m256i b0;                        \
        BLAKE2B_LOAD_MSG_##r##_1(b0);      \
        BLAKE2B_G1_V1(a, b, c, d, b0);     \
        BLAKE2B_LOAD_MSG_##r##_2(b0);      \
        BLAKE2B_G2_V1(a, b, c, d, b0);     \
        BLAKE2B_DIAG_V1(a, b, c, d);       \
        BLAKE2B_LOAD_MSG_##r##_3(b0);      \
        BLAKE2B_G1_V1(a, b, c, d, b0);     \
        BLAKE2B_LOAD_MSG_##r##_4(b0);      \
        BLAKE2B_G2_V1(a, b, c, d, b0);     \
        BLAKE2B_UNDIAG_V1(a, b, c, d);     \
    } while (0)
97
/* The twelve rounds of the BLAKE2b compression function, unrolled so
 * each round gets its own compile-time message-load macros. */
#define BLAKE2B_ROUNDS_V1(a, b, c, d, m)       \
    do {                                       \
        BLAKE2B_ROUND_V1(a, b, c, d, 0, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 1, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 2, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 3, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 4, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 5, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 6, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 7, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 8, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 9, (m));  \
        BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
        BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
    } while (0)
113
/* Load the 128-byte message block at `m` with unaligned 16-byte reads
 * and broadcast each pair of 64-bit words into both halves of a 256-bit
 * register (m0..m7), plus two scratch registers t0/t1.  Deliberately NOT
 * wrapped in do/while: the declarations must land in the caller's scope,
 * presumably so the BLAKE2B_LOAD_MSG_* macros can reference them. */
#define DECLARE_MESSAGE_WORDS(m)                                       \
    const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
    const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
    const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
    const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
    const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
    const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
    const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
    const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
    __m256i t0, t1;
124
/* One BLAKE2b compression of the 128-byte block `m` into the state held
 * in a and b (the two 4-word halves of the hash state).  c and d are the
 * working rows, initialized from the BLAKE2b IV (blake2b_IV, defined
 * elsewhere in the project); d is additionally XORed with the byte
 * counter (t0, t1) and the finalization flags (f0, f1), packed into
 * lanes [0..3] by _mm256_set_epi64x (which takes its arguments
 * high-lane-first).  After the 12 rounds, the feed-forward XORs the
 * working rows and the saved input state (iv0/iv1) back into a and b.
 * NOTE(review): DECLARE_MESSAGE_WORDS declares __m256i locals named
 * t0/t1 ahead of the set_epi64x expression, so arguments literally
 * spelled `t0`/`t1` would be shadowed -- verify call sites pass
 * differently named expressions. */
#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1)                      \
    do {                                                                  \
        DECLARE_MESSAGE_WORDS(m)                                          \
        const __m256i iv0 = a;                                            \
        const __m256i iv1 = b;                                            \
        __m256i       c   = LOAD(&blake2b_IV[0]);                         \
        __m256i       d   =                                               \
            XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
        BLAKE2B_ROUNDS_V1(a, b, c, d, m);                                 \
        a = XOR(a, c);                                                    \
        b = XOR(b, d);                                                    \
        a = XOR(a, iv0);                                                  \
        b = XOR(b, iv1);                                                  \
    } while (0)
139
140 #endif
141