1 // chacha_avx.cpp - written and placed in the public domain by
2 // Jack Lloyd and Jeffrey Walton
3 //
4 // This source file uses intrinsics and built-ins to gain access to
5 // AVX2 instructions. A separate source file is needed because
6 // additional CXXFLAGS are required to enable the appropriate
// instruction sets in some build configurations.
8 //
9 // AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
10 // to Jack Lloyd and the Botan team for allowing us to use it.
11 //
12 // Here are some relative numbers for ChaCha8:
13 // * Intel Skylake, 3.0 GHz: AVX2 at 4411 MB/s; 0.57 cpb.
14 // * Intel Broadwell, 2.3 GHz: AVX2 at 3828 MB/s; 0.58 cpb.
15 // * AMD Bulldozer, 3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.
16
17 #include "pch.h"
18 #include "config.h"
19
20 #include "chacha.h"
21 #include "misc.h"
22
23 #if defined(CRYPTOPP_AVX2_AVAILABLE)
24 # include <xmmintrin.h>
25 # include <emmintrin.h>
26 # include <immintrin.h>
27 #endif
28
// Squash MS LNK4221 and libtool warnings by always defining one symbol,
// even when the AVX2 code below is compiled out.
30 extern const char CHACHA_AVX_FNAME[] = __FILE__;
31
32 // Sun Studio 12.4 OK, 12.5 and 12.6 compile error.
33 #if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
34 # define MAYBE_CONST
35 #else
36 # define MAYBE_CONST const
37 #endif
38
39 // VS2017 and global optimization bug. TODO, figure out when
40 // we can re-enable full optimizations for VS2017. Also see
41 // https://github.com/weidai11/cryptopp/issues/649 and
42 // https://github.com/weidai11/cryptopp/issues/735. The
43 // 649 issue affects AES but it is the same here. The 735
44 // issue is ChaCha AVX2 cut-in where it surfaced again.
45 #if (_MSC_VER >= 1910)
46 # ifndef CRYPTOPP_DEBUG
47 # pragma optimize("", off)
48 # pragma optimize("ts", on)
49 # endif
50 #endif
51
// The data is aligned, but Clang issues the warning based on the pointer
// type and not on the actual alignment of the variable and data.
54 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
55 # pragma GCC diagnostic ignored "-Wcast-align"
56 #endif
57
58 ANONYMOUS_NAMESPACE_BEGIN
59
60 #if (CRYPTOPP_AVX2_AVAILABLE)
61
// Rotates each 32-bit word of val left by R bits. This generic form
// combines a left shift by R with a right shift by 32-R; R must be a
// compile-time constant.
template <unsigned int R>
inline __m256i RotateLeft(const __m256i val)
{
    const __m256i upper = _mm256_slli_epi32(val, R);
    const __m256i lower = _mm256_srli_epi32(val, 32-R);
    return _mm256_or_si256(upper, lower);
}

// Rotation by 8 bits moves whole bytes, so a single byte shuffle is
// used instead of two shifts and an OR. The table lists, lowest byte
// first, the source byte for each destination byte of a 128-bit lane;
// the same table is repeated for the second lane.
template <>
inline __m256i RotateLeft<8>(const __m256i val)
{
    const __m256i byteOrder = _mm256_setr_epi8(
        3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14,
        3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14);
    return _mm256_shuffle_epi8(val, byteOrder);
}

// Rotation by 16 bits swaps the two 16-bit halves of every 32-bit word,
// again done with one byte shuffle. Table layout as for RotateLeft<8>.
template <>
inline __m256i RotateLeft<16>(const __m256i val)
{
    const __m256i byteOrder = _mm256_setr_epi8(
        2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13,
        2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13);
    return _mm256_shuffle_epi8(val, byteOrder);
}
83
84 #endif // CRYPTOPP_AVX2_AVAILABLE
85
86 ANONYMOUS_NAMESPACE_END
87
NAMESPACE_BEGIN(CryptoPP)88 NAMESPACE_BEGIN(CryptoPP)
89
90 #if (CRYPTOPP_AVX2_AVAILABLE)
91
92 void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
93 {
94 const __m256i state0 = _mm256_broadcastsi128_si256(
95 _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+0*4)));
96 const __m256i state1 = _mm256_broadcastsi128_si256(
97 _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+1*4)));
98 const __m256i state2 = _mm256_broadcastsi128_si256(
99 _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+2*4)));
100 const __m256i state3 = _mm256_broadcastsi128_si256(
101 _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));
102
103 const word32 C = 0xFFFFFFFFu - state[12];
104 const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4);
105 const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5);
106 const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6);
107 const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7);
108
109 __m256i X0_0 = state0;
110 __m256i X0_1 = state1;
111 __m256i X0_2 = state2;
112 __m256i X0_3 = _mm256_add_epi32(state3, CTR0);
113
114 __m256i X1_0 = state0;
115 __m256i X1_1 = state1;
116 __m256i X1_2 = state2;
117 __m256i X1_3 = _mm256_add_epi32(state3, CTR1);
118
119 __m256i X2_0 = state0;
120 __m256i X2_1 = state1;
121 __m256i X2_2 = state2;
122 __m256i X2_3 = _mm256_add_epi32(state3, CTR2);
123
124 __m256i X3_0 = state0;
125 __m256i X3_1 = state1;
126 __m256i X3_2 = state2;
127 __m256i X3_3 = _mm256_add_epi32(state3, CTR3);
128
129 for (int i = static_cast<int>(rounds); i > 0; i -= 2)
130 {
131 X0_0 = _mm256_add_epi32(X0_0, X0_1);
132 X1_0 = _mm256_add_epi32(X1_0, X1_1);
133 X2_0 = _mm256_add_epi32(X2_0, X2_1);
134 X3_0 = _mm256_add_epi32(X3_0, X3_1);
135
136 X0_3 = _mm256_xor_si256(X0_3, X0_0);
137 X1_3 = _mm256_xor_si256(X1_3, X1_0);
138 X2_3 = _mm256_xor_si256(X2_3, X2_0);
139 X3_3 = _mm256_xor_si256(X3_3, X3_0);
140
141 X0_3 = RotateLeft<16>(X0_3);
142 X1_3 = RotateLeft<16>(X1_3);
143 X2_3 = RotateLeft<16>(X2_3);
144 X3_3 = RotateLeft<16>(X3_3);
145
146 X0_2 = _mm256_add_epi32(X0_2, X0_3);
147 X1_2 = _mm256_add_epi32(X1_2, X1_3);
148 X2_2 = _mm256_add_epi32(X2_2, X2_3);
149 X3_2 = _mm256_add_epi32(X3_2, X3_3);
150
151 X0_1 = _mm256_xor_si256(X0_1, X0_2);
152 X1_1 = _mm256_xor_si256(X1_1, X1_2);
153 X2_1 = _mm256_xor_si256(X2_1, X2_2);
154 X3_1 = _mm256_xor_si256(X3_1, X3_2);
155
156 X0_1 = RotateLeft<12>(X0_1);
157 X1_1 = RotateLeft<12>(X1_1);
158 X2_1 = RotateLeft<12>(X2_1);
159 X3_1 = RotateLeft<12>(X3_1);
160
161 X0_0 = _mm256_add_epi32(X0_0, X0_1);
162 X1_0 = _mm256_add_epi32(X1_0, X1_1);
163 X2_0 = _mm256_add_epi32(X2_0, X2_1);
164 X3_0 = _mm256_add_epi32(X3_0, X3_1);
165
166 X0_3 = _mm256_xor_si256(X0_3, X0_0);
167 X1_3 = _mm256_xor_si256(X1_3, X1_0);
168 X2_3 = _mm256_xor_si256(X2_3, X2_0);
169 X3_3 = _mm256_xor_si256(X3_3, X3_0);
170
171 X0_3 = RotateLeft<8>(X0_3);
172 X1_3 = RotateLeft<8>(X1_3);
173 X2_3 = RotateLeft<8>(X2_3);
174 X3_3 = RotateLeft<8>(X3_3);
175
176 X0_2 = _mm256_add_epi32(X0_2, X0_3);
177 X1_2 = _mm256_add_epi32(X1_2, X1_3);
178 X2_2 = _mm256_add_epi32(X2_2, X2_3);
179 X3_2 = _mm256_add_epi32(X3_2, X3_3);
180
181 X0_1 = _mm256_xor_si256(X0_1, X0_2);
182 X1_1 = _mm256_xor_si256(X1_1, X1_2);
183 X2_1 = _mm256_xor_si256(X2_1, X2_2);
184 X3_1 = _mm256_xor_si256(X3_1, X3_2);
185
186 X0_1 = RotateLeft<7>(X0_1);
187 X1_1 = RotateLeft<7>(X1_1);
188 X2_1 = RotateLeft<7>(X2_1);
189 X3_1 = RotateLeft<7>(X3_1);
190
191 X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
192 X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
193 X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));
194
195 X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
196 X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
197 X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));
198
199 X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
200 X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
201 X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));
202
203 X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
204 X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
205 X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));
206
207 X0_0 = _mm256_add_epi32(X0_0, X0_1);
208 X1_0 = _mm256_add_epi32(X1_0, X1_1);
209 X2_0 = _mm256_add_epi32(X2_0, X2_1);
210 X3_0 = _mm256_add_epi32(X3_0, X3_1);
211
212 X0_3 = _mm256_xor_si256(X0_3, X0_0);
213 X1_3 = _mm256_xor_si256(X1_3, X1_0);
214 X2_3 = _mm256_xor_si256(X2_3, X2_0);
215 X3_3 = _mm256_xor_si256(X3_3, X3_0);
216
217 X0_3 = RotateLeft<16>(X0_3);
218 X1_3 = RotateLeft<16>(X1_3);
219 X2_3 = RotateLeft<16>(X2_3);
220 X3_3 = RotateLeft<16>(X3_3);
221
222 X0_2 = _mm256_add_epi32(X0_2, X0_3);
223 X1_2 = _mm256_add_epi32(X1_2, X1_3);
224 X2_2 = _mm256_add_epi32(X2_2, X2_3);
225 X3_2 = _mm256_add_epi32(X3_2, X3_3);
226
227 X0_1 = _mm256_xor_si256(X0_1, X0_2);
228 X1_1 = _mm256_xor_si256(X1_1, X1_2);
229 X2_1 = _mm256_xor_si256(X2_1, X2_2);
230 X3_1 = _mm256_xor_si256(X3_1, X3_2);
231
232 X0_1 = RotateLeft<12>(X0_1);
233 X1_1 = RotateLeft<12>(X1_1);
234 X2_1 = RotateLeft<12>(X2_1);
235 X3_1 = RotateLeft<12>(X3_1);
236
237 X0_0 = _mm256_add_epi32(X0_0, X0_1);
238 X1_0 = _mm256_add_epi32(X1_0, X1_1);
239 X2_0 = _mm256_add_epi32(X2_0, X2_1);
240 X3_0 = _mm256_add_epi32(X3_0, X3_1);
241
242 X0_3 = _mm256_xor_si256(X0_3, X0_0);
243 X1_3 = _mm256_xor_si256(X1_3, X1_0);
244 X2_3 = _mm256_xor_si256(X2_3, X2_0);
245 X3_3 = _mm256_xor_si256(X3_3, X3_0);
246
247 X0_3 = RotateLeft<8>(X0_3);
248 X1_3 = RotateLeft<8>(X1_3);
249 X2_3 = RotateLeft<8>(X2_3);
250 X3_3 = RotateLeft<8>(X3_3);
251
252 X0_2 = _mm256_add_epi32(X0_2, X0_3);
253 X1_2 = _mm256_add_epi32(X1_2, X1_3);
254 X2_2 = _mm256_add_epi32(X2_2, X2_3);
255 X3_2 = _mm256_add_epi32(X3_2, X3_3);
256
257 X0_1 = _mm256_xor_si256(X0_1, X0_2);
258 X1_1 = _mm256_xor_si256(X1_1, X1_2);
259 X2_1 = _mm256_xor_si256(X2_1, X2_2);
260 X3_1 = _mm256_xor_si256(X3_1, X3_2);
261
262 X0_1 = RotateLeft<7>(X0_1);
263 X1_1 = RotateLeft<7>(X1_1);
264 X2_1 = RotateLeft<7>(X2_1);
265 X3_1 = RotateLeft<7>(X3_1);
266
267 X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
268 X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
269 X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));
270
271 X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
272 X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
273 X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));
274
275 X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
276 X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
277 X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));
278
279 X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
280 X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
281 X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
282 }
283
284 X0_0 = _mm256_add_epi32(X0_0, state0);
285 X0_1 = _mm256_add_epi32(X0_1, state1);
286 X0_2 = _mm256_add_epi32(X0_2, state2);
287 X0_3 = _mm256_add_epi32(X0_3, state3);
288 X0_3 = _mm256_add_epi32(X0_3, CTR0);
289
290 X1_0 = _mm256_add_epi32(X1_0, state0);
291 X1_1 = _mm256_add_epi32(X1_1, state1);
292 X1_2 = _mm256_add_epi32(X1_2, state2);
293 X1_3 = _mm256_add_epi32(X1_3, state3);
294 X1_3 = _mm256_add_epi32(X1_3, CTR1);
295
296 X2_0 = _mm256_add_epi32(X2_0, state0);
297 X2_1 = _mm256_add_epi32(X2_1, state1);
298 X2_2 = _mm256_add_epi32(X2_2, state2);
299 X2_3 = _mm256_add_epi32(X2_3, state3);
300 X2_3 = _mm256_add_epi32(X2_3, CTR2);
301
302 X3_0 = _mm256_add_epi32(X3_0, state0);
303 X3_1 = _mm256_add_epi32(X3_1, state1);
304 X3_2 = _mm256_add_epi32(X3_2, state2);
305 X3_3 = _mm256_add_epi32(X3_3, state3);
306 X3_3 = _mm256_add_epi32(X3_3, CTR3);
307
308 if (input)
309 {
310 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
311 _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
312 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+0*32)))));
313 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
314 _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
315 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+1*32)))));
316 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
317 _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
318 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+2*32)))));
319 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
320 _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
321 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+3*32)))));
322 }
323 else
324 {
325 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
326 _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
327 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
328 _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
329 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
330 _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
331 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
332 _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
333 }
334
335 if (input)
336 {
337 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
338 _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
339 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+4*32)))));
340 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
341 _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
342 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+5*32)))));
343 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
344 _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
345 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+6*32)))));
346 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
347 _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
348 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+7*32)))));
349 }
350 else
351 {
352 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
353 _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
354 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
355 _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
356 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
357 _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
358 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
359 _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
360 }
361
362 if (input)
363 {
364 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
365 _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
366 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+8*32)))));
367 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
368 _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
369 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+9*32)))));
370 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
371 _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
372 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+10*32)))));
373 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
374 _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
375 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+11*32)))));
376 }
377 else
378 {
379 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
380 _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
381 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
382 _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
383 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
384 _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
385 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
386 _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
387 }
388
389 if (input)
390 {
391 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
392 _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
393 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+12*32)))));
394 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
395 _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
396 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+13*32)))));
397 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
398 _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
399 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+14*32)))));
400 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
401 _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
402 _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+15*32)))));
403 }
404 else
405 {
406 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
407 _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
408 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
409 _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
410 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
411 _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
412 _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
413 _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
414 }
415
416 // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
417 _mm256_zeroupper();
418 }
419
420 #endif // CRYPTOPP_AVX2_AVAILABLE
421
422 NAMESPACE_END
423