// chacha_avx.cpp - written and placed in the public domain by
//                  Jack Lloyd and Jeffrey Walton
//
//    This source file uses intrinsics and built-ins to gain access to
//    AVX2 instructions. A separate source file is needed because
//    additional CXXFLAGS are required to enable the appropriate
//    instruction sets in some build configurations.
//
//    AVX2 implementation based on Botan's chacha_avx.cpp. Many thanks
//    to Jack Lloyd and the Botan team for allowing us to use it.
//
//    Here are some relative numbers for ChaCha8:
//    * Intel Skylake,   3.0 GHz: AVX2 at 4411 MB/s; 0.57 cpb.
//    * Intel Broadwell, 2.3 GHz: AVX2 at 3828 MB/s; 0.58 cpb.
//    * AMD Bulldozer,   3.3 GHz: AVX2 at 1680 MB/s; 1.47 cpb.

#include "pch.h"
#include "config.h"

#include "chacha.h"
#include "misc.h"

#if defined(CRYPTOPP_AVX2_AVAILABLE)
# include <xmmintrin.h>
# include <emmintrin.h>
# include <immintrin.h>
#endif

// Squash MS LNK4221 and libtool warnings
extern const char CHACHA_AVX_FNAME[] = __FILE__;

// Sun Studio 12.4 is OK; 12.5 and 12.6 fail to compile.
#if (__SUNPRO_CC >= 0x5140) && (__SUNPRO_CC <= 0x5150)
# define MAYBE_CONST
#else
# define MAYBE_CONST const
#endif

// VS2017 and global optimization bug. TODO: figure out when
// we can re-enable full optimizations for VS2017. Also see
// https://github.com/weidai11/cryptopp/issues/649 and
// https://github.com/weidai11/cryptopp/issues/735. Issue 649
// affects AES but the same problem appears here. Issue 735 is
// the ChaCha AVX2 cut-in where it surfaced again.
#if (_MSC_VER >= 1910)
# ifndef CRYPTOPP_DEBUG
#  pragma optimize("", off)
#  pragma optimize("ts", on)
# endif
#endif

// The data is aligned, but Clang issues the warning based on the
// pointer type rather than the actual alignment of the variable and data.
#if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
# pragma GCC diagnostic ignored "-Wcast-align"
#endif

ANONYMOUS_NAMESPACE_BEGIN

#if (CRYPTOPP_AVX2_AVAILABLE)

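// Rotate each 32-bit lane of val left by R bits using two shifts and an OR.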
template <unsigned int R>
inline __m256i RotateLeft(const __m256i val)
{
    return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32-R));
}

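// Rotations by 8 and 16 bits are byte-aligned, so each 32-bit lane can be
// rotated with a single byte shuffle (vpshufb) instead of the
// shift/shift/or sequence above.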
template <>
inline __m256i RotateLeft<8>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3,
                                         14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3);
    return _mm256_shuffle_epi8(val, mask);
}

template <>
inline __m256i RotateLeft<16>(const __m256i val)
{
    const __m256i mask = _mm256_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2,
                                         13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2);
    return _mm256_shuffle_epi8(val, mask);
}

#endif  // CRYPTOPP_AVX2_AVAILABLE

ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

#if (CRYPTOPP_AVX2_AVAILABLE)

void ChaCha_OperateKeystream_AVX2(const word32 *state, const byte* input, byte *output, unsigned int rounds)
{
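    // Eight 64-byte blocks are produced per call. Each 128-bit row of the
    // ChaCha state is broadcast into both lanes of a 256-bit register, so
    // every __m256i holds the same row for two blocks whose counters differ
    // by 4 (upper lane: blocks 0..3, lower lane: blocks 4..7).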
    const __m256i state0 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+0*4)));
    const __m256i state1 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+1*4)));
    const __m256i state2 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+2*4)));
    const __m256i state3 = _mm256_broadcastsi128_si256(
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(state+3*4)));

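    // Per-block counter increments. C is the distance to the 32-bit
    // boundary, so (C < n) is 1 exactly when state[12]+n wraps past 2^32,
    // in which case a carry is added to the next state word.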
    const word32 C = 0xFFFFFFFFu - state[12];
    const __m256i CTR0 = _mm256_set_epi32(0, 0,     0, 0, 0, 0, C < 4, 4);
    const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5);
    const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6);
    const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7);

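    // Working state. Register set Xi covers the block pair with counter
    // offsets i and i+4; only row 3 differs between blocks.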
    __m256i X0_0 = state0;
    __m256i X0_1 = state1;
    __m256i X0_2 = state2;
    __m256i X0_3 = _mm256_add_epi32(state3, CTR0);

    __m256i X1_0 = state0;
    __m256i X1_1 = state1;
    __m256i X1_2 = state2;
    __m256i X1_3 = _mm256_add_epi32(state3, CTR1);

    __m256i X2_0 = state0;
    __m256i X2_1 = state1;
    __m256i X2_2 = state2;
    __m256i X2_3 = _mm256_add_epi32(state3, CTR2);

    __m256i X3_0 = state0;
    __m256i X3_1 = state1;
    __m256i X3_2 = state2;
    __m256i X3_3 = _mm256_add_epi32(state3, CTR3);

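    // Each loop iteration performs one ChaCha double round: a column round
    // followed by a diagonal round, so the counter steps by 2. With rows in
    // separate registers, one quarter-round sequence handles all four
    // columns (or diagonals) at once.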
    for (int i = static_cast<int>(rounds); i > 0; i -= 2)
    {
        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);

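        // Rotate rows 1, 2 and 3 by one, two and three elements so the next
        // quarter-round sequence operates on the diagonals.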
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3));

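        // Diagonal round, same quarter-round sequence as above.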
        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<16>(X0_3);
        X1_3 = RotateLeft<16>(X1_3);
        X2_3 = RotateLeft<16>(X2_3);
        X3_3 = RotateLeft<16>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<12>(X0_1);
        X1_1 = RotateLeft<12>(X1_1);
        X2_1 = RotateLeft<12>(X2_1);
        X3_1 = RotateLeft<12>(X3_1);

        X0_0 = _mm256_add_epi32(X0_0, X0_1);
        X1_0 = _mm256_add_epi32(X1_0, X1_1);
        X2_0 = _mm256_add_epi32(X2_0, X2_1);
        X3_0 = _mm256_add_epi32(X3_0, X3_1);

        X0_3 = _mm256_xor_si256(X0_3, X0_0);
        X1_3 = _mm256_xor_si256(X1_3, X1_0);
        X2_3 = _mm256_xor_si256(X2_3, X2_0);
        X3_3 = _mm256_xor_si256(X3_3, X3_0);

        X0_3 = RotateLeft<8>(X0_3);
        X1_3 = RotateLeft<8>(X1_3);
        X2_3 = RotateLeft<8>(X2_3);
        X3_3 = RotateLeft<8>(X3_3);

        X0_2 = _mm256_add_epi32(X0_2, X0_3);
        X1_2 = _mm256_add_epi32(X1_2, X1_3);
        X2_2 = _mm256_add_epi32(X2_2, X2_3);
        X3_2 = _mm256_add_epi32(X3_2, X3_3);

        X0_1 = _mm256_xor_si256(X0_1, X0_2);
        X1_1 = _mm256_xor_si256(X1_1, X1_2);
        X2_1 = _mm256_xor_si256(X2_1, X2_2);
        X3_1 = _mm256_xor_si256(X3_1, X3_2);

        X0_1 = RotateLeft<7>(X0_1);
        X1_1 = RotateLeft<7>(X1_1);
        X2_1 = RotateLeft<7>(X2_1);
        X3_1 = RotateLeft<7>(X3_1);

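        // Undo the diagonalization so the state is back in column order for
        // the next iteration.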
        X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3));
        X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2));
        X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1));

        X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3));
        X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2));
        X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1));

        X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3));
        X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2));
        X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1));

        X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3));
        X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2));
        X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1));
    }

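    // Feed-forward: add the original input state (and the per-block counter
    // increments) back into the permuted state.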
    X0_0 = _mm256_add_epi32(X0_0, state0);
    X0_1 = _mm256_add_epi32(X0_1, state1);
    X0_2 = _mm256_add_epi32(X0_2, state2);
    X0_3 = _mm256_add_epi32(X0_3, state3);
    X0_3 = _mm256_add_epi32(X0_3, CTR0);

    X1_0 = _mm256_add_epi32(X1_0, state0);
    X1_1 = _mm256_add_epi32(X1_1, state1);
    X1_2 = _mm256_add_epi32(X1_2, state2);
    X1_3 = _mm256_add_epi32(X1_3, state3);
    X1_3 = _mm256_add_epi32(X1_3, CTR1);

    X2_0 = _mm256_add_epi32(X2_0, state0);
    X2_1 = _mm256_add_epi32(X2_1, state1);
    X2_2 = _mm256_add_epi32(X2_2, state2);
    X2_3 = _mm256_add_epi32(X2_3, state3);
    X2_3 = _mm256_add_epi32(X2_3, CTR2);

    X3_0 = _mm256_add_epi32(X3_0, state0);
    X3_1 = _mm256_add_epi32(X3_1, state1);
    X3_2 = _mm256_add_epi32(X3_2, state2);
    X3_3 = _mm256_add_epi32(X3_3, state3);
    X3_3 = _mm256_add_epi32(X3_3, CTR3);

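    // Write the keystream. _mm256_permute2x128_si256 with control 0x31
    // (1 + (3 << 4)) pairs the upper 128-bit lanes of two registers, and
    // control 0x20 (0 + (2 << 4)) pairs the lower lanes, reassembling each
    // 64-byte block two rows at a time. When input is non-NULL the
    // keystream is XORed with the input; otherwise the raw keystream is
    // written.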
    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+0*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+1*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+2*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+3*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+0*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+1*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+2*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+3*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+4*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+5*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+6*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+7*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+4*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+5*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+6*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+7*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+8*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+9*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+10*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+11*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 8*32),
            _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+ 9*32),
            _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+10*32),
            _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+11*32),
            _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)));
    }

    if (input)
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+12*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+13*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+14*32)))));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)),
            _mm256_loadu_si256(const_cast<MAYBE_CONST __m256i*>(reinterpret_cast<const __m256i*>(input+15*32)))));
    }
    else
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+12*32),
            _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+13*32),
            _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+14*32),
            _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)));
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(output+15*32),
            _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)));
    }

    // https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
    _mm256_zeroupper();
}

#endif  // CRYPTOPP_AVX2_AVAILABLE

NAMESPACE_END