1 /* $NetBSD: chacha_sse2.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include <sys/types.h>
30 #include <sys/endian.h>
31
32 #include "immintrin.h"
33
34 #include "chacha_sse2.h"
35
/*
 * rol32(x, n)
 *
 *	Rotate each of the four 32-bit lanes of x left by n bits, by
 *	combining a left shift by n with a right shift by 32 - n.
 */
static inline __m128i
rol32(__m128i x, uint8_t n)
{
	const __m128i shifted_up = _mm_slli_epi32(x, n);
	const __m128i wrapped = _mm_srli_epi32(x, 32 - n);

	return shifted_up | wrapped;
}
42
43 static inline void
chacha_permute(__m128i * p0,__m128i * p1,__m128i * p2,__m128i * p3,unsigned nr)44 chacha_permute(__m128i *p0, __m128i *p1, __m128i *p2, __m128i *p3,
45 unsigned nr)
46 {
47 __m128i r0, r1, r2, r3;
48 __m128i c0, c1, c2, c3;
49
50 r0 = *p0;
51 r1 = *p1;
52 r2 = *p2;
53 r3 = *p3;
54
55 for (; nr > 0; nr -= 2) {
56 r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 16);
57 r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 12);
58 r0 = _mm_add_epi32(r0, r1); r3 ^= r0; r3 = rol32(r3, 8);
59 r2 = _mm_add_epi32(r2, r3); r1 ^= r2; r1 = rol32(r1, 7);
60
61 c0 = r0;
62 c1 = _mm_shuffle_epi32(r1, 0x39);
63 c2 = _mm_shuffle_epi32(r2, 0x4e);
64 c3 = _mm_shuffle_epi32(r3, 0x93);
65
66 c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 16);
67 c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 12);
68 c0 = _mm_add_epi32(c0, c1); c3 ^= c0; c3 = rol32(c3, 8);
69 c2 = _mm_add_epi32(c2, c3); c1 ^= c2; c1 = rol32(c1, 7);
70
71 r0 = c0;
72 r1 = _mm_shuffle_epi32(c1, 0x93);
73 r2 = _mm_shuffle_epi32(c2, 0x4e);
74 r3 = _mm_shuffle_epi32(c3, 0x39);
75 }
76
77 *p0 = r0;
78 *p1 = r1;
79 *p2 = r2;
80 *p3 = r3;
81 }
82
/*
 * chacha_core_sse2(out, in, k, c, nr)
 *
 *	Compute one 64-byte ChaCha block into out.  The initial state
 *	rows are, in order: the 16-byte constant c, the two 16-byte
 *	halves of the 32-byte key k, and the 16-byte input in.  The
 *	state is permuted with nr rounds and the initial state is then
 *	added back word by word.
 */
void
chacha_core_sse2(uint8_t out[restrict static 64],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	const __m128i *key = (const __m128i *)k;
	__m128i *dst = (__m128i *)out;
	__m128i in0, in1, in2, in3;
	__m128i r0, r1, r2, r3;

	/* Load the initial state, keeping a copy for the feed-forward. */
	in0 = _mm_loadu_si128((const __m128i *)c);
	in1 = _mm_loadu_si128(key);
	in2 = _mm_loadu_si128(key + 1);
	in3 = _mm_loadu_si128((const __m128i *)in);
	r0 = in0;
	r1 = in1;
	r2 = in2;
	r3 = in3;

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	/* out = permuted state + initial state */
	_mm_storeu_si128(dst + 0, _mm_add_epi32(r0, in0));
	_mm_storeu_si128(dst + 1, _mm_add_epi32(r1, in1));
	_mm_storeu_si128(dst + 2, _mm_add_epi32(r2, in2));
	_mm_storeu_si128(dst + 3, _mm_add_epi32(r3, in3));
}
105
/*
 * hchacha_sse2(out, in, k, c, nr)
 *
 *	Build the same initial state as chacha_core_sse2 (constant c,
 *	32-byte key k, 16-byte input in), permute it with nr rounds, and
 *	write the first and last rows of the permuted state -- 32 bytes
 *	in total -- to out.  Unlike chacha_core_sse2, the initial state
 *	is not added back.
 */
void
hchacha_sse2(uint8_t out[restrict static 32],
    const uint8_t in[static 16],
    const uint8_t k[static 32],
    const uint8_t c[static 16],
    unsigned nr)
{
	const __m128i *key = (const __m128i *)k;
	__m128i *dst = (__m128i *)out;
	__m128i r0 = _mm_loadu_si128((const __m128i *)c);
	__m128i r1 = _mm_loadu_si128(key);
	__m128i r2 = _mm_loadu_si128(key + 1);
	__m128i r3 = _mm_loadu_si128((const __m128i *)in);

	chacha_permute(&r0, &r1, &r2, &r3, nr);

	/* Only rows 0 and 3 are output; rows 1 and 2 are discarded. */
	_mm_storeu_si128(dst + 0, r0);
	_mm_storeu_si128(dst + 1, r3);
}
125
/*
 * CHACHA_QUARTERROUND(a, b, c, d)
 *
 *	ChaCha quarter-round on four __m128i lvalues, applied to all
 *	four 32-bit lanes at once: add, xor, rotate by 16, 12, 8, 7.
 *
 *	Each argument is expanded several times, so arguments must be
 *	side-effect-free lvalues.  Every use is parenthesized so that
 *	the expansion does not depend on the form of the argument.
 */
#define	CHACHA_QUARTERROUND(a, b, c, d) do \
{ \
	(a) = _mm_add_epi32((a), (b)); (d) ^= (a); (d) = rol32((d), 16); \
	(c) = _mm_add_epi32((c), (d)); (b) ^= (c); (b) = rol32((b), 12); \
	(a) = _mm_add_epi32((a), (b)); (d) ^= (a); (d) = rol32((d), 8); \
	(c) = _mm_add_epi32((c), (d)); (b) ^= (c); (b) = rol32((b), 7); \
} while (/*CONSTCOND*/0)
133
/*
 * load1_epi32(p)
 *
 *	Load the 32-bit word at p and replicate it into all four lanes
 *	of the result.  The load goes through the single-precision
 *	broadcast intrinsic and the result is reinterpreted as an
 *	integer vector.
 */
static inline __m128i
load1_epi32(const void *p)
{
	const __m128 splat = _mm_load1_ps(p);

	return (__m128i)splat;
}
139
/*
 * loadu_epi32(p)
 *
 *	Load 16 bytes from p, which need not be aligned, as a vector.
 */
static inline __m128i
loadu_epi32(const void *p)
{
	const __m128i v = _mm_loadu_si128(p);

	return v;
}
145
/*
 * storeu_epi32(p, v)
 *
 *	Store the 16 bytes of v at p, which need not be aligned.
 */
static inline void
storeu_epi32(void *p, __m128i v)
{

	/*
	 * No `return' here: ISO C forbids a return statement with an
	 * expression in a function returning void, even when the
	 * expression itself has type void.
	 */
	_mm_storeu_si128(p, v);
}
151
/*
 * unpack0_epi32(a, b, c, d)
 *
 *	Return (a[0], b[0], c[0], d[0]): lane 0 of each argument, i.e.
 *	row 0 of the 4x4 transpose of (a, b, c, d).
 */
static inline __m128i
unpack0_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	/* ab = (a[0], b[0], a[1], b[1]); cd = (c[0], d[0], c[1], d[1]) */
	const __m128 ab = (__m128)_mm_unpacklo_epi32(a, b);
	const __m128 cd = (__m128)_mm_unpacklo_epi32(c, d);

	/* Low half of ab followed by low half of cd. */
	return (__m128i)_mm_movelh_ps(ab, cd);
}
161
/*
 * unpack1_epi32(a, b, c, d)
 *
 *	Return (a[1], b[1], c[1], d[1]): lane 1 of each argument, i.e.
 *	row 1 of the 4x4 transpose of (a, b, c, d).
 */
static inline __m128i
unpack1_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	/* ab = (a[0], b[0], a[1], b[1]); cd = (c[0], d[0], c[1], d[1]) */
	const __m128 ab = (__m128)_mm_unpacklo_epi32(a, b);
	const __m128 cd = (__m128)_mm_unpacklo_epi32(c, d);

	/* High half of ab followed by high half of cd. */
	return (__m128i)_mm_movehl_ps(cd, ab);
}
171
/*
 * unpack2_epi32(a, b, c, d)
 *
 *	Return (a[2], b[2], c[2], d[2]): lane 2 of each argument, i.e.
 *	row 2 of the 4x4 transpose of (a, b, c, d).
 */
static inline __m128i
unpack2_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	/* ab = (a[2], b[2], a[3], b[3]); cd = (c[2], d[2], c[3], d[3]) */
	const __m128 ab = (__m128)_mm_unpackhi_epi32(a, b);
	const __m128 cd = (__m128)_mm_unpackhi_epi32(c, d);

	/* Low half of ab followed by low half of cd. */
	return (__m128i)_mm_movelh_ps(ab, cd);
}
181
/*
 * unpack3_epi32(a, b, c, d)
 *
 *	Return (a[3], b[3], c[3], d[3]): lane 3 of each argument, i.e.
 *	row 3 of the 4x4 transpose of (a, b, c, d).
 */
static inline __m128i
unpack3_epi32(__m128i a, __m128i b, __m128i c, __m128i d)
{
	/* ab = (a[2], b[2], a[3], b[3]); cd = (c[2], d[2], c[3], d[3]) */
	const __m128 ab = (__m128)_mm_unpackhi_epi32(a, b);
	const __m128 cd = (__m128)_mm_unpackhi_epi32(c, d);

	/* High half of ab followed by high half of cd. */
	return (__m128i)_mm_movehl_ps(cd, ab);
}
191
192 void
chacha_stream_sse2(uint8_t * restrict s,size_t n,uint32_t blkno,const uint8_t nonce[static12],const uint8_t k[static32],unsigned nr)193 chacha_stream_sse2(uint8_t *restrict s, size_t n,
194 uint32_t blkno,
195 const uint8_t nonce[static 12],
196 const uint8_t k[static 32],
197 unsigned nr)
198 {
199 __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
200 __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
201 __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
202 unsigned r;
203
204 if (n < 256)
205 goto out;
206
207 x0 = load1_epi32(chacha_const32 + 0);
208 x1 = load1_epi32(chacha_const32 + 4);
209 x2 = load1_epi32(chacha_const32 + 8);
210 x3 = load1_epi32(chacha_const32 + 12);
211 x4 = load1_epi32(k + 0);
212 x5 = load1_epi32(k + 4);
213 x6 = load1_epi32(k + 8);
214 x7 = load1_epi32(k + 12);
215 x8 = load1_epi32(k + 16);
216 x9 = load1_epi32(k + 20);
217 x10 = load1_epi32(k + 24);
218 x11 = load1_epi32(k + 28);
219 /* x12 set in the loop */
220 x13 = load1_epi32(nonce + 0);
221 x14 = load1_epi32(nonce + 4);
222 x15 = load1_epi32(nonce + 8);
223
224 for (; n >= 256; s += 256, n -= 256, blkno += 4) {
225 x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
226 _mm_set_epi32(3,2,1,0));
227 y0 = x0;
228 y1 = x1;
229 y2 = x2;
230 y3 = x3;
231 y4 = x4;
232 y5 = x5;
233 y6 = x6;
234 y7 = x7;
235 y8 = x8;
236 y9 = x9;
237 y10 = x10;
238 y11 = x11;
239 y12 = x12;
240 y13 = x13;
241 y14 = x14;
242 y15 = x15;
243 for (r = nr; r > 0; r -= 2) {
244 CHACHA_QUARTERROUND( y0, y4, y8,y12);
245 CHACHA_QUARTERROUND( y1, y5, y9,y13);
246 CHACHA_QUARTERROUND( y2, y6,y10,y14);
247 CHACHA_QUARTERROUND( y3, y7,y11,y15);
248 CHACHA_QUARTERROUND( y0, y5,y10,y15);
249 CHACHA_QUARTERROUND( y1, y6,y11,y12);
250 CHACHA_QUARTERROUND( y2, y7, y8,y13);
251 CHACHA_QUARTERROUND( y3, y4, y9,y14);
252 }
253 y0 = _mm_add_epi32(y0, x0);
254 y1 = _mm_add_epi32(y1, x1);
255 y2 = _mm_add_epi32(y2, x2);
256 y3 = _mm_add_epi32(y3, x3);
257 y4 = _mm_add_epi32(y4, x4);
258 y5 = _mm_add_epi32(y5, x5);
259 y6 = _mm_add_epi32(y6, x6);
260 y7 = _mm_add_epi32(y7, x7);
261 y8 = _mm_add_epi32(y8, x8);
262 y9 = _mm_add_epi32(y9, x9);
263 y10 = _mm_add_epi32(y10, x10);
264 y11 = _mm_add_epi32(y11, x11);
265 y12 = _mm_add_epi32(y12, x12);
266 y13 = _mm_add_epi32(y13, x13);
267 y14 = _mm_add_epi32(y14, x14);
268 y15 = _mm_add_epi32(y15, x15);
269
270 z0 = unpack0_epi32(y0, y1, y2, y3);
271 z1 = unpack0_epi32(y4, y5, y6, y7);
272 z2 = unpack0_epi32(y8, y9, y10, y11);
273 z3 = unpack0_epi32(y12, y13, y14, y15);
274 z4 = unpack1_epi32(y0, y1, y2, y3);
275 z5 = unpack1_epi32(y4, y5, y6, y7);
276 z6 = unpack1_epi32(y8, y9, y10, y11);
277 z7 = unpack1_epi32(y12, y13, y14, y15);
278 z8 = unpack2_epi32(y0, y1, y2, y3);
279 z9 = unpack2_epi32(y4, y5, y6, y7);
280 z10 = unpack2_epi32(y8, y9, y10, y11);
281 z11 = unpack2_epi32(y12, y13, y14, y15);
282 z12 = unpack3_epi32(y0, y1, y2, y3);
283 z13 = unpack3_epi32(y4, y5, y6, y7);
284 z14 = unpack3_epi32(y8, y9, y10, y11);
285 z15 = unpack3_epi32(y12, y13, y14, y15);
286
287 storeu_epi32(s + 16*0, z0);
288 storeu_epi32(s + 16*1, z1);
289 storeu_epi32(s + 16*2, z2);
290 storeu_epi32(s + 16*3, z3);
291 storeu_epi32(s + 16*4, z4);
292 storeu_epi32(s + 16*5, z5);
293 storeu_epi32(s + 16*6, z6);
294 storeu_epi32(s + 16*7, z7);
295 storeu_epi32(s + 16*8, z8);
296 storeu_epi32(s + 16*9, z9);
297 storeu_epi32(s + 16*10, z10);
298 storeu_epi32(s + 16*11, z11);
299 storeu_epi32(s + 16*12, z12);
300 storeu_epi32(s + 16*13, z13);
301 storeu_epi32(s + 16*14, z14);
302 storeu_epi32(s + 16*15, z15);
303 }
304
305 out: if (n) {
306 const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
307 __m128i in0, in1, in2, in3;
308 __m128i r0, r1, r2, r3;
309
310 in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
311 in1 = _mm_loadu_si128((const __m128i *)k);
312 in2 = _mm_loadu_si128((const __m128i *)k + 1);
313 in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
314 le32dec(nonce), blkno);
315
316 for (; n; s += 64, n -= 64) {
317 r0 = in0;
318 r1 = in1;
319 r2 = in2;
320 r3 = in3;
321 chacha_permute(&r0, &r1, &r2, &r3, nr);
322 r0 = _mm_add_epi32(r0, in0);
323 r1 = _mm_add_epi32(r1, in1);
324 r2 = _mm_add_epi32(r2, in2);
325 r3 = _mm_add_epi32(r3, in3);
326
327 if (n < 64) {
328 uint8_t buf[64] __aligned(16);
329
330 _mm_storeu_si128((__m128i *)buf + 0, r0);
331 _mm_storeu_si128((__m128i *)buf + 1, r1);
332 _mm_storeu_si128((__m128i *)buf + 2, r2);
333 _mm_storeu_si128((__m128i *)buf + 3, r3);
334 memcpy(s, buf, n);
335
336 break;
337 }
338
339 _mm_storeu_si128((__m128i *)s + 0, r0);
340 _mm_storeu_si128((__m128i *)s + 1, r1);
341 _mm_storeu_si128((__m128i *)s + 2, r2);
342 _mm_storeu_si128((__m128i *)s + 3, r3);
343 in3 = _mm_add_epi32(in3, blkno_inc);
344 }
345 }
346 }
347
348 void
chacha_stream_xor_sse2(uint8_t * s,const uint8_t * p,size_t n,uint32_t blkno,const uint8_t nonce[static12],const uint8_t k[static32],unsigned nr)349 chacha_stream_xor_sse2(uint8_t *s, const uint8_t *p, size_t n,
350 uint32_t blkno,
351 const uint8_t nonce[static 12],
352 const uint8_t k[static 32],
353 unsigned nr)
354 {
355 __m128i x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
356 __m128i y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15;
357 __m128i z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15;
358 unsigned r;
359
360 if (n < 256)
361 goto out;
362
363 x0 = load1_epi32(chacha_const32 + 0);
364 x1 = load1_epi32(chacha_const32 + 4);
365 x2 = load1_epi32(chacha_const32 + 8);
366 x3 = load1_epi32(chacha_const32 + 12);
367 x4 = load1_epi32(k + 0);
368 x5 = load1_epi32(k + 4);
369 x6 = load1_epi32(k + 8);
370 x7 = load1_epi32(k + 12);
371 x8 = load1_epi32(k + 16);
372 x9 = load1_epi32(k + 20);
373 x10 = load1_epi32(k + 24);
374 x11 = load1_epi32(k + 28);
375 /* x12 set in the loop */
376 x13 = load1_epi32(nonce + 0);
377 x14 = load1_epi32(nonce + 4);
378 x15 = load1_epi32(nonce + 8);
379
380 for (; n >= 256; s += 256, p += 256, n -= 256, blkno += 4) {
381 x12 = _mm_add_epi32(_mm_set1_epi32(blkno),
382 _mm_set_epi32(3,2,1,0));
383 y0 = x0;
384 y1 = x1;
385 y2 = x2;
386 y3 = x3;
387 y4 = x4;
388 y5 = x5;
389 y6 = x6;
390 y7 = x7;
391 y8 = x8;
392 y9 = x9;
393 y10 = x10;
394 y11 = x11;
395 y12 = x12;
396 y13 = x13;
397 y14 = x14;
398 y15 = x15;
399 for (r = nr; r > 0; r -= 2) {
400 CHACHA_QUARTERROUND( y0, y4, y8,y12);
401 CHACHA_QUARTERROUND( y1, y5, y9,y13);
402 CHACHA_QUARTERROUND( y2, y6,y10,y14);
403 CHACHA_QUARTERROUND( y3, y7,y11,y15);
404 CHACHA_QUARTERROUND( y0, y5,y10,y15);
405 CHACHA_QUARTERROUND( y1, y6,y11,y12);
406 CHACHA_QUARTERROUND( y2, y7, y8,y13);
407 CHACHA_QUARTERROUND( y3, y4, y9,y14);
408 }
409 y0 = _mm_add_epi32(y0, x0);
410 y1 = _mm_add_epi32(y1, x1);
411 y2 = _mm_add_epi32(y2, x2);
412 y3 = _mm_add_epi32(y3, x3);
413 y4 = _mm_add_epi32(y4, x4);
414 y5 = _mm_add_epi32(y5, x5);
415 y6 = _mm_add_epi32(y6, x6);
416 y7 = _mm_add_epi32(y7, x7);
417 y8 = _mm_add_epi32(y8, x8);
418 y9 = _mm_add_epi32(y9, x9);
419 y10 = _mm_add_epi32(y10, x10);
420 y11 = _mm_add_epi32(y11, x11);
421 y12 = _mm_add_epi32(y12, x12);
422 y13 = _mm_add_epi32(y13, x13);
423 y14 = _mm_add_epi32(y14, x14);
424 y15 = _mm_add_epi32(y15, x15);
425
426 z0 = unpack0_epi32(y0, y1, y2, y3);
427 z1 = unpack0_epi32(y4, y5, y6, y7);
428 z2 = unpack0_epi32(y8, y9, y10, y11);
429 z3 = unpack0_epi32(y12, y13, y14, y15);
430 z4 = unpack1_epi32(y0, y1, y2, y3);
431 z5 = unpack1_epi32(y4, y5, y6, y7);
432 z6 = unpack1_epi32(y8, y9, y10, y11);
433 z7 = unpack1_epi32(y12, y13, y14, y15);
434 z8 = unpack2_epi32(y0, y1, y2, y3);
435 z9 = unpack2_epi32(y4, y5, y6, y7);
436 z10 = unpack2_epi32(y8, y9, y10, y11);
437 z11 = unpack2_epi32(y12, y13, y14, y15);
438 z12 = unpack3_epi32(y0, y1, y2, y3);
439 z13 = unpack3_epi32(y4, y5, y6, y7);
440 z14 = unpack3_epi32(y8, y9, y10, y11);
441 z15 = unpack3_epi32(y12, y13, y14, y15);
442
443 storeu_epi32(s + 16*0, loadu_epi32(p + 16*0) ^ z0);
444 storeu_epi32(s + 16*1, loadu_epi32(p + 16*1) ^ z1);
445 storeu_epi32(s + 16*2, loadu_epi32(p + 16*2) ^ z2);
446 storeu_epi32(s + 16*3, loadu_epi32(p + 16*3) ^ z3);
447 storeu_epi32(s + 16*4, loadu_epi32(p + 16*4) ^ z4);
448 storeu_epi32(s + 16*5, loadu_epi32(p + 16*5) ^ z5);
449 storeu_epi32(s + 16*6, loadu_epi32(p + 16*6) ^ z6);
450 storeu_epi32(s + 16*7, loadu_epi32(p + 16*7) ^ z7);
451 storeu_epi32(s + 16*8, loadu_epi32(p + 16*8) ^ z8);
452 storeu_epi32(s + 16*9, loadu_epi32(p + 16*9) ^ z9);
453 storeu_epi32(s + 16*10, loadu_epi32(p + 16*10) ^ z10);
454 storeu_epi32(s + 16*11, loadu_epi32(p + 16*11) ^ z11);
455 storeu_epi32(s + 16*12, loadu_epi32(p + 16*12) ^ z12);
456 storeu_epi32(s + 16*13, loadu_epi32(p + 16*13) ^ z13);
457 storeu_epi32(s + 16*14, loadu_epi32(p + 16*14) ^ z14);
458 storeu_epi32(s + 16*15, loadu_epi32(p + 16*15) ^ z15);
459 }
460
461 out: if (n) {
462 const __m128i blkno_inc = _mm_set_epi32(0,0,0,1);
463 __m128i in0, in1, in2, in3;
464 __m128i r0, r1, r2, r3;
465
466 in0 = _mm_loadu_si128((const __m128i *)chacha_const32);
467 in1 = _mm_loadu_si128((const __m128i *)k);
468 in2 = _mm_loadu_si128((const __m128i *)k + 1);
469 in3 = _mm_set_epi32(le32dec(nonce + 8), le32dec(nonce + 4),
470 le32dec(nonce), blkno);
471
472 for (; n; s += 64, p += 64, n -= 64) {
473 r0 = in0;
474 r1 = in1;
475 r2 = in2;
476 r3 = in3;
477 chacha_permute(&r0, &r1, &r2, &r3, nr);
478 r0 = _mm_add_epi32(r0, in0);
479 r1 = _mm_add_epi32(r1, in1);
480 r2 = _mm_add_epi32(r2, in2);
481 r3 = _mm_add_epi32(r3, in3);
482
483 if (n < 64) {
484 uint8_t buf[64] __aligned(16);
485 unsigned i;
486
487 _mm_storeu_si128((__m128i *)buf + 0, r0);
488 _mm_storeu_si128((__m128i *)buf + 1, r1);
489 _mm_storeu_si128((__m128i *)buf + 2, r2);
490 _mm_storeu_si128((__m128i *)buf + 3, r3);
491
492 for (i = 0; i < n - n%4; i += 4)
493 le32enc(s + i,
494 le32dec(p + i) ^ le32dec(buf + i));
495 for (; i < n; i++)
496 s[i] = p[i] ^ buf[i];
497
498 break;
499 }
500
501 r0 ^= _mm_loadu_si128((const __m128i *)p + 0);
502 r1 ^= _mm_loadu_si128((const __m128i *)p + 1);
503 r2 ^= _mm_loadu_si128((const __m128i *)p + 2);
504 r3 ^= _mm_loadu_si128((const __m128i *)p + 3);
505 _mm_storeu_si128((__m128i *)s + 0, r0);
506 _mm_storeu_si128((__m128i *)s + 1, r1);
507 _mm_storeu_si128((__m128i *)s + 2, r2);
508 _mm_storeu_si128((__m128i *)s + 3, r3);
509 in3 = _mm_add_epi32(in3, blkno_inc);
510 }
511 }
512 }
513
514 void
xchacha_stream_sse2(uint8_t * restrict s,size_t nbytes,uint32_t blkno,const uint8_t nonce[static24],const uint8_t k[static32],unsigned nr)515 xchacha_stream_sse2(uint8_t *restrict s, size_t nbytes,
516 uint32_t blkno,
517 const uint8_t nonce[static 24],
518 const uint8_t k[static 32],
519 unsigned nr)
520 {
521 uint8_t subkey[32];
522 uint8_t subnonce[12];
523
524 hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
525 memset(subnonce, 0, 4);
526 memcpy(subnonce + 4, nonce + 16, 8);
527 chacha_stream_sse2(s, nbytes, blkno, subnonce, subkey, nr);
528 }
529
530 void
xchacha_stream_xor_sse2(uint8_t * restrict c,const uint8_t * p,size_t nbytes,uint32_t blkno,const uint8_t nonce[static24],const uint8_t k[static32],unsigned nr)531 xchacha_stream_xor_sse2(uint8_t *restrict c, const uint8_t *p, size_t nbytes,
532 uint32_t blkno,
533 const uint8_t nonce[static 24],
534 const uint8_t k[static 32],
535 unsigned nr)
536 {
537 uint8_t subkey[32];
538 uint8_t subnonce[12];
539
540 hchacha_sse2(subkey, nonce/*[0:16)*/, k, chacha_const32, nr);
541 memset(subnonce, 0, 4);
542 memcpy(subnonce + 4, nonce + 16, 8);
543 chacha_stream_xor_sse2(c, p, nbytes, blkno, subnonce, subkey, nr);
544 }
545