/*-
 * Copyright 2009 Colin Percival
 * Copyright 2012-2018 Alexander Peslyak
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * This file was originally written by Colin Percival as part of the Tarsnap
 * online backup system.
 */

/* JtR hack: don't use OpenMP inside (ye)scrypt */
#undef _OPENMP

/*
 * AVX and especially XOP speed up Salsa20 a lot, but this mostly matters for
 * classic scrypt and for YESCRYPT_WORM (which use 8 rounds of Salsa20 per
 * sub-block), and much less so for YESCRYPT_RW (which uses 2 rounds of Salsa20
 * per block except during pwxform S-box initialization).
 */
#if 0 /* FIXME */
#ifdef __XOP__
#warning "Note: XOP is enabled.  That's great."
#elif defined(__AVX__)
#warning "Note: AVX is enabled, which is great for classic scrypt and YESCRYPT_WORM, but is sometimes slightly slower than plain SSE2 for YESCRYPT_RW"
#elif defined(__SSE2__)
#warning "Note: AVX and XOP are not enabled, which is great for YESCRYPT_RW, but they would substantially improve performance for classic scrypt and YESCRYPT_WORM"
#elif defined(__x86_64__) || defined(__i386__)
#warning "SSE2 not enabled.  Expect poor performance."
#else
#warning "Note: building generic code for non-x86.  That's OK."
#endif
#endif /* 0 */

/*
 * The SSE4 code version has fewer instructions than the generic SSE2 version,
 * but all of the instructions are SIMD, thereby wasting the scalar execution
 * units.  Thus, the generic SSE2 version below actually runs faster on some
 * CPUs due to its balanced mix of SIMD and scalar instructions.
 */
#undef USE_SSE4_FOR_32BIT

#ifdef __SSE2__
/*
 * GCC before 4.9 would by default unnecessarily use store/load (without
 * SSE4.1) or (V)PEXTR (with SSE4.1 or AVX) instead of simply (V)MOV.
 * This was tracked as GCC bug 54349.
 * "-mtune=corei7" works around this, but is only supported for GCC 4.6+.
 * We use inline asm for pre-4.6 GCC, further down this file.
 */
#if __GNUC__ == 4 && __GNUC_MINOR__ >= 6 && __GNUC_MINOR__ < 9 && \
    !defined(__clang__) && !defined(__ICC)
#pragma GCC target ("tune=corei7")
#endif
#include <emmintrin.h>
#ifdef __XOP__
#include <x86intrin.h>
#endif
#elif defined(__SSE__)
#include <xmmintrin.h>
#endif

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "insecure_memzero.h"
#include "sha256.h"
#include "sysendian.h"

#define YESCRYPT_INTERNAL
#include "yescrypt.h"

#include "yescrypt-platform.c"

#if __STDC_VERSION__ >= 199901L
/* Have restrict */
#elif defined(__GNUC__)
#define restrict __restrict
#else
#define restrict
#endif

#ifdef __GNUC__
#define unlikely(exp) __builtin_expect(exp, 0)
#else
#define unlikely(exp) (exp)
#endif

#ifdef __SSE__
#define PREFETCH(x, hint) _mm_prefetch((const char *)(x), (hint));
#else
#undef PREFETCH
#endif

typedef union {
	uint32_t w[16];
	uint64_t d[8];
#ifdef __SSE2__
	__m128i q[4];
#endif
} salsa20_blk_t;

static inline void salsa20_simd_shuffle(const salsa20_blk_t *Bin,
    salsa20_blk_t *Bout)
{
#define COMBINE(out, in1, in2) \
	Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32);
	COMBINE(0, 0, 2)
	COMBINE(1, 5, 7)
	COMBINE(2, 2, 4)
	COMBINE(3, 7, 1)
	COMBINE(4, 4, 6)
	COMBINE(5, 1, 3)
	COMBINE(6, 6, 0)
	COMBINE(7, 3, 5)
#undef COMBINE
}

static inline void salsa20_simd_unshuffle(const salsa20_blk_t *Bin,
    salsa20_blk_t *Bout)
{
#define UNCOMBINE(out, in1, in2) \
	Bout->w[out * 2] = Bin->d[in1]; \
	Bout->w[out * 2 + 1] = Bin->d[in2] >> 32;
	UNCOMBINE(0, 0, 6)
	UNCOMBINE(1, 5, 3)
	UNCOMBINE(2, 2, 0)
	UNCOMBINE(3, 7, 5)
	UNCOMBINE(4, 4, 2)
	UNCOMBINE(5, 1, 7)
	UNCOMBINE(6, 6, 4)
	UNCOMBINE(7, 3, 1)
#undef UNCOMBINE
}
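
/*
 * The two helpers above are inverses of each other: shuffling and then
 * unshuffling a block must reproduce it exactly.  A minimal standalone
 * check (illustrative only, hence not compiled in):
 */
#if 0
static int salsa20_shuffle_selftest(void)
{
	salsa20_blk_t in, mid, out;
	size_t k;
	for (k = 0; k < 16; k++)
		in.w[k] = 0x01010101U * (uint32_t)(k + 1); /* arbitrary pattern */
	salsa20_simd_shuffle(&in, &mid);
	salsa20_simd_unshuffle(&mid, &out);
	return memcmp(in.w, out.w, sizeof(in.w)) == 0; /* expect 1 */
}
#endif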

#ifdef __SSE2__
#define DECL_X \
	__m128i X0, X1, X2, X3;
#define DECL_Y \
	__m128i Y0, Y1, Y2, Y3;
#define READ_X(in) \
	X0 = (in).q[0]; X1 = (in).q[1]; X2 = (in).q[2]; X3 = (in).q[3];
#define WRITE_X(out) \
	(out).q[0] = X0; (out).q[1] = X1; (out).q[2] = X2; (out).q[3] = X3;

#ifdef __XOP__
#define ARX(out, in1, in2, s) \
	out = _mm_xor_si128(out, _mm_roti_epi32(_mm_add_epi32(in1, in2), s));
#else
#define ARX(out, in1, in2, s) { \
	__m128i tmp = _mm_add_epi32(in1, in2); \
	out = _mm_xor_si128(out, _mm_slli_epi32(tmp, s)); \
	out = _mm_xor_si128(out, _mm_srli_epi32(tmp, 32 - s)); \
}
#endif

#define SALSA20_2ROUNDS \
	/* Operate on "columns" */ \
	ARX(X1, X0, X3, 7) \
	ARX(X2, X1, X0, 9) \
	ARX(X3, X2, X1, 13) \
	ARX(X0, X3, X2, 18) \
	/* Rearrange data */ \
	X1 = _mm_shuffle_epi32(X1, 0x93); \
	X2 = _mm_shuffle_epi32(X2, 0x4E); \
	X3 = _mm_shuffle_epi32(X3, 0x39); \
	/* Operate on "rows" */ \
	ARX(X3, X0, X1, 7) \
	ARX(X2, X3, X0, 9) \
	ARX(X1, X2, X3, 13) \
	ARX(X0, X1, X2, 18) \
	/* Rearrange data */ \
	X1 = _mm_shuffle_epi32(X1, 0x39); \
	X2 = _mm_shuffle_epi32(X2, 0x4E); \
	X3 = _mm_shuffle_epi32(X3, 0x93);

/**
 * Apply the Salsa20 core to the block provided in (X0 ... X3).
 */
#define SALSA20_wrapper(out, rounds) { \
	__m128i Z0 = X0, Z1 = X1, Z2 = X2, Z3 = X3; \
	rounds \
	(out).q[0] = X0 = _mm_add_epi32(X0, Z0); \
	(out).q[1] = X1 = _mm_add_epi32(X1, Z1); \
	(out).q[2] = X2 = _mm_add_epi32(X2, Z2); \
	(out).q[3] = X3 = _mm_add_epi32(X3, Z3); \
}

/**
 * Apply the Salsa20/2 core to the block provided in X.
 */
#define SALSA20_2(out) \
	SALSA20_wrapper(out, SALSA20_2ROUNDS)

#define SALSA20_8ROUNDS \
	SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS SALSA20_2ROUNDS

#define XOR_X(in) \
	X0 = _mm_xor_si128(X0, (in).q[0]); \
	X1 = _mm_xor_si128(X1, (in).q[1]); \
	X2 = _mm_xor_si128(X2, (in).q[2]); \
	X3 = _mm_xor_si128(X3, (in).q[3]);

#define XOR_X_2(in1, in2) \
	X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \
	X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \
	X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \
	X3 = _mm_xor_si128((in1).q[3], (in2).q[3]);

#define XOR_X_WRITE_XOR_Y_2(out, in) \
	(out).q[0] = Y0 = _mm_xor_si128((out).q[0], (in).q[0]); \
	(out).q[1] = Y1 = _mm_xor_si128((out).q[1], (in).q[1]); \
	(out).q[2] = Y2 = _mm_xor_si128((out).q[2], (in).q[2]); \
	(out).q[3] = Y3 = _mm_xor_si128((out).q[3], (in).q[3]); \
	X0 = _mm_xor_si128(X0, Y0); \
	X1 = _mm_xor_si128(X1, Y1); \
	X2 = _mm_xor_si128(X2, Y2); \
	X3 = _mm_xor_si128(X3, Y3);

/**
 * Apply the Salsa20/8 core to the block provided in X ^ in.
 */
#define SALSA20_8_XOR_MEM(in, out) \
	XOR_X(in) \
	SALSA20_wrapper(out, SALSA20_8ROUNDS)

#define INTEGERIFY _mm_cvtsi128_si32(X0)

#else /* !defined(__SSE2__) */

#define DECL_X \
	salsa20_blk_t X;
#define DECL_Y \
	salsa20_blk_t Y;

#define COPY(out, in) \
	(out).d[0] = (in).d[0]; \
	(out).d[1] = (in).d[1]; \
	(out).d[2] = (in).d[2]; \
	(out).d[3] = (in).d[3]; \
	(out).d[4] = (in).d[4]; \
	(out).d[5] = (in).d[5]; \
	(out).d[6] = (in).d[6]; \
	(out).d[7] = (in).d[7];

#define READ_X(in) COPY(X, in)
#define WRITE_X(out) COPY(out, X)

/**
 * salsa20(B):
 * Apply the Salsa20 core to the provided block.
 */
static inline void salsa20(salsa20_blk_t *restrict B,
    salsa20_blk_t *restrict Bout, uint32_t doublerounds)
{
	salsa20_blk_t X;
#define x X.w

	salsa20_simd_unshuffle(B, &X);

	do {
#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
		/* Operate on columns */
		x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
		x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);

		x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
		x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);

		x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
		x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);

		x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
		x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);

		/* Operate on rows */
		x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
		x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);

		x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
		x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);

		x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
		x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);

		x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
		x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
#undef R
	} while (--doublerounds);
#undef x

	{
		uint32_t i;
		salsa20_simd_shuffle(&X, Bout);
		for (i = 0; i < 16; i += 4) {
			B->w[i] = Bout->w[i] += B->w[i];
			B->w[i + 1] = Bout->w[i + 1] += B->w[i + 1];
			B->w[i + 2] = Bout->w[i + 2] += B->w[i + 2];
			B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3];
		}
	}

#if 0
	/* Too expensive */
	insecure_memzero(&X, sizeof(X));
#endif
}

/**
 * Apply the Salsa20/2 core to the block provided in X.
 */
#define SALSA20_2(out) \
	salsa20(&X, &out, 1);

#define XOR(out, in1, in2) \
	(out).d[0] = (in1).d[0] ^ (in2).d[0]; \
	(out).d[1] = (in1).d[1] ^ (in2).d[1]; \
	(out).d[2] = (in1).d[2] ^ (in2).d[2]; \
	(out).d[3] = (in1).d[3] ^ (in2).d[3]; \
	(out).d[4] = (in1).d[4] ^ (in2).d[4]; \
	(out).d[5] = (in1).d[5] ^ (in2).d[5]; \
	(out).d[6] = (in1).d[6] ^ (in2).d[6]; \
	(out).d[7] = (in1).d[7] ^ (in2).d[7];

#define XOR_X(in) XOR(X, X, in)
#define XOR_X_2(in1, in2) XOR(X, in1, in2)
#define XOR_X_WRITE_XOR_Y_2(out, in) \
	XOR(Y, out, in) \
	COPY(out, Y) \
	XOR(X, X, Y)

/**
 * Apply the Salsa20/8 core to the block provided in X ^ in.
 */
#define SALSA20_8_XOR_MEM(in, out) \
	XOR_X(in); \
	salsa20(&X, &out, 4);

#define INTEGERIFY (uint32_t)X.d[0]
#endif

/**
 * blockmix_salsa8(Bin, Bout, r):
 * Compute Bout = BlockMix_{salsa20/8, r}(Bin).  The input Bin must be 128r
 * bytes in length; the output Bout must also be the same size.
 */
static void blockmix_salsa8(const salsa20_blk_t *restrict Bin,
    salsa20_blk_t *restrict Bout, size_t r)
{
	size_t i;
	DECL_X

	READ_X(Bin[r * 2 - 1])
	for (i = 0; i < r; i++) {
		SALSA20_8_XOR_MEM(Bin[i * 2], Bout[i])
		SALSA20_8_XOR_MEM(Bin[i * 2 + 1], Bout[r + i])
	}
}
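
/*
 * Note the output layout above: sub-blocks at even input indices produce
 * Bout[0 .. r-1] and those at odd indices produce Bout[r .. 2r-1], which is
 * the even/odd interleave that scrypt's BlockMix specifies.
 */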

static uint32_t blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1,
    const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout,
    size_t r)
{
	size_t i;
	DECL_X

#ifdef PREFETCH
	PREFETCH(&Bin2[r * 2 - 1], _MM_HINT_T0)
	for (i = 0; i < r - 1; i++) {
		PREFETCH(&Bin2[i * 2], _MM_HINT_T0)
		PREFETCH(&Bin2[i * 2 + 1], _MM_HINT_T0)
	}
	PREFETCH(&Bin2[i * 2], _MM_HINT_T0)
#endif

	XOR_X_2(Bin1[r * 2 - 1], Bin2[r * 2 - 1])
	for (i = 0; i < r; i++) {
		XOR_X(Bin1[i * 2])
		SALSA20_8_XOR_MEM(Bin2[i * 2], Bout[i])
		XOR_X(Bin1[i * 2 + 1])
		SALSA20_8_XOR_MEM(Bin2[i * 2 + 1], Bout[r + i])
	}

	return INTEGERIFY;
}

/* This is tunable */
#define Swidth 8

/* Not tunable in this implementation, hard-coded in a few places */
#define PWXsimple 2
#define PWXgather 4

/* Derived values.  Not tunable except via Swidth above. */
#define PWXbytes (PWXgather * PWXsimple * 8)
#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8)
#define Smask (((1 << Swidth) - 1) * PWXsimple * 8)
#define Smask2 (((uint64_t)Smask << 32) | Smask)
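
/*
 * With the settings above (Swidth = 8, PWXsimple = 2, PWXgather = 4), these
 * work out to PWXbytes = 4 * 2 * 8 = 64 (one pwxform block per 64-byte
 * sub-block), Sbytes = 3 * 256 * 2 * 8 = 12288 (three 4 KiB S-boxes), and
 * Smask = 255 * 16 = 0xff0 (selecting a 16-byte aligned S-box entry).
 */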

#define DECL_SMASK2REG /* empty */
#define FORCE_REGALLOC_3 /* empty */
#define MAYBE_MEMORY_BARRIER /* empty */

#ifdef __SSE2__
/*
 * (V)PSRLDQ and (V)PSHUFD have higher throughput than (V)PSRLQ on some CPUs
 * starting with Sandy Bridge.  Additionally, PSHUFD uses separate source and
 * destination registers, whereas the shifts would require an extra move
 * instruction for our code when building without AVX.  Unfortunately, PSHUFD
 * is much slower on Conroe (4 cycles latency vs. 1 cycle latency for PSRLQ)
 * and somewhat slower on some non-Intel CPUs (luckily not including AMD
 * Bulldozer and Piledriver).
 */
#ifdef __AVX__
#define HI32(X) \
	_mm_srli_si128((X), 4)
#elif 1 /* As an option, check for __SSE4_1__ here so as not to hurt Conroe */
#define HI32(X) \
	_mm_shuffle_epi32((X), _MM_SHUFFLE(2,3,0,1))
#else
#define HI32(X) \
	_mm_srli_epi64((X), 32)
#endif

#if defined(__x86_64__) && \
    __GNUC__ == 4 && __GNUC_MINOR__ < 6 && !defined(__ICC)
#ifdef __AVX__
#define MOVQ "vmovq"
#else
/* "movq" would be more correct, but "movd" is supported by older binutils
 * due to an error in AMD's spec for x86-64. */
#define MOVQ "movd"
#endif
#define EXTRACT64(X) ({ \
	uint64_t result; \
	__asm__(MOVQ " %1, %0" : "=r" (result) : "x" (X)); \
	result; \
})
#elif defined(__x86_64__) && !defined(_MSC_VER) && !defined(__OPEN64__)
/* MSVC and Open64 had bugs */
#define EXTRACT64(X) _mm_cvtsi128_si64(X)
#elif defined(__x86_64__) && defined(__SSE4_1__)
/* No known bugs for this intrinsic */
#include <smmintrin.h>
#define EXTRACT64(X) _mm_extract_epi64((X), 0)
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
/* 32-bit */
#include <smmintrin.h>
#if 0
/* This is currently unused by the code below, which instead uses these two
 * intrinsics explicitly when (!defined(__x86_64__) && defined(__SSE4_1__)) */
#define EXTRACT64(X) \
	((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
	((uint64_t)(uint32_t)_mm_extract_epi32((X), 1) << 32))
#endif
#else
/* 32-bit or compilers with known past bugs in _mm_cvtsi128_si64() */
#define EXTRACT64(X) \
	((uint64_t)(uint32_t)_mm_cvtsi128_si32(X) | \
	((uint64_t)(uint32_t)_mm_cvtsi128_si32(HI32(X)) << 32))
#endif

#if defined(__x86_64__) && (defined(__AVX__) || !defined(__GNUC__))
/* 64-bit with AVX */
/* Force use of 64-bit AND instead of two 32-bit ANDs */
#undef DECL_SMASK2REG
#if defined(__GNUC__) && !defined(__ICC)
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2;
/* Force use of lower-numbered registers to reduce number of prefixes, relying
 * on out-of-order execution and register renaming. */
#define FORCE_REGALLOC_1 \
	__asm__("" : "=a" (x), "+d" (Smask2reg), "+S" (S0), "+D" (S1));
#define FORCE_REGALLOC_2 \
	__asm__("" : : "c" (lo));
#else
static volatile uint64_t Smask2var = Smask2;
#define DECL_SMASK2REG uint64_t Smask2reg = Smask2var;
#define FORCE_REGALLOC_1 /* empty */
#define FORCE_REGALLOC_2 /* empty */
#endif
#define PWXFORM_SIMD(X) { \
	uint64_t x; \
	FORCE_REGALLOC_1 \
	uint32_t lo = x = EXTRACT64(X) & Smask2reg; \
	FORCE_REGALLOC_2 \
	uint32_t hi = x >> 32; \
	X = _mm_mul_epu32(HI32(X), X); \
	X = _mm_add_epi64(X, *(__m128i *)(S0 + lo)); \
	X = _mm_xor_si128(X, *(__m128i *)(S1 + hi)); \
}
#elif defined(__x86_64__)
/* 64-bit without AVX.  This relies on out-of-order execution and register
 * renaming.  It may actually be fastest on CPUs with AVX(2) as well - e.g.,
 * it runs great on Haswell. */
#if 0 /* FIXME */
#warning "Note: using x86-64 inline assembly for YESCRYPT_RW.  That's great."
#endif
/* We need a compiler memory barrier between sub-blocks to ensure that none of
 * the writes into what was S2 during processing of the previous sub-block are
 * postponed until after a read from S0 or S1 in the inline asm code below. */
#undef MAYBE_MEMORY_BARRIER
#define MAYBE_MEMORY_BARRIER \
	__asm__("" : : : "memory");
#ifdef __ILP32__ /* x32 */
#define REGISTER_PREFIX "e"
#else
#define REGISTER_PREFIX "r"
#endif
#define PWXFORM_SIMD(X) { \
	__m128i H; \
	__asm__( \
	    "movd %0, %%rax\n\t" \
	    "pshufd $0xb1, %0, %1\n\t" \
	    "andq %2, %%rax\n\t" \
	    "pmuludq %1, %0\n\t" \
	    "movl %%eax, %%ecx\n\t" \
	    "shrq $0x20, %%rax\n\t" \
	    "paddq (%3,%%" REGISTER_PREFIX "cx), %0\n\t" \
	    "pxor (%4,%%" REGISTER_PREFIX "ax), %0\n\t" \
	    : "+x" (X), "=x" (H) \
	    : "d" (Smask2), "S" (S0), "D" (S1) \
	    : "cc", "ax", "cx"); \
}
#elif defined(USE_SSE4_FOR_32BIT) && defined(__SSE4_1__)
/* 32-bit with SSE4.1 */
#define PWXFORM_SIMD(X) { \
	__m128i x = _mm_and_si128(X, _mm_set1_epi64x(Smask2)); \
	__m128i s0 = *(__m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x)); \
	__m128i s1 = *(__m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1)); \
	X = _mm_mul_epu32(HI32(X), X); \
	X = _mm_add_epi64(X, s0); \
	X = _mm_xor_si128(X, s1); \
}
#else
/* 32-bit without SSE4.1 */
#define PWXFORM_SIMD(X) { \
	uint64_t x = EXTRACT64(X) & Smask2; \
	__m128i s0 = *(__m128i *)(S0 + (uint32_t)x); \
	__m128i s1 = *(__m128i *)(S1 + (x >> 32)); \
	X = _mm_mul_epu32(HI32(X), X); \
	X = _mm_add_epi64(X, s0); \
	X = _mm_xor_si128(X, s1); \
}
#endif

#define PWXFORM_ROUND \
	PWXFORM_SIMD(X0) \
	PWXFORM_SIMD(X1) \
	PWXFORM_SIMD(X2) \
	PWXFORM_SIMD(X3)

#if defined(__x86_64__) && defined(__GNUC__) && !defined(__ICC)
#undef FORCE_REGALLOC_3
#define FORCE_REGALLOC_3 __asm__("" : : "b" (Sw));
#endif

#else /* !defined(__SSE2__) */

#define PWXFORM_SIMD(x0, x1) { \
	uint64_t x = x0 & Smask2; \
	uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \
	uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \
	x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \
	x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \
}
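
/*
 * In other words, per invocation of the scalar PWXFORM_SIMD above: each
 * 64-bit lane has its high 32 bits multiplied by its low 32 bits, then an
 * S0 entry added and an S1 entry XORed in, with both S-box indices derived
 * from the first lane's masked value - mirroring the SIMD variants earlier
 * in this file.
 */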

#define PWXFORM_ROUND \
	PWXFORM_SIMD(X.d[0], X.d[1]) \
	PWXFORM_SIMD(X.d[2], X.d[3]) \
	PWXFORM_SIMD(X.d[4], X.d[5]) \
	PWXFORM_SIMD(X.d[6], X.d[7])
#endif

/*
 * This offset helps address the 256-byte write block via the single-byte
 * displacements encodable in x86(-64) instructions.  It is needed because the
 * displacements are signed.  Without it, we'd get 4-byte displacements for
 * half of the writes.  Setting it to 0x80 instead of 0x7c would avoid needing
 * a displacement for one of the writes, but then the LEA instruction would
 * need a 4-byte displacement.
 */
#define PWXFORM_WRITE_OFFSET 0x7c
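
/*
 * For example, if the compiler folds the Sw increments below into the store
 * displacements, the 16 stores covering the 256-byte block would use
 * unbiased displacements of 0x00 .. 0xf0, half of which exceed 0x7f and so
 * would need 4 bytes each; biased by 0x7c they fall in -0x7c .. +0x74, all
 * encodable in a single signed byte.
 */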

#define PWXFORM_WRITE \
	WRITE_X(*(salsa20_blk_t *)(Sw - PWXFORM_WRITE_OFFSET)) \
	Sw += 64;

#define PWXFORM { \
	uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \
	FORCE_REGALLOC_3 \
	MAYBE_MEMORY_BARRIER \
	PWXFORM_ROUND \
	PWXFORM_ROUND PWXFORM_WRITE \
	PWXFORM_ROUND PWXFORM_WRITE \
	PWXFORM_ROUND PWXFORM_WRITE \
	PWXFORM_ROUND PWXFORM_WRITE \
	PWXFORM_ROUND \
	w = (w + 64 * 4) & Smask2; \
	{ \
		uint8_t *Stmp = S2; \
		S2 = S1; \
		S1 = S0; \
		S0 = Stmp; \
	} \
}

typedef struct {
	uint8_t *S0, *S1, *S2;
	size_t w;
} pwxform_ctx_t;

#define Salloc (Sbytes + ((sizeof(pwxform_ctx_t) + 63) & ~63U))
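
/*
 * Salloc is thus the per-thread allocation unit used by smix() below:
 * Sbytes of S-box data followed by a pwxform_ctx_t rounded up to a 64-byte
 * boundary, so that consecutive threads' regions remain cache-line aligned.
 */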

/**
 * blockmix_pwxform(Bin, Bout, r, S):
 * Compute Bout = BlockMix_pwxform{salsa20/2, r, S}(Bin).  The input Bin must
 * be 128r bytes in length; the output Bout must also be the same size.
 */
static void blockmix(const salsa20_blk_t *restrict Bin,
    salsa20_blk_t *restrict Bout, size_t r, pwxform_ctx_t *restrict ctx)
{
	uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2;
	size_t w = ctx->w;
	size_t i;
	DECL_X

	/* Convert count of 128-byte blocks to max index of 64-byte block */
	r = r * 2 - 1;

	READ_X(Bin[r])

	DECL_SMASK2REG

	i = 0;
	do {
		XOR_X(Bin[i])
		PWXFORM
		if (unlikely(i >= r))
			break;
		WRITE_X(Bout[i])
		i++;
	} while (1);

	ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2;
	ctx->w = w;

	SALSA20_2(Bout[i])
}

static uint32_t blockmix_xor(const salsa20_blk_t *Bin1,
    const salsa20_blk_t *restrict Bin2, salsa20_blk_t *Bout,
    size_t r, int Bin2_in_ROM, pwxform_ctx_t *restrict ctx)
{
	uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2;
	size_t w = ctx->w;
	size_t i;
	DECL_X

	/* Convert count of 128-byte blocks to max index of 64-byte block */
	r = r * 2 - 1;

#ifdef PREFETCH
	if (Bin2_in_ROM) {
		PREFETCH(&Bin2[r], _MM_HINT_NTA)
		for (i = 0; i < r; i++) {
			PREFETCH(&Bin2[i], _MM_HINT_NTA)
		}
	} else {
		PREFETCH(&Bin2[r], _MM_HINT_T0)
		for (i = 0; i < r; i++) {
			PREFETCH(&Bin2[i], _MM_HINT_T0)
		}
	}
#else
	(void)Bin2_in_ROM; /* unused */
#endif

	XOR_X_2(Bin1[r], Bin2[r])

	DECL_SMASK2REG

	i = 0;
	r--;
	do {
		XOR_X(Bin1[i])
		XOR_X(Bin2[i])
		PWXFORM
		WRITE_X(Bout[i])

		XOR_X(Bin1[i + 1])
		XOR_X(Bin2[i + 1])
		PWXFORM

		if (unlikely(i >= r))
			break;

		WRITE_X(Bout[i + 1])

		i += 2;
	} while (1);
	i++;

	ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2;
	ctx->w = w;

	SALSA20_2(Bout[i])

	return INTEGERIFY;
}

static uint32_t blockmix_xor_save(salsa20_blk_t *restrict Bin1out,
    salsa20_blk_t *restrict Bin2,
    size_t r, pwxform_ctx_t *restrict ctx)
{
	uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2;
	size_t w = ctx->w;
	size_t i;
	DECL_X
	DECL_Y

	/* Convert count of 128-byte blocks to max index of 64-byte block */
	r = r * 2 - 1;

#ifdef PREFETCH
	PREFETCH(&Bin2[r], _MM_HINT_T0)
	for (i = 0; i < r; i++) {
		PREFETCH(&Bin2[i], _MM_HINT_T0)
	}
#endif

	XOR_X_2(Bin1out[r], Bin2[r])

	DECL_SMASK2REG

	i = 0;
	r--;
	do {
		XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i])
		PWXFORM
		WRITE_X(Bin1out[i])

		XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1])
		PWXFORM

		if (unlikely(i >= r))
			break;

		WRITE_X(Bin1out[i + 1])

		i += 2;
	} while (1);
	i++;

	ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2;
	ctx->w = w;

	SALSA20_2(Bin1out[i])

	return INTEGERIFY;
}

/**
 * integerify(B, r):
 * Return the result of parsing B_{2r-1} as a little-endian integer.
 */
static inline uint32_t integerify(const salsa20_blk_t *B, size_t r)
{
/*
 * Our 64-bit words are in host byte order, which is why we don't just read
 * w[0] here (would be wrong on big-endian).  Also, our 32-bit words are
 * SIMD-shuffled (so the next 32 bits would be part of d[6]), but currently
 * this does not matter as we only care about the least significant 32 bits.
 */
	return (uint32_t)B[2 * r - 1].d[0];
}

/**
 * smix1(B, r, N, flags, V, NROM, VROM, XY, ctx):
 * Compute first loop of B = SMix_r(B, N).  The input B must be 128r bytes in
 * length; the temporary storage V must be 128rN bytes in length; the temporary
 * storage XY must be 128r+64 bytes in length.  N must be even and at least 4.
 * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY
 * to a multiple of at least 16 bytes.
 */
static void smix1(uint8_t *B, size_t r, uint32_t N, yescrypt_flags_t flags,
    salsa20_blk_t *V, uint32_t NROM, const salsa20_blk_t *VROM,
    salsa20_blk_t *XY, pwxform_ctx_t *ctx)
{
	size_t s = 2 * r;
	salsa20_blk_t *X = V, *Y = &V[s];
	uint32_t i, j;

	for (i = 0; i < 2 * r; i++) {
		const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64];
		salsa20_blk_t *tmp = Y;
		salsa20_blk_t *dst = &X[i];
		size_t k;
		for (k = 0; k < 16; k++)
			tmp->w[k] = le32dec(&src->w[k]);
		salsa20_simd_shuffle(tmp, dst);
	}

	if (VROM) {
		uint32_t n;
		const salsa20_blk_t *V_j;

		V_j = &VROM[(NROM - 1) * s];
		j = blockmix_xor(X, V_j, Y, r, 1, ctx) & (NROM - 1);
		V_j = &VROM[j * s];
		X = Y + s;
		j = blockmix_xor(Y, V_j, X, r, 1, ctx);

		for (n = 2; n < N; n <<= 1) {
			uint32_t m = (n < N / 2) ? n : (N - 1 - n);
			for (i = 1; i < m; i += 2) {
				j &= n - 1;
				j += i - 1;
				V_j = &V[j * s];
				Y = X + s;
				j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1);
				V_j = &VROM[j * s];
				X = Y + s;
				j = blockmix_xor(Y, V_j, X, r, 1, ctx);
			}
		}
		n >>= 1;

		j &= n - 1;
		j += N - 2 - n;
		V_j = &V[j * s];
		Y = X + s;
		j = blockmix_xor(X, V_j, Y, r, 0, ctx) & (NROM - 1);
		V_j = &VROM[j * s];
		blockmix_xor(Y, V_j, XY, r, 1, ctx);
	} else if (flags & YESCRYPT_RW) {
		uint32_t n;
		salsa20_blk_t *V_j;

		blockmix(X, Y, r, ctx);
		X = Y + s;
		blockmix(Y, X, r, ctx);
		j = integerify(X, r);

		for (n = 2; n < N; n <<= 1) {
			uint32_t m = (n < N / 2) ? n : (N - 1 - n);
			for (i = 1; i < m; i += 2) {
				Y = X + s;
				j &= n - 1;
				j += i - 1;
				V_j = &V[j * s];
				j = blockmix_xor(X, V_j, Y, r, 0, ctx);
				j &= n - 1;
				j += i;
				V_j = &V[j * s];
				X = Y + s;
				j = blockmix_xor(Y, V_j, X, r, 0, ctx);
			}
		}
		n >>= 1;

		j &= n - 1;
		j += N - 2 - n;
		V_j = &V[j * s];
		Y = X + s;
		j = blockmix_xor(X, V_j, Y, r, 0, ctx);
		j &= n - 1;
		j += N - 1 - n;
		V_j = &V[j * s];
		blockmix_xor(Y, V_j, XY, r, 0, ctx);
	} else {
		N -= 2;
		do {
			blockmix_salsa8(X, Y, r);
			X = Y + s;
			blockmix_salsa8(Y, X, r);
			Y = X + s;
		} while ((N -= 2));

		blockmix_salsa8(X, Y, r);
		blockmix_salsa8(Y, XY, r);
	}

	for (i = 0; i < 2 * r; i++) {
		const salsa20_blk_t *src = &XY[i];
		salsa20_blk_t *tmp = &XY[s];
		salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
		size_t k;
		for (k = 0; k < 16; k++)
			le32enc(&tmp->w[k], src->w[k]);
		salsa20_simd_unshuffle(tmp, dst);
	}
}

/**
 * smix2(B, r, N, Nloop, flags, V, NROM, VROM, XY, ctx):
 * Compute second loop of B = SMix_r(B, N).  The input B must be 128r bytes in
 * length; the temporary storage V must be 128rN bytes in length; the temporary
 * storage XY must be 256r bytes in length.  N must be a power of 2 and at
 * least 2.  Nloop must be even.  The array V must be aligned to a multiple of
 * 64 bytes, and arrays B and XY to a multiple of at least 16 bytes.
 */
static void smix2(uint8_t *B, size_t r, uint32_t N, uint64_t Nloop,
    yescrypt_flags_t flags, salsa20_blk_t *V, uint32_t NROM,
    const salsa20_blk_t *VROM, salsa20_blk_t *XY, pwxform_ctx_t *ctx)
{
	size_t s = 2 * r;
	salsa20_blk_t *X = XY, *Y = &XY[s];
	uint32_t i, j;

	if (Nloop == 0)
		return;

	for (i = 0; i < 2 * r; i++) {
		const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64];
		salsa20_blk_t *tmp = Y;
		salsa20_blk_t *dst = &X[i];
		size_t k;
		for (k = 0; k < 16; k++)
			tmp->w[k] = le32dec(&src->w[k]);
		salsa20_simd_shuffle(tmp, dst);
	}

	j = integerify(X, r) & (N - 1);

/*
 * Normally, VROM implies YESCRYPT_RW, but we check for these separately
 * because our SMix resets YESCRYPT_RW for the smix2() calls operating on the
 * entire V when p > 1.
 */
	if (VROM && (flags & YESCRYPT_RW)) {
		do {
			salsa20_blk_t *V_j = &V[j * s];
			const salsa20_blk_t *VROM_j;
			j = blockmix_xor_save(X, V_j, r, ctx) & (NROM - 1);
			VROM_j = &VROM[j * s];
			j = blockmix_xor(X, VROM_j, X, r, 1, ctx) & (N - 1);
		} while (Nloop -= 2);
	} else if (VROM) {
		do {
			const salsa20_blk_t *V_j = &V[j * s];
			j = blockmix_xor(X, V_j, X, r, 0, ctx) & (NROM - 1);
			V_j = &VROM[j * s];
			j = blockmix_xor(X, V_j, X, r, 1, ctx) & (N - 1);
		} while (Nloop -= 2);
	} else if (flags & YESCRYPT_RW) {
		do {
			salsa20_blk_t *V_j = &V[j * s];
			j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1);
			V_j = &V[j * s];
			j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1);
		} while (Nloop -= 2);
	} else if (ctx) {
		do {
			const salsa20_blk_t *V_j = &V[j * s];
			j = blockmix_xor(X, V_j, X, r, 0, ctx) & (N - 1);
			V_j = &V[j * s];
			j = blockmix_xor(X, V_j, X, r, 0, ctx) & (N - 1);
		} while (Nloop -= 2);
	} else {
		do {
			const salsa20_blk_t *V_j = &V[j * s];
			j = blockmix_salsa8_xor(X, V_j, Y, r) & (N - 1);
			V_j = &V[j * s];
			j = blockmix_salsa8_xor(Y, V_j, X, r) & (N - 1);
		} while (Nloop -= 2);
	}

	for (i = 0; i < 2 * r; i++) {
		const salsa20_blk_t *src = &X[i];
		salsa20_blk_t *tmp = Y;
		salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64];
		size_t k;
		for (k = 0; k < 16; k++)
			le32enc(&tmp->w[k], src->w[k]);
		salsa20_simd_unshuffle(tmp, dst);
	}
}

/**
 * p2floor(x):
 * Largest power of 2 not greater than argument.
 */
static uint64_t p2floor(uint64_t x)
{
	uint64_t y;
	while ((y = x & (x - 1)))
		x = y;
	return x;
}
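
/*
 * p2floor() clears the lowest set bit until a single bit remains, e.g.
 * p2floor(5) = 4, p2floor(8) = 8, p2floor(12) = 8.
 */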

/**
 * smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, passwd):
 * Compute B = SMix_r(B, N).  The input B must be 128rp bytes in length; the
 * temporary storage V must be 128rN bytes in length; the temporary storage
 * XY must be 256r or 256rp bytes in length (the larger size is required with
 * OpenMP-enabled builds).  N must be a power of 2 and at least 4.  The array V
 * must be aligned to a multiple of 64 bytes, and arrays B and XY to a multiple
 * of at least 16 bytes (aligning them to 64 bytes as well saves cache lines
 * and helps avoid false sharing in OpenMP-enabled builds when p > 1, but it
 * might also result in cache bank conflicts).
 */
static void smix(uint8_t *B, size_t r, uint32_t N, uint32_t p, uint32_t t,
    yescrypt_flags_t flags,
    salsa20_blk_t *V, uint32_t NROM, const salsa20_blk_t *VROM,
    salsa20_blk_t *XY, uint8_t *S, uint8_t *passwd)
{
	size_t s = 2 * r;
	uint32_t Nchunk;
	uint64_t Nloop_all, Nloop_rw;
	uint32_t i;

	Nchunk = N / p;
	Nloop_all = Nchunk;
	if (flags & YESCRYPT_RW) {
		if (t <= 1) {
			if (t)
				Nloop_all *= 2; /* 2/3 */
			Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */
		} else {
			Nloop_all *= t - 1;
		}
	} else if (t) {
		if (t == 1)
			Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */
		Nloop_all *= t;
	}

	Nloop_rw = 0;
	if (flags & YESCRYPT_INIT_SHARED)
		Nloop_rw = Nloop_all;
	else if (flags & YESCRYPT_RW)
		Nloop_rw = Nloop_all / p;

	Nchunk &= ~(uint32_t)1; /* round down to even */
	Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */
	Nloop_rw++; Nloop_rw &= ~(uint64_t)1; /* round up to even */

#ifdef _OPENMP
#pragma omp parallel if (p > 1) default(none) private(i) shared(B, r, N, p, flags, V, NROM, VROM, XY, S, passwd, s, Nchunk, Nloop_all, Nloop_rw)
	{
#pragma omp for
#endif
	for (i = 0; i < p; i++) {
		uint32_t Vchunk = i * Nchunk;
		uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk);
		uint8_t *Bp = &B[128 * r * i];
		salsa20_blk_t *Vp = &V[Vchunk * s];
#ifdef _OPENMP
		salsa20_blk_t *XYp = &XY[i * (2 * s)];
#else
		salsa20_blk_t *XYp = XY;
#endif
		pwxform_ctx_t *ctx_i = NULL;
		if (flags & YESCRYPT_RW) {
			uint8_t *Si = S + i * Salloc;
			smix1(Bp, 1, Sbytes / 128, 0 /* no flags */,
			    (salsa20_blk_t *)Si, 0, NULL, XYp, NULL);
			ctx_i = (pwxform_ctx_t *)(Si + Sbytes);
			ctx_i->S2 = Si;
			ctx_i->S1 = Si + Sbytes / 3;
			ctx_i->S0 = Si + Sbytes / 3 * 2;
			ctx_i->w = 0;
			if (i == 0)
				HMAC_SHA256_Buf(Bp + (128 * r - 64), 64,
				    passwd, 32, passwd);
		}
		smix1(Bp, r, Np, flags, Vp, NROM, VROM, XYp, ctx_i);
		smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp,
		    NROM, VROM, XYp, ctx_i);
	}

	if (Nloop_all > Nloop_rw) {
#ifdef _OPENMP
#pragma omp for
#endif
		for (i = 0; i < p; i++) {
			uint8_t *Bp = &B[128 * r * i];
#ifdef _OPENMP
			salsa20_blk_t *XYp = &XY[i * (2 * s)];
#else
			salsa20_blk_t *XYp = XY;
#endif
			pwxform_ctx_t *ctx_i = NULL;
			if (flags & YESCRYPT_RW) {
				uint8_t *Si = S + i * Salloc;
				ctx_i = (pwxform_ctx_t *)(Si + Sbytes);
			}
			smix2(Bp, r, N, Nloop_all - Nloop_rw,
			    flags & ~YESCRYPT_RW, V, NROM, VROM, XYp, ctx_i);
		}
	}
#ifdef _OPENMP
	}
#endif
}
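
/*
 * smix() above is two-phase: each of the p threads first fills and mixes
 * its own chunk of V (smix1, then smix2 for Nloop_rw iterations), and any
 * remaining iterations are read-only passes over the entire V, performed
 * with YESCRYPT_RW cleared.
 */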

/**
 * yescrypt_kdf_body(shared, local, passwd, passwdlen, salt, saltlen,
 *     flags, N, r, p, t, NROM, buf, buflen):
 * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
 * p, buflen), or a revision of scrypt as requested by flags and shared, and
 * write the result into buf.
 *
 * shared and flags may request special modes as described in yescrypt.h.
 *
 * local is the thread-local data structure, which allows a memory allocation
 * to be preserved and reused across calls, thereby reducing overhead.
 *
 * t controls computation time while not affecting peak memory usage.
 *
 * Return 0 on success; or -1 on error.
 *
 * This optimized implementation currently limits N to the range from 4 to
 * 2^31, but other implementations might not.
 */
static int yescrypt_kdf_body(const yescrypt_shared_t *shared,
    yescrypt_local_t *local,
    const uint8_t *passwd, size_t passwdlen,
    const uint8_t *salt, size_t saltlen,
    yescrypt_flags_t flags, uint64_t N, uint32_t r, uint32_t p, uint32_t t,
    uint64_t NROM,
    uint8_t *buf, size_t buflen)
{
	yescrypt_region_t tmp;
	const salsa20_blk_t *VROM;
	size_t B_size, V_size, XY_size, need;
	uint8_t *B, *S;
	salsa20_blk_t *V, *XY;
	uint8_t sha256[32];
	uint8_t dk[sizeof(sha256)], *dkp = buf;

	/* Sanity-check parameters */
	switch (flags & YESCRYPT_MODE_MASK) {
	case 0: /* classic scrypt - can't have anything non-standard */
		if (flags || t || NROM)
			goto out_EINVAL;
		break;
	case YESCRYPT_WORM:
		if (flags != YESCRYPT_WORM || NROM)
			goto out_EINVAL;
		break;
	case YESCRYPT_RW:
		if (flags != (flags & YESCRYPT_KNOWN_FLAGS))
			goto out_EINVAL;
#if PWXsimple == 2 && PWXgather == 4 && Sbytes == 12288
		if ((flags & YESCRYPT_RW_FLAVOR_MASK) ==
		    (YESCRYPT_ROUNDS_6 | YESCRYPT_GATHER_4 |
		    YESCRYPT_SIMPLE_2 | YESCRYPT_SBOX_12K))
			break;
#else
#error "Unsupported pwxform settings"
#endif
		/* FALLTHRU */
	default:
		goto out_EINVAL;
	}
#if SIZE_MAX > UINT32_MAX
	if (buflen > (((uint64_t)1 << 32) - 1) * 32)
		goto out_EINVAL;
#endif
	if ((uint64_t)r * (uint64_t)p >= 1 << 30)
		goto out_EINVAL;
	if (N > UINT32_MAX)
		goto out_EINVAL;
	if ((N & (N - 1)) != 0 || N <= 3 || r < 1 || p < 1)
		goto out_EINVAL;
	if (r > SIZE_MAX / 256 / p ||
	    N > SIZE_MAX / 128 / r)
		goto out_EINVAL;
	if (flags & YESCRYPT_RW) {
		if (N / p <= 3 || p > SIZE_MAX / Salloc)
			goto out_EINVAL;
	}
#ifdef _OPENMP
	else if (N > SIZE_MAX / 128 / (r * p)) {
		goto out_EINVAL;
	}
#endif

	VROM = NULL;
	if (shared) {
		uint64_t expected_size = (size_t)128 * r * NROM;
		if ((NROM & (NROM - 1)) != 0 ||
		    NROM <= 1 || NROM > UINT32_MAX ||
		    shared->aligned_size < expected_size)
			goto out_EINVAL;
		if (!(flags & YESCRYPT_INIT_SHARED)) {
			uint64_t *tag = (uint64_t *)
			    ((uint8_t *)shared->aligned + expected_size - 48);
			if (tag[0] != YESCRYPT_ROM_TAG1 || tag[1] != YESCRYPT_ROM_TAG2)
				goto out_EINVAL;
		}
		VROM = shared->aligned;
	} else {
		if (NROM)
			goto out_EINVAL;
	}

	/* Allocate memory */
	V = NULL;
	V_size = (size_t)128 * r * N;
#ifdef _OPENMP
	if (!(flags & YESCRYPT_RW))
		V_size *= p;
#endif
	need = V_size;
	if (flags & YESCRYPT_INIT_SHARED) {
		if (local->aligned_size < need) {
			if (local->base || local->aligned ||
			    local->base_size || local->aligned_size)
				goto out_EINVAL;
			if (!alloc_region(local, need))
				return -1;
		}
		if (flags & YESCRYPT_ALLOC_ONLY)
			return -2; /* expected "failure" */
		V = (salsa20_blk_t *)local->aligned;
		need = 0;
	}
	B_size = (size_t)128 * r * p;
	need += B_size;
	if (need < B_size)
		goto out_EINVAL;
	XY_size = (size_t)256 * r;
#ifdef _OPENMP
	XY_size *= p;
#endif
	need += XY_size;
	if (need < XY_size)
		goto out_EINVAL;
	if (flags & YESCRYPT_RW) {
		size_t S_size = (size_t)Salloc * p;
		need += S_size;
		if (need < S_size)
			goto out_EINVAL;
	}
	if (flags & YESCRYPT_INIT_SHARED) {
		if (!alloc_region(&tmp, need))
			return -1;
		B = (uint8_t *)tmp.aligned;
		XY = (salsa20_blk_t *)((uint8_t *)B + B_size);
	} else {
		init_region(&tmp);
		if (local->aligned_size < need) {
			if (free_region(local))
				return -1;
			if (!alloc_region(local, need))
				return -1;
		}
		if (flags & YESCRYPT_ALLOC_ONLY)
			return -3; /* expected "failure" */
		B = (uint8_t *)local->aligned;
		V = (salsa20_blk_t *)((uint8_t *)B + B_size);
		XY = (salsa20_blk_t *)((uint8_t *)V + V_size);
	}
	S = NULL;
	if (flags & YESCRYPT_RW)
		S = (uint8_t *)XY + XY_size;

	if (flags) {
		HMAC_SHA256_Buf("yescrypt-prehash",
		    (flags & YESCRYPT_PREHASH) ? 16 : 8,
		    passwd, passwdlen, sha256);
		passwd = sha256;
		passwdlen = sizeof(sha256);
	}

	PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, B, B_size);

	if (flags)
		memcpy(sha256, B, sizeof(sha256));

	if (p == 1 || (flags & YESCRYPT_RW)) {
		smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, sha256);
	} else {
		uint32_t i;
#ifdef _OPENMP
#pragma omp parallel for default(none) private(i) shared(B, r, N, p, t, flags, V, NROM, VROM, XY, S)
#endif
		for (i = 0; i < p; i++) {
#ifdef _OPENMP
			smix(&B[(size_t)128 * r * i], r, N, 1, t, flags,
			    &V[(size_t)2 * r * i * N],
			    NROM, VROM,
			    &XY[(size_t)4 * r * i], NULL, NULL);
#else
			smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V,
			    NROM, VROM, XY, NULL, NULL);
#endif
		}
	}

	dkp = buf;
	if (flags && buflen < sizeof(dk)) {
		PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, dk, sizeof(dk));
		dkp = dk;
	}

	PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf, buflen);

	/*
	 * Except when computing classic scrypt, allow all computation so far
	 * to be performed on the client.  The final steps below match those of
	 * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so
	 * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of
	 * SCRAM's use of SHA-1) would be usable with yescrypt hashes.
	 */
	if (flags && !(flags & YESCRYPT_PREHASH)) {
		/* Compute ClientKey */
		HMAC_SHA256_Buf(dkp, sizeof(dk), "Client Key", 10, sha256);
		/* Compute StoredKey */
		{
			size_t clen = buflen;
			if (clen > sizeof(dk))
				clen = sizeof(dk);
			SHA256_Buf(sha256, sizeof(sha256), dk);
			memcpy(buf, dk, clen);
		}
	}

	if (flags) {
		insecure_memzero(sha256, sizeof(sha256));
		insecure_memzero(dk, sizeof(dk));
	}

	if (free_region(&tmp)) {
		insecure_memzero(buf, buflen); /* must preserve errno */
		return -1;
	}

	/* Success! */
	return 0;

out_EINVAL:
	errno = EINVAL;
	return -1;
}

/**
 * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, params,
 *     buf, buflen):
 * Compute scrypt or its revision as requested by the parameters.  The inputs
 * to this function are the same as those for yescrypt_kdf_body() above, with
 * the addition of g, which controls hash upgrades (0 for no upgrades so far).
 */
int yescrypt_kdf(const yescrypt_shared_t *shared, yescrypt_local_t *local,
    const uint8_t *passwd, size_t passwdlen,
    const uint8_t *salt, size_t saltlen,
    const yescrypt_params_t *params,
    uint8_t *buf, size_t buflen)
{
	yescrypt_flags_t flags = params->flags;
	uint64_t N = params->N;
	uint32_t r = params->r;
	uint32_t p = params->p;
	uint32_t t = params->t;
	uint32_t g = params->g;
	uint64_t NROM = params->NROM;
	uint8_t dk[32];
	int retval;

	/* Support for hash upgrades has been temporarily removed */
	if (g) {
		errno = EINVAL;
		return -1;
	}

	if ((flags & (YESCRYPT_RW | YESCRYPT_INIT_SHARED)) == YESCRYPT_RW &&
	    p >= 1 && N / p >= 0x100 && N / p * r >= 0x20000) {
		if (yescrypt_kdf_body(shared, local,
		    passwd, passwdlen, salt, saltlen,
		    flags | YESCRYPT_ALLOC_ONLY, N, r, p, t, NROM,
		    buf, buflen) != -3) {
			errno = EINVAL;
			return -1;
		}
		if ((retval = yescrypt_kdf_body(shared, local,
		    passwd, passwdlen, salt, saltlen,
		    flags | YESCRYPT_PREHASH, N >> 6, r, p, 0, NROM,
		    dk, sizeof(dk))))
			return retval;
		passwd = dk;
		passwdlen = sizeof(dk);
	}

	retval = yescrypt_kdf_body(shared, local,
	    passwd, passwdlen, salt, saltlen,
	    flags, N, r, p, t, NROM, buf, buflen);
#ifndef SKIP_MEMZERO
	if (passwd == dk)
		insecure_memzero(dk, sizeof(dk));
#endif
	return retval;
}
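
/*
 * A minimal usage sketch (illustrative only, hence not compiled in).  The
 * parameter values are arbitrary examples; the flag combination is the one
 * pwxform flavor this build supports, as checked in yescrypt_kdf_body():
 */
#if 0
static int example_hash(uint8_t dk[32])
{
	yescrypt_local_t local;
	yescrypt_params_t params = { 0 };
	int rc;

	params.flags = YESCRYPT_RW | YESCRYPT_ROUNDS_6 | YESCRYPT_GATHER_4 |
	    YESCRYPT_SIMPLE_2 | YESCRYPT_SBOX_12K;
	params.N = 4096; params.r = 8; params.p = 1; /* t, g, NROM stay 0 */

	if (yescrypt_init_local(&local))
		return -1;
	rc = yescrypt_kdf(NULL, &local, (const uint8_t *)"password", 8,
	    (const uint8_t *)"salt", 4, &params, dk, 32);
	if (yescrypt_free_local(&local) && !rc)
		rc = -1;
	return rc;
}
#endif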

int yescrypt_init_shared(yescrypt_shared_t *shared,
    const uint8_t *seed, size_t seedlen,
    const yescrypt_params_t *params)
{
	yescrypt_params_t subparams;
	yescrypt_shared_t half1, half2;
	uint8_t salt[32];
	uint64_t *tag;

	subparams = *params;
	subparams.flags |= YESCRYPT_INIT_SHARED;
	subparams.N = params->NROM;
	subparams.NROM = 0;

	if (!(params->flags & YESCRYPT_RW) || params->N || params->g)
		return -1;

	if (params->flags & YESCRYPT_SHARED_PREALLOCATED) {
		if (!shared->aligned || !shared->aligned_size)
			return -1;

/* Overwrite a possible old ROM tag before we overwrite the rest */
		tag = (uint64_t *)
		    ((uint8_t *)shared->aligned + shared->aligned_size - 48);
		memset(tag, 0, 48);
	} else {
		init_region(shared);

		subparams.flags |= YESCRYPT_ALLOC_ONLY;
		if (yescrypt_kdf(NULL, shared, NULL, 0, NULL, 0, &subparams,
		    NULL, 0) != -2 || !shared->aligned)
			return -1;
		subparams.flags -= YESCRYPT_ALLOC_ONLY;
	}

	subparams.N /= 2;

	half1 = *shared;
	half1.aligned_size /= 2;
	half2 = half1;
	half2.aligned = (uint8_t *)half2.aligned + half1.aligned_size;

	if (yescrypt_kdf(NULL, &half1,
	    seed, seedlen, (uint8_t *)"yescrypt-ROMhash", 16, &subparams,
	    salt, sizeof(salt)))
		goto fail;

	subparams.NROM = subparams.N;

	if (yescrypt_kdf(&half1, &half2,
	    seed, seedlen, salt, sizeof(salt), &subparams, salt, sizeof(salt)))
		goto fail;

	if (yescrypt_kdf(&half2, &half1,
	    seed, seedlen, salt, sizeof(salt), &subparams, salt, sizeof(salt)))
		goto fail;

	tag = (uint64_t *)
	    ((uint8_t *)shared->aligned + shared->aligned_size - 48);
	tag[0] = YESCRYPT_ROM_TAG1;
	tag[1] = YESCRYPT_ROM_TAG2;
	tag[2] = le64dec(salt);
	tag[3] = le64dec(salt + 8);
	tag[4] = le64dec(salt + 16);
	tag[5] = le64dec(salt + 24);

	insecure_memzero(salt, sizeof(salt));
	return 0;

fail:
	insecure_memzero(salt, sizeof(salt));
	if (!(params->flags & YESCRYPT_SHARED_PREALLOCATED))
		free_region(shared);
	return -1;
}

yescrypt_binary_t *yescrypt_digest_shared(yescrypt_shared_t *shared)
{
	static yescrypt_binary_t digest;
	uint64_t *tag;

	if (shared->aligned_size < 48)
		return NULL;

	tag = (uint64_t *)
	    ((uint8_t *)shared->aligned + shared->aligned_size - 48);

	if (tag[0] != YESCRYPT_ROM_TAG1 || tag[1] != YESCRYPT_ROM_TAG2)
		return NULL;

	le64enc(digest.uc, tag[2]);
	le64enc(digest.uc + 8, tag[3]);
	le64enc(digest.uc + 16, tag[4]);
	le64enc(digest.uc + 24, tag[5]);

	return &digest;
}

int yescrypt_free_shared(yescrypt_shared_t *shared)
{
	return free_region(shared);
}

int yescrypt_init_local(yescrypt_local_t *local)
{
	init_region(local);
	return 0;
}

int yescrypt_free_local(yescrypt_local_t *local)
{
	return free_region(local);
}