xref: /dragonfly/crypto/libressl/crypto/bn/bn_exp.c (revision 72c33676)
1 /* $OpenBSD: bn_exp.c,v 1.31 2017/05/02 03:59:44 deraadt Exp $ */
2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3  * All rights reserved.
4  *
5  * This package is an SSL implementation written
6  * by Eric Young (eay@cryptsoft.com).
7  * The implementation was written so as to conform with Netscapes SSL.
8  *
9  * This library is free for commercial and non-commercial use as long as
10  * the following conditions are aheared to.  The following conditions
11  * apply to all code found in this distribution, be it the RC4, RSA,
12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
13  * included with this distribution is covered by the same copyright terms
14  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15  *
16  * Copyright remains Eric Young's, and as such any Copyright notices in
17  * the code are not to be removed.
18  * If this package is used in a product, Eric Young should be given attribution
19  * as the author of the parts of the library used.
20  * This can be in the form of a textual message at program startup or
21  * in documentation (online or textual) provided with the package.
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions
25  * are met:
26  * 1. Redistributions of source code must retain the copyright
27  *    notice, this list of conditions and the following disclaimer.
28  * 2. Redistributions in binary form must reproduce the above copyright
29  *    notice, this list of conditions and the following disclaimer in the
30  *    documentation and/or other materials provided with the distribution.
31  * 3. All advertising materials mentioning features or use of this software
32  *    must display the following acknowledgement:
33  *    "This product includes cryptographic software written by
34  *     Eric Young (eay@cryptsoft.com)"
35  *    The word 'cryptographic' can be left out if the rouines from the library
36  *    being used are not cryptographic related :-).
37  * 4. If you include any Windows specific code (or a derivative thereof) from
38  *    the apps directory (application code) you must include an acknowledgement:
39  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40  *
41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  *
53  * The licence and distribution terms for any publically available version or
54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
55  * copied and put under another distribution licence
56  * [including the GNU Public Licence.]
57  */
58 /* ====================================================================
59  * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
60  *
61  * Redistribution and use in source and binary forms, with or without
62  * modification, are permitted provided that the following conditions
63  * are met:
64  *
65  * 1. Redistributions of source code must retain the above copyright
66  *    notice, this list of conditions and the following disclaimer.
67  *
68  * 2. Redistributions in binary form must reproduce the above copyright
69  *    notice, this list of conditions and the following disclaimer in
70  *    the documentation and/or other materials provided with the
71  *    distribution.
72  *
73  * 3. All advertising materials mentioning features or use of this
74  *    software must display the following acknowledgment:
75  *    "This product includes software developed by the OpenSSL Project
76  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77  *
78  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79  *    endorse or promote products derived from this software without
80  *    prior written permission. For written permission, please contact
81  *    openssl-core@openssl.org.
82  *
83  * 5. Products derived from this software may not be called "OpenSSL"
84  *    nor may "OpenSSL" appear in their names without prior written
85  *    permission of the OpenSSL Project.
86  *
87  * 6. Redistributions of any form whatsoever must retain the following
88  *    acknowledgment:
89  *    "This product includes software developed by the OpenSSL Project
90  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91  *
92  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
96  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103  * OF THE POSSIBILITY OF SUCH DAMAGE.
104  * ====================================================================
105  *
106  * This product includes cryptographic software written by Eric Young
107  * (eay@cryptsoft.com).  This product includes software written by Tim
108  * Hudson (tjh@cryptsoft.com).
109  *
110  */
111 
112 #include <stdlib.h>
113 #include <string.h>
114 
115 #include <openssl/err.h>
116 
117 #include "bn_lcl.h"
118 #include "constant_time_locl.h"
119 
120 /* maximum precomputation table size for *variable* sliding windows */
121 #define TABLE_SIZE	32
122 
123 /* this one works - simple but works */
124 int
125 BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
126 {
127 	int i, bits, ret = 0;
128 	BIGNUM *v, *rr;
129 
130 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
131 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
132 		BNerror(ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
133 		return -1;
134 	}
135 
136 	BN_CTX_start(ctx);
137 	if ((r == a) || (r == p))
138 		rr = BN_CTX_get(ctx);
139 	else
140 		rr = r;
141 	v = BN_CTX_get(ctx);
142 	if (rr == NULL || v == NULL)
143 		goto err;
144 
145 	if (BN_copy(v, a) == NULL)
146 		goto err;
147 	bits = BN_num_bits(p);
148 
149 	if (BN_is_odd(p)) {
150 		if (BN_copy(rr, a) == NULL)
151 			goto err;
152 	} else {
153 		if (!BN_one(rr))
154 			goto err;
155 	}
156 
157 	for (i = 1; i < bits; i++) {
158 		if (!BN_sqr(v, v, ctx))
159 			goto err;
160 		if (BN_is_bit_set(p, i)) {
161 			if (!BN_mul(rr, rr, v, ctx))
162 				goto err;
163 		}
164 	}
165 	ret = 1;
166 
167 err:
168 	if (r != rr && rr != NULL)
169 		BN_copy(r, rr);
170 	BN_CTX_end(ctx);
171 	bn_check_top(r);
172 	return (ret);
173 }
174 
175 static int
176 BN_mod_exp_internal(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
177     BN_CTX *ctx, int ct)
178 {
179 	int ret;
180 
181 	bn_check_top(a);
182 	bn_check_top(p);
183 	bn_check_top(m);
184 
185 	/* For even modulus  m = 2^k*m_odd,  it might make sense to compute
186 	 * a^p mod m_odd  and  a^p mod 2^k  separately (with Montgomery
187 	 * exponentiation for the odd part), using appropriate exponent
188 	 * reductions, and combine the results using the CRT.
189 	 *
190 	 * For now, we use Montgomery only if the modulus is odd; otherwise,
191 	 * exponentiation using the reciprocal-based quick remaindering
192 	 * algorithm is used.
193 	 *
194 	 * (Timing obtained with expspeed.c [computations  a^p mod m
195 	 * where  a, p, m  are of the same length: 256, 512, 1024, 2048,
196 	 * 4096, 8192 bits], compared to the running time of the
197 	 * standard algorithm:
198 	 *
199 	 *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]
200          *                     55 .. 77 %  [UltraSparc processor, but
201 	 *                                  debug-solaris-sparcv8-gcc conf.]
202 	 *
203 	 *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]
204 	 *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
205 	 *
206 	 * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
207 	 * at 2048 and more bits, but at 512 and 1024 bits, it was
208 	 * slower even than the standard algorithm!
209 	 *
210 	 * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
211 	 * should be obtained when the new Montgomery reduction code
212 	 * has been integrated into OpenSSL.)
213 	 */
214 
215 	if (BN_is_odd(m)) {
216 		if (a->top == 1 && !a->neg && !ct) {
217 			BN_ULONG A = a->d[0];
218 			ret = BN_mod_exp_mont_word(r, A,p, m,ctx, NULL);
219 		} else
220 			ret = BN_mod_exp_mont_ct(r, a,p, m,ctx, NULL);
221 	} else	{
222 		ret = BN_mod_exp_recp(r, a,p, m, ctx);
223 	}
224 
225 	bn_check_top(r);
226 	return (ret);
227 }
228 
229 int
230 BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
231     BN_CTX *ctx)
232 {
233 	return BN_mod_exp_internal(r, a, p, m, ctx,
234 	    (BN_get_flags(p, BN_FLG_CONSTTIME) != 0));
235 }
236 
237 int
238 BN_mod_exp_ct(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
239     BN_CTX *ctx)
240 {
241 	return BN_mod_exp_internal(r, a, p, m, ctx, 1);
242 }
243 
244 
245 int
246 BN_mod_exp_nonct(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
247     BN_CTX *ctx)
248 {
249 	return BN_mod_exp_internal(r, a, p, m, ctx, 0);
250 }
251 
252 
253 int
254 BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
255     BN_CTX *ctx)
256 {
257 	int i, j, bits, ret = 0, wstart, wend, window, wvalue;
258 	int start = 1;
259 	BIGNUM *aa;
260 	/* Table of variables obtained from 'ctx' */
261 	BIGNUM *val[TABLE_SIZE];
262 	BN_RECP_CTX recp;
263 
264 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
265 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
266 		BNerror(ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
267 		return -1;
268 	}
269 
270 	bits = BN_num_bits(p);
271 	if (bits == 0) {
272 		/* x**0 mod 1 is still zero. */
273 		if (BN_is_one(m)) {
274 			ret = 1;
275 			BN_zero(r);
276 		} else
277 			ret = BN_one(r);
278 		return ret;
279 	}
280 
281 	BN_CTX_start(ctx);
282 	if ((aa = BN_CTX_get(ctx)) == NULL)
283 		goto err;
284 	if ((val[0] = BN_CTX_get(ctx)) == NULL)
285 		goto err;
286 
287 	BN_RECP_CTX_init(&recp);
288 	if (m->neg) {
289 		/* ignore sign of 'm' */
290 		if (!BN_copy(aa, m))
291 			goto err;
292 		aa->neg = 0;
293 		if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0)
294 			goto err;
295 	} else {
296 		if (BN_RECP_CTX_set(&recp, m, ctx) <= 0)
297 			goto err;
298 	}
299 
300 	if (!BN_nnmod(val[0], a, m, ctx))
301 		goto err;		/* 1 */
302 	if (BN_is_zero(val[0])) {
303 		BN_zero(r);
304 		ret = 1;
305 		goto err;
306 	}
307 
308 	window = BN_window_bits_for_exponent_size(bits);
309 	if (window > 1) {
310 		if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx))
311 			goto err;				/* 2 */
312 		j = 1 << (window - 1);
313 		for (i = 1; i < j; i++) {
314 			if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
315 			    !BN_mod_mul_reciprocal(val[i], val[i - 1],
316 			    aa, &recp, ctx))
317 				goto err;
318 		}
319 	}
320 
321 	start = 1;		/* This is used to avoid multiplication etc
322 				 * when there is only the value '1' in the
323 				 * buffer. */
324 	wvalue = 0;		/* The 'value' of the window */
325 	wstart = bits - 1;	/* The top bit of the window */
326 	wend = 0;		/* The bottom bit of the window */
327 
328 	if (!BN_one(r))
329 		goto err;
330 
331 	for (;;) {
332 		if (BN_is_bit_set(p, wstart) == 0) {
333 			if (!start)
334 				if (!BN_mod_mul_reciprocal(r, r,r, &recp, ctx))
335 					goto err;
336 			if (wstart == 0)
337 				break;
338 			wstart--;
339 			continue;
340 		}
341 		/* We now have wstart on a 'set' bit, we now need to work out
342 		 * how bit a window to do.  To do this we need to scan
343 		 * forward until the last set bit before the end of the
344 		 * window */
345 		j = wstart;
346 		wvalue = 1;
347 		wend = 0;
348 		for (i = 1; i < window; i++) {
349 			if (wstart - i < 0)
350 				break;
351 			if (BN_is_bit_set(p, wstart - i)) {
352 				wvalue <<= (i - wend);
353 				wvalue |= 1;
354 				wend = i;
355 			}
356 		}
357 
358 		/* wend is the size of the current window */
359 		j = wend + 1;
360 		/* add the 'bytes above' */
361 		if (!start)
362 			for (i = 0; i < j; i++) {
363 				if (!BN_mod_mul_reciprocal(r, r,r, &recp, ctx))
364 					goto err;
365 			}
366 
367 		/* wvalue will be an odd number < 2^window */
368 		if (!BN_mod_mul_reciprocal(r, r,val[wvalue >> 1], &recp, ctx))
369 			goto err;
370 
371 		/* move the 'window' down further */
372 		wstart -= wend + 1;
373 		wvalue = 0;
374 		start = 0;
375 		if (wstart < 0)
376 			break;
377 	}
378 	ret = 1;
379 
380 err:
381 	BN_CTX_end(ctx);
382 	BN_RECP_CTX_free(&recp);
383 	bn_check_top(r);
384 	return (ret);
385 }
386 
387 static int
388 BN_mod_exp_mont_internal(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
389     BN_CTX *ctx, BN_MONT_CTX *in_mont, int ct)
390 {
391 	int i, j, bits, ret = 0, wstart, wend, window, wvalue;
392 	int start = 1;
393 	BIGNUM *d, *r;
394 	const BIGNUM *aa;
395 	/* Table of variables obtained from 'ctx' */
396 	BIGNUM *val[TABLE_SIZE];
397 	BN_MONT_CTX *mont = NULL;
398 
399 	if (ct) {
400 		return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
401 	}
402 
403 	bn_check_top(a);
404 	bn_check_top(p);
405 	bn_check_top(m);
406 
407 	if (!BN_is_odd(m)) {
408 		BNerror(BN_R_CALLED_WITH_EVEN_MODULUS);
409 		return (0);
410 	}
411 
412 	bits = BN_num_bits(p);
413 	if (bits == 0) {
414 		/* x**0 mod 1 is still zero. */
415 		if (BN_is_one(m)) {
416 			ret = 1;
417 			BN_zero(rr);
418 		} else
419 			ret = BN_one(rr);
420 		return ret;
421 	}
422 
423 	BN_CTX_start(ctx);
424 	if ((d = BN_CTX_get(ctx)) == NULL)
425 		goto err;
426 	if ((r = BN_CTX_get(ctx)) == NULL)
427 		goto err;
428 	if ((val[0] = BN_CTX_get(ctx)) == NULL)
429 		goto err;
430 
431 	/* If this is not done, things will break in the montgomery
432 	 * part */
433 
434 	if (in_mont != NULL)
435 		mont = in_mont;
436 	else {
437 		if ((mont = BN_MONT_CTX_new()) == NULL)
438 			goto err;
439 		if (!BN_MONT_CTX_set(mont, m, ctx))
440 			goto err;
441 	}
442 
443 	if (a->neg || BN_ucmp(a, m) >= 0) {
444 		if (!BN_nnmod(val[0], a,m, ctx))
445 			goto err;
446 		aa = val[0];
447 	} else
448 		aa = a;
449 	if (BN_is_zero(aa)) {
450 		BN_zero(rr);
451 		ret = 1;
452 		goto err;
453 	}
454 	if (!BN_to_montgomery(val[0], aa, mont, ctx))
455 		goto err; /* 1 */
456 
457 	window = BN_window_bits_for_exponent_size(bits);
458 	if (window > 1) {
459 		if (!BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx))
460 			goto err; /* 2 */
461 		j = 1 << (window - 1);
462 		for (i = 1; i < j; i++) {
463 			if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
464 			    !BN_mod_mul_montgomery(val[i], val[i - 1],
465 			    d, mont, ctx))
466 				goto err;
467 		}
468 	}
469 
470 	start = 1;		/* This is used to avoid multiplication etc
471 				 * when there is only the value '1' in the
472 				 * buffer. */
473 	wvalue = 0;		/* The 'value' of the window */
474 	wstart = bits - 1;	/* The top bit of the window */
475 	wend = 0;		/* The bottom bit of the window */
476 
477 	if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
478 		goto err;
479 	for (;;) {
480 		if (BN_is_bit_set(p, wstart) == 0) {
481 			if (!start) {
482 				if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
483 					goto err;
484 			}
485 			if (wstart == 0)
486 				break;
487 			wstart--;
488 			continue;
489 		}
490 		/* We now have wstart on a 'set' bit, we now need to work out
491 		 * how bit a window to do.  To do this we need to scan
492 		 * forward until the last set bit before the end of the
493 		 * window */
494 		j = wstart;
495 		wvalue = 1;
496 		wend = 0;
497 		for (i = 1; i < window; i++) {
498 			if (wstart - i < 0)
499 				break;
500 			if (BN_is_bit_set(p, wstart - i)) {
501 				wvalue <<= (i - wend);
502 				wvalue |= 1;
503 				wend = i;
504 			}
505 		}
506 
507 		/* wend is the size of the current window */
508 		j = wend + 1;
509 		/* add the 'bytes above' */
510 		if (!start)
511 			for (i = 0; i < j; i++) {
512 				if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
513 					goto err;
514 			}
515 
516 		/* wvalue will be an odd number < 2^window */
517 		if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx))
518 			goto err;
519 
520 		/* move the 'window' down further */
521 		wstart -= wend + 1;
522 		wvalue = 0;
523 		start = 0;
524 		if (wstart < 0)
525 			break;
526 	}
527 	if (!BN_from_montgomery(rr, r,mont, ctx))
528 		goto err;
529 	ret = 1;
530 
531 err:
532 	if ((in_mont == NULL) && (mont != NULL))
533 		BN_MONT_CTX_free(mont);
534 	BN_CTX_end(ctx);
535 	bn_check_top(rr);
536 	return (ret);
537 }
538 
539 int
540 BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
541     BN_CTX *ctx, BN_MONT_CTX *in_mont)
542 {
543 	return BN_mod_exp_mont_internal(rr, a, p, m, ctx, in_mont,
544 	    (BN_get_flags(p, BN_FLG_CONSTTIME) != 0));
545 }
546 
547 int
548 BN_mod_exp_mont_ct(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
549     BN_CTX *ctx, BN_MONT_CTX *in_mont)
550 {
551 	return BN_mod_exp_mont_internal(rr, a, p, m, ctx, in_mont, 1);
552 }
553 
554 int
555 BN_mod_exp_mont_nonct(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
556     BN_CTX *ctx, BN_MONT_CTX *in_mont)
557 {
558 	return BN_mod_exp_mont_internal(rr, a, p, m, ctx, in_mont, 0);
559 }
560 
561 /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
562  * so that accessing any of these table values shows the same access pattern as far
563  * as cache lines are concerned.  The following functions are used to transfer a BIGNUM
564  * from/to that table. */
565 
566 static int
567 MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf,
568     int idx, int window)
569 {
570 	int i, j;
571 	int width = 1 << window;
572 	BN_ULONG *table = (BN_ULONG *)buf;
573 
574 	if (top > b->top)
575 		top = b->top; /* this works because 'buf' is explicitly zeroed */
576 
577 	for (i = 0, j = idx; i < top; i++, j += width) {
578 		table[j] = b->d[i];
579 	}
580 
581 	return 1;
582 }
583 
584 static int
585 MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx,
586     int window)
587 {
588 	int i, j;
589 	int width = 1 << window;
590 	volatile BN_ULONG *table = (volatile BN_ULONG *)buf;
591 
592 	if (bn_wexpand(b, top) == NULL)
593 		return 0;
594 
595 	if (window <= 3) {
596 		for (i = 0; i < top; i++, table += width) {
597 		    BN_ULONG acc = 0;
598 
599 		    for (j = 0; j < width; j++) {
600 			acc |= table[j] &
601 			       ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
602 		    }
603 
604 		    b->d[i] = acc;
605 		}
606 	} else {
607 		int xstride = 1 << (window - 2);
608 		BN_ULONG y0, y1, y2, y3;
609 
610 		i = idx >> (window - 2);        /* equivalent of idx / xstride */
611 		idx &= xstride - 1;             /* equivalent of idx % xstride */
612 
613 		y0 = (BN_ULONG)0 - (constant_time_eq_int(i,0)&1);
614 		y1 = (BN_ULONG)0 - (constant_time_eq_int(i,1)&1);
615 		y2 = (BN_ULONG)0 - (constant_time_eq_int(i,2)&1);
616 		y3 = (BN_ULONG)0 - (constant_time_eq_int(i,3)&1);
617 
618 		for (i = 0; i < top; i++, table += width) {
619 		    BN_ULONG acc = 0;
620 
621 		    for (j = 0; j < xstride; j++) {
622 			acc |= ( (table[j + 0 * xstride] & y0) |
623 				 (table[j + 1 * xstride] & y1) |
624 				 (table[j + 2 * xstride] & y2) |
625 				 (table[j + 3 * xstride] & y3) )
626 			       & ((BN_ULONG)0 - (constant_time_eq_int(j,idx)&1));
627 		    }
628 
629 		    b->d[i] = acc;
630 		}
631 	}
632 	b->top = top;
633 	bn_correct_top(b);
634 	return 1;
635 }
636 
637 /* Given a pointer value, compute the next address that is a cache line multiple. */
638 #define MOD_EXP_CTIME_ALIGN(x_) \
639 	((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
640 
641 /* This variant of BN_mod_exp_mont() uses fixed windows and the special
642  * precomputation memory layout to limit data-dependency to a minimum
643  * to protect secret exponents (cf. the hyper-threading timing attacks
644  * pointed out by Colin Percival,
645  * http://www.daemonology.net/hyperthreading-considered-harmful/)
646  */
647 int
648 BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
649     const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
650 {
651 	int i, bits, ret = 0, window, wvalue;
652 	int top;
653 	BN_MONT_CTX *mont = NULL;
654 	int numPowers;
655 	unsigned char *powerbufFree = NULL;
656 	int powerbufLen = 0;
657 	unsigned char *powerbuf = NULL;
658 	BIGNUM tmp, am;
659 
660 	bn_check_top(a);
661 	bn_check_top(p);
662 	bn_check_top(m);
663 
664 	if (!BN_is_odd(m)) {
665 		BNerror(BN_R_CALLED_WITH_EVEN_MODULUS);
666 		return (0);
667 	}
668 
669 	top = m->top;
670 
671 	bits = BN_num_bits(p);
672 	if (bits == 0) {
673 		/* x**0 mod 1 is still zero. */
674 		if (BN_is_one(m)) {
675 			ret = 1;
676 			BN_zero(rr);
677 		} else
678 			ret = BN_one(rr);
679 		return ret;
680 	}
681 
682 	BN_CTX_start(ctx);
683 
684 	/* Allocate a montgomery context if it was not supplied by the caller.
685 	 * If this is not done, things will break in the montgomery part.
686  	 */
687 	if (in_mont != NULL)
688 		mont = in_mont;
689 	else {
690 		if ((mont = BN_MONT_CTX_new()) == NULL)
691 			goto err;
692 		if (!BN_MONT_CTX_set(mont, m, ctx))
693 			goto err;
694 	}
695 
696 	/* Get the window size to use with size of p. */
697 	window = BN_window_bits_for_ctime_exponent_size(bits);
698 #if defined(OPENSSL_BN_ASM_MONT5)
699 	if (window == 6 && bits <= 1024)
700 		window = 5;	/* ~5% improvement of 2048-bit RSA sign */
701 #endif
702 
703 	/* Allocate a buffer large enough to hold all of the pre-computed
704 	 * powers of am, am itself and tmp.
705 	 */
706 	numPowers = 1 << window;
707 	powerbufLen = sizeof(m->d[0]) * (top * numPowers +
708 	    ((2*top) > numPowers ? (2*top) : numPowers));
709 	if ((powerbufFree = calloc(powerbufLen +
710 	    MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH, 1)) == NULL)
711 		goto err;
712 	powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
713 
714 	/* lay down tmp and am right after powers table */
715 	tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
716 	am.d = tmp.d + top;
717 	tmp.top = am.top = 0;
718 	tmp.dmax = am.dmax = top;
719 	tmp.neg = am.neg = 0;
720 	tmp.flags = am.flags = BN_FLG_STATIC_DATA;
721 
722 	/* prepare a^0 in Montgomery domain */
723 #if 1
724 	if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
725 		goto err;
726 #else
727 	tmp.d[0] = (0 - m - >d[0]) & BN_MASK2;	/* 2^(top*BN_BITS2) - m */
728 	for (i = 1; i < top; i++)
729 		tmp.d[i] = (~m->d[i]) & BN_MASK2;
730 	tmp.top = top;
731 #endif
732 
733 	/* prepare a^1 in Montgomery domain */
734 	if (a->neg || BN_ucmp(a, m) >= 0) {
735 		if (!BN_mod_ct(&am, a,m, ctx))
736 			goto err;
737 		if (!BN_to_montgomery(&am, &am, mont, ctx))
738 			goto err;
739 	} else if (!BN_to_montgomery(&am, a,mont, ctx))
740 		goto err;
741 
742 #if defined(OPENSSL_BN_ASM_MONT5)
743 	/* This optimization uses ideas from http://eprint.iacr.org/2011/239,
744 	 * specifically optimization of cache-timing attack countermeasures
745 	 * and pre-computation optimization. */
746 
747 	/* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
748 	 * 512-bit RSA is hardly relevant, we omit it to spare size... */
749 	if (window == 5 && top > 1) {
750 		void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
751 		    const void *table, const BN_ULONG *np,
752 		    const BN_ULONG *n0, int num, int power);
753 		void bn_scatter5(const BN_ULONG *inp, size_t num,
754 		    void *table, size_t power);
755 		void bn_gather5(BN_ULONG *out, size_t num,
756 		    void *table, size_t power);
757 
758 		BN_ULONG *np = mont->N.d, *n0 = mont->n0;
759 
760 		/* BN_to_montgomery can contaminate words above .top
761 		 * [in BN_DEBUG[_DEBUG] build]... */
762 		for (i = am.top; i < top; i++)
763 			am.d[i] = 0;
764 		for (i = tmp.top; i < top; i++)
765 			tmp.d[i] = 0;
766 
767 		bn_scatter5(tmp.d, top, powerbuf, 0);
768 		bn_scatter5(am.d, am.top, powerbuf, 1);
769 		bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
770 		bn_scatter5(tmp.d, top, powerbuf, 2);
771 
772 #if 0
773 		for (i = 3; i < 32; i++) {
774 			/* Calculate a^i = a^(i-1) * a */
775 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
776 			    n0, top, i - 1);
777 			bn_scatter5(tmp.d, top, powerbuf, i);
778 		}
779 #else
780 		/* same as above, but uses squaring for 1/2 of operations */
781 		for (i = 4; i < 32; i*=2) {
782 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
783 			bn_scatter5(tmp.d, top, powerbuf, i);
784 		}
785 		for (i = 3; i < 8; i += 2) {
786 			int j;
787 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
788 			    n0, top, i - 1);
789 			bn_scatter5(tmp.d, top, powerbuf, i);
790 			for (j = 2 * i; j < 32; j *= 2) {
791 				bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
792 				bn_scatter5(tmp.d, top, powerbuf, j);
793 			}
794 		}
795 		for (; i < 16; i += 2) {
796 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
797 			    n0, top, i - 1);
798 			bn_scatter5(tmp.d, top, powerbuf, i);
799 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
800 			bn_scatter5(tmp.d, top, powerbuf, 2*i);
801 		}
802 		for (; i < 32; i += 2) {
803 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
804 			    n0, top, i - 1);
805 			bn_scatter5(tmp.d, top, powerbuf, i);
806 		}
807 #endif
808 		bits--;
809 		for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
810 			wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
811 		bn_gather5(tmp.d, top, powerbuf, wvalue);
812 
813 		/* Scan the exponent one window at a time starting from the most
814 		 * significant bits.
815 		 */
816 		while (bits >= 0) {
817 			for (wvalue = 0, i = 0; i < 5; i++, bits--)
818 				wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
819 
820 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
821 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
822 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
823 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
824 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
825 			bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
826 		}
827 
828 		tmp.top = top;
829 		bn_correct_top(&tmp);
830 	} else
831 #endif
832 	{
833 		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0,
834 		    window))
835 			goto err;
836 		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am,  top, powerbuf, 1,
837 		    window))
838 			goto err;
839 
840 		/* If the window size is greater than 1, then calculate
841 		 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
842 		 * (even powers could instead be computed as (a^(i/2))^2
843 		 * to use the slight performance advantage of sqr over mul).
844 		 */
845 		if (window > 1) {
846 			if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
847 				goto err;
848 			if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf,
849 			    2, window))
850 				goto err;
851 			for (i = 3; i < numPowers; i++) {
852 				/* Calculate a^i = a^(i-1) * a */
853 				if (!BN_mod_mul_montgomery(&tmp, &am, &tmp,
854 				    mont, ctx))
855 					goto err;
856 				if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top,
857 				    powerbuf, i, window))
858 					goto err;
859 			}
860 		}
861 
862 		bits--;
863 		for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
864 			wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
865 		if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf,
866 		    wvalue, window))
867 			goto err;
868 
869 		/* Scan the exponent one window at a time starting from the most
870 		 * significant bits.
871 		 */
872 		while (bits >= 0) {
873 			wvalue = 0; /* The 'value' of the window */
874 
875 			/* Scan the window, squaring the result as we go */
876 			for (i = 0; i < window; i++, bits--) {
877 				if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp,
878 				    mont, ctx))
879 					goto err;
880 				wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
881 			}
882 
883 			/* Fetch the appropriate pre-computed value from the pre-buf */
884 			if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf,
885 			    wvalue, window))
886 				goto err;
887 
888 			/* Multiply the result into the intermediate result */
889 			if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx))
890 				goto err;
891 		}
892 	}
893 
894 	/* Convert the final result from montgomery to standard format */
895 	if (!BN_from_montgomery(rr, &tmp, mont, ctx))
896 		goto err;
897 	ret = 1;
898 
899 err:
900 	if ((in_mont == NULL) && (mont != NULL))
901 		BN_MONT_CTX_free(mont);
902 	freezero(powerbufFree, powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
903 	BN_CTX_end(ctx);
904 	return (ret);
905 }
906 
907 int
908 BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, const BIGNUM *m,
909     BN_CTX *ctx, BN_MONT_CTX *in_mont)
910 {
911 	BN_MONT_CTX *mont = NULL;
912 	int b, bits, ret = 0;
913 	int r_is_one;
914 	BN_ULONG w, next_w;
915 	BIGNUM *d, *r, *t;
916 	BIGNUM *swap_tmp;
917 
918 #define BN_MOD_MUL_WORD(r, w, m) \
919 		(BN_mul_word(r, (w)) && \
920 		(/* BN_ucmp(r, (m)) < 0 ? 1 :*/  \
921 			(BN_mod_ct(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
922 		/* BN_MOD_MUL_WORD is only used with 'w' large,
923 		 * so the BN_ucmp test is probably more overhead
924 		 * than always using BN_mod (which uses BN_copy if
925 		 * a similar test returns true). */
926 		/* We can use BN_mod and do not need BN_nnmod because our
927 		 * accumulator is never negative (the result of BN_mod does
928 		 * not depend on the sign of the modulus).
929 		 */
930 #define BN_TO_MONTGOMERY_WORD(r, w, mont) \
931 		(BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
932 
933 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
934 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
935 		BNerror(ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
936 		return -1;
937 	}
938 
939 	bn_check_top(p);
940 	bn_check_top(m);
941 
942 	if (!BN_is_odd(m)) {
943 		BNerror(BN_R_CALLED_WITH_EVEN_MODULUS);
944 		return (0);
945 	}
946 	if (m->top == 1)
947 		a %= m->d[0]; /* make sure that 'a' is reduced */
948 
949 	bits = BN_num_bits(p);
950 	if (bits == 0) {
951 		/* x**0 mod 1 is still zero. */
952 		if (BN_is_one(m)) {
953 			ret = 1;
954 			BN_zero(rr);
955 		} else
956 			ret = BN_one(rr);
957 		return ret;
958 	}
959 	if (a == 0) {
960 		BN_zero(rr);
961 		ret = 1;
962 		return ret;
963 	}
964 
965 	BN_CTX_start(ctx);
966 	if ((d = BN_CTX_get(ctx)) == NULL)
967 		goto err;
968 	if ((r = BN_CTX_get(ctx)) == NULL)
969 		goto err;
970 	if ((t = BN_CTX_get(ctx)) == NULL)
971 		goto err;
972 
973 	if (in_mont != NULL)
974 		mont = in_mont;
975 	else {
976 		if ((mont = BN_MONT_CTX_new()) == NULL)
977 			goto err;
978 		if (!BN_MONT_CTX_set(mont, m, ctx))
979 			goto err;
980 	}
981 
982 	r_is_one = 1; /* except for Montgomery factor */
983 
984 	/* bits-1 >= 0 */
985 
986 	/* The result is accumulated in the product r*w. */
987 	w = a; /* bit 'bits-1' of 'p' is always set */
988 	for (b = bits - 2; b >= 0; b--) {
989 		/* First, square r*w. */
990 		next_w = w * w;
991 		if ((next_w / w) != w) /* overflow */
992 		{
993 			if (r_is_one) {
994 				if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
995 					goto err;
996 				r_is_one = 0;
997 			} else {
998 				if (!BN_MOD_MUL_WORD(r, w, m))
999 					goto err;
1000 			}
1001 			next_w = 1;
1002 		}
1003 		w = next_w;
1004 		if (!r_is_one) {
1005 			if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
1006 				goto err;
1007 		}
1008 
1009 		/* Second, multiply r*w by 'a' if exponent bit is set. */
1010 		if (BN_is_bit_set(p, b)) {
1011 			next_w = w * a;
1012 			if ((next_w / a) != w) /* overflow */
1013 			{
1014 				if (r_is_one) {
1015 					if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
1016 						goto err;
1017 					r_is_one = 0;
1018 				} else {
1019 					if (!BN_MOD_MUL_WORD(r, w, m))
1020 						goto err;
1021 				}
1022 				next_w = a;
1023 			}
1024 			w = next_w;
1025 		}
1026 	}
1027 
1028 	/* Finally, set r:=r*w. */
1029 	if (w != 1) {
1030 		if (r_is_one) {
1031 			if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
1032 				goto err;
1033 			r_is_one = 0;
1034 		} else {
1035 			if (!BN_MOD_MUL_WORD(r, w, m))
1036 				goto err;
1037 		}
1038 	}
1039 
1040 	if (r_is_one) /* can happen only if a == 1*/
1041 	{
1042 		if (!BN_one(rr))
1043 			goto err;
1044 	} else {
1045 		if (!BN_from_montgomery(rr, r, mont, ctx))
1046 			goto err;
1047 	}
1048 	ret = 1;
1049 
1050 err:
1051 	if ((in_mont == NULL) && (mont != NULL))
1052 		BN_MONT_CTX_free(mont);
1053 	BN_CTX_end(ctx);
1054 	bn_check_top(rr);
1055 	return (ret);
1056 }
1057 
1058 
1059 /* The old fallback, simple version :-) */
1060 int
1061 BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
1062     BN_CTX *ctx)
1063 {
1064 	int i, j, bits, ret = 0, wstart, wend, window, wvalue;
1065 	int start = 1;
1066 	BIGNUM *d;
1067 	/* Table of variables obtained from 'ctx' */
1068 	BIGNUM *val[TABLE_SIZE];
1069 
1070 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
1071 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
1072 		BNerror(ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
1073 		return -1;
1074 	}
1075 
1076 	bits = BN_num_bits(p);
1077 	if (bits == 0) {
1078 		/* x**0 mod 1 is still zero. */
1079 		if (BN_is_one(m)) {
1080 			ret = 1;
1081 			BN_zero(r);
1082 		} else
1083 			ret = BN_one(r);
1084 		return ret;
1085 	}
1086 
1087 	BN_CTX_start(ctx);
1088 	if ((d = BN_CTX_get(ctx)) == NULL)
1089 		goto err;
1090 	if ((val[0] = BN_CTX_get(ctx)) == NULL)
1091 		goto err;
1092 
1093 	if (!BN_nnmod(val[0],a,m,ctx))
1094 		goto err;		/* 1 */
1095 	if (BN_is_zero(val[0])) {
1096 		BN_zero(r);
1097 		ret = 1;
1098 		goto err;
1099 	}
1100 
1101 	window = BN_window_bits_for_exponent_size(bits);
1102 	if (window > 1) {
1103 		if (!BN_mod_mul(d, val[0], val[0], m, ctx))
1104 			goto err;				/* 2 */
1105 		j = 1 << (window - 1);
1106 		for (i = 1; i < j; i++) {
1107 			if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
1108 			    !BN_mod_mul(val[i], val[i - 1], d,m, ctx))
1109 				goto err;
1110 		}
1111 	}
1112 
1113 	start = 1;		/* This is used to avoid multiplication etc
1114 				 * when there is only the value '1' in the
1115 				 * buffer. */
1116 	wvalue = 0;		/* The 'value' of the window */
1117 	wstart = bits - 1;	/* The top bit of the window */
1118 	wend = 0;		/* The bottom bit of the window */
1119 
1120 	if (!BN_one(r))
1121 		goto err;
1122 
1123 	for (;;) {
1124 		if (BN_is_bit_set(p, wstart) == 0) {
1125 			if (!start)
1126 				if (!BN_mod_mul(r, r, r, m, ctx))
1127 					goto err;
1128 			if (wstart == 0)
1129 				break;
1130 			wstart--;
1131 			continue;
1132 		}
1133 		/* We now have wstart on a 'set' bit, we now need to work out
1134 		 * how bit a window to do.  To do this we need to scan
1135 		 * forward until the last set bit before the end of the
1136 		 * window */
1137 		j = wstart;
1138 		wvalue = 1;
1139 		wend = 0;
1140 		for (i = 1; i < window; i++) {
1141 			if (wstart - i < 0)
1142 				break;
1143 			if (BN_is_bit_set(p, wstart - i)) {
1144 				wvalue <<= (i - wend);
1145 				wvalue |= 1;
1146 				wend = i;
1147 			}
1148 		}
1149 
1150 		/* wend is the size of the current window */
1151 		j = wend + 1;
1152 		/* add the 'bytes above' */
1153 		if (!start)
1154 			for (i = 0; i < j; i++) {
1155 				if (!BN_mod_mul(r, r, r, m, ctx))
1156 					goto err;
1157 			}
1158 
1159 		/* wvalue will be an odd number < 2^window */
1160 		if (!BN_mod_mul(r, r, val[wvalue >> 1], m, ctx))
1161 			goto err;
1162 
1163 		/* move the 'window' down further */
1164 		wstart -= wend + 1;
1165 		wvalue = 0;
1166 		start = 0;
1167 		if (wstart < 0)
1168 			break;
1169 	}
1170 	ret = 1;
1171 
1172 err:
1173 	BN_CTX_end(ctx);
1174 	bn_check_top(r);
1175 	return (ret);
1176 }
1177