xref: /dragonfly/crypto/libressl/crypto/bn/bn_exp.c (revision f5b1c8a1)
1 /* $OpenBSD: bn_exp.c,v 1.22 2015/03/21 08:05:20 doug Exp $ */
2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3  * All rights reserved.
4  *
5  * This package is an SSL implementation written
6  * by Eric Young (eay@cryptsoft.com).
7  * The implementation was written so as to conform with Netscapes SSL.
8  *
9  * This library is free for commercial and non-commercial use as long as
10  * the following conditions are aheared to.  The following conditions
11  * apply to all code found in this distribution, be it the RC4, RSA,
12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
13  * included with this distribution is covered by the same copyright terms
14  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15  *
16  * Copyright remains Eric Young's, and as such any Copyright notices in
17  * the code are not to be removed.
18  * If this package is used in a product, Eric Young should be given attribution
19  * as the author of the parts of the library used.
20  * This can be in the form of a textual message at program startup or
21  * in documentation (online or textual) provided with the package.
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions
25  * are met:
26  * 1. Redistributions of source code must retain the copyright
27  *    notice, this list of conditions and the following disclaimer.
28  * 2. Redistributions in binary form must reproduce the above copyright
29  *    notice, this list of conditions and the following disclaimer in the
30  *    documentation and/or other materials provided with the distribution.
31  * 3. All advertising materials mentioning features or use of this software
32  *    must display the following acknowledgement:
33  *    "This product includes cryptographic software written by
34  *     Eric Young (eay@cryptsoft.com)"
35  *    The word 'cryptographic' can be left out if the rouines from the library
36  *    being used are not cryptographic related :-).
37  * 4. If you include any Windows specific code (or a derivative thereof) from
38  *    the apps directory (application code) you must include an acknowledgement:
39  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40  *
41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  *
53  * The licence and distribution terms for any publically available version or
54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
55  * copied and put under another distribution licence
56  * [including the GNU Public Licence.]
57  */
58 /* ====================================================================
59  * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
60  *
61  * Redistribution and use in source and binary forms, with or without
62  * modification, are permitted provided that the following conditions
63  * are met:
64  *
65  * 1. Redistributions of source code must retain the above copyright
66  *    notice, this list of conditions and the following disclaimer.
67  *
68  * 2. Redistributions in binary form must reproduce the above copyright
69  *    notice, this list of conditions and the following disclaimer in
70  *    the documentation and/or other materials provided with the
71  *    distribution.
72  *
73  * 3. All advertising materials mentioning features or use of this
74  *    software must display the following acknowledgment:
75  *    "This product includes software developed by the OpenSSL Project
76  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
77  *
78  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
79  *    endorse or promote products derived from this software without
80  *    prior written permission. For written permission, please contact
81  *    openssl-core@openssl.org.
82  *
83  * 5. Products derived from this software may not be called "OpenSSL"
84  *    nor may "OpenSSL" appear in their names without prior written
85  *    permission of the OpenSSL Project.
86  *
87  * 6. Redistributions of any form whatsoever must retain the following
88  *    acknowledgment:
89  *    "This product includes software developed by the OpenSSL Project
90  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
91  *
92  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
93  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
94  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
95  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
96  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
97  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
98  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
99  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
100  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
101  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
102  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
103  * OF THE POSSIBILITY OF SUCH DAMAGE.
104  * ====================================================================
105  *
106  * This product includes cryptographic software written by Eric Young
107  * (eay@cryptsoft.com).  This product includes software written by Tim
108  * Hudson (tjh@cryptsoft.com).
109  *
110  */
111 
112 #include <stdlib.h>
113 #include <string.h>
114 
115 #include <openssl/err.h>
116 
117 #include "bn_lcl.h"
118 
119 /* maximum precomputation table size for *variable* sliding windows */
120 #define TABLE_SIZE	32
121 
122 /* this one works - simple but works */
123 int
124 BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
125 {
126 	int i, bits, ret = 0;
127 	BIGNUM *v, *rr;
128 
129 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
130 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
131 		BNerr(BN_F_BN_EXP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
132 		return -1;
133 	}
134 
135 	BN_CTX_start(ctx);
136 	if ((r == a) || (r == p))
137 		rr = BN_CTX_get(ctx);
138 	else
139 		rr = r;
140 	v = BN_CTX_get(ctx);
141 	if (rr == NULL || v == NULL)
142 		goto err;
143 
144 	if (BN_copy(v, a) == NULL)
145 		goto err;
146 	bits = BN_num_bits(p);
147 
148 	if (BN_is_odd(p)) {
149 		if (BN_copy(rr, a) == NULL)
150 			goto err;
151 	} else {
152 		if (!BN_one(rr))
153 			goto err;
154 	}
155 
156 	for (i = 1; i < bits; i++) {
157 		if (!BN_sqr(v, v, ctx))
158 			goto err;
159 		if (BN_is_bit_set(p, i)) {
160 			if (!BN_mul(rr, rr, v, ctx))
161 				goto err;
162 		}
163 	}
164 	ret = 1;
165 
166 err:
167 	if (r != rr && rr != NULL)
168 		BN_copy(r, rr);
169 	BN_CTX_end(ctx);
170 	bn_check_top(r);
171 	return (ret);
172 }
173 
174 int
175 BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
176     BN_CTX *ctx)
177 {
178 	int ret;
179 
180 	bn_check_top(a);
181 	bn_check_top(p);
182 	bn_check_top(m);
183 
184 	/* For even modulus  m = 2^k*m_odd,  it might make sense to compute
185 	 * a^p mod m_odd  and  a^p mod 2^k  separately (with Montgomery
186 	 * exponentiation for the odd part), using appropriate exponent
187 	 * reductions, and combine the results using the CRT.
188 	 *
189 	 * For now, we use Montgomery only if the modulus is odd; otherwise,
190 	 * exponentiation using the reciprocal-based quick remaindering
191 	 * algorithm is used.
192 	 *
193 	 * (Timing obtained with expspeed.c [computations  a^p mod m
194 	 * where  a, p, m  are of the same length: 256, 512, 1024, 2048,
195 	 * 4096, 8192 bits], compared to the running time of the
196 	 * standard algorithm:
197 	 *
198 	 *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]
199          *                     55 .. 77 %  [UltraSparc processor, but
200 	 *                                  debug-solaris-sparcv8-gcc conf.]
201 	 *
202 	 *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]
203 	 *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
204 	 *
205 	 * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
206 	 * at 2048 and more bits, but at 512 and 1024 bits, it was
207 	 * slower even than the standard algorithm!
208 	 *
209 	 * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
210 	 * should be obtained when the new Montgomery reduction code
211 	 * has been integrated into OpenSSL.)
212 	 */
213 
214 #define MONT_MUL_MOD
215 #define MONT_EXP_WORD
216 #define RECP_MUL_MOD
217 
218 #ifdef MONT_MUL_MOD
219 	/* I have finally been able to take out this pre-condition of
220 	 * the top bit being set.  It was caused by an error in BN_div
221 	 * with negatives.  There was also another problem when for a^b%m
222 	 * a >= m.  eay 07-May-97 */
223 /*	if ((m->d[m->top-1]&BN_TBIT) && BN_is_odd(m)) */
224 
225 	if (BN_is_odd(m)) {
226 #  ifdef MONT_EXP_WORD
227 		if (a->top == 1 && !a->neg &&
228 		    (BN_get_flags(p, BN_FLG_CONSTTIME) == 0)) {
229 			BN_ULONG A = a->d[0];
230 			ret = BN_mod_exp_mont_word(r, A,p, m,ctx, NULL);
231 		} else
232 #  endif
233 			ret = BN_mod_exp_mont(r, a,p, m,ctx, NULL);
234 	} else
235 #endif
236 #ifdef RECP_MUL_MOD
237 	{
238 		ret = BN_mod_exp_recp(r, a,p, m, ctx);
239 	}
240 #else
241 	{
242 		ret = BN_mod_exp_simple(r, a,p, m, ctx);
243 	}
244 #endif
245 
246 	bn_check_top(r);
247 	return (ret);
248 }
249 
250 int
251 BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
252     BN_CTX *ctx)
253 {
254 	int i, j, bits, ret = 0, wstart, wend, window, wvalue;
255 	int start = 1;
256 	BIGNUM *aa;
257 	/* Table of variables obtained from 'ctx' */
258 	BIGNUM *val[TABLE_SIZE];
259 	BN_RECP_CTX recp;
260 
261 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
262 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
263 		BNerr(BN_F_BN_MOD_EXP_RECP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
264 		return -1;
265 	}
266 
267 	bits = BN_num_bits(p);
268 
269 	if (bits == 0) {
270 		ret = BN_one(r);
271 		return ret;
272 	}
273 
274 	BN_CTX_start(ctx);
275 	if ((aa = BN_CTX_get(ctx)) == NULL)
276 		goto err;
277 	if ((val[0] = BN_CTX_get(ctx)) == NULL)
278 		goto err;
279 
280 	BN_RECP_CTX_init(&recp);
281 	if (m->neg) {
282 		/* ignore sign of 'm' */
283 		if (!BN_copy(aa, m))
284 			goto err;
285 		aa->neg = 0;
286 		if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0)
287 			goto err;
288 	} else {
289 		if (BN_RECP_CTX_set(&recp, m, ctx) <= 0)
290 			goto err;
291 	}
292 
293 	if (!BN_nnmod(val[0], a, m, ctx))
294 		goto err;		/* 1 */
295 	if (BN_is_zero(val[0])) {
296 		BN_zero(r);
297 		ret = 1;
298 		goto err;
299 	}
300 
301 	window = BN_window_bits_for_exponent_size(bits);
302 	if (window > 1) {
303 		if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx))
304 			goto err;				/* 2 */
305 		j = 1 << (window - 1);
306 		for (i = 1; i < j; i++) {
307 			if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
308 			    !BN_mod_mul_reciprocal(val[i], val[i - 1],
309 			    aa, &recp, ctx))
310 				goto err;
311 		}
312 	}
313 
314 	start = 1;		/* This is used to avoid multiplication etc
315 				 * when there is only the value '1' in the
316 				 * buffer. */
317 	wvalue = 0;		/* The 'value' of the window */
318 	wstart = bits - 1;	/* The top bit of the window */
319 	wend = 0;		/* The bottom bit of the window */
320 
321 	if (!BN_one(r))
322 		goto err;
323 
324 	for (;;) {
325 		if (BN_is_bit_set(p, wstart) == 0) {
326 			if (!start)
327 				if (!BN_mod_mul_reciprocal(r, r,r, &recp, ctx))
328 					goto err;
329 			if (wstart == 0)
330 				break;
331 			wstart--;
332 			continue;
333 		}
334 		/* We now have wstart on a 'set' bit, we now need to work out
335 		 * how bit a window to do.  To do this we need to scan
336 		 * forward until the last set bit before the end of the
337 		 * window */
338 		j = wstart;
339 		wvalue = 1;
340 		wend = 0;
341 		for (i = 1; i < window; i++) {
342 			if (wstart - i < 0)
343 				break;
344 			if (BN_is_bit_set(p, wstart - i)) {
345 				wvalue <<= (i - wend);
346 				wvalue |= 1;
347 				wend = i;
348 			}
349 		}
350 
351 		/* wend is the size of the current window */
352 		j = wend + 1;
353 		/* add the 'bytes above' */
354 		if (!start)
355 			for (i = 0; i < j; i++) {
356 				if (!BN_mod_mul_reciprocal(r, r,r, &recp, ctx))
357 					goto err;
358 			}
359 
360 		/* wvalue will be an odd number < 2^window */
361 		if (!BN_mod_mul_reciprocal(r, r,val[wvalue >> 1], &recp, ctx))
362 			goto err;
363 
364 		/* move the 'window' down further */
365 		wstart -= wend + 1;
366 		wvalue = 0;
367 		start = 0;
368 		if (wstart < 0)
369 			break;
370 	}
371 	ret = 1;
372 
373 err:
374 	BN_CTX_end(ctx);
375 	BN_RECP_CTX_free(&recp);
376 	bn_check_top(r);
377 	return (ret);
378 }
379 
380 int
381 BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
382     BN_CTX *ctx, BN_MONT_CTX *in_mont)
383 {
384 	int i, j, bits, ret = 0, wstart, wend, window, wvalue;
385 	int start = 1;
386 	BIGNUM *d, *r;
387 	const BIGNUM *aa;
388 	/* Table of variables obtained from 'ctx' */
389 	BIGNUM *val[TABLE_SIZE];
390 	BN_MONT_CTX *mont = NULL;
391 
392 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
393 		return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
394 	}
395 
396 	bn_check_top(a);
397 	bn_check_top(p);
398 	bn_check_top(m);
399 
400 	if (!BN_is_odd(m)) {
401 		BNerr(BN_F_BN_MOD_EXP_MONT, BN_R_CALLED_WITH_EVEN_MODULUS);
402 		return (0);
403 	}
404 	bits = BN_num_bits(p);
405 	if (bits == 0) {
406 		ret = BN_one(rr);
407 		return ret;
408 	}
409 
410 	BN_CTX_start(ctx);
411 	if ((d = BN_CTX_get(ctx)) == NULL)
412 		goto err;
413 	if ((r = BN_CTX_get(ctx)) == NULL)
414 		goto err;
415 	if ((val[0] = BN_CTX_get(ctx)) == NULL)
416 		goto err;
417 
418 	/* If this is not done, things will break in the montgomery
419 	 * part */
420 
421 	if (in_mont != NULL)
422 		mont = in_mont;
423 	else {
424 		if ((mont = BN_MONT_CTX_new()) == NULL)
425 			goto err;
426 		if (!BN_MONT_CTX_set(mont, m, ctx))
427 			goto err;
428 	}
429 
430 	if (a->neg || BN_ucmp(a, m) >= 0) {
431 		if (!BN_nnmod(val[0], a,m, ctx))
432 			goto err;
433 		aa = val[0];
434 	} else
435 		aa = a;
436 	if (BN_is_zero(aa)) {
437 		BN_zero(rr);
438 		ret = 1;
439 		goto err;
440 	}
441 	if (!BN_to_montgomery(val[0], aa, mont, ctx))
442 		goto err; /* 1 */
443 
444 	window = BN_window_bits_for_exponent_size(bits);
445 	if (window > 1) {
446 		if (!BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx))
447 			goto err; /* 2 */
448 		j = 1 << (window - 1);
449 		for (i = 1; i < j; i++) {
450 			if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
451 			    !BN_mod_mul_montgomery(val[i], val[i - 1],
452 			    d, mont, ctx))
453 				goto err;
454 		}
455 	}
456 
457 	start = 1;		/* This is used to avoid multiplication etc
458 				 * when there is only the value '1' in the
459 				 * buffer. */
460 	wvalue = 0;		/* The 'value' of the window */
461 	wstart = bits - 1;	/* The top bit of the window */
462 	wend = 0;		/* The bottom bit of the window */
463 
464 	if (!BN_to_montgomery(r, BN_value_one(), mont, ctx))
465 		goto err;
466 	for (;;) {
467 		if (BN_is_bit_set(p, wstart) == 0) {
468 			if (!start) {
469 				if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
470 					goto err;
471 			}
472 			if (wstart == 0)
473 				break;
474 			wstart--;
475 			continue;
476 		}
477 		/* We now have wstart on a 'set' bit, we now need to work out
478 		 * how bit a window to do.  To do this we need to scan
479 		 * forward until the last set bit before the end of the
480 		 * window */
481 		j = wstart;
482 		wvalue = 1;
483 		wend = 0;
484 		for (i = 1; i < window; i++) {
485 			if (wstart - i < 0)
486 				break;
487 			if (BN_is_bit_set(p, wstart - i)) {
488 				wvalue <<= (i - wend);
489 				wvalue |= 1;
490 				wend = i;
491 			}
492 		}
493 
494 		/* wend is the size of the current window */
495 		j = wend + 1;
496 		/* add the 'bytes above' */
497 		if (!start)
498 			for (i = 0; i < j; i++) {
499 				if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
500 					goto err;
501 			}
502 
503 		/* wvalue will be an odd number < 2^window */
504 		if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx))
505 			goto err;
506 
507 		/* move the 'window' down further */
508 		wstart -= wend + 1;
509 		wvalue = 0;
510 		start = 0;
511 		if (wstart < 0)
512 			break;
513 	}
514 	if (!BN_from_montgomery(rr, r,mont, ctx))
515 		goto err;
516 	ret = 1;
517 
518 err:
519 	if ((in_mont == NULL) && (mont != NULL))
520 		BN_MONT_CTX_free(mont);
521 	BN_CTX_end(ctx);
522 	bn_check_top(rr);
523 	return (ret);
524 }
525 
526 
527 /* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific layout
528  * so that accessing any of these table values shows the same access pattern as far
529  * as cache lines are concerned.  The following functions are used to transfer a BIGNUM
530  * from/to that table. */
531 
532 static int
533 MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf,
534     int idx, int width)
535 {
536 	size_t i, j;
537 
538 	if (top > b->top)
539 		top = b->top; /* this works because 'buf' is explicitly zeroed */
540 	for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
541 		buf[j] = ((unsigned char*)b->d)[i];
542 	}
543 
544 	return 1;
545 }
546 
547 static int
548 MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx,
549     int width)
550 {
551 	size_t i, j;
552 
553 	if (bn_wexpand(b, top) == NULL)
554 		return 0;
555 
556 	for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
557 		((unsigned char*)b->d)[i] = buf[j];
558 	}
559 
560 	b->top = top;
561 	bn_correct_top(b);
562 	return 1;
563 }
564 
565 /* Given a pointer value, compute the next address that is a cache line multiple. */
566 #define MOD_EXP_CTIME_ALIGN(x_) \
567 	((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
568 
569 /* This variant of BN_mod_exp_mont() uses fixed windows and the special
570  * precomputation memory layout to limit data-dependency to a minimum
571  * to protect secret exponents (cf. the hyper-threading timing attacks
572  * pointed out by Colin Percival,
573  * http://www.daemonology.net/hyperthreading-considered-harmful/)
574  */
575 int
576 BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
577     const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
578 {
579 	int i, bits, ret = 0, window, wvalue;
580 	int top;
581 	BN_MONT_CTX *mont = NULL;
582 	int numPowers;
583 	unsigned char *powerbufFree = NULL;
584 	int powerbufLen = 0;
585 	unsigned char *powerbuf = NULL;
586 	BIGNUM tmp, am;
587 
588 	bn_check_top(a);
589 	bn_check_top(p);
590 	bn_check_top(m);
591 
592 	top = m->top;
593 
594 	if (!(m->d[0] & 1)) {
595 		BNerr(BN_F_BN_MOD_EXP_MONT_CONSTTIME,
596 		    BN_R_CALLED_WITH_EVEN_MODULUS);
597 		return (0);
598 	}
599 	bits = BN_num_bits(p);
600 	if (bits == 0) {
601 		ret = BN_one(rr);
602 		return ret;
603 	}
604 
605 	BN_CTX_start(ctx);
606 
607 	/* Allocate a montgomery context if it was not supplied by the caller.
608 	 * If this is not done, things will break in the montgomery part.
609  	 */
610 	if (in_mont != NULL)
611 		mont = in_mont;
612 	else {
613 		if ((mont = BN_MONT_CTX_new()) == NULL)
614 			goto err;
615 		if (!BN_MONT_CTX_set(mont, m, ctx))
616 			goto err;
617 	}
618 
619 	/* Get the window size to use with size of p. */
620 	window = BN_window_bits_for_ctime_exponent_size(bits);
621 #if defined(OPENSSL_BN_ASM_MONT5)
622 	if (window == 6 && bits <= 1024)
623 		window = 5;	/* ~5% improvement of 2048-bit RSA sign */
624 #endif
625 
626 	/* Allocate a buffer large enough to hold all of the pre-computed
627 	 * powers of am, am itself and tmp.
628 	 */
629 	numPowers = 1 << window;
630 	powerbufLen = sizeof(m->d[0]) * (top * numPowers +
631 	    ((2*top) > numPowers ? (2*top) : numPowers));
632 	if ((powerbufFree = malloc(powerbufLen +
633 	    MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
634 		goto err;
635 
636 	powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
637 	memset(powerbuf, 0, powerbufLen);
638 
639 	/* lay down tmp and am right after powers table */
640 	tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
641 	am.d = tmp.d + top;
642 	tmp.top = am.top = 0;
643 	tmp.dmax = am.dmax = top;
644 	tmp.neg = am.neg = 0;
645 	tmp.flags = am.flags = BN_FLG_STATIC_DATA;
646 
647 	/* prepare a^0 in Montgomery domain */
648 #if 1
649 	if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
650 		goto err;
651 #else
652 	tmp.d[0] = (0 - m - >d[0]) & BN_MASK2;	/* 2^(top*BN_BITS2) - m */
653 	for (i = 1; i < top; i++)
654 		tmp.d[i] = (~m->d[i]) & BN_MASK2;
655 	tmp.top = top;
656 #endif
657 
658 	/* prepare a^1 in Montgomery domain */
659 	if (a->neg || BN_ucmp(a, m) >= 0) {
660 		if (!BN_mod(&am, a,m, ctx))
661 			goto err;
662 		if (!BN_to_montgomery(&am, &am, mont, ctx))
663 			goto err;
664 	} else if (!BN_to_montgomery(&am, a,mont, ctx))
665 		goto err;
666 
667 #if defined(OPENSSL_BN_ASM_MONT5)
668 	/* This optimization uses ideas from http://eprint.iacr.org/2011/239,
669 	 * specifically optimization of cache-timing attack countermeasures
670 	 * and pre-computation optimization. */
671 
672 	/* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
673 	 * 512-bit RSA is hardly relevant, we omit it to spare size... */
674 	if (window == 5 && top > 1) {
675 		void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
676 		    const void *table, const BN_ULONG *np,
677 		    const BN_ULONG *n0, int num, int power);
678 		void bn_scatter5(const BN_ULONG *inp, size_t num,
679 		    void *table, size_t power);
680 		void bn_gather5(BN_ULONG *out, size_t num,
681 		    void *table, size_t power);
682 
683 		BN_ULONG *np = mont->N.d, *n0 = mont->n0;
684 
685 		/* BN_to_montgomery can contaminate words above .top
686 		 * [in BN_DEBUG[_DEBUG] build]... */
687 		for (i = am.top; i < top; i++)
688 			am.d[i] = 0;
689 		for (i = tmp.top; i < top; i++)
690 			tmp.d[i] = 0;
691 
692 		bn_scatter5(tmp.d, top, powerbuf, 0);
693 		bn_scatter5(am.d, am.top, powerbuf, 1);
694 		bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
695 		bn_scatter5(tmp.d, top, powerbuf, 2);
696 
697 #if 0
698 		for (i = 3; i < 32; i++) {
699 			/* Calculate a^i = a^(i-1) * a */
700 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
701 			    n0, top, i - 1);
702 			bn_scatter5(tmp.d, top, powerbuf, i);
703 		}
704 #else
705 		/* same as above, but uses squaring for 1/2 of operations */
706 		for (i = 4; i < 32; i*=2) {
707 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
708 			bn_scatter5(tmp.d, top, powerbuf, i);
709 		}
710 		for (i = 3; i < 8; i += 2) {
711 			int j;
712 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
713 			    n0, top, i - 1);
714 			bn_scatter5(tmp.d, top, powerbuf, i);
715 			for (j = 2 * i; j < 32; j *= 2) {
716 				bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
717 				bn_scatter5(tmp.d, top, powerbuf, j);
718 			}
719 		}
720 		for (; i < 16; i += 2) {
721 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
722 			    n0, top, i - 1);
723 			bn_scatter5(tmp.d, top, powerbuf, i);
724 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
725 			bn_scatter5(tmp.d, top, powerbuf, 2*i);
726 		}
727 		for (; i < 32; i += 2) {
728 			bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np,
729 			    n0, top, i - 1);
730 			bn_scatter5(tmp.d, top, powerbuf, i);
731 		}
732 #endif
733 		bits--;
734 		for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
735 			wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
736 		bn_gather5(tmp.d, top, powerbuf, wvalue);
737 
738 		/* Scan the exponent one window at a time starting from the most
739 		 * significant bits.
740 		 */
741 		while (bits >= 0) {
742 			for (wvalue = 0, i = 0; i < 5; i++, bits--)
743 				wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
744 
745 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
746 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
747 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
748 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
749 			bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
750 			bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
751 		}
752 
753 		tmp.top = top;
754 		bn_correct_top(&tmp);
755 	} else
756 #endif
757 	{
758 		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0,
759 		    numPowers))
760 			goto err;
761 		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am,  top, powerbuf, 1,
762 		    numPowers))
763 			goto err;
764 
765 		/* If the window size is greater than 1, then calculate
766 		 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
767 		 * (even powers could instead be computed as (a^(i/2))^2
768 		 * to use the slight performance advantage of sqr over mul).
769 		 */
770 		if (window > 1) {
771 			if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
772 				goto err;
773 			if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf,
774 			    2, numPowers))
775 				goto err;
776 			for (i = 3; i < numPowers; i++) {
777 				/* Calculate a^i = a^(i-1) * a */
778 				if (!BN_mod_mul_montgomery(&tmp, &am, &tmp,
779 				    mont, ctx))
780 					goto err;
781 				if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top,
782 				    powerbuf, i, numPowers))
783 					goto err;
784 			}
785 		}
786 
787 		bits--;
788 		for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
789 			wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
790 		if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp, top, powerbuf,
791 		    wvalue, numPowers))
792 			goto err;
793 
794 		/* Scan the exponent one window at a time starting from the most
795 		 * significant bits.
796 		 */
797 		while (bits >= 0) {
798 			wvalue = 0; /* The 'value' of the window */
799 
800 			/* Scan the window, squaring the result as we go */
801 			for (i = 0; i < window; i++, bits--) {
802 				if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp,
803 				    mont, ctx))
804 					goto err;
805 				wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
806 			}
807 
808 			/* Fetch the appropriate pre-computed value from the pre-buf */
809 			if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf,
810 			    wvalue, numPowers))
811 				goto err;
812 
813 			/* Multiply the result into the intermediate result */
814 			if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx))
815 				goto err;
816 		}
817 	}
818 
819 	/* Convert the final result from montgomery to standard format */
820 	if (!BN_from_montgomery(rr, &tmp, mont, ctx))
821 		goto err;
822 	ret = 1;
823 
824 err:
825 	if ((in_mont == NULL) && (mont != NULL))
826 		BN_MONT_CTX_free(mont);
827 	if (powerbuf != NULL) {
828 		explicit_bzero(powerbuf, powerbufLen);
829 		free(powerbufFree);
830 	}
831 	BN_CTX_end(ctx);
832 	return (ret);
833 }
834 
835 int
836 BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, const BIGNUM *m,
837     BN_CTX *ctx, BN_MONT_CTX *in_mont)
838 {
839 	BN_MONT_CTX *mont = NULL;
840 	int b, bits, ret = 0;
841 	int r_is_one;
842 	BN_ULONG w, next_w;
843 	BIGNUM *d, *r, *t;
844 	BIGNUM *swap_tmp;
845 
846 #define BN_MOD_MUL_WORD(r, w, m) \
847 		(BN_mul_word(r, (w)) && \
848 		(/* BN_ucmp(r, (m)) < 0 ? 1 :*/  \
849 			(BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
850 		/* BN_MOD_MUL_WORD is only used with 'w' large,
851 		 * so the BN_ucmp test is probably more overhead
852 		 * than always using BN_mod (which uses BN_copy if
853 		 * a similar test returns true). */
854 		/* We can use BN_mod and do not need BN_nnmod because our
855 		 * accumulator is never negative (the result of BN_mod does
856 		 * not depend on the sign of the modulus).
857 		 */
858 #define BN_TO_MONTGOMERY_WORD(r, w, mont) \
859 		(BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
860 
861 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
862 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
863 		BNerr(BN_F_BN_MOD_EXP_MONT_WORD,
864 		    ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
865 		return -1;
866 	}
867 
868 	bn_check_top(p);
869 	bn_check_top(m);
870 
871 	if (!BN_is_odd(m)) {
872 		BNerr(BN_F_BN_MOD_EXP_MONT_WORD, BN_R_CALLED_WITH_EVEN_MODULUS);
873 		return (0);
874 	}
875 	if (m->top == 1)
876 		a %= m->d[0]; /* make sure that 'a' is reduced */
877 
878 	bits = BN_num_bits(p);
879 	if (bits == 0) {
880 		ret = BN_one(rr);
881 		return ret;
882 	}
883 	if (a == 0) {
884 		BN_zero(rr);
885 		ret = 1;
886 		return ret;
887 	}
888 
889 	BN_CTX_start(ctx);
890 	if ((d = BN_CTX_get(ctx)) == NULL)
891 		goto err;
892 	if ((r = BN_CTX_get(ctx)) == NULL)
893 		goto err;
894 	if ((t = BN_CTX_get(ctx)) == NULL)
895 		goto err;
896 
897 	if (in_mont != NULL)
898 		mont = in_mont;
899 	else {
900 		if ((mont = BN_MONT_CTX_new()) == NULL)
901 			goto err;
902 		if (!BN_MONT_CTX_set(mont, m, ctx))
903 			goto err;
904 	}
905 
906 	r_is_one = 1; /* except for Montgomery factor */
907 
908 	/* bits-1 >= 0 */
909 
910 	/* The result is accumulated in the product r*w. */
911 	w = a; /* bit 'bits-1' of 'p' is always set */
912 	for (b = bits - 2; b >= 0; b--) {
913 		/* First, square r*w. */
914 		next_w = w * w;
915 		if ((next_w / w) != w) /* overflow */
916 		{
917 			if (r_is_one) {
918 				if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
919 					goto err;
920 				r_is_one = 0;
921 			} else {
922 				if (!BN_MOD_MUL_WORD(r, w, m))
923 					goto err;
924 			}
925 			next_w = 1;
926 		}
927 		w = next_w;
928 		if (!r_is_one) {
929 			if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
930 				goto err;
931 		}
932 
933 		/* Second, multiply r*w by 'a' if exponent bit is set. */
934 		if (BN_is_bit_set(p, b)) {
935 			next_w = w * a;
936 			if ((next_w / a) != w) /* overflow */
937 			{
938 				if (r_is_one) {
939 					if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
940 						goto err;
941 					r_is_one = 0;
942 				} else {
943 					if (!BN_MOD_MUL_WORD(r, w, m))
944 						goto err;
945 				}
946 				next_w = a;
947 			}
948 			w = next_w;
949 		}
950 	}
951 
952 	/* Finally, set r:=r*w. */
953 	if (w != 1) {
954 		if (r_is_one) {
955 			if (!BN_TO_MONTGOMERY_WORD(r, w, mont))
956 				goto err;
957 			r_is_one = 0;
958 		} else {
959 			if (!BN_MOD_MUL_WORD(r, w, m))
960 				goto err;
961 		}
962 	}
963 
964 	if (r_is_one) /* can happen only if a == 1*/
965 	{
966 		if (!BN_one(rr))
967 			goto err;
968 	} else {
969 		if (!BN_from_montgomery(rr, r, mont, ctx))
970 			goto err;
971 	}
972 	ret = 1;
973 
974 err:
975 	if ((in_mont == NULL) && (mont != NULL))
976 		BN_MONT_CTX_free(mont);
977 	BN_CTX_end(ctx);
978 	bn_check_top(rr);
979 	return (ret);
980 }
981 
982 
983 /* The old fallback, simple version :-) */
984 int
985 BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
986     BN_CTX *ctx)
987 {
988 	int i, j,bits, ret = 0, wstart, wend, window, wvalue;
989 	int start = 1;
990 	BIGNUM *d;
991 	/* Table of variables obtained from 'ctx' */
992 	BIGNUM *val[TABLE_SIZE];
993 
994 	if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
995 		/* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
996 		BNerr(BN_F_BN_MOD_EXP_SIMPLE,
997 		    ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
998 		return -1;
999 	}
1000 
1001 	bits = BN_num_bits(p);
1002 
1003 	if (bits == 0) {
1004 		ret = BN_one(r);
1005 		return ret;
1006 	}
1007 
1008 	BN_CTX_start(ctx);
1009 	if ((d = BN_CTX_get(ctx)) == NULL)
1010 		goto err;
1011 	if ((val[0] = BN_CTX_get(ctx)) == NULL)
1012 		goto err;
1013 
1014 	if (!BN_nnmod(val[0],a,m,ctx))
1015 		goto err;		/* 1 */
1016 	if (BN_is_zero(val[0])) {
1017 		BN_zero(r);
1018 		ret = 1;
1019 		goto err;
1020 	}
1021 
1022 	window = BN_window_bits_for_exponent_size(bits);
1023 	if (window > 1) {
1024 		if (!BN_mod_mul(d, val[0], val[0], m, ctx))
1025 			goto err;				/* 2 */
1026 		j = 1 << (window - 1);
1027 		for (i = 1; i < j; i++) {
1028 			if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
1029 			    !BN_mod_mul(val[i], val[i - 1], d,m, ctx))
1030 				goto err;
1031 		}
1032 	}
1033 
1034 	start = 1;		/* This is used to avoid multiplication etc
1035 				 * when there is only the value '1' in the
1036 				 * buffer. */
1037 	wvalue = 0;		/* The 'value' of the window */
1038 	wstart = bits - 1;	/* The top bit of the window */
1039 	wend = 0;		/* The bottom bit of the window */
1040 
1041 	if (!BN_one(r))
1042 		goto err;
1043 
1044 	for (;;) {
1045 		if (BN_is_bit_set(p, wstart) == 0) {
1046 			if (!start)
1047 				if (!BN_mod_mul(r, r, r, m, ctx))
1048 					goto err;
1049 			if (wstart == 0)
1050 				break;
1051 			wstart--;
1052 			continue;
1053 		}
1054 		/* We now have wstart on a 'set' bit, we now need to work out
1055 		 * how bit a window to do.  To do this we need to scan
1056 		 * forward until the last set bit before the end of the
1057 		 * window */
1058 		j = wstart;
1059 		wvalue = 1;
1060 		wend = 0;
1061 		for (i = 1; i < window; i++) {
1062 			if (wstart - i < 0)
1063 				break;
1064 			if (BN_is_bit_set(p, wstart - i)) {
1065 				wvalue <<= (i - wend);
1066 				wvalue |= 1;
1067 				wend = i;
1068 			}
1069 		}
1070 
1071 		/* wend is the size of the current window */
1072 		j = wend + 1;
1073 		/* add the 'bytes above' */
1074 		if (!start)
1075 			for (i = 0; i < j; i++) {
1076 				if (!BN_mod_mul(r, r, r, m, ctx))
1077 					goto err;
1078 			}
1079 
1080 		/* wvalue will be an odd number < 2^window */
1081 		if (!BN_mod_mul(r, r, val[wvalue >> 1], m, ctx))
1082 			goto err;
1083 
1084 		/* move the 'window' down further */
1085 		wstart -= wend + 1;
1086 		wvalue = 0;
1087 		start = 0;
1088 		if (wstart < 0)
1089 			break;
1090 	}
1091 	ret = 1;
1092 
1093 err:
1094 	BN_CTX_end(ctx);
1095 	bn_check_top(r);
1096 	return (ret);
1097 }
1098