xref: /dragonfly/crypto/libressl/crypto/bn/bn_asm.c (revision 72c33676)
1 /* $OpenBSD: bn_asm.c,v 1.15 2017/05/02 03:59:44 deraadt Exp $ */
2 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3  * All rights reserved.
4  *
5  * This package is an SSL implementation written
6  * by Eric Young (eay@cryptsoft.com).
7  * The implementation was written so as to conform with Netscapes SSL.
8  *
9  * This library is free for commercial and non-commercial use as long as
10  * the following conditions are aheared to.  The following conditions
11  * apply to all code found in this distribution, be it the RC4, RSA,
12  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
13  * included with this distribution is covered by the same copyright terms
14  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15  *
16  * Copyright remains Eric Young's, and as such any Copyright notices in
17  * the code are not to be removed.
18  * If this package is used in a product, Eric Young should be given attribution
19  * as the author of the parts of the library used.
20  * This can be in the form of a textual message at program startup or
21  * in documentation (online or textual) provided with the package.
22  *
23  * Redistribution and use in source and binary forms, with or without
24  * modification, are permitted provided that the following conditions
25  * are met:
26  * 1. Redistributions of source code must retain the copyright
27  *    notice, this list of conditions and the following disclaimer.
28  * 2. Redistributions in binary form must reproduce the above copyright
29  *    notice, this list of conditions and the following disclaimer in the
30  *    documentation and/or other materials provided with the distribution.
31  * 3. All advertising materials mentioning features or use of this software
32  *    must display the following acknowledgement:
33  *    "This product includes cryptographic software written by
34  *     Eric Young (eay@cryptsoft.com)"
35  *    The word 'cryptographic' can be left out if the rouines from the library
36  *    being used are not cryptographic related :-).
37  * 4. If you include any Windows specific code (or a derivative thereof) from
38  *    the apps directory (application code) you must include an acknowledgement:
39  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40  *
41  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51  * SUCH DAMAGE.
52  *
53  * The licence and distribution terms for any publically available version or
54  * derivative of this code cannot be changed.  i.e. this code cannot simply be
55  * copied and put under another distribution licence
56  * [including the GNU Public Licence.]
57  */
58 
59 #ifndef BN_DEBUG
60 # undef NDEBUG /* avoid conflicting definitions */
61 # define NDEBUG
62 #endif
63 
64 #include <assert.h>
65 #include <stdio.h>
66 
67 #include <openssl/opensslconf.h>
68 
69 #include "bn_lcl.h"
70 
71 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
72 
73 BN_ULONG
bn_mul_add_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)74 bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
75 {
76 	BN_ULONG c1 = 0;
77 
78 	assert(num >= 0);
79 	if (num <= 0)
80 		return (c1);
81 
82 #ifndef OPENSSL_SMALL_FOOTPRINT
83 	while (num & ~3) {
84 		mul_add(rp[0], ap[0], w, c1);
85 		mul_add(rp[1], ap[1], w, c1);
86 		mul_add(rp[2], ap[2], w, c1);
87 		mul_add(rp[3], ap[3], w, c1);
88 		ap += 4;
89 		rp += 4;
90 		num -= 4;
91 	}
92 #endif
93 	while (num) {
94 		mul_add(rp[0], ap[0], w, c1);
95 		ap++;
96 		rp++;
97 		num--;
98 	}
99 
100 	return (c1);
101 }
102 
103 BN_ULONG
bn_mul_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)104 bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
105 {
106 	BN_ULONG c1 = 0;
107 
108 	assert(num >= 0);
109 	if (num <= 0)
110 		return (c1);
111 
112 #ifndef OPENSSL_SMALL_FOOTPRINT
113 	while (num & ~3) {
114 		mul(rp[0], ap[0], w, c1);
115 		mul(rp[1], ap[1], w, c1);
116 		mul(rp[2], ap[2], w, c1);
117 		mul(rp[3], ap[3], w, c1);
118 		ap += 4;
119 		rp += 4;
120 		num -= 4;
121 	}
122 #endif
123 	while (num) {
124 		mul(rp[0], ap[0], w, c1);
125 		ap++;
126 		rp++;
127 		num--;
128 	}
129 	return (c1);
130 }
131 
132 void
bn_sqr_words(BN_ULONG * r,const BN_ULONG * a,int n)133 bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
134 {
135 	assert(n >= 0);
136 	if (n <= 0)
137 		return;
138 
139 #ifndef OPENSSL_SMALL_FOOTPRINT
140 	while (n & ~3) {
141 		sqr(r[0], r[1], a[0]);
142 		sqr(r[2], r[3], a[1]);
143 		sqr(r[4], r[5], a[2]);
144 		sqr(r[6], r[7], a[3]);
145 		a += 4;
146 		r += 8;
147 		n -= 4;
148 	}
149 #endif
150 	while (n) {
151 		sqr(r[0], r[1], a[0]);
152 		a++;
153 		r += 2;
154 		n--;
155 	}
156 }
157 
158 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
159 
160 BN_ULONG
bn_mul_add_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)161 bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
162 {
163 	BN_ULONG c = 0;
164 	BN_ULONG bl, bh;
165 
166 	assert(num >= 0);
167 	if (num <= 0)
168 		return ((BN_ULONG)0);
169 
170 	bl = LBITS(w);
171 	bh = HBITS(w);
172 
173 #ifndef OPENSSL_SMALL_FOOTPRINT
174 	while (num & ~3) {
175 		mul_add(rp[0], ap[0], bl, bh, c);
176 		mul_add(rp[1], ap[1], bl, bh, c);
177 		mul_add(rp[2], ap[2], bl, bh, c);
178 		mul_add(rp[3], ap[3], bl, bh, c);
179 		ap += 4;
180 		rp += 4;
181 		num -= 4;
182 	}
183 #endif
184 	while (num) {
185 		mul_add(rp[0], ap[0], bl, bh, c);
186 		ap++;
187 		rp++;
188 		num--;
189 	}
190 	return (c);
191 }
192 
193 BN_ULONG
bn_mul_words(BN_ULONG * rp,const BN_ULONG * ap,int num,BN_ULONG w)194 bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
195 {
196 	BN_ULONG carry = 0;
197 	BN_ULONG bl, bh;
198 
199 	assert(num >= 0);
200 	if (num <= 0)
201 		return ((BN_ULONG)0);
202 
203 	bl = LBITS(w);
204 	bh = HBITS(w);
205 
206 #ifndef OPENSSL_SMALL_FOOTPRINT
207 	while (num & ~3) {
208 		mul(rp[0], ap[0], bl, bh, carry);
209 		mul(rp[1], ap[1], bl, bh, carry);
210 		mul(rp[2], ap[2], bl, bh, carry);
211 		mul(rp[3], ap[3], bl, bh, carry);
212 		ap += 4;
213 		rp += 4;
214 		num -= 4;
215 	}
216 #endif
217 	while (num) {
218 		mul(rp[0], ap[0], bl, bh, carry);
219 		ap++;
220 		rp++;
221 		num--;
222 	}
223 	return (carry);
224 }
225 
226 void
bn_sqr_words(BN_ULONG * r,const BN_ULONG * a,int n)227 bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
228 {
229 	assert(n >= 0);
230 	if (n <= 0)
231 		return;
232 
233 #ifndef OPENSSL_SMALL_FOOTPRINT
234 	while (n & ~3) {
235 		sqr64(r[0], r[1], a[0]);
236 		sqr64(r[2], r[3], a[1]);
237 		sqr64(r[4], r[5], a[2]);
238 		sqr64(r[6], r[7], a[3]);
239 		a += 4;
240 		r += 8;
241 		n -= 4;
242 	}
243 #endif
244 	while (n) {
245 		sqr64(r[0], r[1], a[0]);
246 		a++;
247 		r += 2;
248 		n--;
249 	}
250 }
251 
252 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
253 
254 #if defined(BN_LLONG) && defined(BN_DIV2W)
255 
256 BN_ULONG
bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d)257 bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
258 {
259 	return ((BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2)|l)/(BN_ULLONG)d));
260 }
261 
262 #else
263 
264 /* Divide h,l by d and return the result. */
265 /* I need to test this some more :-( */
266 BN_ULONG
bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d)267 bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
268 {
269 	BN_ULONG dh, dl, q,ret = 0, th, tl, t;
270 	int i, count = 2;
271 
272 	if (d == 0)
273 		return (BN_MASK2);
274 
275 	i = BN_num_bits_word(d);
276 	assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
277 
278 	i = BN_BITS2 - i;
279 	if (h >= d)
280 		h -= d;
281 
282 	if (i) {
283 		d <<= i;
284 		h = (h << i) | (l >> (BN_BITS2 - i));
285 		l <<= i;
286 	}
287 	dh = (d & BN_MASK2h) >> BN_BITS4;
288 	dl = (d & BN_MASK2l);
289 	for (;;) {
290 		if ((h >> BN_BITS4) == dh)
291 			q = BN_MASK2l;
292 		else
293 			q = h / dh;
294 
295 		th = q * dh;
296 		tl = dl * q;
297 		for (;;) {
298 			t = h - th;
299 			if ((t & BN_MASK2h) ||
300 			    ((tl) <= (
301 			    (t << BN_BITS4) |
302 			    ((l & BN_MASK2h) >> BN_BITS4))))
303 				break;
304 			q--;
305 			th -= dh;
306 			tl -= dl;
307 		}
308 		t = (tl >> BN_BITS4);
309 		tl = (tl << BN_BITS4) & BN_MASK2h;
310 		th += t;
311 
312 		if (l < tl)
313 			th++;
314 		l -= tl;
315 		if (h < th) {
316 			h += d;
317 			q--;
318 		}
319 		h -= th;
320 
321 		if (--count == 0)
322 			break;
323 
324 		ret = q << BN_BITS4;
325 		h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
326 		l = (l & BN_MASK2l) << BN_BITS4;
327 	}
328 	ret |= q;
329 	return (ret);
330 }
331 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
332 
333 #ifdef BN_LLONG
334 BN_ULONG
bn_add_words(BN_ULONG * r,const BN_ULONG * a,const BN_ULONG * b,int n)335 bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
336 {
337 	BN_ULLONG ll = 0;
338 
339 	assert(n >= 0);
340 	if (n <= 0)
341 		return ((BN_ULONG)0);
342 
343 #ifndef OPENSSL_SMALL_FOOTPRINT
344 	while (n & ~3) {
345 		ll += (BN_ULLONG)a[0] + b[0];
346 		r[0] = (BN_ULONG)ll & BN_MASK2;
347 		ll >>= BN_BITS2;
348 		ll += (BN_ULLONG)a[1] + b[1];
349 		r[1] = (BN_ULONG)ll & BN_MASK2;
350 		ll >>= BN_BITS2;
351 		ll += (BN_ULLONG)a[2] + b[2];
352 		r[2] = (BN_ULONG)ll & BN_MASK2;
353 		ll >>= BN_BITS2;
354 		ll += (BN_ULLONG)a[3] + b[3];
355 		r[3] = (BN_ULONG)ll & BN_MASK2;
356 		ll >>= BN_BITS2;
357 		a += 4;
358 		b += 4;
359 		r += 4;
360 		n -= 4;
361 	}
362 #endif
363 	while (n) {
364 		ll += (BN_ULLONG)a[0] + b[0];
365 		r[0] = (BN_ULONG)ll & BN_MASK2;
366 		ll >>= BN_BITS2;
367 		a++;
368 		b++;
369 		r++;
370 		n--;
371 	}
372 	return ((BN_ULONG)ll);
373 }
374 #else /* !BN_LLONG */
375 BN_ULONG
bn_add_words(BN_ULONG * r,const BN_ULONG * a,const BN_ULONG * b,int n)376 bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
377 {
378 	BN_ULONG c, l, t;
379 
380 	assert(n >= 0);
381 	if (n <= 0)
382 		return ((BN_ULONG)0);
383 
384 	c = 0;
385 #ifndef OPENSSL_SMALL_FOOTPRINT
386 	while (n & ~3) {
387 		t = a[0];
388 		t = (t + c) & BN_MASK2;
389 		c = (t < c);
390 		l = (t + b[0]) & BN_MASK2;
391 		c += (l < t);
392 		r[0] = l;
393 		t = a[1];
394 		t = (t + c) & BN_MASK2;
395 		c = (t < c);
396 		l = (t + b[1]) & BN_MASK2;
397 		c += (l < t);
398 		r[1] = l;
399 		t = a[2];
400 		t = (t + c) & BN_MASK2;
401 		c = (t < c);
402 		l = (t + b[2]) & BN_MASK2;
403 		c += (l < t);
404 		r[2] = l;
405 		t = a[3];
406 		t = (t + c) & BN_MASK2;
407 		c = (t < c);
408 		l = (t + b[3]) & BN_MASK2;
409 		c += (l < t);
410 		r[3] = l;
411 		a += 4;
412 		b += 4;
413 		r += 4;
414 		n -= 4;
415 	}
416 #endif
417 	while (n) {
418 		t = a[0];
419 		t = (t + c) & BN_MASK2;
420 		c = (t < c);
421 		l = (t + b[0]) & BN_MASK2;
422 		c += (l < t);
423 		r[0] = l;
424 		a++;
425 		b++;
426 		r++;
427 		n--;
428 	}
429 	return ((BN_ULONG)c);
430 }
431 #endif /* !BN_LLONG */
432 
433 BN_ULONG
bn_sub_words(BN_ULONG * r,const BN_ULONG * a,const BN_ULONG * b,int n)434 bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
435 {
436 	BN_ULONG t1, t2;
437 	int c = 0;
438 
439 	assert(n >= 0);
440 	if (n <= 0)
441 		return ((BN_ULONG)0);
442 
443 #ifndef OPENSSL_SMALL_FOOTPRINT
444 	while (n&~3) {
445 		t1 = a[0];
446 		t2 = b[0];
447 		r[0] = (t1 - t2 - c) & BN_MASK2;
448 		if (t1 != t2)
449 			c = (t1 < t2);
450 		t1 = a[1];
451 		t2 = b[1];
452 		r[1] = (t1 - t2 - c) & BN_MASK2;
453 		if (t1 != t2)
454 			c = (t1 < t2);
455 		t1 = a[2];
456 		t2 = b[2];
457 		r[2] = (t1 - t2 - c) & BN_MASK2;
458 		if (t1 != t2)
459 			c = (t1 < t2);
460 		t1 = a[3];
461 		t2 = b[3];
462 		r[3] = (t1 - t2 - c) & BN_MASK2;
463 		if (t1 != t2)
464 			c = (t1 < t2);
465 		a += 4;
466 		b += 4;
467 		r += 4;
468 		n -= 4;
469 	}
470 #endif
471 	while (n) {
472 		t1 = a[0];
473 		t2 = b[0];
474 		r[0] = (t1 - t2 - c) & BN_MASK2;
475 		if (t1 != t2)
476 			c = (t1 < t2);
477 		a++;
478 		b++;
479 		r++;
480 		n--;
481 	}
482 	return (c);
483 }
484 
485 #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
486 
487 #undef bn_mul_comba8
488 #undef bn_mul_comba4
489 #undef bn_sqr_comba8
490 #undef bn_sqr_comba4
491 
492 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
493 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
494 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
495 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
496 
497 #ifdef BN_LLONG
498 /*
499  * Keep in mind that additions to multiplication result can not
500  * overflow, because its high half cannot be all-ones.
501  */
502 #define mul_add_c(a,b,c0,c1,c2)		do {	\
503 	BN_ULONG hi;				\
504 	BN_ULLONG t = (BN_ULLONG)(a)*(b);	\
505 	t += c0;		/* no carry */	\
506 	c0 = (BN_ULONG)Lw(t);			\
507 	hi = (BN_ULONG)Hw(t);			\
508 	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
509 	} while(0)
510 
511 #define mul_add_c2(a,b,c0,c1,c2)	do {	\
512 	BN_ULONG hi;				\
513 	BN_ULLONG t = (BN_ULLONG)(a)*(b);	\
514 	BN_ULLONG tt = t+c0;	/* no carry */	\
515 	c0 = (BN_ULONG)Lw(tt);			\
516 	hi = (BN_ULONG)Hw(tt);			\
517 	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
518 	t += c0;		/* no carry */	\
519 	c0 = (BN_ULONG)Lw(t);			\
520 	hi = (BN_ULONG)Hw(t);			\
521 	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
522 	} while(0)
523 
524 #define sqr_add_c(a,i,c0,c1,c2)		do {	\
525 	BN_ULONG hi;				\
526 	BN_ULLONG t = (BN_ULLONG)a[i]*a[i];	\
527 	t += c0;		/* no carry */	\
528 	c0 = (BN_ULONG)Lw(t);			\
529 	hi = (BN_ULONG)Hw(t);			\
530 	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
531 	} while(0)
532 
533 #define sqr_add_c2(a,i,j,c0,c1,c2) \
534 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
535 
536 #elif defined(BN_UMULT_LOHI)
537 /*
538  * Keep in mind that additions to hi can not overflow, because
539  * the high word of a multiplication result cannot be all-ones.
540  */
541 #define mul_add_c(a,b,c0,c1,c2)		do {	\
542 	BN_ULONG ta = (a), tb = (b);		\
543 	BN_ULONG lo, hi;			\
544 	BN_UMULT_LOHI(lo,hi,ta,tb);		\
545 	c0 += lo; hi += (c0<lo)?1:0;		\
546 	c1 += hi; c2 += (c1<hi)?1:0;		\
547 	} while(0)
548 
549 #define mul_add_c2(a,b,c0,c1,c2)	do {	\
550 	BN_ULONG ta = (a), tb = (b);		\
551 	BN_ULONG lo, hi, tt;			\
552 	BN_UMULT_LOHI(lo,hi,ta,tb);		\
553 	c0 += lo; tt = hi+((c0<lo)?1:0);	\
554 	c1 += tt; c2 += (c1<tt)?1:0;		\
555 	c0 += lo; hi += (c0<lo)?1:0;		\
556 	c1 += hi; c2 += (c1<hi)?1:0;		\
557 	} while(0)
558 
559 #define sqr_add_c(a,i,c0,c1,c2)		do {	\
560 	BN_ULONG ta = (a)[i];			\
561 	BN_ULONG lo, hi;			\
562 	BN_UMULT_LOHI(lo,hi,ta,ta);		\
563 	c0 += lo; hi += (c0<lo)?1:0;		\
564 	c1 += hi; c2 += (c1<hi)?1:0;		\
565 	} while(0)
566 
567 #define sqr_add_c2(a,i,j,c0,c1,c2)	\
568 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
569 
570 #elif defined(BN_UMULT_HIGH)
571 /*
572  * Keep in mind that additions to hi can not overflow, because
573  * the high word of a multiplication result cannot be all-ones.
574  */
575 #define mul_add_c(a,b,c0,c1,c2)		do {	\
576 	BN_ULONG ta = (a), tb = (b);		\
577 	BN_ULONG lo = ta * tb;			\
578 	BN_ULONG hi = BN_UMULT_HIGH(ta,tb);	\
579 	c0 += lo; hi += (c0<lo)?1:0;		\
580 	c1 += hi; c2 += (c1<hi)?1:0;		\
581 	} while(0)
582 
583 #define mul_add_c2(a,b,c0,c1,c2)	do {	\
584 	BN_ULONG ta = (a), tb = (b), tt;	\
585 	BN_ULONG lo = ta * tb;			\
586 	BN_ULONG hi = BN_UMULT_HIGH(ta,tb);	\
587 	c0 += lo; tt = hi + ((c0<lo)?1:0);	\
588 	c1 += tt; c2 += (c1<tt)?1:0;		\
589 	c0 += lo; hi += (c0<lo)?1:0;		\
590 	c1 += hi; c2 += (c1<hi)?1:0;		\
591 	} while(0)
592 
593 #define sqr_add_c(a,i,c0,c1,c2)		do {	\
594 	BN_ULONG ta = (a)[i];			\
595 	BN_ULONG lo = ta * ta;			\
596 	BN_ULONG hi = BN_UMULT_HIGH(ta,ta);	\
597 	c0 += lo; hi += (c0<lo)?1:0;		\
598 	c1 += hi; c2 += (c1<hi)?1:0;		\
599 	} while(0)
600 
601 #define sqr_add_c2(a,i,j,c0,c1,c2)	\
602 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
603 
604 #else /* !BN_LLONG */
605 /*
606  * Keep in mind that additions to hi can not overflow, because
607  * the high word of a multiplication result cannot be all-ones.
608  */
609 #define mul_add_c(a,b,c0,c1,c2)		do {	\
610 	BN_ULONG lo = LBITS(a), hi = HBITS(a);	\
611 	BN_ULONG bl = LBITS(b), bh = HBITS(b);	\
612 	mul64(lo,hi,bl,bh);			\
613 	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++;	\
614 	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
615 	} while(0)
616 
617 #define mul_add_c2(a,b,c0,c1,c2)	do {	\
618 	BN_ULONG tt;				\
619 	BN_ULONG lo = LBITS(a), hi = HBITS(a);	\
620 	BN_ULONG bl = LBITS(b), bh = HBITS(b);	\
621 	mul64(lo,hi,bl,bh);			\
622 	tt = hi;				\
623 	c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++;	\
624 	c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++;	\
625 	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++;	\
626 	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
627 	} while(0)
628 
629 #define sqr_add_c(a,i,c0,c1,c2)		do {	\
630 	BN_ULONG lo, hi;			\
631 	sqr64(lo,hi,(a)[i]);			\
632 	c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++;	\
633 	c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++;	\
634 	} while(0)
635 
636 #define sqr_add_c2(a,i,j,c0,c1,c2) \
637 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
638 #endif /* !BN_LLONG */
639 
640 void
bn_mul_comba8(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)641 bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
642 {
643 	BN_ULONG c1, c2, c3;
644 
645 	c1 = 0;
646 	c2 = 0;
647 	c3 = 0;
648 	mul_add_c(a[0], b[0], c1, c2, c3);
649 	r[0] = c1;
650 	c1 = 0;
651 	mul_add_c(a[0], b[1], c2, c3, c1);
652 	mul_add_c(a[1], b[0], c2, c3, c1);
653 	r[1] = c2;
654 	c2 = 0;
655 	mul_add_c(a[2], b[0], c3, c1, c2);
656 	mul_add_c(a[1], b[1], c3, c1, c2);
657 	mul_add_c(a[0], b[2], c3, c1, c2);
658 	r[2] = c3;
659 	c3 = 0;
660 	mul_add_c(a[0], b[3], c1, c2, c3);
661 	mul_add_c(a[1], b[2], c1, c2, c3);
662 	mul_add_c(a[2], b[1], c1, c2, c3);
663 	mul_add_c(a[3], b[0], c1, c2, c3);
664 	r[3] = c1;
665 	c1 = 0;
666 	mul_add_c(a[4], b[0], c2, c3, c1);
667 	mul_add_c(a[3], b[1], c2, c3, c1);
668 	mul_add_c(a[2], b[2], c2, c3, c1);
669 	mul_add_c(a[1], b[3], c2, c3, c1);
670 	mul_add_c(a[0], b[4], c2, c3, c1);
671 	r[4] = c2;
672 	c2 = 0;
673 	mul_add_c(a[0], b[5], c3, c1, c2);
674 	mul_add_c(a[1], b[4], c3, c1, c2);
675 	mul_add_c(a[2], b[3], c3, c1, c2);
676 	mul_add_c(a[3], b[2], c3, c1, c2);
677 	mul_add_c(a[4], b[1], c3, c1, c2);
678 	mul_add_c(a[5], b[0], c3, c1, c2);
679 	r[5] = c3;
680 	c3 = 0;
681 	mul_add_c(a[6], b[0], c1, c2, c3);
682 	mul_add_c(a[5], b[1], c1, c2, c3);
683 	mul_add_c(a[4], b[2], c1, c2, c3);
684 	mul_add_c(a[3], b[3], c1, c2, c3);
685 	mul_add_c(a[2], b[4], c1, c2, c3);
686 	mul_add_c(a[1], b[5], c1, c2, c3);
687 	mul_add_c(a[0], b[6], c1, c2, c3);
688 	r[6] = c1;
689 	c1 = 0;
690 	mul_add_c(a[0], b[7], c2, c3, c1);
691 	mul_add_c(a[1], b[6], c2, c3, c1);
692 	mul_add_c(a[2], b[5], c2, c3, c1);
693 	mul_add_c(a[3], b[4], c2, c3, c1);
694 	mul_add_c(a[4], b[3], c2, c3, c1);
695 	mul_add_c(a[5], b[2], c2, c3, c1);
696 	mul_add_c(a[6], b[1], c2, c3, c1);
697 	mul_add_c(a[7], b[0], c2, c3, c1);
698 	r[7] = c2;
699 	c2 = 0;
700 	mul_add_c(a[7], b[1], c3, c1, c2);
701 	mul_add_c(a[6], b[2], c3, c1, c2);
702 	mul_add_c(a[5], b[3], c3, c1, c2);
703 	mul_add_c(a[4], b[4], c3, c1, c2);
704 	mul_add_c(a[3], b[5], c3, c1, c2);
705 	mul_add_c(a[2], b[6], c3, c1, c2);
706 	mul_add_c(a[1], b[7], c3, c1, c2);
707 	r[8] = c3;
708 	c3 = 0;
709 	mul_add_c(a[2], b[7], c1, c2, c3);
710 	mul_add_c(a[3], b[6], c1, c2, c3);
711 	mul_add_c(a[4], b[5], c1, c2, c3);
712 	mul_add_c(a[5], b[4], c1, c2, c3);
713 	mul_add_c(a[6], b[3], c1, c2, c3);
714 	mul_add_c(a[7], b[2], c1, c2, c3);
715 	r[9] = c1;
716 	c1 = 0;
717 	mul_add_c(a[7], b[3], c2, c3, c1);
718 	mul_add_c(a[6], b[4], c2, c3, c1);
719 	mul_add_c(a[5], b[5], c2, c3, c1);
720 	mul_add_c(a[4], b[6], c2, c3, c1);
721 	mul_add_c(a[3], b[7], c2, c3, c1);
722 	r[10] = c2;
723 	c2 = 0;
724 	mul_add_c(a[4], b[7], c3, c1, c2);
725 	mul_add_c(a[5], b[6], c3, c1, c2);
726 	mul_add_c(a[6], b[5], c3, c1, c2);
727 	mul_add_c(a[7], b[4], c3, c1, c2);
728 	r[11] = c3;
729 	c3 = 0;
730 	mul_add_c(a[7], b[5], c1, c2, c3);
731 	mul_add_c(a[6], b[6], c1, c2, c3);
732 	mul_add_c(a[5], b[7], c1, c2, c3);
733 	r[12] = c1;
734 	c1 = 0;
735 	mul_add_c(a[6], b[7], c2, c3, c1);
736 	mul_add_c(a[7], b[6], c2, c3, c1);
737 	r[13] = c2;
738 	c2 = 0;
739 	mul_add_c(a[7], b[7], c3, c1, c2);
740 	r[14] = c3;
741 	r[15] = c1;
742 }
743 
744 void
bn_mul_comba4(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)745 bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
746 {
747 	BN_ULONG c1, c2, c3;
748 
749 	c1 = 0;
750 	c2 = 0;
751 	c3 = 0;
752 	mul_add_c(a[0], b[0], c1, c2, c3);
753 	r[0] = c1;
754 	c1 = 0;
755 	mul_add_c(a[0], b[1], c2, c3, c1);
756 	mul_add_c(a[1], b[0], c2, c3, c1);
757 	r[1] = c2;
758 	c2 = 0;
759 	mul_add_c(a[2], b[0], c3, c1, c2);
760 	mul_add_c(a[1], b[1], c3, c1, c2);
761 	mul_add_c(a[0], b[2], c3, c1, c2);
762 	r[2] = c3;
763 	c3 = 0;
764 	mul_add_c(a[0], b[3], c1, c2, c3);
765 	mul_add_c(a[1], b[2], c1, c2, c3);
766 	mul_add_c(a[2], b[1], c1, c2, c3);
767 	mul_add_c(a[3], b[0], c1, c2, c3);
768 	r[3] = c1;
769 	c1 = 0;
770 	mul_add_c(a[3], b[1], c2, c3, c1);
771 	mul_add_c(a[2], b[2], c2, c3, c1);
772 	mul_add_c(a[1], b[3], c2, c3, c1);
773 	r[4] = c2;
774 	c2 = 0;
775 	mul_add_c(a[2], b[3], c3, c1, c2);
776 	mul_add_c(a[3], b[2], c3, c1, c2);
777 	r[5] = c3;
778 	c3 = 0;
779 	mul_add_c(a[3], b[3], c1, c2, c3);
780 	r[6] = c1;
781 	r[7] = c2;
782 }
783 
784 void
bn_sqr_comba8(BN_ULONG * r,const BN_ULONG * a)785 bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
786 {
787 	BN_ULONG c1, c2, c3;
788 
789 	c1 = 0;
790 	c2 = 0;
791 	c3 = 0;
792 	sqr_add_c(a, 0, c1, c2, c3);
793 	r[0] = c1;
794 	c1 = 0;
795 	sqr_add_c2(a, 1, 0, c2, c3, c1);
796 	r[1] = c2;
797 	c2 = 0;
798 	sqr_add_c(a, 1, c3, c1, c2);
799 	sqr_add_c2(a, 2, 0, c3, c1, c2);
800 	r[2] = c3;
801 	c3 = 0;
802 	sqr_add_c2(a, 3, 0, c1, c2, c3);
803 	sqr_add_c2(a, 2, 1, c1, c2, c3);
804 	r[3] = c1;
805 	c1 = 0;
806 	sqr_add_c(a, 2, c2, c3, c1);
807 	sqr_add_c2(a, 3, 1, c2, c3, c1);
808 	sqr_add_c2(a, 4, 0, c2, c3, c1);
809 	r[4] = c2;
810 	c2 = 0;
811 	sqr_add_c2(a, 5, 0, c3, c1, c2);
812 	sqr_add_c2(a, 4, 1, c3, c1, c2);
813 	sqr_add_c2(a, 3, 2, c3, c1, c2);
814 	r[5] = c3;
815 	c3 = 0;
816 	sqr_add_c(a, 3, c1, c2, c3);
817 	sqr_add_c2(a, 4, 2, c1, c2, c3);
818 	sqr_add_c2(a, 5, 1, c1, c2, c3);
819 	sqr_add_c2(a, 6, 0, c1, c2, c3);
820 	r[6] = c1;
821 	c1 = 0;
822 	sqr_add_c2(a, 7, 0, c2, c3, c1);
823 	sqr_add_c2(a, 6, 1, c2, c3, c1);
824 	sqr_add_c2(a, 5, 2, c2, c3, c1);
825 	sqr_add_c2(a, 4, 3, c2, c3, c1);
826 	r[7] = c2;
827 	c2 = 0;
828 	sqr_add_c(a, 4, c3, c1, c2);
829 	sqr_add_c2(a, 5, 3, c3, c1, c2);
830 	sqr_add_c2(a, 6, 2, c3, c1, c2);
831 	sqr_add_c2(a, 7, 1, c3, c1, c2);
832 	r[8] = c3;
833 	c3 = 0;
834 	sqr_add_c2(a, 7, 2, c1, c2, c3);
835 	sqr_add_c2(a, 6, 3, c1, c2, c3);
836 	sqr_add_c2(a, 5, 4, c1, c2, c3);
837 	r[9] = c1;
838 	c1 = 0;
839 	sqr_add_c(a, 5, c2, c3, c1);
840 	sqr_add_c2(a, 6, 4, c2, c3, c1);
841 	sqr_add_c2(a, 7, 3, c2, c3, c1);
842 	r[10] = c2;
843 	c2 = 0;
844 	sqr_add_c2(a, 7, 4, c3, c1, c2);
845 	sqr_add_c2(a, 6, 5, c3, c1, c2);
846 	r[11] = c3;
847 	c3 = 0;
848 	sqr_add_c(a, 6, c1, c2, c3);
849 	sqr_add_c2(a, 7, 5, c1, c2, c3);
850 	r[12] = c1;
851 	c1 = 0;
852 	sqr_add_c2(a, 7, 6, c2, c3, c1);
853 	r[13] = c2;
854 	c2 = 0;
855 	sqr_add_c(a, 7, c3, c1, c2);
856 	r[14] = c3;
857 	r[15] = c1;
858 }
859 
860 void
bn_sqr_comba4(BN_ULONG * r,const BN_ULONG * a)861 bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
862 {
863 	BN_ULONG c1, c2, c3;
864 
865 	c1 = 0;
866 	c2 = 0;
867 	c3 = 0;
868 	sqr_add_c(a, 0, c1, c2, c3);
869 	r[0] = c1;
870 	c1 = 0;
871 	sqr_add_c2(a, 1, 0, c2, c3, c1);
872 	r[1] = c2;
873 	c2 = 0;
874 	sqr_add_c(a, 1, c3, c1, c2);
875 	sqr_add_c2(a, 2, 0, c3, c1, c2);
876 	r[2] = c3;
877 	c3 = 0;
878 	sqr_add_c2(a, 3, 0, c1, c2, c3);
879 	sqr_add_c2(a, 2, 1, c1, c2, c3);
880 	r[3] = c1;
881 	c1 = 0;
882 	sqr_add_c(a, 2, c2, c3, c1);
883 	sqr_add_c2(a, 3, 1, c2, c3, c1);
884 	r[4] = c2;
885 	c2 = 0;
886 	sqr_add_c2(a, 3, 2, c3, c1, c2);
887 	r[5] = c3;
888 	c3 = 0;
889 	sqr_add_c(a, 3, c1, c2, c3);
890 	r[6] = c1;
891 	r[7] = c2;
892 }
893 
894 #ifdef OPENSSL_NO_ASM
895 #ifdef OPENSSL_BN_ASM_MONT
896 /*
897  * This is essentially reference implementation, which may or may not
898  * result in performance improvement. E.g. on IA-32 this routine was
899  * observed to give 40% faster rsa1024 private key operations and 10%
900  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
901  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
902  * reference implementation, one to be used as starting point for
903  * platform-specific assembler. Mentioned numbers apply to compiler
904  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
905  * can vary not only from platform to platform, but even for compiler
906  * versions. Assembler vs. assembler improvement coefficients can
907  * [and are known to] differ and are to be documented elsewhere.
908  */
909 int
bn_mul_mont(BN_ULONG * rp,const BN_ULONG * ap,const BN_ULONG * bp,const BN_ULONG * np,const BN_ULONG * n0p,int num)910 bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0p, int num)
911 {
912 	BN_ULONG c0, c1, ml, *tp, n0;
913 #ifdef mul64
914 	BN_ULONG mh;
915 #endif
916 	int i = 0, j;
917 
918 #if 0	/* template for platform-specific implementation */
919 	if (ap == bp)
920 		return bn_sqr_mont(rp, ap, np, n0p, num);
921 #endif
922 	tp = reallocarray(NULL, num + 2, sizeof(BN_ULONG));
923 	if (tp == NULL)
924 		return 0;
925 
926 	n0 = *n0p;
927 
928 	c0 = 0;
929 	ml = bp[0];
930 #ifdef mul64
931 	mh = HBITS(ml);
932 	ml = LBITS(ml);
933 	for (j = 0; j < num; ++j)
934 		mul(tp[j], ap[j], ml, mh, c0);
935 #else
936 	for (j = 0; j < num; ++j)
937 		mul(tp[j], ap[j], ml, c0);
938 #endif
939 
940 	tp[num] = c0;
941 	tp[num + 1] = 0;
942 	goto enter;
943 
944 	for (i = 0; i < num; i++) {
945 		c0 = 0;
946 		ml = bp[i];
947 #ifdef mul64
948 		mh = HBITS(ml);
949 		ml = LBITS(ml);
950 		for (j = 0; j < num; ++j)
951 			mul_add(tp[j], ap[j], ml, mh, c0);
952 #else
953 		for (j = 0; j < num; ++j)
954 			mul_add(tp[j], ap[j], ml, c0);
955 #endif
956 		c1 = (tp[num] + c0) & BN_MASK2;
957 		tp[num] = c1;
958 		tp[num + 1] = (c1 < c0 ? 1 : 0);
959 enter:
960 		c1 = tp[0];
961 		ml = (c1 * n0) & BN_MASK2;
962 		c0 = 0;
963 #ifdef mul64
964 		mh = HBITS(ml);
965 		ml = LBITS(ml);
966 		mul_add(c1, np[0], ml, mh, c0);
967 #else
968 		mul_add(c1, ml, np[0], c0);
969 #endif
970 		for (j = 1; j < num; j++) {
971 			c1 = tp[j];
972 #ifdef mul64
973 			mul_add(c1, np[j], ml, mh, c0);
974 #else
975 			mul_add(c1, ml, np[j], c0);
976 #endif
977 			tp[j - 1] = c1 & BN_MASK2;
978 		}
979 		c1 = (tp[num] + c0) & BN_MASK2;
980 		tp[num - 1] = c1;
981 		tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
982 	}
983 
984 	if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
985 		c0 = bn_sub_words(rp, tp, np, num);
986 		if (tp[num] != 0 || c0 == 0) {
987 			goto out;
988 		}
989 	}
990 	memcpy(rp, tp, num * sizeof(BN_ULONG));
991 out:
992 	freezero(tp, (num + 2) * sizeof(BN_ULONG));
993 	return 1;
994 }
995 #else
996 /*
997  * Return value of 0 indicates that multiplication/convolution was not
998  * performed to signal the caller to fall down to alternative/original
999  * code-path.
1000  */
bn_mul_mont(BN_ULONG * rp,const BN_ULONG * ap,const BN_ULONG * bp,const BN_ULONG * np,const BN_ULONG * n0,int num)1001 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num)
1002 	{	return 0;
1003 }
1004 #endif /* OPENSSL_BN_ASM_MONT */
1005 #endif
1006 
1007 #else /* !BN_MUL_COMBA */
1008 
1009 /* hmm... is it faster just to do a multiply? */
1010 #undef bn_sqr_comba4
1011 void
bn_sqr_comba4(BN_ULONG * r,const BN_ULONG * a)1012 bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
1013 {
1014 	BN_ULONG t[8];
1015 	bn_sqr_normal(r, a, 4, t);
1016 }
1017 
1018 #undef bn_sqr_comba8
1019 void
bn_sqr_comba8(BN_ULONG * r,const BN_ULONG * a)1020 bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
1021 {
1022 	BN_ULONG t[16];
1023 	bn_sqr_normal(r, a, 8, t);
1024 }
1025 
1026 void
bn_mul_comba4(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)1027 bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1028 {
1029 	r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
1030 	r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
1031 	r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
1032 	r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
1033 }
1034 
1035 void
bn_mul_comba8(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b)1036 bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1037 {
1038 	r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
1039 	r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
1040 	r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
1041 	r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
1042 	r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
1043 	r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
1044 	r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
1045 	r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
1046 }
1047 
1048 #ifdef OPENSSL_NO_ASM
1049 #ifdef OPENSSL_BN_ASM_MONT
1050 int
bn_mul_mont(BN_ULONG * rp,const BN_ULONG * ap,const BN_ULONG * bp,const BN_ULONG * np,const BN_ULONG * n0p,int num)1051 bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1052     const BN_ULONG *np, const BN_ULONG *n0p, int num)
1053 {
1054 	BN_ULONG c0, c1, *tp, n0 = *n0p;
1055 	int i = 0, j;
1056 
1057 	tp = calloc(NULL, num + 2, sizeof(BN_ULONG));
1058 	if (tp == NULL)
1059 		return 0;
1060 
1061 	for (i = 0; i < num; i++) {
1062 		c0 = bn_mul_add_words(tp, ap, num, bp[i]);
1063 		c1 = (tp[num] + c0) & BN_MASK2;
1064 		tp[num] = c1;
1065 		tp[num + 1] = (c1 < c0 ? 1 : 0);
1066 
1067 		c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
1068 		c1 = (tp[num] + c0) & BN_MASK2;
1069 		tp[num] = c1;
1070 		tp[num + 1] += (c1 < c0 ? 1 : 0);
1071 		for (j = 0; j <= num; j++)
1072 			tp[j] = tp[j + 1];
1073 	}
1074 
1075 	if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1076 		c0 = bn_sub_words(rp, tp, np, num);
1077 		if (tp[num] != 0 || c0 == 0) {
1078 			goto out;
1079 		}
1080 	}
1081 	memcpy(rp, tp, num * sizeof(BN_ULONG));
1082 out:
1083 	freezero(tp, (num + 2) * sizeof(BN_ULONG));
1084 	return 1;
1085 }
1086 #else
1087 int
bn_mul_mont(BN_ULONG * rp,const BN_ULONG * ap,const BN_ULONG * bp,const BN_ULONG * np,const BN_ULONG * n0,int num)1088 bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1089     const BN_ULONG *np, const BN_ULONG *n0, int num)
1090 {
1091 	return 0;
1092 }
1093 #endif /* OPENSSL_BN_ASM_MONT */
1094 #endif
1095 
1096 #endif /* !BN_MUL_COMBA */
1097