xref: /freebsd/crypto/openssl/crypto/bn/bn_asm.c (revision 17f01e99)
1e71b7053SJung-uk Kim /*
2e71b7053SJung-uk Kim  * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
374664626SKris Kennaway  *
4e71b7053SJung-uk Kim  * Licensed under the OpenSSL license (the "License").  You may not use
5e71b7053SJung-uk Kim  * this file except in compliance with the License.  You can obtain a copy
6e71b7053SJung-uk Kim  * in the file LICENSE in the source distribution or at
7e71b7053SJung-uk Kim  * https://www.openssl.org/source/license.html
874664626SKris Kennaway  */
974664626SKris Kennaway 
10f579bf8eSKris Kennaway #include <assert.h>
11e71b7053SJung-uk Kim #include <openssl/crypto.h>
12e71b7053SJung-uk Kim #include "internal/cryptlib.h"
1317f01e99SJung-uk Kim #include "bn_local.h"
1474664626SKris Kennaway 
15f579bf8eSKris Kennaway #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
1674664626SKris Kennaway 
176f9291ceSJung-uk Kim BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
186f9291ceSJung-uk Kim                           BN_ULONG w)
1974664626SKris Kennaway {
2074664626SKris Kennaway     BN_ULONG c1 = 0;
2174664626SKris Kennaway 
22f579bf8eSKris Kennaway     assert(num >= 0);
236f9291ceSJung-uk Kim     if (num <= 0)
24e71b7053SJung-uk Kim         return c1;
2574664626SKris Kennaway 
261f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT
276f9291ceSJung-uk Kim     while (num & ~3) {
2874664626SKris Kennaway         mul_add(rp[0], ap[0], w, c1);
2974664626SKris Kennaway         mul_add(rp[1], ap[1], w, c1);
3074664626SKris Kennaway         mul_add(rp[2], ap[2], w, c1);
3174664626SKris Kennaway         mul_add(rp[3], ap[3], w, c1);
326f9291ceSJung-uk Kim         ap += 4;
336f9291ceSJung-uk Kim         rp += 4;
346f9291ceSJung-uk Kim         num -= 4;
35f579bf8eSKris Kennaway     }
361f13597dSJung-uk Kim # endif
376f9291ceSJung-uk Kim     while (num) {
381f13597dSJung-uk Kim         mul_add(rp[0], ap[0], w, c1);
396f9291ceSJung-uk Kim         ap++;
406f9291ceSJung-uk Kim         rp++;
416f9291ceSJung-uk Kim         num--;
4274664626SKris Kennaway     }
4374664626SKris Kennaway 
44e71b7053SJung-uk Kim     return c1;
4574664626SKris Kennaway }
4674664626SKris Kennaway 
475c87c606SMark Murray BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
4874664626SKris Kennaway {
4974664626SKris Kennaway     BN_ULONG c1 = 0;
5074664626SKris Kennaway 
51f579bf8eSKris Kennaway     assert(num >= 0);
526f9291ceSJung-uk Kim     if (num <= 0)
53e71b7053SJung-uk Kim         return c1;
5474664626SKris Kennaway 
551f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT
566f9291ceSJung-uk Kim     while (num & ~3) {
5774664626SKris Kennaway         mul(rp[0], ap[0], w, c1);
5874664626SKris Kennaway         mul(rp[1], ap[1], w, c1);
5974664626SKris Kennaway         mul(rp[2], ap[2], w, c1);
6074664626SKris Kennaway         mul(rp[3], ap[3], w, c1);
616f9291ceSJung-uk Kim         ap += 4;
626f9291ceSJung-uk Kim         rp += 4;
636f9291ceSJung-uk Kim         num -= 4;
64f579bf8eSKris Kennaway     }
651f13597dSJung-uk Kim # endif
666f9291ceSJung-uk Kim     while (num) {
671f13597dSJung-uk Kim         mul(rp[0], ap[0], w, c1);
686f9291ceSJung-uk Kim         ap++;
696f9291ceSJung-uk Kim         rp++;
706f9291ceSJung-uk Kim         num--;
7174664626SKris Kennaway     }
72e71b7053SJung-uk Kim     return c1;
7374664626SKris Kennaway }
7474664626SKris Kennaway 
755c87c606SMark Murray void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
7674664626SKris Kennaway {
77f579bf8eSKris Kennaway     assert(n >= 0);
786f9291ceSJung-uk Kim     if (n <= 0)
796f9291ceSJung-uk Kim         return;
801f13597dSJung-uk Kim 
811f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT
826f9291ceSJung-uk Kim     while (n & ~3) {
83f579bf8eSKris Kennaway         sqr(r[0], r[1], a[0]);
84f579bf8eSKris Kennaway         sqr(r[2], r[3], a[1]);
85f579bf8eSKris Kennaway         sqr(r[4], r[5], a[2]);
86f579bf8eSKris Kennaway         sqr(r[6], r[7], a[3]);
876f9291ceSJung-uk Kim         a += 4;
886f9291ceSJung-uk Kim         r += 8;
896f9291ceSJung-uk Kim         n -= 4;
90f579bf8eSKris Kennaway     }
911f13597dSJung-uk Kim # endif
926f9291ceSJung-uk Kim     while (n) {
931f13597dSJung-uk Kim         sqr(r[0], r[1], a[0]);
946f9291ceSJung-uk Kim         a++;
956f9291ceSJung-uk Kim         r += 2;
966f9291ceSJung-uk Kim         n--;
9774664626SKris Kennaway     }
9874664626SKris Kennaway }
9974664626SKris Kennaway 
1006f9291ceSJung-uk Kim #else                           /* !(defined(BN_LLONG) ||
1016f9291ceSJung-uk Kim                                  * defined(BN_UMULT_HIGH)) */
10274664626SKris Kennaway 
1036f9291ceSJung-uk Kim BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
1046f9291ceSJung-uk Kim                           BN_ULONG w)
10574664626SKris Kennaway {
10674664626SKris Kennaway     BN_ULONG c = 0;
10774664626SKris Kennaway     BN_ULONG bl, bh;
10874664626SKris Kennaway 
109f579bf8eSKris Kennaway     assert(num >= 0);
1106f9291ceSJung-uk Kim     if (num <= 0)
111e71b7053SJung-uk Kim         return (BN_ULONG)0;
11274664626SKris Kennaway 
11374664626SKris Kennaway     bl = LBITS(w);
11474664626SKris Kennaway     bh = HBITS(w);
11574664626SKris Kennaway 
1161f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT
1176f9291ceSJung-uk Kim     while (num & ~3) {
11874664626SKris Kennaway         mul_add(rp[0], ap[0], bl, bh, c);
11974664626SKris Kennaway         mul_add(rp[1], ap[1], bl, bh, c);
12074664626SKris Kennaway         mul_add(rp[2], ap[2], bl, bh, c);
12174664626SKris Kennaway         mul_add(rp[3], ap[3], bl, bh, c);
1226f9291ceSJung-uk Kim         ap += 4;
1236f9291ceSJung-uk Kim         rp += 4;
1246f9291ceSJung-uk Kim         num -= 4;
1251f13597dSJung-uk Kim     }
1261f13597dSJung-uk Kim # endif
1276f9291ceSJung-uk Kim     while (num) {
1281f13597dSJung-uk Kim         mul_add(rp[0], ap[0], bl, bh, c);
1296f9291ceSJung-uk Kim         ap++;
1306f9291ceSJung-uk Kim         rp++;
1316f9291ceSJung-uk Kim         num--;
13274664626SKris Kennaway     }
133e71b7053SJung-uk Kim     return c;
13474664626SKris Kennaway }
13574664626SKris Kennaway 
1365c87c606SMark Murray BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
13774664626SKris Kennaway {
13874664626SKris Kennaway     BN_ULONG carry = 0;
13974664626SKris Kennaway     BN_ULONG bl, bh;
14074664626SKris Kennaway 
141f579bf8eSKris Kennaway     assert(num >= 0);
1426f9291ceSJung-uk Kim     if (num <= 0)
143e71b7053SJung-uk Kim         return (BN_ULONG)0;
14474664626SKris Kennaway 
14574664626SKris Kennaway     bl = LBITS(w);
14674664626SKris Kennaway     bh = HBITS(w);
14774664626SKris Kennaway 
1481f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT
1496f9291ceSJung-uk Kim     while (num & ~3) {
15074664626SKris Kennaway         mul(rp[0], ap[0], bl, bh, carry);
15174664626SKris Kennaway         mul(rp[1], ap[1], bl, bh, carry);
15274664626SKris Kennaway         mul(rp[2], ap[2], bl, bh, carry);
15374664626SKris Kennaway         mul(rp[3], ap[3], bl, bh, carry);
1546f9291ceSJung-uk Kim         ap += 4;
1556f9291ceSJung-uk Kim         rp += 4;
1566f9291ceSJung-uk Kim         num -= 4;
1571f13597dSJung-uk Kim     }
1581f13597dSJung-uk Kim # endif
1596f9291ceSJung-uk Kim     while (num) {
1601f13597dSJung-uk Kim         mul(rp[0], ap[0], bl, bh, carry);
1616f9291ceSJung-uk Kim         ap++;
1626f9291ceSJung-uk Kim         rp++;
1636f9291ceSJung-uk Kim         num--;
16474664626SKris Kennaway     }
165e71b7053SJung-uk Kim     return carry;
16674664626SKris Kennaway }
16774664626SKris Kennaway 
1685c87c606SMark Murray void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
16974664626SKris Kennaway {
170f579bf8eSKris Kennaway     assert(n >= 0);
1716f9291ceSJung-uk Kim     if (n <= 0)
1726f9291ceSJung-uk Kim         return;
1731f13597dSJung-uk Kim 
1741f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT
1756f9291ceSJung-uk Kim     while (n & ~3) {
17674664626SKris Kennaway         sqr64(r[0], r[1], a[0]);
17774664626SKris Kennaway         sqr64(r[2], r[3], a[1]);
17874664626SKris Kennaway         sqr64(r[4], r[5], a[2]);
17974664626SKris Kennaway         sqr64(r[6], r[7], a[3]);
1806f9291ceSJung-uk Kim         a += 4;
1816f9291ceSJung-uk Kim         r += 8;
1826f9291ceSJung-uk Kim         n -= 4;
1831f13597dSJung-uk Kim     }
1841f13597dSJung-uk Kim # endif
1856f9291ceSJung-uk Kim     while (n) {
1861f13597dSJung-uk Kim         sqr64(r[0], r[1], a[0]);
1876f9291ceSJung-uk Kim         a++;
1886f9291ceSJung-uk Kim         r += 2;
1896f9291ceSJung-uk Kim         n--;
19074664626SKris Kennaway     }
19174664626SKris Kennaway }
19274664626SKris Kennaway 
1936f9291ceSJung-uk Kim #endif                          /* !(defined(BN_LLONG) ||
1946f9291ceSJung-uk Kim                                  * defined(BN_UMULT_HIGH)) */
19574664626SKris Kennaway 
19674664626SKris Kennaway #if defined(BN_LLONG) && defined(BN_DIV2W)
19774664626SKris Kennaway 
19874664626SKris Kennaway BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
19974664626SKris Kennaway {
20074664626SKris Kennaway     return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
20174664626SKris Kennaway }
20274664626SKris Kennaway 
20374664626SKris Kennaway #else
20474664626SKris Kennaway 
205ddd58736SKris Kennaway /* Divide h,l by d and return the result. */
20674664626SKris Kennaway /* I need to test this some more :-( */
20774664626SKris Kennaway BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
20874664626SKris Kennaway {
20974664626SKris Kennaway     BN_ULONG dh, dl, q, ret = 0, th, tl, t;
21074664626SKris Kennaway     int i, count = 2;
21174664626SKris Kennaway 
2126f9291ceSJung-uk Kim     if (d == 0)
213e71b7053SJung-uk Kim         return BN_MASK2;
21474664626SKris Kennaway 
21574664626SKris Kennaway     i = BN_num_bits_word(d);
2163b4e3dcbSSimon L. B. Nielsen     assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
217ddd58736SKris Kennaway 
21874664626SKris Kennaway     i = BN_BITS2 - i;
2196f9291ceSJung-uk Kim     if (h >= d)
2206f9291ceSJung-uk Kim         h -= d;
22174664626SKris Kennaway 
2226f9291ceSJung-uk Kim     if (i) {
22374664626SKris Kennaway         d <<= i;
22474664626SKris Kennaway         h = (h << i) | (l >> (BN_BITS2 - i));
22574664626SKris Kennaway         l <<= i;
22674664626SKris Kennaway     }
22774664626SKris Kennaway     dh = (d & BN_MASK2h) >> BN_BITS4;
22874664626SKris Kennaway     dl = (d & BN_MASK2l);
2296f9291ceSJung-uk Kim     for (;;) {
23074664626SKris Kennaway         if ((h >> BN_BITS4) == dh)
23174664626SKris Kennaway             q = BN_MASK2l;
23274664626SKris Kennaway         else
23374664626SKris Kennaway             q = h / dh;
23474664626SKris Kennaway 
23574664626SKris Kennaway         th = q * dh;
23674664626SKris Kennaway         tl = dl * q;
2376f9291ceSJung-uk Kim         for (;;) {
23874664626SKris Kennaway             t = h - th;
23974664626SKris Kennaway             if ((t & BN_MASK2h) ||
2406f9291ceSJung-uk Kim                 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
24174664626SKris Kennaway                 break;
24274664626SKris Kennaway             q--;
24374664626SKris Kennaway             th -= dh;
24474664626SKris Kennaway             tl -= dl;
24574664626SKris Kennaway         }
24674664626SKris Kennaway         t = (tl >> BN_BITS4);
24774664626SKris Kennaway         tl = (tl << BN_BITS4) & BN_MASK2h;
24874664626SKris Kennaway         th += t;
24974664626SKris Kennaway 
2506f9291ceSJung-uk Kim         if (l < tl)
2516f9291ceSJung-uk Kim             th++;
25274664626SKris Kennaway         l -= tl;
2536f9291ceSJung-uk Kim         if (h < th) {
25474664626SKris Kennaway             h += d;
25574664626SKris Kennaway             q--;
25674664626SKris Kennaway         }
25774664626SKris Kennaway         h -= th;
25874664626SKris Kennaway 
2596f9291ceSJung-uk Kim         if (--count == 0)
2606f9291ceSJung-uk Kim             break;
26174664626SKris Kennaway 
26274664626SKris Kennaway         ret = q << BN_BITS4;
26374664626SKris Kennaway         h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
26474664626SKris Kennaway         l = (l & BN_MASK2l) << BN_BITS4;
26574664626SKris Kennaway     }
26674664626SKris Kennaway     ret |= q;
267e71b7053SJung-uk Kim     return ret;
26874664626SKris Kennaway }
#endif                          /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
27074664626SKris Kennaway 
27174664626SKris Kennaway #ifdef BN_LLONG
2726f9291ceSJung-uk Kim BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
2736f9291ceSJung-uk Kim                       int n)
27474664626SKris Kennaway {
27574664626SKris Kennaway     BN_ULLONG ll = 0;
27674664626SKris Kennaway 
277f579bf8eSKris Kennaway     assert(n >= 0);
2786f9291ceSJung-uk Kim     if (n <= 0)
279e71b7053SJung-uk Kim         return (BN_ULONG)0;
28074664626SKris Kennaway 
2811f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT
2826f9291ceSJung-uk Kim     while (n & ~3) {
28374664626SKris Kennaway         ll += (BN_ULLONG) a[0] + b[0];
28474664626SKris Kennaway         r[0] = (BN_ULONG)ll & BN_MASK2;
28574664626SKris Kennaway         ll >>= BN_BITS2;
28674664626SKris Kennaway         ll += (BN_ULLONG) a[1] + b[1];
28774664626SKris Kennaway         r[1] = (BN_ULONG)ll & BN_MASK2;
28874664626SKris Kennaway         ll >>= BN_BITS2;
28974664626SKris Kennaway         ll += (BN_ULLONG) a[2] + b[2];
29074664626SKris Kennaway         r[2] = (BN_ULONG)ll & BN_MASK2;
29174664626SKris Kennaway         ll >>= BN_BITS2;
29274664626SKris Kennaway         ll += (BN_ULLONG) a[3] + b[3];
29374664626SKris Kennaway         r[3] = (BN_ULONG)ll & BN_MASK2;
29474664626SKris Kennaway         ll >>= BN_BITS2;
2956f9291ceSJung-uk Kim         a += 4;
2966f9291ceSJung-uk Kim         b += 4;
2976f9291ceSJung-uk Kim         r += 4;
2986f9291ceSJung-uk Kim         n -= 4;
2991f13597dSJung-uk Kim     }
3001f13597dSJung-uk Kim # endif
3016f9291ceSJung-uk Kim     while (n) {
3021f13597dSJung-uk Kim         ll += (BN_ULLONG) a[0] + b[0];
3031f13597dSJung-uk Kim         r[0] = (BN_ULONG)ll & BN_MASK2;
3041f13597dSJung-uk Kim         ll >>= BN_BITS2;
3056f9291ceSJung-uk Kim         a++;
3066f9291ceSJung-uk Kim         b++;
3076f9291ceSJung-uk Kim         r++;
3086f9291ceSJung-uk Kim         n--;
30974664626SKris Kennaway     }
310e71b7053SJung-uk Kim     return (BN_ULONG)ll;
31174664626SKris Kennaway }
312f579bf8eSKris Kennaway #else                           /* !BN_LLONG */
3136f9291ceSJung-uk Kim BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
3146f9291ceSJung-uk Kim                       int n)
31574664626SKris Kennaway {
31674664626SKris Kennaway     BN_ULONG c, l, t;
31774664626SKris Kennaway 
318f579bf8eSKris Kennaway     assert(n >= 0);
3196f9291ceSJung-uk Kim     if (n <= 0)
320e71b7053SJung-uk Kim         return (BN_ULONG)0;
32174664626SKris Kennaway 
32274664626SKris Kennaway     c = 0;
3231f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT
3246f9291ceSJung-uk Kim     while (n & ~3) {
32574664626SKris Kennaway         t = a[0];
32674664626SKris Kennaway         t = (t + c) & BN_MASK2;
32774664626SKris Kennaway         c = (t < c);
32874664626SKris Kennaway         l = (t + b[0]) & BN_MASK2;
32974664626SKris Kennaway         c += (l < t);
33074664626SKris Kennaway         r[0] = l;
33174664626SKris Kennaway         t = a[1];
33274664626SKris Kennaway         t = (t + c) & BN_MASK2;
33374664626SKris Kennaway         c = (t < c);
33474664626SKris Kennaway         l = (t + b[1]) & BN_MASK2;
33574664626SKris Kennaway         c += (l < t);
33674664626SKris Kennaway         r[1] = l;
33774664626SKris Kennaway         t = a[2];
33874664626SKris Kennaway         t = (t + c) & BN_MASK2;
33974664626SKris Kennaway         c = (t < c);
34074664626SKris Kennaway         l = (t + b[2]) & BN_MASK2;
34174664626SKris Kennaway         c += (l < t);
34274664626SKris Kennaway         r[2] = l;
34374664626SKris Kennaway         t = a[3];
34474664626SKris Kennaway         t = (t + c) & BN_MASK2;
34574664626SKris Kennaway         c = (t < c);
34674664626SKris Kennaway         l = (t + b[3]) & BN_MASK2;
34774664626SKris Kennaway         c += (l < t);
34874664626SKris Kennaway         r[3] = l;
3496f9291ceSJung-uk Kim         a += 4;
3506f9291ceSJung-uk Kim         b += 4;
3516f9291ceSJung-uk Kim         r += 4;
3526f9291ceSJung-uk Kim         n -= 4;
3531f13597dSJung-uk Kim     }
3541f13597dSJung-uk Kim # endif
3556f9291ceSJung-uk Kim     while (n) {
3561f13597dSJung-uk Kim         t = a[0];
3571f13597dSJung-uk Kim         t = (t + c) & BN_MASK2;
3581f13597dSJung-uk Kim         c = (t < c);
3591f13597dSJung-uk Kim         l = (t + b[0]) & BN_MASK2;
3601f13597dSJung-uk Kim         c += (l < t);
3611f13597dSJung-uk Kim         r[0] = l;
3626f9291ceSJung-uk Kim         a++;
3636f9291ceSJung-uk Kim         b++;
3646f9291ceSJung-uk Kim         r++;
3656f9291ceSJung-uk Kim         n--;
36674664626SKris Kennaway     }
367e71b7053SJung-uk Kim     return (BN_ULONG)c;
36874664626SKris Kennaway }
369f579bf8eSKris Kennaway #endif                          /* !BN_LLONG */
37074664626SKris Kennaway 
3716f9291ceSJung-uk Kim BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
3726f9291ceSJung-uk Kim                       int n)
37374664626SKris Kennaway {
37474664626SKris Kennaway     BN_ULONG t1, t2;
37574664626SKris Kennaway     int c = 0;
37674664626SKris Kennaway 
377f579bf8eSKris Kennaway     assert(n >= 0);
3786f9291ceSJung-uk Kim     if (n <= 0)
379e71b7053SJung-uk Kim         return (BN_ULONG)0;
38074664626SKris Kennaway 
3811f13597dSJung-uk Kim #ifndef OPENSSL_SMALL_FOOTPRINT
3826f9291ceSJung-uk Kim     while (n & ~3) {
3836f9291ceSJung-uk Kim         t1 = a[0];
3846f9291ceSJung-uk Kim         t2 = b[0];
38574664626SKris Kennaway         r[0] = (t1 - t2 - c) & BN_MASK2;
3866f9291ceSJung-uk Kim         if (t1 != t2)
3876f9291ceSJung-uk Kim             c = (t1 < t2);
3886f9291ceSJung-uk Kim         t1 = a[1];
3896f9291ceSJung-uk Kim         t2 = b[1];
39074664626SKris Kennaway         r[1] = (t1 - t2 - c) & BN_MASK2;
3916f9291ceSJung-uk Kim         if (t1 != t2)
3926f9291ceSJung-uk Kim             c = (t1 < t2);
3936f9291ceSJung-uk Kim         t1 = a[2];
3946f9291ceSJung-uk Kim         t2 = b[2];
39574664626SKris Kennaway         r[2] = (t1 - t2 - c) & BN_MASK2;
3966f9291ceSJung-uk Kim         if (t1 != t2)
3976f9291ceSJung-uk Kim             c = (t1 < t2);
3986f9291ceSJung-uk Kim         t1 = a[3];
3996f9291ceSJung-uk Kim         t2 = b[3];
40074664626SKris Kennaway         r[3] = (t1 - t2 - c) & BN_MASK2;
4016f9291ceSJung-uk Kim         if (t1 != t2)
4026f9291ceSJung-uk Kim             c = (t1 < t2);
4036f9291ceSJung-uk Kim         a += 4;
4046f9291ceSJung-uk Kim         b += 4;
4056f9291ceSJung-uk Kim         r += 4;
4066f9291ceSJung-uk Kim         n -= 4;
4071f13597dSJung-uk Kim     }
4081f13597dSJung-uk Kim #endif
4096f9291ceSJung-uk Kim     while (n) {
4106f9291ceSJung-uk Kim         t1 = a[0];
4116f9291ceSJung-uk Kim         t2 = b[0];
4121f13597dSJung-uk Kim         r[0] = (t1 - t2 - c) & BN_MASK2;
4136f9291ceSJung-uk Kim         if (t1 != t2)
4146f9291ceSJung-uk Kim             c = (t1 < t2);
4156f9291ceSJung-uk Kim         a++;
4166f9291ceSJung-uk Kim         b++;
4176f9291ceSJung-uk Kim         r++;
4186f9291ceSJung-uk Kim         n--;
41974664626SKris Kennaway     }
420e71b7053SJung-uk Kim     return c;
42174664626SKris Kennaway }
42274664626SKris Kennaway 
4231f13597dSJung-uk Kim #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
42474664626SKris Kennaway 
42574664626SKris Kennaway # undef bn_mul_comba8
42674664626SKris Kennaway # undef bn_mul_comba4
42774664626SKris Kennaway # undef bn_sqr_comba8
42874664626SKris Kennaway # undef bn_sqr_comba4
42974664626SKris Kennaway 
430f579bf8eSKris Kennaway /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
431f579bf8eSKris Kennaway /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
432f579bf8eSKris Kennaway /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
437f579bf8eSKris Kennaway 
43874664626SKris Kennaway # ifdef BN_LLONG
4397bded2dbSJung-uk Kim /*
4407bded2dbSJung-uk Kim  * Keep in mind that additions to multiplication result can not
4417bded2dbSJung-uk Kim  * overflow, because its high half cannot be all-ones.
4427bded2dbSJung-uk Kim  */
4437bded2dbSJung-uk Kim #  define mul_add_c(a,b,c0,c1,c2)       do {    \
4447bded2dbSJung-uk Kim         BN_ULONG hi;                            \
4457bded2dbSJung-uk Kim         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
4467bded2dbSJung-uk Kim         t += c0;                /* no carry */  \
4477bded2dbSJung-uk Kim         c0 = (BN_ULONG)Lw(t);                   \
4487bded2dbSJung-uk Kim         hi = (BN_ULONG)Hw(t);                   \
4497bded2dbSJung-uk Kim         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
4507bded2dbSJung-uk Kim         } while(0)
45174664626SKris Kennaway 
4527bded2dbSJung-uk Kim #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
4537bded2dbSJung-uk Kim         BN_ULONG hi;                            \
4547bded2dbSJung-uk Kim         BN_ULLONG t = (BN_ULLONG)(a)*(b);       \
4557bded2dbSJung-uk Kim         BN_ULLONG tt = t+c0;    /* no carry */  \
4567bded2dbSJung-uk Kim         c0 = (BN_ULONG)Lw(tt);                  \
4577bded2dbSJung-uk Kim         hi = (BN_ULONG)Hw(tt);                  \
4587bded2dbSJung-uk Kim         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
4597bded2dbSJung-uk Kim         t += c0;                /* no carry */  \
4607bded2dbSJung-uk Kim         c0 = (BN_ULONG)Lw(t);                   \
4617bded2dbSJung-uk Kim         hi = (BN_ULONG)Hw(t);                   \
4627bded2dbSJung-uk Kim         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
4637bded2dbSJung-uk Kim         } while(0)
46474664626SKris Kennaway 
4657bded2dbSJung-uk Kim #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
4667bded2dbSJung-uk Kim         BN_ULONG hi;                            \
4677bded2dbSJung-uk Kim         BN_ULLONG t = (BN_ULLONG)a[i]*a[i];     \
4687bded2dbSJung-uk Kim         t += c0;                /* no carry */  \
4697bded2dbSJung-uk Kim         c0 = (BN_ULONG)Lw(t);                   \
4707bded2dbSJung-uk Kim         hi = (BN_ULONG)Hw(t);                   \
4717bded2dbSJung-uk Kim         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
4727bded2dbSJung-uk Kim         } while(0)
47374664626SKris Kennaway 
47474664626SKris Kennaway #  define sqr_add_c2(a,i,j,c0,c1,c2) \
47574664626SKris Kennaway         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
476f579bf8eSKris Kennaway 
4773b4e3dcbSSimon L. B. Nielsen # elif defined(BN_UMULT_LOHI)
4787bded2dbSJung-uk Kim /*
4797bded2dbSJung-uk Kim  * Keep in mind that additions to hi can not overflow, because
4807bded2dbSJung-uk Kim  * the high word of a multiplication result cannot be all-ones.
4817bded2dbSJung-uk Kim  */
4827bded2dbSJung-uk Kim #  define mul_add_c(a,b,c0,c1,c2)       do {    \
4833b4e3dcbSSimon L. B. Nielsen         BN_ULONG ta = (a), tb = (b);            \
4847bded2dbSJung-uk Kim         BN_ULONG lo, hi;                        \
4857bded2dbSJung-uk Kim         BN_UMULT_LOHI(lo,hi,ta,tb);             \
4867bded2dbSJung-uk Kim         c0 += lo; hi += (c0<lo)?1:0;            \
4877bded2dbSJung-uk Kim         c1 += hi; c2 += (c1<hi)?1:0;            \
4887bded2dbSJung-uk Kim         } while(0)
4893b4e3dcbSSimon L. B. Nielsen 
4907bded2dbSJung-uk Kim #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
4917bded2dbSJung-uk Kim         BN_ULONG ta = (a), tb = (b);            \
4927bded2dbSJung-uk Kim         BN_ULONG lo, hi, tt;                    \
4937bded2dbSJung-uk Kim         BN_UMULT_LOHI(lo,hi,ta,tb);             \
4947bded2dbSJung-uk Kim         c0 += lo; tt = hi+((c0<lo)?1:0);        \
4957bded2dbSJung-uk Kim         c1 += tt; c2 += (c1<tt)?1:0;            \
4967bded2dbSJung-uk Kim         c0 += lo; hi += (c0<lo)?1:0;            \
4977bded2dbSJung-uk Kim         c1 += hi; c2 += (c1<hi)?1:0;            \
4987bded2dbSJung-uk Kim         } while(0)
4993b4e3dcbSSimon L. B. Nielsen 
5007bded2dbSJung-uk Kim #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
5013b4e3dcbSSimon L. B. Nielsen         BN_ULONG ta = (a)[i];                   \
5027bded2dbSJung-uk Kim         BN_ULONG lo, hi;                        \
5037bded2dbSJung-uk Kim         BN_UMULT_LOHI(lo,hi,ta,ta);             \
5047bded2dbSJung-uk Kim         c0 += lo; hi += (c0<lo)?1:0;            \
5057bded2dbSJung-uk Kim         c1 += hi; c2 += (c1<hi)?1:0;            \
5067bded2dbSJung-uk Kim         } while(0)
5073b4e3dcbSSimon L. B. Nielsen 
5083b4e3dcbSSimon L. B. Nielsen #  define sqr_add_c2(a,i,j,c0,c1,c2)    \
5093b4e3dcbSSimon L. B. Nielsen         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
5103b4e3dcbSSimon L. B. Nielsen 
511f579bf8eSKris Kennaway # elif defined(BN_UMULT_HIGH)
5127bded2dbSJung-uk Kim /*
5137bded2dbSJung-uk Kim  * Keep in mind that additions to hi can not overflow, because
5147bded2dbSJung-uk Kim  * the high word of a multiplication result cannot be all-ones.
5157bded2dbSJung-uk Kim  */
5167bded2dbSJung-uk Kim #  define mul_add_c(a,b,c0,c1,c2)       do {    \
517f579bf8eSKris Kennaway         BN_ULONG ta = (a), tb = (b);            \
5187bded2dbSJung-uk Kim         BN_ULONG lo = ta * tb;                  \
5197bded2dbSJung-uk Kim         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
5207bded2dbSJung-uk Kim         c0 += lo; hi += (c0<lo)?1:0;            \
5217bded2dbSJung-uk Kim         c1 += hi; c2 += (c1<hi)?1:0;            \
5227bded2dbSJung-uk Kim         } while(0)
523f579bf8eSKris Kennaway 
5247bded2dbSJung-uk Kim #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
5257bded2dbSJung-uk Kim         BN_ULONG ta = (a), tb = (b), tt;        \
5267bded2dbSJung-uk Kim         BN_ULONG lo = ta * tb;                  \
5277bded2dbSJung-uk Kim         BN_ULONG hi = BN_UMULT_HIGH(ta,tb);     \
5287bded2dbSJung-uk Kim         c0 += lo; tt = hi + ((c0<lo)?1:0);      \
5297bded2dbSJung-uk Kim         c1 += tt; c2 += (c1<tt)?1:0;            \
5307bded2dbSJung-uk Kim         c0 += lo; hi += (c0<lo)?1:0;            \
5317bded2dbSJung-uk Kim         c1 += hi; c2 += (c1<hi)?1:0;            \
5327bded2dbSJung-uk Kim         } while(0)
533f579bf8eSKris Kennaway 
5347bded2dbSJung-uk Kim #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
535f579bf8eSKris Kennaway         BN_ULONG ta = (a)[i];                   \
5367bded2dbSJung-uk Kim         BN_ULONG lo = ta * ta;                  \
5377bded2dbSJung-uk Kim         BN_ULONG hi = BN_UMULT_HIGH(ta,ta);     \
5387bded2dbSJung-uk Kim         c0 += lo; hi += (c0<lo)?1:0;            \
5397bded2dbSJung-uk Kim         c1 += hi; c2 += (c1<hi)?1:0;            \
5407bded2dbSJung-uk Kim         } while(0)
541f579bf8eSKris Kennaway 
542f579bf8eSKris Kennaway #  define sqr_add_c2(a,i,j,c0,c1,c2)      \
543f579bf8eSKris Kennaway         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
544f579bf8eSKris Kennaway 
545f579bf8eSKris Kennaway # else                          /* !BN_LLONG */
5467bded2dbSJung-uk Kim /*
5477bded2dbSJung-uk Kim  * Keep in mind that additions to hi can not overflow, because
5487bded2dbSJung-uk Kim  * the high word of a multiplication result cannot be all-ones.
5497bded2dbSJung-uk Kim  */
5507bded2dbSJung-uk Kim #  define mul_add_c(a,b,c0,c1,c2)       do {    \
5517bded2dbSJung-uk Kim         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
5527bded2dbSJung-uk Kim         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
5537bded2dbSJung-uk Kim         mul64(lo,hi,bl,bh);                     \
5547bded2dbSJung-uk Kim         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
5557bded2dbSJung-uk Kim         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
5567bded2dbSJung-uk Kim         } while(0)
55774664626SKris Kennaway 
5587bded2dbSJung-uk Kim #  define mul_add_c2(a,b,c0,c1,c2)      do {    \
5597bded2dbSJung-uk Kim         BN_ULONG tt;                            \
5607bded2dbSJung-uk Kim         BN_ULONG lo = LBITS(a), hi = HBITS(a);  \
5617bded2dbSJung-uk Kim         BN_ULONG bl = LBITS(b), bh = HBITS(b);  \
5627bded2dbSJung-uk Kim         mul64(lo,hi,bl,bh);                     \
5637bded2dbSJung-uk Kim         tt = hi;                                \
5647bded2dbSJung-uk Kim         c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \
5657bded2dbSJung-uk Kim         c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \
5667bded2dbSJung-uk Kim         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
5677bded2dbSJung-uk Kim         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
5687bded2dbSJung-uk Kim         } while(0)
56974664626SKris Kennaway 
5707bded2dbSJung-uk Kim #  define sqr_add_c(a,i,c0,c1,c2)       do {    \
5717bded2dbSJung-uk Kim         BN_ULONG lo, hi;                        \
5727bded2dbSJung-uk Kim         sqr64(lo,hi,(a)[i]);                    \
5737bded2dbSJung-uk Kim         c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \
5747bded2dbSJung-uk Kim         c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \
5757bded2dbSJung-uk Kim         } while(0)
57674664626SKris Kennaway 
57774664626SKris Kennaway #  define sqr_add_c2(a,i,j,c0,c1,c2) \
57874664626SKris Kennaway         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
579f579bf8eSKris Kennaway # endif                         /* !BN_LLONG */
58074664626SKris Kennaway 
58174664626SKris Kennaway void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
58274664626SKris Kennaway {
58374664626SKris Kennaway     BN_ULONG c1, c2, c3;
58474664626SKris Kennaway 
58574664626SKris Kennaway     c1 = 0;
58674664626SKris Kennaway     c2 = 0;
58774664626SKris Kennaway     c3 = 0;
58874664626SKris Kennaway     mul_add_c(a[0], b[0], c1, c2, c3);
58974664626SKris Kennaway     r[0] = c1;
59074664626SKris Kennaway     c1 = 0;
59174664626SKris Kennaway     mul_add_c(a[0], b[1], c2, c3, c1);
59274664626SKris Kennaway     mul_add_c(a[1], b[0], c2, c3, c1);
59374664626SKris Kennaway     r[1] = c2;
59474664626SKris Kennaway     c2 = 0;
59574664626SKris Kennaway     mul_add_c(a[2], b[0], c3, c1, c2);
59674664626SKris Kennaway     mul_add_c(a[1], b[1], c3, c1, c2);
59774664626SKris Kennaway     mul_add_c(a[0], b[2], c3, c1, c2);
59874664626SKris Kennaway     r[2] = c3;
59974664626SKris Kennaway     c3 = 0;
60074664626SKris Kennaway     mul_add_c(a[0], b[3], c1, c2, c3);
60174664626SKris Kennaway     mul_add_c(a[1], b[2], c1, c2, c3);
60274664626SKris Kennaway     mul_add_c(a[2], b[1], c1, c2, c3);
60374664626SKris Kennaway     mul_add_c(a[3], b[0], c1, c2, c3);
60474664626SKris Kennaway     r[3] = c1;
60574664626SKris Kennaway     c1 = 0;
60674664626SKris Kennaway     mul_add_c(a[4], b[0], c2, c3, c1);
60774664626SKris Kennaway     mul_add_c(a[3], b[1], c2, c3, c1);
60874664626SKris Kennaway     mul_add_c(a[2], b[2], c2, c3, c1);
60974664626SKris Kennaway     mul_add_c(a[1], b[3], c2, c3, c1);
61074664626SKris Kennaway     mul_add_c(a[0], b[4], c2, c3, c1);
61174664626SKris Kennaway     r[4] = c2;
61274664626SKris Kennaway     c2 = 0;
61374664626SKris Kennaway     mul_add_c(a[0], b[5], c3, c1, c2);
61474664626SKris Kennaway     mul_add_c(a[1], b[4], c3, c1, c2);
61574664626SKris Kennaway     mul_add_c(a[2], b[3], c3, c1, c2);
61674664626SKris Kennaway     mul_add_c(a[3], b[2], c3, c1, c2);
61774664626SKris Kennaway     mul_add_c(a[4], b[1], c3, c1, c2);
61874664626SKris Kennaway     mul_add_c(a[5], b[0], c3, c1, c2);
61974664626SKris Kennaway     r[5] = c3;
62074664626SKris Kennaway     c3 = 0;
62174664626SKris Kennaway     mul_add_c(a[6], b[0], c1, c2, c3);
62274664626SKris Kennaway     mul_add_c(a[5], b[1], c1, c2, c3);
62374664626SKris Kennaway     mul_add_c(a[4], b[2], c1, c2, c3);
62474664626SKris Kennaway     mul_add_c(a[3], b[3], c1, c2, c3);
62574664626SKris Kennaway     mul_add_c(a[2], b[4], c1, c2, c3);
62674664626SKris Kennaway     mul_add_c(a[1], b[5], c1, c2, c3);
62774664626SKris Kennaway     mul_add_c(a[0], b[6], c1, c2, c3);
62874664626SKris Kennaway     r[6] = c1;
62974664626SKris Kennaway     c1 = 0;
63074664626SKris Kennaway     mul_add_c(a[0], b[7], c2, c3, c1);
63174664626SKris Kennaway     mul_add_c(a[1], b[6], c2, c3, c1);
63274664626SKris Kennaway     mul_add_c(a[2], b[5], c2, c3, c1);
63374664626SKris Kennaway     mul_add_c(a[3], b[4], c2, c3, c1);
63474664626SKris Kennaway     mul_add_c(a[4], b[3], c2, c3, c1);
63574664626SKris Kennaway     mul_add_c(a[5], b[2], c2, c3, c1);
63674664626SKris Kennaway     mul_add_c(a[6], b[1], c2, c3, c1);
63774664626SKris Kennaway     mul_add_c(a[7], b[0], c2, c3, c1);
63874664626SKris Kennaway     r[7] = c2;
63974664626SKris Kennaway     c2 = 0;
64074664626SKris Kennaway     mul_add_c(a[7], b[1], c3, c1, c2);
64174664626SKris Kennaway     mul_add_c(a[6], b[2], c3, c1, c2);
64274664626SKris Kennaway     mul_add_c(a[5], b[3], c3, c1, c2);
64374664626SKris Kennaway     mul_add_c(a[4], b[4], c3, c1, c2);
64474664626SKris Kennaway     mul_add_c(a[3], b[5], c3, c1, c2);
64574664626SKris Kennaway     mul_add_c(a[2], b[6], c3, c1, c2);
64674664626SKris Kennaway     mul_add_c(a[1], b[7], c3, c1, c2);
64774664626SKris Kennaway     r[8] = c3;
64874664626SKris Kennaway     c3 = 0;
64974664626SKris Kennaway     mul_add_c(a[2], b[7], c1, c2, c3);
65074664626SKris Kennaway     mul_add_c(a[3], b[6], c1, c2, c3);
65174664626SKris Kennaway     mul_add_c(a[4], b[5], c1, c2, c3);
65274664626SKris Kennaway     mul_add_c(a[5], b[4], c1, c2, c3);
65374664626SKris Kennaway     mul_add_c(a[6], b[3], c1, c2, c3);
65474664626SKris Kennaway     mul_add_c(a[7], b[2], c1, c2, c3);
65574664626SKris Kennaway     r[9] = c1;
65674664626SKris Kennaway     c1 = 0;
65774664626SKris Kennaway     mul_add_c(a[7], b[3], c2, c3, c1);
65874664626SKris Kennaway     mul_add_c(a[6], b[4], c2, c3, c1);
65974664626SKris Kennaway     mul_add_c(a[5], b[5], c2, c3, c1);
66074664626SKris Kennaway     mul_add_c(a[4], b[6], c2, c3, c1);
66174664626SKris Kennaway     mul_add_c(a[3], b[7], c2, c3, c1);
66274664626SKris Kennaway     r[10] = c2;
66374664626SKris Kennaway     c2 = 0;
66474664626SKris Kennaway     mul_add_c(a[4], b[7], c3, c1, c2);
66574664626SKris Kennaway     mul_add_c(a[5], b[6], c3, c1, c2);
66674664626SKris Kennaway     mul_add_c(a[6], b[5], c3, c1, c2);
66774664626SKris Kennaway     mul_add_c(a[7], b[4], c3, c1, c2);
66874664626SKris Kennaway     r[11] = c3;
66974664626SKris Kennaway     c3 = 0;
67074664626SKris Kennaway     mul_add_c(a[7], b[5], c1, c2, c3);
67174664626SKris Kennaway     mul_add_c(a[6], b[6], c1, c2, c3);
67274664626SKris Kennaway     mul_add_c(a[5], b[7], c1, c2, c3);
67374664626SKris Kennaway     r[12] = c1;
67474664626SKris Kennaway     c1 = 0;
67574664626SKris Kennaway     mul_add_c(a[6], b[7], c2, c3, c1);
67674664626SKris Kennaway     mul_add_c(a[7], b[6], c2, c3, c1);
67774664626SKris Kennaway     r[13] = c2;
67874664626SKris Kennaway     c2 = 0;
67974664626SKris Kennaway     mul_add_c(a[7], b[7], c3, c1, c2);
68074664626SKris Kennaway     r[14] = c3;
68174664626SKris Kennaway     r[15] = c1;
68274664626SKris Kennaway }
68374664626SKris Kennaway 
68474664626SKris Kennaway void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
68574664626SKris Kennaway {
68674664626SKris Kennaway     BN_ULONG c1, c2, c3;
68774664626SKris Kennaway 
68874664626SKris Kennaway     c1 = 0;
68974664626SKris Kennaway     c2 = 0;
69074664626SKris Kennaway     c3 = 0;
69174664626SKris Kennaway     mul_add_c(a[0], b[0], c1, c2, c3);
69274664626SKris Kennaway     r[0] = c1;
69374664626SKris Kennaway     c1 = 0;
69474664626SKris Kennaway     mul_add_c(a[0], b[1], c2, c3, c1);
69574664626SKris Kennaway     mul_add_c(a[1], b[0], c2, c3, c1);
69674664626SKris Kennaway     r[1] = c2;
69774664626SKris Kennaway     c2 = 0;
69874664626SKris Kennaway     mul_add_c(a[2], b[0], c3, c1, c2);
69974664626SKris Kennaway     mul_add_c(a[1], b[1], c3, c1, c2);
70074664626SKris Kennaway     mul_add_c(a[0], b[2], c3, c1, c2);
70174664626SKris Kennaway     r[2] = c3;
70274664626SKris Kennaway     c3 = 0;
70374664626SKris Kennaway     mul_add_c(a[0], b[3], c1, c2, c3);
70474664626SKris Kennaway     mul_add_c(a[1], b[2], c1, c2, c3);
70574664626SKris Kennaway     mul_add_c(a[2], b[1], c1, c2, c3);
70674664626SKris Kennaway     mul_add_c(a[3], b[0], c1, c2, c3);
70774664626SKris Kennaway     r[3] = c1;
70874664626SKris Kennaway     c1 = 0;
70974664626SKris Kennaway     mul_add_c(a[3], b[1], c2, c3, c1);
71074664626SKris Kennaway     mul_add_c(a[2], b[2], c2, c3, c1);
71174664626SKris Kennaway     mul_add_c(a[1], b[3], c2, c3, c1);
71274664626SKris Kennaway     r[4] = c2;
71374664626SKris Kennaway     c2 = 0;
71474664626SKris Kennaway     mul_add_c(a[2], b[3], c3, c1, c2);
71574664626SKris Kennaway     mul_add_c(a[3], b[2], c3, c1, c2);
71674664626SKris Kennaway     r[5] = c3;
71774664626SKris Kennaway     c3 = 0;
71874664626SKris Kennaway     mul_add_c(a[3], b[3], c1, c2, c3);
71974664626SKris Kennaway     r[6] = c1;
72074664626SKris Kennaway     r[7] = c2;
72174664626SKris Kennaway }
72274664626SKris Kennaway 
7235c87c606SMark Murray void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
72474664626SKris Kennaway {
72574664626SKris Kennaway     BN_ULONG c1, c2, c3;
72674664626SKris Kennaway 
72774664626SKris Kennaway     c1 = 0;
72874664626SKris Kennaway     c2 = 0;
72974664626SKris Kennaway     c3 = 0;
73074664626SKris Kennaway     sqr_add_c(a, 0, c1, c2, c3);
73174664626SKris Kennaway     r[0] = c1;
73274664626SKris Kennaway     c1 = 0;
73374664626SKris Kennaway     sqr_add_c2(a, 1, 0, c2, c3, c1);
73474664626SKris Kennaway     r[1] = c2;
73574664626SKris Kennaway     c2 = 0;
73674664626SKris Kennaway     sqr_add_c(a, 1, c3, c1, c2);
73774664626SKris Kennaway     sqr_add_c2(a, 2, 0, c3, c1, c2);
73874664626SKris Kennaway     r[2] = c3;
73974664626SKris Kennaway     c3 = 0;
74074664626SKris Kennaway     sqr_add_c2(a, 3, 0, c1, c2, c3);
74174664626SKris Kennaway     sqr_add_c2(a, 2, 1, c1, c2, c3);
74274664626SKris Kennaway     r[3] = c1;
74374664626SKris Kennaway     c1 = 0;
74474664626SKris Kennaway     sqr_add_c(a, 2, c2, c3, c1);
74574664626SKris Kennaway     sqr_add_c2(a, 3, 1, c2, c3, c1);
74674664626SKris Kennaway     sqr_add_c2(a, 4, 0, c2, c3, c1);
74774664626SKris Kennaway     r[4] = c2;
74874664626SKris Kennaway     c2 = 0;
74974664626SKris Kennaway     sqr_add_c2(a, 5, 0, c3, c1, c2);
75074664626SKris Kennaway     sqr_add_c2(a, 4, 1, c3, c1, c2);
75174664626SKris Kennaway     sqr_add_c2(a, 3, 2, c3, c1, c2);
75274664626SKris Kennaway     r[5] = c3;
75374664626SKris Kennaway     c3 = 0;
75474664626SKris Kennaway     sqr_add_c(a, 3, c1, c2, c3);
75574664626SKris Kennaway     sqr_add_c2(a, 4, 2, c1, c2, c3);
75674664626SKris Kennaway     sqr_add_c2(a, 5, 1, c1, c2, c3);
75774664626SKris Kennaway     sqr_add_c2(a, 6, 0, c1, c2, c3);
75874664626SKris Kennaway     r[6] = c1;
75974664626SKris Kennaway     c1 = 0;
76074664626SKris Kennaway     sqr_add_c2(a, 7, 0, c2, c3, c1);
76174664626SKris Kennaway     sqr_add_c2(a, 6, 1, c2, c3, c1);
76274664626SKris Kennaway     sqr_add_c2(a, 5, 2, c2, c3, c1);
76374664626SKris Kennaway     sqr_add_c2(a, 4, 3, c2, c3, c1);
76474664626SKris Kennaway     r[7] = c2;
76574664626SKris Kennaway     c2 = 0;
76674664626SKris Kennaway     sqr_add_c(a, 4, c3, c1, c2);
76774664626SKris Kennaway     sqr_add_c2(a, 5, 3, c3, c1, c2);
76874664626SKris Kennaway     sqr_add_c2(a, 6, 2, c3, c1, c2);
76974664626SKris Kennaway     sqr_add_c2(a, 7, 1, c3, c1, c2);
77074664626SKris Kennaway     r[8] = c3;
77174664626SKris Kennaway     c3 = 0;
77274664626SKris Kennaway     sqr_add_c2(a, 7, 2, c1, c2, c3);
77374664626SKris Kennaway     sqr_add_c2(a, 6, 3, c1, c2, c3);
77474664626SKris Kennaway     sqr_add_c2(a, 5, 4, c1, c2, c3);
77574664626SKris Kennaway     r[9] = c1;
77674664626SKris Kennaway     c1 = 0;
77774664626SKris Kennaway     sqr_add_c(a, 5, c2, c3, c1);
77874664626SKris Kennaway     sqr_add_c2(a, 6, 4, c2, c3, c1);
77974664626SKris Kennaway     sqr_add_c2(a, 7, 3, c2, c3, c1);
78074664626SKris Kennaway     r[10] = c2;
78174664626SKris Kennaway     c2 = 0;
78274664626SKris Kennaway     sqr_add_c2(a, 7, 4, c3, c1, c2);
78374664626SKris Kennaway     sqr_add_c2(a, 6, 5, c3, c1, c2);
78474664626SKris Kennaway     r[11] = c3;
78574664626SKris Kennaway     c3 = 0;
78674664626SKris Kennaway     sqr_add_c(a, 6, c1, c2, c3);
78774664626SKris Kennaway     sqr_add_c2(a, 7, 5, c1, c2, c3);
78874664626SKris Kennaway     r[12] = c1;
78974664626SKris Kennaway     c1 = 0;
79074664626SKris Kennaway     sqr_add_c2(a, 7, 6, c2, c3, c1);
79174664626SKris Kennaway     r[13] = c2;
79274664626SKris Kennaway     c2 = 0;
79374664626SKris Kennaway     sqr_add_c(a, 7, c3, c1, c2);
79474664626SKris Kennaway     r[14] = c3;
79574664626SKris Kennaway     r[15] = c1;
79674664626SKris Kennaway }
79774664626SKris Kennaway 
7985c87c606SMark Murray void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
79974664626SKris Kennaway {
80074664626SKris Kennaway     BN_ULONG c1, c2, c3;
80174664626SKris Kennaway 
80274664626SKris Kennaway     c1 = 0;
80374664626SKris Kennaway     c2 = 0;
80474664626SKris Kennaway     c3 = 0;
80574664626SKris Kennaway     sqr_add_c(a, 0, c1, c2, c3);
80674664626SKris Kennaway     r[0] = c1;
80774664626SKris Kennaway     c1 = 0;
80874664626SKris Kennaway     sqr_add_c2(a, 1, 0, c2, c3, c1);
80974664626SKris Kennaway     r[1] = c2;
81074664626SKris Kennaway     c2 = 0;
81174664626SKris Kennaway     sqr_add_c(a, 1, c3, c1, c2);
81274664626SKris Kennaway     sqr_add_c2(a, 2, 0, c3, c1, c2);
81374664626SKris Kennaway     r[2] = c3;
81474664626SKris Kennaway     c3 = 0;
81574664626SKris Kennaway     sqr_add_c2(a, 3, 0, c1, c2, c3);
81674664626SKris Kennaway     sqr_add_c2(a, 2, 1, c1, c2, c3);
81774664626SKris Kennaway     r[3] = c1;
81874664626SKris Kennaway     c1 = 0;
81974664626SKris Kennaway     sqr_add_c(a, 2, c2, c3, c1);
82074664626SKris Kennaway     sqr_add_c2(a, 3, 1, c2, c3, c1);
82174664626SKris Kennaway     r[4] = c2;
82274664626SKris Kennaway     c2 = 0;
82374664626SKris Kennaway     sqr_add_c2(a, 3, 2, c3, c1, c2);
82474664626SKris Kennaway     r[5] = c3;
82574664626SKris Kennaway     c3 = 0;
82674664626SKris Kennaway     sqr_add_c(a, 3, c1, c2, c3);
82774664626SKris Kennaway     r[6] = c1;
82874664626SKris Kennaway     r[7] = c2;
82974664626SKris Kennaway }
8301f13597dSJung-uk Kim 
8311f13597dSJung-uk Kim # ifdef OPENSSL_NO_ASM
8321f13597dSJung-uk Kim #  ifdef OPENSSL_BN_ASM_MONT
8331f13597dSJung-uk Kim #   include <alloca.h>
8341f13597dSJung-uk Kim /*
8351f13597dSJung-uk Kim  * This is essentially reference implementation, which may or may not
8361f13597dSJung-uk Kim  * result in performance improvement. E.g. on IA-32 this routine was
8371f13597dSJung-uk Kim  * observed to give 40% faster rsa1024 private key operations and 10%
8381f13597dSJung-uk Kim  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
8391f13597dSJung-uk Kim  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
8401f13597dSJung-uk Kim  * reference implementation, one to be used as starting point for
8411f13597dSJung-uk Kim  * platform-specific assembler. Mentioned numbers apply to compiler
8421f13597dSJung-uk Kim  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
8431f13597dSJung-uk Kim  * can vary not only from platform to platform, but even for compiler
8441f13597dSJung-uk Kim  * versions. Assembler vs. assembler improvement coefficients can
8451f13597dSJung-uk Kim  * [and are known to] differ and are to be documented elsewhere.
8461f13597dSJung-uk Kim  */
8476f9291ceSJung-uk Kim int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
8486f9291ceSJung-uk Kim                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
8491f13597dSJung-uk Kim {
8501f13597dSJung-uk Kim     BN_ULONG c0, c1, ml, *tp, n0;
8511f13597dSJung-uk Kim #   ifdef mul64
8521f13597dSJung-uk Kim     BN_ULONG mh;
8531f13597dSJung-uk Kim #   endif
8541f13597dSJung-uk Kim     volatile BN_ULONG *vp;
8551f13597dSJung-uk Kim     int i = 0, j;
8561f13597dSJung-uk Kim 
8576f9291ceSJung-uk Kim #   if 0                        /* template for platform-specific
8586f9291ceSJung-uk Kim                                  * implementation */
8596f9291ceSJung-uk Kim     if (ap == bp)
8606f9291ceSJung-uk Kim         return bn_sqr_mont(rp, ap, np, n0p, num);
8611f13597dSJung-uk Kim #   endif
8621f13597dSJung-uk Kim     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
8631f13597dSJung-uk Kim 
8641f13597dSJung-uk Kim     n0 = *n0p;
8651f13597dSJung-uk Kim 
8661f13597dSJung-uk Kim     c0 = 0;
8671f13597dSJung-uk Kim     ml = bp[0];
8681f13597dSJung-uk Kim #   ifdef mul64
8691f13597dSJung-uk Kim     mh = HBITS(ml);
8701f13597dSJung-uk Kim     ml = LBITS(ml);
8711f13597dSJung-uk Kim     for (j = 0; j < num; ++j)
8721f13597dSJung-uk Kim         mul(tp[j], ap[j], ml, mh, c0);
8731f13597dSJung-uk Kim #   else
8741f13597dSJung-uk Kim     for (j = 0; j < num; ++j)
8751f13597dSJung-uk Kim         mul(tp[j], ap[j], ml, c0);
8761f13597dSJung-uk Kim #   endif
8771f13597dSJung-uk Kim 
8781f13597dSJung-uk Kim     tp[num] = c0;
8791f13597dSJung-uk Kim     tp[num + 1] = 0;
8801f13597dSJung-uk Kim     goto enter;
8811f13597dSJung-uk Kim 
8826f9291ceSJung-uk Kim     for (i = 0; i < num; i++) {
8831f13597dSJung-uk Kim         c0 = 0;
8841f13597dSJung-uk Kim         ml = bp[i];
8851f13597dSJung-uk Kim #   ifdef mul64
8861f13597dSJung-uk Kim         mh = HBITS(ml);
8871f13597dSJung-uk Kim         ml = LBITS(ml);
8881f13597dSJung-uk Kim         for (j = 0; j < num; ++j)
8891f13597dSJung-uk Kim             mul_add(tp[j], ap[j], ml, mh, c0);
8901f13597dSJung-uk Kim #   else
8911f13597dSJung-uk Kim         for (j = 0; j < num; ++j)
8921f13597dSJung-uk Kim             mul_add(tp[j], ap[j], ml, c0);
8931f13597dSJung-uk Kim #   endif
8941f13597dSJung-uk Kim         c1 = (tp[num] + c0) & BN_MASK2;
8951f13597dSJung-uk Kim         tp[num] = c1;
8961f13597dSJung-uk Kim         tp[num + 1] = (c1 < c0 ? 1 : 0);
8971f13597dSJung-uk Kim  enter:
8981f13597dSJung-uk Kim         c1 = tp[0];
8991f13597dSJung-uk Kim         ml = (c1 * n0) & BN_MASK2;
9001f13597dSJung-uk Kim         c0 = 0;
9011f13597dSJung-uk Kim #   ifdef mul64
9021f13597dSJung-uk Kim         mh = HBITS(ml);
9031f13597dSJung-uk Kim         ml = LBITS(ml);
9041f13597dSJung-uk Kim         mul_add(c1, np[0], ml, mh, c0);
9051f13597dSJung-uk Kim #   else
9061f13597dSJung-uk Kim         mul_add(c1, ml, np[0], c0);
9071f13597dSJung-uk Kim #   endif
9086f9291ceSJung-uk Kim         for (j = 1; j < num; j++) {
9091f13597dSJung-uk Kim             c1 = tp[j];
9101f13597dSJung-uk Kim #   ifdef mul64
9111f13597dSJung-uk Kim             mul_add(c1, np[j], ml, mh, c0);
9121f13597dSJung-uk Kim #   else
9131f13597dSJung-uk Kim             mul_add(c1, ml, np[j], c0);
9141f13597dSJung-uk Kim #   endif
9151f13597dSJung-uk Kim             tp[j - 1] = c1 & BN_MASK2;
9161f13597dSJung-uk Kim         }
9171f13597dSJung-uk Kim         c1 = (tp[num] + c0) & BN_MASK2;
9181f13597dSJung-uk Kim         tp[num - 1] = c1;
9191f13597dSJung-uk Kim         tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
9201f13597dSJung-uk Kim     }
9211f13597dSJung-uk Kim 
9226f9291ceSJung-uk Kim     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
9231f13597dSJung-uk Kim         c0 = bn_sub_words(rp, tp, np, num);
9246f9291ceSJung-uk Kim         if (tp[num] != 0 || c0 == 0) {
9256f9291ceSJung-uk Kim             for (i = 0; i < num + 2; i++)
9266f9291ceSJung-uk Kim                 vp[i] = 0;
9271f13597dSJung-uk Kim             return 1;
9281f13597dSJung-uk Kim         }
9291f13597dSJung-uk Kim     }
9306f9291ceSJung-uk Kim     for (i = 0; i < num; i++)
9316f9291ceSJung-uk Kim         rp[i] = tp[i], vp[i] = 0;
9321f13597dSJung-uk Kim     vp[num] = 0;
9331f13597dSJung-uk Kim     vp[num + 1] = 0;
9341f13597dSJung-uk Kim     return 1;
9351f13597dSJung-uk Kim }
9361f13597dSJung-uk Kim #  else
9371f13597dSJung-uk Kim /*
9381f13597dSJung-uk Kim  * Return value of 0 indicates that multiplication/convolution was not
9391f13597dSJung-uk Kim  * performed to signal the caller to fall down to alternative/original
9401f13597dSJung-uk Kim  * code-path.
9411f13597dSJung-uk Kim  */
9426f9291ceSJung-uk Kim int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
9436f9291ceSJung-uk Kim                 const BN_ULONG *np, const BN_ULONG *n0, int num)
9446f9291ceSJung-uk Kim {
9456f9291ceSJung-uk Kim     return 0;
9466f9291ceSJung-uk Kim }
9471f13597dSJung-uk Kim #  endif                        /* OPENSSL_BN_ASM_MONT */
9481f13597dSJung-uk Kim # endif
9491f13597dSJung-uk Kim 
950f579bf8eSKris Kennaway #else                           /* !BN_MUL_COMBA */
95174664626SKris Kennaway 
95274664626SKris Kennaway /* hmm... is it faster just to do a multiply? */
95374664626SKris Kennaway # undef bn_sqr_comba4
954e71b7053SJung-uk Kim # undef bn_sqr_comba8
9551f13597dSJung-uk Kim void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
95674664626SKris Kennaway {
95774664626SKris Kennaway     BN_ULONG t[8];
95874664626SKris Kennaway     bn_sqr_normal(r, a, 4, t);
95974664626SKris Kennaway }
96074664626SKris Kennaway 
9611f13597dSJung-uk Kim void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
96274664626SKris Kennaway {
96374664626SKris Kennaway     BN_ULONG t[16];
96474664626SKris Kennaway     bn_sqr_normal(r, a, 8, t);
96574664626SKris Kennaway }
96674664626SKris Kennaway 
96774664626SKris Kennaway void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
96874664626SKris Kennaway {
96974664626SKris Kennaway     r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
97074664626SKris Kennaway     r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
97174664626SKris Kennaway     r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
97274664626SKris Kennaway     r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
97374664626SKris Kennaway }
97474664626SKris Kennaway 
97574664626SKris Kennaway void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
97674664626SKris Kennaway {
97774664626SKris Kennaway     r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
97874664626SKris Kennaway     r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
97974664626SKris Kennaway     r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
98074664626SKris Kennaway     r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
98174664626SKris Kennaway     r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
98274664626SKris Kennaway     r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
98374664626SKris Kennaway     r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
98474664626SKris Kennaway     r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
98574664626SKris Kennaway }
98674664626SKris Kennaway 
9871f13597dSJung-uk Kim # ifdef OPENSSL_NO_ASM
9881f13597dSJung-uk Kim #  ifdef OPENSSL_BN_ASM_MONT
9891f13597dSJung-uk Kim #   include <alloca.h>
9906f9291ceSJung-uk Kim int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
9916f9291ceSJung-uk Kim                 const BN_ULONG *np, const BN_ULONG *n0p, int num)
9921f13597dSJung-uk Kim {
9931f13597dSJung-uk Kim     BN_ULONG c0, c1, *tp, n0 = *n0p;
9941f13597dSJung-uk Kim     volatile BN_ULONG *vp;
9951f13597dSJung-uk Kim     int i = 0, j;
9961f13597dSJung-uk Kim 
9971f13597dSJung-uk Kim     vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
9981f13597dSJung-uk Kim 
9996f9291ceSJung-uk Kim     for (i = 0; i <= num; i++)
10006f9291ceSJung-uk Kim         tp[i] = 0;
10011f13597dSJung-uk Kim 
10026f9291ceSJung-uk Kim     for (i = 0; i < num; i++) {
10031f13597dSJung-uk Kim         c0 = bn_mul_add_words(tp, ap, num, bp[i]);
10041f13597dSJung-uk Kim         c1 = (tp[num] + c0) & BN_MASK2;
10051f13597dSJung-uk Kim         tp[num] = c1;
10061f13597dSJung-uk Kim         tp[num + 1] = (c1 < c0 ? 1 : 0);
10071f13597dSJung-uk Kim 
10081f13597dSJung-uk Kim         c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
10091f13597dSJung-uk Kim         c1 = (tp[num] + c0) & BN_MASK2;
10101f13597dSJung-uk Kim         tp[num] = c1;
10111f13597dSJung-uk Kim         tp[num + 1] += (c1 < c0 ? 1 : 0);
10126f9291ceSJung-uk Kim         for (j = 0; j <= num; j++)
10136f9291ceSJung-uk Kim             tp[j] = tp[j + 1];
10141f13597dSJung-uk Kim     }
10151f13597dSJung-uk Kim 
10166f9291ceSJung-uk Kim     if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
10171f13597dSJung-uk Kim         c0 = bn_sub_words(rp, tp, np, num);
10186f9291ceSJung-uk Kim         if (tp[num] != 0 || c0 == 0) {
10196f9291ceSJung-uk Kim             for (i = 0; i < num + 2; i++)
10206f9291ceSJung-uk Kim                 vp[i] = 0;
10211f13597dSJung-uk Kim             return 1;
10221f13597dSJung-uk Kim         }
10231f13597dSJung-uk Kim     }
10246f9291ceSJung-uk Kim     for (i = 0; i < num; i++)
10256f9291ceSJung-uk Kim         rp[i] = tp[i], vp[i] = 0;
10261f13597dSJung-uk Kim     vp[num] = 0;
10271f13597dSJung-uk Kim     vp[num + 1] = 0;
10281f13597dSJung-uk Kim     return 1;
10291f13597dSJung-uk Kim }
10301f13597dSJung-uk Kim #  else
10316f9291ceSJung-uk Kim int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
10326f9291ceSJung-uk Kim                 const BN_ULONG *np, const BN_ULONG *n0, int num)
10336f9291ceSJung-uk Kim {
10346f9291ceSJung-uk Kim     return 0;
10356f9291ceSJung-uk Kim }
10361f13597dSJung-uk Kim #  endif                        /* OPENSSL_BN_ASM_MONT */
10371f13597dSJung-uk Kim # endif
10381f13597dSJung-uk Kim 
1039f579bf8eSKris Kennaway #endif                          /* !BN_MUL_COMBA */
1040