1e71b7053SJung-uk Kim /* 28f1ef87aSJung-uk Kim * Copyright 1995-2023 The OpenSSL Project Authors. All Rights Reserved. 374664626SKris Kennaway * 4e71b7053SJung-uk Kim * Licensed under the OpenSSL license (the "License"). You may not use 5e71b7053SJung-uk Kim * this file except in compliance with the License. You can obtain a copy 6e71b7053SJung-uk Kim * in the file LICENSE in the source distribution or at 7e71b7053SJung-uk Kim * https://www.openssl.org/source/license.html 874664626SKris Kennaway */ 974664626SKris Kennaway 10f579bf8eSKris Kennaway #include <assert.h> 11e71b7053SJung-uk Kim #include <openssl/crypto.h> 12e71b7053SJung-uk Kim #include "internal/cryptlib.h" 1317f01e99SJung-uk Kim #include "bn_local.h" 1474664626SKris Kennaway 15f579bf8eSKris Kennaway #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 1674664626SKris Kennaway 176f9291ceSJung-uk Kim BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 186f9291ceSJung-uk Kim BN_ULONG w) 1974664626SKris Kennaway { 2074664626SKris Kennaway BN_ULONG c1 = 0; 2174664626SKris Kennaway 22f579bf8eSKris Kennaway assert(num >= 0); 236f9291ceSJung-uk Kim if (num <= 0) 24e71b7053SJung-uk Kim return c1; 2574664626SKris Kennaway 261f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 276f9291ceSJung-uk Kim while (num & ~3) { 2874664626SKris Kennaway mul_add(rp[0], ap[0], w, c1); 2974664626SKris Kennaway mul_add(rp[1], ap[1], w, c1); 3074664626SKris Kennaway mul_add(rp[2], ap[2], w, c1); 3174664626SKris Kennaway mul_add(rp[3], ap[3], w, c1); 326f9291ceSJung-uk Kim ap += 4; 336f9291ceSJung-uk Kim rp += 4; 346f9291ceSJung-uk Kim num -= 4; 35f579bf8eSKris Kennaway } 361f13597dSJung-uk Kim # endif 376f9291ceSJung-uk Kim while (num) { 381f13597dSJung-uk Kim mul_add(rp[0], ap[0], w, c1); 396f9291ceSJung-uk Kim ap++; 406f9291ceSJung-uk Kim rp++; 416f9291ceSJung-uk Kim num--; 4274664626SKris Kennaway } 4374664626SKris Kennaway 44e71b7053SJung-uk Kim return c1; 4574664626SKris Kennaway } 4674664626SKris Kennaway 475c87c606SMark Murray BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 4874664626SKris Kennaway { 4974664626SKris Kennaway BN_ULONG c1 = 0; 5074664626SKris Kennaway 51f579bf8eSKris Kennaway assert(num >= 0); 526f9291ceSJung-uk Kim if (num <= 0) 53e71b7053SJung-uk Kim return c1; 5474664626SKris Kennaway 551f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 566f9291ceSJung-uk Kim while (num & ~3) { 5774664626SKris Kennaway mul(rp[0], ap[0], w, c1); 5874664626SKris Kennaway mul(rp[1], ap[1], w, c1); 5974664626SKris Kennaway mul(rp[2], ap[2], w, c1); 6074664626SKris Kennaway mul(rp[3], ap[3], w, c1); 616f9291ceSJung-uk Kim ap += 4; 626f9291ceSJung-uk Kim rp += 4; 636f9291ceSJung-uk Kim num -= 4; 64f579bf8eSKris Kennaway } 651f13597dSJung-uk Kim # endif 666f9291ceSJung-uk Kim while (num) { 671f13597dSJung-uk Kim mul(rp[0], ap[0], w, c1); 686f9291ceSJung-uk Kim ap++; 696f9291ceSJung-uk Kim rp++; 706f9291ceSJung-uk Kim num--; 7174664626SKris Kennaway } 72e71b7053SJung-uk Kim return c1; 7374664626SKris Kennaway } 7474664626SKris Kennaway 755c87c606SMark Murray void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 7674664626SKris Kennaway { 77f579bf8eSKris Kennaway assert(n >= 0); 786f9291ceSJung-uk Kim if (n <= 0) 796f9291ceSJung-uk Kim return; 801f13597dSJung-uk Kim 811f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 826f9291ceSJung-uk Kim while (n & ~3) { 83f579bf8eSKris Kennaway sqr(r[0], r[1], a[0]); 84f579bf8eSKris Kennaway sqr(r[2], r[3], a[1]); 85f579bf8eSKris Kennaway sqr(r[4], r[5], a[2]); 86f579bf8eSKris Kennaway sqr(r[6], r[7], a[3]); 876f9291ceSJung-uk Kim a += 4; 886f9291ceSJung-uk Kim r += 8; 896f9291ceSJung-uk Kim n -= 4; 90f579bf8eSKris Kennaway } 911f13597dSJung-uk Kim # endif 926f9291ceSJung-uk Kim while (n) { 931f13597dSJung-uk Kim sqr(r[0], r[1], a[0]); 946f9291ceSJung-uk Kim a++; 956f9291ceSJung-uk Kim r += 2; 966f9291ceSJung-uk Kim n--; 9774664626SKris Kennaway } 9874664626SKris Kennaway } 9974664626SKris Kennaway 1006f9291ceSJung-uk Kim #else /* !(defined(BN_LLONG) || 1016f9291ceSJung-uk Kim * defined(BN_UMULT_HIGH)) */ 10274664626SKris Kennaway 1036f9291ceSJung-uk Kim BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 1046f9291ceSJung-uk Kim BN_ULONG w) 10574664626SKris Kennaway { 10674664626SKris Kennaway BN_ULONG c = 0; 10774664626SKris Kennaway BN_ULONG bl, bh; 10874664626SKris Kennaway 109f579bf8eSKris Kennaway assert(num >= 0); 1106f9291ceSJung-uk Kim if (num <= 0) 111e71b7053SJung-uk Kim return (BN_ULONG)0; 11274664626SKris Kennaway 11374664626SKris Kennaway bl = LBITS(w); 11474664626SKris Kennaway bh = HBITS(w); 11574664626SKris Kennaway 1161f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 1176f9291ceSJung-uk Kim while (num & ~3) { 11874664626SKris Kennaway mul_add(rp[0], ap[0], bl, bh, c); 11974664626SKris Kennaway mul_add(rp[1], ap[1], bl, bh, c); 12074664626SKris Kennaway mul_add(rp[2], ap[2], bl, bh, c); 12174664626SKris Kennaway mul_add(rp[3], ap[3], bl, bh, c); 1226f9291ceSJung-uk Kim ap += 4; 1236f9291ceSJung-uk Kim rp += 4; 1246f9291ceSJung-uk Kim num -= 4; 1251f13597dSJung-uk Kim } 1261f13597dSJung-uk Kim # endif 1276f9291ceSJung-uk Kim while (num) { 1281f13597dSJung-uk Kim mul_add(rp[0], ap[0], bl, bh, c); 1296f9291ceSJung-uk Kim ap++; 1306f9291ceSJung-uk Kim rp++; 1316f9291ceSJung-uk Kim num--; 13274664626SKris Kennaway } 133e71b7053SJung-uk Kim return c; 13474664626SKris Kennaway } 13574664626SKris Kennaway 1365c87c606SMark Murray BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 13774664626SKris Kennaway { 13874664626SKris Kennaway BN_ULONG carry = 0; 13974664626SKris Kennaway BN_ULONG bl, bh; 14074664626SKris Kennaway 141f579bf8eSKris Kennaway assert(num >= 0); 1426f9291ceSJung-uk Kim if (num <= 0) 143e71b7053SJung-uk Kim return (BN_ULONG)0; 14474664626SKris Kennaway 14574664626SKris Kennaway bl = LBITS(w); 14674664626SKris Kennaway bh = HBITS(w); 14774664626SKris Kennaway 1481f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 1496f9291ceSJung-uk Kim while (num & ~3) { 15074664626SKris Kennaway mul(rp[0], ap[0], bl, bh, carry); 15174664626SKris Kennaway mul(rp[1], ap[1], bl, bh, carry); 15274664626SKris Kennaway mul(rp[2], ap[2], bl, bh, carry); 15374664626SKris Kennaway mul(rp[3], ap[3], bl, bh, carry); 1546f9291ceSJung-uk Kim ap += 4; 1556f9291ceSJung-uk Kim rp += 4; 1566f9291ceSJung-uk Kim num -= 4; 1571f13597dSJung-uk Kim } 1581f13597dSJung-uk Kim # endif 1596f9291ceSJung-uk Kim while (num) { 1601f13597dSJung-uk Kim mul(rp[0], ap[0], bl, bh, carry); 1616f9291ceSJung-uk Kim ap++; 1626f9291ceSJung-uk Kim rp++; 1636f9291ceSJung-uk Kim num--; 16474664626SKris Kennaway } 165e71b7053SJung-uk Kim return carry; 16674664626SKris Kennaway } 16774664626SKris Kennaway 1685c87c606SMark Murray void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 16974664626SKris Kennaway { 170f579bf8eSKris Kennaway assert(n >= 0); 1716f9291ceSJung-uk Kim if (n <= 0) 1726f9291ceSJung-uk Kim return; 1731f13597dSJung-uk Kim 1741f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 1756f9291ceSJung-uk Kim while (n & ~3) { 17674664626SKris Kennaway sqr64(r[0], r[1], a[0]); 17774664626SKris Kennaway sqr64(r[2], r[3], a[1]); 17874664626SKris Kennaway sqr64(r[4], r[5], a[2]); 17974664626SKris Kennaway sqr64(r[6], r[7], a[3]); 1806f9291ceSJung-uk Kim a += 4; 1816f9291ceSJung-uk Kim r += 8; 1826f9291ceSJung-uk Kim n -= 4; 1831f13597dSJung-uk Kim } 1841f13597dSJung-uk Kim # endif 1856f9291ceSJung-uk Kim while (n) { 1861f13597dSJung-uk Kim sqr64(r[0], r[1], a[0]); 1876f9291ceSJung-uk Kim a++; 1886f9291ceSJung-uk Kim r += 2; 1896f9291ceSJung-uk Kim n--; 19074664626SKris Kennaway } 19174664626SKris Kennaway } 19274664626SKris Kennaway 1936f9291ceSJung-uk Kim #endif /* !(defined(BN_LLONG) || 1946f9291ceSJung-uk Kim * defined(BN_UMULT_HIGH)) */ 19574664626SKris Kennaway 19674664626SKris Kennaway #if defined(BN_LLONG) && defined(BN_DIV2W) 19774664626SKris Kennaway 19874664626SKris Kennaway BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 19974664626SKris Kennaway { 20074664626SKris Kennaway return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); 20174664626SKris Kennaway } 20274664626SKris Kennaway 20374664626SKris Kennaway #else 20474664626SKris Kennaway 205ddd58736SKris Kennaway /* Divide h,l by d and return the result. */ 20674664626SKris Kennaway /* I need to test this some more :-( */ 20774664626SKris Kennaway BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 20874664626SKris Kennaway { 20974664626SKris Kennaway BN_ULONG dh, dl, q, ret = 0, th, tl, t; 21074664626SKris Kennaway int i, count = 2; 21174664626SKris Kennaway 2126f9291ceSJung-uk Kim if (d == 0) 213e71b7053SJung-uk Kim return BN_MASK2; 21474664626SKris Kennaway 21574664626SKris Kennaway i = BN_num_bits_word(d); 2163b4e3dcbSSimon L. B. Nielsen assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); 217ddd58736SKris Kennaway 21874664626SKris Kennaway i = BN_BITS2 - i; 2196f9291ceSJung-uk Kim if (h >= d) 2206f9291ceSJung-uk Kim h -= d; 22174664626SKris Kennaway 2226f9291ceSJung-uk Kim if (i) { 22374664626SKris Kennaway d <<= i; 22474664626SKris Kennaway h = (h << i) | (l >> (BN_BITS2 - i)); 22574664626SKris Kennaway l <<= i; 22674664626SKris Kennaway } 22774664626SKris Kennaway dh = (d & BN_MASK2h) >> BN_BITS4; 22874664626SKris Kennaway dl = (d & BN_MASK2l); 2296f9291ceSJung-uk Kim for (;;) { 23074664626SKris Kennaway if ((h >> BN_BITS4) == dh) 23174664626SKris Kennaway q = BN_MASK2l; 23274664626SKris Kennaway else 23374664626SKris Kennaway q = h / dh; 23474664626SKris Kennaway 23574664626SKris Kennaway th = q * dh; 23674664626SKris Kennaway tl = dl * q; 2376f9291ceSJung-uk Kim for (;;) { 23874664626SKris Kennaway t = h - th; 23974664626SKris Kennaway if ((t & BN_MASK2h) || 2406f9291ceSJung-uk Kim ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) 24174664626SKris Kennaway break; 24274664626SKris Kennaway q--; 24374664626SKris Kennaway th -= dh; 24474664626SKris Kennaway tl -= dl; 24574664626SKris Kennaway } 24674664626SKris Kennaway t = (tl >> BN_BITS4); 24774664626SKris Kennaway tl = (tl << BN_BITS4) & BN_MASK2h; 24874664626SKris Kennaway th += t; 24974664626SKris Kennaway 2506f9291ceSJung-uk Kim if (l < tl) 2516f9291ceSJung-uk Kim th++; 25274664626SKris Kennaway l -= tl; 2536f9291ceSJung-uk Kim if (h < th) { 25474664626SKris Kennaway h += d; 25574664626SKris Kennaway q--; 25674664626SKris Kennaway } 25774664626SKris Kennaway h -= th; 25874664626SKris Kennaway 2596f9291ceSJung-uk Kim if (--count == 0) 2606f9291ceSJung-uk Kim break; 26174664626SKris Kennaway 26274664626SKris Kennaway ret = q << BN_BITS4; 26374664626SKris Kennaway h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; 26474664626SKris Kennaway l = (l & BN_MASK2l) << BN_BITS4; 26574664626SKris Kennaway } 26674664626SKris Kennaway ret |= q; 267e71b7053SJung-uk Kim return ret; 26874664626SKris Kennaway } 269f579bf8eSKris Kennaway #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ 27074664626SKris Kennaway 27174664626SKris Kennaway #ifdef BN_LLONG 2726f9291ceSJung-uk Kim BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 2736f9291ceSJung-uk Kim int n) 27474664626SKris Kennaway { 27574664626SKris Kennaway BN_ULLONG ll = 0; 27674664626SKris Kennaway 277f579bf8eSKris Kennaway assert(n >= 0); 2786f9291ceSJung-uk Kim if (n <= 0) 279e71b7053SJung-uk Kim return (BN_ULONG)0; 28074664626SKris Kennaway 2811f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 2826f9291ceSJung-uk Kim while (n & ~3) { 28374664626SKris Kennaway ll += (BN_ULLONG) a[0] + b[0]; 28474664626SKris Kennaway r[0] = (BN_ULONG)ll & BN_MASK2; 28574664626SKris Kennaway ll >>= BN_BITS2; 28674664626SKris Kennaway ll += (BN_ULLONG) a[1] + b[1]; 28774664626SKris Kennaway r[1] = (BN_ULONG)ll & BN_MASK2; 28874664626SKris Kennaway ll >>= BN_BITS2; 28974664626SKris Kennaway ll += (BN_ULLONG) a[2] + b[2]; 29074664626SKris Kennaway r[2] = (BN_ULONG)ll & BN_MASK2; 29174664626SKris Kennaway ll >>= BN_BITS2; 29274664626SKris Kennaway ll += (BN_ULLONG) a[3] + b[3]; 29374664626SKris Kennaway r[3] = (BN_ULONG)ll & BN_MASK2; 29474664626SKris Kennaway ll >>= BN_BITS2; 2956f9291ceSJung-uk Kim a += 4; 2966f9291ceSJung-uk Kim b += 4; 2976f9291ceSJung-uk Kim r += 4; 2986f9291ceSJung-uk Kim n -= 4; 2991f13597dSJung-uk Kim } 3001f13597dSJung-uk Kim # endif 3016f9291ceSJung-uk Kim while (n) { 3021f13597dSJung-uk Kim ll += (BN_ULLONG) a[0] + b[0]; 3031f13597dSJung-uk Kim r[0] = (BN_ULONG)ll & BN_MASK2; 3041f13597dSJung-uk Kim ll >>= BN_BITS2; 3056f9291ceSJung-uk Kim a++; 3066f9291ceSJung-uk Kim b++; 3076f9291ceSJung-uk Kim r++; 3086f9291ceSJung-uk Kim n--; 30974664626SKris Kennaway } 310e71b7053SJung-uk Kim return (BN_ULONG)ll; 31174664626SKris Kennaway } 312f579bf8eSKris Kennaway #else /* !BN_LLONG */ 3136f9291ceSJung-uk Kim BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 3146f9291ceSJung-uk Kim int n) 31574664626SKris Kennaway { 31674664626SKris Kennaway BN_ULONG c, l, t; 31774664626SKris Kennaway 318f579bf8eSKris Kennaway assert(n >= 0); 3196f9291ceSJung-uk Kim if (n <= 0) 320e71b7053SJung-uk Kim return (BN_ULONG)0; 32174664626SKris Kennaway 32274664626SKris Kennaway c = 0; 3231f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 3246f9291ceSJung-uk Kim while (n & ~3) { 32574664626SKris Kennaway t = a[0]; 32674664626SKris Kennaway t = (t + c) & BN_MASK2; 32774664626SKris Kennaway c = (t < c); 32874664626SKris Kennaway l = (t + b[0]) & BN_MASK2; 32974664626SKris Kennaway c += (l < t); 33074664626SKris Kennaway r[0] = l; 33174664626SKris Kennaway t = a[1]; 33274664626SKris Kennaway t = (t + c) & BN_MASK2; 33374664626SKris Kennaway c = (t < c); 33474664626SKris Kennaway l = (t + b[1]) & BN_MASK2; 33574664626SKris Kennaway c += (l < t); 33674664626SKris Kennaway r[1] = l; 33774664626SKris Kennaway t = a[2]; 33874664626SKris Kennaway t = (t + c) & BN_MASK2; 33974664626SKris Kennaway c = (t < c); 34074664626SKris Kennaway l = (t + b[2]) & BN_MASK2; 34174664626SKris Kennaway c += (l < t); 34274664626SKris Kennaway r[2] = l; 34374664626SKris Kennaway t = a[3]; 34474664626SKris Kennaway t = (t + c) & BN_MASK2; 34574664626SKris Kennaway c = (t < c); 34674664626SKris Kennaway l = (t + b[3]) & BN_MASK2; 34774664626SKris Kennaway c += (l < t); 34874664626SKris Kennaway r[3] = l; 3496f9291ceSJung-uk Kim a += 4; 3506f9291ceSJung-uk Kim b += 4; 3516f9291ceSJung-uk Kim r += 4; 3526f9291ceSJung-uk Kim n -= 4; 3531f13597dSJung-uk Kim } 3541f13597dSJung-uk Kim # endif 3556f9291ceSJung-uk Kim while (n) { 3561f13597dSJung-uk Kim t = a[0]; 3571f13597dSJung-uk Kim t = (t + c) & BN_MASK2; 3581f13597dSJung-uk Kim c = (t < c); 3591f13597dSJung-uk Kim l = (t + b[0]) & BN_MASK2; 3601f13597dSJung-uk Kim c += (l < t); 3611f13597dSJung-uk Kim r[0] = l; 3626f9291ceSJung-uk Kim a++; 3636f9291ceSJung-uk Kim b++; 3646f9291ceSJung-uk Kim r++; 3656f9291ceSJung-uk Kim n--; 36674664626SKris Kennaway } 367e71b7053SJung-uk Kim return (BN_ULONG)c; 36874664626SKris Kennaway } 369f579bf8eSKris Kennaway #endif /* !BN_LLONG */ 37074664626SKris Kennaway 3716f9291ceSJung-uk Kim BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 3726f9291ceSJung-uk Kim int n) 37374664626SKris Kennaway { 37474664626SKris Kennaway BN_ULONG t1, t2; 37574664626SKris Kennaway int c = 0; 37674664626SKris Kennaway 377f579bf8eSKris Kennaway assert(n >= 0); 3786f9291ceSJung-uk Kim if (n <= 0) 379e71b7053SJung-uk Kim return (BN_ULONG)0; 38074664626SKris Kennaway 3811f13597dSJung-uk Kim #ifndef OPENSSL_SMALL_FOOTPRINT 3826f9291ceSJung-uk Kim while (n & ~3) { 3836f9291ceSJung-uk Kim t1 = a[0]; 3848f1ef87aSJung-uk Kim t2 = (t1 - c) & BN_MASK2; 3858f1ef87aSJung-uk Kim c = (t2 > t1); 3868f1ef87aSJung-uk Kim t1 = b[0]; 3878f1ef87aSJung-uk Kim t1 = (t2 - t1) & BN_MASK2; 3888f1ef87aSJung-uk Kim r[0] = t1; 3898f1ef87aSJung-uk Kim c += (t1 > t2); 3906f9291ceSJung-uk Kim t1 = a[1]; 3918f1ef87aSJung-uk Kim t2 = (t1 - c) & BN_MASK2; 3928f1ef87aSJung-uk Kim c = (t2 > t1); 3938f1ef87aSJung-uk Kim t1 = b[1]; 3948f1ef87aSJung-uk Kim t1 = (t2 - t1) & BN_MASK2; 3958f1ef87aSJung-uk Kim r[1] = t1; 3968f1ef87aSJung-uk Kim c += (t1 > t2); 3976f9291ceSJung-uk Kim t1 = a[2]; 3988f1ef87aSJung-uk Kim t2 = (t1 - c) & BN_MASK2; 3998f1ef87aSJung-uk Kim c = (t2 > t1); 4008f1ef87aSJung-uk Kim t1 = b[2]; 4018f1ef87aSJung-uk Kim t1 = (t2 - t1) & BN_MASK2; 4028f1ef87aSJung-uk Kim r[2] = t1; 4038f1ef87aSJung-uk Kim c += (t1 > t2); 4046f9291ceSJung-uk Kim t1 = a[3]; 4058f1ef87aSJung-uk Kim t2 = (t1 - c) & BN_MASK2; 4068f1ef87aSJung-uk Kim c = (t2 > t1); 4078f1ef87aSJung-uk Kim t1 = b[3]; 4088f1ef87aSJung-uk Kim t1 = (t2 - t1) & BN_MASK2; 4098f1ef87aSJung-uk Kim r[3] = t1; 4108f1ef87aSJung-uk Kim c += (t1 > t2); 4116f9291ceSJung-uk Kim a += 4; 4126f9291ceSJung-uk Kim b += 4; 4136f9291ceSJung-uk Kim r += 4; 4146f9291ceSJung-uk Kim n -= 4; 4151f13597dSJung-uk Kim } 4161f13597dSJung-uk Kim #endif 4176f9291ceSJung-uk Kim while (n) { 4186f9291ceSJung-uk Kim t1 = a[0]; 4198f1ef87aSJung-uk Kim t2 = (t1 - c) & BN_MASK2; 4208f1ef87aSJung-uk Kim c = (t2 > t1); 4218f1ef87aSJung-uk Kim t1 = b[0]; 4228f1ef87aSJung-uk Kim t1 = (t2 - t1) & BN_MASK2; 4238f1ef87aSJung-uk Kim r[0] = t1; 4248f1ef87aSJung-uk Kim c += (t1 > t2); 4256f9291ceSJung-uk Kim a++; 4266f9291ceSJung-uk Kim b++; 4276f9291ceSJung-uk Kim r++; 4286f9291ceSJung-uk Kim n--; 42974664626SKris Kennaway } 430e71b7053SJung-uk Kim return c; 43174664626SKris Kennaway } 43274664626SKris Kennaway 4331f13597dSJung-uk Kim #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 43474664626SKris Kennaway 43574664626SKris Kennaway # undef bn_mul_comba8 43674664626SKris Kennaway # undef bn_mul_comba4 43774664626SKris Kennaway # undef bn_sqr_comba8 43874664626SKris Kennaway # undef bn_sqr_comba4 43974664626SKris Kennaway 440f579bf8eSKris Kennaway /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 441f579bf8eSKris Kennaway /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 442f579bf8eSKris Kennaway /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 4436f9291ceSJung-uk Kim /* 4446f9291ceSJung-uk Kim * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 4456f9291ceSJung-uk Kim * c=(c2,c1,c0) 4466f9291ceSJung-uk Kim */ 447f579bf8eSKris Kennaway 44874664626SKris Kennaway # ifdef BN_LLONG 4497bded2dbSJung-uk Kim /* 4507bded2dbSJung-uk Kim * Keep in mind that additions to multiplication result can not 4517bded2dbSJung-uk Kim * overflow, because its high half cannot be all-ones. 4527bded2dbSJung-uk Kim */ 4537bded2dbSJung-uk Kim # define mul_add_c(a,b,c0,c1,c2) do { \ 4547bded2dbSJung-uk Kim BN_ULONG hi; \ 4557bded2dbSJung-uk Kim BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 4567bded2dbSJung-uk Kim t += c0; /* no carry */ \ 4577bded2dbSJung-uk Kim c0 = (BN_ULONG)Lw(t); \ 4587bded2dbSJung-uk Kim hi = (BN_ULONG)Hw(t); \ 4598f1ef87aSJung-uk Kim c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \ 4607bded2dbSJung-uk Kim } while(0) 46174664626SKris Kennaway 4627bded2dbSJung-uk Kim # define mul_add_c2(a,b,c0,c1,c2) do { \ 4637bded2dbSJung-uk Kim BN_ULONG hi; \ 4647bded2dbSJung-uk Kim BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 4657bded2dbSJung-uk Kim BN_ULLONG tt = t+c0; /* no carry */ \ 4667bded2dbSJung-uk Kim c0 = (BN_ULONG)Lw(tt); \ 4677bded2dbSJung-uk Kim hi = (BN_ULONG)Hw(tt); \ 4688f1ef87aSJung-uk Kim c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \ 4697bded2dbSJung-uk Kim t += c0; /* no carry */ \ 4707bded2dbSJung-uk Kim c0 = (BN_ULONG)Lw(t); \ 4717bded2dbSJung-uk Kim hi = (BN_ULONG)Hw(t); \ 4728f1ef87aSJung-uk Kim c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \ 4737bded2dbSJung-uk Kim } while(0) 47474664626SKris Kennaway 4757bded2dbSJung-uk Kim # define sqr_add_c(a,i,c0,c1,c2) do { \ 4767bded2dbSJung-uk Kim BN_ULONG hi; \ 4777bded2dbSJung-uk Kim BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ 4787bded2dbSJung-uk Kim t += c0; /* no carry */ \ 4797bded2dbSJung-uk Kim c0 = (BN_ULONG)Lw(t); \ 4807bded2dbSJung-uk Kim hi = (BN_ULONG)Hw(t); \ 4818f1ef87aSJung-uk Kim c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \ 4827bded2dbSJung-uk Kim } while(0) 48374664626SKris Kennaway 48474664626SKris Kennaway # define sqr_add_c2(a,i,j,c0,c1,c2) \ 48574664626SKris Kennaway mul_add_c2((a)[i],(a)[j],c0,c1,c2) 486f579bf8eSKris Kennaway 4873b4e3dcbSSimon L. B. Nielsen # elif defined(BN_UMULT_LOHI) 4887bded2dbSJung-uk Kim /* 4897bded2dbSJung-uk Kim * Keep in mind that additions to hi can not overflow, because 4907bded2dbSJung-uk Kim * the high word of a multiplication result cannot be all-ones. 4917bded2dbSJung-uk Kim */ 4927bded2dbSJung-uk Kim # define mul_add_c(a,b,c0,c1,c2) do { \ 4933b4e3dcbSSimon L. B. Nielsen BN_ULONG ta = (a), tb = (b); \ 4947bded2dbSJung-uk Kim BN_ULONG lo, hi; \ 4957bded2dbSJung-uk Kim BN_UMULT_LOHI(lo,hi,ta,tb); \ 4968f1ef87aSJung-uk Kim c0 += lo; hi += (c0<lo); \ 4978f1ef87aSJung-uk Kim c1 += hi; c2 += (c1<hi); \ 4987bded2dbSJung-uk Kim } while(0) 4993b4e3dcbSSimon L. B. Nielsen 5007bded2dbSJung-uk Kim # define mul_add_c2(a,b,c0,c1,c2) do { \ 5017bded2dbSJung-uk Kim BN_ULONG ta = (a), tb = (b); \ 5027bded2dbSJung-uk Kim BN_ULONG lo, hi, tt; \ 5037bded2dbSJung-uk Kim BN_UMULT_LOHI(lo,hi,ta,tb); \ 5048f1ef87aSJung-uk Kim c0 += lo; tt = hi + (c0<lo); \ 5058f1ef87aSJung-uk Kim c1 += tt; c2 += (c1<tt); \ 5068f1ef87aSJung-uk Kim c0 += lo; hi += (c0<lo); \ 5078f1ef87aSJung-uk Kim c1 += hi; c2 += (c1<hi); \ 5087bded2dbSJung-uk Kim } while(0) 5093b4e3dcbSSimon L. B. Nielsen 5107bded2dbSJung-uk Kim # define sqr_add_c(a,i,c0,c1,c2) do { \ 5113b4e3dcbSSimon L. B. Nielsen BN_ULONG ta = (a)[i]; \ 5127bded2dbSJung-uk Kim BN_ULONG lo, hi; \ 5137bded2dbSJung-uk Kim BN_UMULT_LOHI(lo,hi,ta,ta); \ 5148f1ef87aSJung-uk Kim c0 += lo; hi += (c0<lo); \ 5158f1ef87aSJung-uk Kim c1 += hi; c2 += (c1<hi); \ 5167bded2dbSJung-uk Kim } while(0) 5173b4e3dcbSSimon L. B. Nielsen 5183b4e3dcbSSimon L. B. Nielsen # define sqr_add_c2(a,i,j,c0,c1,c2) \ 5193b4e3dcbSSimon L. B. Nielsen mul_add_c2((a)[i],(a)[j],c0,c1,c2) 5203b4e3dcbSSimon L. B. Nielsen 521f579bf8eSKris Kennaway # elif defined(BN_UMULT_HIGH) 5227bded2dbSJung-uk Kim /* 5237bded2dbSJung-uk Kim * Keep in mind that additions to hi can not overflow, because 5247bded2dbSJung-uk Kim * the high word of a multiplication result cannot be all-ones. 5257bded2dbSJung-uk Kim */ 5267bded2dbSJung-uk Kim # define mul_add_c(a,b,c0,c1,c2) do { \ 527f579bf8eSKris Kennaway BN_ULONG ta = (a), tb = (b); \ 5287bded2dbSJung-uk Kim BN_ULONG lo = ta * tb; \ 5297bded2dbSJung-uk Kim BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 5308f1ef87aSJung-uk Kim c0 += lo; hi += (c0<lo); \ 5318f1ef87aSJung-uk Kim c1 += hi; c2 += (c1<hi); \ 5327bded2dbSJung-uk Kim } while(0) 533f579bf8eSKris Kennaway 5347bded2dbSJung-uk Kim # define mul_add_c2(a,b,c0,c1,c2) do { \ 5357bded2dbSJung-uk Kim BN_ULONG ta = (a), tb = (b), tt; \ 5367bded2dbSJung-uk Kim BN_ULONG lo = ta * tb; \ 5377bded2dbSJung-uk Kim BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 5388f1ef87aSJung-uk Kim c0 += lo; tt = hi + (c0<lo); \ 5398f1ef87aSJung-uk Kim c1 += tt; c2 += (c1<tt); \ 5408f1ef87aSJung-uk Kim c0 += lo; hi += (c0<lo); \ 5418f1ef87aSJung-uk Kim c1 += hi; c2 += (c1<hi); \ 5427bded2dbSJung-uk Kim } while(0) 543f579bf8eSKris Kennaway 5447bded2dbSJung-uk Kim # define sqr_add_c(a,i,c0,c1,c2) do { \ 545f579bf8eSKris Kennaway BN_ULONG ta = (a)[i]; \ 5467bded2dbSJung-uk Kim BN_ULONG lo = ta * ta; \ 5477bded2dbSJung-uk Kim BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ 5488f1ef87aSJung-uk Kim c0 += lo; hi += (c0<lo); \ 5498f1ef87aSJung-uk Kim c1 += hi; c2 += (c1<hi); \ 5507bded2dbSJung-uk Kim } while(0) 551f579bf8eSKris Kennaway 552f579bf8eSKris Kennaway # define sqr_add_c2(a,i,j,c0,c1,c2) \ 553f579bf8eSKris Kennaway mul_add_c2((a)[i],(a)[j],c0,c1,c2) 554f579bf8eSKris Kennaway 555f579bf8eSKris Kennaway # else /* !BN_LLONG */ 5567bded2dbSJung-uk Kim /* 5577bded2dbSJung-uk Kim * Keep in mind that additions to hi can not overflow, because 5587bded2dbSJung-uk Kim * the high word of a multiplication result cannot be all-ones. 5597bded2dbSJung-uk Kim */ 5607bded2dbSJung-uk Kim # define mul_add_c(a,b,c0,c1,c2) do { \ 5617bded2dbSJung-uk Kim BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 5627bded2dbSJung-uk Kim BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 5637bded2dbSJung-uk Kim mul64(lo,hi,bl,bh); \ 5648f1ef87aSJung-uk Kim c0 = (c0+lo)&BN_MASK2; hi += (c0<lo); \ 5658f1ef87aSJung-uk Kim c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \ 5667bded2dbSJung-uk Kim } while(0) 56774664626SKris Kennaway 5687bded2dbSJung-uk Kim # define mul_add_c2(a,b,c0,c1,c2) do { \ 5697bded2dbSJung-uk Kim BN_ULONG tt; \ 5707bded2dbSJung-uk Kim BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 5717bded2dbSJung-uk Kim BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 5727bded2dbSJung-uk Kim mul64(lo,hi,bl,bh); \ 5737bded2dbSJung-uk Kim tt = hi; \ 5748f1ef87aSJung-uk Kim c0 = (c0+lo)&BN_MASK2; tt += (c0<lo); \ 5758f1ef87aSJung-uk Kim c1 = (c1+tt)&BN_MASK2; c2 += (c1<tt); \ 5768f1ef87aSJung-uk Kim c0 = (c0+lo)&BN_MASK2; hi += (c0<lo); \ 5778f1ef87aSJung-uk Kim c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \ 5787bded2dbSJung-uk Kim } while(0) 57974664626SKris Kennaway 5807bded2dbSJung-uk Kim # define sqr_add_c(a,i,c0,c1,c2) do { \ 5817bded2dbSJung-uk Kim BN_ULONG lo, hi; \ 5827bded2dbSJung-uk Kim sqr64(lo,hi,(a)[i]); \ 5838f1ef87aSJung-uk Kim c0 = (c0+lo)&BN_MASK2; hi += (c0<lo); \ 5848f1ef87aSJung-uk Kim c1 = (c1+hi)&BN_MASK2; c2 += (c1<hi); \ 5857bded2dbSJung-uk Kim } while(0) 58674664626SKris Kennaway 58774664626SKris Kennaway # define sqr_add_c2(a,i,j,c0,c1,c2) \ 58874664626SKris Kennaway mul_add_c2((a)[i],(a)[j],c0,c1,c2) 589f579bf8eSKris Kennaway # endif /* !BN_LLONG */ 59074664626SKris Kennaway 59174664626SKris Kennaway void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 59274664626SKris Kennaway { 59374664626SKris Kennaway BN_ULONG c1, c2, c3; 59474664626SKris Kennaway 59574664626SKris Kennaway c1 = 0; 59674664626SKris Kennaway c2 = 0; 59774664626SKris Kennaway c3 = 0; 59874664626SKris Kennaway mul_add_c(a[0], b[0], c1, c2, c3); 59974664626SKris Kennaway r[0] = c1; 60074664626SKris Kennaway c1 = 0; 60174664626SKris Kennaway mul_add_c(a[0], b[1], c2, c3, c1); 60274664626SKris Kennaway mul_add_c(a[1], b[0], c2, c3, c1); 60374664626SKris Kennaway r[1] = c2; 60474664626SKris Kennaway c2 = 0; 60574664626SKris Kennaway mul_add_c(a[2], b[0], c3, c1, c2); 60674664626SKris Kennaway mul_add_c(a[1], b[1], c3, c1, c2); 60774664626SKris Kennaway mul_add_c(a[0], b[2], c3, c1, c2); 60874664626SKris Kennaway r[2] = c3; 60974664626SKris Kennaway c3 = 0; 61074664626SKris Kennaway mul_add_c(a[0], b[3], c1, c2, c3); 61174664626SKris Kennaway mul_add_c(a[1], b[2], c1, c2, c3); 61274664626SKris Kennaway mul_add_c(a[2], b[1], c1, c2, c3); 61374664626SKris Kennaway mul_add_c(a[3], b[0], c1, c2, c3); 61474664626SKris Kennaway r[3] = c1; 61574664626SKris Kennaway c1 = 0; 61674664626SKris Kennaway mul_add_c(a[4], b[0], c2, c3, c1); 61774664626SKris Kennaway mul_add_c(a[3], b[1], c2, c3, c1); 61874664626SKris Kennaway mul_add_c(a[2], b[2], c2, c3, c1); 61974664626SKris Kennaway mul_add_c(a[1], b[3], c2, c3, c1); 62074664626SKris Kennaway mul_add_c(a[0], b[4], c2, c3, c1); 62174664626SKris Kennaway r[4] = c2; 62274664626SKris Kennaway c2 = 0; 62374664626SKris Kennaway mul_add_c(a[0], b[5], c3, c1, c2); 62474664626SKris Kennaway mul_add_c(a[1], b[4], c3, c1, c2); 62574664626SKris Kennaway mul_add_c(a[2], b[3], c3, c1, c2); 62674664626SKris Kennaway mul_add_c(a[3], b[2], c3, c1, c2); 62774664626SKris Kennaway mul_add_c(a[4], b[1], c3, c1, c2); 62874664626SKris Kennaway mul_add_c(a[5], b[0], c3, c1, c2); 62974664626SKris Kennaway r[5] = c3; 63074664626SKris Kennaway c3 = 0; 63174664626SKris Kennaway mul_add_c(a[6], b[0], c1, c2, c3); 63274664626SKris Kennaway mul_add_c(a[5], b[1], c1, c2, c3); 63374664626SKris Kennaway mul_add_c(a[4], b[2], c1, c2, c3); 63474664626SKris Kennaway mul_add_c(a[3], b[3], c1, c2, c3); 63574664626SKris Kennaway mul_add_c(a[2], b[4], c1, c2, c3); 63674664626SKris Kennaway mul_add_c(a[1], b[5], c1, c2, c3); 63774664626SKris Kennaway mul_add_c(a[0], b[6], c1, c2, c3); 63874664626SKris Kennaway r[6] = c1; 63974664626SKris Kennaway c1 = 0; 64074664626SKris Kennaway mul_add_c(a[0], b[7], c2, c3, c1); 64174664626SKris Kennaway mul_add_c(a[1], b[6], c2, c3, c1); 64274664626SKris Kennaway mul_add_c(a[2], b[5], c2, c3, c1); 64374664626SKris Kennaway mul_add_c(a[3], b[4], c2, c3, c1); 64474664626SKris Kennaway mul_add_c(a[4], b[3], c2, c3, c1); 64574664626SKris Kennaway mul_add_c(a[5], b[2], c2, c3, c1); 64674664626SKris Kennaway mul_add_c(a[6], b[1], c2, c3, c1); 64774664626SKris Kennaway mul_add_c(a[7], b[0], c2, c3, c1); 64874664626SKris Kennaway r[7] = c2; 64974664626SKris Kennaway c2 = 0; 65074664626SKris Kennaway mul_add_c(a[7], b[1], c3, c1, c2); 65174664626SKris Kennaway mul_add_c(a[6], b[2], c3, c1, c2); 65274664626SKris Kennaway mul_add_c(a[5], b[3], c3, c1, c2); 65374664626SKris Kennaway mul_add_c(a[4], b[4], c3, c1, c2); 65474664626SKris Kennaway mul_add_c(a[3], b[5], c3, c1, c2); 65574664626SKris Kennaway mul_add_c(a[2], b[6], c3, c1, c2); 65674664626SKris Kennaway mul_add_c(a[1], b[7], c3, c1, c2); 65774664626SKris Kennaway r[8] = c3; 65874664626SKris Kennaway c3 = 0; 65974664626SKris Kennaway mul_add_c(a[2], b[7], c1, c2, c3); 66074664626SKris Kennaway mul_add_c(a[3], b[6], c1, c2, c3); 66174664626SKris Kennaway mul_add_c(a[4], b[5], c1, c2, c3); 66274664626SKris Kennaway mul_add_c(a[5], b[4], c1, c2, c3); 66374664626SKris Kennaway mul_add_c(a[6], b[3], c1, c2, c3); 66474664626SKris Kennaway mul_add_c(a[7], b[2], c1, c2, c3); 66574664626SKris Kennaway r[9] = c1; 66674664626SKris Kennaway c1 = 0; 66774664626SKris Kennaway mul_add_c(a[7], b[3], c2, c3, c1); 66874664626SKris Kennaway mul_add_c(a[6], b[4], c2, c3, c1); 66974664626SKris Kennaway mul_add_c(a[5], b[5], c2, c3, c1); 67074664626SKris Kennaway mul_add_c(a[4], b[6], c2, c3, c1); 67174664626SKris Kennaway mul_add_c(a[3], b[7], c2, c3, c1); 67274664626SKris Kennaway r[10] = c2; 67374664626SKris Kennaway c2 = 0; 67474664626SKris Kennaway mul_add_c(a[4], b[7], c3, c1, c2); 67574664626SKris Kennaway mul_add_c(a[5], b[6], c3, c1, c2); 67674664626SKris Kennaway mul_add_c(a[6], b[5], c3, c1, c2); 67774664626SKris Kennaway mul_add_c(a[7], b[4], c3, c1, c2); 67874664626SKris Kennaway r[11] = c3; 67974664626SKris Kennaway c3 = 0; 68074664626SKris Kennaway mul_add_c(a[7], b[5], c1, c2, c3); 68174664626SKris Kennaway mul_add_c(a[6], b[6], c1, c2, c3); 68274664626SKris Kennaway mul_add_c(a[5], b[7], c1, c2, c3); 68374664626SKris Kennaway r[12] = c1; 68474664626SKris Kennaway c1 = 0; 68574664626SKris Kennaway mul_add_c(a[6], b[7], c2, c3, c1); 68674664626SKris Kennaway mul_add_c(a[7], b[6], c2, c3, c1); 68774664626SKris Kennaway r[13] = c2; 68874664626SKris Kennaway c2 = 0; 68974664626SKris Kennaway mul_add_c(a[7], b[7], c3, c1, c2); 69074664626SKris Kennaway r[14] = c3; 69174664626SKris Kennaway r[15] = c1; 69274664626SKris Kennaway } 69374664626SKris Kennaway 69474664626SKris Kennaway void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 69574664626SKris Kennaway { 69674664626SKris Kennaway BN_ULONG c1, c2, c3; 69774664626SKris Kennaway 69874664626SKris Kennaway c1 = 0; 69974664626SKris Kennaway c2 = 0; 70074664626SKris Kennaway c3 = 0; 70174664626SKris Kennaway mul_add_c(a[0], b[0], c1, c2, c3); 70274664626SKris Kennaway r[0] = c1; 70374664626SKris Kennaway c1 = 0; 70474664626SKris Kennaway mul_add_c(a[0], b[1], c2, c3, c1); 70574664626SKris Kennaway mul_add_c(a[1], b[0], c2, c3, c1); 70674664626SKris Kennaway r[1] = c2; 70774664626SKris Kennaway c2 = 0; 70874664626SKris Kennaway mul_add_c(a[2], b[0], c3, c1, c2); 70974664626SKris Kennaway mul_add_c(a[1], b[1], c3, c1, c2); 71074664626SKris Kennaway mul_add_c(a[0], b[2], c3, c1, c2); 71174664626SKris Kennaway r[2] = c3; 71274664626SKris Kennaway c3 = 0; 71374664626SKris Kennaway mul_add_c(a[0], b[3], c1, c2, c3); 71474664626SKris Kennaway mul_add_c(a[1], b[2], c1, c2, c3); 71574664626SKris Kennaway mul_add_c(a[2], b[1], c1, c2, c3); 71674664626SKris Kennaway mul_add_c(a[3], b[0], c1, c2, c3); 71774664626SKris Kennaway r[3] = c1; 71874664626SKris Kennaway c1 = 0; 71974664626SKris Kennaway mul_add_c(a[3], b[1], c2, c3, c1); 72074664626SKris Kennaway mul_add_c(a[2], b[2], c2, c3, c1); 72174664626SKris Kennaway mul_add_c(a[1], b[3], c2, c3, c1); 72274664626SKris Kennaway r[4] = c2; 72374664626SKris Kennaway c2 = 0; 72474664626SKris Kennaway mul_add_c(a[2], b[3], c3, c1, c2); 72574664626SKris Kennaway mul_add_c(a[3], b[2], c3, c1, c2); 72674664626SKris Kennaway r[5] = c3; 72774664626SKris Kennaway c3 = 0; 72874664626SKris Kennaway mul_add_c(a[3], b[3], c1, c2, c3); 72974664626SKris Kennaway r[6] = c1; 73074664626SKris Kennaway r[7] = c2; 73174664626SKris Kennaway } 73274664626SKris Kennaway 7335c87c606SMark Murray void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 73474664626SKris Kennaway { 73574664626SKris Kennaway BN_ULONG c1, c2, c3; 73674664626SKris Kennaway 73774664626SKris Kennaway c1 = 0; 73874664626SKris Kennaway c2 = 0; 73974664626SKris Kennaway c3 = 0; 74074664626SKris Kennaway sqr_add_c(a, 0, c1, c2, c3); 74174664626SKris Kennaway r[0] = c1; 74274664626SKris Kennaway c1 = 0; 74374664626SKris Kennaway sqr_add_c2(a, 1, 0, c2, c3, c1); 74474664626SKris Kennaway r[1] = c2; 74574664626SKris Kennaway c2 = 0; 74674664626SKris Kennaway sqr_add_c(a, 1, c3, c1, c2); 74774664626SKris Kennaway sqr_add_c2(a, 2, 0, c3, c1, c2); 74874664626SKris Kennaway r[2] = c3; 74974664626SKris Kennaway c3 = 0; 75074664626SKris Kennaway sqr_add_c2(a, 3, 0, c1, c2, c3); 75174664626SKris Kennaway sqr_add_c2(a, 2, 1, c1, c2, c3); 75274664626SKris Kennaway r[3] = c1; 75374664626SKris Kennaway c1 = 0; 75474664626SKris Kennaway sqr_add_c(a, 2, c2, c3, c1); 75574664626SKris Kennaway sqr_add_c2(a, 3, 1, c2, c3, c1); 75674664626SKris Kennaway sqr_add_c2(a, 4, 0, c2, c3, c1); 75774664626SKris Kennaway r[4] = c2; 75874664626SKris Kennaway c2 = 0; 75974664626SKris Kennaway sqr_add_c2(a, 5, 0, c3, c1, c2); 76074664626SKris Kennaway sqr_add_c2(a, 4, 1, c3, c1, c2); 76174664626SKris Kennaway sqr_add_c2(a, 3, 2, c3, c1, c2); 76274664626SKris Kennaway r[5] = c3; 76374664626SKris Kennaway c3 = 0; 76474664626SKris Kennaway sqr_add_c(a, 3, c1, c2, c3); 76574664626SKris Kennaway sqr_add_c2(a, 4, 2, c1, c2, c3); 76674664626SKris Kennaway sqr_add_c2(a, 5, 1, c1, c2, c3); 76774664626SKris Kennaway sqr_add_c2(a, 6, 0, c1, c2, c3); 76874664626SKris Kennaway r[6] = c1; 76974664626SKris Kennaway c1 = 0; 77074664626SKris Kennaway sqr_add_c2(a, 7, 0, c2, c3, c1); 77174664626SKris Kennaway sqr_add_c2(a, 6, 1, c2, c3, c1); 77274664626SKris Kennaway sqr_add_c2(a, 5, 2, c2, c3, c1); 77374664626SKris Kennaway sqr_add_c2(a, 4, 3, c2, c3, c1); 77474664626SKris Kennaway r[7] = c2; 77574664626SKris Kennaway c2 = 0; 77674664626SKris Kennaway sqr_add_c(a, 4, c3, c1, c2); 77774664626SKris Kennaway sqr_add_c2(a, 5, 3, c3, c1, c2); 77874664626SKris Kennaway sqr_add_c2(a, 6, 2, c3, c1, c2); 77974664626SKris Kennaway sqr_add_c2(a, 7, 1, c3, c1, c2); 78074664626SKris Kennaway r[8] = c3; 78174664626SKris Kennaway c3 = 0; 78274664626SKris Kennaway sqr_add_c2(a, 7, 2, c1, c2, c3); 78374664626SKris Kennaway sqr_add_c2(a, 6, 3, c1, c2, c3); 78474664626SKris Kennaway sqr_add_c2(a, 5, 4, c1, c2, c3); 78574664626SKris Kennaway r[9] = c1; 78674664626SKris Kennaway c1 = 0; 78774664626SKris Kennaway sqr_add_c(a, 5, c2, c3, c1); 78874664626SKris Kennaway sqr_add_c2(a, 6, 4, c2, c3, c1); 78974664626SKris Kennaway sqr_add_c2(a, 7, 3, c2, c3, c1); 79074664626SKris Kennaway r[10] = c2; 79174664626SKris Kennaway c2 = 0; 79274664626SKris Kennaway sqr_add_c2(a, 7, 4, c3, c1, c2); 79374664626SKris Kennaway sqr_add_c2(a, 6, 5, c3, c1, c2); 79474664626SKris Kennaway r[11] = c3; 79574664626SKris Kennaway c3 = 0; 79674664626SKris Kennaway sqr_add_c(a, 6, c1, c2, c3); 79774664626SKris Kennaway sqr_add_c2(a, 7, 5, c1, c2, c3); 79874664626SKris Kennaway r[12] = c1; 79974664626SKris Kennaway c1 = 0; 80074664626SKris Kennaway sqr_add_c2(a, 7, 6, c2, c3, c1); 80174664626SKris Kennaway r[13] = c2; 80274664626SKris Kennaway c2 = 0; 80374664626SKris Kennaway sqr_add_c(a, 7, c3, c1, c2); 80474664626SKris Kennaway r[14] = c3; 80574664626SKris Kennaway r[15] = c1; 80674664626SKris Kennaway } 80774664626SKris Kennaway 8085c87c606SMark Murray void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 80974664626SKris Kennaway { 81074664626SKris Kennaway BN_ULONG c1, c2, c3; 81174664626SKris Kennaway 81274664626SKris Kennaway c1 = 0; 81374664626SKris Kennaway c2 = 0; 81474664626SKris Kennaway c3 = 0; 81574664626SKris Kennaway sqr_add_c(a, 0, c1, c2, c3); 81674664626SKris Kennaway r[0] = c1; 81774664626SKris Kennaway c1 = 0; 81874664626SKris Kennaway sqr_add_c2(a, 1, 0, c2, c3, c1); 81974664626SKris Kennaway r[1] = c2; 82074664626SKris Kennaway c2 = 0; 82174664626SKris Kennaway sqr_add_c(a, 1, c3, c1, c2); 82274664626SKris Kennaway sqr_add_c2(a, 2, 0, c3, c1, c2); 82374664626SKris Kennaway r[2] = c3; 82474664626SKris Kennaway c3 = 0; 82574664626SKris Kennaway sqr_add_c2(a, 3, 0, c1, c2, c3); 82674664626SKris Kennaway sqr_add_c2(a, 2, 1, c1, c2, c3); 82774664626SKris Kennaway r[3] = c1; 82874664626SKris Kennaway c1 = 0; 82974664626SKris Kennaway sqr_add_c(a, 2, c2, c3, c1); 83074664626SKris Kennaway sqr_add_c2(a, 3, 1, c2, c3, c1); 83174664626SKris Kennaway r[4] = c2; 83274664626SKris Kennaway c2 = 0; 83374664626SKris Kennaway sqr_add_c2(a, 3, 2, c3, c1, c2); 83474664626SKris Kennaway r[5] = c3; 83574664626SKris Kennaway c3 = 0; 83674664626SKris Kennaway sqr_add_c(a, 3, c1, c2, c3); 83774664626SKris Kennaway r[6] = c1; 83874664626SKris Kennaway r[7] = c2; 83974664626SKris Kennaway } 8401f13597dSJung-uk Kim 8411f13597dSJung-uk Kim # ifdef OPENSSL_NO_ASM 8421f13597dSJung-uk Kim # ifdef OPENSSL_BN_ASM_MONT 8431f13597dSJung-uk Kim # include <alloca.h> 8441f13597dSJung-uk Kim /* 8451f13597dSJung-uk Kim * This is essentially reference implementation, which may or may not 8461f13597dSJung-uk Kim * result in performance improvement. E.g. on IA-32 this routine was 8471f13597dSJung-uk Kim * observed to give 40% faster rsa1024 private key operations and 10% 8481f13597dSJung-uk Kim * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only 8491f13597dSJung-uk Kim * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a 8501f13597dSJung-uk Kim * reference implementation, one to be used as starting point for 8511f13597dSJung-uk Kim * platform-specific assembler. Mentioned numbers apply to compiler 8521f13597dSJung-uk Kim * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and 8531f13597dSJung-uk Kim * can vary not only from platform to platform, but even for compiler 8541f13597dSJung-uk Kim * versions. Assembler vs. assembler improvement coefficients can 8551f13597dSJung-uk Kim * [and are known to] differ and are to be documented elsewhere. 8561f13597dSJung-uk Kim */ 8576f9291ceSJung-uk Kim int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 8586f9291ceSJung-uk Kim const BN_ULONG *np, const BN_ULONG *n0p, int num) 8591f13597dSJung-uk Kim { 8601f13597dSJung-uk Kim BN_ULONG c0, c1, ml, *tp, n0; 8611f13597dSJung-uk Kim # ifdef mul64 8621f13597dSJung-uk Kim BN_ULONG mh; 8631f13597dSJung-uk Kim # endif 8641f13597dSJung-uk Kim volatile BN_ULONG *vp; 8651f13597dSJung-uk Kim int i = 0, j; 8661f13597dSJung-uk Kim 8676f9291ceSJung-uk Kim # if 0 /* template for platform-specific 8686f9291ceSJung-uk Kim * implementation */ 8696f9291ceSJung-uk Kim if (ap == bp) 8706f9291ceSJung-uk Kim return bn_sqr_mont(rp, ap, np, n0p, num); 8711f13597dSJung-uk Kim # endif 8721f13597dSJung-uk Kim vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 8731f13597dSJung-uk Kim 8741f13597dSJung-uk Kim n0 = *n0p; 8751f13597dSJung-uk Kim 8761f13597dSJung-uk Kim c0 = 0; 8771f13597dSJung-uk Kim ml = bp[0]; 8781f13597dSJung-uk Kim # ifdef mul64 8791f13597dSJung-uk Kim mh = HBITS(ml); 8801f13597dSJung-uk Kim ml = LBITS(ml); 8811f13597dSJung-uk Kim for (j = 0; j < num; ++j) 8821f13597dSJung-uk Kim mul(tp[j], ap[j], ml, mh, c0); 8831f13597dSJung-uk Kim # else 8841f13597dSJung-uk Kim for (j = 0; j < num; ++j) 8851f13597dSJung-uk Kim mul(tp[j], ap[j], ml, c0); 8861f13597dSJung-uk Kim # endif 8871f13597dSJung-uk Kim 8881f13597dSJung-uk Kim tp[num] = c0; 8891f13597dSJung-uk Kim tp[num + 1] = 0; 8901f13597dSJung-uk Kim goto enter; 8911f13597dSJung-uk Kim 8926f9291ceSJung-uk Kim for (i = 0; i < num; i++) { 8931f13597dSJung-uk Kim c0 = 0; 8941f13597dSJung-uk Kim ml = bp[i]; 8951f13597dSJung-uk Kim # ifdef mul64 8961f13597dSJung-uk Kim mh = HBITS(ml); 8971f13597dSJung-uk Kim ml = LBITS(ml); 8981f13597dSJung-uk Kim for (j = 0; j < num; ++j) 8991f13597dSJung-uk Kim mul_add(tp[j], ap[j], ml, mh, c0); 9001f13597dSJung-uk Kim # else 9011f13597dSJung-uk Kim for (j = 0; j < num; ++j) 9021f13597dSJung-uk Kim mul_add(tp[j], ap[j], ml, c0); 9031f13597dSJung-uk Kim # endif 9041f13597dSJung-uk Kim c1 = (tp[num] + c0) & BN_MASK2; 9051f13597dSJung-uk Kim tp[num] = c1; 9061f13597dSJung-uk Kim tp[num + 1] = (c1 < c0 ? 1 : 0); 9071f13597dSJung-uk Kim enter: 9081f13597dSJung-uk Kim c1 = tp[0]; 9091f13597dSJung-uk Kim ml = (c1 * n0) & BN_MASK2; 9101f13597dSJung-uk Kim c0 = 0; 9111f13597dSJung-uk Kim # ifdef mul64 9121f13597dSJung-uk Kim mh = HBITS(ml); 9131f13597dSJung-uk Kim ml = LBITS(ml); 9141f13597dSJung-uk Kim mul_add(c1, np[0], ml, mh, c0); 9151f13597dSJung-uk Kim # else 9161f13597dSJung-uk Kim mul_add(c1, ml, np[0], c0); 9171f13597dSJung-uk Kim # endif 9186f9291ceSJung-uk Kim for (j = 1; j < num; j++) { 9191f13597dSJung-uk Kim c1 = tp[j]; 9201f13597dSJung-uk Kim # ifdef mul64 9211f13597dSJung-uk Kim mul_add(c1, np[j], ml, mh, c0); 9221f13597dSJung-uk Kim # else 9231f13597dSJung-uk Kim mul_add(c1, ml, np[j], c0); 9241f13597dSJung-uk Kim # endif 9251f13597dSJung-uk Kim tp[j - 1] = c1 & BN_MASK2; 9261f13597dSJung-uk Kim } 9271f13597dSJung-uk Kim c1 = (tp[num] + c0) & BN_MASK2; 9281f13597dSJung-uk Kim tp[num - 1] = c1; 9291f13597dSJung-uk Kim tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0); 9301f13597dSJung-uk Kim } 9311f13597dSJung-uk Kim 9326f9291ceSJung-uk Kim if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 9331f13597dSJung-uk Kim c0 = bn_sub_words(rp, tp, np, num); 9346f9291ceSJung-uk Kim if (tp[num] != 0 || c0 == 0) { 9356f9291ceSJung-uk Kim for (i = 0; i < num + 2; i++) 9366f9291ceSJung-uk Kim vp[i] = 0; 9371f13597dSJung-uk Kim return 1; 9381f13597dSJung-uk Kim } 9391f13597dSJung-uk Kim } 9406f9291ceSJung-uk Kim for (i = 0; i < num; i++) 9416f9291ceSJung-uk Kim rp[i] = tp[i], vp[i] = 0; 9421f13597dSJung-uk Kim vp[num] = 0; 9431f13597dSJung-uk Kim vp[num + 1] = 0; 9441f13597dSJung-uk Kim return 1; 9451f13597dSJung-uk Kim } 9461f13597dSJung-uk Kim # else 9471f13597dSJung-uk Kim /* 9481f13597dSJung-uk Kim * Return value of 0 indicates that multiplication/convolution was not 9491f13597dSJung-uk Kim * performed to signal the caller to fall down to alternative/original 9501f13597dSJung-uk Kim * code-path. 9511f13597dSJung-uk Kim */ 9526f9291ceSJung-uk Kim int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 9536f9291ceSJung-uk Kim const BN_ULONG *np, const BN_ULONG *n0, int num) 9546f9291ceSJung-uk Kim { 9556f9291ceSJung-uk Kim return 0; 9566f9291ceSJung-uk Kim } 9571f13597dSJung-uk Kim # endif /* OPENSSL_BN_ASM_MONT */ 9581f13597dSJung-uk Kim # endif 9591f13597dSJung-uk Kim 960f579bf8eSKris Kennaway #else /* !BN_MUL_COMBA */ 96174664626SKris Kennaway 96274664626SKris Kennaway /* hmm... is it faster just to do a multiply? */ 96374664626SKris Kennaway # undef bn_sqr_comba4 964e71b7053SJung-uk Kim # undef bn_sqr_comba8 9651f13597dSJung-uk Kim void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 96674664626SKris Kennaway { 96774664626SKris Kennaway BN_ULONG t[8]; 96874664626SKris Kennaway bn_sqr_normal(r, a, 4, t); 96974664626SKris Kennaway } 97074664626SKris Kennaway 9711f13597dSJung-uk Kim void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 97274664626SKris Kennaway { 97374664626SKris Kennaway BN_ULONG t[16]; 97474664626SKris Kennaway bn_sqr_normal(r, a, 8, t); 97574664626SKris Kennaway } 97674664626SKris Kennaway 97774664626SKris Kennaway void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 97874664626SKris Kennaway { 97974664626SKris Kennaway r[4] = bn_mul_words(&(r[0]), a, 4, b[0]); 98074664626SKris Kennaway r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]); 98174664626SKris Kennaway r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]); 98274664626SKris Kennaway r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]); 98374664626SKris Kennaway } 98474664626SKris Kennaway 98574664626SKris Kennaway void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 98674664626SKris Kennaway { 98774664626SKris Kennaway r[8] = bn_mul_words(&(r[0]), a, 8, b[0]); 98874664626SKris Kennaway r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]); 98974664626SKris Kennaway r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]); 99074664626SKris Kennaway r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]); 99174664626SKris Kennaway r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]); 99274664626SKris Kennaway r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]); 99374664626SKris Kennaway r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]); 99474664626SKris Kennaway r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]); 99574664626SKris Kennaway } 99674664626SKris Kennaway 9971f13597dSJung-uk Kim # ifdef OPENSSL_NO_ASM 9981f13597dSJung-uk Kim # ifdef OPENSSL_BN_ASM_MONT 9991f13597dSJung-uk Kim # include <alloca.h> 10006f9291ceSJung-uk Kim int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 10016f9291ceSJung-uk Kim const BN_ULONG *np, const BN_ULONG *n0p, int num) 10021f13597dSJung-uk Kim { 10031f13597dSJung-uk Kim BN_ULONG c0, c1, *tp, n0 = *n0p; 10041f13597dSJung-uk Kim volatile BN_ULONG *vp; 10051f13597dSJung-uk Kim int i = 0, j; 10061f13597dSJung-uk Kim 10071f13597dSJung-uk Kim vp = tp = alloca((num + 2) * sizeof(BN_ULONG)); 10081f13597dSJung-uk Kim 10096f9291ceSJung-uk Kim for (i = 0; i <= num; i++) 10106f9291ceSJung-uk Kim tp[i] = 0; 10111f13597dSJung-uk Kim 10126f9291ceSJung-uk Kim for (i = 0; i < num; i++) { 10131f13597dSJung-uk Kim c0 = bn_mul_add_words(tp, ap, num, bp[i]); 10141f13597dSJung-uk Kim c1 = (tp[num] + c0) & BN_MASK2; 10151f13597dSJung-uk Kim tp[num] = c1; 10161f13597dSJung-uk Kim tp[num + 1] = (c1 < c0 ? 1 : 0); 10171f13597dSJung-uk Kim 10181f13597dSJung-uk Kim c0 = bn_mul_add_words(tp, np, num, tp[0] * n0); 10191f13597dSJung-uk Kim c1 = (tp[num] + c0) & BN_MASK2; 10201f13597dSJung-uk Kim tp[num] = c1; 10211f13597dSJung-uk Kim tp[num + 1] += (c1 < c0 ? 1 : 0); 10226f9291ceSJung-uk Kim for (j = 0; j <= num; j++) 10236f9291ceSJung-uk Kim tp[j] = tp[j + 1]; 10241f13597dSJung-uk Kim } 10251f13597dSJung-uk Kim 10266f9291ceSJung-uk Kim if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) { 10271f13597dSJung-uk Kim c0 = bn_sub_words(rp, tp, np, num); 10286f9291ceSJung-uk Kim if (tp[num] != 0 || c0 == 0) { 10296f9291ceSJung-uk Kim for (i = 0; i < num + 2; i++) 10306f9291ceSJung-uk Kim vp[i] = 0; 10311f13597dSJung-uk Kim return 1; 10321f13597dSJung-uk Kim } 10331f13597dSJung-uk Kim } 10346f9291ceSJung-uk Kim for (i = 0; i < num; i++) 10356f9291ceSJung-uk Kim rp[i] = tp[i], vp[i] = 0; 10361f13597dSJung-uk Kim vp[num] = 0; 10371f13597dSJung-uk Kim vp[num + 1] = 0; 10381f13597dSJung-uk Kim return 1; 10391f13597dSJung-uk Kim } 10401f13597dSJung-uk Kim # else 10416f9291ceSJung-uk Kim int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, 10426f9291ceSJung-uk Kim const BN_ULONG *np, const BN_ULONG *n0, int num) 10436f9291ceSJung-uk Kim { 10446f9291ceSJung-uk Kim return 0; 10456f9291ceSJung-uk Kim } 10461f13597dSJung-uk Kim # endif /* OPENSSL_BN_ASM_MONT */ 10471f13597dSJung-uk Kim # endif 10481f13597dSJung-uk Kim 1049f579bf8eSKris Kennaway #endif /* !BN_MUL_COMBA */ 1050