1e71b7053SJung-uk Kim /* 2e71b7053SJung-uk Kim * Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 374664626SKris Kennaway * 4e71b7053SJung-uk Kim * Licensed under the OpenSSL license (the "License"). You may not use 5e71b7053SJung-uk Kim * this file except in compliance with the License. You can obtain a copy 6e71b7053SJung-uk Kim * in the file LICENSE in the source distribution or at 7e71b7053SJung-uk Kim * https://www.openssl.org/source/license.html 874664626SKris Kennaway */ 974664626SKris Kennaway 10f579bf8eSKris Kennaway #include <assert.h> 11e71b7053SJung-uk Kim #include <openssl/crypto.h> 12e71b7053SJung-uk Kim #include "internal/cryptlib.h" 1374664626SKris Kennaway #include "bn_lcl.h" 1474664626SKris Kennaway 15f579bf8eSKris Kennaway #if defined(BN_LLONG) || defined(BN_UMULT_HIGH) 1674664626SKris Kennaway 176f9291ceSJung-uk Kim BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 186f9291ceSJung-uk Kim BN_ULONG w) 1974664626SKris Kennaway { 2074664626SKris Kennaway BN_ULONG c1 = 0; 2174664626SKris Kennaway 22f579bf8eSKris Kennaway assert(num >= 0); 236f9291ceSJung-uk Kim if (num <= 0) 24e71b7053SJung-uk Kim return c1; 2574664626SKris Kennaway 261f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 276f9291ceSJung-uk Kim while (num & ~3) { 2874664626SKris Kennaway mul_add(rp[0], ap[0], w, c1); 2974664626SKris Kennaway mul_add(rp[1], ap[1], w, c1); 3074664626SKris Kennaway mul_add(rp[2], ap[2], w, c1); 3174664626SKris Kennaway mul_add(rp[3], ap[3], w, c1); 326f9291ceSJung-uk Kim ap += 4; 336f9291ceSJung-uk Kim rp += 4; 346f9291ceSJung-uk Kim num -= 4; 35f579bf8eSKris Kennaway } 361f13597dSJung-uk Kim # endif 376f9291ceSJung-uk Kim while (num) { 381f13597dSJung-uk Kim mul_add(rp[0], ap[0], w, c1); 396f9291ceSJung-uk Kim ap++; 406f9291ceSJung-uk Kim rp++; 416f9291ceSJung-uk Kim num--; 4274664626SKris Kennaway } 4374664626SKris Kennaway 44e71b7053SJung-uk Kim return c1; 4574664626SKris Kennaway } 4674664626SKris Kennaway 
475c87c606SMark Murray BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 4874664626SKris Kennaway { 4974664626SKris Kennaway BN_ULONG c1 = 0; 5074664626SKris Kennaway 51f579bf8eSKris Kennaway assert(num >= 0); 526f9291ceSJung-uk Kim if (num <= 0) 53e71b7053SJung-uk Kim return c1; 5474664626SKris Kennaway 551f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 566f9291ceSJung-uk Kim while (num & ~3) { 5774664626SKris Kennaway mul(rp[0], ap[0], w, c1); 5874664626SKris Kennaway mul(rp[1], ap[1], w, c1); 5974664626SKris Kennaway mul(rp[2], ap[2], w, c1); 6074664626SKris Kennaway mul(rp[3], ap[3], w, c1); 616f9291ceSJung-uk Kim ap += 4; 626f9291ceSJung-uk Kim rp += 4; 636f9291ceSJung-uk Kim num -= 4; 64f579bf8eSKris Kennaway } 651f13597dSJung-uk Kim # endif 666f9291ceSJung-uk Kim while (num) { 671f13597dSJung-uk Kim mul(rp[0], ap[0], w, c1); 686f9291ceSJung-uk Kim ap++; 696f9291ceSJung-uk Kim rp++; 706f9291ceSJung-uk Kim num--; 7174664626SKris Kennaway } 72e71b7053SJung-uk Kim return c1; 7374664626SKris Kennaway } 7474664626SKris Kennaway 755c87c606SMark Murray void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 7674664626SKris Kennaway { 77f579bf8eSKris Kennaway assert(n >= 0); 786f9291ceSJung-uk Kim if (n <= 0) 796f9291ceSJung-uk Kim return; 801f13597dSJung-uk Kim 811f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 826f9291ceSJung-uk Kim while (n & ~3) { 83f579bf8eSKris Kennaway sqr(r[0], r[1], a[0]); 84f579bf8eSKris Kennaway sqr(r[2], r[3], a[1]); 85f579bf8eSKris Kennaway sqr(r[4], r[5], a[2]); 86f579bf8eSKris Kennaway sqr(r[6], r[7], a[3]); 876f9291ceSJung-uk Kim a += 4; 886f9291ceSJung-uk Kim r += 8; 896f9291ceSJung-uk Kim n -= 4; 90f579bf8eSKris Kennaway } 911f13597dSJung-uk Kim # endif 926f9291ceSJung-uk Kim while (n) { 931f13597dSJung-uk Kim sqr(r[0], r[1], a[0]); 946f9291ceSJung-uk Kim a++; 956f9291ceSJung-uk Kim r += 2; 966f9291ceSJung-uk Kim n--; 9774664626SKris Kennaway } 9874664626SKris Kennaway } 9974664626SKris Kennaway 
1006f9291ceSJung-uk Kim #else /* !(defined(BN_LLONG) || 1016f9291ceSJung-uk Kim * defined(BN_UMULT_HIGH)) */ 10274664626SKris Kennaway 1036f9291ceSJung-uk Kim BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, 1046f9291ceSJung-uk Kim BN_ULONG w) 10574664626SKris Kennaway { 10674664626SKris Kennaway BN_ULONG c = 0; 10774664626SKris Kennaway BN_ULONG bl, bh; 10874664626SKris Kennaway 109f579bf8eSKris Kennaway assert(num >= 0); 1106f9291ceSJung-uk Kim if (num <= 0) 111e71b7053SJung-uk Kim return (BN_ULONG)0; 11274664626SKris Kennaway 11374664626SKris Kennaway bl = LBITS(w); 11474664626SKris Kennaway bh = HBITS(w); 11574664626SKris Kennaway 1161f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 1176f9291ceSJung-uk Kim while (num & ~3) { 11874664626SKris Kennaway mul_add(rp[0], ap[0], bl, bh, c); 11974664626SKris Kennaway mul_add(rp[1], ap[1], bl, bh, c); 12074664626SKris Kennaway mul_add(rp[2], ap[2], bl, bh, c); 12174664626SKris Kennaway mul_add(rp[3], ap[3], bl, bh, c); 1226f9291ceSJung-uk Kim ap += 4; 1236f9291ceSJung-uk Kim rp += 4; 1246f9291ceSJung-uk Kim num -= 4; 1251f13597dSJung-uk Kim } 1261f13597dSJung-uk Kim # endif 1276f9291ceSJung-uk Kim while (num) { 1281f13597dSJung-uk Kim mul_add(rp[0], ap[0], bl, bh, c); 1296f9291ceSJung-uk Kim ap++; 1306f9291ceSJung-uk Kim rp++; 1316f9291ceSJung-uk Kim num--; 13274664626SKris Kennaway } 133e71b7053SJung-uk Kim return c; 13474664626SKris Kennaway } 13574664626SKris Kennaway 1365c87c606SMark Murray BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) 13774664626SKris Kennaway { 13874664626SKris Kennaway BN_ULONG carry = 0; 13974664626SKris Kennaway BN_ULONG bl, bh; 14074664626SKris Kennaway 141f579bf8eSKris Kennaway assert(num >= 0); 1426f9291ceSJung-uk Kim if (num <= 0) 143e71b7053SJung-uk Kim return (BN_ULONG)0; 14474664626SKris Kennaway 14574664626SKris Kennaway bl = LBITS(w); 14674664626SKris Kennaway bh = HBITS(w); 14774664626SKris Kennaway 1481f13597dSJung-uk Kim # 
ifndef OPENSSL_SMALL_FOOTPRINT 1496f9291ceSJung-uk Kim while (num & ~3) { 15074664626SKris Kennaway mul(rp[0], ap[0], bl, bh, carry); 15174664626SKris Kennaway mul(rp[1], ap[1], bl, bh, carry); 15274664626SKris Kennaway mul(rp[2], ap[2], bl, bh, carry); 15374664626SKris Kennaway mul(rp[3], ap[3], bl, bh, carry); 1546f9291ceSJung-uk Kim ap += 4; 1556f9291ceSJung-uk Kim rp += 4; 1566f9291ceSJung-uk Kim num -= 4; 1571f13597dSJung-uk Kim } 1581f13597dSJung-uk Kim # endif 1596f9291ceSJung-uk Kim while (num) { 1601f13597dSJung-uk Kim mul(rp[0], ap[0], bl, bh, carry); 1616f9291ceSJung-uk Kim ap++; 1626f9291ceSJung-uk Kim rp++; 1636f9291ceSJung-uk Kim num--; 16474664626SKris Kennaway } 165e71b7053SJung-uk Kim return carry; 16674664626SKris Kennaway } 16774664626SKris Kennaway 1685c87c606SMark Murray void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) 16974664626SKris Kennaway { 170f579bf8eSKris Kennaway assert(n >= 0); 1716f9291ceSJung-uk Kim if (n <= 0) 1726f9291ceSJung-uk Kim return; 1731f13597dSJung-uk Kim 1741f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 1756f9291ceSJung-uk Kim while (n & ~3) { 17674664626SKris Kennaway sqr64(r[0], r[1], a[0]); 17774664626SKris Kennaway sqr64(r[2], r[3], a[1]); 17874664626SKris Kennaway sqr64(r[4], r[5], a[2]); 17974664626SKris Kennaway sqr64(r[6], r[7], a[3]); 1806f9291ceSJung-uk Kim a += 4; 1816f9291ceSJung-uk Kim r += 8; 1826f9291ceSJung-uk Kim n -= 4; 1831f13597dSJung-uk Kim } 1841f13597dSJung-uk Kim # endif 1856f9291ceSJung-uk Kim while (n) { 1861f13597dSJung-uk Kim sqr64(r[0], r[1], a[0]); 1876f9291ceSJung-uk Kim a++; 1886f9291ceSJung-uk Kim r += 2; 1896f9291ceSJung-uk Kim n--; 19074664626SKris Kennaway } 19174664626SKris Kennaway } 19274664626SKris Kennaway 1936f9291ceSJung-uk Kim #endif /* !(defined(BN_LLONG) || 1946f9291ceSJung-uk Kim * defined(BN_UMULT_HIGH)) */ 19574664626SKris Kennaway 19674664626SKris Kennaway #if defined(BN_LLONG) && defined(BN_DIV2W) 19774664626SKris Kennaway 19874664626SKris Kennaway 
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 19974664626SKris Kennaway { 20074664626SKris Kennaway return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d)); 20174664626SKris Kennaway } 20274664626SKris Kennaway 20374664626SKris Kennaway #else 20474664626SKris Kennaway 205ddd58736SKris Kennaway /* Divide h,l by d and return the result. */ 20674664626SKris Kennaway /* I need to test this some more :-( */ 20774664626SKris Kennaway BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) 20874664626SKris Kennaway { 20974664626SKris Kennaway BN_ULONG dh, dl, q, ret = 0, th, tl, t; 21074664626SKris Kennaway int i, count = 2; 21174664626SKris Kennaway 2126f9291ceSJung-uk Kim if (d == 0) 213e71b7053SJung-uk Kim return BN_MASK2; 21474664626SKris Kennaway 21574664626SKris Kennaway i = BN_num_bits_word(d); 2163b4e3dcbSSimon L. B. Nielsen assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); 217ddd58736SKris Kennaway 21874664626SKris Kennaway i = BN_BITS2 - i; 2196f9291ceSJung-uk Kim if (h >= d) 2206f9291ceSJung-uk Kim h -= d; 22174664626SKris Kennaway 2226f9291ceSJung-uk Kim if (i) { 22374664626SKris Kennaway d <<= i; 22474664626SKris Kennaway h = (h << i) | (l >> (BN_BITS2 - i)); 22574664626SKris Kennaway l <<= i; 22674664626SKris Kennaway } 22774664626SKris Kennaway dh = (d & BN_MASK2h) >> BN_BITS4; 22874664626SKris Kennaway dl = (d & BN_MASK2l); 2296f9291ceSJung-uk Kim for (;;) { 23074664626SKris Kennaway if ((h >> BN_BITS4) == dh) 23174664626SKris Kennaway q = BN_MASK2l; 23274664626SKris Kennaway else 23374664626SKris Kennaway q = h / dh; 23474664626SKris Kennaway 23574664626SKris Kennaway th = q * dh; 23674664626SKris Kennaway tl = dl * q; 2376f9291ceSJung-uk Kim for (;;) { 23874664626SKris Kennaway t = h - th; 23974664626SKris Kennaway if ((t & BN_MASK2h) || 2406f9291ceSJung-uk Kim ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) 24174664626SKris Kennaway break; 24274664626SKris Kennaway q--; 24374664626SKris Kennaway th -= dh; 
24474664626SKris Kennaway tl -= dl; 24574664626SKris Kennaway } 24674664626SKris Kennaway t = (tl >> BN_BITS4); 24774664626SKris Kennaway tl = (tl << BN_BITS4) & BN_MASK2h; 24874664626SKris Kennaway th += t; 24974664626SKris Kennaway 2506f9291ceSJung-uk Kim if (l < tl) 2516f9291ceSJung-uk Kim th++; 25274664626SKris Kennaway l -= tl; 2536f9291ceSJung-uk Kim if (h < th) { 25474664626SKris Kennaway h += d; 25574664626SKris Kennaway q--; 25674664626SKris Kennaway } 25774664626SKris Kennaway h -= th; 25874664626SKris Kennaway 2596f9291ceSJung-uk Kim if (--count == 0) 2606f9291ceSJung-uk Kim break; 26174664626SKris Kennaway 26274664626SKris Kennaway ret = q << BN_BITS4; 26374664626SKris Kennaway h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2; 26474664626SKris Kennaway l = (l & BN_MASK2l) << BN_BITS4; 26574664626SKris Kennaway } 26674664626SKris Kennaway ret |= q; 267e71b7053SJung-uk Kim return ret; 26874664626SKris Kennaway } 269f579bf8eSKris Kennaway #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */ 27074664626SKris Kennaway 27174664626SKris Kennaway #ifdef BN_LLONG 2726f9291ceSJung-uk Kim BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 2736f9291ceSJung-uk Kim int n) 27474664626SKris Kennaway { 27574664626SKris Kennaway BN_ULLONG ll = 0; 27674664626SKris Kennaway 277f579bf8eSKris Kennaway assert(n >= 0); 2786f9291ceSJung-uk Kim if (n <= 0) 279e71b7053SJung-uk Kim return (BN_ULONG)0; 28074664626SKris Kennaway 2811f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 2826f9291ceSJung-uk Kim while (n & ~3) { 28374664626SKris Kennaway ll += (BN_ULLONG) a[0] + b[0]; 28474664626SKris Kennaway r[0] = (BN_ULONG)ll & BN_MASK2; 28574664626SKris Kennaway ll >>= BN_BITS2; 28674664626SKris Kennaway ll += (BN_ULLONG) a[1] + b[1]; 28774664626SKris Kennaway r[1] = (BN_ULONG)ll & BN_MASK2; 28874664626SKris Kennaway ll >>= BN_BITS2; 28974664626SKris Kennaway ll += (BN_ULLONG) a[2] + b[2]; 29074664626SKris Kennaway r[2] = (BN_ULONG)ll & BN_MASK2; 
29174664626SKris Kennaway ll >>= BN_BITS2; 29274664626SKris Kennaway ll += (BN_ULLONG) a[3] + b[3]; 29374664626SKris Kennaway r[3] = (BN_ULONG)ll & BN_MASK2; 29474664626SKris Kennaway ll >>= BN_BITS2; 2956f9291ceSJung-uk Kim a += 4; 2966f9291ceSJung-uk Kim b += 4; 2976f9291ceSJung-uk Kim r += 4; 2986f9291ceSJung-uk Kim n -= 4; 2991f13597dSJung-uk Kim } 3001f13597dSJung-uk Kim # endif 3016f9291ceSJung-uk Kim while (n) { 3021f13597dSJung-uk Kim ll += (BN_ULLONG) a[0] + b[0]; 3031f13597dSJung-uk Kim r[0] = (BN_ULONG)ll & BN_MASK2; 3041f13597dSJung-uk Kim ll >>= BN_BITS2; 3056f9291ceSJung-uk Kim a++; 3066f9291ceSJung-uk Kim b++; 3076f9291ceSJung-uk Kim r++; 3086f9291ceSJung-uk Kim n--; 30974664626SKris Kennaway } 310e71b7053SJung-uk Kim return (BN_ULONG)ll; 31174664626SKris Kennaway } 312f579bf8eSKris Kennaway #else /* !BN_LLONG */ 3136f9291ceSJung-uk Kim BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 3146f9291ceSJung-uk Kim int n) 31574664626SKris Kennaway { 31674664626SKris Kennaway BN_ULONG c, l, t; 31774664626SKris Kennaway 318f579bf8eSKris Kennaway assert(n >= 0); 3196f9291ceSJung-uk Kim if (n <= 0) 320e71b7053SJung-uk Kim return (BN_ULONG)0; 32174664626SKris Kennaway 32274664626SKris Kennaway c = 0; 3231f13597dSJung-uk Kim # ifndef OPENSSL_SMALL_FOOTPRINT 3246f9291ceSJung-uk Kim while (n & ~3) { 32574664626SKris Kennaway t = a[0]; 32674664626SKris Kennaway t = (t + c) & BN_MASK2; 32774664626SKris Kennaway c = (t < c); 32874664626SKris Kennaway l = (t + b[0]) & BN_MASK2; 32974664626SKris Kennaway c += (l < t); 33074664626SKris Kennaway r[0] = l; 33174664626SKris Kennaway t = a[1]; 33274664626SKris Kennaway t = (t + c) & BN_MASK2; 33374664626SKris Kennaway c = (t < c); 33474664626SKris Kennaway l = (t + b[1]) & BN_MASK2; 33574664626SKris Kennaway c += (l < t); 33674664626SKris Kennaway r[1] = l; 33774664626SKris Kennaway t = a[2]; 33874664626SKris Kennaway t = (t + c) & BN_MASK2; 33974664626SKris Kennaway c = (t < c); 34074664626SKris 
Kennaway l = (t + b[2]) & BN_MASK2; 34174664626SKris Kennaway c += (l < t); 34274664626SKris Kennaway r[2] = l; 34374664626SKris Kennaway t = a[3]; 34474664626SKris Kennaway t = (t + c) & BN_MASK2; 34574664626SKris Kennaway c = (t < c); 34674664626SKris Kennaway l = (t + b[3]) & BN_MASK2; 34774664626SKris Kennaway c += (l < t); 34874664626SKris Kennaway r[3] = l; 3496f9291ceSJung-uk Kim a += 4; 3506f9291ceSJung-uk Kim b += 4; 3516f9291ceSJung-uk Kim r += 4; 3526f9291ceSJung-uk Kim n -= 4; 3531f13597dSJung-uk Kim } 3541f13597dSJung-uk Kim # endif 3556f9291ceSJung-uk Kim while (n) { 3561f13597dSJung-uk Kim t = a[0]; 3571f13597dSJung-uk Kim t = (t + c) & BN_MASK2; 3581f13597dSJung-uk Kim c = (t < c); 3591f13597dSJung-uk Kim l = (t + b[0]) & BN_MASK2; 3601f13597dSJung-uk Kim c += (l < t); 3611f13597dSJung-uk Kim r[0] = l; 3626f9291ceSJung-uk Kim a++; 3636f9291ceSJung-uk Kim b++; 3646f9291ceSJung-uk Kim r++; 3656f9291ceSJung-uk Kim n--; 36674664626SKris Kennaway } 367e71b7053SJung-uk Kim return (BN_ULONG)c; 36874664626SKris Kennaway } 369f579bf8eSKris Kennaway #endif /* !BN_LLONG */ 37074664626SKris Kennaway 3716f9291ceSJung-uk Kim BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 3726f9291ceSJung-uk Kim int n) 37374664626SKris Kennaway { 37474664626SKris Kennaway BN_ULONG t1, t2; 37574664626SKris Kennaway int c = 0; 37674664626SKris Kennaway 377f579bf8eSKris Kennaway assert(n >= 0); 3786f9291ceSJung-uk Kim if (n <= 0) 379e71b7053SJung-uk Kim return (BN_ULONG)0; 38074664626SKris Kennaway 3811f13597dSJung-uk Kim #ifndef OPENSSL_SMALL_FOOTPRINT 3826f9291ceSJung-uk Kim while (n & ~3) { 3836f9291ceSJung-uk Kim t1 = a[0]; 3846f9291ceSJung-uk Kim t2 = b[0]; 38574664626SKris Kennaway r[0] = (t1 - t2 - c) & BN_MASK2; 3866f9291ceSJung-uk Kim if (t1 != t2) 3876f9291ceSJung-uk Kim c = (t1 < t2); 3886f9291ceSJung-uk Kim t1 = a[1]; 3896f9291ceSJung-uk Kim t2 = b[1]; 39074664626SKris Kennaway r[1] = (t1 - t2 - c) & BN_MASK2; 3916f9291ceSJung-uk Kim if (t1 != 
t2) 3926f9291ceSJung-uk Kim c = (t1 < t2); 3936f9291ceSJung-uk Kim t1 = a[2]; 3946f9291ceSJung-uk Kim t2 = b[2]; 39574664626SKris Kennaway r[2] = (t1 - t2 - c) & BN_MASK2; 3966f9291ceSJung-uk Kim if (t1 != t2) 3976f9291ceSJung-uk Kim c = (t1 < t2); 3986f9291ceSJung-uk Kim t1 = a[3]; 3996f9291ceSJung-uk Kim t2 = b[3]; 40074664626SKris Kennaway r[3] = (t1 - t2 - c) & BN_MASK2; 4016f9291ceSJung-uk Kim if (t1 != t2) 4026f9291ceSJung-uk Kim c = (t1 < t2); 4036f9291ceSJung-uk Kim a += 4; 4046f9291ceSJung-uk Kim b += 4; 4056f9291ceSJung-uk Kim r += 4; 4066f9291ceSJung-uk Kim n -= 4; 4071f13597dSJung-uk Kim } 4081f13597dSJung-uk Kim #endif 4096f9291ceSJung-uk Kim while (n) { 4106f9291ceSJung-uk Kim t1 = a[0]; 4116f9291ceSJung-uk Kim t2 = b[0]; 4121f13597dSJung-uk Kim r[0] = (t1 - t2 - c) & BN_MASK2; 4136f9291ceSJung-uk Kim if (t1 != t2) 4146f9291ceSJung-uk Kim c = (t1 < t2); 4156f9291ceSJung-uk Kim a++; 4166f9291ceSJung-uk Kim b++; 4176f9291ceSJung-uk Kim r++; 4186f9291ceSJung-uk Kim n--; 41974664626SKris Kennaway } 420e71b7053SJung-uk Kim return c; 42174664626SKris Kennaway } 42274664626SKris Kennaway 4231f13597dSJung-uk Kim #if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT) 42474664626SKris Kennaway 42574664626SKris Kennaway # undef bn_mul_comba8 42674664626SKris Kennaway # undef bn_mul_comba4 42774664626SKris Kennaway # undef bn_sqr_comba8 42874664626SKris Kennaway # undef bn_sqr_comba4 42974664626SKris Kennaway 430f579bf8eSKris Kennaway /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */ 431f579bf8eSKris Kennaway /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */ 432f579bf8eSKris Kennaway /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */ 4336f9291ceSJung-uk Kim /* 4346f9291ceSJung-uk Kim * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number 4356f9291ceSJung-uk Kim * c=(c2,c1,c0) 4366f9291ceSJung-uk Kim */ 437f579bf8eSKris Kennaway 43874664626SKris Kennaway # ifdef 
BN_LLONG 4397bded2dbSJung-uk Kim /* 4407bded2dbSJung-uk Kim * Keep in mind that additions to multiplication result can not 4417bded2dbSJung-uk Kim * overflow, because its high half cannot be all-ones. 4427bded2dbSJung-uk Kim */ 4437bded2dbSJung-uk Kim # define mul_add_c(a,b,c0,c1,c2) do { \ 4447bded2dbSJung-uk Kim BN_ULONG hi; \ 4457bded2dbSJung-uk Kim BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 4467bded2dbSJung-uk Kim t += c0; /* no carry */ \ 4477bded2dbSJung-uk Kim c0 = (BN_ULONG)Lw(t); \ 4487bded2dbSJung-uk Kim hi = (BN_ULONG)Hw(t); \ 4497bded2dbSJung-uk Kim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 4507bded2dbSJung-uk Kim } while(0) 45174664626SKris Kennaway 4527bded2dbSJung-uk Kim # define mul_add_c2(a,b,c0,c1,c2) do { \ 4537bded2dbSJung-uk Kim BN_ULONG hi; \ 4547bded2dbSJung-uk Kim BN_ULLONG t = (BN_ULLONG)(a)*(b); \ 4557bded2dbSJung-uk Kim BN_ULLONG tt = t+c0; /* no carry */ \ 4567bded2dbSJung-uk Kim c0 = (BN_ULONG)Lw(tt); \ 4577bded2dbSJung-uk Kim hi = (BN_ULONG)Hw(tt); \ 4587bded2dbSJung-uk Kim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 4597bded2dbSJung-uk Kim t += c0; /* no carry */ \ 4607bded2dbSJung-uk Kim c0 = (BN_ULONG)Lw(t); \ 4617bded2dbSJung-uk Kim hi = (BN_ULONG)Hw(t); \ 4627bded2dbSJung-uk Kim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 4637bded2dbSJung-uk Kim } while(0) 46474664626SKris Kennaway 4657bded2dbSJung-uk Kim # define sqr_add_c(a,i,c0,c1,c2) do { \ 4667bded2dbSJung-uk Kim BN_ULONG hi; \ 4677bded2dbSJung-uk Kim BN_ULLONG t = (BN_ULLONG)a[i]*a[i]; \ 4687bded2dbSJung-uk Kim t += c0; /* no carry */ \ 4697bded2dbSJung-uk Kim c0 = (BN_ULONG)Lw(t); \ 4707bded2dbSJung-uk Kim hi = (BN_ULONG)Hw(t); \ 4717bded2dbSJung-uk Kim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 4727bded2dbSJung-uk Kim } while(0) 47374664626SKris Kennaway 47474664626SKris Kennaway # define sqr_add_c2(a,i,j,c0,c1,c2) \ 47574664626SKris Kennaway mul_add_c2((a)[i],(a)[j],c0,c1,c2) 476f579bf8eSKris Kennaway 4773b4e3dcbSSimon L. B. 
Nielsen # elif defined(BN_UMULT_LOHI) 4787bded2dbSJung-uk Kim /* 4797bded2dbSJung-uk Kim * Keep in mind that additions to hi can not overflow, because 4807bded2dbSJung-uk Kim * the high word of a multiplication result cannot be all-ones. 4817bded2dbSJung-uk Kim */ 4827bded2dbSJung-uk Kim # define mul_add_c(a,b,c0,c1,c2) do { \ 4833b4e3dcbSSimon L. B. Nielsen BN_ULONG ta = (a), tb = (b); \ 4847bded2dbSJung-uk Kim BN_ULONG lo, hi; \ 4857bded2dbSJung-uk Kim BN_UMULT_LOHI(lo,hi,ta,tb); \ 4867bded2dbSJung-uk Kim c0 += lo; hi += (c0<lo)?1:0; \ 4877bded2dbSJung-uk Kim c1 += hi; c2 += (c1<hi)?1:0; \ 4887bded2dbSJung-uk Kim } while(0) 4893b4e3dcbSSimon L. B. Nielsen 4907bded2dbSJung-uk Kim # define mul_add_c2(a,b,c0,c1,c2) do { \ 4917bded2dbSJung-uk Kim BN_ULONG ta = (a), tb = (b); \ 4927bded2dbSJung-uk Kim BN_ULONG lo, hi, tt; \ 4937bded2dbSJung-uk Kim BN_UMULT_LOHI(lo,hi,ta,tb); \ 4947bded2dbSJung-uk Kim c0 += lo; tt = hi+((c0<lo)?1:0); \ 4957bded2dbSJung-uk Kim c1 += tt; c2 += (c1<tt)?1:0; \ 4967bded2dbSJung-uk Kim c0 += lo; hi += (c0<lo)?1:0; \ 4977bded2dbSJung-uk Kim c1 += hi; c2 += (c1<hi)?1:0; \ 4987bded2dbSJung-uk Kim } while(0) 4993b4e3dcbSSimon L. B. Nielsen 5007bded2dbSJung-uk Kim # define sqr_add_c(a,i,c0,c1,c2) do { \ 5013b4e3dcbSSimon L. B. Nielsen BN_ULONG ta = (a)[i]; \ 5027bded2dbSJung-uk Kim BN_ULONG lo, hi; \ 5037bded2dbSJung-uk Kim BN_UMULT_LOHI(lo,hi,ta,ta); \ 5047bded2dbSJung-uk Kim c0 += lo; hi += (c0<lo)?1:0; \ 5057bded2dbSJung-uk Kim c1 += hi; c2 += (c1<hi)?1:0; \ 5067bded2dbSJung-uk Kim } while(0) 5073b4e3dcbSSimon L. B. Nielsen 5083b4e3dcbSSimon L. B. Nielsen # define sqr_add_c2(a,i,j,c0,c1,c2) \ 5093b4e3dcbSSimon L. B. Nielsen mul_add_c2((a)[i],(a)[j],c0,c1,c2) 5103b4e3dcbSSimon L. B. Nielsen 511f579bf8eSKris Kennaway # elif defined(BN_UMULT_HIGH) 5127bded2dbSJung-uk Kim /* 5137bded2dbSJung-uk Kim * Keep in mind that additions to hi can not overflow, because 5147bded2dbSJung-uk Kim * the high word of a multiplication result cannot be all-ones. 
5157bded2dbSJung-uk Kim */ 5167bded2dbSJung-uk Kim # define mul_add_c(a,b,c0,c1,c2) do { \ 517f579bf8eSKris Kennaway BN_ULONG ta = (a), tb = (b); \ 5187bded2dbSJung-uk Kim BN_ULONG lo = ta * tb; \ 5197bded2dbSJung-uk Kim BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 5207bded2dbSJung-uk Kim c0 += lo; hi += (c0<lo)?1:0; \ 5217bded2dbSJung-uk Kim c1 += hi; c2 += (c1<hi)?1:0; \ 5227bded2dbSJung-uk Kim } while(0) 523f579bf8eSKris Kennaway 5247bded2dbSJung-uk Kim # define mul_add_c2(a,b,c0,c1,c2) do { \ 5257bded2dbSJung-uk Kim BN_ULONG ta = (a), tb = (b), tt; \ 5267bded2dbSJung-uk Kim BN_ULONG lo = ta * tb; \ 5277bded2dbSJung-uk Kim BN_ULONG hi = BN_UMULT_HIGH(ta,tb); \ 5287bded2dbSJung-uk Kim c0 += lo; tt = hi + ((c0<lo)?1:0); \ 5297bded2dbSJung-uk Kim c1 += tt; c2 += (c1<tt)?1:0; \ 5307bded2dbSJung-uk Kim c0 += lo; hi += (c0<lo)?1:0; \ 5317bded2dbSJung-uk Kim c1 += hi; c2 += (c1<hi)?1:0; \ 5327bded2dbSJung-uk Kim } while(0) 533f579bf8eSKris Kennaway 5347bded2dbSJung-uk Kim # define sqr_add_c(a,i,c0,c1,c2) do { \ 535f579bf8eSKris Kennaway BN_ULONG ta = (a)[i]; \ 5367bded2dbSJung-uk Kim BN_ULONG lo = ta * ta; \ 5377bded2dbSJung-uk Kim BN_ULONG hi = BN_UMULT_HIGH(ta,ta); \ 5387bded2dbSJung-uk Kim c0 += lo; hi += (c0<lo)?1:0; \ 5397bded2dbSJung-uk Kim c1 += hi; c2 += (c1<hi)?1:0; \ 5407bded2dbSJung-uk Kim } while(0) 541f579bf8eSKris Kennaway 542f579bf8eSKris Kennaway # define sqr_add_c2(a,i,j,c0,c1,c2) \ 543f579bf8eSKris Kennaway mul_add_c2((a)[i],(a)[j],c0,c1,c2) 544f579bf8eSKris Kennaway 545f579bf8eSKris Kennaway # else /* !BN_LLONG */ 5467bded2dbSJung-uk Kim /* 5477bded2dbSJung-uk Kim * Keep in mind that additions to hi can not overflow, because 5487bded2dbSJung-uk Kim * the high word of a multiplication result cannot be all-ones. 
5497bded2dbSJung-uk Kim */ 5507bded2dbSJung-uk Kim # define mul_add_c(a,b,c0,c1,c2) do { \ 5517bded2dbSJung-uk Kim BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 5527bded2dbSJung-uk Kim BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 5537bded2dbSJung-uk Kim mul64(lo,hi,bl,bh); \ 5547bded2dbSJung-uk Kim c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 5557bded2dbSJung-uk Kim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 5567bded2dbSJung-uk Kim } while(0) 55774664626SKris Kennaway 5587bded2dbSJung-uk Kim # define mul_add_c2(a,b,c0,c1,c2) do { \ 5597bded2dbSJung-uk Kim BN_ULONG tt; \ 5607bded2dbSJung-uk Kim BN_ULONG lo = LBITS(a), hi = HBITS(a); \ 5617bded2dbSJung-uk Kim BN_ULONG bl = LBITS(b), bh = HBITS(b); \ 5627bded2dbSJung-uk Kim mul64(lo,hi,bl,bh); \ 5637bded2dbSJung-uk Kim tt = hi; \ 5647bded2dbSJung-uk Kim c0 = (c0+lo)&BN_MASK2; if (c0<lo) tt++; \ 5657bded2dbSJung-uk Kim c1 = (c1+tt)&BN_MASK2; if (c1<tt) c2++; \ 5667bded2dbSJung-uk Kim c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 5677bded2dbSJung-uk Kim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 5687bded2dbSJung-uk Kim } while(0) 56974664626SKris Kennaway 5707bded2dbSJung-uk Kim # define sqr_add_c(a,i,c0,c1,c2) do { \ 5717bded2dbSJung-uk Kim BN_ULONG lo, hi; \ 5727bded2dbSJung-uk Kim sqr64(lo,hi,(a)[i]); \ 5737bded2dbSJung-uk Kim c0 = (c0+lo)&BN_MASK2; if (c0<lo) hi++; \ 5747bded2dbSJung-uk Kim c1 = (c1+hi)&BN_MASK2; if (c1<hi) c2++; \ 5757bded2dbSJung-uk Kim } while(0) 57674664626SKris Kennaway 57774664626SKris Kennaway # define sqr_add_c2(a,i,j,c0,c1,c2) \ 57874664626SKris Kennaway mul_add_c2((a)[i],(a)[j],c0,c1,c2) 579f579bf8eSKris Kennaway # endif /* !BN_LLONG */ 58074664626SKris Kennaway 58174664626SKris Kennaway void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) 58274664626SKris Kennaway { 58374664626SKris Kennaway BN_ULONG c1, c2, c3; 58474664626SKris Kennaway 58574664626SKris Kennaway c1 = 0; 58674664626SKris Kennaway c2 = 0; 58774664626SKris Kennaway c3 = 0; 58874664626SKris Kennaway mul_add_c(a[0], b[0], c1, c2, c3); 
58974664626SKris Kennaway r[0] = c1; 59074664626SKris Kennaway c1 = 0; 59174664626SKris Kennaway mul_add_c(a[0], b[1], c2, c3, c1); 59274664626SKris Kennaway mul_add_c(a[1], b[0], c2, c3, c1); 59374664626SKris Kennaway r[1] = c2; 59474664626SKris Kennaway c2 = 0; 59574664626SKris Kennaway mul_add_c(a[2], b[0], c3, c1, c2); 59674664626SKris Kennaway mul_add_c(a[1], b[1], c3, c1, c2); 59774664626SKris Kennaway mul_add_c(a[0], b[2], c3, c1, c2); 59874664626SKris Kennaway r[2] = c3; 59974664626SKris Kennaway c3 = 0; 60074664626SKris Kennaway mul_add_c(a[0], b[3], c1, c2, c3); 60174664626SKris Kennaway mul_add_c(a[1], b[2], c1, c2, c3); 60274664626SKris Kennaway mul_add_c(a[2], b[1], c1, c2, c3); 60374664626SKris Kennaway mul_add_c(a[3], b[0], c1, c2, c3); 60474664626SKris Kennaway r[3] = c1; 60574664626SKris Kennaway c1 = 0; 60674664626SKris Kennaway mul_add_c(a[4], b[0], c2, c3, c1); 60774664626SKris Kennaway mul_add_c(a[3], b[1], c2, c3, c1); 60874664626SKris Kennaway mul_add_c(a[2], b[2], c2, c3, c1); 60974664626SKris Kennaway mul_add_c(a[1], b[3], c2, c3, c1); 61074664626SKris Kennaway mul_add_c(a[0], b[4], c2, c3, c1); 61174664626SKris Kennaway r[4] = c2; 61274664626SKris Kennaway c2 = 0; 61374664626SKris Kennaway mul_add_c(a[0], b[5], c3, c1, c2); 61474664626SKris Kennaway mul_add_c(a[1], b[4], c3, c1, c2); 61574664626SKris Kennaway mul_add_c(a[2], b[3], c3, c1, c2); 61674664626SKris Kennaway mul_add_c(a[3], b[2], c3, c1, c2); 61774664626SKris Kennaway mul_add_c(a[4], b[1], c3, c1, c2); 61874664626SKris Kennaway mul_add_c(a[5], b[0], c3, c1, c2); 61974664626SKris Kennaway r[5] = c3; 62074664626SKris Kennaway c3 = 0; 62174664626SKris Kennaway mul_add_c(a[6], b[0], c1, c2, c3); 62274664626SKris Kennaway mul_add_c(a[5], b[1], c1, c2, c3); 62374664626SKris Kennaway mul_add_c(a[4], b[2], c1, c2, c3); 62474664626SKris Kennaway mul_add_c(a[3], b[3], c1, c2, c3); 62574664626SKris Kennaway mul_add_c(a[2], b[4], c1, c2, c3); 62674664626SKris Kennaway mul_add_c(a[1], b[5], 
c1, c2, c3); 62774664626SKris Kennaway mul_add_c(a[0], b[6], c1, c2, c3); 62874664626SKris Kennaway r[6] = c1; 62974664626SKris Kennaway c1 = 0; 63074664626SKris Kennaway mul_add_c(a[0], b[7], c2, c3, c1); 63174664626SKris Kennaway mul_add_c(a[1], b[6], c2, c3, c1); 63274664626SKris Kennaway mul_add_c(a[2], b[5], c2, c3, c1); 63374664626SKris Kennaway mul_add_c(a[3], b[4], c2, c3, c1); 63474664626SKris Kennaway mul_add_c(a[4], b[3], c2, c3, c1); 63574664626SKris Kennaway mul_add_c(a[5], b[2], c2, c3, c1); 63674664626SKris Kennaway mul_add_c(a[6], b[1], c2, c3, c1); 63774664626SKris Kennaway mul_add_c(a[7], b[0], c2, c3, c1); 63874664626SKris Kennaway r[7] = c2; 63974664626SKris Kennaway c2 = 0; 64074664626SKris Kennaway mul_add_c(a[7], b[1], c3, c1, c2); 64174664626SKris Kennaway mul_add_c(a[6], b[2], c3, c1, c2); 64274664626SKris Kennaway mul_add_c(a[5], b[3], c3, c1, c2); 64374664626SKris Kennaway mul_add_c(a[4], b[4], c3, c1, c2); 64474664626SKris Kennaway mul_add_c(a[3], b[5], c3, c1, c2); 64574664626SKris Kennaway mul_add_c(a[2], b[6], c3, c1, c2); 64674664626SKris Kennaway mul_add_c(a[1], b[7], c3, c1, c2); 64774664626SKris Kennaway r[8] = c3; 64874664626SKris Kennaway c3 = 0; 64974664626SKris Kennaway mul_add_c(a[2], b[7], c1, c2, c3); 65074664626SKris Kennaway mul_add_c(a[3], b[6], c1, c2, c3); 65174664626SKris Kennaway mul_add_c(a[4], b[5], c1, c2, c3); 65274664626SKris Kennaway mul_add_c(a[5], b[4], c1, c2, c3); 65374664626SKris Kennaway mul_add_c(a[6], b[3], c1, c2, c3); 65474664626SKris Kennaway mul_add_c(a[7], b[2], c1, c2, c3); 65574664626SKris Kennaway r[9] = c1; 65674664626SKris Kennaway c1 = 0; 65774664626SKris Kennaway mul_add_c(a[7], b[3], c2, c3, c1); 65874664626SKris Kennaway mul_add_c(a[6], b[4], c2, c3, c1); 65974664626SKris Kennaway mul_add_c(a[5], b[5], c2, c3, c1); 66074664626SKris Kennaway mul_add_c(a[4], b[6], c2, c3, c1); 66174664626SKris Kennaway mul_add_c(a[3], b[7], c2, c3, c1); 66274664626SKris Kennaway r[10] = c2; 66374664626SKris 
Kennaway c2 = 0; 66474664626SKris Kennaway mul_add_c(a[4], b[7], c3, c1, c2); 66574664626SKris Kennaway mul_add_c(a[5], b[6], c3, c1, c2); 66674664626SKris Kennaway mul_add_c(a[6], b[5], c3, c1, c2); 66774664626SKris Kennaway mul_add_c(a[7], b[4], c3, c1, c2); 66874664626SKris Kennaway r[11] = c3; 66974664626SKris Kennaway c3 = 0; 67074664626SKris Kennaway mul_add_c(a[7], b[5], c1, c2, c3); 67174664626SKris Kennaway mul_add_c(a[6], b[6], c1, c2, c3); 67274664626SKris Kennaway mul_add_c(a[5], b[7], c1, c2, c3); 67374664626SKris Kennaway r[12] = c1; 67474664626SKris Kennaway c1 = 0; 67574664626SKris Kennaway mul_add_c(a[6], b[7], c2, c3, c1); 67674664626SKris Kennaway mul_add_c(a[7], b[6], c2, c3, c1); 67774664626SKris Kennaway r[13] = c2; 67874664626SKris Kennaway c2 = 0; 67974664626SKris Kennaway mul_add_c(a[7], b[7], c3, c1, c2); 68074664626SKris Kennaway r[14] = c3; 68174664626SKris Kennaway r[15] = c1; 68274664626SKris Kennaway }

/*
 * Column-wise (Comba) product of two 4-word operands.
 *
 * r: output, words r[0]..r[7] are written.
 * a: first operand, words a[0]..a[3] are read.
 * b: second operand, words b[0]..b[3] are read.
 *
 * For each output column k, every a[i] * b[j] with i + j == k is fed to
 * mul_add_c() before r[k] is stored.  c1/c2/c3 rotate roles from column
 * to column: the word just stored is cleared and reused as the top word
 * of the next column.  mul_add_c() is a macro defined earlier in this
 * file; it is presumed to add the double-word product into the
 * three-word accumulator given by its last three arguments.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

/*
 * Column-wise (Comba) square of an 8-word operand.
 *
 * r: output, words r[0]..r[15] are written.
 * a: operand, words a[0]..a[7] are read.
 *
 * Each column k gets one sqr_add_c(a, i, ...) when k == 2 * i and one
 * sqr_add_c2(a, i, j, ...) for every pair i > j with i + j == k.  Both
 * are macros defined earlier in this file; presumably the former adds
 * a[i]^2 and the latter adds 2 * a[i] * a[j] to the three-word
 * accumulator.  The accumulator words rotate exactly as in the
 * multiplication routines.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}

/*
 * Column-wise (Comba) square of a 4-word operand.
 *
 * r: output, words r[0]..r[7] are written.
 * a: operand, words a[0]..a[3] are read.
 *
 * Same column scheme as bn_sqr_comba8(), restricted to indices 0..3.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}

# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
/*
 * This is essentially a reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
 *
 * rp:  output, num words.
 * ap:  first operand, num words.
 * bp:  second operand, num words.
 * np:  modulus, num words.
 * n0p: pointer to the single word multiplied with tp[0] in every
 *      reduction step (presumably -np^-1 mod 2^w, supplied by the
 *      caller - confirm against the Montgomery context set-up).
 * num: operand length in words.
 *
 * Returns 1 once rp has been written, 0 if nothing was computed so that
 * the caller can fall back to its alternative code path.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, ml, *tp, n0;
#   ifdef mul64
    BN_ULONG mh;
#   endif
    volatile BN_ULONG *vp;
    int i = 0, j;

    /*
     * The code below indexes tp[num - 1] and np[num - 1]; with num <= 0
     * that is out of bounds, so report "not performed" instead.
     */
    if (num <= 0)
        return 0;

#   if 0                        /* template for platform-specific
                                 * implementation */
    if (ap == bp)
        return bn_sqr_mont(rp, ap, np, n0p, num);
#   endif
    /* num + 2 scratch words on the stack; vp aliases them for the wipe */
    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    n0 = *n0p;

    /* First pass: tp = ap * bp[0], no accumulation needed yet. */
    c0 = 0;
    ml = bp[0];
#   ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, mh, c0);
#   else
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, c0);
#   endif

    tp[num] = c0;
    tp[num + 1] = 0;
    /*
     * Jump into the loop body with i == 0: the multiply half of the
     * first iteration has just been done above without the additions.
     */
    goto enter;

    for (i = 0; i < num; i++) {
        /* tp += ap * bp[i] */
        c0 = 0;
        ml = bp[i];
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, mh, c0);
#   else
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, c0);
#   endif
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);
 enter:
        /* tp = (tp + np * (tp[0] * n0)) shifted down by one word */
        c1 = tp[0];
        ml = (c1 * n0) & BN_MASK2;
        c0 = 0;
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        mul_add(c1, np[0], ml, mh, c0);
#   else
        mul_add(c1, ml, np[0], c0);
#   endif
        for (j = 1; j < num; j++) {
            c1 = tp[j];
#   ifdef mul64
            mul_add(c1, np[j], ml, mh, c0);
#   else
            mul_add(c1, ml, np[j], c0);
#   endif
            tp[j - 1] = c1 & BN_MASK2;
        }
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num - 1] = c1;
        tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
    }

    /*
     * Final conditional subtraction: try rp = tp - np whenever tp might
     * be >= np, and keep it if there was an overflow word or no borrow.
     */
    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            /* wipe scratch through the volatile alias before returning */
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;
            return 1;
        }
    }
    /* tp was already below np: copy it out and wipe as we go */
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
#  else
/*
 * Return value of 0 indicates that multiplication/convolution was not
 * performed to signal the caller to fall back to alternative/original
 * code-path.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
#  endif                        /* OPENSSL_BN_ASM_MONT */
# endif

#else                           /* !BN_MUL_COMBA */

/* hmm... is it faster just to do a multiply? */
# undef bn_sqr_comba4
# undef bn_sqr_comba8
/*
 * Square of a 4-word operand, delegated to bn_sqr_normal() with an
 * 8-word scratch buffer on the stack.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[8];
    bn_sqr_normal(r, a, 4, t);
}

/*
 * Square of an 8-word operand, delegated to bn_sqr_normal() with a
 * 16-word scratch buffer on the stack.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[16];
    bn_sqr_normal(r, a, 8, t);
}

/*
 * Row-by-row product of two 4-word operands into r[0]..r[7].
 *
 * The first row (a * b[0]) is written with bn_mul_words(); each later
 * row a * b[k] is added in at offset k with bn_mul_add_words().  The
 * carry word returned by every call becomes the next-higher result word.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
    r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
    r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
    r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
}

/*
 * Row-by-row product of two 8-word operands into r[0]..r[15]; same
 * scheme as the 4-word variant above.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
    r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
    r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
    r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
    r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
    r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
    r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
    r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
}

# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
/*
 * Word-by-word multiply-and-reduce built on bn_mul_add_words().
 *
 * rp:  output, num words.
 * ap:  first operand, num words.
 * bp:  second operand, num words.
 * np:  modulus, num words.
 * n0p: pointer to the single word multiplied with tp[0] in every
 *      reduction step (presumably -np^-1 mod 2^w, supplied by the
 *      caller - confirm against the Montgomery context set-up).
 * num: operand length in words.
 *
 * Returns 1 once rp has been written, 0 if nothing was computed so that
 * the caller can fall back to its alternative code path.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, *tp, n0 = *n0p;
    volatile BN_ULONG *vp;
    int i = 0, j;

    /*
     * The final comparison reads tp[num - 1] and np[num - 1]; with
     * num <= 0 that is out of bounds, so report "not performed" instead.
     */
    if (num <= 0)
        return 0;

    /* num + 2 scratch words on the stack; vp aliases them for the wipe */
    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    /* tp[num + 1] is assigned inside the loop before it is ever read */
    for (i = 0; i <= num; i++)
        tp[i] = 0;

    for (i = 0; i < num; i++) {
        /* tp += ap * bp[i] */
        c0 = bn_mul_add_words(tp, ap, num, bp[i]);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);

        /* tp += np * (tp[0] * n0) */
        c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] += (c1 < c0 ? 1 : 0);
        /* drop the lowest word: shift tp down by one */
        for (j = 0; j <= num; j++)
            tp[j] = tp[j + 1];
    }

    /*
     * Final conditional subtraction: try rp = tp - np whenever tp might
     * be >= np, and keep it if there was an overflow word or no borrow.
     */
    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            /* wipe scratch through the volatile alias before returning */
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;
            return 1;
        }
    }
    /* tp was already below np: copy it out and wipe as we go */
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
#  else
/*
 * No implementation in this configuration: always returns 0 so the
 * caller falls back to its alternative code path.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0, int num)
{
    return 0;
}
#  endif                        /* OPENSSL_BN_ASM_MONT */
# endif

#endif                          /* !BN_MUL_COMBA */