1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs 2 of their products. 3 4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 5 6 THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS 7 FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED 8 THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 9 10 Copyright 2004, 2005, 2009, 2010 Free Software Foundation, Inc. 11 12 This file is part of the GNU MP Library. 13 14 The GNU MP Library is free software; you can redistribute it and/or modify 15 it under the terms of the GNU Lesser General Public License as published by 16 the Free Software Foundation; either version 3 of the License, or (at your 17 option) any later version. 18 19 The GNU MP Library is distributed in the hope that it will be useful, but 20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 21 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 22 License for more details. 23 24 You should have received a copy of the GNU Lesser General Public License 25 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 26 27 #include "gmp.h" 28 #include "gmp-impl.h" 29 30 31 #ifndef MULLO_BASECASE_THRESHOLD 32 #define MULLO_BASECASE_THRESHOLD 0 /* never use mpn_mul_basecase */ 33 #endif 34 35 #ifndef MULLO_DC_THRESHOLD 36 #define MULLO_DC_THRESHOLD 3*MUL_TOOM22_THRESHOLD 37 #endif 38 39 #ifndef MULLO_MUL_N_THRESHOLD 40 #define MULLO_MUL_N_THRESHOLD MUL_FFT_THRESHOLD 41 #endif 42 43 #if TUNE_PROGRAM_BUILD 44 #define MAYBE_range_basecase 1 45 #define MAYBE_range_toom22 1 46 #else 47 #define MAYBE_range_basecase \ 48 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11)) 49 #define MAYBE_range_toom22 \ 50 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) ) 51 #endif 52 53 /* THINK: The DC strategy uses different constants in different Toom's 54 ranges. Something smoother? 55 */ 56 57 /* 58 Compute the least significant half of the product {xy,n}*{yp,n}, or 59 formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). 60 61 Above the given threshold, the Divide and Conquer strategy is used. 62 The operands are split in two, and a full product plus two mullo 63 are used to obtain the final result. The more natural strategy is to 64 split in two halves, but this is far from optimal when a 65 sub-quadratic multiplication is used. 66 67 Mulders suggests an unbalanced split in favour of the full product, 68 split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2. 69 70 To compute the value of a, we assume that the cost of mullo for a 71 given size ML(n) is a fraction of the cost of a full product with 72 same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2; 73 then we can write: 74 75 ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e 76 77 Given a value for e, want to minimise the value of k, i.e. the 78 function k=(1-a)^e/(1-2*a^e). 79 80 With e=2, the exponent for schoolbook multiplication, the minimum is 81 given by the values a=1-a=1/2. 82 83 With e=log(3)/log(2), the exponent for Karatsuba (aka toom22), 84 Mulders compute (1-a) = 0.694... and we approximate a with 11/36. 85 86 Other possible approximations follow: 87 e=log(5)/log(3) [Toom-3] -> a ~= 9/40 88 e=log(7)/log(4) [Toom-4] -> a ~= 7/39 89 e=log(11)/log(6) [Toom-6] -> a ~= 1/8 90 e=log(15)/log(8) [Toom-8] -> a ~= 1/10 91 92 The values above where obtained with the following trivial commands 93 in the gp-pari shell: 94 95 fun(e,a)=(1-a)^e/(1-2*a^e) 96 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))} 97 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5)) 98 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5)) 99 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5)) 100 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3)) 101 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3)) 102 103 , 104 |\ 105 | \ 106 +----, 107 | | 108 | | 109 | |\ 110 | | \ 111 +----+--` 112 ^ n2 ^n1^ 113 114 For an actual implementation, the assumption that M(n)=n^e is 115 incorrect, as a consequence also the assumption that ML(n)=k*M(n) 116 with a constant k is wrong. 117 118 But theory suggest us two things: 119 - the best the multiplication product is (lower e), the more k 120 approaches 1, and a approaches 0. 121 122 - A value for a smaller than optimal is probably less bad than a 123 bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal 124 value, and k(a)=0.808_ the mul/mullo speed ratio. We get 125 k(a+1/6)=0.929_ but k(a-1/6)=0.865_. 126 */ 127 128 static mp_size_t 129 mpn_mullo_n_itch (mp_size_t n) 130 { 131 return 2*n; 132 } 133 134 /* 135 mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp. 136 It accepts tp == rp. 137 */ 138 static void 139 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp) 140 { 141 mp_size_t n2, n1; 142 ASSERT (n >= 2); 143 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); 144 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); 145 ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n)); 146 147 /* Divide-and-conquer */ 148 149 /* We need fractional approximation of the value 0 < a <= 1/2 150 giving the minimum in the function k=(1-a)^e/(1-2*a^e). 151 */ 152 if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11))) 153 n1 = n >> 1; 154 else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11))) 155 n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */ 156 else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9))) 157 n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */ 158 else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9)) 159 n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */ 160 /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */ 161 else 162 n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */ 163 164 n2 = n - n1; 165 166 /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0, 167 y = y1 2^(n2 GMP_NUMB_BITS) + y0 */ 168 169 /* x0 * y0 */ 170 mpn_mul_n (tp, xp, yp, n2); 171 MPN_COPY (rp, tp, n2); 172 173 /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */ 174 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) 175 mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1); 176 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) 177 mpn_mullo_basecase (tp + n, xp + n2, yp, n1); 178 else 179 mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n); 180 mpn_add_n (rp + n2, tp + n2, tp + n, n1); 181 182 /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */ 183 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) 184 mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1); 185 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) 186 mpn_mullo_basecase (tp + n, xp, yp + n2, n1); 187 else 188 mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n); 189 mpn_add_n (rp + n2, rp + n2, tp + n, n1); 190 } 191 192 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */ 193 #define MUL_BASECASE_ALLOC \ 194 (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT) 195 196 /* FIXME: This function should accept a temporary area; dc_mullow_n 197 accepts a pointer tp, and handle the case tp == rp, do the same here. 198 Maybe recombine the two functions. 199 THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase 200 (typically thanks to mpn_addmul_2) should we unconditionally use 201 mpn_mul_n? 202 */ 203 204 void 205 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) 206 { 207 ASSERT (n >= 1); 208 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); 209 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); 210 211 if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD)) 212 { 213 /* Allocate workspace of fixed size on stack: fast! */ 214 mp_limb_t tp[MUL_BASECASE_ALLOC]; 215 mpn_mul_basecase (tp, xp, n, yp, n); 216 MPN_COPY (rp, tp, n); 217 } 218 else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD)) 219 { 220 mpn_mullo_basecase (rp, xp, yp, n); 221 } 222 else 223 { 224 mp_ptr tp; 225 TMP_DECL; 226 TMP_MARK; 227 tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n)); 228 if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD)) 229 { 230 mpn_dc_mullo_n (rp, xp, yp, n, tp); 231 } 232 else 233 { 234 /* For really large operands, use plain mpn_mul_n but throw away upper n 235 limbs of result. */ 236 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD) 237 mpn_fft_mul (tp, xp, n, yp, n); 238 #else 239 mpn_mul_n (tp, xp, yp, n); 240 #endif 241 MPN_COPY (rp, tp, n); 242 } 243 TMP_FREE; 244 } 245 } 246