1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
2    of their products.
3 
4    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5 
6    THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
7    FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
8    THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 
10 Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc.
11 
12 This file is part of the GNU MP Library.
13 
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
16 
17   * the GNU Lesser General Public License as published by the Free
18     Software Foundation; either version 3 of the License, or (at your
19     option) any later version.
20 
21 or
22 
23   * the GNU General Public License as published by the Free Software
24     Foundation; either version 2 of the License, or (at your option) any
25     later version.
26 
27 or both in parallel, as here.
28 
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
32 for more details.
33 
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library.  If not,
36 see https://www.gnu.org/licenses/.  */
37 
38 #include "gmp.h"
39 #include "gmp-impl.h"
40 
41 
42 #ifndef MULLO_BASECASE_THRESHOLD
43 #define MULLO_BASECASE_THRESHOLD 0	/* never use mpn_mul_basecase */
44 #endif
45 
46 #ifndef MULLO_DC_THRESHOLD
47 #define MULLO_DC_THRESHOLD 3*MUL_TOOM22_THRESHOLD
48 #endif
49 
50 #ifndef MULLO_MUL_N_THRESHOLD
51 #define MULLO_MUL_N_THRESHOLD MUL_FFT_THRESHOLD
52 #endif
53 
54 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
55 #define MAYBE_range_basecase 1
56 #define MAYBE_range_toom22   1
57 #else
58 #define MAYBE_range_basecase                                           \
59   ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
60 #define MAYBE_range_toom22                                             \
61   ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
62 #endif
63 
64 /*  THINK: The DC strategy uses different constants in different Toom's
65 	 ranges. Something smoother?
66 */
67 
68 /*
69   Compute the least significant half of the product {xy,n}*{yp,n}, or
70   formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
71 
72   Above the given threshold, the Divide and Conquer strategy is used.
73   The operands are split in two, and a full product plus two mullo
74   are used to obtain the final result. The more natural strategy is to
75   split in two halves, but this is far from optimal when a
76   sub-quadratic multiplication is used.
77 
78   Mulders suggests an unbalanced split in favour of the full product,
79   split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
80 
81   To compute the value of a, we assume that the cost of mullo for a
82   given size ML(n) is a fraction of the cost of a full product with
83   same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
84   then we can write:
85 
86   ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
87 
88   Given a value for e, want to minimise the value of k, i.e. the
89   function k=(1-a)^e/(1-2*a^e).
90 
91   With e=2, the exponent for schoolbook multiplication, the minimum is
92   given by the values a=1-a=1/2.
93 
94   With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
95   Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
96 
97   Other possible approximations follow:
98   e=log(5)/log(3) [Toom-3] -> a ~= 9/40
99   e=log(7)/log(4) [Toom-4] -> a ~= 7/39
100   e=log(11)/log(6) [Toom-6] -> a ~= 1/8
101   e=log(15)/log(8) [Toom-8] -> a ~= 1/10
102 
103   The values above where obtained with the following trivial commands
104   in the gp-pari shell:
105 
106 fun(e,a)=(1-a)^e/(1-2*a^e)
107 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
108 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
109 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
110 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
111 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
112 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
113 
114   ,
115   |\
116   | \
117   +----,
118   |    |
119   |    |
120   |    |\
121   |    | \
122   +----+--`
123   ^ n2 ^n1^
124 
125   For an actual implementation, the assumption that M(n)=n^e is
126   incorrect, as a consequence also the assumption that ML(n)=k*M(n)
127   with a constant k is wrong.
128 
129   But theory suggest us two things:
130   - the best the multiplication product is (lower e), the more k
131     approaches 1, and a approaches 0.
132 
133   - A value for a smaller than optimal is probably less bad than a
134     bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
135     value, and k(a)=0.808_ the mul/mullo speed ratio. We get
136     k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
137 */
138 
139 static mp_size_t
mpn_mullo_n_itch(mp_size_t n)140 mpn_mullo_n_itch (mp_size_t n)
141 {
142   return 2*n;
143 }
144 
145 /*
146     mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
147     It accepts tp == rp.
148 */
149 static void
mpn_dc_mullo_n(mp_ptr rp,mp_srcptr xp,mp_srcptr yp,mp_size_t n,mp_ptr tp)150 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
151 {
152   mp_size_t n2, n1;
153   ASSERT (n >= 2);
154   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
155   ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
156   ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
157 
158   /* Divide-and-conquer */
159 
160   /* We need fractional approximation of the value 0 < a <= 1/2
161      giving the minimum in the function k=(1-a)^e/(1-2*a^e).
162   */
163   if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
164     n1 = n >> 1;
165   else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
166     n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
167   else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
168     n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
169   else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
170     n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
171   /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
172   else
173     n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
174 
175   n2 = n - n1;
176 
177   /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
178 	      y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
179 
180   /* x0 * y0 */
181   mpn_mul_n (tp, xp, yp, n2);
182   MPN_COPY (rp, tp, n2);
183 
184   /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
185   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
186     mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
187   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
188     mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
189   else
190     mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
191   mpn_add_n (rp + n2, tp + n2, tp + n, n1);
192 
193   /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
194   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
195     mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
196   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
197     mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
198   else
199     mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
200   mpn_add_n (rp + n2, rp + n2, tp + n, n1);
201 }
202 
203 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
204 #define MUL_BASECASE_ALLOC \
205  (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
206 
207 /* FIXME: This function should accept a temporary area; dc_mullow_n
208    accepts a pointer tp, and handle the case tp == rp, do the same here.
209    Maybe recombine the two functions.
210    THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
211 	  (typically thanks to mpn_addmul_2) should we unconditionally use
212 	  mpn_mul_n?
213 */
214 
215 void
mpn_mullo_n(mp_ptr rp,mp_srcptr xp,mp_srcptr yp,mp_size_t n)216 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
217 {
218   ASSERT (n >= 1);
219   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
220   ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
221 
222   if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
223     {
224       /* Allocate workspace of fixed size on stack: fast! */
225       mp_limb_t tp[MUL_BASECASE_ALLOC];
226       mpn_mul_basecase (tp, xp, n, yp, n);
227       MPN_COPY (rp, tp, n);
228     }
229   else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
230     {
231       mpn_mullo_basecase (rp, xp, yp, n);
232     }
233   else
234     {
235       mp_ptr tp;
236       TMP_DECL;
237       TMP_MARK;
238       tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
239       if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
240 	{
241 	  mpn_dc_mullo_n (rp, xp, yp, n, tp);
242 	}
243       else
244 	{
245 	  /* For really large operands, use plain mpn_mul_n but throw away upper n
246 	     limbs of result.  */
247 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
248 	  mpn_fft_mul (tp, xp, n, yp, n);
249 #else
250 	  mpn_mul_n (tp, xp, yp, n);
251 #endif
252 	  MPN_COPY (rp, tp, n);
253 	}
254       TMP_FREE;
255     }
256 }
257