xref: /dragonfly/contrib/gmp/mpn/generic/mullo_n.c (revision cd1c6085)
1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
2    of their products.
3 
4    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5 
6    THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
7    FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
8    THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 
10 Copyright 2004, 2005, 2009, 2010 Free Software Foundation, Inc.
11 
12 This file is part of the GNU MP Library.
13 
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of the GNU Lesser General Public License as published by
16 the Free Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
18 
19 The GNU MP Library is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
22 License for more details.
23 
24 You should have received a copy of the GNU Lesser General Public License
25 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
26 
27 #include "gmp.h"
28 #include "gmp-impl.h"
29 
30 
31 #ifndef MULLO_BASECASE_THRESHOLD
32 #define MULLO_BASECASE_THRESHOLD 0	/* never use mpn_mul_basecase */
33 #endif
34 
35 #ifndef MULLO_DC_THRESHOLD
36 #define MULLO_DC_THRESHOLD 3*MUL_TOOM22_THRESHOLD
37 #endif
38 
39 #ifndef MULLO_MUL_N_THRESHOLD
40 #define MULLO_MUL_N_THRESHOLD MUL_FFT_THRESHOLD
41 #endif
42 
43 #if TUNE_PROGRAM_BUILD
44 #define MAYBE_range_basecase 1
45 #define MAYBE_range_toom22   1
46 #else
47 #define MAYBE_range_basecase                                           \
48   ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
49 #define MAYBE_range_toom22                                             \
50   ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
51 #endif
52 
53 /*  THINK: The DC strategy uses different constants in different Toom's
54 	 ranges. Something smoother?
55 */
56 
57 /*
58   Compute the least significant half of the product {xy,n}*{yp,n}, or
59   formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
60 
61   Above the given threshold, the Divide and Conquer strategy is used.
62   The operands are split in two, and a full product plus two mullo
63   are used to obtain the final result. The more natural strategy is to
64   split in two halves, but this is far from optimal when a
65   sub-quadratic multiplication is used.
66 
67   Mulders suggests an unbalanced split in favour of the full product,
68   split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
69 
70   To compute the value of a, we assume that the cost of mullo for a
71   given size ML(n) is a fraction of the cost of a full product with
72   same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
73   then we can write:
74 
75   ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
76 
77   Given a value for e, want to minimise the value of k, i.e. the
78   function k=(1-a)^e/(1-2*a^e).
79 
80   With e=2, the exponent for schoolbook multiplication, the minimum is
81   given by the values a=1-a=1/2.
82 
83   With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
84   Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
85 
86   Other possible approximations follow:
87   e=log(5)/log(3) [Toom-3] -> a ~= 9/40
88   e=log(7)/log(4) [Toom-4] -> a ~= 7/39
89   e=log(11)/log(6) [Toom-6] -> a ~= 1/8
90   e=log(15)/log(8) [Toom-8] -> a ~= 1/10
91 
92   The values above where obtained with the following trivial commands
93   in the gp-pari shell:
94 
95 fun(e,a)=(1-a)^e/(1-2*a^e)
96 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
97 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
98 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
99 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
100 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
101 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
102 
103   ,
104   |\
105   | \
106   +----,
107   |    |
108   |    |
109   |    |\
110   |    | \
111   +----+--`
112   ^ n2 ^n1^
113 
114   For an actual implementation, the assumption that M(n)=n^e is
115   incorrect, as a consequence also the assumption that ML(n)=k*M(n)
116   with a constant k is wrong.
117 
118   But theory suggest us two things:
119   - the best the multiplication product is (lower e), the more k
120     approaches 1, and a approaches 0.
121 
122   - A value for a smaller than optimal is probably less bad than a
123     bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
124     value, and k(a)=0.808_ the mul/mullo speed ratio. We get
125     k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
126 */
127 
128 static mp_size_t
129 mpn_mullo_n_itch (mp_size_t n)
130 {
131   return 2*n;
132 }
133 
134 /*
135     mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
136     It accepts tp == rp.
137 */
138 static void
139 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
140 {
141   mp_size_t n2, n1;
142   ASSERT (n >= 2);
143   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
144   ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
145   ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
146 
147   /* Divide-and-conquer */
148 
149   /* We need fractional approximation of the value 0 < a <= 1/2
150      giving the minimum in the function k=(1-a)^e/(1-2*a^e).
151   */
152   if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
153     n1 = n >> 1;
154   else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
155     n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
156   else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
157     n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
158   else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
159     n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
160   /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
161   else
162     n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
163 
164   n2 = n - n1;
165 
166   /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
167 	      y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
168 
169   /* x0 * y0 */
170   mpn_mul_n (tp, xp, yp, n2);
171   MPN_COPY (rp, tp, n2);
172 
173   /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
174   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
175     mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
176   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
177     mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
178   else
179     mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
180   mpn_add_n (rp + n2, tp + n2, tp + n, n1);
181 
182   /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
183   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
184     mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
185   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
186     mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
187   else
188     mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
189   mpn_add_n (rp + n2, rp + n2, tp + n, n1);
190 }
191 
192 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
193 #define MUL_BASECASE_ALLOC \
194  (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
195 
196 /* FIXME: This function should accept a temporary area; dc_mullow_n
197    accepts a pointer tp, and handle the case tp == rp, do the same here.
198    Maybe recombine the two functions.
199    THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
200 	  (typically thanks to mpn_addmul_2) should we unconditionally use
201 	  mpn_mul_n?
202 */
203 
204 void
205 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
206 {
207   ASSERT (n >= 1);
208   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
209   ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
210 
211   if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
212     {
213       /* Allocate workspace of fixed size on stack: fast! */
214       mp_limb_t tp[MUL_BASECASE_ALLOC];
215       mpn_mul_basecase (tp, xp, n, yp, n);
216       MPN_COPY (rp, tp, n);
217     }
218   else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
219     {
220       mpn_mullo_basecase (rp, xp, yp, n);
221     }
222   else
223     {
224       mp_ptr tp;
225       TMP_DECL;
226       TMP_MARK;
227       tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
228       if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
229 	{
230 	  mpn_dc_mullo_n (rp, xp, yp, n, tp);
231 	}
232       else
233 	{
234 	  /* For really large operands, use plain mpn_mul_n but throw away upper n
235 	     limbs of result.  */
236 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
237 	  mpn_fft_mul (tp, xp, n, yp, n);
238 #else
239 	  mpn_mul_n (tp, xp, yp, n);
240 #endif
241 	  MPN_COPY (rp, tp, n);
242 	}
243       TMP_FREE;
244     }
245 }
246