1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
2 of their products.
3
4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5
6 THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
7 FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
8 THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10 Copyright 2004, 2005, 2009, 2010 Free Software Foundation, Inc.
11
12 This file is part of the GNU MP Library.
13
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of the GNU Lesser General Public License as published by
16 the Free Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
18
19 The GNU MP Library is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
22 License for more details.
23
24 You should have received a copy of the GNU Lesser General Public License
25 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
26
27 #include "gmp.h"
28 #include "gmp-impl.h"
29
30
31 #ifndef MULLO_BASECASE_THRESHOLD
32 #define MULLO_BASECASE_THRESHOLD 0 /* never use mpn_mul_basecase */
33 #endif
34
35 #ifndef MULLO_DC_THRESHOLD
36 #define MULLO_DC_THRESHOLD 3*MUL_TOOM22_THRESHOLD
37 #endif
38
39 #ifndef MULLO_MUL_N_THRESHOLD
40 #define MULLO_MUL_N_THRESHOLD MUL_FFT_THRESHOLD
41 #endif
42
43 #if TUNE_PROGRAM_BUILD
44 #define MAYBE_range_basecase 1
45 #define MAYBE_range_toom22 1
46 #else
47 #define MAYBE_range_basecase \
48 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
49 #define MAYBE_range_toom22 \
50 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
51 #endif
52
53 /* THINK: The DC strategy uses different constants in different Toom's
54 ranges. Something smoother?
55 */
56
57 /*
58 Compute the least significant half of the product {xy,n}*{yp,n}, or
59 formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
60
61 Above the given threshold, the Divide and Conquer strategy is used.
62 The operands are split in two, and a full product plus two mullo
63 are used to obtain the final result. The more natural strategy is to
64 split in two halves, but this is far from optimal when a
65 sub-quadratic multiplication is used.
66
67 Mulders suggests an unbalanced split in favour of the full product,
68 split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
69
70 To compute the value of a, we assume that the cost of mullo for a
71 given size ML(n) is a fraction of the cost of a full product with
72 same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
73 then we can write:
74
75 ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
76
77 Given a value for e, want to minimise the value of k, i.e. the
78 function k=(1-a)^e/(1-2*a^e).
79
80 With e=2, the exponent for schoolbook multiplication, the minimum is
81 given by the values a=1-a=1/2.
82
83 With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
84 Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
85
86 Other possible approximations follow:
87 e=log(5)/log(3) [Toom-3] -> a ~= 9/40
88 e=log(7)/log(4) [Toom-4] -> a ~= 7/39
89 e=log(11)/log(6) [Toom-6] -> a ~= 1/8
90 e=log(15)/log(8) [Toom-8] -> a ~= 1/10
91
92 The values above where obtained with the following trivial commands
93 in the gp-pari shell:
94
95 fun(e,a)=(1-a)^e/(1-2*a^e)
96 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
97 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
98 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
99 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
100 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
101 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
102
103 ,
104 |\
105 | \
106 +----,
107 | |
108 | |
109 | |\
110 | | \
111 +----+--`
112 ^ n2 ^n1^
113
114 For an actual implementation, the assumption that M(n)=n^e is
115 incorrect, as a consequence also the assumption that ML(n)=k*M(n)
116 with a constant k is wrong.
117
118 But theory suggest us two things:
119 - the best the multiplication product is (lower e), the more k
120 approaches 1, and a approaches 0.
121
122 - A value for a smaller than optimal is probably less bad than a
123 bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
124 value, and k(a)=0.808_ the mul/mullo speed ratio. We get
125 k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
126 */
127
128 static mp_size_t
mpn_mullo_n_itch(mp_size_t n)129 mpn_mullo_n_itch (mp_size_t n)
130 {
131 return 2*n;
132 }
133
134 /*
135 mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
136 It accepts tp == rp.
137 */
138 static void
mpn_dc_mullo_n(mp_ptr rp,mp_srcptr xp,mp_srcptr yp,mp_size_t n,mp_ptr tp)139 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
140 {
141 mp_size_t n2, n1;
142 ASSERT (n >= 2);
143 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
144 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
145 ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
146
147 /* Divide-and-conquer */
148
149 /* We need fractional approximation of the value 0 < a <= 1/2
150 giving the minimum in the function k=(1-a)^e/(1-2*a^e).
151 */
152 if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
153 n1 = n >> 1;
154 else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
155 n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */
156 else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
157 n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */
158 else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
159 n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */
160 /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */
161 else
162 n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */
163
164 n2 = n - n1;
165
166 /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
167 y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
168
169 /* x0 * y0 */
170 mpn_mul_n (tp, xp, yp, n2);
171 MPN_COPY (rp, tp, n2);
172
173 /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
174 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
175 mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
176 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
177 mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
178 else
179 mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
180 mpn_add_n (rp + n2, tp + n2, tp + n, n1);
181
182 /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
183 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
184 mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
185 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
186 mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
187 else
188 mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
189 mpn_add_n (rp + n2, rp + n2, tp + n, n1);
190 }
191
192 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */
193 #define MUL_BASECASE_ALLOC \
194 (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
195
196 /* FIXME: This function should accept a temporary area; dc_mullow_n
197 accepts a pointer tp, and handle the case tp == rp, do the same here.
198 Maybe recombine the two functions.
199 THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
200 (typically thanks to mpn_addmul_2) should we unconditionally use
201 mpn_mul_n?
202 */
203
204 void
mpn_mullo_n(mp_ptr rp,mp_srcptr xp,mp_srcptr yp,mp_size_t n)205 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
206 {
207 ASSERT (n >= 1);
208 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
209 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
210
211 if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
212 {
213 /* Allocate workspace of fixed size on stack: fast! */
214 mp_limb_t tp[MUL_BASECASE_ALLOC];
215 mpn_mul_basecase (tp, xp, n, yp, n);
216 MPN_COPY (rp, tp, n);
217 }
218 else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
219 {
220 mpn_mullo_basecase (rp, xp, yp, n);
221 }
222 else
223 {
224 mp_ptr tp;
225 TMP_DECL;
226 TMP_MARK;
227 tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
228 if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
229 {
230 mpn_dc_mullo_n (rp, xp, yp, n, tp);
231 }
232 else
233 {
234 /* For really large operands, use plain mpn_mul_n but throw away upper n
235 limbs of result. */
236 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
237 mpn_fft_mul (tp, xp, n, yp, n);
238 #else
239 mpn_mul_n (tp, xp, yp, n);
240 #endif
241 MPN_COPY (rp, tp, n);
242 }
243 TMP_FREE;
244 }
245 }
246