1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
2 of their products.
3
4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5
6 THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
7 FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
8 THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10 Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc.
11
12 This file is part of the GNU MP Library.
13
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
16
17 * the GNU Lesser General Public License as published by the Free
18 Software Foundation; either version 3 of the License, or (at your
19 option) any later version.
20
21 or
22
23 * the GNU General Public License as published by the Free Software
24 Foundation; either version 2 of the License, or (at your option) any
25 later version.
26
27 or both in parallel, as here.
28
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
32 for more details.
33
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library. If not,
36 see https://www.gnu.org/licenses/. */
37
38 #include "gmp.h"
39 #include "gmp-impl.h"
40
41
42 #ifndef MULLO_BASECASE_THRESHOLD
43 #define MULLO_BASECASE_THRESHOLD 0 /* never use mpn_mul_basecase */
44 #endif
45
46 #ifndef MULLO_DC_THRESHOLD
47 #define MULLO_DC_THRESHOLD 3*MUL_TOOM22_THRESHOLD
48 #endif
49
50 #ifndef MULLO_MUL_N_THRESHOLD
51 #define MULLO_MUL_N_THRESHOLD MUL_FFT_THRESHOLD
52 #endif
53
54 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
55 #define MAYBE_range_basecase 1
56 #define MAYBE_range_toom22 1
57 #else
58 #define MAYBE_range_basecase \
59 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
60 #define MAYBE_range_toom22 \
61 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
62 #endif
63
64 /* THINK: The DC strategy uses different constants in different Toom's
65 ranges. Something smoother?
66 */
67
68 /*
69 Compute the least significant half of the product {xy,n}*{yp,n}, or
70 formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
71
72 Above the given threshold, the Divide and Conquer strategy is used.
73 The operands are split in two, and a full product plus two mullo
74 are used to obtain the final result. The more natural strategy is to
75 split in two halves, but this is far from optimal when a
76 sub-quadratic multiplication is used.
77
78 Mulders suggests an unbalanced split in favour of the full product,
79 split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
80
81 To compute the value of a, we assume that the cost of mullo for a
82 given size ML(n) is a fraction of the cost of a full product with
83 same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
84 then we can write:
85
86 ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
87
88 Given a value for e, want to minimise the value of k, i.e. the
89 function k=(1-a)^e/(1-2*a^e).
90
91 With e=2, the exponent for schoolbook multiplication, the minimum is
92 given by the values a=1-a=1/2.
93
94 With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
95 Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
96
97 Other possible approximations follow:
98 e=log(5)/log(3) [Toom-3] -> a ~= 9/40
99 e=log(7)/log(4) [Toom-4] -> a ~= 7/39
100 e=log(11)/log(6) [Toom-6] -> a ~= 1/8
101 e=log(15)/log(8) [Toom-8] -> a ~= 1/10
102
103 The values above where obtained with the following trivial commands
104 in the gp-pari shell:
105
106 fun(e,a)=(1-a)^e/(1-2*a^e)
107 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
108 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
109 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
110 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
111 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
112 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
113
114 ,
115 |\
116 | \
117 +----,
118 | |
119 | |
120 | |\
121 | | \
122 +----+--`
123 ^ n2 ^n1^
124
125 For an actual implementation, the assumption that M(n)=n^e is
126 incorrect, as a consequence also the assumption that ML(n)=k*M(n)
127 with a constant k is wrong.
128
129 But theory suggest us two things:
130 - the best the multiplication product is (lower e), the more k
131 approaches 1, and a approaches 0.
132
133 - A value for a smaller than optimal is probably less bad than a
134 bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
135 value, and k(a)=0.808_ the mul/mullo speed ratio. We get
136 k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
137 */
138
139 static mp_size_t
mpn_mullo_n_itch(mp_size_t n)140 mpn_mullo_n_itch (mp_size_t n)
141 {
142 return 2*n;
143 }
144
145 /*
146 mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
147 It accepts tp == rp.
148 */
149 static void
mpn_dc_mullo_n(mp_ptr rp,mp_srcptr xp,mp_srcptr yp,mp_size_t n,mp_ptr tp)150 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
151 {
152 mp_size_t n2, n1;
153 ASSERT (n >= 2);
154 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
155 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
156 ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
157
158 /* Divide-and-conquer */
159
160 /* We need fractional approximation of the value 0 < a <= 1/2
161 giving the minimum in the function k=(1-a)^e/(1-2*a^e).
162 */
163 if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
164 n1 = n >> 1;
165 else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
166 n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */
167 else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
168 n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */
169 else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
170 n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */
171 /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */
172 else
173 n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */
174
175 n2 = n - n1;
176
177 /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
178 y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
179
180 /* x0 * y0 */
181 mpn_mul_n (tp, xp, yp, n2);
182 MPN_COPY (rp, tp, n2);
183
184 /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
185 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
186 mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
187 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
188 mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
189 else
190 mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
191 mpn_add_n (rp + n2, tp + n2, tp + n, n1);
192
193 /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
194 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
195 mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
196 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
197 mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
198 else
199 mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
200 mpn_add_n (rp + n2, rp + n2, tp + n, n1);
201 }
202
203 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */
204 #define MUL_BASECASE_ALLOC \
205 (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
206
207 /* FIXME: This function should accept a temporary area; dc_mullow_n
208 accepts a pointer tp, and handle the case tp == rp, do the same here.
209 Maybe recombine the two functions.
210 THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
211 (typically thanks to mpn_addmul_2) should we unconditionally use
212 mpn_mul_n?
213 */
214
215 void
mpn_mullo_n(mp_ptr rp,mp_srcptr xp,mp_srcptr yp,mp_size_t n)216 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
217 {
218 ASSERT (n >= 1);
219 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
220 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
221
222 if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
223 {
224 /* Allocate workspace of fixed size on stack: fast! */
225 mp_limb_t tp[MUL_BASECASE_ALLOC];
226 mpn_mul_basecase (tp, xp, n, yp, n);
227 MPN_COPY (rp, tp, n);
228 }
229 else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
230 {
231 mpn_mullo_basecase (rp, xp, yp, n);
232 }
233 else
234 {
235 mp_ptr tp;
236 TMP_DECL;
237 TMP_MARK;
238 tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
239 if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
240 {
241 mpn_dc_mullo_n (rp, xp, yp, n, tp);
242 }
243 else
244 {
245 /* For really large operands, use plain mpn_mul_n but throw away upper n
246 limbs of result. */
247 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
248 mpn_fft_mul (tp, xp, n, yp, n);
249 #else
250 mpn_mul_n (tp, xp, yp, n);
251 #endif
252 MPN_COPY (rp, tp, n);
253 }
254 TMP_FREE;
255 }
256 }
257