1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
2 of their products.
3
4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5
6 THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
7 FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
8 THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10 Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc.
11
12 This file is part of the GNU MP Library.
13
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
16
17 * the GNU Lesser General Public License as published by the Free
18 Software Foundation; either version 3 of the License, or (at your
19 option) any later version.
20
21 or
22
23 * the GNU General Public License as published by the Free Software
24 Foundation; either version 2 of the License, or (at your option) any
25 later version.
26
27 or both in parallel, as here.
28
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
32 for more details.
33
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library. If not,
36 see https://www.gnu.org/licenses/. */
37
38 #include "gmp-impl.h"
39
40
/* MAYBE_range_basecase and MAYBE_range_toom22 gate the first two split
   choices in mpn_dc_mullo_n.  When the thresholds are compile-time
   constants, a 0 here lets the compiler drop a branch that can never be
   taken.  For tune-program and fat-binary builds both are simply 1, so
   both branches are always kept.  */
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_range_basecase 1
#define MAYBE_range_toom22   1
#else
/* Presumably the smallest operand size at which the divide-and-conquer
   code is entered: MULLO_DC_THRESHOLD, or MULLO_BASECASE_THRESHOLD when
   the former is 0 (threshold semantics are defined outside this file).  */
#define MULLO_DC_MIN_SIZE \
  (MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD)
#define MAYBE_range_basecase \
  (MULLO_DC_MIN_SIZE < MUL_TOOM22_THRESHOLD*36/(36-11))
#define MAYBE_range_toom22 \
  (MULLO_DC_MIN_SIZE < MUL_TOOM33_THRESHOLD*36/(36-11))
#endif
50
51 /* THINK: The DC strategy uses different constants in different Toom's
52 ranges. Something smoother?
53 */
54
55 /*
    Compute the least significant half of the product {xp,n}*{yp,n}, or
    formally {rp,n} = {xp,n}*{yp,n} Mod (B^n).
58
59 Above the given threshold, the Divide and Conquer strategy is used.
60 The operands are split in two, and a full product plus two mullo
61 are used to obtain the final result. The more natural strategy is to
62 split in two halves, but this is far from optimal when a
63 sub-quadratic multiplication is used.
64
65 Mulders suggests an unbalanced split in favour of the full product,
66 split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
67
68 To compute the value of a, we assume that the cost of mullo for a
69 given size ML(n) is a fraction of the cost of a full product with
70 same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
71 then we can write:
72
73 ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
74
75 Given a value for e, want to minimise the value of k, i.e. the
76 function k=(1-a)^e/(1-2*a^e).
77
78 With e=2, the exponent for schoolbook multiplication, the minimum is
79 given by the values a=1-a=1/2.
80
    With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
    Mulders computes (1-a) = 0.694... and we approximate a with 11/36.
83
84 Other possible approximations follow:
85 e=log(5)/log(3) [Toom-3] -> a ~= 9/40
86 e=log(7)/log(4) [Toom-4] -> a ~= 7/39
87 e=log(11)/log(6) [Toom-6] -> a ~= 1/8
88 e=log(15)/log(8) [Toom-8] -> a ~= 1/10
89
    The values above were obtained with the following trivial commands
    in the gp-pari shell:
92
93 fun(e,a)=(1-a)^e/(1-2*a^e)
94 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
95 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
96 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
97 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
98 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
99 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
100
    ,
    |\
    | \
    +----,
    |    |
    |    |
    |    |\
    |    | \
    +----+--`
    ^ n2 ^n1^
111
112 For an actual implementation, the assumption that M(n)=n^e is
113 incorrect, as a consequence also the assumption that ML(n)=k*M(n)
114 with a constant k is wrong.
115
    But theory suggests two things:
    - the better the multiplication algorithm is (lower e), the more k
      approaches 1, and a approaches 0.
119
120 - A value for a smaller than optimal is probably less bad than a
121 bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
122 value, and k(a)=0.808_ the mul/mullo speed ratio. We get
123 k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
124 */
125
126 static mp_size_t
mpn_mullo_n_itch(mp_size_t n)127 mpn_mullo_n_itch (mp_size_t n)
128 {
129 return 2*n;
130 }
131
132 /*
133 mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
134 It accepts tp == rp.
135 */
136 static void
mpn_dc_mullo_n(mp_ptr rp,mp_srcptr xp,mp_srcptr yp,mp_size_t n,mp_ptr tp)137 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
138 {
139 mp_size_t n2, n1;
140 ASSERT (n >= 2);
141 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
142 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
143 ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
144
145 /* Divide-and-conquer */
146
147 /* We need fractional approximation of the value 0 < a <= 1/2
148 giving the minimum in the function k=(1-a)^e/(1-2*a^e).
149 */
150 if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
151 n1 = n >> 1;
152 else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
153 n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */
154 else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
155 n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */
156 else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
157 n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */
158 /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */
159 else
160 n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */
161
162 n2 = n - n1;
163
164 /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
165 y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
166
167 /* x0 * y0 */
168 mpn_mul_n (tp, xp, yp, n2);
169 MPN_COPY (rp, tp, n2);
170
171 /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
172 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
173 mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
174 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
175 mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
176 else
177 mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
178 mpn_add_n (rp + n2, tp + n2, tp + n, n1);
179
180 /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
181 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
182 mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
183 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
184 mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
185 else
186 mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
187 mpn_add_n (rp + n2, rp + n2, tp + n, n1);
188 }
189
/* Limb count of the fixed-size stack buffer mpn_mullo_n uses on its
   mpn_mul_basecase path: twice MULLO_BASECASE_THRESHOLD_LIMIT.  Forced to
   1 when that limit is 0, to avoid a zero-sized allocation.  */
#define MUL_BASECASE_ALLOC \
 (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
193
/* FIXME: This function should accept a temporary area; mpn_dc_mullo_n
   accepts a pointer tp, and handles the case tp == rp, do the same here.
   Maybe recombine the two functions.
   THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
   (typically thanks to mpn_addmul_2) should we unconditionally use
   mpn_mul_n?
*/
201
202 void
mpn_mullo_n(mp_ptr rp,mp_srcptr xp,mp_srcptr yp,mp_size_t n)203 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
204 {
205 ASSERT (n >= 1);
206 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
207 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
208
209 if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
210 {
211 /* Allocate workspace of fixed size on stack: fast! */
212 mp_limb_t tp[MUL_BASECASE_ALLOC];
213 mpn_mul_basecase (tp, xp, n, yp, n);
214 MPN_COPY (rp, tp, n);
215 }
216 else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
217 {
218 mpn_mullo_basecase (rp, xp, yp, n);
219 }
220 else
221 {
222 mp_ptr tp;
223 TMP_DECL;
224 TMP_MARK;
225 tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
226 if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
227 {
228 mpn_dc_mullo_n (rp, xp, yp, n, tp);
229 }
230 else
231 {
232 /* For really large operands, use plain mpn_mul_n but throw away upper n
233 limbs of result. */
234 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
235 mpn_fft_mul (tp, xp, n, yp, n);
236 #else
237 mpn_mul_n (tp, xp, yp, n);
238 #endif
239 MPN_COPY (rp, tp, n);
240 }
241 TMP_FREE;
242 }
243 }
244