/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.
3
4 Contributed to the GNU project by Torbjorn Granlund.
5 Additional improvements by Marco Bodrato.
6
7 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
8 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
9 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10
11 Copyright 2006-2008, 2010, 2012, 2015 Free Software Foundation, Inc.
12
13 This file is part of the GNU MP Library.
14
15 The GNU MP Library is free software; you can redistribute it and/or modify
16 it under the terms of either:
17
18 * the GNU Lesser General Public License as published by the Free
19 Software Foundation; either version 3 of the License, or (at your
20 option) any later version.
21
22 or
23
24 * the GNU General Public License as published by the Free Software
25 Foundation; either version 2 of the License, or (at your option) any
26 later version.
27
28 or both in parallel, as here.
29
30 The GNU MP Library is distributed in the hope that it will be useful, but
31 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
32 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
33 for more details.
34
35 You should have received copies of the GNU General Public License and the
36 GNU Lesser General Public License along with the GNU MP Library. If not,
37 see https://www.gnu.org/licenses/. */
38
39
40 #include "gmp-impl.h"
41
42 /* Evaluate in: -1, 0, +1, +2, +inf
43
44 <-s--><--n--><--n-->
45 ____ ______ ______
46 |_a2_|___a1_|___a0_|
47 |b2_|___b1_|___b0_|
48 <-t-><--n--><--n-->
49
50 v0 = a0 * b0 # A(0)*B(0)
51 v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2
52 vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1
53 v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6
54 vinf= a2 * b2 # A(inf)*B(inf)
55 */
56
/* In a tuning run or a fat binary the thresholds are runtime variables, so
   every recursion choice must stay compiled in.  Otherwise the configured
   compile-time thresholds let unreachable branches be discarded.  */
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33 1
#else
/* The recursive size n is roughly a third of the operand size handled here
   (which is at least MUL_TOOM33_THRESHOLD), so basecase is reachable only
   when MUL_TOOM33_THRESHOLD / 3 can fall below MUL_TOOM22_THRESHOLD.  */
#define MAYBE_mul_basecase \
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
/* Similarly, recursing back into toom33 is possible only when a third of
   the largest size handled here (below MUL_TOOM44_THRESHOLD) can still be
   at or above MUL_TOOM33_THRESHOLD.  */
#define MAYBE_mul_toom33 \
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point. We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

/* Balanced n x n recursive multiplication: dispatch on n to basecase,
   toom22 or toom33, with the MAYBE_* constants allowing dead branches to
   be compiled out.  ws is scratch space for the recursive call.  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws) \
  do { \
    if (MAYBE_mul_basecase \
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \
      mpn_mul_basecase (p, a, n, b, n); \
    else if (! MAYBE_mul_toom33 \
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \
      mpn_toom22_mul (p, a, n, b, n, ws); \
    else \
      mpn_toom33_mul (p, a, n, b, n, ws); \
  } while (0)
85
/* Multiply {ap,an} by {bp,bn} using Toom-3, writing the an+bn limb product
   to {pp,an+bn}.  Requires bn <= an < (3/2)bn (enforced by the ASSERTs
   below).  scratch is caller-supplied workspace; the recursion scratch
   starts at scratch + 5*n + 5, so the caller must provide at least that
   plus the recursion's needs — presumably sized via the matching *_itch
   function, TODO confirm against gmp-impl.h.  */
void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  /* NOTE(review): this local shadows the global fat-binary flag of the
     same name, presumably so dispatch/ASSERT checks keyed on it fold to
     constants inside this function — confirm against fat-build headers.  */
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;		/* part size; limbs in a2 resp. b2 */
  int vm1_neg;			/* nonzero iff A(-1)*B(-1) is negative */
  mp_limb_t cy, vinf0;
  mp_ptr gp;			/* scratch for a0+a2 resp. b0+b2 */
  mp_ptr as1, asm1, as2;	/* A(1), |A(-1)|, A(2); n+1 limbs each */
  mp_ptr bs1, bsm1, bs2;	/* B(1), |B(-1)|, B(2); n+1 limbs each */

  /* Split each operand into three parts of n limbs, except the top parts
     which have s resp. t limbs.  */
#define a0 ap
#define a1 (ap + n)
#define a2 (ap + 2*n)
#define b0 bp
#define b1 (bp + n)
#define b2 (bp + 2*n)

  n = (an + 2) / (size_t) 3;	/* n = ceil(an/3) */

  s = an - 2 * n;
  t = bn - 2 * n;

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* Evaluation points live partly in scratch and partly in the product
     area pp, which is not otherwise used until the recursive products are
     formed; the ordering of operations below depends on this overlap.  */
  as1 = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;

  vm1_neg = 0;

  /* Compute as1 = a0 + a1 + a2 and asm1 = |a0 - a1 + a2|, recording the
     sign of the latter in vm1_neg.  */
  cy = mpn_add (gp, a0, n, a2, s);	/* gp = a0 + a2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0 + a2 < a1: subtract the other way to keep asm1 nonnegative.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;		/* high bit of cy is the addition carry */
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);	/* low bit of cy2 is the borrow */
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = a0 + 2*a1 + 4*a2, here formed as 2*(as1 + a2) - a0.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* as2 = 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a1, a2, s);	/* as2 = a1 + 2*a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);	/* as2 = a0 + 2*as2 */
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 and bsm1, mirroring as1/asm1 above; the sign of the vm1
     product is the XOR of the two operands' signs.  */
  cy = mpn_add (gp, b0, n, b2, t);	/* gp = b0 + b2 */
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = b0 + 2*b1 + 4*b2, as for as2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  /* High limbs are bounded as stated in the evaluation table in the file
     header comment.  */
  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

  /* Layout of the five point products; v0, v1 and vinf go directly into
     the result area pp, vm1 and v2 into scratch.  */
#define v0 pp /* 2n */
#define v1 (pp + 2 * n) /* 2n+1 */
#define vinf (pp + 4 * n) /* s+t */
#define vm1 scratch /* 2n+1 */
#define v2 (scratch + 2 * n + 1) /* 2n+2 */
#define scratch_out (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Multiply just the n-limb parts, then fold in the contributions of the
     (at most 1) high limbs of asm1 and bsm1 by hand.  */
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t) mpn_mul (vinf, a2, s, b2, t);
  else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0]; /* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  /* Fold in as1[n]*bs1 and bs1[n]*as1; the high limbs are at most 2, so a
     plain add or an add of 2x suffices.  */
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* Here v1 is 2n+2 limbs and its top limb overlaps vinf[1] (v1 starts at
     pp + 2n, vinf at pp + 4n), so save and restore that limb around the
     recursive call.  */
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */

  /* Interpolate: combine the five point products into the final result,
     passing along the sign of vm1 and the saved low limb of vinf.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}
316