/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2010, 2012, 2015 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2   bh  <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  |bh| <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6   bh  <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
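
/* The five values determine C(x) = A(x)*B(x) = c4*x^4 + c3*x^3 + c2*x^2
   + c1*x + c0, which evaluated at x = B^n (B the limb base) is the
   product.  As a worked sketch of the recovery step (the real sequence
   lives in mpn_toom_interpolate_5pts), one standard inversion is:

     c0 = v0                   c4 = vinf
     t  = (v2 - vm1)/3         # = c1 + c2 + 3*c3 + 5*c4
     u  = (v1 - vm1)/2         # = c1 + c3
     w  = v1 - v0              # = c1 + c2 + c3 + c4
     c3 = (t - w)/2 - 2*vinf
     c1 = u - c3
     c2 = w - u - c4
*/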

#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33   1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif
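
/* Rationale for the pruning above: the recursive products below are on
   n = ceil(an/3) (or n+1) limbs, and in the normal mpn_mul call chain this
   function is reached only for an >= MUL_TOOM33_THRESHOLD.  So when
   MUL_TOOM33_THRESHOLD >= 3 * MUL_TOOM22_THRESHOLD, n never drops below
   MUL_TOOM22_THRESHOLD and the basecase branch of TOOM33_MUL_N_REC (below)
   is statically dead; the toom33 branch is dead by the analogous argument
   with MUL_TOOM44_THRESHOLD.  */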

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point. We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)

void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;
  mp_ptr bs1, bsm1, bs2;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;
  t = bn - 2 * n;

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
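
  /* n = ceil(an/3); s and t are the sizes of the top pieces of A and B.
     Illustrative numbers: an = 100, bn = 90 give n = 34, s = 32, t = 22.
     The precondition bn <= an < (3/2)bn is what makes the ASSERTs on s
     and t hold.  */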

  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;
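
  /* The evaluation operands are staged partly in scratch and partly in the
     not-yet-used product area pp (gp holds n limbs, the rest n+1 each):

       scratch:  gp at 0,  asm1 at 2n+2,  bsm1 at 3n+3,  as1 at 4n+4
       pp:       bs1 at 0, as2 at n+1,    bs2 at 2n+2

     vm1 and v2 (defined further down) later reuse the low part of scratch,
     and scratch_out at 5n+5 is what the recursive calls receive.  */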

  vm1_neg = 0;

  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif
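
  /* asm1 now holds |a0 - a1 + a2| and vm1_neg records the sign; keeping
     magnitude and sign separate lets every mpn operand stay nonnegative.
     The subtract-the-other-way branch is taken only when a0 + a2 produced
     no carry, whence asm1[n] = 0 there.  */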

  /* Compute as2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;
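
  /* All three paths compute as2 = a0 + 2*a1 + 4*a2, with different
     primitives (rsblsh1 computes 2*v - u, addlsh1 computes u + 2*v):

       rsblsh1 path:  as2 = 2*(as1 + a2) - a0
       addlsh1 path:  as2 = a0 + 2*(a1 + 2*a2)   (Horner form)
       generic path:  as2 = 2*(as1 + a2) - a0, via lshift and sub

     The first and last reuse as1 = a0 + a1 + a2 to save an addition.  */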

  /* Compute bs1 and bsm1.  */
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy  = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);
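
  /* These bounds follow from a0, a1, a2 < B^n (B = limb base, similarly
     for b): a0+a1+a2 < 3*B^n, |a0-a1+a2| < 2*B^n, a0+2*a1+4*a2 < 7*B^n,
     so the high limbs are at most 2, 1 and 6 respectively.  */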

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif
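
  /* With SMALLER_RECURSION only the low n limbs of asm1 and bsm1 are
     multiplied; writing asm1 = x + ha*B^n and bsm1 = y + hb*B^n with
     ha, hb in {0, 1}, the missing part is (ha*y + hb*x)*B^n + ha*hb*B^2n,
     which is exactly what the two conditional additions above add in.  */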

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif
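
  /* v1 lives at pp + 2n and overlaps vinf at pp + 4n: the n-limb recursion
     writes v1[2n] over vinf[0], and the (n+1)-limb recursion also clobbers
     vinf[1].  vinf[0] was captured in vinf0 above and is handed separately
     to the interpolation; vinf[1] is saved and restored around the call.  */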

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}