/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
   times as large as bn.  Or more accurately, bn < an < 3bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Improvements by Marco Bodrato and Niels Möller.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +inf

  <-s-><--n--><--n-->
   ___ ______ ______
  |a2_|___a1_|___a0_|
        |_b1_|___b0_|
        <-t--><--n-->

  v0  =  a0         * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1) #   A(1)*B(1)      ah  <= 2  bh <= 1
  vm1 = (a0- a1+ a2)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh = 0
  vinf=          a2 *     b1  # A(inf)*B(inf)
*/
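
/* Writing A(X) = a2 X^2 + a1 X + a0, B(X) = b1 X + b0, and
   A(X)*B(X) = x3 X^3 + x2 X^2 + x1 X + x0 (the x0, ..., x3 used in the
   comments below), the four evaluations determine the coefficients via

     x0      = v0
     x3      = vinf
     x0 + x2 = (v1 + vm1) / 2
     x1 + x3 = (v1 - vm1) / 2 = (x0 + x2) - vm1

   which is the interpolation carried out after the recursive products.  */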

#define TOOM32_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    mpn_mul_n (p, a, b, n);						\
  } while (0)
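
/* The recursive n-limb products below go through TOOM32_MUL_N_REC, which for
   now is just mpn_mul_n; its ws (scratch) argument is unused.  */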

void
mpn_toom32_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy;
  mp_limb_signed_t hi;
  mp_limb_t ap1_hi, bp1_hi;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define b0  bp
#define b1  (bp + n)

  /* Required, to ensure that s + t >= n. */
  ASSERT (bn + 2 <= an && an + 6 <= 3*bn);

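  /* Pick the block size n: n = ceil(an/3) when 2*an >= 3*bn, otherwise
     n = ceil(bn/2), so that a splits into a0, a1 (n limbs each) and a2
     (s limbs), and b into b0 (n limbs) and b1 (t limbs), with 0 < s, t <= n
     as checked below.  */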
  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);

  s = an - 2 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s + t >= n);

  /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */
#define ap1 (pp)                /* n, most significant limb in ap1_hi */
#define bp1 (pp + n)            /* n, most significant bit in bp1_hi */
#define am1 (pp + 2*n)          /* n, most significant bit in hi */
#define bm1 (pp + 3*n)          /* n */
#define v1 (scratch)            /* 2n + 1 */
#define vm1 (pp)                /* 2n + 1 */
#define scratch_out (scratch + 2*n + 1) /* Currently unused. */

  /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */

  /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */

  /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2. */
  ap1_hi = mpn_add (ap1, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1;
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n);
      hi = ap1_hi - (cy & 1);
      ap1_hi += (cy >> 1);
      vm1_neg = 0;
    }
#else
  if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0)
    {
      ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n));
      hi = 0;
      vm1_neg = 1;
    }
  else
    {
      hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n);
      vm1_neg = 0;
    }
  ap1_hi += mpn_add_n (ap1, ap1, a1, n);
#endif
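
  /* Now {ap1, n} with high limb ap1_hi holds a0 + a1 + a2 (ap1_hi <= 2), and
     {am1, n} with high limb hi (0 or 1) holds |a0 - a1 + a2|; vm1_neg is set
     iff a0 - a1 + a2 is negative.  */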

  /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_add_n_sub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n);
        }
      bp1_hi = cy >> 1;
#else
      bp1_hi = mpn_add_n (bp1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n));
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n));
        }
#endif
    }
  else
    {
      /* FIXME: Should still use mpn_add_n_sub_n for the main part. */
      bp1_hi = mpn_add (bp1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t));
          MPN_ZERO (bm1 + t, n - t);
          vm1_neg ^= 1;
        }
      else
        {
          ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t));
        }
    }
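
  /* Now {bp1, n} with high limb bp1_hi holds b0 + b1 (bp1_hi <= 1), and
     {bm1, n} holds |b0 - b1| with no high limb; vm1_neg records the sign of
     the product (a0 - a1 + a2) * (b0 - b1).  */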

  TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out);
  if (ap1_hi == 1)
    {
      cy = bp1_hi + mpn_add_n (v1 + n, v1 + n, bp1, n);
    }
  else if (ap1_hi == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bp1_hi + mpn_addlsh1_n (v1 + n, v1 + n, bp1, n);
#else
      cy = 2 * bp1_hi + mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bp1_hi != 0)
    cy += mpn_add_n (v1 + n, v1 + n, ap1, n);
  v1[2 * n] = cy;
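
  /* {v1, 2*n + 1} now holds A(1)*B(1); the corrections above fold in the
     high limbs ap1_hi and bp1_hi of the two evaluated operands.  */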

  TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out);
  if (hi)
    hi = mpn_add_n (vm1+n, vm1+n, bm1, n);

  vm1[2*n] = hi;
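
  /* {vm1, 2*n + 1} now holds |A(-1)*B(-1)|; vm1_neg gives its sign.  */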

  /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */
  if (vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (v1, v1, vm1, 2*n+1);
#else
      mpn_sub_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (v1, v1, vm1, 2*n+1);
#else
      mpn_add_n (v1, v1, vm1, 2*n+1);
      ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1));
#endif
    }

  /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence

     y = x1 + x3 + (x0 + x2) * B
       = (x0 + x2) * B + (x0 + x2) - vm1.

     y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as
     follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n
     (already in place, except for carry propagation).

     We thus add

   B^3  B^2   B    1
    |    |    |    |
   +-----+----+
 + |  x0 + x2 |
   +----+-----+----+
 +      |  x0 + x2 |
        +----------+
 -      |  vm1     |
 --+----++----+----+-
   | y2  | y1 | y0 |
   +-----+----+----+

  Since we store y0 at the same location as the low half of x0 + x2, we
  need to do the middle sum first. */

  hi = vm1[2*n];
  cy = mpn_add_n (pp + 2*n, v1, v1 + n, n);
  MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]);

  /* FIXME: Can we get rid of this second vm1_neg conditional by
     swapping the location of +1 and -1 values? */
  if (vm1_neg)
    {
      cy = mpn_add_n (v1, v1, vm1, n);
      hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_INCR_U (v1 + n, n+1, hi);
    }
  else
    {
      cy = mpn_sub_n (v1, v1, vm1, n);
      hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy);
      MPN_DECR_U (v1 + n, n+1, hi);
    }

  TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out);
  /* vinf, s+t limbs.  Use mpn_mul for now, to handle unbalanced operands */
  if (s > t)  mpn_mul (pp+3*n, a2, s, b1, t);
  else        mpn_mul (pp+3*n, b1, t, a2, s);
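
  /* The two remaining recursive products are now in place: {pp, 2*n} = v0 =
     a0 * b0 (the low coefficient x0) and {pp + 3*n, s + t} = vinf = a2 * b1
     (the high coefficient x3).  */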

  /* Remaining interpolation.

     y * B + x0 + x3 B^3 - x0 B^2 - x3 B
     = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B
     = y0 B + y1 B^2 + y2 B^3 + Lx0 + H x0 B
       + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2
     = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2
       + (y2 - (H x0 - L x3)) B^3 + H x3 B^4

          B^4       B^3       B^2        B         1
 |         |         |         |         |         |
   +-------+                   +---------+---------+
   |  Hx3  |                   | Hx0-Lx3 |    Lx0  |
   +------+----------+---------+---------+---------+
          |    y2    |  y1     |   y0    |
          ++---------+---------+---------+
          -| Hx0-Lx3 | - Lx0   |
           +---------+---------+
                      | - Hx3  |
                      +--------+

    We must take into account the carry from Hx0 - Lx3.
  */

  cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n);
  hi = scratch[2*n] + cy;

  cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy);
  hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy);

  hi += mpn_add (pp + n, pp + n, 3*n, scratch, n);
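
  /* What remains is the "- Hx3" term in the B^2 column and the signed
     correction hi; Hx3 is the s + t - n high limbs of vinf, at pp + 4*n.  */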

  /* FIXME: Is support for s + t == n needed? */
  if (LIKELY (s + t > n))
    {
      hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n);

      if (hi < 0)
        MPN_DECR_U (pp + 4*n, s+t-n, -hi);
      else
        MPN_INCR_U (pp + 4*n, s+t-n, hi);
    }
  else
    ASSERT (hi == 0);
}
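
/* A sketch of typical use, assuming the mpn_toom32_mul_itch helper and the
   TMP allocation macros from gmp-impl.h: the caller supplies a product area
   pp of an + bn limbs and scratch of mpn_toom32_mul_itch (an, bn) limbs,
   along the lines of

     mp_ptr scratch;
     TMP_DECL;

     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_toom32_mul_itch (an, bn));
     mpn_toom32_mul (pp, ap, an, bp, bn, scratch);
     TMP_FREE;

   with an and bn in the range enforced by the ASSERTs above.  */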