xref: /dragonfly/contrib/gmp/mpn/generic/toom33_mul.c (revision fcf53d9b)
1 /* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
2    size.  Or more accurately, bn <= an < (3/2)bn.
3 
4    Contributed to the GNU project by Torbjorn Granlund.
5 
6    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
7    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
8    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 
10 Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
11 
12 This file is part of the GNU MP Library.
13 
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of the GNU Lesser General Public License as published by
16 the Free Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
18 
19 The GNU MP Library is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
22 License for more details.
23 
24 You should have received a copy of the GNU Lesser General Public License
25 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
26 
27 
28 /*
29   Things to work on:
30 
31   1. Trim allocation.  The allocations for as1, asm1, bs1, and bsm1 could be
32      avoided by instead reusing the pp area and the scratch area.
33   2. Use new toom functions for the recursive calls.
34 */
35 
36 #include "gmp.h"
37 #include "gmp-impl.h"
38 
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t--><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2   bh  <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  |bh| <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6   bh  <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
53 
54 #if TUNE_PROGRAM_BUILD
55 #define MAYBE_mul_basecase 1
56 #define MAYBE_mul_toom33   1
57 #else
58 #define MAYBE_mul_basecase						\
59   (MUL_TOOM33_THRESHOLD < 3 * MUL_KARATSUBA_THRESHOLD)
60 #define MAYBE_mul_toom33						\
61   (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
62 #endif
63 
64 #define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
65   do {									\
66     if (MAYBE_mul_basecase						\
67 	&& BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))		\
68       mpn_mul_basecase (p, a, n, b, n);					\
69     else if (! MAYBE_mul_toom33						\
70 	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
71       mpn_kara_mul_n (p, a, b, n, ws);					\
72     else								\
73       mpn_toom3_mul_n (p, a, b, n, ws);					\
74   } while (0)
75 
76 void
77 mpn_toom33_mul (mp_ptr pp,
78 		mp_srcptr ap, mp_size_t an,
79 		mp_srcptr bp, mp_size_t bn,
80 		mp_ptr scratch)
81 {
82   mp_size_t n, s, t;
83   int vm1_neg;
84   mp_limb_t cy, vinf0;
85   mp_ptr gp;
86   mp_ptr as1, asm1, as2;
87   mp_ptr bs1, bsm1, bs2;
88   TMP_DECL;
89 
90 #define a0  ap
91 #define a1  (ap + n)
92 #define a2  (ap + 2*n)
93 #define b0  bp
94 #define b1  (bp + n)
95 #define b2  (bp + 2*n)
96 
97   n = (an + 2) / (size_t) 3;
98 
99   s = an - 2 * n;
100   t = bn - 2 * n;
101 
102   ASSERT (an >= bn);
103 
104   ASSERT (0 < s && s <= n);
105   ASSERT (0 < t && t <= n);
106 
107   TMP_MARK;
108 
109   as1 = TMP_SALLOC_LIMBS (n + 1);
110   asm1 = TMP_SALLOC_LIMBS (n + 1);
111   as2 = TMP_SALLOC_LIMBS (n + 1);
112 
113   bs1 = TMP_SALLOC_LIMBS (n + 1);
114   bsm1 = TMP_SALLOC_LIMBS (n + 1);
115   bs2 = TMP_SALLOC_LIMBS (n + 1);
116 
117   gp = pp;
118 
119   vm1_neg = 0;
120 
121   /* Compute as1 and asm1.  */
122   cy = mpn_add (gp, a0, n, a2, s);
123 #if HAVE_NATIVE_mpn_addsub_n
124   if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
125     {
126       cy = mpn_addsub_n (as1, asm1, a1, gp, n);
127       as1[n] = 0;
128       asm1[n] = 0;
129       vm1_neg = 1;
130     }
131   else
132     {
133       cy2 = mpn_addsub_n (as1, asm1, gp, a1, n);
134       as1[n] = cy + (cy2 >> 1);
135       asm1[n] = cy - (cy & 1);
136     }
137 #else
138   as1[n] = cy + mpn_add_n (as1, gp, a1, n);
139   if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
140     {
141       mpn_sub_n (asm1, a1, gp, n);
142       asm1[n] = 0;
143       vm1_neg = 1;
144     }
145   else
146     {
147       cy -= mpn_sub_n (asm1, gp, a1, n);
148       asm1[n] = cy;
149     }
150 #endif
151 
152   /* Compute as2.  */
153 #if HAVE_NATIVE_mpn_addlsh1_n
154   cy  = mpn_addlsh1_n (as2, a1, a2, s);
155   if (s != n)
156     cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
157   cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
158 #else
159   cy  = mpn_lshift (as2, a2, s, 1);
160   cy += mpn_add_n (as2, a1, as2, s);
161   if (s != n)
162     cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
163   cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
164   cy += mpn_add_n (as2, a0, as2, n);
165 #endif
166   as2[n] = cy;
167 
168   /* Compute bs1 and bsm1.  */
169   cy = mpn_add (gp, b0, n, b2, t);
170 #if HAVE_NATIVE_mpn_addsub_n
171   if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
172     {
173       cy = mpn_addsub_n (bs1, bsm1, b1, gp, n);
174       bs1[n] = 0;
175       bsm1[n] = 0;
176       vm1_neg ^= 1;
177     }
178   else
179     {
180       cy2 = mpn_addsub_n (bs1, bsm1, gp, b1, n);
181       bs1[n] = cy + (cy2 >> 1);
182       bsm1[n] = cy - (cy & 1);
183     }
184 #else
185   bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
186   if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
187     {
188       mpn_sub_n (bsm1, b1, gp, n);
189       bsm1[n] = 0;
190       vm1_neg ^= 1;
191     }
192   else
193     {
194       cy -= mpn_sub_n (bsm1, gp, b1, n);
195       bsm1[n] = cy;
196     }
197 #endif
198 
199   /* Compute bs2.  */
200 #if HAVE_NATIVE_mpn_addlsh1_n
201   cy  = mpn_addlsh1_n (bs2, b1, b2, t);
202   if (t != n)
203     cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
204   cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
205 #else
206   cy  = mpn_lshift (bs2, b2, t, 1);
207   cy += mpn_add_n (bs2, b1, bs2, t);
208   if (t != n)
209     cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
210   cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
211   cy += mpn_add_n (bs2, b0, bs2, n);
212 #endif
213   bs2[n] = cy;
214 
215   ASSERT (as1[n] <= 2);
216   ASSERT (bs1[n] <= 2);
217   ASSERT (asm1[n] <= 1);
218   ASSERT (bsm1[n] <= 1);
219   ASSERT (as2[n] <= 6);
220   ASSERT (bs2[n] <= 6);
221 
222 #define v0    pp				/* 2n */
223 #define v1    (pp + 2 * n)			/* 2n+1 */
224 #define vinf  (pp + 4 * n)			/* s+t */
225 #define vm1   scratch				/* 2n+1 */
226 #define v2    (scratch + 2 * n + 1)		/* 2n+2 */
227 #define scratch_out  (scratch + 4 * n + 4)
228 
229   /* vm1, 2n+1 limbs */
230 #ifdef SMALLER_RECURSION
231   TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
232   cy = 0;
233   if (asm1[n] != 0)
234     cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
235   if (bsm1[n] != 0)
236     cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
237   vm1[2 * n] = cy;
238 #else
239   TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
240 #endif
241 
242   TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */
243 
244   /* vinf, s+t limbs */
245   if (s > t)  mpn_mul (vinf, a2, s, b2, t);
246   else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
247 
248   vinf0 = vinf[0];				/* v1 overlaps with this */
249 
250 #ifdef SMALLER_RECURSION
251   /* v1, 2n+1 limbs */
252   TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
253   if (as1[n] == 1)
254     {
255       cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
256     }
257   else if (as1[n] != 0)
258     {
259 #if HAVE_NATIVE_mpn_addlsh1_n
260       cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
261 #else
262       cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
263 #endif
264     }
265   else
266     cy = 0;
267   if (bs1[n] == 1)
268     {
269       cy += mpn_add_n (v1 + n, v1 + n, as1, n);
270     }
271   else if (bs1[n] != 0)
272     {
273 #if HAVE_NATIVE_mpn_addlsh1_n
274       cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
275 #else
276       cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
277 #endif
278     }
279   v1[2 * n] = cy;
280 #else
281   cy = vinf[1];
282   TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
283   vinf[1] = cy;
284 #endif
285 
286   TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */
287 
288   mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, 1^vm1_neg, vinf0, scratch_out);
289 
290   TMP_FREE;
291 }
292