/* mpn_toom42_mul -- Multiply {ap,an} and {bp,bn} where an is nominally twice
   as large as bn.  Or more accurately, (3/2)bn < an < 4bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
  Things to work on:

  1. Trim allocation.  The allocations for as1, asm1, bs1, and bsm1 could be
     avoided by instead reusing the pp area and the scratch allocation.

  2. Apply optimizations also to mul_toom32.c.
*/

#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s-><--n--><--n--><--n-->
   ___ ______ ______ ______
  |a3_|___a2_|___a1_|___a0_|
               |_b1_|___b0_|
               <-t--><--n-->

  v0  =  a0             * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2+ a3)*(b0+ b1) #   A(1)*B(1)      ah  <= 3  bh <= 1
  vm1 = (a0- a1+ a2- a3)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh  = 0
  v2  = (a0+2a1+4a2+8a3)*(b0+2b1) #   A(2)*B(2)      ah  <= 14 bh <= 2
  vinf=              a3 *     b1  # A(inf)*B(inf)
*/
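
/* Splitting A into four pieces and B into two makes A(x) a degree-3 and
   B(x) a degree-1 polynomial, so their product has degree 4 and hence five
   coefficients; the five evaluation points above determine them, and
   mpn_toom_interpolate_5pts recovers them at the end of this file.  */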

#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_toom22   1
#else
#define MAYBE_mul_toom22						\
  (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD)
#endif

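/* Recursive multiplication of two n-limb blocks: use the basecase below
   MUL_KARATSUBA_THRESHOLD, mpn_toom22_mul otherwise.  MAYBE_mul_toom22
   allows the toom22 branch to be optimized away at compile time when the
   relation between the tuned thresholds makes it unreachable.  */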
#define TOOM22_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (! MAYBE_mul_toom22						\
        || BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))		\
      mpn_mul_basecase (p, a, n, b, n);					\
    else								\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
  } while (0)

void
mpn_toom42_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy, vinf0;
  mp_ptr a0_a2, a1_a3;
  mp_ptr as1, asm1, as2;
  mp_ptr bs1, bsm1, bs2;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define b0  bp
#define b1  (bp + n)

  n = an >= 2 * bn ? (an + 3) >> 2 : (bn + 1) >> 1;

  s = an - 3 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
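
  /* The operands are thus split as A = a3*X^3 + a2*X^2 + a1*X + a0 and
     B = b1*X + b0, where X = beta^n and beta is the limb base; a3 has s
     limbs and b1 has t limbs.  For instance, an = 7 and bn = 4 give
     n = 2, s = 1 and t = 2.  */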

  TMP_MARK;

  as1 = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2 = TMP_SALLOC_LIMBS (n + 1);

  bs1 = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n);
  bs2 = TMP_SALLOC_LIMBS (n + 1);

  a0_a2 = pp;
  a1_a3 = pp + n + 1;

  /* Compute as1 and asm1.  */
  a0_a2[n] = mpn_add_n (a0_a2, a0, a2, n);
  a1_a3[n] = mpn_add (a1_a3, a1, n, a3, s);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, a1_a3, a0_a2, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_addsub_n (as1, asm1, a0_a2, a1_a3, n + 1);
      vm1_neg = 0;
    }
#else
  mpn_add_n (as1, a0_a2, a1_a3, n + 1);
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_sub_n (asm1, a1_a3, a0_a2, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_sub_n (asm1, a0_a2, a1_a3, n + 1);
      vm1_neg = 0;
    }
#endif
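
  /* Here as1 = a0 + a1 + a2 + a3 = A(1) and asm1 = |a0 - a1 + a2 - a3| =
     |A(-1)|; vm1_neg records whether A(-1) is negative.  */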

  /* Compute as2.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a2, a3, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy  = mpn_lshift (as2, a3, s, 1);
  cy += mpn_add_n (as2, a2, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a0, as2, n);
#endif
  as2[n] = cy;
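
  /* as2 now holds A(2) = a0 + 2*a1 + 4*a2 + 8*a3, accumulated by Horner's
     rule as ((2*a3 + a2)*2 + a1)*2 + a0.  */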

  /* Compute bs1 and bsm1.  */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_addsub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
        }
      bs1[n] = cy >> 1;
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          mpn_sub_n (bsm1, b0, b1, n);
        }
#endif
    }
  else
    {
      bs1[n] = mpn_add (bs1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, t);
          MPN_ZERO (bsm1 + t, n - t);
          vm1_neg ^= 1;
        }
      else
        {
          mpn_sub (bsm1, b0, n, b1, t);
        }
    }
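
  /* Likewise bs1 = b0 + b1 = B(1) and bsm1 = |b0 - b1| = |B(-1)|; after
     the sign flips above, vm1_neg gives the sign of A(-1)*B(-1).  */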

  /* Compute bs2, recycling bs1: bs2 = bs1 + b1 = b0 + 2*b1 = B(2).  */
  mpn_add (bs2, bs1, n + 1, b1, t);

  ASSERT (as1[n] <= 3);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 1);
/*ASSERT (bsm1[n] == 0);*/
  ASSERT (as2[n] <= 14);
  ASSERT (bs2[n] <= 2);

#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out	scratch + 4 * n + 4

  /* vm1, 2n+1 limbs */
  TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
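  /* asm1[n] is at most 1 (ASSERTed above); when it is set, fold it in by
     adding bsm1 once at offset n.  */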
  cy = 0;
  if (asm1[n] != 0)
    cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  vm1[2 * n] = cy;

  TOOM22_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
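  /* mpn_mul requires its first operand to be at least as long as the
     second, hence the branch on s > t.  */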
  if (s > t)  mpn_mul (vinf, a3, s, b1, t);
  else        mpn_mul (vinf, b1, t, a3, s);

  vinf0 = vinf[0];				/* v1 overlaps with this */

  /* v1, 2n+1 limbs */
  TOOM22_MUL_N_REC (v1, as1, bs1, n, scratch_out);
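  /* Fold in the high limbs: as1[n] is at most 3 (ASSERTed above), so add
     that multiple of bs1 at offset n; bs1[n] is 0 or 1, so add as1 once
     more when it is set.  */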
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] == 3)
    {
      cy = 3 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(3));
    }
  else
    cy = 0;
  if (bs1[n] != 0)
    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
  v1[2 * n] = cy;

  TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

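  /* Interpolation combines v0, v1, vm1, v2 and vinf into the final
     product of an + bn = 4*n + s + t limbs at pp; vinf0 re-supplies the
     low limb of vinf, which the top limb of v1 overwrote above.  */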
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, 1^vm1_neg, vinf0, scratch + 4 * n + 4);

  TMP_FREE;
}