1 /* mpn_toom42_mul -- Multiply {ap,an} and {bp,bn} where an is nominally twice
2    as large as bn.  Or more accurately, (3/2)bn < an < 4bn.
3 
4    Contributed to the GNU project by Torbjorn Granlund.
5    Additional improvements by Marco Bodrato.
6 
7    The idea of applying toom to unbalanced multiplication is due to Marco
8    Bodrato and Alberto Zanoni.
9 
10    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
11    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
12    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
13 
14 Copyright 2006-2008, 2012 Free Software Foundation, Inc.
15 
16 This file is part of the GNU MP Library.
17 
18 The GNU MP Library is free software; you can redistribute it and/or modify
19 it under the terms of either:
20 
21   * the GNU Lesser General Public License as published by the Free
22     Software Foundation; either version 3 of the License, or (at your
23     option) any later version.
24 
25 or
26 
27   * the GNU General Public License as published by the Free Software
28     Foundation; either version 2 of the License, or (at your option) any
29     later version.
30 
31 or both in parallel, as here.
32 
33 The GNU MP Library is distributed in the hope that it will be useful, but
34 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
35 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
36 for more details.
37 
38 You should have received copies of the GNU General Public License and the
39 GNU Lesser General Public License along with the GNU MP Library.  If not,
40 see https://www.gnu.org/licenses/.  */
41 
42 
43 #include "gmp.h"
44 #include "gmp-impl.h"
45 
46 /* Evaluate in: -1, 0, +1, +2, +inf
47 
48   <-s-><--n--><--n--><--n-->
49    ___ ______ ______ ______
50   |a3_|___a2_|___a1_|___a0_|
51 	       |_b1_|___b0_|
52 	       <-t--><--n-->
53 
54   v0  =  a0             * b0      #   A(0)*B(0)
55   v1  = (a0+ a1+ a2+ a3)*(b0+ b1) #   A(1)*B(1)      ah  <= 3  bh <= 1
56   vm1 = (a0- a1+ a2- a3)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 1  bh  = 0
57   v2  = (a0+2a1+4a2+8a3)*(b0+2b1) #   A(2)*B(2)      ah  <= 14 bh <= 2
58   vinf=              a3 *     b1  # A(inf)*B(inf)
59 */
60 
/* Recursive n x n limb multiplication used for the point-wise products.
   Kept as a macro hook so a threshold-based dispatch could be substituted
   later; the scratch argument `ws` is accepted but currently unused.  */
#define TOOM42_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    mpn_mul_n (p, a, b, n);						\
  } while (0)
65 
66 void
mpn_toom42_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)67 mpn_toom42_mul (mp_ptr pp,
68 		mp_srcptr ap, mp_size_t an,
69 		mp_srcptr bp, mp_size_t bn,
70 		mp_ptr scratch)
71 {
72   mp_size_t n, s, t;
73   int vm1_neg;
74   mp_limb_t cy, vinf0;
75   mp_ptr a0_a2;
76   mp_ptr as1, asm1, as2;
77   mp_ptr bs1, bsm1, bs2;
78   TMP_DECL;
79 
80 #define a0  ap
81 #define a1  (ap + n)
82 #define a2  (ap + 2*n)
83 #define a3  (ap + 3*n)
84 #define b0  bp
85 #define b1  (bp + n)
86 
87   n = an >= 2 * bn ? (an + 3) >> 2 : (bn + 1) >> 1;
88 
89   s = an - 3 * n;
90   t = bn - n;
91 
92   ASSERT (0 < s && s <= n);
93   ASSERT (0 < t && t <= n);
94 
95   TMP_MARK;
96 
97   as1 = TMP_SALLOC_LIMBS (n + 1);
98   asm1 = TMP_SALLOC_LIMBS (n + 1);
99   as2 = TMP_SALLOC_LIMBS (n + 1);
100 
101   bs1 = TMP_SALLOC_LIMBS (n + 1);
102   bsm1 = TMP_SALLOC_LIMBS (n);
103   bs2 = TMP_SALLOC_LIMBS (n + 1);
104 
105   a0_a2 = pp;
106 
107   /* Compute as1 and asm1.  */
108   vm1_neg = mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0_a2) & 1;
109 
110   /* Compute as2.  */
111 #if HAVE_NATIVE_mpn_addlsh1_n
112   cy  = mpn_addlsh1_n (as2, a2, a3, s);
113   if (s != n)
114     cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
115   cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
116   cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
117 #else
118   cy  = mpn_lshift (as2, a3, s, 1);
119   cy += mpn_add_n (as2, a2, as2, s);
120   if (s != n)
121     cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
122   cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
123   cy += mpn_add_n (as2, a1, as2, n);
124   cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
125   cy += mpn_add_n (as2, a0, as2, n);
126 #endif
127   as2[n] = cy;
128 
129   /* Compute bs1 and bsm1.  */
130   if (t == n)
131     {
132 #if HAVE_NATIVE_mpn_add_n_sub_n
133       if (mpn_cmp (b0, b1, n) < 0)
134 	{
135 	  cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n);
136 	  vm1_neg ^= 1;
137 	}
138       else
139 	{
140 	  cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n);
141 	}
142       bs1[n] = cy >> 1;
143 #else
144       bs1[n] = mpn_add_n (bs1, b0, b1, n);
145 
146       if (mpn_cmp (b0, b1, n) < 0)
147 	{
148 	  mpn_sub_n (bsm1, b1, b0, n);
149 	  vm1_neg ^= 1;
150 	}
151       else
152 	{
153 	  mpn_sub_n (bsm1, b0, b1, n);
154 	}
155 #endif
156     }
157   else
158     {
159       bs1[n] = mpn_add (bs1, b0, n, b1, t);
160 
161       if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
162 	{
163 	  mpn_sub_n (bsm1, b1, b0, t);
164 	  MPN_ZERO (bsm1 + t, n - t);
165 	  vm1_neg ^= 1;
166 	}
167       else
168 	{
169 	  mpn_sub (bsm1, b0, n, b1, t);
170 	}
171     }
172 
173   /* Compute bs2, recycling bs1. bs2=bs1+b1  */
174   mpn_add (bs2, bs1, n + 1, b1, t);
175 
176   ASSERT (as1[n] <= 3);
177   ASSERT (bs1[n] <= 1);
178   ASSERT (asm1[n] <= 1);
179 /*ASSERT (bsm1[n] == 0);*/
180   ASSERT (as2[n] <= 14);
181   ASSERT (bs2[n] <= 2);
182 
183 #define v0    pp				/* 2n */
184 #define v1    (pp + 2 * n)			/* 2n+1 */
185 #define vinf  (pp + 4 * n)			/* s+t */
186 #define vm1   scratch				/* 2n+1 */
187 #define v2    (scratch + 2 * n + 1)		/* 2n+2 */
188 #define scratch_out	scratch + 4 * n + 4	/* Currently unused. */
189 
190   /* vm1, 2n+1 limbs */
191   TOOM42_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
192   cy = 0;
193   if (asm1[n] != 0)
194     cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
195   vm1[2 * n] = cy;
196 
197   TOOM42_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */
198 
199   /* vinf, s+t limbs */
200   if (s > t)  mpn_mul (vinf, a3, s, b1, t);
201   else        mpn_mul (vinf, b1, t, a3, s);
202 
203   vinf0 = vinf[0];				/* v1 overlaps with this */
204 
205   /* v1, 2n+1 limbs */
206   TOOM42_MUL_N_REC (v1, as1, bs1, n, scratch_out);
207   if (as1[n] == 1)
208     {
209       cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
210     }
211   else if (as1[n] == 2)
212     {
213 #if HAVE_NATIVE_mpn_addlsh1_n
214       cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
215 #else
216       cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
217 #endif
218     }
219   else if (as1[n] == 3)
220     {
221       cy = 3 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(3));
222     }
223   else
224     cy = 0;
225   if (bs1[n] != 0)
226     cy += mpn_add_n (v1 + n, v1 + n, as1, n);
227   v1[2 * n] = cy;
228 
229   TOOM42_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */
230 
231   mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
232 
233   TMP_FREE;
234 }
235