/* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
   times as large as bn.  Or more accurately, bn < an < 2 bn.

   Contributed to the GNU project by Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -2, -1, 0, +1, +2, +inf

  <-s-><--n--><--n--><--n-->
   ___ ______ ______ ______
  |a3_|___a2_|___a1_|___a0_|
	|_b2_|___b1_|___b0_|
	<-t--><--n--><--n-->

  v0  =  a0             * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 3  bh <= 2
  vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1 |bh|<= 1
  v2  = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 14 bh <= 6
  vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) #  A(-2)*B(-2)    |ah| <= 9 |bh|<= 4
  vinf=              a3 *         b2  # A(inf)*B(inf)
*/

61 void
mpn_toom43_mul(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn,mp_ptr scratch)62 mpn_toom43_mul (mp_ptr pp,
63 		mp_srcptr ap, mp_size_t an,
64 		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
65 {
66   mp_size_t n, s, t;
67   enum toom6_flags flags;
68   mp_limb_t cy;
69 
70 #define a0  ap
71 #define a1  (ap + n)
72 #define a2  (ap + 2 * n)
73 #define a3  (ap + 3 * n)
74 #define b0  bp
75 #define b1  (bp + n)
76 #define b2  (bp + 2 * n)
77 
78   n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);
79 
80   s = an - 3 * n;
81   t = bn - 2 * n;
82 
83   ASSERT (0 < s && s <= n);
84   ASSERT (0 < t && t <= n);
85 
86   /* This is true whenever an >= 25 or bn >= 19, I think. It
87      guarantees that we can fit 5 values of size n+1 in the product
88      area. */
89   ASSERT (s+t >= 5);
90 
91 #define v0    pp				/* 2n */
92 #define vm1   (scratch)				/* 2n+1 */
93 #define v1    (pp + 2*n)			/* 2n+1 */
94 #define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
95 #define v2    (scratch + 4 * n + 2)		/* 2n+1 */
96 #define vinf  (pp + 5 * n)			/* s+t */
97 #define bs1    pp				/* n+1 */
98 #define bsm1  (scratch + 2 * n + 2)		/* n+1 */
99 #define asm1  (scratch + 3 * n + 3)		/* n+1 */
100 #define asm2  (scratch + 4 * n + 4)		/* n+1 */
101 #define bsm2  (pp + n + 1)			/* n+1 */
102 #define bs2   (pp + 2 * n + 2)			/* n+1 */
103 #define as2   (pp + 3 * n + 3)			/* n+1 */
104 #define as1   (pp + 4 * n + 4)			/* n+1 */
105 
106   /* Total sccratch need is 6 * n + 3 + 1; we allocate one extra
107      limb, because products will overwrite 2n+2 limbs. */
108 
109 #define a0a2  scratch
110 #define b0b2  scratch
111 #define a1a3  asm1
112 #define b1d   bsm1
113 
114   /* Compute as2 and asm2.  */
115   flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3));
116 
117   /* Compute bs2 and bsm2.  */
118   b1d[n] = mpn_lshift (b1d, b1, n, 1);			/*       2b1      */
119   cy  = mpn_lshift (b0b2, b2, t, 2);			/*  4b2           */
120   cy += mpn_add_n (b0b2, b0b2, b0, t);			/*  4b2      + b0 */
121   if (t != n)
122     cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
123   b0b2[n] = cy;
124 
125 #if HAVE_NATIVE_mpn_add_n_sub_n
126   if (mpn_cmp (b0b2, b1d, n+1) < 0)
127     {
128       mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);
129       flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
130     }
131   else
132     {
133       mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);
134     }
135 #else
136   mpn_add_n (bs2, b0b2, b1d, n+1);
137   if (mpn_cmp (b0b2, b1d, n+1) < 0)
138     {
139       mpn_sub_n (bsm2, b1d, b0b2, n+1);
140       flags = (enum toom6_flags) (flags ^ toom6_vm2_neg);
141     }
142   else
143     {
144       mpn_sub_n (bsm2, b0b2, b1d, n+1);
145     }
146 #endif
147 
148   /* Compute as1 and asm1.  */
149   flags = (enum toom6_flags) (flags ^ toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2));
150 
151   /* Compute bs1 and bsm1.  */
152   bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
153 #if HAVE_NATIVE_mpn_add_n_sub_n
154   if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
155     {
156       cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);
157       bs1[n] = cy >> 1;
158       flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
159     }
160   else
161     {
162       cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);
163       bs1[n] = bsm1[n] + (cy >> 1);
164       bsm1[n]-= cy & 1;
165     }
166 #else
167   bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
168   if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
169     {
170       mpn_sub_n (bsm1, b1, bsm1, n);
171       flags = (enum toom6_flags) (flags ^ toom6_vm1_neg);
172     }
173   else
174     {
175       bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
176     }
177 #endif
178 
179   ASSERT (as1[n] <= 3);
180   ASSERT (bs1[n] <= 2);
181   ASSERT (asm1[n] <= 1);
182   ASSERT (bsm1[n] <= 1);
183   ASSERT (as2[n] <=14);
184   ASSERT (bs2[n] <= 6);
185   ASSERT (asm2[n] <= 9);
186   ASSERT (bsm2[n] <= 4);
187 
188   /* vm1, 2n+1 limbs */
189   mpn_mul_n (vm1, asm1, bsm1, n+1);  /* W4 */
190 
191   /* vm2, 2n+1 limbs */
192   mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */
193 
194   /* v2, 2n+1 limbs */
195   mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */
196 
197   /* v1, 2n+1 limbs */
198   mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */
199 
200   /* vinf, s+t limbs */   /* W0 */
201   if (s > t)  mpn_mul (vinf, a3, s, b2, t);
202   else        mpn_mul (vinf, b2, t, a3, s);
203 
204   /* v0, 2n limbs */
205   mpn_mul_n (v0, ap, bp, n);  /* W5 */
206 
207   mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);
208 
209 #undef v0
210 #undef vm1
211 #undef v1
212 #undef vm2
213 #undef v2
214 #undef vinf
215 #undef bs1
216 #undef bs2
217 #undef bsm1
218 #undef bsm2
219 #undef asm1
220 #undef asm2
221 /* #undef as1 */
222 /* #undef as2 */
223 #undef a0a2
224 #undef b0b2
225 #undef a1a3
226 #undef b1d
227 #undef a0
228 #undef a1
229 #undef a2
230 #undef a3
231 #undef b0
232 #undef b1
233 #undef b2
234 }