1 /* mpn_toom42_mulmid -- toom42 middle product
2 
3    Contributed by David Harvey.
4 
5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 
9 Copyright 2011 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15 
16   * the GNU Lesser General Public License as published by the Free
17     Software Foundation; either version 3 of the License, or (at your
18     option) any later version.
19 
20 or
21 
22   * the GNU General Public License as published by the Free Software
23     Foundation; either version 2 of the License, or (at your option) any
24     later version.
25 
26 or both in parallel, as here.
27 
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31 for more details.
32 
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library.  If not,
35 see https://www.gnu.org/licenses/.  */
36 
37 
38 #include "gmp-impl.h"
39 
40 
41 
42 /*
43   Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
44 
45   Neither ap nor bp may overlap rp.
46 
47   Must have n >= 4.
48 
49   Amount of scratch space required is given by mpn_toom42_mulmid_itch().
50 
51   FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
52   requirements should be clarified.
53 */
54 void
mpn_toom42_mulmid(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t n,mp_ptr scratch)55 mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
56                    mp_ptr scratch)
57 {
58   mp_limb_t cy, e[12], zh, zl;
59   mp_size_t m;
60   int neg;
61 
62   ASSERT (n >= 4);
63   ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
64   ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
65 
66   ap += n & 1;   /* handle odd row and diagonal later */
67   m = n / 2;
68 
69   /* (e0h:e0l) etc are correction terms, in 2's complement */
70 #define e0l (e[0])
71 #define e0h (e[1])
72 #define e1l (e[2])
73 #define e1h (e[3])
74 #define e2l (e[4])
75 #define e2h (e[5])
76 #define e3l (e[6])
77 #define e3h (e[7])
78 #define e4l (e[8])
79 #define e4h (e[9])
80 #define e5l (e[10])
81 #define e5h (e[11])
82 
83 #define s (scratch + 2)
84 #define t (rp + m + 2)
85 #define p0 rp
86 #define p1 scratch
87 #define p2 (rp + m)
88 #define next_scratch (scratch + 3*m + 1)
89 
90   /*
91             rp                            scratch
92   |---------|-----------|    |---------|---------|----------|
93   0         m         2m+2   0         m         2m        3m+1
94             <----p2---->       <-------------s------------->
95   <----p0----><---t---->     <----p1---->
96   */
97 
98   /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
99   cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
100   cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
101 		       bp + m, bp, m, cy);
102   mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
103 
104   /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
105   if (mpn_cmp (bp + m, bp, m) < 0)
106     {
107       ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
108 				      ap + m - 1, ap + 2*m - 1, m, 0));
109       neg = 1;
110     }
111   else
112     {
113       ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
114 				      ap + m - 1, ap + 2*m - 1, m, 0));
115       neg = 0;
116     }
117 
118   /* recursive middle products. The picture is:
119 
120       b[2m-1]   A   A   A   B   B   B   -   -   -   -   -
121       ...       -   A   A   A   B   B   B   -   -   -   -
122       b[m]      -   -   A   A   A   B   B   B   -   -   -
123       b[m-1]    -   -   -   C   C   C   D   D   D   -   -
124       ...       -   -   -   -   C   C   C   D   D   D   -
125       b[0]      -   -   -   -   -   C   C   C   D   D   D
126                a[0]   ...  a[m]  ...  a[2m]    ...    a[4m-2]
127   */
128 
129   if (m < MULMID_TOOM42_THRESHOLD)
130     {
131       /* A + B */
132       mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
133       /* accumulate high limbs of p0 into e1 */
134       ADDC_LIMB (cy, e1l, e1l, p0[m]);
135       e1h += p0[m + 1] + cy;
136       /* (-1)^neg * (B - C)   (overwrites first m limbs of s) */
137       mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
138       /* C + D   (overwrites t) */
139       mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
140     }
141   else
142     {
143       /* as above, but use toom42 instead */
144       mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
145       ADDC_LIMB (cy, e1l, e1l, p0[m]);
146       e1h += p0[m + 1] + cy;
147       mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
148       mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
149     }
150 
151   /* apply error terms */
152 
153   /* -e0 at rp[0] */
154   SUBC_LIMB (cy, rp[0], rp[0], e0l);
155   SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
156   if (UNLIKELY (cy))
157     {
158       cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
159       SUBC_LIMB (cy, e1l, e1l, cy);
160       e1h -= cy;
161     }
162 
163   /* z = e1 - e2 + high(p0) */
164   SUBC_LIMB (cy, zl, e1l, e2l);
165   zh = e1h - e2h - cy;
166 
167   /* z at rp[m] */
168   ADDC_LIMB (cy, rp[m], rp[m], zl);
169   zh = (zh + cy) & GMP_NUMB_MASK;
170   ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
171   cy -= (zh >> (GMP_NUMB_BITS - 1));
172   if (UNLIKELY (cy))
173     {
174       if (cy == 1)
175 	mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
176       else /* cy == -1 */
177 	mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
178     }
179 
180   /* e3 at rp[2*m] */
181   ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
182   rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
183 
184   /* e4 at p1[0] */
185   ADDC_LIMB (cy, p1[0], p1[0], e4l);
186   ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
187   if (UNLIKELY (cy))
188     mpn_add_1 (p1 + 2, p1 + 2, m, 1);
189 
190   /* -e5 at p1[m] */
191   SUBC_LIMB (cy, p1[m], p1[m], e5l);
192   p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
193 
194   /* adjustment if p1 ends up negative */
195   cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
196 
197   /* add (-1)^neg * (p1 - B^m * p1) to output */
198   if (neg)
199     {
200       mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
201       mpn_add (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
202       mpn_sub_n (rp + m, rp + m, p1, m + 2);            /* B + D */
203     }
204   else
205     {
206       mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
207       mpn_sub (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
208       mpn_add_n (rp + m, rp + m, p1, m + 2);            /* B + D */
209     }
210 
211   /* odd row and diagonal */
212   if (n & 1)
213     {
214       /*
215         Products marked E are already done. We need to do products marked O.
216 
217         OOOOO----
218         -EEEEO---
219         --EEEEO--
220         ---EEEEO-
221         ----EEEEO
222        */
223 
224       /* first row of O's */
225       cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
226       ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
227 
228       /* O's on diagonal */
229       /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
230          that can handle the sum below. Currently we're relying on
231          mulmid_basecase being pretty fast for a diagonal sum like this,
232 	 which is true at least for the K8 asm version, but surely false
233 	 for the generic version. */
234       mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
235       mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
236     }
237 }
238