1 /* mpn_toom42_mulmid -- toom42 middle product
2
3 Contributed by David Harvey.
4
5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9 Copyright 2011 Free Software Foundation, Inc.
10
11 This file is part of the GNU MP Library.
12
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15
16 * the GNU Lesser General Public License as published by the Free
17 Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
19
20 or
21
22 * the GNU General Public License as published by the Free Software
23 Foundation; either version 2 of the License, or (at your option) any
24 later version.
25
26 or both in parallel, as here.
27
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31 for more details.
32
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library. If not,
35 see https://www.gnu.org/licenses/. */
36
37
38 #include "gmp-impl.h"
39
40
41
42 /*
43 Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
44
45 Neither ap nor bp may overlap rp.
46
47 Must have n >= 4.
48
49 Amount of scratch space required is given by mpn_toom42_mulmid_itch().
50
51 FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
52 requirements should be clarified.
53 */
54 void
mpn_toom42_mulmid(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t n,mp_ptr scratch)55 mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
56 mp_ptr scratch)
57 {
58 mp_limb_t cy, e[12], zh, zl;
59 mp_size_t m;
60 int neg;
61
62 ASSERT (n >= 4);
63 ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
64 ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
65
66 ap += n & 1; /* handle odd row and diagonal later */
67 m = n / 2;
68
69 /* (e0h:e0l) etc are correction terms, in 2's complement */
70 #define e0l (e[0])
71 #define e0h (e[1])
72 #define e1l (e[2])
73 #define e1h (e[3])
74 #define e2l (e[4])
75 #define e2h (e[5])
76 #define e3l (e[6])
77 #define e3h (e[7])
78 #define e4l (e[8])
79 #define e4h (e[9])
80 #define e5l (e[10])
81 #define e5h (e[11])
82
83 #define s (scratch + 2)
84 #define t (rp + m + 2)
85 #define p0 rp
86 #define p1 scratch
87 #define p2 (rp + m)
88 #define next_scratch (scratch + 3*m + 1)
89
90 /*
91 rp scratch
92 |---------|-----------| |---------|---------|----------|
93 0 m 2m+2 0 m 2m 3m+1
94 <----p2----> <-------------s------------->
95 <----p0----><---t----> <----p1---->
96 */
97
98 /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
99 cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
100 cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
101 bp + m, bp, m, cy);
102 mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
103
104 /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
105 if (mpn_cmp (bp + m, bp, m) < 0)
106 {
107 ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
108 ap + m - 1, ap + 2*m - 1, m, 0));
109 neg = 1;
110 }
111 else
112 {
113 ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
114 ap + m - 1, ap + 2*m - 1, m, 0));
115 neg = 0;
116 }
117
118 /* recursive middle products. The picture is:
119
120 b[2m-1] A A A B B B - - - - -
121 ... - A A A B B B - - - -
122 b[m] - - A A A B B B - - -
123 b[m-1] - - - C C C D D D - -
124 ... - - - - C C C D D D -
125 b[0] - - - - - C C C D D D
126 a[0] ... a[m] ... a[2m] ... a[4m-2]
127 */
128
129 if (m < MULMID_TOOM42_THRESHOLD)
130 {
131 /* A + B */
132 mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
133 /* accumulate high limbs of p0 into e1 */
134 ADDC_LIMB (cy, e1l, e1l, p0[m]);
135 e1h += p0[m + 1] + cy;
136 /* (-1)^neg * (B - C) (overwrites first m limbs of s) */
137 mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
138 /* C + D (overwrites t) */
139 mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
140 }
141 else
142 {
143 /* as above, but use toom42 instead */
144 mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
145 ADDC_LIMB (cy, e1l, e1l, p0[m]);
146 e1h += p0[m + 1] + cy;
147 mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
148 mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
149 }
150
151 /* apply error terms */
152
153 /* -e0 at rp[0] */
154 SUBC_LIMB (cy, rp[0], rp[0], e0l);
155 SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
156 if (UNLIKELY (cy))
157 {
158 cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
159 SUBC_LIMB (cy, e1l, e1l, cy);
160 e1h -= cy;
161 }
162
163 /* z = e1 - e2 + high(p0) */
164 SUBC_LIMB (cy, zl, e1l, e2l);
165 zh = e1h - e2h - cy;
166
167 /* z at rp[m] */
168 ADDC_LIMB (cy, rp[m], rp[m], zl);
169 zh = (zh + cy) & GMP_NUMB_MASK;
170 ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
171 cy -= (zh >> (GMP_NUMB_BITS - 1));
172 if (UNLIKELY (cy))
173 {
174 if (cy == 1)
175 mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
176 else /* cy == -1 */
177 mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
178 }
179
180 /* e3 at rp[2*m] */
181 ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
182 rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
183
184 /* e4 at p1[0] */
185 ADDC_LIMB (cy, p1[0], p1[0], e4l);
186 ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
187 if (UNLIKELY (cy))
188 mpn_add_1 (p1 + 2, p1 + 2, m, 1);
189
190 /* -e5 at p1[m] */
191 SUBC_LIMB (cy, p1[m], p1[m], e5l);
192 p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
193
194 /* adjustment if p1 ends up negative */
195 cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
196
197 /* add (-1)^neg * (p1 - B^m * p1) to output */
198 if (neg)
199 {
200 mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
201 mpn_add (rp, rp, 2*m + 2, p1, m + 2); /* A + C */
202 mpn_sub_n (rp + m, rp + m, p1, m + 2); /* B + D */
203 }
204 else
205 {
206 mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
207 mpn_sub (rp, rp, 2*m + 2, p1, m + 2); /* A + C */
208 mpn_add_n (rp + m, rp + m, p1, m + 2); /* B + D */
209 }
210
211 /* odd row and diagonal */
212 if (n & 1)
213 {
214 /*
215 Products marked E are already done. We need to do products marked O.
216
217 OOOOO----
218 -EEEEO---
219 --EEEEO--
220 ---EEEEO-
221 ----EEEEO
222 */
223
224 /* first row of O's */
225 cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
226 ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
227
228 /* O's on diagonal */
229 /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
230 that can handle the sum below. Currently we're relying on
231 mulmid_basecase being pretty fast for a diagonal sum like this,
232 which is true at least for the K8 asm version, but surely false
233 for the generic version. */
234 mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
235 mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
236 }
237 }
238