1 /* mpn_mulmid -- middle product
2
3 Copyright (C) 2009, David Harvey
4
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
9
10 1. Redistributions of source code must retain the above copyright notice, this
11 list of conditions and the following disclaimer.
12
13 2. Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the documentation
15 and/or other materials provided with the distribution.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE
21 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 */
29
30 #include "mpir.h"
31 #include "gmp-impl.h"
32
33
34 #define CHUNK (200 + MULMID_TOOM42_THRESHOLD)
35
36 /* Let a = sum_0^{m-1} a_i B^i and b = sum_0^{n-1} b_j B^j
37
38 then MP(a, m, b, n) = sum_{0<=i<m, 0<=j<n, n-1<=i+j<=m-1} a_ib_j B^{i+j-n+1}
39
40 Note there are m-n+1 different values of i+j and each product a_ib_j will be two limbs. Thus when added together, the sum must take up n-m+3 limbs of space.
41
42 This function computes MP(ap,an,bp,bn), placing the result in {rp, an-bn+3}.
43
44 It is required that bn << GMP_NUMBMAX.
45 */
46
47 void
mpn_mulmid(mp_ptr rp,mp_srcptr ap,mp_size_t an,mp_srcptr bp,mp_size_t bn)48 mpn_mulmid (mp_ptr rp,
49 mp_srcptr ap, mp_size_t an,
50 mp_srcptr bp, mp_size_t bn)
51 {
52 mp_size_t rn, k;
53 mp_ptr scratch, temp;
54
55 ASSERT (an >= bn);
56 ASSERT (bn >= 1);
57 ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an));
58 ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn));
59
60 if (bn < MULMID_TOOM42_THRESHOLD)
61 {
62 /* region not tall enough to make toom42 worthwhile for any portion */
63
64 if (an < CHUNK)
65 {
66 /* region not too wide either, just call basecase directly */
67 mpn_mulmid_basecase (rp, ap, an, bp, bn);
68 return;
69 }
70
71 /* Region quite wide. For better locality, use basecase on chunks:
72
73 AAABBBCC..
74 .AAABBBCC.
75 ..AAABBBCC
76 */
77
78 k = CHUNK - bn + 1; /* number of diagonals per chunk */
79
80 /* first chunk (marked A in the above diagram) */
81 mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
82
83 /* remaining chunks (B, C, etc) */
84 an -= k;
85
86 while (an >= CHUNK)
87 {
88 mp_limb_t t0, t1, cy;
89 ap += k, rp += k;
90 t0 = rp[0], t1 = rp[1];
91 mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn);
92 ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
93 MPN_INCR_U (rp + 1, k + 1, t1 + cy);
94 an -= k;
95 }
96
97 if (an >= bn)
98 {
99 /* last remaining chunk */
100 mp_limb_t t0, t1, cy;
101 ap += k, rp += k;
102 t0 = rp[0], t1 = rp[1];
103 mpn_mulmid_basecase (rp, ap, an, bp, bn);
104 ADDC_LIMB (cy, rp[0], rp[0], t0);
105 MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy);
106 }
107
108 return;
109 }
110
111 /* region is tall enough for toom42 */
112
113 rn = an - bn + 1;
114
115 if (rn < MULMID_TOOM42_THRESHOLD)
116 {
117 /* region not wide enough to make toom42 worthwhile for any portion */
118
119 if (bn < CHUNK)
120 {
121 /* region not too tall either, just call basecase directly */
122 mpn_mulmid_basecase (rp, ap, an, bp, bn);
123 return;
124 }
125
126 /* Region quite tall. For better locality, use basecase on chunks:
127
128 AAAAA....
129 .AAAAA...
130 ..BBBBB..
131 ...BBBBB.
132 ....CCCCC
133 */
134 {
135 TMP_DECL;
136 TMP_MARK;
137
138 temp = TMP_ALLOC_LIMBS (rn + 2);
139
140 /* first chunk (marked A in the above diagram) */
141 bp += bn - CHUNK, an -= bn - CHUNK;
142 mpn_mulmid_basecase (rp, ap, an, bp, CHUNK);
143
144 /* remaining chunks (B, C, etc) */
145 bn -= CHUNK;
146
147 while (bn >= CHUNK)
148 {
149 ap += CHUNK, bp -= CHUNK;
150 mpn_mulmid_basecase (temp, ap, an, bp, CHUNK);
151 mpn_add_n (rp, rp, temp, rn + 2);
152 bn -= CHUNK;
153 }
154
155 if (bn)
156 {
157 /* last remaining chunk */
158 ap += CHUNK, bp -= bn;
159 mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn);
160 mpn_add_n (rp, rp, temp, rn + 2);
161 }
162
163 TMP_FREE;
164 }
165 return;
166 }
167
168 /* we're definitely going to use toom42 somewhere */
169
170 if (bn > rn)
171 {
172 /* slice region into chunks, use toom42 on all chunks except possibly
173 the last:
174
175 AA....
176 .AA...
177 ..BB..
178 ...BB.
179 ....CC
180 */
181
182 TMP_DECL;
183 TMP_MARK;
184
185 temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn));
186 scratch = temp + rn + 2;
187
188 /* first chunk (marked A in the above diagram) */
189 bp += bn - rn;
190 mpn_toom42_mulmid (rp, ap, bp, rn, scratch);
191
192 /* remaining chunks (B, C, etc) */
193 bn -= rn;
194
195 while (bn >= rn)
196 {
197 ap += rn, bp -= rn;
198 mpn_toom42_mulmid (temp, ap, bp, rn, scratch);
199 mpn_add_n (rp, rp, temp, rn + 2);
200 bn -= rn;
201 }
202
203 if (bn)
204 {
205 /* last remaining chunk */
206 ap += rn, bp -= bn;
207 mpn_mulmid (temp, ap, rn + bn - 1, bp, bn);
208 mpn_add_n (rp, rp, temp, rn + 2);
209 }
210
211 TMP_FREE;
212 }
213 else
214 {
215 /* slice region into chunks, use toom42 on all chunks except possibly
216 the last:
217
218 AAABBBCC..
219 .AAABBBCC.
220 ..AAABBBCC
221 */
222
223 TMP_DECL;
224 TMP_MARK;
225
226 scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn));
227
228 /* first chunk (marked A in the above diagram) */
229 mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
230
231 /* remaining chunks (B, C, etc) */
232 rn -= bn;
233
234 while (rn >= bn)
235 {
236 mp_limb_t t0, t1, cy;
237 ap += bn, rp += bn;
238 t0 = rp[0], t1 = rp[1];
239 mpn_toom42_mulmid (rp, ap, bp, bn, scratch);
240 ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */
241 MPN_INCR_U (rp + 1, bn + 1, t1 + cy);
242 rn -= bn;
243 }
244
245 TMP_FREE;
246
247 if (rn)
248 {
249 /* last remaining chunk */
250 mp_limb_t t0, t1, cy;
251 ap += bn, rp += bn;
252 t0 = rp[0], t1 = rp[1];
253 mpn_mulmid (rp, ap, rn + bn - 1, bp, bn);
254 ADDC_LIMB (cy, rp[0], rp[0], t0);
255 MPN_INCR_U (rp + 1, rn + 1, t1 + cy);
256 }
257 }
258 }
259