1 /* matrix22_mul.c.
2
3 Contributed by Niels Möller and Marco Bodrato.
4
5 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
6 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9 Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
10
11 This file is part of the GNU MP Library.
12
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15
16 * the GNU Lesser General Public License as published by the Free
17 Software Foundation; either version 3 of the License, or (at your
18 option) any later version.
19
20 or
21
22 * the GNU General Public License as published by the Free Software
23 Foundation; either version 2 of the License, or (at your option) any
24 later version.
25
26 or both in parallel, as here.
27
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
31 for more details.
32
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library. If not,
35 see https://www.gnu.org/licenses/. */
36
37 #include "gmp.h"
38 #include "gmp-impl.h"
39 #include "longlong.h"
40
41 #define MUL(rp, ap, an, bp, bn) do { \
42 if (an >= bn) \
43 mpn_mul (rp, ap, an, bp, bn); \
44 else \
45 mpn_mul (rp, bp, bn, ap, an); \
46 } while (0)
47
48 /* Inputs are unsigned. */
49 static int
abs_sub_n(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t n)50 abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
51 {
52 int c;
53 MPN_CMP (c, ap, bp, n);
54 if (c >= 0)
55 {
56 mpn_sub_n (rp, ap, bp, n);
57 return 0;
58 }
59 else
60 {
61 mpn_sub_n (rp, bp, ap, n);
62 return 1;
63 }
64 }
65
66 static int
add_signed_n(mp_ptr rp,mp_srcptr ap,int as,mp_srcptr bp,int bs,mp_size_t n)67 add_signed_n (mp_ptr rp,
68 mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
69 {
70 if (as != bs)
71 return as ^ abs_sub_n (rp, ap, bp, n);
72 else
73 {
74 ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
75 return as;
76 }
77 }
78
79 mp_size_t
mpn_matrix22_mul_itch(mp_size_t rn,mp_size_t mn)80 mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
81 {
82 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
83 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
84 return 3*rn + 2*mn;
85 else
86 return 3*(rn + mn) + 5;
87 }
88
89 /* Algorithm:
90
91 / s0 \ / 1 0 0 0 \ / r0 \
92 | s1 | | 0 1 0 1 | | r1 |
93 | s2 | | 0 0 -1 1 | | r2 |
94 | s3 | = | 0 1 -1 1 | \ r3 /
95 | s4 | | -1 1 -1 1 |
96 | s5 | | 0 1 0 0 |
97 \ s6 / \ 0 0 1 0 /
98
99 / t0 \ / 1 0 0 0 \ / m0 \
100 | t1 | | 0 1 0 1 | | m1 |
101 | t2 | | 0 0 -1 1 | | m2 |
102 | t3 | = | 0 1 -1 1 | \ m3 /
103 | t4 | | -1 1 -1 1 |
104 | t5 | | 0 1 0 0 |
105 \ t6 / \ 0 0 1 0 /
106
107 Note: the two matrices above are the same, but s_i and t_i are used
108 in the same product, only for i<4, see "A Strassen-like Matrix
109 Multiplication suited for squaring and higher power computation" by
110 M. Bodrato, in Proceedings of ISSAC 2010.
111
112 / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \
113 | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 |
114 | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 |
115 \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 |
116 | s4*t5 |
117 | s5*t6 |
118 \ s6*t4 /
119
120 The scheduling uses two temporaries U0 and U1 to store products, and
121 two, S0 and T0, to store combinations of entries of the two
122 operands.
123 */
124
125 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
126 *
127 * Resulting elements are of size up to rn + mn + 1.
128 *
129 * Temporary storage: 3 rn + 3 mn + 5. */
130 void
mpn_matrix22_mul_strassen(mp_ptr r0,mp_ptr r1,mp_ptr r2,mp_ptr r3,mp_size_t rn,mp_srcptr m0,mp_srcptr m1,mp_srcptr m2,mp_srcptr m3,mp_size_t mn,mp_ptr tp)131 mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
132 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
133 mp_ptr tp)
134 {
135 mp_ptr s0, t0, u0, u1;
136 int r1s, r3s, s0s, t0s, u1s;
137 s0 = tp; tp += rn + 1;
138 t0 = tp; tp += mn + 1;
139 u0 = tp; tp += rn + mn + 1;
140 u1 = tp; /* rn + mn + 2 */
141
142 MUL (u0, r1, rn, m2, mn); /* u5 = s5 * t6 */
143 r3s = abs_sub_n (r3, r3, r2, rn); /* r3 - r2 */
144 if (r3s)
145 {
146 r1s = abs_sub_n (r1, r1, r3, rn);
147 r1[rn] = 0;
148 }
149 else
150 {
151 r1[rn] = mpn_add_n (r1, r1, r3, rn);
152 r1s = 0; /* r1 - r2 + r3 */
153 }
154 if (r1s)
155 {
156 s0[rn] = mpn_add_n (s0, r1, r0, rn);
157 s0s = 0;
158 }
159 else if (r1[rn] != 0)
160 {
161 s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
162 s0s = 1; /* s4 = -r0 + r1 - r2 + r3 */
163 /* Reverse sign! */
164 }
165 else
166 {
167 s0s = abs_sub_n (s0, r0, r1, rn);
168 s0[rn] = 0;
169 }
170 MUL (u1, r0, rn, m0, mn); /* u0 = s0 * t0 */
171 r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
172 ASSERT (r0[rn+mn] < 2); /* u0 + u5 */
173
174 t0s = abs_sub_n (t0, m3, m2, mn);
175 u1s = r3s^t0s^1; /* Reverse sign! */
176 MUL (u1, r3, rn, t0, mn); /* u2 = s2 * t2 */
177 u1[rn+mn] = 0;
178 if (t0s)
179 {
180 t0s = abs_sub_n (t0, m1, t0, mn);
181 t0[mn] = 0;
182 }
183 else
184 {
185 t0[mn] = mpn_add_n (t0, t0, m1, mn);
186 }
187
188 /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
189 at r3. I'd expect that for matrices of random size, the high
190 words t0[mn] and r1[rn] are non-zero with a pretty small
191 probability. If that can be confirmed this should be done as an
192 unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
193 add_n. */
194 if (t0[mn] != 0)
195 {
196 MUL (r3, r1, rn, t0, mn + 1); /* u3 = s3 * t3 */
197 ASSERT (r1[rn] < 2);
198 if (r1[rn] != 0)
199 mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
200 }
201 else
202 {
203 MUL (r3, r1, rn + 1, t0, mn);
204 }
205
206 ASSERT (r3[rn+mn] < 4);
207
208 u0[rn+mn] = 0;
209 if (r1s^t0s)
210 {
211 r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
212 }
213 else
214 {
215 ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
216 r3s = 0; /* u3 + u5 */
217 }
218
219 if (t0s)
220 {
221 t0[mn] = mpn_add_n (t0, t0, m0, mn);
222 }
223 else if (t0[mn] != 0)
224 {
225 t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
226 }
227 else
228 {
229 t0s = abs_sub_n (t0, t0, m0, mn);
230 }
231 MUL (u0, r2, rn, t0, mn + 1); /* u6 = s6 * t4 */
232 ASSERT (u0[rn+mn] < 2);
233 if (r1s)
234 {
235 ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
236 }
237 else
238 {
239 r1[rn] += mpn_add_n (r1, r1, r2, rn);
240 }
241 rn++;
242 t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
243 /* u3 + u5 + u6 */
244 ASSERT (r2[rn+mn-1] < 4);
245 r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
246 /* -u2 + u3 + u5 */
247 ASSERT (r3[rn+mn-1] < 3);
248 MUL (u0, s0, rn, m1, mn); /* u4 = s4 * t5 */
249 ASSERT (u0[rn+mn-1] < 2);
250 t0[mn] = mpn_add_n (t0, m3, m1, mn);
251 MUL (u1, r1, rn, t0, mn + 1); /* u1 = s1 * t1 */
252 mn += rn;
253 ASSERT (u1[mn-1] < 4);
254 ASSERT (u1[mn] == 0);
255 ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
256 /* -u2 + u3 - u4 + u5 */
257 ASSERT (r1[mn-1] < 2);
258 if (r3s)
259 {
260 ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
261 }
262 else
263 {
264 ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
265 /* u1 + u2 - u3 - u5 */
266 }
267 ASSERT (r3[mn-1] < 2);
268 if (t0s)
269 {
270 ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
271 }
272 else
273 {
274 ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
275 /* u1 - u3 - u5 - u6 */
276 }
277 ASSERT (r2[mn-1] < 2);
278 }
279
280 void
mpn_matrix22_mul(mp_ptr r0,mp_ptr r1,mp_ptr r2,mp_ptr r3,mp_size_t rn,mp_srcptr m0,mp_srcptr m1,mp_srcptr m2,mp_srcptr m3,mp_size_t mn,mp_ptr tp)281 mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
282 mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
283 mp_ptr tp)
284 {
285 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
286 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
287 {
288 mp_ptr p0, p1;
289 unsigned i;
290
291 /* Temporary storage: 3 rn + 2 mn */
292 p0 = tp + rn;
293 p1 = p0 + rn + mn;
294
295 for (i = 0; i < 2; i++)
296 {
297 MPN_COPY (tp, r0, rn);
298
299 if (rn >= mn)
300 {
301 mpn_mul (p0, r0, rn, m0, mn);
302 mpn_mul (p1, r1, rn, m3, mn);
303 mpn_mul (r0, r1, rn, m2, mn);
304 mpn_mul (r1, tp, rn, m1, mn);
305 }
306 else
307 {
308 mpn_mul (p0, m0, mn, r0, rn);
309 mpn_mul (p1, m3, mn, r1, rn);
310 mpn_mul (r0, m2, mn, r1, rn);
311 mpn_mul (r1, m1, mn, tp, rn);
312 }
313 r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
314 r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
315
316 r0 = r2; r1 = r3;
317 }
318 }
319 else
320 mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
321 m0, m1, m2, m3, mn, tp);
322 }
323