1 /* matrix22_mul.c.
2 
3    Contributed by Niels Möller and Marco Bodrato.
4 
5    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
6    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8 
9 Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc.
10 
11 This file is part of the GNU MP Library.
12 
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of either:
15 
16   * the GNU Lesser General Public License as published by the Free
17     Software Foundation; either version 3 of the License, or (at your
18     option) any later version.
19 
20 or
21 
22   * the GNU General Public License as published by the Free Software
23     Foundation; either version 2 of the License, or (at your option) any
24     later version.
25 
26 or both in parallel, as here.
27 
28 The GNU MP Library is distributed in the hope that it will be useful, but
29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31 for more details.
32 
33 You should have received copies of the GNU General Public License and the
34 GNU Lesser General Public License along with the GNU MP Library.  If not,
35 see https://www.gnu.org/licenses/.  */
36 
37 #include "gmp.h"
38 #include "gmp-impl.h"
39 #include "longlong.h"
40 
41 #define MUL(rp, ap, an, bp, bn) do {		\
42   if (an >= bn)					\
43     mpn_mul (rp, ap, an, bp, bn);		\
44   else						\
45     mpn_mul (rp, bp, bn, ap, an);		\
46 } while (0)
47 
48 /* Inputs are unsigned. */
49 static int
abs_sub_n(mp_ptr rp,mp_srcptr ap,mp_srcptr bp,mp_size_t n)50 abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
51 {
52   int c;
53   MPN_CMP (c, ap, bp, n);
54   if (c >= 0)
55     {
56       mpn_sub_n (rp, ap, bp, n);
57       return 0;
58     }
59   else
60     {
61       mpn_sub_n (rp, bp, ap, n);
62       return 1;
63     }
64 }
65 
66 static int
add_signed_n(mp_ptr rp,mp_srcptr ap,int as,mp_srcptr bp,int bs,mp_size_t n)67 add_signed_n (mp_ptr rp,
68 	      mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
69 {
70   if (as != bs)
71     return as ^ abs_sub_n (rp, ap, bp, n);
72   else
73     {
74       ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
75       return as;
76     }
77 }
78 
79 mp_size_t
mpn_matrix22_mul_itch(mp_size_t rn,mp_size_t mn)80 mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
81 {
82   if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
83       || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
84     return 3*rn + 2*mn;
85   else
86     return 3*(rn + mn) + 5;
87 }
88 
89 /* Algorithm:
90 
91     / s0 \   /  1  0  0  0 \ / r0 \
92     | s1 |   |  0  1  0  1 | | r1 |
93     | s2 |   |  0  0 -1  1 | | r2 |
94     | s3 | = |  0  1 -1  1 | \ r3 /
95     | s4 |   | -1  1 -1  1 |
96     | s5 |   |  0  1  0  0 |
97     \ s6 /   \  0  0  1  0 /
98 
99     / t0 \   /  1  0  0  0 \ / m0 \
100     | t1 |   |  0  1  0  1 | | m1 |
101     | t2 |   |  0  0 -1  1 | | m2 |
102     | t3 | = |  0  1 -1  1 | \ m3 /
103     | t4 |   | -1  1 -1  1 |
104     | t5 |   |  0  1  0  0 |
105     \ t6 /   \  0  0  1  0 /
106 
107   Note: the two matrices above are the same, but s_i and t_i are used
108   in the same product, only for i<4, see "A Strassen-like Matrix
109   Multiplication suited for squaring and higher power computation" by
110   M. Bodrato, in Proceedings of ISSAC 2010.
111 
112     / r0 \   / 1 0  0  0  0  1  0 \ / s0*t0 \
113     | r1 | = | 0 0 -1  1 -1  1  0 | | s1*t1 |
114     | r2 |   | 0 1  0 -1  0 -1 -1 | | s2*t2 |
115     \ r3 /   \ 0 1  1 -1  0 -1  0 / | s3*t3 |
116 				    | s4*t5 |
117 				    | s5*t6 |
118 				    \ s6*t4 /
119 
120   The scheduling uses two temporaries U0 and U1 to store products, and
121   two, S0 and T0, to store combinations of entries of the two
122   operands.
123 */
124 
125 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
126  *
127  * Resulting elements are of size up to rn + mn + 1.
128  *
129  * Temporary storage: 3 rn + 3 mn + 5. */
130 void
mpn_matrix22_mul_strassen(mp_ptr r0,mp_ptr r1,mp_ptr r2,mp_ptr r3,mp_size_t rn,mp_srcptr m0,mp_srcptr m1,mp_srcptr m2,mp_srcptr m3,mp_size_t mn,mp_ptr tp)131 mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
132 			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
133 			   mp_ptr tp)
134 {
135   mp_ptr s0, t0, u0, u1;
136   int r1s, r3s, s0s, t0s, u1s;
137   s0 = tp; tp += rn + 1;
138   t0 = tp; tp += mn + 1;
139   u0 = tp; tp += rn + mn + 1;
140   u1 = tp; /* rn + mn + 2 */
141 
142   MUL (u0, r1, rn, m2, mn);		/* u5 = s5 * t6 */
143   r3s = abs_sub_n (r3, r3, r2, rn);	/* r3 - r2 */
144   if (r3s)
145     {
146       r1s = abs_sub_n (r1, r1, r3, rn);
147       r1[rn] = 0;
148     }
149   else
150     {
151       r1[rn] = mpn_add_n (r1, r1, r3, rn);
152       r1s = 0;				/* r1 - r2 + r3  */
153     }
154   if (r1s)
155     {
156       s0[rn] = mpn_add_n (s0, r1, r0, rn);
157       s0s = 0;
158     }
159   else if (r1[rn] != 0)
160     {
161       s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
162       s0s = 1;				/* s4 = -r0 + r1 - r2 + r3 */
163 					/* Reverse sign! */
164     }
165   else
166     {
167       s0s = abs_sub_n (s0, r0, r1, rn);
168       s0[rn] = 0;
169     }
170   MUL (u1, r0, rn, m0, mn);		/* u0 = s0 * t0 */
171   r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
172   ASSERT (r0[rn+mn] < 2);		/* u0 + u5 */
173 
174   t0s = abs_sub_n (t0, m3, m2, mn);
175   u1s = r3s^t0s^1;			/* Reverse sign! */
176   MUL (u1, r3, rn, t0, mn);		/* u2 = s2 * t2 */
177   u1[rn+mn] = 0;
178   if (t0s)
179     {
180       t0s = abs_sub_n (t0, m1, t0, mn);
181       t0[mn] = 0;
182     }
183   else
184     {
185       t0[mn] = mpn_add_n (t0, t0, m1, mn);
186     }
187 
188   /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
189      at r3. I'd expect that for matrices of random size, the high
190      words t0[mn] and r1[rn] are non-zero with a pretty small
191      probability. If that can be confirmed this should be done as an
192      unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
193      add_n. */
194   if (t0[mn] != 0)
195     {
196       MUL (r3, r1, rn, t0, mn + 1);	/* u3 = s3 * t3 */
197       ASSERT (r1[rn] < 2);
198       if (r1[rn] != 0)
199 	mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
200     }
201   else
202     {
203       MUL (r3, r1, rn + 1, t0, mn);
204     }
205 
206   ASSERT (r3[rn+mn] < 4);
207 
208   u0[rn+mn] = 0;
209   if (r1s^t0s)
210     {
211       r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
212     }
213   else
214     {
215       ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
216       r3s = 0;				/* u3 + u5 */
217     }
218 
219   if (t0s)
220     {
221       t0[mn] = mpn_add_n (t0, t0, m0, mn);
222     }
223   else if (t0[mn] != 0)
224     {
225       t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
226     }
227   else
228     {
229       t0s = abs_sub_n (t0, t0, m0, mn);
230     }
231   MUL (u0, r2, rn, t0, mn + 1);		/* u6 = s6 * t4 */
232   ASSERT (u0[rn+mn] < 2);
233   if (r1s)
234     {
235       ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
236     }
237   else
238     {
239       r1[rn] += mpn_add_n (r1, r1, r2, rn);
240     }
241   rn++;
242   t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
243 					/* u3 + u5 + u6 */
244   ASSERT (r2[rn+mn-1] < 4);
245   r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
246 					/* -u2 + u3 + u5  */
247   ASSERT (r3[rn+mn-1] < 3);
248   MUL (u0, s0, rn, m1, mn);		/* u4 = s4 * t5 */
249   ASSERT (u0[rn+mn-1] < 2);
250   t0[mn] = mpn_add_n (t0, m3, m1, mn);
251   MUL (u1, r1, rn, t0, mn + 1);		/* u1 = s1 * t1 */
252   mn += rn;
253   ASSERT (u1[mn-1] < 4);
254   ASSERT (u1[mn] == 0);
255   ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
256 					/* -u2 + u3 - u4 + u5  */
257   ASSERT (r1[mn-1] < 2);
258   if (r3s)
259     {
260       ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
261     }
262   else
263     {
264       ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
265 					/* u1 + u2 - u3 - u5  */
266     }
267   ASSERT (r3[mn-1] < 2);
268   if (t0s)
269     {
270       ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
271     }
272   else
273     {
274       ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
275 					/* u1 - u3 - u5 - u6  */
276     }
277   ASSERT (r2[mn-1] < 2);
278 }
279 
280 void
mpn_matrix22_mul(mp_ptr r0,mp_ptr r1,mp_ptr r2,mp_ptr r3,mp_size_t rn,mp_srcptr m0,mp_srcptr m1,mp_srcptr m2,mp_srcptr m3,mp_size_t mn,mp_ptr tp)281 mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
282 		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
283 		  mp_ptr tp)
284 {
285   if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
286       || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
287     {
288       mp_ptr p0, p1;
289       unsigned i;
290 
291       /* Temporary storage: 3 rn + 2 mn */
292       p0 = tp + rn;
293       p1 = p0 + rn + mn;
294 
295       for (i = 0; i < 2; i++)
296 	{
297 	  MPN_COPY (tp, r0, rn);
298 
299 	  if (rn >= mn)
300 	    {
301 	      mpn_mul (p0, r0, rn, m0, mn);
302 	      mpn_mul (p1, r1, rn, m3, mn);
303 	      mpn_mul (r0, r1, rn, m2, mn);
304 	      mpn_mul (r1, tp, rn, m1, mn);
305 	    }
306 	  else
307 	    {
308 	      mpn_mul (p0, m0, mn, r0, rn);
309 	      mpn_mul (p1, m3, mn, r1, rn);
310 	      mpn_mul (r0, m2, mn, r1, rn);
311 	      mpn_mul (r1, m1, mn, tp, rn);
312 	    }
313 	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
314 	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);
315 
316 	  r0 = r2; r1 = r3;
317 	}
318     }
319   else
320     mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
321 			       m0, m1, m2, m3, mn, tp);
322 }
323