/* matrix22_mul.c.

   Contributed by Niels Möller and Marco Bodrato.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2003, 2004, 2005, 2008, 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* MUL(rp, ap, an, bp, bn) -- multiply {ap,an} by {bp,bn}, writing the
   an+bn limb product at {rp,..}.  mpn_mul requires its first operand to
   have at least as many limbs as its second, so swap the operands when
   an < bn.  */
#define MUL(rp, ap, an, bp, bn) do {				\
    if (an >= bn)						\
      mpn_mul (rp, ap, an, bp, bn);				\
    else							\
      mpn_mul (rp, bp, bn, ap, an);				\
  } while (0)

/* Inputs are unsigned.
*/ 38 static int 39 abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) 40 { 41 int c; 42 MPN_CMP (c, ap, bp, n); 43 if (c >= 0) 44 { 45 mpn_sub_n (rp, ap, bp, n); 46 return 0; 47 } 48 else 49 { 50 mpn_sub_n (rp, bp, ap, n); 51 return 1; 52 } 53 } 54 55 static int 56 add_signed_n (mp_ptr rp, 57 mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n) 58 { 59 if (as != bs) 60 return as ^ abs_sub_n (rp, ap, bp, n); 61 else 62 { 63 ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n)); 64 return as; 65 } 66 } 67 68 mp_size_t 69 mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn) 70 { 71 if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) 72 || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) 73 return 3*rn + 2*mn; 74 else 75 return 3*(rn + mn) + 5; 76 } 77 78 /* Algorithm: 79 80 / s0 \ / 1 0 0 0 \ / r0 \ 81 | s1 | | 0 1 0 1 | | r1 | 82 | s2 | | 0 0 -1 1 | | r2 | 83 | s3 | = | 0 1 -1 1 | \ r3 / 84 | s4 | | -1 1 -1 1 | 85 | s5 | | 0 1 0 0 | 86 \ s6 / \ 0 0 1 0 / 87 88 / t0 \ / 1 0 0 0 \ / m0 \ 89 | t1 | | 0 1 0 1 | | m1 | 90 | t2 | | 0 0 -1 1 | | m2 | 91 | t3 | = | 0 1 -1 1 | \ m3 / 92 | t4 | | -1 1 -1 1 | 93 | t5 | | 0 1 0 0 | 94 \ t6 / \ 0 0 1 0 / 95 96 Note: the two matrices above are the same, but s_i and t_i are used 97 in the same product, only for i<4, see "A Strassen-like Matrix 98 Multiplication suited for squaring and higher power computation" by 99 M. Bodrato, in Proceedings of ISSAC 2010. 100 101 / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \ 102 | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 | 103 | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 | 104 \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 | 105 | s4*t5 | 106 | s5*t6 | 107 \ s6*t4 / 108 109 The scheduling uses two temporaries U0 and U1 to store products, and 110 two, S0 and T0, to store combinations of entries of the two 111 operands. 112 */ 113 114 /* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3). 115 * 116 * Resulting elements are of size up to rn + mn + 1. 117 * 118 * Temporary storage: 3 rn + 3 mn + 5. 
 */
void
mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
			   mp_ptr tp)
{
  mp_ptr s0, t0, u0, u1;
  int r1s, r3s, s0s, t0s, u1s;
  /* Carve the caller-supplied scratch into one rn+1 area (S0), one
     mn+1 area (T0), and two rn+mn+1/+2 product areas (U0, U1).  */
  s0 = tp; tp += rn + 1;
  t0 = tp; tp += mn + 1;
  u0 = tp; tp += rn + mn + 1;
  u1 = tp; /* rn + mn + 2 */

  MUL (u0, r1, rn, m2, mn);	/* u5 = s5 * t6 */

  /* Form s3 = r1 - r2 + r3 in r1 (sign in r1s, extra limb r1[rn]),
     via r3 <- |r3 - r2| first.  */
  r3s = abs_sub_n (r3, r3, r2, rn);	/* r3 - r2 */
  if (r3s)
    {
      r1s = abs_sub_n (r1, r1, r3, rn);
      r1[rn] = 0;
    }
  else
    {
      r1[rn] = mpn_add_n (r1, r1, r3, rn);
      r1s = 0;			/* r1 - r2 + r3 */
    }

  /* Form s4 = -r0 + r1 - r2 + r3 in s0; s0s holds the sign of -s4
     (note the reversed-sign convention flagged below).  */
  if (r1s)
    {
      s0[rn] = mpn_add_n (s0, r1, r0, rn);
      s0s = 0;
    }
  else if (r1[rn] != 0)
    {
      s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn);
      s0s = 1;			/* s4 = -r0 + r1 - r2 + r3 */
				/* Reverse sign! */
    }
  else
    {
      s0s = abs_sub_n (s0, r0, r1, rn);
      s0[rn] = 0;
    }

  MUL (u1, r0, rn, m0, mn);	/* u0 = s0 * t0 */
  /* New r0 = u0 + u5; the sum must fit in rn+mn+1 limbs.  */
  r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);
  ASSERT (r0[rn+mn] < 2); /* u0 + u5 */

  /* t2 = -m2 + m3 in t0 (sign t0s); u2 carries a reversed sign.  */
  t0s = abs_sub_n (t0, m3, m2, mn);
  u1s = r3s^t0s^1;		/* Reverse sign! */
  MUL (u1, r3, rn, t0, mn);	/* u2 = s2 * t2 */
  u1[rn+mn] = 0;

  /* t3 = m1 - m2 + m3 in t0 (sign t0s, extra limb t0[mn]).  */
  if (t0s)
    {
      t0s = abs_sub_n (t0, m1, t0, mn);
      t0[mn] = 0;
    }
  else
    {
      t0[mn] = mpn_add_n (t0, t0, m1, mn);
    }

  /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs
     at r3.  I'd expect that for matrices of random size, the high
     words t0[mn] and r1[rn] are non-zero with a pretty small
     probability.  If that can be confirmed this should be done as an
     unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn]))
     add_n.  */
  if (t0[mn] != 0)
    {
      MUL (r3, r1, rn, t0, mn + 1); /* u3 = s3 * t3 */
      ASSERT (r1[rn] < 2);
      if (r1[rn] != 0)
	/* Account for s3's high limb: add t3 shifted by rn limbs.  */
	mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1);
    }
  else
    {
      MUL (r3, r1, rn + 1, t0, mn);
    }

  ASSERT (r3[rn+mn] < 4);

  /* r3 <- u3 + u5 (signed combination, sign in r3s).  */
  u0[rn+mn] = 0;
  if (r1s^t0s)
    {
      r3s = abs_sub_n (r3, u0, r3, rn + mn + 1);
    }
  else
    {
      ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1));
      r3s = 0;			/* u3 + u5 */
    }

  /* t4 = -m0 + m1 - m2 + m3, built from t3 already in t0.  */
  if (t0s)
    {
      t0[mn] = mpn_add_n (t0, t0, m0, mn);
    }
  else if (t0[mn] != 0)
    {
      t0[mn] -= mpn_sub_n (t0, t0, m0, mn);
    }
  else
    {
      t0s = abs_sub_n (t0, t0, m0, mn);
    }
  MUL (u0, r2, rn, t0, mn + 1);	/* u6 = s6 * t4 */
  ASSERT (u0[rn+mn] < 2);

  /* s1 = r1 + r3 (original entries), with r1 currently holding
     r1 - r2 + r3, so adding/subtracting r2 restores r1 + r3.  */
  if (r1s)
    {
      ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn));
    }
  else
    {
      r1[rn] += mpn_add_n (r1, r1, r2, rn);
    }
  rn++;
  /* From here on, "rn" includes the extra carry limb.  */
  t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn);
				/* u3 + u5 + u6 */
  ASSERT (r2[rn+mn-1] < 4);
  r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn);
				/* -u2 + u3 + u5 */
  ASSERT (r3[rn+mn-1] < 3);
  MUL (u0, s0, rn, m1, mn);	/* u4 = s4 * t5 */
  ASSERT (u0[rn+mn-1] < 2);
  t0[mn] = mpn_add_n (t0, m3, m1, mn);	/* t1 = m1 + m3 */
  MUL (u1, r1, rn, t0, mn + 1);	/* u1 = s1 * t1 */
  mn += rn;
  /* Now "mn" is the full result size rn + mn (+ carry limbs).  */
  ASSERT (u1[mn-1] < 4);
  ASSERT (u1[mn] == 0);
  ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn));
				/* -u2 + u3 - u4 + u5 */
  ASSERT (r1[mn-1] < 2);
  if (r3s)
    {
      ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn));
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn));
				/* u1 + u2 - u3 - u5 */
    }
  ASSERT (r3[mn-1] < 2);
  if (t0s)
    {
      ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn));
    }
  else
    {
      ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn));
				/* u1 - u3 - u5 - u6 */
    }
  ASSERT (r2[mn-1] < 2);
}

/* Computes R = R * M, with R = (r0, r1; r2, r3) of element size rn and
   M = (m0, m1; m2, m3) of element size mn.  Dispatches to the Strassen
   variant above threshold; otherwise does the straightforward eight
   multiplications.  tp must provide mpn_matrix22_mul_itch (rn, mn)
   scratch limbs.  */
void
mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
		  mp_ptr tp)
{
  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
    {
      mp_ptr p0, p1;
      unsigned i;

      /* Temporary storage: 3 rn + 2 mn */
      p0 = tp + rn;
      p1 = p0 + rn + mn;

      /* Two passes: first over (r0, r1), then over (r2, r3); each pass
	 computes one row of the product.  */
      for (i = 0; i < 2; i++)
	{
	  /* Save r0, since it is overwritten before its last use.  */
	  MPN_COPY (tp, r0, rn);

	  /* mpn_mul wants the longer operand first.  */
	  if (rn >= mn)
	    {
	      mpn_mul (p0, r0, rn, m0, mn);
	      mpn_mul (p1, r1, rn, m3, mn);
	      mpn_mul (r0, r1, rn, m2, mn);
	      mpn_mul (r1, tp, rn, m1, mn);
	    }
	  else
	    {
	      mpn_mul (p0, m0, mn, r0, rn);
	      mpn_mul (p1, m3, mn, r1, rn);
	      mpn_mul (r0, m2, mn, r1, rn);
	      mpn_mul (r1, m1, mn, tp, rn);
	    }
	  /* New r0 = old r0 * m0 + old r1 * m2,
	     new r1 = old r0 * m1 + old r1 * m3.  */
	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);
	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);

	  r0 = r2; r1 = r3;
	}
    }
  else
    mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
			       m0, m1, m2, m3, mn, tp);
}