/* matrix22_mul.c.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2003, 2004, 2005, 2008 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Multiply {ap,an} by {bp,bn} into rp, swapping the operands when needed so
   that the larger operand comes first, as mpn_mul requires an >= bn.  */
#define MUL(rp, ap, an, bp, bn) do {		\
    if (an >= bn)				\
      mpn_mul (rp, ap, an, bp, bn);		\
    else					\
      mpn_mul (rp, bp, bn, ap, an);		\
  } while (0)

/* Inputs are unsigned.  Computes {rp,n} = |{ap,n} - {bp,n}|.  Returns 0
   when ap >= bp (rp = ap - bp) and 1 otherwise (rp = bp - ap).  The
   return value serves as a sign throughout this file: 0 means
   non-negative, 1 means negative.  */
static int
abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n)
{
  int c;
  MPN_CMP (c, ap, bp, n);
  if (c >= 0)
    {
      mpn_sub_n (rp, ap, bp, n);
      return 0;
    }
  else
    {
      mpn_sub_n (rp, bp, ap, n);
      return 1;
    }
}

/* Computes the sign-magnitude sum {rp,n} = (-1)^as {ap,n} + (-1)^bs {bp,n}
   and returns the sign of the result (0 = non-negative, 1 = negative).
   When the signs agree the magnitudes are added, asserted not to carry
   out; otherwise the smaller magnitude is subtracted from the larger.  */
static int
add_signed_n (mp_ptr rp,
	      mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n)
{
  if (as != bs)
    return as ^ abs_sub_n (rp, ap, bp, n);
  else
    {
      ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n));
      return as;
    }
}

/* Returns the number of scratch limbs needed by mpn_matrix22_mul for
   rn-limb R elements and mn-limb M elements.  The two cases correspond
   to the basecase and Strassen paths of mpn_matrix22_mul.  */
mp_size_t
mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn)
{
  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
    return 3*rn + 2*mn;
  else
    return 4*(rn + mn) + 5;
}

/* Algorithm:

    / s0 \   /  1  0  0  0 \  / r0 \
    | s1 |   |  0  1  0  0 |  | r1 |
    | s2 |   |  0  0  1  1 |  | r2 |
    | s3 | = | -1  0  1  1 |  \ r3 /
    | s4 |   |  1  0 -1  0 |
    | s5 |   |  1  1 -1 -1 |
    \ s6 /   \  0  0  0  1 /

    / t0 \   /  1  0  0  0 \  / m0 \
    | t1 |   |  0  0  1  0 |  | m1 |
    | t2 |   | -1  1  0  0 |  | m2 |
    | t3 | = |  1 -1  0  1 |  \ m3 /
    | t4 |   |  0 -1  0  1 |
    | t5 |   |  0  0  0  1 |
    \ t6 /   \ -1  1  1 -1 /

    / r0 \   / 1 1 0 0 0 0 0 \  / s0 * t0 \
    | r1 | = | 1 0 1 1 0 1 0 |  | s1 * t1 |
    | r2 |   | 1 0 0 1 1 0 1 |  | s2 * t2 |
    \ r3 /   \ 1 0 1 1 1 0 0 /  | s3 * t3 |
                                | s4 * t4 |
                                | s5 * t5 |
                                \ s6 * t6 /
*/

/* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3).
 *
 * Resulting elements are of size up to rn + mn + 1.
 *
 * Temporary storage: 4 rn + 4 mn + 5.
*/
void
mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
			   mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
			   mp_ptr tp)
{
  mp_ptr s2, s3, t2, t3, u0, u1;
  int r2s, r3s, s3s, t2s, t3s, u0s, u1s;

  /* Scratch layout, 4 rn + 4 mn + 5 limbs in total.  The *s variables
     hold the signs of the corresponding sign-magnitude values
     (0 = non-negative, 1 = negative, as returned by abs_sub_n).  */
  s2 = tp; tp += rn;
  s3 = tp; tp += rn + 1;
  t2 = tp; tp += mn;
  t3 = tp; tp += mn + 1;
  u0 = tp; tp += rn + mn + 1;
  u1 = tp; /* rn + mn + 2 */

  /* The numbered comments refer to the products s<i> * t<i> of the
     algorithm description above.  */
  MUL (u0, r0, rn, m0, mn); /* 0 */
  MUL (u1, r1, rn, m2, mn); /* 1 */

  /* Save the original r3; it is the algorithm's s6, needed for product 6.  */
  MPN_COPY (s2, r3, rn);

  r3[rn] = mpn_add_n (r3, r3, r2, rn);	/* r3 <- r2 + r3 = s2 */
  r0[rn] = 0;
  s3s = abs_sub_n (s3, r3, r0, rn + 1);	/* s3 = -r0 + r2 + r3 */
  t2s = abs_sub_n (t2, m1, m0, mn);	/* t2 = -m0 + m1 */
  /* t3 = m3 - t2 = m0 - m1 + m3 */
  if (t2s)
    {
      t3[mn] = mpn_add_n (t3, m3, t2, mn);
      t3s = 0;
    }
  else
    {
      t3s = abs_sub_n (t3, m3, t2, mn);
      t3[mn] = 0;
    }

  r2s = abs_sub_n (r2, r0, r2, rn);	/* r2 <- r0 - r2 = s4 */
  r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn);	/* r0 = s0 t0 + s1 t1, done */

  MUL(u1, s3, rn+1, t3, mn+1); /* 3 */
  u1s = s3s ^ t3s;
  ASSERT (u1[rn+mn+1] == 0);
  ASSERT (u1[rn+mn] < 4);

  /* u0 <- s0 t0 + s3 t3 (signed) */
  if (u1s)
    {
      u0[rn+mn] = 0;
      u0s = abs_sub_n (u0, u0, u1, rn + mn + 1);
    }
  else
    {
      u0[rn+mn] = u1[rn+mn] + mpn_add_n (u0, u0, u1, rn + mn);
      u0s = 0;
    }
  MUL(u1, r3, rn + 1, t2, mn); /* 2 */
  u1s = t2s;			/* r3 = s2 is non-negative here */
  ASSERT (u1[rn+mn] < 2);

  /* u1 <- s0 t0 + s2 t2 + s3 t3, the part common to r1 and r3 */
  u1s = add_signed_n (u1, u0, u0s, u1, u1s, rn + mn + 1);

  t2s = abs_sub_n (t2, m3, m1, mn);	/* t2 <- -m1 + m3 = t4 */
  /* s3 <- r1 - s3 = r0 + r1 - r2 - r3 = s5 */
  if (s3s)
    {
      s3[rn] += mpn_add_n (s3, s3, r1, rn);
      s3s = 0;
    }
  else if (s3[rn] > 0)
    {
      s3[rn] -= mpn_sub_n (s3, s3, r1, rn);
      s3s = 1;
    }
  else
    {
      s3s = abs_sub_n (s3, r1, s3, rn);
    }
  MUL (r1, s3, rn+1, m3, mn); /* 5 */
  /* r1 = s0 t0 + s2 t2 + s3 t3 + s5 t5, done */
  ASSERT_NOCARRY(add_signed_n (r1, r1, s3s, u1, u1s, rn + mn + 1));
  ASSERT (r1[rn + mn] < 2);

  MUL (r3, r2, rn, t2, mn); /* 4 */
  r3s = r2s ^ t2s;
  r3[rn + mn] = 0;
  /* u0 <- s0 t0 + s3 t3 + s4 t4, what r2 needs besides product 6 */
  u0s = add_signed_n (u0, u0, u0s, r3, r3s, rn + mn + 1);
  /* r3 = s0 t0 + s2 t2 + s3 t3 + s4 t4, done */
  ASSERT_NOCARRY (add_signed_n (r3, r3, r3s, u1, u1s, rn + mn + 1));
  ASSERT (r3[rn + mn] < 2);

  /* t3 <- m2 - t3 = -m0 + m1 + m2 - m3 = t6 */
  if (t3s)
    {
      t3[mn] += mpn_add_n (t3, m2, t3, mn);
      t3s = 0;
    }
  else if (t3[mn] > 0)
    {
      t3[mn] -= mpn_sub_n (t3, t3, m2, mn);
      t3s = 1;
    }
  else
    {
      t3s = abs_sub_n (t3, m2, t3, mn);
    }
  MUL (r2, s2, rn, t3, mn + 1); /* 6 */

  /* r2 = s0 t0 + s3 t3 + s4 t4 + s6 t6, done */
  ASSERT_NOCARRY (add_signed_n (r2, r2, t3s, u0, u0s, rn + mn + 1));
  ASSERT (r2[rn + mn] < 2);
}

/* Computes R = R * M, with R = (r0, r1; r2, r3) of rn-limb elements and
   M = (m0, m1; m2, m3) of mn-limb elements.  Results are written back
   over r0, ..., r3 (up to rn + mn + 1 limbs each).  tp is scratch space
   of mpn_matrix22_mul_itch (rn, mn) limbs.  Small operands use the
   8-multiplication schoolbook method, larger ones the 7-multiplication
   Strassen variant above.  */
void
mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn,
		  mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn,
		  mp_ptr tp)
{
  if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD)
      || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD))
    {
      mp_ptr p0, p1;
      unsigned i;

      /* Temporary storage: 3 rn + 2 mn */
      p0 = tp + rn;
      p1 = p0 + rn + mn;

      /* Two passes: the first computes the new top row (r0, r1), the
	 second, after redirecting the pointers, the bottom row (r2, r3).  */
      for (i = 0; i < 2; i++)
	{
	  MPN_COPY (tp, r0, rn);	/* keep old r0, overwritten below */

	  /* p0 = r0 m0, p1 = r1 m3, r0 <- r1 m2, r1 <- old r0 m1;
	     mpn_mul wants the larger operand first.  */
	  if (rn >= mn)
	    {
	      mpn_mul (p0, r0, rn, m0, mn);
	      mpn_mul (p1, r1, rn, m3, mn);
	      mpn_mul (r0, r1, rn, m2, mn);
	      mpn_mul (r1, tp, rn, m1, mn);
	    }
	  else
	    {
	      mpn_mul (p0, m0, mn, r0, rn);
	      mpn_mul (p1, m3, mn, r1, rn);
	      mpn_mul (r0, m2, mn, r1, rn);
	      mpn_mul (r1, m1, mn, tp, rn);
	    }
	  r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn);	/* r0 = r0 m0 + r1 m2 */
	  r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn);	/* r1 = r0 m1 + r1 m3 */

	  r0 = r2; r1 = r3;
	}
    }
  else
    mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn,
			       m0, m1, m2, m3, mn, tp);
}