/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008, 2010 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
*/

#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n--><--n-->
   ____ ______ ______ ______
  |_a3_|___a2_|___a1_|___a0_|
   |b3_|___b2_|___b1_|___b0_|
   <-t-><--n--><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)    ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)  |ah| <= 1  bh <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)    ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/

/* Decide at compile time which recursive multiplication routines the
   TOOM33_MUL_N_REC macro can ever need, based on the threshold ranges.
   When tuning, all candidates must be selectable at run time.  */
#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33 1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33						\
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point.  We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD.  If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

/* Balanced n x n recursive multiply used for the point products,
   dispatching on n against the usual mul thresholds.  */
#define TOOM33_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (! MAYBE_mul_toom33						\
	     || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
  } while (0)

/* Multiply {ap,an} by {bp,bn}, writing the an+bn limb product to {pp,an+bn},
   using 3-way Toom-Cook interpolation at the five points listed above.
   Requires bn <= an < (3/2)bn (see the ASSERTs below).  scratch must point
   to temporary space sized for this routine (presumably the corresponding
   *_itch amount, at least 5n+5 limbs plus recursion space -- confirm against
   callers); parts of pp are also reused as temporary space before the final
   interpolation.  */
void
mpn_toom33_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;			/* sign of vm1: product of the signs of A(-1), B(-1) */
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;	/* A(1), |A(-1)|, A(2); each n+1 limbs */
  mp_ptr bs1, bsm1, bs2;	/* B(1), |B(-1)|, B(2); each n+1 limbs */

  /* The three (a) resp. (b) coefficient chunks of n limbs each; the top
     chunks a2/b2 are only s resp. t limbs long.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;		/* length of a2 */
  t = bn - 2 * n;		/* length of b2 */

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* Lay out the evaluation points in scratch and in not-yet-used parts of
     the product area pp.  */
  as1  = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;			/* n+1 limb temporary for a0+a2 resp. b0+b2 */

  vm1_neg = 0;

  /* Compute as1 = A(1) = a0+a1+a2 and asm1 = |A(-1)| = |a0-a1+a2|,
     recording a negative A(-1) in vm1_neg.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1, so A(-1) < 0; compute a1-(a0+a2) instead.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;		/* add carry is in bit 1, sub borrow in bit 0 */
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = A(2) = a0+2a1+4a2, via 2*(A(1)+a2)-a0 = a0+2a1+4a2
     (or directly with addlsh1 as 2*(2*a2+a1)+a0).  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a1, a2, s);			/* a1 + 2*a2 */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);	/* a0 + 2*as2 */
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 = B(1) and bsm1 = |B(-1)|, exactly as for the a operand;
     vm1_neg is xor-ed so it ends up as the sign of A(-1)*B(-1).  */
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = B(2), mirroring the as2 computation.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  /* High-limb bounds promised by the evaluation-point comment above.  */
  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

  /* Where each point product lives; v0/v1/vinf land directly in pp, while
     vm1/v2 go in scratch.  Sizes in limbs are noted per line.  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+t */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Multiply only the low n limbs, then fold in the high limbs (each at
     most 1) by addition.  */
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  /* Fold in the high limbs of as1/bs1 (each at most 2).  */
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* The (n+1)-limb recursive product writes 2n+2 limbs, clobbering
     vinf[1]; save and restore it around the call.  */
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  /* v0 last, since bs1/as2/bs2 lived in this part of pp.  */
  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);	/* v0, 2n limbs */

  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}