/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5
   times as large as bn.  Or more accurately, bn < an < 3bn.

   Contributed to the GNU project by Torbjorn Granlund.

   The idea of applying Toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
  Things to work on:

  1. Trim allocation.  The allocations for as1, asm1, bs1, and bsm1 could be
     avoided by instead reusing the pp area and the scratch allocation.

  2. Apply optimizations also to mul_toom42.c.
*/

#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, 0, +1, +inf

  <-s-><--n--><--n-->
   ___ ______ ______
  |a2_|___a1_|___a0_|
        |_b1_|___b0_|
        <-t--><--n-->

  v0   =  a0         * b0       #    A(0)*B(0)
  v1   = (a0+ a1+ a2)*(b0+ b1)  #    A(1)*B(1)     ah  <= 2  bh <= 1
  vm1  = (a0- a1+ a2)*(b0- b1)  #  A(-1)*B(-1)    |ah| <= 1  bh  = 0
  vinf =          a2 *     b1   # A(inf)*B(inf)
*/

#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_toom22   1
#else
#define MAYBE_mul_toom22 \
  (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD)
#endif

#define TOOM22_MUL_N_REC(p, a, b, n, ws)                        \
  do {                                                          \
    if (! MAYBE_mul_toom22                                      \
        || BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD))        \
      mpn_mul_basecase (p, a, n, b, n);                         \
    else                                                        \
      mpn_toom22_mul (p, a, n, b, n, ws);                       \
  } while (0)

void
mpn_toom32_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg;
#if HAVE_NATIVE_mpn_add_nc
  mp_limb_t cy;
#else
  mp_limb_t cy, cy2;
#endif
  mp_ptr a0_a2;
  mp_ptr as1, asm1;
  mp_ptr bs1, bsm1;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define b0  bp
#define b1  (bp + n)

  n = 1 + (2 * an >= 3 * bn ? (an - 1) / (size_t) 3 : (bn - 1) >> 1);

  s = an - 2 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);

  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n);

  a0_a2 = pp;
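  /* Illustrative example of the split (added note, not in the original
     source): an = 12, bn = 9 takes the second branch above (2*12 < 3*9),
     giving n = 1 + (9 - 1)/2 = 5, hence s = 12 - 2*5 = 2 and t = 9 - 5 = 4;
     an = 14, bn = 9 takes the first branch, giving n = 1 + (14 - 1)/3 = 5
     and s = t = 4.  Both satisfy 0 < s,t <= n, as the assertions above
     demand.  */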
  /* Compute as1 and asm1.  */
  a0_a2[n] = mpn_add (a0_a2, a0, n, a2, s);
#if HAVE_NATIVE_mpn_addsub_n
  if (a0_a2[n] == 0 && mpn_cmp (a0_a2, a1, n) < 0)
    {
      cy = mpn_addsub_n (as1, asm1, a1, a0_a2, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy = mpn_addsub_n (as1, asm1, a0_a2, a1, n);
      as1[n] = a0_a2[n] + (cy >> 1);
      asm1[n] = a0_a2[n] - (cy & 1);
      vm1_neg = 0;
    }
#else
  as1[n] = a0_a2[n] + mpn_add_n (as1, a0_a2, a1, n);
  if (a0_a2[n] == 0 && mpn_cmp (a0_a2, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, a0_a2, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy = mpn_sub_n (asm1, a0_a2, a1, n);
      asm1[n] = a0_a2[n] - cy;
      vm1_neg = 0;
    }
#endif

  /* Compute bs1 and bsm1.  */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_addsub_n
      if (mpn_cmp (b0, b1, n) < 0)
        {
          cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
        }
      bs1[n] = cy >> 1;
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);

      if (mpn_cmp (b0, b1, n) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, n);
          vm1_neg ^= 1;
        }
      else
        {
          mpn_sub_n (bsm1, b0, b1, n);
        }
#endif
    }
  else
    {
      bs1[n] = mpn_add (bs1, b0, n, b1, t);

      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
        {
          mpn_sub_n (bsm1, b1, b0, t);
          MPN_ZERO (bsm1 + t, n - t);
          vm1_neg ^= 1;
        }
      else
        {
          mpn_sub (bsm1, b0, n, b1, t);
        }
    }

  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 1);
/*ASSERT (bsm1[n] == 0); */

#define v0    pp                        /* 2n */
#define v1    (scratch)                 /* 2n+1 */
#define vinf  (pp + 3 * n)              /* s+t */
#define vm1   (scratch + 2 * n + 1)     /* 2n+1 */
#define scratch_out  (scratch + 4 * n + 2)

  /* vm1, 2n+1 limbs */
  TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  vm1[2 * n] = cy;

  /* vinf, s+t limbs */
  if (s > t)
    mpn_mul (vinf, a2, s, b1, t);
  else
    mpn_mul (vinf, b1, t, a2, s);

  /* v1, 2n+1 limbs */
  TOOM22_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] != 0)
    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
  v1[2 * n] = cy;

  mpn_mul_n (v0, ap, bp, n);            /* v0, 2n limbs */

  /* Interpolate */

  if (vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (vm1, v1, vm1, 2 * n + 1);
#else
      mpn_add_n (vm1, v1, vm1, 2 * n + 1);
      mpn_rshift (vm1, vm1, 2 * n + 1, 1);
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (vm1, v1, vm1, 2 * n + 1);
#else
      mpn_sub_n (vm1, v1, vm1, 2 * n + 1);
      mpn_rshift (vm1, vm1, 2 * n + 1, 1);
#endif
    }

  mpn_sub_n (v1, v1, vm1, 2 * n + 1);
  v1[2 * n] -= mpn_sub_n (v1, v1, v0, 2 * n);

  /*
    pp[] prior to operations:
     |_H vinf|_L vinf|_______|_______|_______|

    summation scheme for remaining operations:
     |_______|_______|_______|_______|_______|
     |_Hvinf_|_Lvinf_|       |_H v0__|_L v0__|
                     | H vm1 | L vm1 |
                     |-H vinf|-L vinf|
             | H v1  | L v1  |
  */

  mpn_sub (vm1, vm1, 2 * n + 1, vinf, s + t);
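  /* Recap of the interpolation (added note, not in the original source),
     with X = 2^(n*GMP_NUMB_BITS) and A*B = c0 + c1*X + c2*X^2 + c3*X^3:
     the evaluations gave v0 = c0, vinf = c3, v1 = A(1)*B(1), and
     |A(-1)*B(-1)| in vm1 with the sign recorded in vm1_neg.  The rsh1 step
     turned vm1 into (v1 - vm1)/2 = c1 + c3, the two subtractions from v1
     left v1 = c2, and the mpn_sub just above leaves vm1 = c1.  The
     additions below assemble the ci into pp[] following the summation
     scheme drawn above.  */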
#if HAVE_NATIVE_mpn_add_nc
  cy = mpn_add_n (pp + n, pp + n, vm1, n);
  cy = mpn_add_nc (pp + 2 * n, v1, vm1 + n, n, cy);
  cy = mpn_add_nc (pp + 3 * n, pp + 3 * n, v1 + n, n, cy);
  mpn_incr_u (pp + 3 * n, vm1[2 * n]);
  if (LIKELY (n != s + t))      /* FIXME: Limit operand range to avoid condition */
    mpn_incr_u (pp + 4 * n, cy + v1[2 * n]);
#else
  cy2 = mpn_add_n (pp + n, pp + n, vm1, n);
  cy = mpn_add_n (pp + 2 * n, v1, vm1 + n, n);
  mpn_incr_u (pp + 2 * n, cy2);
  mpn_incr_u (pp + 3 * n, cy + vm1[2 * n]);
  cy = mpn_add_n (pp + 3 * n, pp + 3 * n, v1 + n, n);
  if (LIKELY (n != s + t))      /* FIXME: Limit operand range to avoid condition */
    mpn_incr_u (pp + 4 * n, cy + v1[2 * n]);
#endif

  TMP_FREE;
}
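/* Usage sketch (added note, not in the original source): this function is
   internal with a mutable interface and should be reached through mpn_mul,
   which selects a multiplication algorithm from the operand sizes.  For
   example, with operands of 12 and 9 limbs:

     mp_limb_t ap[12], bp[9], pp[12 + 9];
     ... fill ap[] and bp[] with the operands, least significant limb
     first, then:
     mpn_mul (pp, ap, 12, bp, 9);    product ends up in pp[0] through pp[20]

   A direct call to mpn_toom32_mul additionally needs a scratch area: per
   the #defines above, v1 and vm1 occupy 4*n + 2 limbs of scratch, followed
   by whatever the recursive TOOM22_MUL_N_REC calls consume through
   scratch_out.  */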