/* mpn_toom42_mul -- Multiply {ap,an} and {bp,bn} where an is nominally twice
   as large as bn.  Or more accurately, (3/2)bn < an < 4bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to
   Marco Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
  Things to work on:

  1. Trim allocation.  The allocations for as1, asm1, bs1, and bsm1 could be
     avoided by instead reusing the pp area and the scratch allocation.

  2. Apply optimizations also to mul_toom32.c.
39 */ 40 41 #include "gmp.h" 42 #include "gmp-impl.h" 43 44 /* Evaluate in: -1, 0, +1, +2, +inf 45 46 <-s-><--n--><--n--><--n--> 47 ___ ______ ______ ______ 48 |a3_|___a2_|___a1_|___a0_| 49 |_b1_|___b0_| 50 <-t--><--n--> 51 52 v0 = a0 * b0 # A(0)*B(0) 53 v1 = (a0+ a1+ a2+ a3)*(b0+ b1) # A(1)*B(1) ah <= 3 bh <= 1 54 vm1 = (a0- a1+ a2- a3)*(b0- b1) # A(-1)*B(-1) |ah| <= 1 bh = 0 55 v2 = (a0+2a1+4a2+8a3)*(b0+2b1) # A(2)*B(2) ah <= 14 bh <= 2 56 vinf= a3 * b1 # A(inf)*B(inf) 57 */ 58 59 #if TUNE_PROGRAM_BUILD 60 #define MAYBE_mul_toom22 1 61 #else 62 #define MAYBE_mul_toom22 \ 63 (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD) 64 #endif 65 66 #define TOOM22_MUL_N_REC(p, a, b, n, ws) \ 67 do { \ 68 if (! MAYBE_mul_toom22 \ 69 || BELOW_THRESHOLD (n, MUL_KARATSUBA_THRESHOLD)) \ 70 mpn_mul_basecase (p, a, n, b, n); \ 71 else \ 72 mpn_toom22_mul (p, a, n, b, n, ws); \ 73 } while (0) 74 75 void 76 mpn_toom42_mul (mp_ptr pp, 77 mp_srcptr ap, mp_size_t an, 78 mp_srcptr bp, mp_size_t bn, 79 mp_ptr scratch) 80 { 81 mp_size_t n, s, t; 82 int vm1_neg; 83 mp_limb_t cy, vinf0; 84 mp_ptr a0_a2, a1_a3; 85 mp_ptr as1, asm1, as2; 86 mp_ptr bs1, bsm1, bs2; 87 TMP_DECL; 88 89 #define a0 ap 90 #define a1 (ap + n) 91 #define a2 (ap + 2*n) 92 #define a3 (ap + 3*n) 93 #define b0 bp 94 #define b1 (bp + n) 95 96 n = an >= 2 * bn ? (an + 3) >> 2 : (bn + 1) >> 1; 97 98 s = an - 3 * n; 99 t = bn - n; 100 101 ASSERT (0 < s && s <= n); 102 ASSERT (0 < t && t <= n); 103 104 TMP_MARK; 105 106 as1 = TMP_SALLOC_LIMBS (n + 1); 107 asm1 = TMP_SALLOC_LIMBS (n + 1); 108 as2 = TMP_SALLOC_LIMBS (n + 1); 109 110 bs1 = TMP_SALLOC_LIMBS (n + 1); 111 bsm1 = TMP_SALLOC_LIMBS (n); 112 bs2 = TMP_SALLOC_LIMBS (n + 1); 113 114 a0_a2 = pp; 115 a1_a3 = pp + n + 1; 116 117 /* Compute as1 and asm1. 
*/ 118 a0_a2[n] = mpn_add_n (a0_a2, a0, a2, n); 119 a1_a3[n] = mpn_add (a1_a3, a1, n, a3, s); 120 #if HAVE_NATIVE_mpn_addsub_n 121 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) 122 { 123 mpn_addsub_n (as1, asm1, a1_a3, a0_a2, n + 1); 124 vm1_neg = 1; 125 } 126 else 127 { 128 mpn_addsub_n (as1, asm1, a0_a2, a1_a3, n + 1); 129 vm1_neg = 0; 130 } 131 #else 132 mpn_add_n (as1, a0_a2, a1_a3, n + 1); 133 if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0) 134 { 135 mpn_sub_n (asm1, a1_a3, a0_a2, n + 1); 136 vm1_neg = 1; 137 } 138 else 139 { 140 mpn_sub_n (asm1, a0_a2, a1_a3, n + 1); 141 vm1_neg = 0; 142 } 143 #endif 144 145 /* Compute as2. */ 146 #if HAVE_NATIVE_mpn_addlsh1_n 147 cy = mpn_addlsh1_n (as2, a2, a3, s); 148 if (s != n) 149 cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy); 150 cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n); 151 cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); 152 #else 153 cy = mpn_lshift (as2, a3, s, 1); 154 cy += mpn_add_n (as2, a2, as2, s); 155 if (s != n) 156 cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy); 157 cy = 2 * cy + mpn_lshift (as2, as2, n, 1); 158 cy += mpn_add_n (as2, a1, as2, n); 159 cy = 2 * cy + mpn_lshift (as2, as2, n, 1); 160 cy += mpn_add_n (as2, a0, as2, n); 161 #endif 162 as2[n] = cy; 163 164 /* Compute bs1 and bsm1. 
*/ 165 if (t == n) 166 { 167 #if HAVE_NATIVE_mpn_addsub_n 168 if (mpn_cmp (b0, b1, n) < 0) 169 { 170 cy = mpn_addsub_n (bs1, bsm1, b1, b0, n); 171 vm1_neg ^= 1; 172 } 173 else 174 { 175 cy = mpn_addsub_n (bs1, bsm1, b0, b1, n); 176 } 177 bs1[n] = cy >> 1; 178 #else 179 bs1[n] = mpn_add_n (bs1, b0, b1, n); 180 181 if (mpn_cmp (b0, b1, n) < 0) 182 { 183 mpn_sub_n (bsm1, b1, b0, n); 184 vm1_neg ^= 1; 185 } 186 else 187 { 188 mpn_sub_n (bsm1, b0, b1, n); 189 } 190 #endif 191 } 192 else 193 { 194 bs1[n] = mpn_add (bs1, b0, n, b1, t); 195 196 if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) 197 { 198 mpn_sub_n (bsm1, b1, b0, t); 199 MPN_ZERO (bsm1 + t, n - t); 200 vm1_neg ^= 1; 201 } 202 else 203 { 204 mpn_sub (bsm1, b0, n, b1, t); 205 } 206 } 207 208 /* Compute bs2, recycling bs1. bs2=bs1+b1 */ 209 mpn_add (bs2, bs1, n + 1, b1, t); 210 211 ASSERT (as1[n] <= 3); 212 ASSERT (bs1[n] <= 1); 213 ASSERT (asm1[n] <= 1); 214 /*ASSERT (bsm1[n] == 0);*/ 215 ASSERT (as2[n] <= 14); 216 ASSERT (bs2[n] <= 2); 217 218 #define v0 pp /* 2n */ 219 #define v1 (pp + 2 * n) /* 2n+1 */ 220 #define vinf (pp + 4 * n) /* s+t */ 221 #define vm1 scratch /* 2n+1 */ 222 #define v2 (scratch + 2 * n + 1) /* 2n+2 */ 223 #define scratch_out scratch + 4 * n + 4 224 225 /* vm1, 2n+1 limbs */ 226 TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); 227 cy = 0; 228 if (asm1[n] != 0) 229 cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n); 230 vm1[2 * n] = cy; 231 232 TOOM22_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */ 233 234 /* vinf, s+t limbs */ 235 if (s > t) mpn_mul (vinf, a3, s, b1, t); 236 else mpn_mul (vinf, b1, t, a3, s); 237 238 vinf0 = vinf[0]; /* v1 overlaps with this */ 239 240 /* v1, 2n+1 limbs */ 241 TOOM22_MUL_N_REC (v1, as1, bs1, n, scratch_out); 242 if (as1[n] == 1) 243 { 244 cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); 245 } 246 else if (as1[n] == 2) 247 { 248 #if HAVE_NATIVE_mpn_addlsh1_n 249 cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n); 250 #else 
251 cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); 252 #endif 253 } 254 else if (as1[n] == 3) 255 { 256 cy = 3 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(3)); 257 } 258 else 259 cy = 0; 260 if (bs1[n] != 0) 261 cy += mpn_add_n (v1 + n, v1 + n, as1, n); 262 v1[2 * n] = cy; 263 264 TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */ 265 266 mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, 1^vm1_neg, vinf0, scratch + 4 * n + 4); 267 268 TMP_FREE; 269 } 270