/* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (4/3)bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
*/


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf

  <-s--><--n--><--n--><--n-->
   ____ ______ ______ ______
  |_a3_|___a2_|___a1_|___a0_|
   |b3_|___b2_|___b1_|___b0_|
   <-t-><--n--><--n--><--n-->

  v0   =   a0             *  b0              #    A(0)*B(0)
  v1   = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) #    A(1)*B(1)    ah  <= 3   bh  <= 3
  vm1  = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) #   A(-1)*B(-1)  |ah| <= 1  |bh| <= 1
  v2   = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) #    A(2)*B(2)    ah  <= 14  bh  <= 14
  vm2  = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) #   A(-2)*B(-2)  |ah| <= 9  |bh| <= 9
  vh   = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) #  A(1/2)*B(1/2)  ah  <= 14  bh  <= 14
  vinf =               a3 *              b3  # A(inf)*B(inf)
*/

/* When tuning, every sub-algorithm may be reached; otherwise a threshold
   may prove statically that some branches of TOOM44_MUL_N_REC below are
   dead, and the compiler can drop them.  */
#if TUNE_PROGRAM_BUILD
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom22 1
#define MAYBE_mul_toom44 1
#else
#define MAYBE_mul_basecase						\
  (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom22						\
  (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD)
#define MAYBE_mul_toom44						\
  (MUL_FFT_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD)
#endif

/* Recursive n x n multiplication used for the seven point products:
   dispatch on n to basecase, toom22, toom33 or toom44.  ws is scratch
   for the recursive call.  */
#define TOOM44_MUL_N_REC(p, a, b, n, ws)				\
  do {									\
    if (MAYBE_mul_basecase						\
	&& BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))			\
      mpn_mul_basecase (p, a, n, b, n);					\
    else if (MAYBE_mul_toom22						\
	     && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))		\
      mpn_toom22_mul (p, a, n, b, n, ws);				\
    else if (! MAYBE_mul_toom44						\
	     || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD))		\
      mpn_toom33_mul (p, a, n, b, n, ws);				\
    else								\
      mpn_toom44_mul (p, a, n, b, n, ws);				\
  } while (0)

/* Use of scratch space.  In the product area, we store

      ___________________
     |vinf|____|_v1_|_v0_|
      s+t  2n-1 2n+1  2n

   The other recursive products, vm1, v2, vm2, vh are stored in the
   scratch area.  When computing them, we use the product area for
   intermediate values.

   Next, we compute v1.  We can store the intermediate factors at v0
   and at vh + 2n + 2.
   Finally, for v0 and vinf, factors are parts of the input operands,
   and we need scratch space only for the recursive multiplication.

   In all, if S(an) is the scratch need, the needed space is bounded by

     S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1)

   which should give S(n) = 8 n/3 + c log(n) for some constant c.  */

/* Toom-4 multiplication: write the an+bn limb product of {ap,an} and
   {bp,bn} to {pp, an+bn}.  Requires bn <= an < (4/3)bn (enforced by
   the s/t ASSERTs below).  scratch must provide at least the space
   derived above (8n+5 plus recursion, n = ceil(an/4)).  */
void
mpn_toom44_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  mp_limb_t cy;
  enum toom7_flags flags;

  /* The four degree-3 coefficients of each operand; a3/b3 are the
     short (s/t limb) high parts.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2*n)
#define b3  (bp + 3*n)

  ASSERT (an >= bn);

  /* n = ceil(an/4); s and t are the lengths of the top coefficients.  */
  n = (an + 3) >> 2;

  s = an - 3 * n;
  t = bn - 3 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
  ASSERT (s >= t);

  /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the
   * following limb, so these must be computed in order, and we need a
   * one limb gap to tp. */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t */
#define v2    scratch				/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define vh    (scratch + 4 * n + 2)		/* 2n+1 */
#define vm1   (scratch + 6 * n + 3)		/* 2n+1 */
#define tp (scratch + 8*n + 5)

  /* Evaluation values live temporarily in the product area; apx and
     bpx must not overlap with v1 since v1 is multiplied from them last.  */
#define apx   pp				/* n+1 */
#define amx   (pp + n + 1)			/* n+1 */
#define bmx   (pp + 2*n + 2)			/* n+1 */
#define bpx   (pp + 4*n + 2)			/* n+1 */

  /* Total scratch need: 8*n + 5 + scratch for recursive calls.  This
     gives roughly 32 n/3 + log term. */

  /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3.
     The helper returns a sign bit; the toom7_w1_neg flag records the sign
     of A(-2)*B(-2), hence the ^= when folding in B's sign below.  */
  flags = toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp);

  /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3.  */
  flags ^= toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp);

  TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp);	/* v2, 2n+1 limbs */
  TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp);	/* vm2, 2n+1 limbs */

  /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = ((2*a0 + a1) * 2 + a2) * 2 + a3
     by Horner's rule; the carry out of each doubling step is at most 1 bit,
     accumulated into cy.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (apx, a1, a0, n);
  cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n);
  if (s < n)
    {
      mp_limb_t cy2;
      /* a3 is only s limbs; shift the remaining n-s limbs separately.  */
      cy2 = mpn_addlsh1_n (apx, a3, apx, s);
      apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1);
      MPN_INCR_U (apx + s, n+1-s, cy2);
    }
  else
    apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n);
#else
  cy = mpn_lshift (apx, a0, n, 1);
  cy += mpn_add_n (apx, apx, a1, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  cy += mpn_add_n (apx, apx, a2, n);
  cy = 2*cy + mpn_lshift (apx, apx, n, 1);
  apx[n] = cy + mpn_add (apx, apx, n, a3, s);
#endif

  /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = ((2*b0 + b1) * 2 + b2) * 2 + b3,
     same scheme as for apx above.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bpx, b1, b0, n);
  cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n);
  if (t < n)
    {
      mp_limb_t cy2;
      cy2 = mpn_addlsh1_n (bpx, b3, bpx, t);
      bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1);
      MPN_INCR_U (bpx + t, n+1-t, cy2);
    }
  else
    bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n);
#else
  cy = mpn_lshift (bpx, b0, n, 1);
  cy += mpn_add_n (bpx, bpx, b1, n);
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
  cy += mpn_add_n (bpx, bpx, b2, n);
  cy = 2*cy + mpn_lshift (bpx, bpx, n, 1);
  bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t);
#endif

  /* High limbs bounded by 8+4+2+1 = 15, per the evaluation table above.  */
  ASSERT (apx[n] < 15);
  ASSERT (bpx[n] < 15);

  TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp);	/* vh, 2n+1 limbs */

  /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3.  */
  flags |= toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp);

  /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3.  */
  flags ^= toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp);

  TOOM44_MUL_N_REC (vm1, amx, bmx, n + 1, tp);	/* vm1, 2n+1 limbs */
  /* Clobbers amx, bmx. */
  TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp);	/* v1, 2n+1 limbs */

  /* v0 = a0 * b0 and vinf = a3 * b3, directly into the product area.  */
  TOOM44_MUL_N_REC (v0, a0, b0, n, tp);
  if (s > t)
    mpn_mul (vinf, a3, s, b3, t);
  else
    TOOM44_MUL_N_REC (vinf, a3, b3, s, tp);	/* vinf, s+t limbs */

  /* Combine the seven point values into the final 8n-coefficient result.  */
  mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp);
}