/* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3
   times as large as bn.  Or more accurately, bn < an < 2 bn.

   Contributed to the GNU project by Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
*/


#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -2, -1, 0, +1, +2, +inf

  <-s-><--n--><--n--><--n-->
   ___ ______ ______ ______
  |a3_|___a2_|___a1_|___a0_|
	|_b2_|___b1_|___b0_|
	<-t--><--n--><--n-->

   v0   =  a0             * b0           #  A(0)*B(0)
   v1   = (a0+ a1+ a2+ a3)*(b0+ b1+ b2)  #  A(1)*B(1)      ah  <= 3   bh  <= 2
   vm1  = (a0- a1+ a2- a3)*(b0- b1+ b2)  #  A(-1)*B(-1)   |ah| <= 1  |bh| <= 1
   v2   = (a0+2a1+4a2+8a3)*(b0+2b1+4b2)  #  A(2)*B(2)      ah  <= 14  bh  <= 6
   vm2  = (a0-2a1+4a2-8a3)*(b0-2b1+4b2)  #  A(-2)*B(-2)   |ah| <= 9  |bh| <= 4
   vinf =              a3 *         b2   #  A(inf)*B(inf)
*/

/* Multiply {ap,an} by {bp,bn} and write the an+bn limb product to pp,
   for bn < an < 2 bn.  A is split into 4 parts and B into 3 parts of
   n limbs each (the high parts have s resp. t limbs); the 6 point
   values above are then combined by mpn_toom_interpolate_6pts.
   scratch must provide the working space described in the comment
   below the v0..as1 defines.  */
void
mpn_toom43_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
{
  mp_size_t n, s, t;
  enum toom6_flags flags;	/* tracks the signs of vm1 and vm2 */
  mp_limb_t cy;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define a3  (ap + 3 * n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2 * n)

  /* Part size: ceil(an/4) when A is relatively long, else ceil(bn/3),
     so that both operands split into parts of at most n limbs.  */
  n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3);

  s = an - 3 * n;		/* limbs in the high part a3 */
  t = bn - 2 * n;		/* limbs in the high part b2 */

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  /* This is true whenever an >= 25 or bn >= 19, I think. It
     guarantees that we can fit 5 values of size n+1 in the product
     area. */
  ASSERT (s+t >= 5);

  /* Layout of evaluation points and temporaries.  Note that several
     temporaries deliberately live in the product area pp; the
     multiplication order further down is chosen so each one is
     consumed before pp is overwritten at that position.  */
#define v0    pp				/* 2n */
#define vm1   (scratch)				/* 2n+1 */
#define v1    (pp + 2*n)			/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define v2    (scratch + 4 * n + 2)		/* 2n+1 */
#define vinf  (pp + 5 * n)			/* s+t */
#define bs1    pp				/* n+1 */
#define bsm1  (scratch + 2 * n + 2)		/* n+1 */
#define asm1  (scratch + 3 * n + 3)		/* n+1 */
#define asm2  (scratch + 4 * n + 4)		/* n+1 */
#define bsm2  (pp + n + 1)			/* n+1 */
#define bs2   (pp + 2 * n + 2)			/* n+1 */
#define as2   (pp + 3 * n + 3)			/* n+1 */
#define as1   (pp + 4 * n + 4)			/* n+1 */

  /* Total scratch need is 6 * n + 3 + 1; we allocate one extra
     limb, because products will overwrite 2n+2 limbs.  */

#define a0a2  scratch
#define b0b2  scratch
#define a1a3  asm1
#define b1d   bsm1

  /* Compute as2 and asm2.  The helper returns nonzero when A(-2) < 0;
     mask with toom6_vm2_neg to record the sign of vm2.  */
  flags = toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3);

  /* Compute bs2 and bsm2, i.e. B(2) = (b0 + 4 b2) + 2 b1 and
     B(-2) = +-((b0 + 4 b2) - 2 b1).  */
  b1d[n]  = mpn_lshift (b1d, b1, n, 1);		/*       2b1      */
  cy  = mpn_lshift (b0b2, b2, t, 2);		/* 4b2            */
  cy += mpn_add_n (b0b2, b0b2, b0, t);		/* 4b2      + b0  */
  if (t != n)
    cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
  b0b2[n] = cy;

#if HAVE_NATIVE_mpn_add_n_sub_n
  if (mpn_cmp (b0b2, b1d, n+1) < 0)
    {
      /* 2b1 > b0+4b2, so B(-2) is negative; subtract the other way
	 and flip the recorded sign of vm2.  */
      mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1);
      flags ^= toom6_vm2_neg;
    }
  else
    {
      mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1);
    }
#else
  mpn_add_n (bs2, b0b2, b1d, n+1);
  if (mpn_cmp (b0b2, b1d, n+1) < 0)
    {
      mpn_sub_n (bsm2, b1d, b0b2, n+1);
      flags ^= toom6_vm2_neg;
    }
  else
    {
      mpn_sub_n (bsm2, b0b2, b1d, n+1);
    }
#endif

  /* Compute as1 and asm1.  XOR (not AND) since flags already carries
     the vm2 sign; the helper reports the sign of A(-1).  */
  flags ^= toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2);

  /* Compute bs1 and bsm1, i.e. B(1) = (b0 + b2) + b1 and
     B(-1) = +-((b0 + b2) - b1).  bsm1 initially holds b0 + b2.  */
  bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
    {
      /* b1 > b0+b2, so B(-1) is negative.  The low bit of the
	 add_n_sub_n return is the subtraction borrow, the next bit
	 the addition carry.  */
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n);
      bs1[n] = cy >> 1;
      flags ^= toom6_vm1_neg;
    }
  else
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n);
      bs1[n] = bsm1[n] + (cy >> 1);
      bsm1[n]-= cy & 1;
    }
#else
  bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
  if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, bsm1, n);
      flags ^= toom6_vm1_neg;
    }
  else
    {
      bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
    }
#endif

  /* High-limb bounds from the evaluation formulas in the header
     comment above.  */
  ASSERT (as1[n] <= 3);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <=14);
  ASSERT (bs2[n] <= 6);
  ASSERT (asm2[n] <= 9);
  ASSERT (bsm2[n] <= 4);

  /* The recursive products below are ordered so that each temporary
     stored in pp (bsm2, bs2, as2, as1, bs1) is read before the
     product writes clobber its location.  */

  /* vm1, 2n+1 limbs */
  mpn_mul_n (vm1, asm1, bsm1, n+1);  /* W4 */

  /* vm2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */

  /* v2, 2n+1 limbs */
  mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */

  /* v1, 2n+1 limbs */
  mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */

  /* vinf, s+t limbs */  /* W0 */
  if (s > t)  mpn_mul (vinf, a3, s, b2, t);
  else        mpn_mul (vinf, b2, t, a3, s);

  /* v0, 2n limbs */
  mpn_mul_n (v0, ap, bp, n);  /* W5 */

  /* Combine the 6 point values into the final product in pp.  */
  mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);

#undef v0
#undef vm1
#undef v1
#undef vm2
#undef v2
#undef vinf
#undef bs1
#undef bs2
#undef bsm1
#undef bsm2
#undef asm1
#undef asm2
/* #undef as1 */
/* #undef as2 */
#undef a0a2
#undef b0b2
#undef a1a3
#undef b1d
#undef a0
#undef a1
#undef a2
#undef a3
#undef b0
#undef b1
#undef b2
}