1 /* mpn_toom4_sqr -- Square {ap,an}. 2 3 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2006, 2007, 2008 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of the GNU Lesser General Public License as published by 15 the Free Software Foundation; either version 3 of the License, or (at your 16 option) any later version. 17 18 The GNU MP Library is distributed in the hope that it will be useful, but 19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 21 License for more details. 22 23 You should have received a copy of the GNU Lesser General Public License 24 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 25 26 27 /* 28 Things to work on: 29 30 1. Trim allocation. The allocations for as1, asm1, bs1, and bsm1 could be 31 avoided by instead reusing the pp area and the scratch area. 32 2. Use new toom functions for the recursive calls. 
*/

#include "gmp.h"
#include "gmp-impl.h"

/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf

  <-s--><--n--><--n--><--n-->
   ____ ______ ______ ______
  |_a3_|___a2_|___a1_|___a0_|
       |b3_|___b2_|___b1_|___b0_|
       <-t-><--n--><--n--><--n-->

  v0   =   a0              *   b0              #  A(0)*B(0)
  v1   = ( a0+ a1+ a2+ a3) * ( b0+ b1+ b2+ b3) #  A(1)*B(1)       ah <= 3    bh <= 3
  vm1  = ( a0- a1+ a2- a3) * ( b0- b1+ b2- b3) #  A(-1)*B(-1)    |ah| <= 1  |bh| <= 1
  v2   = ( a0+2a1+4a2+8a3) * ( b0+2b1+4b2+8b3) #  A(2)*B(2)       ah <= 14   bh <= 14
  vh   = (8a0+4a1+2a2+ a3) * (8b0+4b1+2b2+ b3) #  A(1/2)*B(1/2)   ah <= 14   bh <= 14
  vmh  = (8a0-4a1+2a2- a3) * (8b0-4b1+2b2- b3) #  A(-1/2)*B(-1/2) -4<=ah<=9 -4<=bh<=9
  vinf =               a3 *           b3       #  A(inf)*B(inf)

  (For squaring, B = A; the bh bounds mirror the ah bounds.)
*/

/* Decide, at compile time, which recursive squaring routines can ever be
   reached from here, so the dispatch macro below can drop dead branches.
   When building the tune program all routes must stay reachable.  */
#if TUNE_PROGRAM_BUILD
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom2 1
#define MAYBE_sqr_toom4 1
#else
#define MAYBE_sqr_basecase						\
  (SQR_TOOM4_THRESHOLD < 4 * SQR_KARATSUBA_THRESHOLD)
#define MAYBE_sqr_toom2							\
  (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD)
#define MAYBE_sqr_toom4							\
  (SQR_FFT_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD)
#endif

/* Recursive squaring step: pick the cheapest routine for size n.  */
#define TOOM4_SQR_N_REC(p, a, n, ws)					\
  do {									\
    if (MAYBE_sqr_basecase						\
	&& BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))		\
      mpn_sqr_basecase (p, a, n);					\
    else if (MAYBE_sqr_toom2						\
	     && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
      mpn_kara_sqr_n (p, a, n, ws);					\
    else if (! MAYBE_sqr_toom4						\
	     || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))		\
      mpn_toom3_sqr_n (p, a, n, ws);					\
    else								\
      mpn_toom4_sqr (p, a, n, ws);					\
  } while (0)

/* Square {ap,an}, writing the 2*an-limb result to {pp,2*an}.

   pp       result area, 2*an limbs.
   ap,an    operand to square.
   scratch  workspace; holds vm1, v2, vmh and the recursion scratch
	    (see the v* macros below for the layout).
	    NOTE(review): the layout uses up to 8*n+8 limbs before
	    scratch_out, plus whatever TOOM4_SQR_N_REC needs beyond that —
	    confirm the caller allocates per the matching itch formula.  */
void
mpn_toom4_sqr (mp_ptr pp,
	       mp_srcptr ap, mp_size_t an,
	       mp_ptr scratch)
{
  mp_size_t n, s;
  mp_limb_t cy;
  mp_ptr gp, hp;
  mp_ptr as1, asm1, as2, ash, asmh;
  TMP_DECL;

  /* Split the operand into four pieces: a0, a1, a2 of n limbs each and a
     top piece a3 of s limbs, 0 < s <= n.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)

  n = (an + 3) >> 2;

  s = an - 3 * n;

  ASSERT (0 < s && s <= n);

  TMP_MARK;

  /* Each evaluation-point value fits in n limbs plus one high limb
     (bounded by the ah limits in the table above).  */
  as1 = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2 = TMP_SALLOC_LIMBS (n + 1);
  ash = TMP_SALLOC_LIMBS (n + 1);
  asmh = TMP_SALLOC_LIMBS (n + 1);

  /* Borrow the start of the product area for the even/odd partial sums;
     it is not needed for output until the recursive calls below.  */
  gp = pp;
  hp = pp + n + 1;

  /* Compute as1 = A(1) = a0+a1+a2+a3 and asm1 = |A(-1)| = |a0-a1+a2-a3|.
     The sign of A(-1) is irrelevant: vm1 is its square.  */
  gp[n] = mpn_add_n (gp, a0, a2, n);	    /* even terms: a0 + a2 */
  hp[n] = mpn_add (hp, a1, n, a3, s);	    /* odd terms:  a1 + a3 */
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, hp, gp, n + 1);
    }
  else
    {
      mpn_addsub_n (as1, asm1, gp, hp, n + 1);
    }
#else
  mpn_add_n (as1, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asm1, hp, gp, n + 1);
    }
  else
    {
      mpn_sub_n (asm1, gp, hp, n + 1);
    }
#endif

  /* Compute as2 = A(2) = a0 + 2 a1 + 4 a2 + 8 a3, by three Horner steps
     (each step doubles the running value and adds the next piece; the
     carry out of each step is worth 2 on the next).  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a2, a3, s);	    /* a2 + 2 a3 (low s limbs) */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_lshift (as2, a3, s, 1);	    /* 2 a3 */
  cy += mpn_add_n (as2, a2, as2, s);	    /* a2 + 2 a3 (low s limbs) */
  if (s != n)
    cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a1, as2, n);	    /* a1 + 2 a2 + 4 a3 */
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a0, as2, n);	    /* a0 + 2 a1 + 4 a2 + 8 a3 */
#endif
  as2[n] = cy;

  /* Compute ash = 8 A(1/2) = 8 a0 + 4 a1 + 2 a2 + a3 and
     asmh = |8 A(-1/2)| = |8 a0 - 4 a1 + 2 a2 - a3| from the even part
     gp = 8 a0 + 2 a2 and odd part hp = 4 a1 + a3.  */
  cy = mpn_lshift (gp, a0, n, 3);	    /* 8 a0 */
#if HAVE_NATIVE_mpn_addlsh1_n
  gp[n] = cy + mpn_addlsh1_n (gp, gp, a2, n); /* 8 a0 + 2 a2 */
#else
  cy += mpn_lshift (hp, a2, n, 1);	    /* 2 a2 */
  gp[n] = cy + mpn_add_n (gp, gp, hp, n);   /* 8 a0 + 2 a2 */
#endif
  cy = mpn_lshift (hp, a1, n, 2);	    /* 4 a1 */
  hp[n] = cy + mpn_add (hp, hp, n, a3, s);  /* 4 a1 + a3 */
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_addsub_n (ash, asmh, hp, gp, n + 1);
    }
  else
    {
      mpn_addsub_n (ash, asmh, gp, hp, n + 1);
    }
#else
  mpn_add_n (ash, gp, hp, n + 1);
  if (mpn_cmp (gp, hp, n + 1) < 0)
    {
      mpn_sub_n (asmh, hp, gp, n + 1);
    }
  else
    {
      mpn_sub_n (asmh, gp, hp, n + 1);
    }
#endif

  /* High-limb bounds from the evaluation table above.  */
  ASSERT (as1[n] <= 3);
  ASSERT (asm1[n] <= 1);
  ASSERT (as2[n] <= 14);
  ASSERT (ash[n] <= 14);
  ASSERT (asmh[n] <= 9);

  /* Layout of the seven point values.  v0, vh and vinf live in the
     product area; vm1, v2 and vmh in the scratch area.  */
#define v0    pp				/* 2n */
#define v1    (scratch + 6 * n + 6)		/* 2n+1 */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 2)		/* 2n+1 */
#define vinf  (pp + 6 * n)			/* 2s */
#define vh    (pp + 2 * n)			/* 2n+1 */
#define vmh   (scratch + 4 * n + 4)		/* 2n+1 */
#define scratch_out  (scratch + 8 * n + 8)

  /* Square each evaluation point.  vm1 and v2 (in scratch) go first so
     that gp/hp in the product area stay valid as long as needed.  */
  TOOM4_SQR_N_REC (vm1, asm1, n + 1, scratch_out);	/* vm1, 2n+1 limbs */

  TOOM4_SQR_N_REC (v2 , as2 , n + 1, scratch_out);	/* v2, 2n+1 limbs */

  TOOM4_SQR_N_REC (vinf, a3 , s, scratch_out);		/* vinf, 2s limbs */

  TOOM4_SQR_N_REC (v1 , as1 , n + 1, scratch_out);	/* v1, 2n+1 limbs */

  TOOM4_SQR_N_REC (vh , ash , n + 1, scratch_out);	/* vh, 2n+1 limbs */

  TOOM4_SQR_N_REC (vmh, asmh, n + 1, scratch_out);	/* vmh, 2n+1 limbs */

  TOOM4_SQR_N_REC (v0 , ap , n , scratch_out);		/* v0, 2n limbs */

  /* Recombine.  The sign flag is 0: for a square, A(-1)^2 and A(-1/2)^2
     are both non-negative.  */
  mpn_toom_interpolate_7pts (pp, n, 0, vmh, vm1, v1, v2, s + s, scratch_out);

  TMP_FREE;
}