1 /* mpn_toom3_sqr -- Square {ap,an}. 2 3 Contributed to the GNU project by Torbjorn Granlund. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2006, 2007, 2008 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of the GNU Lesser General Public License as published by 15 the Free Software Foundation; either version 3 of the License, or (at your 16 option) any later version. 17 18 The GNU MP Library is distributed in the hope that it will be useful, but 19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 21 License for more details. 22 23 You should have received a copy of the GNU Lesser General Public License 24 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 25 26 27 /* 28 Things to work on: 29 30 1. Trim allocation. The allocations for as1 and asm1 could be 31 avoided by instead reusing the pp area and the scratch area. 32 2. Use new toom functions for the recursive calls. 33 */ 34 35 #include "gmp.h" 36 #include "gmp-impl.h" 37 38 /* Evaluate in: -1, 0, +1, +2, +inf 39 40 <-s-><--n--><--n--><--n--> 41 ___ ______ ______ ______ 42 |a3_|___a2_|___a1_|___a0_| 43 |_b1_|___b0_| 44 <-t--><--n--> 45 46 v0 = a0 * b0 # A(0)*B(0) 47 v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2 48 vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1 49 v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6 50 vinf= a2 * b2 # A(inf)*B(inf) 51 */ 52 53 #if TUNE_PROGRAM_BUILD 54 #define MAYBE_sqr_basecase 1 55 #define MAYBE_sqr_toom3 1 56 #else 57 #define MAYBE_sqr_basecase \ 58 (SQR_TOOM3_THRESHOLD < 3 * SQR_KARATSUBA_THRESHOLD) 59 #define MAYBE_sqr_toom3 \ 60 (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD) 61 #endif 62 63 #define TOOM3_SQR_N_REC(p, a, n, ws) \ 64 do { \ 65 if (MAYBE_sqr_basecase \ 66 && BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD)) \ 67 mpn_sqr_basecase (p, a, n); \ 68 else if (! MAYBE_sqr_toom3 \ 69 || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ 70 mpn_kara_sqr_n (p, a, n, ws); \ 71 else \ 72 mpn_toom3_sqr_n (p, a, n, ws); \ 73 } while (0) 74 75 void 76 mpn_toom3_sqr (mp_ptr pp, 77 mp_srcptr ap, mp_size_t an, 78 mp_ptr scratch) 79 { 80 mp_size_t n, s; 81 mp_limb_t cy, vinf0; 82 mp_ptr gp; 83 mp_ptr as1, asm1, as2; 84 TMP_DECL; 85 86 #define a0 ap 87 #define a1 (ap + n) 88 #define a2 (ap + 2*n) 89 90 n = (an + 2) / (size_t) 3; 91 92 s = an - 2 * n; 93 94 ASSERT (0 < s && s <= n); 95 96 TMP_MARK; 97 98 as1 = TMP_SALLOC_LIMBS (n + 1); 99 asm1 = TMP_SALLOC_LIMBS (n + 1); 100 as2 = TMP_SALLOC_LIMBS (n + 1); 101 102 gp = pp; 103 104 /* Compute as1 and asm1. */ 105 cy = mpn_add (gp, a0, n, a2, s); 106 #if HAVE_NATIVE_mpn_addsub_n 107 if (cy == 0 && mpn_cmp (gp, a1, n) < 0) 108 { 109 cy = mpn_addsub_n (as1, asm1, a1, gp, n); 110 as1[n] = 0; 111 asm1[n] = 0; 112 } 113 else 114 { 115 cy2 = mpn_addsub_n (as1, asm1, gp, a1, n); 116 as1[n] = cy + (cy2 >> 1); 117 asm1[n] = cy - (cy & 1); 118 } 119 #else 120 as1[n] = cy + mpn_add_n (as1, gp, a1, n); 121 if (cy == 0 && mpn_cmp (gp, a1, n) < 0) 122 { 123 mpn_sub_n (asm1, a1, gp, n); 124 asm1[n] = 0; 125 } 126 else 127 { 128 cy -= mpn_sub_n (asm1, gp, a1, n); 129 asm1[n] = cy; 130 } 131 #endif 132 133 /* Compute as2. */ 134 #if HAVE_NATIVE_mpn_addlsh1_n 135 cy = mpn_addlsh1_n (as2, a1, a2, s); 136 if (s != n) 137 cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy); 138 cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); 139 #else 140 cy = mpn_lshift (as2, a2, s, 1); 141 cy += mpn_add_n (as2, a1, as2, s); 142 if (s != n) 143 cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy); 144 cy = 2 * cy + mpn_lshift (as2, as2, n, 1); 145 cy += mpn_add_n (as2, a0, as2, n); 146 #endif 147 as2[n] = cy; 148 149 ASSERT (as1[n] <= 2); 150 ASSERT (asm1[n] <= 1); 151 152 #define v0 pp /* 2n */ 153 #define v1 (pp + 2 * n) /* 2n+1 */ 154 #define vinf (pp + 4 * n) /* s+s */ 155 #define vm1 scratch /* 2n+1 */ 156 #define v2 (scratch + 2 * n + 1) /* 2n+2 */ 157 #define scratch_out (scratch + 4 * n + 4) 158 159 /* vm1, 2n+1 limbs */ 160 #ifdef SMALLER_RECURSION 161 TOOM3_SQR_N_REC (vm1, asm1, n, scratch_out); 162 cy = 0; 163 if (asm1[n] != 0) 164 cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n); 165 if (asm1[n] != 0) 166 cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n); 167 vm1[2 * n] = cy; 168 #else 169 TOOM3_SQR_N_REC (vm1, asm1, n + 1, scratch_out); 170 #endif 171 172 TOOM3_SQR_N_REC (v2, as2, n + 1, scratch_out); /* v2, 2n+1 limbs */ 173 174 TOOM3_SQR_N_REC (vinf, a2, s, scratch_out); /* vinf, s+s limbs */ 175 176 vinf0 = vinf[0]; /* v1 overlaps with this */ 177 178 #ifdef SMALLER_RECURSION 179 /* v1, 2n+1 limbs */ 180 TOOM3_SQR_N_REC (v1, as1, n, scratch_out); 181 if (as1[n] == 1) 182 { 183 cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n); 184 } 185 else if (as1[n] != 0) 186 { 187 #if HAVE_NATIVE_mpn_addlsh1_n 188 cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n); 189 #else 190 cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); 191 #endif 192 } 193 else 194 cy = 0; 195 if (as1[n] == 1) 196 { 197 cy += mpn_add_n (v1 + n, v1 + n, as1, n); 198 } 199 else if (as1[n] != 0) 200 { 201 #if HAVE_NATIVE_mpn_addlsh1_n 202 cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n); 203 #else 204 cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); 205 #endif 206 } 207 v1[2 * n] = cy; 208 #else 209 cy = vinf[1]; 210 TOOM3_SQR_N_REC (v1, as1, n + 1, scratch_out); 211 vinf[1] = cy; 212 #endif 213 214 TOOM3_SQR_N_REC (v0, ap, n, scratch_out); /* v0, 2n limbs */ 215 216 mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 1, vinf0, scratch_out); 217 218 TMP_FREE; 219 } 220