1 /* mpn_toom4_sqr -- Square {ap,an}. 2 3 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of the GNU Lesser General Public License as published by 15 the Free Software Foundation; either version 3 of the License, or (at your 16 option) any later version. 17 18 The GNU MP Library is distributed in the hope that it will be useful, but 19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 21 License for more details. 22 23 You should have received a copy of the GNU Lesser General Public License 24 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 25 26 27 #include "gmp.h" 28 #include "gmp-impl.h" 29 30 /* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf 31 32 <-s--><--n--><--n--><--n--> 33 ____ ______ ______ ______ 34 |_a3_|___a2_|___a1_|___a0_| 35 36 v0 = a0 ^2 # A(0)^2 37 v1 = ( a0+ a1+ a2+ a3)^2 # A(1)^2 ah <= 3 38 vm1 = ( a0- a1+ a2- a3)^2 # A(-1)^2 |ah| <= 1 39 v2 = ( a0+2a1+4a2+8a3)^2 # A(2)^2 ah <= 14 40 vh = (8a0+4a1+2a2+ a3)^2 # A(1/2)^2 ah <= 14 41 vmh = (8a0-4a1+2a2- a3)^2 # A(-1/2)^2 -4<=ah<=9 42 vinf= a3 ^2 # A(inf)^2 43 */ 44 45 #if TUNE_PROGRAM_BUILD 46 #define MAYBE_sqr_basecase 1 47 #define MAYBE_sqr_toom2 1 48 #define MAYBE_sqr_toom4 1 49 #else 50 #define MAYBE_sqr_basecase \ 51 (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM2_THRESHOLD) 52 #define MAYBE_sqr_toom2 \ 53 (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD) 54 #define MAYBE_sqr_toom4 \ 55 (SQR_FFT_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD) 56 #endif 57 58 #define TOOM4_SQR_REC(p, a, n, ws) \ 59 do { \ 60 if (MAYBE_sqr_basecase \ 61 && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \ 62 mpn_sqr_basecase (p, a, n); \ 63 else if (MAYBE_sqr_toom2 \ 64 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ 65 mpn_toom2_sqr (p, a, n, ws); \ 66 else if (! MAYBE_sqr_toom4 \ 67 || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) \ 68 mpn_toom3_sqr (p, a, n, ws); \ 69 else \ 70 mpn_toom4_sqr (p, a, n, ws); \ 71 } while (0) 72 73 void 74 mpn_toom4_sqr (mp_ptr pp, 75 mp_srcptr ap, mp_size_t an, 76 mp_ptr scratch) 77 { 78 mp_size_t n, s; 79 mp_limb_t cy; 80 81 #define a0 ap 82 #define a1 (ap + n) 83 #define a2 (ap + 2*n) 84 #define a3 (ap + 3*n) 85 86 n = (an + 3) >> 2; 87 88 s = an - 3 * n; 89 90 ASSERT (0 < s && s <= n); 91 92 /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the 93 * following limb, so these must be computed in order, and we need a 94 * one limb gap to tp. */ 95 #define v0 pp /* 2n */ 96 #define v1 (pp + 2 * n) /* 2n+1 */ 97 #define vinf (pp + 6 * n) /* s+t */ 98 #define v2 scratch /* 2n+1 */ 99 #define vm2 (scratch + 2 * n + 1) /* 2n+1 */ 100 #define vh (scratch + 4 * n + 2) /* 2n+1 */ 101 #define vm1 (scratch + 6 * n + 3) /* 2n+1 */ 102 #define tp (scratch + 8*n + 5) 103 104 /* No overlap with v1 */ 105 #define apx pp /* n+1 */ 106 #define amx (pp + 4*n + 2) /* n+1 */ 107 108 /* Total scratch need: 8*n + 5 + scratch for recursive calls. This 109 gives roughly 32 n/3 + log term. */ 110 111 /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */ 112 mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp); 113 114 TOOM4_SQR_REC (v2, apx, n + 1, tp); /* v2, 2n+1 limbs */ 115 TOOM4_SQR_REC (vm2, amx, n + 1, tp); /* vm2, 2n+1 limbs */ 116 117 /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */ 118 #if HAVE_NATIVE_mpn_addlsh1_n 119 cy = mpn_addlsh1_n (apx, a1, a0, n); 120 cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n); 121 if (s < n) 122 { 123 mp_limb_t cy2; 124 cy2 = mpn_addlsh1_n (apx, a3, apx, s); 125 apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1); 126 MPN_INCR_U (apx + s, n+1-s, cy2); 127 } 128 else 129 apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n); 130 #else 131 cy = mpn_lshift (apx, a0, n, 1); 132 cy += mpn_add_n (apx, apx, a1, n); 133 cy = 2*cy + mpn_lshift (apx, apx, n, 1); 134 cy += mpn_add_n (apx, apx, a2, n); 135 cy = 2*cy + mpn_lshift (apx, apx, n, 1); 136 apx[n] = cy + mpn_add (apx, apx, n, a3, s); 137 #endif 138 139 ASSERT (apx[n] < 15); 140 141 TOOM4_SQR_REC (vh, apx, n + 1, tp); /* vh, 2n+1 limbs */ 142 143 /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */ 144 mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp); 145 146 TOOM4_SQR_REC (v1, apx, n + 1, tp); /* v1, 2n+1 limbs */ 147 TOOM4_SQR_REC (vm1, amx, n + 1, tp); /* vm1, 2n+1 limbs */ 148 149 TOOM4_SQR_REC (v0, a0, n, tp); 150 TOOM4_SQR_REC (vinf, a3, s, tp); /* vinf, 2s limbs */ 151 152 mpn_toom_interpolate_7pts (pp, n, 0, vm2, vm1, v2, vh, 2*s, tp); 153 } 154