1 /* mpn_toom3_sqr -- Square {ap,an}.
2
3 Contributed to the GNU project by Torbjorn Granlund.
4 Additional improvements by Marco Bodrato.
5
6 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
7 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
8 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10 Copyright 2006-2010, 2012 Free Software Foundation, Inc.
11
12 This file is part of the GNU MP Library.
13
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
16
17 * the GNU Lesser General Public License as published by the Free
18 Software Foundation; either version 3 of the License, or (at your
19 option) any later version.
20
21 or
22
23 * the GNU General Public License as published by the Free Software
24 Foundation; either version 2 of the License, or (at your option) any
25 later version.
26
27 or both in parallel, as here.
28
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
32 for more details.
33
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library. If not,
36 see https://www.gnu.org/licenses/. */
37
38
39 #include "gmp.h"
40 #include "gmp-impl.h"
41
42 /* Evaluate in: -1, 0, +1, +2, +inf
43
44 <-s--><--n--><--n-->
45 ____ ______ ______
46 |_a2_|___a1_|___a0_|
47
48 v0 = a0 ^2 # A(0)^2
49 v1 = (a0+ a1+ a2)^2 # A(1)^2 ah <= 2
50 vm1 = (a0- a1+ a2)^2 # A(-1)^2 |ah| <= 1
51 v2 = (a0+2a1+4a2)^2 # A(2)^2 ah <= 6
52 vinf= a2 ^2 # A(inf)^2
53 */
54
/* Decide at compile time which recursion arms of TOOM3_SQR_REC can ever be
   reached.  When building the tune program or a fat binary the thresholds
   are runtime variables, so every arm must stay compiled in.  Otherwise:
   recursive operands here are about an/3 limbs with an in toom3's range
   [SQR_TOOM3_THRESHOLD, SQR_TOOM4_THRESHOLD), so an arm is reachable only
   if an/3 can land in that algorithm's own range.  */
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3 1
#else
/* Basecase reachable iff an/3 can drop below SQR_TOOM2_THRESHOLD while
   an is still at least SQR_TOOM3_THRESHOLD.  */
#define MAYBE_sqr_basecase						\
  (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
/* Recursive toom3 reachable iff an/3 can itself reach SQR_TOOM3_THRESHOLD
   before an reaches SQR_TOOM4_THRESHOLD.  */
#define MAYBE_sqr_toom3							\
  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif
64
/* Recursive squaring step: square {a,n} into {p,2n}, choosing basecase,
   toom2 or toom3 by the size thresholds; ws is scratch for the recursive
   call (unused by the basecase).  The MAYBE_* guards are compile-time
   constants in non-fat builds, letting the compiler discard arms that can
   never be taken for operand sizes reaching this file.  */
#define TOOM3_SQR_REC(p, a, n, ws)					\
  do {									\
    if (MAYBE_sqr_basecase						\
	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
      mpn_sqr_basecase (p, a, n);					\
    else if (! MAYBE_sqr_toom3						\
	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
      mpn_toom2_sqr (p, a, n, ws);					\
    else								\
      mpn_toom3_sqr (p, a, n, ws);					\
  } while (0)
76
77 void
mpn_toom3_sqr(mp_ptr pp,mp_srcptr ap,mp_size_t an,mp_ptr scratch)78 mpn_toom3_sqr (mp_ptr pp,
79 mp_srcptr ap, mp_size_t an,
80 mp_ptr scratch)
81 {
82 const int __gmpn_cpuvec_initialized = 1;
83 mp_size_t n, s;
84 mp_limb_t cy, vinf0;
85 mp_ptr gp;
86 mp_ptr as1, asm1, as2;
87
88 #define a0 ap
89 #define a1 (ap + n)
90 #define a2 (ap + 2*n)
91
92 n = (an + 2) / (size_t) 3;
93
94 s = an - 2 * n;
95
96 ASSERT (0 < s && s <= n);
97
98 as1 = scratch + 4 * n + 4;
99 asm1 = scratch + 2 * n + 2;
100 as2 = pp + n + 1;
101
102 gp = scratch;
103
104 /* Compute as1 and asm1. */
105 cy = mpn_add (gp, a0, n, a2, s);
106 #if HAVE_NATIVE_mpn_add_n_sub_n
107 if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
108 {
109 cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
110 as1[n] = cy >> 1;
111 asm1[n] = 0;
112 }
113 else
114 {
115 mp_limb_t cy2;
116 cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
117 as1[n] = cy + (cy2 >> 1);
118 asm1[n] = cy - (cy2 & 1);
119 }
120 #else
121 as1[n] = cy + mpn_add_n (as1, gp, a1, n);
122 if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
123 {
124 mpn_sub_n (asm1, a1, gp, n);
125 asm1[n] = 0;
126 }
127 else
128 {
129 cy -= mpn_sub_n (asm1, gp, a1, n);
130 asm1[n] = cy;
131 }
132 #endif
133
134 /* Compute as2. */
135 #if HAVE_NATIVE_mpn_rsblsh1_n
136 cy = mpn_add_n (as2, a2, as1, s);
137 if (s != n)
138 cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
139 cy += as1[n];
140 cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
141 #else
142 #if HAVE_NATIVE_mpn_addlsh1_n
143 cy = mpn_addlsh1_n (as2, a1, a2, s);
144 if (s != n)
145 cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
146 cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
147 #else
148 cy = mpn_add_n (as2, a2, as1, s);
149 if (s != n)
150 cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
151 cy += as1[n];
152 cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
153 cy -= mpn_sub_n (as2, as2, a0, n);
154 #endif
155 #endif
156 as2[n] = cy;
157
158 ASSERT (as1[n] <= 2);
159 ASSERT (asm1[n] <= 1);
160
161 #define v0 pp /* 2n */
162 #define v1 (pp + 2 * n) /* 2n+1 */
163 #define vinf (pp + 4 * n) /* s+s */
164 #define vm1 scratch /* 2n+1 */
165 #define v2 (scratch + 2 * n + 1) /* 2n+2 */
166 #define scratch_out (scratch + 5 * n + 5)
167
168 /* vm1, 2n+1 limbs */
169 #ifdef SMALLER_RECURSION
170 TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
171 cy = 0;
172 if (asm1[n] != 0)
173 cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
174 if (asm1[n] != 0)
175 cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
176 vm1[2 * n] = cy;
177 #else
178 TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
179 #endif
180
181 TOOM3_SQR_REC (v2, as2, n + 1, scratch_out); /* v2, 2n+1 limbs */
182
183 TOOM3_SQR_REC (vinf, a2, s, scratch_out); /* vinf, s+s limbs */
184
185 vinf0 = vinf[0]; /* v1 overlaps with this */
186
187 #ifdef SMALLER_RECURSION
188 /* v1, 2n+1 limbs */
189 TOOM3_SQR_REC (v1, as1, n, scratch_out);
190 if (as1[n] == 1)
191 {
192 cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
193 }
194 else if (as1[n] != 0)
195 {
196 #if HAVE_NATIVE_mpn_addlsh1_n
197 cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
198 #else
199 cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
200 #endif
201 }
202 else
203 cy = 0;
204 if (as1[n] == 1)
205 {
206 cy += mpn_add_n (v1 + n, v1 + n, as1, n);
207 }
208 else if (as1[n] != 0)
209 {
210 #if HAVE_NATIVE_mpn_addlsh1_n
211 cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
212 #else
213 cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
214 #endif
215 }
216 v1[2 * n] = cy;
217 #else
218 cy = vinf[1];
219 TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
220 vinf[1] = cy;
221 #endif
222
223 TOOM3_SQR_REC (v0, ap, n, scratch_out); /* v0, 2n limbs */
224
225 mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
226 }
227