1 /* mpn_toom3_sqr -- Square {ap,an}.
2 
3    Contributed to the GNU project by Torbjorn Granlund.
4    Additional improvements by Marco Bodrato.
5 
6    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
7    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
8    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 
10 Copyright 2006-2010, 2012 Free Software Foundation, Inc.
11 
12 This file is part of the GNU MP Library.
13 
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
16 
17   * the GNU Lesser General Public License as published by the Free
18     Software Foundation; either version 3 of the License, or (at your
19     option) any later version.
20 
21 or
22 
23   * the GNU General Public License as published by the Free Software
24     Foundation; either version 2 of the License, or (at your option) any
25     later version.
26 
27 or both in parallel, as here.
28 
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
32 for more details.
33 
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library.  If not,
36 see https://www.gnu.org/licenses/.  */
37 
38 
39 #include "gmp.h"
40 #include "gmp-impl.h"
41 
42 /* Evaluate in: -1, 0, +1, +2, +inf
43 
44   <-s--><--n--><--n-->
45    ____ ______ ______
46   |_a2_|___a1_|___a0_|
47 
48   v0  =  a0         ^2 #   A(0)^2
49   v1  = (a0+ a1+ a2)^2 #   A(1)^2    ah  <= 2
50   vm1 = (a0- a1+ a2)^2 #  A(-1)^2   |ah| <= 1
51   v2  = (a0+2a1+4a2)^2 #   A(2)^2    ah  <= 6
52   vinf=          a2 ^2 # A(inf)^2
53 */
54 
#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
/* While tuning, or in a fat binary, the SQR_* thresholds are runtime
   variables, so every recursion path must remain compiled in.  */
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3   1
#else
/* With compile-time thresholds, a recursive call operates on about n/3
   limbs, so paths the threshold arithmetic proves unreachable can be
   compiled out entirely.  */
#define MAYBE_sqr_basecase						\
  (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD)
#define MAYBE_sqr_toom3							\
  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif
64 
/* Square {a,n} into {p,2n}, dispatching on n to the cheapest available
   method; ws is scratch space for the recursive calls.  The MAYBE_*
   guards let dead branches fold away when thresholds are compile-time
   constants.  */
#define TOOM3_SQR_REC(p, a, n, ws)					\
  do {									\
    if (MAYBE_sqr_basecase						\
	&& BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))			\
      mpn_sqr_basecase (p, a, n);					\
    else if (! MAYBE_sqr_toom3						\
	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
      mpn_toom2_sqr (p, a, n, ws);					\
    else								\
      mpn_toom3_sqr (p, a, n, ws);					\
  } while (0)
76 
/* Square {ap,an} into {pp,2*an} using Toom-3: split the operand in
   three parts, square its evaluations at 0, +1, -1, +2 and infinity,
   then interpolate.  scratch must provide the space computed by
   mpn_toom3_sqr_itch.  */
void
mpn_toom3_sqr (mp_ptr pp,
	       mp_srcptr ap, mp_size_t an,
	       mp_ptr scratch)
{
  /* NOTE(review): in fat binaries the threshold macros consult the
     global __gmpn_cpuvec_initialized; shadowing it with a constant 1
     presumably lets the compiler fold those checks away, on the
     grounds that the cpuvec must already be set up before control can
     reach this function — confirm against gmp-impl.h/fat builds.  In
     normal builds this local is simply unused.  */
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;

  /* Operand split (see diagram above): a0 and a1 are n limbs, a2 is
     the s-limb high part.  */
#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)

  n = (an + 2) / (size_t) 3;	/* n = ceil(an/3) */

  s = an - 2 * n;		/* high-part size, 0 < s <= n */

  ASSERT (0 < s && s <= n);

  /* Evaluation points, n+1 limbs each: as1 = A(1), asm1 = |A(-1)|,
     as2 = A(2).  as2 is parked in pp, where it is consumed before the
     product overwrites that region.  */
  as1 = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  gp = scratch;

  /* Compute as1 = a0+a1+a2 and asm1 = |a0-a1+a2|; gp = a0+a2 is the
     shared intermediate.  (The sign of A(-1) is irrelevant when
     squaring, so only the absolute value is kept.)  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: form a1 - gp so asm1 stays non-negative.  */
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);	/* add carry is bit 1 of cy2 */
      asm1[n] = cy - (cy2 & 1);	/* sub borrow is bit 0 of cy2 */
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      /* a0+a2 < a1: subtract the other way to keep asm1 non-negative.  */
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = a0 + 2*a1 + 4*a2, reusing as1: as2 = 2*(as1+a2) - a0.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);	/* 2*as2 - a0 */
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  /* Two add-with-left-shift passes: a1 + 2*a2, then a0 + 2*(that).  */
  cy  = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  ASSERT (as1[n] <= 2);		/* A(1) high limb, per bound in file header */
  ASSERT (asm1[n] <= 1);	/* |A(-1)| high limb, per bound in file header */

  /* Layout of the five point squares; v0, v1, vinf land directly in
     pp, vm1 and v2 live in scratch.  */
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+s */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  /* Square only the low n limbs, then patch in the high limb h =
     asm1[n] <= 1: (l + h*B^n)^2 needs 2*h*l*B^n + h^2*B^2n extra,
     supplied by adding asm1 into the high half twice (h accumulates
     into cy for the h^2 term).  */
  TOOM3_SQR_REC (vm1, asm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  if (asm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM3_SQR_REC (vm1, asm1, n + 1, scratch_out);
#endif

  TOOM3_SQR_REC (v2, as2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  TOOM3_SQR_REC (vinf, a2, s, scratch_out);	/* vinf, s+s limbs */

  vinf0 = vinf[0];				/* v1 overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  /* Same high-limb patch-up as for vm1, but here h = as1[n] <= 2, so
     each of the two passes adds h*as1 (via add for h==1, or a doubling
     add for h==2) and cy collects the h^2 contribution.  */
  TOOM3_SQR_REC (v1, as1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (as1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  /* Full (n+1)-limb square; its top limb would clobber vinf[1], so
     save that limb across the call.  */
  cy = vinf[1];
  TOOM3_SQR_REC (v1, as1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM3_SQR_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */

  /* Recombine the five point squares into the final 2*an-limb result.  */
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0);
}
227