1 /* mpn_sqrlo -- squares an n-limb number and returns the low n limbs
2    of the result.
3 
4    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
5 
6    THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
7    FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
8    THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 
10 Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc.
11 
12 This file is part of the GNU MP Library.
13 
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of either:
16 
17   * the GNU Lesser General Public License as published by the Free
18     Software Foundation; either version 3 of the License, or (at your
19     option) any later version.
20 
21 or
22 
23   * the GNU General Public License as published by the Free Software
24     Foundation; either version 2 of the License, or (at your option) any
25     later version.
26 
27 or both in parallel, as here.
28 
29 The GNU MP Library is distributed in the hope that it will be useful, but
30 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
32 for more details.
33 
34 You should have received copies of the GNU General Public License and the
35 GNU Lesser General Public License along with the GNU MP Library.  If not,
36 see https://www.gnu.org/licenses/.  */
37 
38 #include "gmp-impl.h"
39 
40 #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
41 #define MAYBE_range_basecase 1
42 #define MAYBE_range_toom22   1
43 #else
44 #define MAYBE_range_basecase                                           \
45   ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11))
46 #define MAYBE_range_toom22                                             \
47   ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) )
48 #endif
49 
50 /*  THINK: The DC strategy uses different constants in different Toom's
51 	 ranges. Something smoother?
52 */
53 
54 /*
55   Compute the least significant half of the product {xy,n}*{yp,n}, or
56   formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
57 
58   Above the given threshold, the Divide and Conquer strategy is used.
59   The operand is split in two, and a full square plus a mullo
60   is used to obtain the final result. The more natural strategy is to
61   split in two halves, but this is far from optimal when a
62   sub-quadratic multiplication is used.
63 
64   Mulders suggests an unbalanced split in favour of the full product,
65   split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
66 
67   To compute the value of a, we assume that the cost of mullo for a
68   given size ML(n) is a fraction of the cost of a full product with
69   same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
70   then we can write:
71 
72   ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
73 
74   Given a value for e, want to minimise the value of k, i.e. the
75   function k=(1-a)^e/(1-2*a^e).
76 
77   With e=2, the exponent for schoolbook multiplication, the minimum is
78   given by the values a=1-a=1/2.
79 
80   With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
81   Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
82 
83   Other possible approximations follow:
84   e=log(5)/log(3) [Toom-3] -> a ~= 9/40
85   e=log(7)/log(4) [Toom-4] -> a ~= 7/39
86   e=log(11)/log(6) [Toom-6] -> a ~= 1/8
87   e=log(15)/log(8) [Toom-8] -> a ~= 1/10
88 
89   The values above where obtained with the following trivial commands
90   in the gp-pari shell:
91 
92 fun(e,a)=(1-a)^e/(1-2*a^e)
93 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
94 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
95 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
96 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
97 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
98 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
99 
100   ,
101   |\
102   | \
103   +----,
104   |    |
105   |    |
106   |    |\
107   |    | \
108   +----+--`
109   ^ n2 ^n1^
110 
111   For an actual implementation, the assumption that M(n)=n^e is
112   incorrect, as a consequence also the assumption that ML(n)=k*M(n)
113   with a constant k is wrong.
114 
115   But theory suggest us two things:
116   - the best the multiplication product is (lower e), the more k
117     approaches 1, and a approaches 0.
118 
119   - A value for a smaller than optimal is probably less bad than a
120     bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
121     value, and k(a)=0.808_ the mul/mullo speed ratio. We get
122     k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
123 */
124 
125 static mp_size_t
mpn_sqrlo_itch(mp_size_t n)126 mpn_sqrlo_itch (mp_size_t n)
127 {
128   return 2*n;
129 }
130 
131 /*
132     mpn_dc_sqrlo requires a scratch space of 2*n limbs at tp.
133     It accepts tp == rp.
134 */
135 static void
mpn_dc_sqrlo(mp_ptr rp,mp_srcptr xp,mp_size_t n,mp_ptr tp)136 mpn_dc_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n, mp_ptr tp)
137 {
138   mp_size_t n2, n1;
139   ASSERT (n >= 2);
140   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
141   ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
142 
143   /* Divide-and-conquer */
144 
145   /* We need fractional approximation of the value 0 < a <= 1/2
146      giving the minimum in the function k=(1-a)^e/(1-2*a^e).
147   */
148   if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11)))
149     n1 = n >> 1;
150   else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11)))
151     n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
152   else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9)))
153     n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
154   else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9))
155     n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
156   /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
157   else
158     n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
159 
160   n2 = n - n1;
161 
162   /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */
163 
164   /* x0 ^ 2 */
165   mpn_sqr (tp, xp, n2);
166   MPN_COPY (rp, tp, n2);
167 
168   /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */
169   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
170     mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1);
171   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
172     mpn_mullo_basecase (tp + n, xp + n2, xp, n1);
173   else
174     mpn_mullo_n (tp + n, xp + n2, xp, n1);
175   /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */
176 #if HAVE_NATIVE_mpn_addlsh1_n
177   mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1);
178 #else
179   mpn_lshift (rp + n2, tp + n, n1, 1);
180   mpn_add_n (rp + n2, rp + n2, tp + n2, n1);
181 #endif
182 }
183 
184 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
185 #define SQR_BASECASE_ALLOC \
186  (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT)
187 
188 /* FIXME: This function should accept a temporary area; dc_sqrlo
189    accepts a pointer tp, and handle the case tp == rp, do the same here.
190 */
191 
192 void
mpn_sqrlo(mp_ptr rp,mp_srcptr xp,mp_size_t n)193 mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n)
194 {
195   ASSERT (n >= 1);
196   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
197 
198   if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD))
199     {
200       /* FIXME: smarter criteria? */
201 #if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase
202       /* mullo computes as many products as sqr, but directly writes
203 	 on the result area. */
204       mpn_mullo_basecase (rp, xp, xp, n);
205 #else
206       /* Allocate workspace of fixed size on stack: fast! */
207       mp_limb_t tp[SQR_BASECASE_ALLOC];
208       mpn_sqr_basecase (tp, xp, n);
209       MPN_COPY (rp, tp, n);
210 #endif
211     }
212   else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD))
213     {
214       mpn_sqrlo_basecase (rp, xp, n);
215     }
216   else
217     {
218       mp_ptr tp;
219       TMP_DECL;
220       TMP_MARK;
221       tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n));
222       if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD))
223 	{
224 	  mpn_dc_sqrlo (rp, xp, n, tp);
225 	}
226       else
227 	{
228 	  /* For really large operands, use plain mpn_mul_n but throw away upper n
229 	     limbs of result.  */
230 #if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD)
231 	  mpn_fft_mul (tp, xp, n, xp, n);
232 #else
233 	  mpn_sqr (tp, xp, n);
234 #endif
235 	  MPN_COPY (rp, tp, n);
236 	}
237       TMP_FREE;
238     }
239 }
240