/* mulmod_bnm1.c -- multiplication mod B^n-1.

   Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
   Marco Bodrato.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
   mod B^rn - 1, and values are semi-normalised; zero is represented
   as either 0 or B^rn - 1.  Needs a scratch of 2rn limbs at tp.
   tp == rp is allowed. */
void
mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
		    mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_mul_n (tp, ap, bp, rn);
  cy = mpn_add_n (rp, tp, tp + rn, rn);
  /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
   * be no overflow when adding in the carry. */
  MPN_INCR_U (rp, rn, cy);
}
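
/* The reduction relies on B^rn == 1 (mod B^rn - 1): the high half of
   the 2rn-limb product is simply folded (added) onto the low half.  A
   worked instance, with B = 10 and rn = 2 purely for illustration:
   57 * 64 = 3648, folding gives 48 + 36 = 84, and 3648 mod 99 = 84. */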


/* Inputs are {ap,rn+1} and {bp,rn+1}, in semi-normalised
   representation; output is {rp,rn+1}, computation is mod B^rn + 1.
   Needs a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
   Output is normalised. */
static void
mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
		    mp_ptr tp)
{
  mp_limb_t cy;

  ASSERT (0 < rn);

  mpn_mul_n (tp, ap, bp, rn + 1);
  ASSERT (tp[2*rn+1] == 0);
  ASSERT (tp[2*rn] < GMP_NUMB_MAX);
  cy = tp[2*rn] + mpn_sub_n (rp, tp, tp + rn, rn);
  rp[rn] = 0;
  MPN_INCR_U (rp, rn + 1, cy);
}
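
/* Correspondingly, B^rn == -1 (mod B^rn + 1), so here the high half is
   subtracted from the low half.  Same toy instance, B = 10 and rn = 2:
   57 * 64 = 3648, and 48 - 36 = 12 = 3648 mod 101. */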


/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the
 * operands already is. Otherwise the class [0] Mod(B^rn-1) is
 * represented by B^rn-1. This should not be a problem if mulmod_bnm1
 * is used to combine results and obtain a natural number when one
 * knows in advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1).
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an,
		 mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;
      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the code slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and CRT together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */
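      /* Sanity check of the formula: write y = (xp + xm)/2 mod (B^n-1).
	 Mod (B^n - 1) we have B^n == 1, so x == -xp + 2*y == xm; mod
	 (B^n + 1) we have B^n == -1, so the (B^n + 1)*y term vanishes
	 and x == xp.  B^n - 1 and B^n + 1 are coprime (B is even), so
	 by the CRT x is determined mod B^2n - 1 = B^rn - 1. */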

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      {
	mp_srcptr am1, bm1;
	mp_size_t anm, bnm;
	mp_ptr so;

	bm1 = b0;
	bnm = bn;
	if (LIKELY (an > n))
	  {
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    so = xp + n;
	    if (LIKELY (bn > n))
	      {
		bm1 = so;
		cy = mpn_add (so, b0, n, b1, bn - n);
		MPN_INCR_U (so, n, cy);
		bnm = n;
		so += n;
	      }
	  }
	else
	  {
	    so = xp;
	    am1 = a0;
	    anm = an;
	  }

	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      {
	int       k;
	mp_srcptr ap1, bp1;
	mp_size_t anp, bnp;

	bp1 = b0;
	bnp = bn;
	if (LIKELY (an > n)) {
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	  if (LIKELY (bn > n)) {
	    bp1 = sp1 + n + 1;
	    cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	    sp1[2*n+1] = 0;
	    MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	    bnp = n + bp1[n];
	  }
	} else {
	  ap1 = a0;
	  anp = an;
	}

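	/* Select the mpn_mul_fft transform size for the mod B^n + 1
	   product: below the modf threshold use no FFT (k = 0);
	   otherwise start from the best k and decrease it until 2^k
	   divides n, which mpn_mul_fft requires. */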
	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k = 0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) - 1;
	    while (n & mask)
	      {
		k--;
		mask >>= 1;
	      }
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;
	    ASSERT (anp <= n || xp[2*n] == 0);
	    anp -= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both inputs are ZERO.
      */
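      /* Halving mod B^n - 1 is a right rotation by one bit: B is even
	 and B^n == 1, so an odd x maps to (x >> 1) + B^n/2.  E.g., with
	 B^n = 16: 7/2 == 7*8 == 11 (mod 15), and rotating the bits 0111
	 right by one gives 1011 = 11. */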

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n (rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy cannot overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment cannot overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.  */
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
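
/* A minimal usage sketch (illustration only, not part of the library):
   a caller typically rounds the size up with mpn_mulmod_bnm1_next_size
   below and, assuming the mpn_mulmod_bnm1_itch macro from gmp-impl.h,
   allocates scratch along the lines of

     mp_size_t rn = mpn_mulmod_bnm1_next_size (an + bn);
     mp_ptr scratch = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (rn, an, bn));
     mpn_mulmod_bnm1 (rp, rn, ap, an, bp, bn, scratch);

   with {rp, MIN (rn, an + bn)} receiving the product mod B^rn - 1 and
   the usual TMP_DECL / TMP_MARK / TMP_FREE bracketing around the
   allocation. */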
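/* Rounds n up to the next size for which mpn_mulmod_bnm1 is efficient:
   sizes below the basecase threshold are returned unchanged, mid-range
   sizes are rounded up to a multiple of 2, 4 or 8 (so the even split
   above can recurse), and larger sizes are chosen so that the half-size
   mod B^nh + 1 product can use mpn_mul_fft with a good k. */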
mp_size_t
mpn_mulmod_bnm1_next_size (mp_size_t n)
{
  mp_size_t nh;

  if (BELOW_THRESHOLD (n,     MULMOD_BNM1_THRESHOLD))
    return n;
  if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
    return (n + (2-1)) & (-2);
  if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
    return (n + (4-1)) & (-4);

  nh = (n + 1) >> 1;

  if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
    return (n + (8-1)) & (-8);

  return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
}