1 /* mulmod_bnm1.c -- multiplication mod B^n-1.
2 
   Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
4    Marco Bodrato.
5 
6    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
7    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
8    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 
10 Copyright 2009, 2010 Free Software Foundation, Inc.
11 
12 This file is part of the GNU MP Library.
13 
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of the GNU Lesser General Public License as published by
16 the Free Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
18 
19 The GNU MP Library is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
22 License for more details.
23 
24 You should have received a copy of the GNU Lesser General Public License
25 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
26 
27 
28 #include "gmp.h"
29 #include "gmp-impl.h"
30 #include "longlong.h"
31 
32 /* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
33    mod B^rn - 1, and values are semi-normalised; zero is represented
34    as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
35    tp==rp is allowed. */
36 void
37 mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
38 		    mp_ptr tp)
39 {
40   mp_limb_t cy;
41 
42   ASSERT (0 < rn);
43 
44   mpn_mul_n (tp, ap, bp, rn);
45   cy = mpn_add_n (rp, tp, tp + rn, rn);
46   /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
47    * be no overflow when adding in the carry. */
48   MPN_INCR_U (rp, rn, cy);
49 }
50 
51 
52 /* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
53    semi-normalised representation, computation is mod B^rn + 1. Needs
54    a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
55    Output is normalised. */
56 static void
57 mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
58 		    mp_ptr tp)
59 {
60   mp_limb_t cy;
61 
62   ASSERT (0 < rn);
63 
64   mpn_mul_n (tp, ap, bp, rn + 1);
65   ASSERT (tp[2*rn+1] == 0);
66   ASSERT (tp[2*rn] < GMP_NUMB_MAX);
67   cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
68   rp[rn] = 0;
69   MPN_INCR_U (rp, rn+1, cy );
70 }
71 
72 
/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
 *
 * The result is expected to be ZERO if and only if one of the operand
 * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
 * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
 * combine results and obtain a natural number when one knows in
 * advance that the final value is less than (B^rn-1).
 * Moreover it should not be a problem if mulmod_bnm1 is used to
 * compute the full product with an+bn <= rn, because this condition
 * implies (B^an-1)(B^bn-1) < (B^rn-1) .
 *
 * Requires 0 < bn <= an <= rn and an + bn > rn/2
 * Scratch need: rn + (need for recursive call OR rn + 4). This gives
 *
 * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
 */
void
mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
{
  ASSERT (0 < bn);
  ASSERT (bn <= an);
  ASSERT (an <= rn);

  /* Base case: when rn is odd we cannot split it in half, and below the
     threshold the plain product plus wrap-around fold is faster. */
  if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
    {
      if (UNLIKELY (bn < rn))
	{
	  if (UNLIKELY (an + bn <= rn))
	    {
	      /* Product fits without reduction. */
	      mpn_mul (rp, ap, an, bp, bn);
	    }
	  else
	    {
	      /* Fold the an+bn-rn high limbs back onto the low part,
		 using B^rn == 1 (mod B^rn - 1). */
	      mp_limb_t cy;
	      mpn_mul (tp, ap, an, bp, bn);
	      cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
	      MPN_INCR_U (rp, rn, cy);
	    }
	}
      else
	mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
    }
  else
    {
      mp_size_t n;
      mp_limb_t cy;
      mp_limb_t hi;

      n = rn >> 1;

      /* We need at least an + bn >= n, to be able to fit one of the
	 recursive products at rp. Requiring strict inequality makes
	 the coded slightly simpler. If desired, we could avoid this
	 restriction by initially halving rn as long as rn is even and
	 an + bn <= rn/2. */

      ASSERT (an + bn > n);

      /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
	 and crt together as

	 x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
      */

#define a0 ap
#define a1 (ap + n)
#define b0 bp
#define b1 (bp + n)

#define xp  tp	/* 2n + 2 */
      /* am1  maybe in {xp, n} */
      /* bm1  maybe in {xp + n, n} */
#define sp1 (tp + 2*n + 2)
      /* ap1  maybe in {sp1, n + 1} */
      /* bp1  maybe in {sp1 + n + 1, n + 1} */

      /* First half: reduce the operands mod B^n - 1 (by adding the two
	 halves of each), then recurse; result xm lands at rp. */
      {
	mp_srcptr am1, bm1;
	mp_size_t anm, bnm;
	mp_ptr so;

	if (LIKELY (an > n))
	  {
	    /* am1 = a0 + a1 mod B^n - 1, semi-normalised, at xp. */
	    am1 = xp;
	    cy = mpn_add (xp, a0, n, a1, an - n);
	    MPN_INCR_U (xp, n, cy);
	    anm = n;
	    if (LIKELY (bn > n))
	      {
		bm1 = xp + n;
		cy = mpn_add (xp + n, b0, n, b1, bn - n);
		MPN_INCR_U (xp + n, n, cy);
		bnm = n;
		so = xp + 2*n;
	      }
	    else
	      {
		/* b fits in n limbs already; use it unreduced. */
		so = xp + n;
		bm1 = b0;
		bnm = bn;
	      }
	  }
	else
	  {
	    so = xp;
	    am1 = a0;
	    anm = an;
	    bm1 = b0;
	    bnm = bn;
	  }

	mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
      }

      /* Second half: reduce the operands mod B^n + 1 (low half minus
	 high half), then multiply; result xp at tp. */
      {
	int       k;
	mp_srcptr ap1, bp1;
	mp_size_t anp, bnp;

	if (LIKELY (an > n)) {
	  /* ap1 = a0 - a1 mod B^n + 1, normalised: top limb ap1[n] is
	     0 or 1, and 1 only when {ap1,n} is zero. */
	  ap1 = sp1;
	  cy = mpn_sub (sp1, a0, n, a1, an - n);
	  sp1[n] = 0;
	  MPN_INCR_U (sp1, n + 1, cy);
	  anp = n + ap1[n];
	} else {
	  ap1 = a0;
	  anp = an;
	}

	if (LIKELY (bn > n)) {
	  bp1 = sp1 + n + 1;
	  cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
	  sp1[2*n+1] = 0;
	  MPN_INCR_U (sp1 + n + 1, n + 1, cy);
	  bnp = n + bp1[n];
	} else {
	  bp1 = b0;
	  bnp = bn;
	}

	/* Pick the method for the mod B^n + 1 product: FFT when n is
	   large enough, otherwise plain multiplication or the bnp1
	   basecase. */
	if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
	  k=0;
	else
	  {
	    int mask;
	    k = mpn_fft_best_k (n, 0);
	    mask = (1<<k) -1;
	    /* mpn_mul_fft requires 2^k | n; decrease k until it does. */
	    while (n & mask) {k--; mask >>=1;};
	  }
	if (k >= FFT_FIRST_K)
	  xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
	else if (UNLIKELY (bp1 == b0))
	  {
	    /* b was short; multiply and reduce mod B^n + 1 by hand. */
	    ASSERT (anp + bnp <= 2*n+1);
	    ASSERT (anp + bnp > n);
	    ASSERT (anp >= bnp);
	    mpn_mul (xp, ap1, anp, bp1, bnp);
	    anp = anp + bnp - n;
	    ASSERT (anp <= n || xp[2*n]==0);
	    anp-= anp > n;
	    cy = mpn_sub (xp, xp, n, xp + n, anp);
	    xp[n] = 0;
	    MPN_INCR_U (xp, n+1, cy);
	  }
	else
	  mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
      }

      /* Here the CRT recomposition begins.

	 xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
	 Division by 2 is a bitwise rotation.

	 Assumes xp normalised mod (B^n+1).

	 The residue class [0] is represented by [B^n-1]; except when
	 both input are ZERO.
      */

#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
#if HAVE_NATIVE_mpn_rsh1add_nc
      cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
      hi = cy << (GMP_NUMB_BITS - 1);
      cy = 0;
      /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
	 overflows, i.e. a further increment will not overflow again. */
#else /* ! _nc */
      cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
	 the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
#endif
#if GMP_NAIL_BITS == 0
      add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
#else
      cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
      rp[n-1] ^= hi;
#endif
#else /* ! HAVE_NATIVE_mpn_rsh1add_n */
#if HAVE_NATIVE_mpn_add_nc
      cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
#else /* ! _nc */
      cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
#endif
      cy += (rp[0]&1);
      mpn_rshift(rp, rp, n, 1);
      ASSERT (cy <= 2);
      hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
      cy >>= 1;
      /* We can have cy != 0 only if hi = 0... */
      ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
      rp[n-1] |= hi;
      /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
#endif
      ASSERT (cy <= 1);
      /* Next increment can not overflow, read the previous comments about cy. */
      ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
      MPN_INCR_U(rp, n, cy);

      /* Compute the highest half:
	 ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
       */
      if (UNLIKELY (an + bn < rn))
	{
	  /* Note that in this case, the only way the result can equal
	     zero mod B^{rn} - 1 is if one of the inputs is zero, and
	     then the output of both the recursive calls and this CRT
	     reconstruction is zero, not B^{rn} - 1. Which is good,
	     since the latter representation doesn't fit in the output
	     area.*/
	  cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);

	  /* FIXME: This subtraction of the high parts is not really
	     necessary, we do it to get the carry out, and for sanity
	     checking. */
	  cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
				   xp + an + bn - n, rn - (an + bn), cy);
	  ASSERT (an + bn == rn - 1 ||
		  mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
	  cy = mpn_sub_1 (rp, rp, an + bn, cy);
	  ASSERT (cy == (xp + an + bn - n)[0]);
	}
      else
	{
	  cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
	  /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
	     DECR will affect _at most_ the lowest n limbs. */
	  MPN_DECR_U (rp, 2*n, cy);
	}
#undef a0
#undef a1
#undef b0
#undef b1
#undef xp
#undef sp1
    }
}
332 
333 mp_size_t
334 mpn_mulmod_bnm1_next_size (mp_size_t n)
335 {
336   mp_size_t nh;
337 
338   if (BELOW_THRESHOLD (n,     MULMOD_BNM1_THRESHOLD))
339     return n;
340   if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
341     return (n + (2-1)) & (-2);
342   if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
343     return (n + (4-1)) & (-4);
344 
345   nh = (n + 1) >> 1;
346 
347   if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
348     return (n + (8-1)) & (-8);
349 
350   return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
351 }
352