/* mpn_mod_1_1p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008-2011, 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef MOD_1_1P_METHOD
# define MOD_1_1P_METHOD 1    /* need to make sure this is 2 for asm testing */
#endif
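
/* Usage sketch (the in-tree caller is mpn_mod_1): precompute the constants
   once per divisor, then reduce an n-limb number, n >= 2.  As the use of
   cps[0] with udiv_rnnd_preinv below requires, the divisor passed to
   mpn_mod_1_1p must already be shifted into normalized position; the
   returned remainder is relative to the original divisor b:

     mp_limb_t pre[4], r;
     mpn_mod_1_1p_cps (pre, b);
     r = mpn_mod_1_1p (ap, n, b << pre[1], pre);
*/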

/* Define some longlong.h-style macros, but for wider operations.
 * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
 * carry out, in the form of a mask. */
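/* That is, add_mssaaaa (m, s1, s0, a1, a0, b1, b0) computes
     (s1,s0) = (a1,a0) + (b1,b0)  (mod B^2),  B = 2^GMP_LIMB_BITS,
   and sets m to all ones when the addition carries out of the high limb,
   else to zero; see the generic C version below.  */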

#if defined (__GNUC__)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add	%6, %k2\n\t"					\
	     "adc	%4, %k1\n\t"					\
	     "sbb	%k0, %k0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add	%6, %q2\n\t"					\
	     "adc	%4, %q1\n\t"					\
	     "sbb	%q0, %q0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addxcc	%r3, %4, %1\n\t"				\
	     "subx	%%g0, %%g0, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
	 __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addccc	%r7, %8, %%g0\n\t"				\
	     "addccc	%r3, %4, %1\n\t"				\
	     "clr	%0\n\t"						\
	     "movcs	%%xcc, -1, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),		\
	     "rJ" ((al) >> 32), "rI" ((bl) >> 32)			\
	 __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "addcc	%r5, %6, %2\n\t"				\
	     "addxccc	%r3, %4, %1\n\t"				\
	     "clr	%0\n\t"						\
	     "movcs	%%xcc, -1, %0"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)		\
	 __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "add%I6c	%2, %5, %6\n\t"					\
	     "adde	%1, %3, %4\n\t"					\
	     "subfe	%0, %0, %0\n\t"					\
	     "nor	%0, %0, %0"					\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  __asm__ (  "algr	%2, %6\n\t"					\
	     "alcgr	%1, %4\n\t"					\
	     "lghi	%0, 0\n\t"					\
	     "alcgr	%0, %0\n\t"					\
	     "lcgr	%0, %0"						\
	   : "=r" (m), "=r" (s1), "=&r" (s0)				\
	   : "1"  ((UDItype)(a1)), "r" ((UDItype)(b1)),			\
	     "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)				\
  __asm__ (  "adds	%2, %5, %6\n\t"					\
	     "adcs	%1, %3, %4\n\t"					\
	     "movcc	%0, #0\n\t"					\
	     "movcs	%0, #-1"					\
	   : "=r" (m), "=r" (sh), "=&r" (sl)				\
	   : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
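    /* A carry out of the full sum comes either from __c1 or from adding	\
       __c0 into __s1 (never both), so the mask is zero or all ones.  */	\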
    (m) = - (__c1 + (__s1 < __c0));					\
  } while (0)
#endif

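/* The 4-limb array filled in by mpn_mod_1_1p_cps and consumed by
   mpn_mod_1_1p holds: cps[0], the invert_limb inverse of the normalized
   divisor; cps[1], the normalization count; and, in cps[2] and cps[3],
   values congruent to B mod b and B^2 mod b (B = 2^GMP_LIMB_BITS), not
   necessarily fully reduced.  The exact representation of cps[2] and
   cps[3] differs between the two methods below (METHOD 2 sets cps[2]
   only for unnormalized b).  */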
#if MOD_1_1P_METHOD == 1
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  B1modb = -b;
  if (LIKELY (cnt != 0))
    B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

  /* In the normalized case, this can be simplified to
   *
   *   B2modb = - b * bi;
   *   ASSERT (B2modb <= b);    // NB: equality iff b = B/2
   */
  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;
}

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  mp_limb_t rh, rl, bi, ph, pl, r;
  mp_limb_t B1modb, B2modb;
  mp_size_t i;
  int cnt;
  mp_limb_t mask;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  B1modb = bmodb[2];
  B2modb = bmodb[3];

  rl = ap[n - 1];
  umul_ppmm (ph, pl, rl, B1modb);
  add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]);

  for (i = n - 3; i >= 0; i -= 1)
    {
      /* rr = ap[i]				< B
	    + LO(rr)  * (B mod b)		<= (B-1)(b-1)
	    + HI(rr)  * (B^2 mod b)		<= (B-1)(b-1)
      */
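      /* The sum cannot overflow two limbs: B1modb + B2modb < B, so
	 rr <= (B-1) * (1 + B1modb + B2modb) < B^2.  */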
      umul_ppmm (ph, pl, rl, B1modb);
      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]);

      umul_ppmm (rh, rl, rh, B2modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
    }

  cnt = bmodb[1];
  bi = bmodb[0];

  if (LIKELY (cnt != 0))
    rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));

  mask = -(mp_limb_t) (rh >= b);
  rh -= mask & b;

  udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);

  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 1 */

#if MOD_1_1P_METHOD == 2
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  if (LIKELY (cnt != 0))
    {
      mp_limb_t B1modb = -b;
      B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
      ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
      cps[2] = B1modb >> cnt;
    }
  B2modb = - b * bi;
  ASSERT (B2modb <= b);    // NB: equality iff b = B/2
  cps[3] = B2modb;
}

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  int cnt;
  mp_limb_t bi, B1modb;
  mp_limb_t r0, r1;
  mp_limb_t r;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  r0 = ap[n-2];
  r1 = ap[n-1];

  if (n > 2)
    {
      mp_limb_t B2modb, B2mb;
      mp_limb_t p0, p1;
      mp_limb_t r2;
      mp_size_t j;

      B2modb = bmodb[3];
      B2mb = B2modb - b;

      umul_ppmm (p1, p0, r1, B2modb);
      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);

      for (j = n-4; j >= 0; j--)
	{
	  mp_limb_t cy;
	  /* mp_limb_t t = r0 + B2mb; */
	  umul_ppmm (p1, p0, r1, B2modb);

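	  /* A carry out of the previous add_mssaaaa represents B^2, which
	     is congruent to B2modb mod b; fold it into r0.  If that
	     addition overflows, subtract b, which keeps the value in one
	     limb and preserves the residue mod b.  */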
	  ADDC_LIMB (cy, r0, r0, r2 & B2modb);
	  /* Alternative, for cmov: if (cy) r0 = t; */
	  r0 -= (-cy) & b;
	  add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
	}

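      /* Fold the final carry mask: when r2 is set, the last add_mssaaaa
	 carried out (an amount of B^2) and left r1 < b, so subtracting b
	 wraps r1 around to r1 + B - b; the two-limb value thus gains
	 (B-b)*B, which is congruent to the carried B^2 mod b.  */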
      r1 -= (r2 & b);
    }

  cnt = bmodb[1];

  if (LIKELY (cnt != 0))
    {
      mp_limb_t t;
      mp_limb_t B1modb = bmodb[2];

      umul_ppmm (r1, t, r1, B1modb);
      r0 += t;
      r1 += (r0 < t);

      /* Normalize */
      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
      r0 <<= cnt;

      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that. */
    }
  else
    {
      mp_limb_t mask = -(mp_limb_t) (r1 >= b);
      r1 -= mask & b;
    }

  bi = bmodb[0];

  udiv_rnnd_preinv (r, r1, r0, b, bi);
  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 2 */