/* mpn_mod_1_1p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008-2011, 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#ifndef MOD_1_1P_METHOD
# define MOD_1_1P_METHOD 1    /* need to make sure this is 2 for asm testing */
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
 * carry out, in the form of a mask. */
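/* That is, (s1,s0) receives the two-limb sum (a1,a0) + (b1,b0), and m is set
 * from the carry out of the high limb: 0 when there is no carry, ~(mp_limb_t) 0
 * (all ones) when there is, matching the generic C fallback below. */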

#if defined (__GNUC__)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %k2\n\t" \
            "adc %4, %k1\n\t" \
            "sbb %k0, %k0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
              "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %q2\n\t" \
            "adc %4, %q1\n\t" \
            "sbb %q0, %q0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
              "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxcc %r3, %4, %1\n\t" \
            "subx %%g0, %%g0, %0" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addccc %r7, %8, %%g0\n\t" \
            "addccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
              "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
            __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add%I6c %2, %5, %6\n\t" \
            "adde %1, %3, %4\n\t" \
            "subfe %0, %0, %0\n\t" \
            "nor %0, %0, %0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "algr %2, %6\n\t" \
            "alcgr %1, %4\n\t" \
            "lghi %0, 0\n\t" \
            "alcgr %0, %0\n\t" \
            "lcgr %0, %0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \
              "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "adds %2, %5, %6\n\t" \
            "adcs %1, %3, %4\n\t" \
            "movcc %0, #0\n\t" \
            "movcs %0, #-1" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  do { \
    UWtype __s0, __s1, __c0, __c1; \
    __s0 = (a0) + (b0); \
    __s1 = (a1) + (b1); \
    __c0 = __s0 < (a0); \
    __c1 = __s1 < (a1); \
    (s0) = __s0; \
    __s1 = __s1 + __c0; \
    (s1) = __s1; \
    (m) = - (__c1 + (__s1 < __c0)); \
  } while (0)
#endif
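
/* Usage sketch (not part of the interface proper): the two entry points are
   meant to be used as a pair, with the divisor passed to mpn_mod_1_1p already
   shifted left by the count stored by the _cps call, as the final
   udiv_rnnd_preinv and `r >> cnt' below assume:

     mp_limb_t cps[4], r;
     mpn_mod_1_1p_cps (cps, b);
     r = mpn_mod_1_1p (ap, n, b << cps[1], cps);
*/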

#if MOD_1_1P_METHOD == 1
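/* For this method, the precomputed array filled in by mpn_mod_1_1p_cps holds:
     cps[0]  limb inverse of the divisor shifted so its high bit is set
     cps[1]  the normalization shift count
     cps[2]  B mod b    (B = 2^GMP_LIMB_BITS; not always fully reduced)
     cps[3]  B^2 mod b */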
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  B1modb = -b;
  if (LIKELY (cnt != 0))
    B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);		/* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

  /* In the normalized case, this can be simplified to
   *
   *   B2modb = - b * bi;
   *   ASSERT (B2modb <= b);	// NB: equality iff b = B/2
   */
  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;
}

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  mp_limb_t rh, rl, bi, ph, pl, r;
  mp_limb_t B1modb, B2modb;
  mp_size_t i;
  int cnt;
  mp_limb_t mask;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  B1modb = bmodb[2];
  B2modb = bmodb[3];

  rl = ap[n - 1];
  umul_ppmm (ph, pl, rl, B1modb);
  add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]);

  for (i = n - 3; i >= 0; i -= 1)
    {
      /* rr = ap[i]                      < B
            + LO(rr) * (B mod b)         <= (B-1)(b-1)
            + HI(rr) * (B^2 mod b)       <= (B-1)(b-1)
      */
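      /* So the new rr is congruent to the old rr * B + ap[i] modulo the
         divisor, since bmodb[2] and bmodb[3] are congruent to B and B^2.  */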
      umul_ppmm (ph, pl, rl, B1modb);
      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]);

      umul_ppmm (rh, rl, rh, B2modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
    }

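  /* Fold-up: scale the two-limb residue to match the normalized divisor, do
     one conditional subtraction so the high limb is below b, and let a single
     udiv_rnnd_preinv produce the remainder of the scaled value; the final
     right shift undoes the scaling.  */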
  cnt = bmodb[1];
  bi = bmodb[0];

  if (LIKELY (cnt != 0))
    rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));

  mask = -(mp_limb_t) (rh >= b);
  rh -= mask & b;

  udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);

  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 1 */

#if MOD_1_1P_METHOD == 2
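/* Method 2 differs from method 1 mainly in bookkeeping: mpn_mod_1_1p_cps
   stores B^2 mod b in cps[3] without shifting it down, and mpn_mod_1_1p keeps
   the residue in a slightly redundant form (a two-limb value plus a carry
   mask), deferring all normalization to the very end.  */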
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  if (LIKELY (cnt != 0))
    {
      mp_limb_t B1modb = -b;
      B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
      ASSERT (B1modb <= b);	/* NB: not fully reduced mod b */
      cps[2] = B1modb >> cnt;
    }
  B2modb = - b * bi;
  ASSERT (B2modb <= b);		/* NB: equality iff b = B/2 */
  cps[3] = B2modb;
}

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  int cnt;
  mp_limb_t bi, B1modb;
  mp_limb_t r0, r1;
  mp_limb_t r;

  ASSERT (n >= 2);		/* fix tuneup.c if this is changed */

  r0 = ap[n-2];
  r1 = ap[n-1];

  if (n > 2)
    {
      mp_limb_t B2modb, B2mb;
      mp_limb_t p0, p1;
      mp_limb_t r2;
      mp_size_t j;

      B2modb = bmodb[3];
      B2mb = B2modb - b;

      umul_ppmm (p1, p0, r1, B2modb);
      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);

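      /* r2 is the mask produced by add_mssaaaa: all ones when the two-limb
         addition carried out, zero otherwise.  A set mask stands for an extra
         B^2, which is folded back in below by adding B2modb (== B^2 mod b) to
         r0; if that addition overflows, subtracting b keeps the value
         congruent mod b without propagating a carry into r1.  */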
      for (j = n-4; j >= 0; j--)
        {
          mp_limb_t cy;
          /* mp_limb_t t = r0 + B2mb; */
          umul_ppmm (p1, p0, r1, B2modb);

          ADDC_LIMB (cy, r0, r0, r2 & B2modb);
          /* Alternative, for cmov: if (cy) r0 = t; */
          r0 -= (-cy) & b;
          add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
        }

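      /* Account for a final carry mask: replacing the implicit B^2 by
         (B - b) * B in the high limb changes the value by b * B, a multiple
         of b, so the residue is preserved and r1 stays a single limb.  */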
      r1 -= (r2 & b);
    }

  cnt = bmodb[1];

  if (LIKELY (cnt != 0))
    {
      mp_limb_t t;
      mp_limb_t B1modb = bmodb[2];

      umul_ppmm (r1, t, r1, B1modb);
      r0 += t;
      r1 += (r0 < t);

      /* Normalize */
      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
      r0 <<= cnt;

      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that.  */
    }
  else
    {
      mp_limb_t mask = -(mp_limb_t) (r1 >= b);
      r1 -= mask & b;
    }

  bi = bmodb[0];

  udiv_rnnd_preinv (r, r1, r0, b, bi);
  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 2 */