1 /* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder.
2
3 Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation,
4 Inc.
5
6 This file is part of the GNU MP Library.
7
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15 or
16
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
20
21 or both in parallel, as here.
22
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 for more details.
27
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library. If not,
30 see https://www.gnu.org/licenses/. */
31
32 #include "gmp-impl.h"
33 #include "longlong.h"
34
35 #include "mpn/sparc64/sparc64.h"
36
37
38 /* 64-bit divisor 32-bit divisor
39 cycles/limb cycles/limb
40 (approx) (approx)
41 Ultrasparc 2i: 160 120
42 */
43
44
45 /* 32-bit divisors are treated in special case code. This requires 4 mulx
46 per limb instead of 8 in the general case.
47
48 For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
49 addressing, to get the two halves of each limb read in the correct order.
50 This is kept in an adj variable. Doing that measures about 6 c/l faster
51 than just writing HALF_ENDIAN_ADJ(i) in the loop. The latter shouldn't
52 be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc
53 3.2.1 at least).
54
55 A simple udivx/umulx loop for the 32-bit case was attempted for small
56 sizes, but at size==2 it was only about the same speed and at size==3 was
57 slower. */
58
/* Return {src_limbptr, size_limbs} mod d_limb, for any divisor d_limb
   (normalized or not).  The divisor is shifted up to normalize it, a
   multiply-by-inverse replaces each hardware division, and the remainder is
   shifted back down at the end.  */
static mp_limb_t
mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
{
  int norm, norm_rshift;
  mp_limb_t src_high_limb;
  mp_size_t i;

  ASSERT (size_limbs >= 0);
  ASSERT (d_limb != 0);

  if (UNLIKELY (size_limbs == 0))
    return 0;

  src_high_limb = src_limbptr[size_limbs-1];

  /* udivx is good for size==1, and no need to bother checking limb<divisor,
     since if that's likely the caller should check */
  if (UNLIKELY (size_limbs == 1))
    return src_high_limb % d_limb;

  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
    {
      /* 32-bit divisor special case: work in 32-bit halfwords so each step
         needs only half-limb multiplies (4 mulx/limb instead of 8, per the
         file head comment).  */
      unsigned *src, n1, n0, r, dummy_q, nshift, norm_rmask;
      mp_size_t size, adj;
      mp_limb_t dinv_limb;

      size = 2 * size_limbs; /* halfwords */
      src = (unsigned *) src_limbptr;

      /* prospective initial remainder, if < d */
      r = src_high_limb >> 32;

      /* If the length of the source is uniformly distributed, then there's
         a 50% chance of the high 32-bits being zero, which we can skip. */
      if (r == 0)
        {
          r = (unsigned) src_high_limb;
          size--;
          ASSERT (size > 0); /* because always even */
        }

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible. */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0); /* because size==1 handled above */
        }
      else
        r = 0;

      /* Normalize: count_leading_zeros_32 counts within a full limb, so
         subtract 32 to get the shift needed for a 32-bit quantity.  */
      count_leading_zeros_32 (norm, d_limb);
      norm -= 32;
      d_limb <<= norm;

      /* norm_rmask zeroes the ">> norm_rshift" contribution when norm==0,
         where the shift count would be the full 32-bit width (undefined
         behavior in C).  */
      norm_rshift = 32 - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
      i = size-1;
      /* adj is the big-endian halfword-order adjustment kept in a variable
         and negated each iteration; see the head comment for why this beats
         recomputing HALF_ENDIAN_ADJ(i) in the loop.  */
      adj = HALF_ENDIAN_ADJ (i);
      n1 = src [i + adj];
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

      invert_half_limb (dinv_limb, d_limb);
      adj = -adj;

      /* One preinv division per remaining halfword, high to low, feeding in
         the normalized window (n1,n0).  */
      for (i--; i >= 0; i--)
        {
          n0 = src [i + adj];
          adj = -adj;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
          n1 = n0;
        }

      /* same as loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);

      /* The low norm bits of r are the shifted-in zeros; unshift to get the
         true remainder.  */
      ASSERT ((r & ((1 << norm) - 1)) == 0);
      return r >> norm;
    }
  else
    {
      /* General case: full-limb preinverse division.  */
      mp_srcptr src;
      mp_size_t size;
      mp_limb_t n1, n0, r, dinv, dummy_q, nshift, norm_rmask;

      src = src_limbptr;
      size = size_limbs;
      r = src_high_limb; /* initial remainder */

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible. */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0); /* because size==1 handled above */
        }
      else
        r = 0;

      count_leading_zeros (norm, d_limb);
      d_limb <<= norm;

      /* In this branch d_limb > 0xFFFFFFFF, so norm <= 31 and the top norm
         bits extracted by ">> norm_rshift" always fit in 32 bits, making the
         0xFFFFFFFF mask sufficient even for 64-bit limbs.  As above, the
         mask also guards the norm==0 case where norm_rshift would equal the
         limb width.  */
      norm_rshift = GMP_LIMB_BITS - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);

      src += size;
      n1 = *--src;
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

      invert_limb (dinv, d_limb);

      /* One preinv division per remaining limb, high to low.  */
      for (i = size-2; i >= 0; i--)
        {
          n0 = *--src;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
          n1 = n0;
        }

      /* same as loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);

      /* Unshift the normalized remainder.  */
      ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0);
      return r >> norm;
    }
}
188
189 mp_limb_t
mpn_mod_1(mp_srcptr ap,mp_size_t n,mp_limb_t b)190 mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
191 {
192 ASSERT (n >= 0);
193 ASSERT (b != 0);
194
195 /* Should this be handled at all? Rely on callers? Note un==0 is currently
196 required by mpz/fdiv_r_ui.c and possibly other places. */
197 if (n == 0)
198 return 0;
199
200 if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
201 {
202 if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
203 {
204 return mpn_mod_1_anynorm (ap, n, b);
205 }
206 else
207 {
208 mp_limb_t pre[4];
209 mpn_mod_1_1p_cps (pre, b);
210 return mpn_mod_1_1p (ap, n, b, pre);
211 }
212 }
213 else
214 {
215 if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
216 {
217 return mpn_mod_1_anynorm (ap, n, b);
218 }
219 else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
220 {
221 mp_limb_t pre[4];
222 mpn_mod_1_1p_cps (pre, b);
223 return mpn_mod_1_1p (ap, n, b << pre[1], pre);
224 }
225 else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
226 {
227 mp_limb_t pre[5];
228 mpn_mod_1s_2p_cps (pre, b);
229 return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
230 }
231 else
232 {
233 mp_limb_t pre[7];
234 mpn_mod_1s_4p_cps (pre, b);
235 return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
236 }
237 }
238 }
239