/* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder.

Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation,
Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#include "mpn/sparc64/sparc64.h"


/*                 64-bit divisor   32-bit divisor
                    cycles/limb      cycles/limb
                     (approx)         (approx)
   Ultrasparc 2i:      160               120
*/


/* 32-bit divisors are treated in special case code.  This requires 4 mulx
   per limb instead of 8 in the general case.

   For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
   addressing, to get the two halves of each limb read in the correct order.
   This is kept in an adj variable.  Doing that measures about 6 c/l faster
   than just writing HALF_ENDIAN_ADJ(i) in the loop.  The latter shouldn't
   be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc
   3.2.1 at least).

   A simple udivx/umulx loop for the 32-bit case was attempted for small
   sizes, but at size==2 it was only about the same speed and at size==3 was
   slower.  */
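
/* Illustration of the endian adjustment (assuming the HAVE_LIMB_BIG_ENDIAN
   definition of HALF_ENDIAN_ADJ in gmp-impl.h, which is +1 for an even
   index and -1 for an odd one): on a big endian system limb k keeps its
   high half at halfword address 2*k and its low half at 2*k+1, so reading
   logical halfwords i = 3,2,1,0 as src[i + HALF_ENDIAN_ADJ(i)] touches
   addresses 2,3,0,1, delivering the halves in decreasing significance just
   as the loops below want.  */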

static mp_limb_t
mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
{
  int        norm, norm_rshift;
  mp_limb_t  src_high_limb;
  mp_size_t  i;

  ASSERT (size_limbs >= 0);
  ASSERT (d_limb != 0);

  if (UNLIKELY (size_limbs == 0))
    return 0;

  src_high_limb = src_limbptr[size_limbs-1];

  /* udivx is good for size==1, and no need to bother checking limb<divisor,
     since if that's likely the caller should check */
  if (UNLIKELY (size_limbs == 1))
    return src_high_limb % d_limb;

  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
    {
      unsigned   *src, n1, n0, r, dummy_q, nshift, norm_rmask;
      mp_size_t  size, adj;
      mp_limb_t  dinv_limb;

      size = 2 * size_limbs;    /* halfwords */
      src = (unsigned *) src_limbptr;

      /* prospective initial remainder, if < d */
      r = src_high_limb >> 32;

      /* If the length of the source is uniformly distributed, then there's
         a 50% chance of the high 32-bits being zero, which we can skip.  */
      if (r == 0)
        {
          r = (unsigned) src_high_limb;
          size--;
          ASSERT (size > 0);  /* because always even */
        }

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible.  */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0);  /* because size==1 handled above */
        }
      else
        r = 0;

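      /* Normalize the 32-bit divisor.  count_leading_zeros_32 evidently
         counts within the full 64-bit limb (hence the -32), leaving the
         shift that puts the divisor's top bit at bit 31, the normalized
         form the preinv scheme below expects.  */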
      count_leading_zeros_32 (norm, d_limb);
      norm -= 32;
      d_limb <<= norm;

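      /* When norm == 0, norm_rshift would be 32, an undefined shift count
         for a 32-bit operand in C; norm_rmask zeros the result in that
         case so whatever such a shift produces is harmless.  */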
      norm_rshift = 32 - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
      i = size-1;
      adj = HALF_ENDIAN_ADJ (i);
      n1 = src [i + adj];
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

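      /* invert_half_limb (from sparc64.h) precomputes the reciprocal that
         udiv_qrnnd_half_preinv consumes.  adj is merely negated each
         iteration: HALF_ENDIAN_ADJ(i) depends only on the parity of i,
         which alternates as i counts down; this is the hoisting described
         in the comment at the top of the file.  */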
      invert_half_limb (dinv_limb, d_limb);
      adj = -adj;

      for (i--; i >= 0; i--)
        {
          n0 = src [i + adj];
          adj = -adj;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
          n1 = n0;
        }

      /* same as loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);

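      /* The loop computed the remainder of the shifted-up operands, so r
         carries a factor of 2**norm; its low norm bits are zero and the
         true remainder is r >> norm.  */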
      ASSERT ((r & ((1 << norm) - 1)) == 0);
      return r >> norm;
    }
  else
    {
      mp_srcptr  src;
      mp_size_t  size;
      mp_limb_t  n1, n0, r, dinv, dummy_q, nshift, norm_rmask;

      src = src_limbptr;
      size = size_limbs;
      r = src_high_limb;  /* initial remainder */

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible.  */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0);  /* because size==1 handled above */
        }
      else
        r = 0;

      count_leading_zeros (norm, d_limb);
      d_limb <<= norm;

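      /* Here d_limb > 0xFFFFFFFF, so norm <= 31 and at most 31 bits come
         through the right shift below; a 32-bit norm_rmask therefore
         suffices, and as in the half-limb case it also covers the
         undefined full-width shift when norm == 0.  */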
      norm_rshift = GMP_LIMB_BITS - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);

      src += size;
      n1 = *--src;
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

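      /* For a normalized divisor, invert_limb computes the usual GMP
         preinverse floor ((B*B - 1) / d) - B with B = 2**64, letting
         udiv_qrnnd_preinv reduce each two-limb chunk with multiplies
         instead of a hardware divide.  */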
      invert_limb (dinv, d_limb);

      for (i = size-2; i >= 0; i--)
        {
          n0 = *--src;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
          n1 = n0;
        }

      /* same as loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);

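      /* As in the half-limb case, r is the remainder of the shifted-up
         operands and carries a factor of 2**norm.  */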
      ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0);
      return r >> norm;
    }
}

mp_limb_t
mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
{
  ASSERT (n >= 0);
  ASSERT (b != 0);

  /* Should this be handled at all?  Rely on callers?  Note n==0 is
     currently required by mpz/fdiv_r_ui.c and possibly other places.  */
  if (n == 0)
    return 0;

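  /* Dispatch to mpn_mod_1_anynorm below the thresholds, and to the
     mod_1_1/mod_1s_2/mod_1s_4 schemes above them; a divisor with the high
     bit set uses the normalized MOD_1N threshold, any other the MOD_1U
     one.  The _cps functions fill pre[] with precomputed constants,
     including a normalization shift count in pre[1], which is why the
     unnormalized cases pass b << pre[1].  */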
  if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
    {
      if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
	{
	  return mpn_mod_1_anynorm (ap, n, b);
	}
      else
	{
	  mp_limb_t pre[4];
	  mpn_mod_1_1p_cps (pre, b);
	  return mpn_mod_1_1p (ap, n, b, pre);
	}
    }
  else
    {
      if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
	{
	  return mpn_mod_1_anynorm (ap, n, b);
	}
      else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
	{
	  mp_limb_t pre[4];
	  mpn_mod_1_1p_cps (pre, b);
	  return mpn_mod_1_1p (ap, n, b << pre[1], pre);
	}
      else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
	{
	  mp_limb_t pre[5];
	  mpn_mod_1s_2p_cps (pre, b);
	  return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
	}
      else
	{
	  mp_limb_t pre[7];
	  mpn_mod_1s_4p_cps (pre, b);
	  return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
	}
    }
}