/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"


/* Calculate an r satisfying

           r*B^k + a - c == q*d

   where B=2^GMP_NUMB_BITS, a is {src,size}, k is either size or size-1
   (the caller won't know which), and q is the quotient (discarded).  d must
   be odd, c can be any limb value.

   If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.

   This slightly strange function suits the initial Nx1 reduction for GCDs
   or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
   -r == a mod d (by passing c=0).  For a GCD the factor of -1 on r can be
   ignored, or for the Jacobi symbol it can be accounted for.  The function
   also suits divisibility and congruence testing since if r=0 (or r=d) is
   obtained then a==c mod d.
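
   As a worked example added purely for illustration (not taken from the GMP
   sources), imagine artificially small 4-bit limbs, so B=16.  With d=7, c=0
   and {src,size} = {10,13}, ie. a = 13*16+10 = 218, the code below would
   return r=5, and indeed 5*16^2 + 218 - 0 = 1498 = 214*7, so the defining
   relation holds with k=size=2.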


   r is a bit like the remainder returned by mpn_divexact_by3c, and is the
   sort of remainder mpn_divexact_1 might return.  Like mpn_divexact_by3c, r
   represents a borrow, since effectively quotient limbs are chosen so that
   subtracting that multiple of d from src at each step will produce a zero
   limb.

   A long calculation can be done piece by piece from low to high by passing
   the return value from one part as the carry parameter to the next part.
   The effective final k becomes anything between size and size-n, if n
   pieces are used.
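
   For instance (an illustrative sketch added here, not code taken from GMP
   itself, with n1 a hypothetical split point), {src,size} could be reduced
   in two pieces as

       c1 = mpn_modexact_1c_odd (src, n1, d, 0);
       r  = mpn_modexact_1c_odd (src+n1, size-n1, d, c1);

   for any 0 < n1 < size, giving the same sort of r with the effective k
   somewhere between size and size-2.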


   A similar sort of routine could be constructed based on adding multiples
   of d at each limb, much like redc in mpz_powm does.  Subtracting, however,
   has the small advantage that when subtracting to cancel out l there's
   never a borrow into h, whereas using an addition would put a carry into h
   depending on whether l==0 or l!=0.


   In terms of efficiency, this function is similar to a mul-by-inverse
   mpn_mod_1.  Both are essentially two multiplies and are best suited to
   CPUs with low latency multipliers (in comparison to a divide instruction,
   at least).  But modexact has a few fewer supplementary operations, only
   needs the low part and high part multiplies, and has fewer working
   quantities (helping CPUs with few registers).


   In the main loop it will be noted that the new carry (call it r) is the
   sum of the high product h and any borrow from l=s-c.  If c<d then we will
   have r<d too, for the following reasons.  Let q=l*inverse be the quotient
   limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS.  Now if h=d-1 then

       l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d

   But if l=s-c produces a borrow when c<d, then l>=B-d+1, hence h=d-1 can
   never occur, and so r=h+borrow <= d-1.  (Without a borrow, r=h <= d-1
   holds anyway, since q <= B-1 gives h <= d-1.)
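
   (As a small numerical check added here, with B=16 and d=7: h=d-1=6 would
   require l <= B-d = 9, whereas a borrow from l=s-c with c<=6 forces
   l >= B-d+1 = 10.)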

   When c>=d, on the other hand, h=d-1 can certainly occur together with a
   borrow, thereby giving only r<=d, as per the function definition above.

   As a design decision it's left to the caller to check for r=d if it might
   be passing c>=d.  Several applications have c<d initially, so the extra
   test is often unnecessary; for example, the GCDs or a plain divisibility
   d|a test will pass c=0.


   The special case for size==1 is so that it can be assumed c<=d in the
   high<=divisor test at the end.  c<=d is only guaranteed after at least
   one iteration of the main loop.  There's also a decent chance one % is
   faster than a binvert_limb, though that will depend on the processor.
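
   (In the defining relation above, the size==1 case corresponds to k=0: the
   code simply returns (c - src[0]) mod d, reduced into the range 0<=r<d.)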

   A CPU specific implementation might want to omit the size==1 code or the
   high<divisor test.  mpn/x86/k6/mode1o.asm for instance finds neither
   useful.  */


mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
                     mp_limb_t orig_c)
{
  mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
  mp_limb_t  c = orig_c;
  mp_size_t  i;

  ASSERT (size >= 1);
  ASSERT (d & 1);
  ASSERT_MPN (src, size);
  ASSERT_LIMB (d);
  ASSERT_LIMB (c);

  if (size == 1)
    {
      s = src[0];
      if (s > c)
        {
          l = s-c;
          h = l % d;
          if (h != 0)
            h = d - h;
        }
      else
        {
          l = c-s;
          h = l % d;
        }
      return h;
    }


  binvert_limb (inverse, d);
  dmul = d << GMP_NAIL_BITS;

  i = 0;
  do
    {
      s = src[i];
      SUBC_LIMB (c, l, s, c);              /* l = s-c, c = borrow out        */
      l = (l * inverse) & GMP_NUMB_MASK;   /* quotient limb q = l*inverse    */
      umul_ppmm (h, dummy, l, dmul);       /* q*d = h*B + l, low l cancelled */
      c += h;                              /* new carry is h plus the borrow */
    }
  while (++i < size-1);


  s = src[i];
  if (s <= d)
    {
      /* With high<=d the final step can be a subtract and addback.  If c==0
         then the addback will restore to l>=0.  If c==d then we will get
         l==d when s==0, but that's ok per the function definition.  */

      l = c - s;
      if (c < s)
        l += d;

      ret = l;
    }
  else
    {
      /* Can't skip a divide, just do the loop code once more. */

      SUBC_LIMB (c, l, s, c);
      l = (l * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, l, dmul);
      c += h;
      ret = c;
    }

  ASSERT (orig_c < d ? ret < d : ret <= d);
  return ret;
}



#if 0

/* The following is an alternate form that might shave one cycle on a
   superscalar processor since it takes c+=h off the dependent chain,
   leaving just a low product, high product, and a subtract.

   This is for CPU specific implementations to consider.  A special case for
   high<divisor and/or size==1 can be added if desired.

   Notice that c is only ever 0 or 1, since if s-c produces a borrow then
   x=0xFF..FF and x-h cannot produce a borrow.  The c=(x>s) could become
   c=(x==0xFF..FF) too, if that helped.  */

mp_limb_t
mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
{
  mp_limb_t  s, x, y, inverse, dummy, dmul, c1, c2;
  mp_limb_t  c = 0;
  mp_size_t  i;

  ASSERT (size >= 1);
  ASSERT (d & 1);

  binvert_limb (inverse, d);
  dmul = d << GMP_NAIL_BITS;

  for (i = 0; i < size; i++)
    {
      ASSERT (c==0 || c==1);

      s = src[i];
      SUBC_LIMB (c1, x, s, c);

      SUBC_LIMB (c2, y, x, h);
      c = c1 + c2;

      y = (y * inverse) & GMP_NUMB_MASK;
      umul_ppmm (h, dummy, y, dmul);
    }

  h += c;
  return h;
}

#endif
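

#if 0

/* Not code from GMP: an illustrative sketch added here showing how a caller
   might use mpn_modexact_1c_odd for a plain divisibility test d|a with d
   odd, as described in the comments at the top of this file.  The name
   modexact_1_divisible_p is hypothetical.  Passing c=0 (so c<d) guarantees
   0<=r<d, hence only r==0 needs to be checked.  */

static int
modexact_1_divisible_p (mp_srcptr src, mp_size_t size, mp_limb_t d)
{
  ASSERT (size >= 1);
  ASSERT (d & 1);

  return mpn_modexact_1c_odd (src, size, d, CNST_LIMB(0)) == 0;
}

#endif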