1 /* mpn_modexact_1c_odd -- mpn by limb exact division style remainder.
2 
3    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
4    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
5    FUTURE GNU MP RELEASES.
6 
7 Copyright 2000-2004 Free Software Foundation, Inc.
8 
9 This file is part of the GNU MP Library.
10 
11 The GNU MP Library is free software; you can redistribute it and/or modify
12 it under the terms of either:
13 
14   * the GNU Lesser General Public License as published by the Free
15     Software Foundation; either version 3 of the License, or (at your
16     option) any later version.
17 
18 or
19 
20   * the GNU General Public License as published by the Free Software
21     Foundation; either version 2 of the License, or (at your option) any
22     later version.
23 
24 or both in parallel, as here.
25 
26 The GNU MP Library is distributed in the hope that it will be useful, but
27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
29 for more details.
30 
31 You should have received copies of the GNU General Public License and the
32 GNU Lesser General Public License along with the GNU MP Library.  If not,
33 see https://www.gnu.org/licenses/.  */
34 
35 #include "gmp-impl.h"
36 #include "longlong.h"
37 
38 
39 /* Calculate an r satisfying
40 
41            r*B^k + a - c == q*d
42 
43    where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1
44    (the caller won't know which), and q is the quotient (discarded).  d must
45    be odd, c can be any limb value.
46 
47    If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d.
48 
49    This slightly strange function suits the initial Nx1 reduction for GCDs
50    or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving
51    -r == a mod d (by passing c=0).  For a GCD the factor of -1 on r can be
52    ignored, or for the Jacobi symbol it can be accounted for.  The function
53    also suits divisibility and congruence testing since if r=0 (or r=d) is
54    obtained then a==c mod d.
55 
56 
57    r is a bit like the remainder returned by mpn_divexact_by3c, and is the
58    sort of remainder mpn_divexact_1 might return.  Like mpn_divexact_by3c, r
59    represents a borrow, since effectively quotient limbs are chosen so that
60    subtracting that multiple of d from src at each step will produce a zero
61    limb.
62 
63    A long calculation can be done piece by piece from low to high by passing
64    the return value from one part as the carry parameter to the next part.
65    The effective final k becomes anything between size and size-n, if n
66    pieces are used.
67 
68 
69    A similar sort of routine could be constructed based on adding multiples
70    of d at each limb, much like redc in mpz_powm does.  Subtracting however
71    has a small advantage that when subtracting to cancel out l there's never
72    a borrow into h, whereas using an addition would put a carry into h
73    depending whether l==0 or l!=0.
74 
75 
76    In terms of efficiency, this function is similar to a mul-by-inverse
77    mpn_mod_1.  Both are essentially two multiplies and are best suited to
78    CPUs with low latency multipliers (in comparison to a divide instruction
79    at least.)  But modexact has a few less supplementary operations, only
80    needs low part and high part multiplies, and has fewer working quantities
81    (helping CPUs with few registers).
82 
83 
84    In the main loop it will be noted that the new carry (call it r) is the
85    sum of the high product h and any borrow from l=s-c.  If c<d then we will
86    have r<d too, for the following reasons.  Let q=l*inverse be the quotient
87    limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS.  Now if h=d-1 then
88 
89        l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d
90 
91    But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will
92    never have h=d-1 and so r=h+borrow <= d-1.
93 
94    When c>=d, on the other hand, h=d-1 can certainly occur together with a
95    borrow, thereby giving only r<=d, as per the function definition above.
96 
97    As a design decision it's left to the caller to check for r=d if it might
98    be passing c>=d.  Several applications have c<d initially so the extra
99    test is often unnecessary, for example the GCDs or a plain divisibility
100    d|a test will pass c=0.
101 
102 
103    The special case for size==1 is so that it can be assumed c<=d in the
104    high<=divisor test at the end.  c<=d is only guaranteed after at least
105    one iteration of the main loop.  There's also a decent chance one % is
106    faster than a binvert_limb, though that will depend on the processor.
107 
108    A CPU specific implementation might want to omit the size==1 code or the
109    high<divisor test.  mpn/x86/k6/mode1o.asm for instance finds neither
110    useful.  */
111 
112 
113 mp_limb_t
mpn_modexact_1c_odd(mp_srcptr src,mp_size_t size,mp_limb_t d,mp_limb_t orig_c)114 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d,
115                      mp_limb_t orig_c)
116 {
117   mp_limb_t  s, h, l, inverse, dummy, dmul, ret;
118   mp_limb_t  c = orig_c;
119   mp_size_t  i;
120 
121   ASSERT (size >= 1);
122   ASSERT (d & 1);
123   ASSERT_MPN (src, size);
124   ASSERT_LIMB (d);
125   ASSERT_LIMB (c);
126 
127   if (size == 1)
128     {
129       s = src[0];
130       if (s > c)
131 	{
132 	  l = s-c;
133 	  h = l % d;
134 	  if (h != 0)
135 	    h = d - h;
136 	}
137       else
138 	{
139 	  l = c-s;
140 	  h = l % d;
141 	}
142       return h;
143     }
144 
145 
146   binvert_limb (inverse, d);
147   dmul = d << GMP_NAIL_BITS;
148 
149   i = 0;
150   do
151     {
152       s = src[i];
153       SUBC_LIMB (c, l, s, c);
154       l = (l * inverse) & GMP_NUMB_MASK;
155       umul_ppmm (h, dummy, l, dmul);
156       c += h;
157     }
158   while (++i < size-1);
159 
160 
161   s = src[i];
162   if (s <= d)
163     {
164       /* With high<=d the final step can be a subtract and addback.  If c==0
165 	 then the addback will restore to l>=0.  If c==d then will get l==d
166 	 if s==0, but that's ok per the function definition.  */
167 
168       l = c - s;
169       if (c < s)
170 	l += d;
171 
172       ret = l;
173     }
174   else
175     {
176       /* Can't skip a divide, just do the loop code once more. */
177 
178       SUBC_LIMB (c, l, s, c);
179       l = (l * inverse) & GMP_NUMB_MASK;
180       umul_ppmm (h, dummy, l, dmul);
181       c += h;
182       ret = c;
183     }
184 
185   ASSERT (orig_c < d ? ret < d : ret <= d);
186   return ret;
187 }
188 
189 
190 
191 #if 0
192 
193 /* The following is an alternate form that might shave one cycle on a
194    superscalar processor since it takes c+=h off the dependent chain,
195    leaving just a low product, high product, and a subtract.
196 
197    This is for CPU specific implementations to consider.  A special case for
198    high<divisor and/or size==1 can be added if desired.
199 
200    Notice that c is only ever 0 or 1, since if s-c produces a borrow then
201    x=0xFF..FF and x-h cannot produce a borrow.  The c=(x>s) could become
202    c=(x==0xFF..FF) too, if that helped.  */
203 
204 mp_limb_t
205 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h)
206 {
207   mp_limb_t  s, x, y, inverse, dummy, dmul, c1, c2;
208   mp_limb_t  c = 0;
209   mp_size_t  i;
210 
211   ASSERT (size >= 1);
212   ASSERT (d & 1);
213 
214   binvert_limb (inverse, d);
215   dmul = d << GMP_NAIL_BITS;
216 
217   for (i = 0; i < size; i++)
218     {
219       ASSERT (c==0 || c==1);
220 
221       s = src[i];
222       SUBC_LIMB (c1, x, s, c);
223 
224       SUBC_LIMB (c2, y, x, h);
225       c = c1 + c2;
226 
227       y = (y * inverse) & GMP_NUMB_MASK;
228       umul_ppmm (h, dummy, y, dmul);
229     }
230 
231   h += c;
232   return h;
233 }
234 
235 #endif
236