1 /* hgcd2.c
2 
3    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
4    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
5    GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6 
7 Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
8 Inc.
9 
10 This file is part of the GNU MP Library.
11 
12 The GNU MP Library is free software; you can redistribute it and/or modify
13 it under the terms of either:
14 
15   * the GNU Lesser General Public License as published by the Free
16     Software Foundation; either version 3 of the License, or (at your
17     option) any later version.
18 
19 or
20 
21   * the GNU General Public License as published by the Free Software
22     Foundation; either version 2 of the License, or (at your option) any
23     later version.
24 
25 or both in parallel, as here.
26 
27 The GNU MP Library is distributed in the hope that it will be useful, but
28 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
30 for more details.
31 
32 You should have received copies of the GNU General Public License and the
33 GNU Lesser General Public License along with the GNU MP Library.  If not,
34 see https://www.gnu.org/licenses/.  */
35 
36 #include "gmp-impl.h"
37 #include "longlong.h"
38 
39 #ifndef HGCD2_DIV1_METHOD
40 #define HGCD2_DIV1_METHOD 3
41 #endif
42 
43 #ifndef HGCD2_DIV2_METHOD
44 #define HGCD2_DIV2_METHOD 2
45 #endif
46 
47 #if GMP_NAIL_BITS != 0
48 #error Nails not implemented
49 #endif
50 
51 #if HAVE_NATIVE_mpn_div_11
52 
53 #define div1 mpn_div_11
54 /* Single-limb division optimized for small quotients.
55    Returned value holds d0 = r, d1 = q. */
56 mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
57 
58 #elif HGCD2_DIV1_METHOD == 1
59 
60 static inline mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)61 div1 (mp_limb_t n0, mp_limb_t d0)
62 {
63   mp_double_limb_t res;
64   res.d1 = n0 / d0;
65   res.d0 = n0 - res.d1 * d0;
66 
67   return res;
68 }
69 
70 #elif HGCD2_DIV1_METHOD == 2
71 
72 static mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)73 div1 (mp_limb_t n0, mp_limb_t d0)
74 {
75   mp_double_limb_t res;
76   int ncnt, dcnt, cnt;
77   mp_limb_t q;
78   mp_limb_t mask;
79 
80   ASSERT (n0 >= d0);
81 
82   count_leading_zeros (ncnt, n0);
83   count_leading_zeros (dcnt, d0);
84   cnt = dcnt - ncnt;
85 
86   d0 <<= cnt;
87 
88   q = -(mp_limb_t) (n0 >= d0);
89   n0 -= d0 & q;
90   d0 >>= 1;
91   q = -q;
92 
93   while (--cnt >= 0)
94     {
95       mask = -(mp_limb_t) (n0 >= d0);
96       n0 -= d0 & mask;
97       d0 >>= 1;
98       q = (q << 1) - mask;
99     }
100 
101   res.d0 = n0;
102   res.d1 = q;
103   return res;
104 }
105 
106 #elif HGCD2_DIV1_METHOD == 3
107 
108 static inline mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)109 div1 (mp_limb_t n0, mp_limb_t d0)
110 {
111   mp_double_limb_t res;
112   if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
113       || UNLIKELY (n0 >= (d0 << 3)))
114     {
115       res.d1 = n0 / d0;
116       res.d0 = n0 - res.d1 * d0;
117     }
118   else
119     {
120       mp_limb_t q, mask;
121 
122       d0 <<= 2;
123 
124       mask = -(mp_limb_t) (n0 >= d0);
125       n0 -= d0 & mask;
126       q = 4 & mask;
127 
128       d0 >>= 1;
129       mask = -(mp_limb_t) (n0 >= d0);
130       n0 -= d0 & mask;
131       q += 2 & mask;
132 
133       d0 >>= 1;
134       mask = -(mp_limb_t) (n0 >= d0);
135       n0 -= d0 & mask;
136       q -= mask;
137 
138       res.d0 = n0;
139       res.d1 = q;
140     }
141   return res;
142 }
143 
144 #elif HGCD2_DIV1_METHOD == 4
145 
146 /* Table quotients.  We extract the NBITS most significant bits of the
147    numerator limb, and the corresponding bits from the divisor limb, and use
148    these to form an index into the table.  This method is probably only useful
149    for short pipelines with slow multiplication.
150 
151    Possible improvements:
152 
153    * Perhaps extract the highest NBITS of the divisor instead of the same bits
154      as from the numerator.  That would require another count_leading_zeros,
155      and a post-multiply shift of the quotient.
156 
157    * Compress tables?  Their values are tiny, and there are lots of zero
158      entries (which are never used).
159 
160    * Round the table entries more cleverly?
161 */
162 
163 #ifndef NBITS
164 #define NBITS 5
165 #endif
166 
167 #if NBITS == 5
168 /* This needs full division about 13.2% of the time. */
169 static const unsigned char tab[512] = {
170 17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
171 18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
172 19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
173 20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
174 21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
175 22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
176 23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
177 24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
178 25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
179 26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
180 27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
181 28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
182 29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
183 30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
184 31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
185 32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
186 };
187 #elif NBITS == 6
188 /* This needs full division about 9.8% of the time. */
189 static const unsigned char tab[2048] = {
190 33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191  1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192 34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193  1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194 35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195  1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196 36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197  1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198 37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
199  1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200 38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
201  1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202 39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
203  1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204 40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
205  1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206 41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
207  1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208 42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
209  1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210 43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
211  1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212 44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
213  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214 45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
215  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216 46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
217  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218 47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
219  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220 48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
221  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222 49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
223  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224 50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
225  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
226 51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
227  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
228 52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
229  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
230 53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
231  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
232 54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
233  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
234 55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
235  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
236 56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
237  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
238 57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
239  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
240 58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
241  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
242 59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
243  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
244 60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
245  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
246 61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
247  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
248 62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
249  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
250 63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
251  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
252 64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
253  1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
254 };
255 #else
256 #error No table for provided NBITS
257 #endif
258 
259 /* Doing tabp with a #define makes compiler warnings about pointing outside an
260    object go away.  We used to define this as a variable.  It is not clear if
261    e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
262    (vector[100] + 10) - 10 surely is and there is no sequence point so the
263    expressions should be equivalent.  To make this safe, we might want to
264    define tabp as a macro with the index as an argument.  Depending on the
265    platform, relocs might allow for assembly-time or linker-time resolution to
266    take place. */
267 #define tabp (tab - (1 << (NBITS - 1) << NBITS))
268 
269 static inline mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)270 div1 (mp_limb_t n0, mp_limb_t d0)
271 {
272   int ncnt;
273   size_t nbi, dbi;
274   mp_limb_t q0;
275   mp_limb_t r0;
276   mp_limb_t mask;
277   mp_double_limb_t res;
278 
279   ASSERT (n0 >= d0);		/* Actually only msb position is critical. */
280 
281   count_leading_zeros (ncnt, n0);
282   nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
283   dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
284 
285   q0 = tabp[(nbi << NBITS) + dbi];
286   r0 = n0 - q0 * d0;
287   mask = -(mp_limb_t) (r0 >= d0);
288   q0 -= mask;
289   r0 -= d0 & mask;
290 
291   if (UNLIKELY (r0 >= d0))
292     {
293       q0 = n0 / d0;
294       r0 = n0 - q0 * d0;
295     }
296 
297   res.d1 = q0;
298   res.d0 = r0;
299   return res;
300 }
301 
302 #elif HGCD2_DIV1_METHOD == 5
303 
304 /* Table inverses of divisors.  We don't bother with suppressing the msb from
305    the tables.  We index with the NBITS most significant divisor bits,
306    including the always-set highest bit, but use addressing trickery via tabp
307    to suppress it.
308 
309    Possible improvements:
310 
311    * Do first multiply using 32-bit operations on 64-bit computers.  At least
312      on most Arm64 cores, that uses 3 times less resources.  It also saves on
313      many x86-64 processors.
314 */
315 
316 #ifndef NBITS
317 #define NBITS 7
318 #endif
319 
320 #if NBITS == 5
321 /* This needs full division about 1.63% of the time. */
322 static const unsigned char tab[16] = {
323  63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
324 };
325 #elif NBITS == 6
326 /* This needs full division about 0.93% of the time. */
327 static const unsigned char tab[32] = {
328 127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
329  84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
330 };
331 #elif NBITS == 7
332 /* This needs full division about 0.49% of the time. */
333 static const unsigned char tab[64] = {
334 255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
335 203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
336 169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
337 145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
338 };
339 #elif NBITS == 8
340 /* This needs full division about 0.26% of the time. */
341 static const unsigned short tab[128] = {
342 511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
343 454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
344 408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
345 371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
346 340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
347 314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
348 291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
349 272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
350 };
351 #else
352 #error No table for provided NBITS
353 #endif
354 
355 /* Doing tabp with a #define makes compiler warnings about pointing outside an
356    object go away.  We used to define this as a variable.  It is not clear if
357    e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
358    (vector[100] + 10) - 10 surely is and there is no sequence point so the
359    expressions should be equivalent.  To make this safe, we might want to
360    define tabp as a macro with the index as an argument.  Depending on the
361    platform, relocs might allow for assembly-time or linker-time resolution to
362    take place. */
363 #define tabp (tab - (1 << (NBITS - 1)))
364 
365 static inline mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)366 div1 (mp_limb_t n0, mp_limb_t d0)
367 {
368   int ncnt, dcnt;
369   size_t dbi;
370   mp_limb_t inv;
371   mp_limb_t q0;
372   mp_limb_t r0;
373   mp_limb_t mask;
374   mp_double_limb_t res;
375 
376   count_leading_zeros (ncnt, n0);
377   count_leading_zeros (dcnt, d0);
378 
379   dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
380   inv = tabp[dbi];
381   q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
382   r0 = n0 - q0 * d0;
383   mask = -(mp_limb_t) (r0 >= d0);
384   q0 -= mask;
385   r0 -= d0 & mask;
386 
387   if (UNLIKELY (r0 >= d0))
388     {
389       q0 = n0 / d0;
390       r0 = n0 - q0 * d0;
391     }
392 
393   res.d1 = q0;
394   res.d0 = r0;
395   return res;
396 }
397 
398 #else
399 #error Unknown HGCD2_DIV1_METHOD
400 #endif
401 
402 #if HAVE_NATIVE_mpn_div_22
403 
404 #define div2 mpn_div_22
405 /* Two-limb division optimized for small quotients.  */
406 mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
407 
408 #elif HGCD2_DIV2_METHOD == 1
409 
410 static mp_limb_t
div2(mp_ptr rp,mp_limb_t n1,mp_limb_t n0,mp_limb_t d1,mp_limb_t d0)411 div2 (mp_ptr rp,
412       mp_limb_t n1, mp_limb_t n0,
413       mp_limb_t d1, mp_limb_t d0)
414 {
415   mp_double_limb_t rq = div1 (n1, d1);
416   if (UNLIKELY (rq.d1 > d1))
417     {
418       mp_limb_t n2, q, t1, t0;
419       int c;
420 
421       /* Normalize */
422       count_leading_zeros (c, d1);
423       ASSERT (c > 0);
424 
425       n2 = n1 >> (GMP_LIMB_BITS - c);
426       n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
427       n0 <<= c;
428       d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
429       d0 <<= c;
430 
431       udiv_qrnnd (q, n1, n2, n1, d1);
432       umul_ppmm (t1, t0, q, d0);
433       if (t1 > n1 || (t1 == n1 && t0 > n0))
434 	{
435 	  ASSERT (q > 0);
436 	  q--;
437 	  sub_ddmmss (t1, t0, t1, t0, d1, d0);
438 	}
439       sub_ddmmss (n1, n0, n1, n0, t1, t0);
440 
441       /* Undo normalization */
442       rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
443       rp[1] = n1 >> c;
444 
445       return q;
446     }
447   else
448     {
449       mp_limb_t q, t1, t0;
450       n1 = rq.d0;
451       q = rq.d1;
452       umul_ppmm (t1, t0, q, d0);
453       if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
454 	{
455 	  ASSERT (q > 0);
456 	  q--;
457 	  sub_ddmmss (t1, t0, t1, t0, d1, d0);
458 	}
459       sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
460       return q;
461     }
462 }
463 
464 #elif HGCD2_DIV2_METHOD == 2
465 
466 /* Bit-wise div2. Relies on fast count_leading_zeros. */
467 static mp_limb_t
div2(mp_ptr rp,mp_limb_t n1,mp_limb_t n0,mp_limb_t d1,mp_limb_t d0)468 div2 (mp_ptr rp,
469       mp_limb_t n1, mp_limb_t n0,
470       mp_limb_t d1, mp_limb_t d0)
471 {
472   mp_limb_t q = 0;
473   int ncnt;
474   int dcnt;
475 
476   count_leading_zeros (ncnt, n1);
477   count_leading_zeros (dcnt, d1);
478   dcnt -= ncnt;
479 
480   d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
481   d0 <<= dcnt;
482 
483   do
484     {
485       mp_limb_t mask;
486       q <<= 1;
487       if (UNLIKELY (n1 == d1))
488 	mask = -(n0 >= d0);
489       else
490 	mask = -(n1 > d1);
491 
492       q -= mask;
493 
494       sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
495 
496       d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
497       d1 = d1 >> 1;
498     }
499   while (dcnt--);
500 
501   rp[0] = n0;
502   rp[1] = n1;
503 
504   return q;
505 }
506 #else
507 #error Unknown HGCD2_DIV2_METHOD
508 #endif
509 
510 /* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
511    matrix M. Returns 1 if we make progress, i.e. can perform at least
512    one subtraction. Otherwise returns zero. */
513 
514 /* FIXME: Possible optimizations:
515 
516    The div2 function starts with checking the most significant bit of
517    the numerator. We can maintained normalized operands here, call
518    hgcd with normalized operands only, which should make the code
519    simpler and possibly faster.
520 
521    Experiment with table lookups on the most significant bits.
522 
523    This function is also a candidate for assembler implementation.
524 */
525 int
mpn_hgcd2(mp_limb_t ah,mp_limb_t al,mp_limb_t bh,mp_limb_t bl,struct hgcd_matrix1 * M)526 mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
527 	   struct hgcd_matrix1 *M)
528 {
529   mp_limb_t u00, u01, u10, u11;
530 
531   if (ah < 2 || bh < 2)
532     return 0;
533 
534   if (ah > bh || (ah == bh && al > bl))
535     {
536       sub_ddmmss (ah, al, ah, al, bh, bl);
537       if (ah < 2)
538 	return 0;
539 
540       u00 = u01 = u11 = 1;
541       u10 = 0;
542     }
543   else
544     {
545       sub_ddmmss (bh, bl, bh, bl, ah, al);
546       if (bh < 2)
547 	return 0;
548 
549       u00 = u10 = u11 = 1;
550       u01 = 0;
551     }
552 
553   if (ah < bh)
554     goto subtract_a;
555 
556   for (;;)
557     {
558       ASSERT (ah >= bh);
559       if (ah == bh)
560 	goto done;
561 
562       if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
563 	{
564 	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
565 	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
566 
567 	  break;
568 	}
569 
570       /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
571 	 1), affecting the second column of M. */
572       ASSERT (ah > bh);
573       sub_ddmmss (ah, al, ah, al, bh, bl);
574 
575       if (ah < 2)
576 	goto done;
577 
578       if (ah <= bh)
579 	{
580 	  /* Use q = 1 */
581 	  u01 += u00;
582 	  u11 += u10;
583 	}
584       else
585 	{
586 	  mp_limb_t r[2];
587 	  mp_limb_t q = div2 (r, ah, al, bh, bl);
588 	  al = r[0]; ah = r[1];
589 	  if (ah < 2)
590 	    {
591 	      /* A is too small, but q is correct. */
592 	      u01 += q * u00;
593 	      u11 += q * u10;
594 	      goto done;
595 	    }
596 	  q++;
597 	  u01 += q * u00;
598 	  u11 += q * u10;
599 	}
600     subtract_a:
601       ASSERT (bh >= ah);
602       if (ah == bh)
603 	goto done;
604 
605       if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
606 	{
607 	  ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
608 	  bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
609 
610 	  goto subtract_a1;
611 	}
612 
613       /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
614 	 1), affecting the first column of M. */
615       sub_ddmmss (bh, bl, bh, bl, ah, al);
616 
617       if (bh < 2)
618 	goto done;
619 
620       if (bh <= ah)
621 	{
622 	  /* Use q = 1 */
623 	  u00 += u01;
624 	  u10 += u11;
625 	}
626       else
627 	{
628 	  mp_limb_t r[2];
629 	  mp_limb_t q = div2 (r, bh, bl, ah, al);
630 	  bl = r[0]; bh = r[1];
631 	  if (bh < 2)
632 	    {
633 	      /* B is too small, but q is correct. */
634 	      u00 += q * u01;
635 	      u10 += q * u11;
636 	      goto done;
637 	    }
638 	  q++;
639 	  u00 += q * u01;
640 	  u10 += q * u11;
641 	}
642     }
643 
644   /* NOTE: Since we discard the least significant half limb, we don't get a
645      truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */
646   /* Single precision loop */
647   for (;;)
648     {
649       ASSERT (ah >= bh);
650 
651       ah -= bh;
652       if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
653 	break;
654 
655       if (ah <= bh)
656 	{
657 	  /* Use q = 1 */
658 	  u01 += u00;
659 	  u11 += u10;
660 	}
661       else
662 	{
663 	  mp_double_limb_t rq = div1 (ah, bh);
664 	  mp_limb_t q = rq.d1;
665 	  ah = rq.d0;
666 
667 	  if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
668 	    {
669 	      /* A is too small, but q is correct. */
670 	      u01 += q * u00;
671 	      u11 += q * u10;
672 	      break;
673 	    }
674 	  q++;
675 	  u01 += q * u00;
676 	  u11 += q * u10;
677 	}
678     subtract_a1:
679       ASSERT (bh >= ah);
680 
681       bh -= ah;
682       if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
683 	break;
684 
685       if (bh <= ah)
686 	{
687 	  /* Use q = 1 */
688 	  u00 += u01;
689 	  u10 += u11;
690 	}
691       else
692 	{
693 	  mp_double_limb_t rq = div1 (bh, ah);
694 	  mp_limb_t q = rq.d1;
695 	  bh = rq.d0;
696 
697 	  if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
698 	    {
699 	      /* B is too small, but q is correct. */
700 	      u00 += q * u01;
701 	      u10 += q * u11;
702 	      break;
703 	    }
704 	  q++;
705 	  u00 += q * u01;
706 	  u10 += q * u11;
707 	}
708     }
709 
710  done:
711   M->u[0][0] = u00; M->u[0][1] = u01;
712   M->u[1][0] = u10; M->u[1][1] = u11;
713 
714   return 1;
715 }
716 
717 /* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
718  * have space for n + 1 limbs. Uses three buffers to avoid a copy*/
719 mp_size_t
mpn_hgcd_mul_matrix1_vector(const struct hgcd_matrix1 * M,mp_ptr rp,mp_srcptr ap,mp_ptr bp,mp_size_t n)720 mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
721 			     mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
722 {
723   mp_limb_t ah, bh;
724 
725   /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
726 
727      r  = u00 * a
728      r += u10 * b
729      b *= u11
730      b += u01 * a
731   */
732 
733 #if HAVE_NATIVE_mpn_addaddmul_1msb0
734   ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
735   bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
736 #else
737   ah =     mpn_mul_1 (rp, ap, n, M->u[0][0]);
738   ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
739 
740   bh =     mpn_mul_1 (bp, bp, n, M->u[1][1]);
741   bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
742 #endif
743   rp[n] = ah;
744   bp[n] = bh;
745 
746   n += (ah | bh) > 0;
747   return n;
748 }
749