1 /* hgcd2.c
2
3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7 Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation,
8 Inc.
9
10 This file is part of the GNU MP Library.
11
12 The GNU MP Library is free software; you can redistribute it and/or modify
13 it under the terms of either:
14
15 * the GNU Lesser General Public License as published by the Free
16 Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
18
19 or
20
21 * the GNU General Public License as published by the Free Software
22 Foundation; either version 2 of the License, or (at your option) any
23 later version.
24
25 or both in parallel, as here.
26
27 The GNU MP Library is distributed in the hope that it will be useful, but
28 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
30 for more details.
31
32 You should have received copies of the GNU General Public License and the
33 GNU Lesser General Public License along with the GNU MP Library. If not,
34 see https://www.gnu.org/licenses/. */
35
36 #include "gmp-impl.h"
37 #include "longlong.h"
38
39 #ifndef HGCD2_DIV1_METHOD
40 #define HGCD2_DIV1_METHOD 3
41 #endif
42
43 #ifndef HGCD2_DIV2_METHOD
44 #define HGCD2_DIV2_METHOD 2
45 #endif
46
47 #if GMP_NAIL_BITS != 0
48 #error Nails not implemented
49 #endif
50
51 #if HAVE_NATIVE_mpn_div_11
52
53 #define div1 mpn_div_11
54 /* Single-limb division optimized for small quotients.
55 Returned value holds d0 = r, d1 = q. */
56 mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
57
58 #elif HGCD2_DIV1_METHOD == 1
59
60 static inline mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)61 div1 (mp_limb_t n0, mp_limb_t d0)
62 {
63 mp_double_limb_t res;
64 res.d1 = n0 / d0;
65 res.d0 = n0 - res.d1 * d0;
66
67 return res;
68 }
69
70 #elif HGCD2_DIV1_METHOD == 2
71
72 static mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)73 div1 (mp_limb_t n0, mp_limb_t d0)
74 {
75 mp_double_limb_t res;
76 int ncnt, dcnt, cnt;
77 mp_limb_t q;
78 mp_limb_t mask;
79
80 ASSERT (n0 >= d0);
81
82 count_leading_zeros (ncnt, n0);
83 count_leading_zeros (dcnt, d0);
84 cnt = dcnt - ncnt;
85
86 d0 <<= cnt;
87
88 q = -(mp_limb_t) (n0 >= d0);
89 n0 -= d0 & q;
90 d0 >>= 1;
91 q = -q;
92
93 while (--cnt >= 0)
94 {
95 mask = -(mp_limb_t) (n0 >= d0);
96 n0 -= d0 & mask;
97 d0 >>= 1;
98 q = (q << 1) - mask;
99 }
100
101 res.d0 = n0;
102 res.d1 = q;
103 return res;
104 }
105
106 #elif HGCD2_DIV1_METHOD == 3
107
108 static inline mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)109 div1 (mp_limb_t n0, mp_limb_t d0)
110 {
111 mp_double_limb_t res;
112 if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
113 || UNLIKELY (n0 >= (d0 << 3)))
114 {
115 res.d1 = n0 / d0;
116 res.d0 = n0 - res.d1 * d0;
117 }
118 else
119 {
120 mp_limb_t q, mask;
121
122 d0 <<= 2;
123
124 mask = -(mp_limb_t) (n0 >= d0);
125 n0 -= d0 & mask;
126 q = 4 & mask;
127
128 d0 >>= 1;
129 mask = -(mp_limb_t) (n0 >= d0);
130 n0 -= d0 & mask;
131 q += 2 & mask;
132
133 d0 >>= 1;
134 mask = -(mp_limb_t) (n0 >= d0);
135 n0 -= d0 & mask;
136 q -= mask;
137
138 res.d0 = n0;
139 res.d1 = q;
140 }
141 return res;
142 }
143
144 #elif HGCD2_DIV1_METHOD == 4
145
146 /* Table quotients. We extract the NBITS most significant bits of the
147 numerator limb, and the corresponding bits from the divisor limb, and use
148 these to form an index into the table. This method is probably only useful
149 for short pipelines with slow multiplication.
150
151 Possible improvements:
152
153 * Perhaps extract the highest NBITS of the divisor instead of the same bits
154 as from the numerator. That would require another count_leading_zeros,
155 and a post-multiply shift of the quotient.
156
157 * Compress tables? Their values are tiny, and there are lots of zero
158 entries (which are never used).
159
160 * Round the table entries more cleverly?
161 */
162
163 #ifndef NBITS
164 #define NBITS 5
165 #endif
166
167 #if NBITS == 5
168 /* This needs full division about 13.2% of the time. */
169 static const unsigned char tab[512] = {
170 17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
171 18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
172 19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
173 20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
174 21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
175 22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
176 23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
177 24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
178 25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
179 26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
180 27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
181 28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
182 29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
183 30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
184 31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
185 32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
186 };
187 #elif NBITS == 6
188 /* This needs full division about 9.8% of the time. */
189 static const unsigned char tab[2048] = {
190 33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192 34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194 35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196 36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198 37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
199 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200 38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
201 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202 39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
203 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204 40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
205 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206 41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
207 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208 42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
209 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210 43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
211 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212 44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
213 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214 45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
215 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216 46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
217 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218 47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
219 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220 48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
221 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222 49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
223 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224 50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
225 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
226 51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
227 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
228 52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
229 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
230 53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
231 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
232 54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
233 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
234 55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
235 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
236 56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
237 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
238 57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
239 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
240 58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
241 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
242 59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
243 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
244 60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
245 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
246 61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
247 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
248 62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
249 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
250 63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
251 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
252 64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
253 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
254 };
255 #else
256 #error No table for provided NBITS
257 #endif
258
259 /* Doing tabp with a #define makes compiler warnings about pointing outside an
260 object go away. We used to define this as a variable. It is not clear if
261 e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
262 (vector[100] + 10) - 10 surely is and there is no sequence point so the
263 expressions should be equivalent. To make this safe, we might want to
264 define tabp as a macro with the index as an argument. Depending on the
265 platform, relocs might allow for assembly-time or linker-time resolution to
266 take place. */
267 #define tabp (tab - (1 << (NBITS - 1) << NBITS))
268
269 static inline mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)270 div1 (mp_limb_t n0, mp_limb_t d0)
271 {
272 int ncnt;
273 size_t nbi, dbi;
274 mp_limb_t q0;
275 mp_limb_t r0;
276 mp_limb_t mask;
277 mp_double_limb_t res;
278
279 ASSERT (n0 >= d0); /* Actually only msb position is critical. */
280
281 count_leading_zeros (ncnt, n0);
282 nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
283 dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
284
285 q0 = tabp[(nbi << NBITS) + dbi];
286 r0 = n0 - q0 * d0;
287 mask = -(mp_limb_t) (r0 >= d0);
288 q0 -= mask;
289 r0 -= d0 & mask;
290
291 if (UNLIKELY (r0 >= d0))
292 {
293 q0 = n0 / d0;
294 r0 = n0 - q0 * d0;
295 }
296
297 res.d1 = q0;
298 res.d0 = r0;
299 return res;
300 }
301
302 #elif HGCD2_DIV1_METHOD == 5
303
304 /* Table inverses of divisors. We don't bother with suppressing the msb from
305 the tables. We index with the NBITS most significant divisor bits,
306 including the always-set highest bit, but use addressing trickery via tabp
307 to suppress it.
308
309 Possible improvements:
310
311 * Do first multiply using 32-bit operations on 64-bit computers. At least
312 on most Arm64 cores, that uses 3 times less resources. It also saves on
313 many x86-64 processors.
314 */
315
316 #ifndef NBITS
317 #define NBITS 7
318 #endif
319
320 #if NBITS == 5
321 /* This needs full division about 1.63% of the time. */
322 static const unsigned char tab[16] = {
323 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
324 };
325 #elif NBITS == 6
326 /* This needs full division about 0.93% of the time. */
327 static const unsigned char tab[32] = {
328 127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
329 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
330 };
331 #elif NBITS == 7
332 /* This needs full division about 0.49% of the time. */
333 static const unsigned char tab[64] = {
334 255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
335 203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
336 169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
337 145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
338 };
339 #elif NBITS == 8
340 /* This needs full division about 0.26% of the time. */
341 static const unsigned short tab[128] = {
342 511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
343 454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
344 408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
345 371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
346 340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
347 314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
348 291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
349 272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
350 };
351 #else
352 #error No table for provided NBITS
353 #endif
354
355 /* Doing tabp with a #define makes compiler warnings about pointing outside an
356 object go away. We used to define this as a variable. It is not clear if
357 e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
358 (vector[100] + 10) - 10 surely is and there is no sequence point so the
359 expressions should be equivalent. To make this safe, we might want to
360 define tabp as a macro with the index as an argument. Depending on the
361 platform, relocs might allow for assembly-time or linker-time resolution to
362 take place. */
363 #define tabp (tab - (1 << (NBITS - 1)))
364
365 static inline mp_double_limb_t
div1(mp_limb_t n0,mp_limb_t d0)366 div1 (mp_limb_t n0, mp_limb_t d0)
367 {
368 int ncnt, dcnt;
369 size_t dbi;
370 mp_limb_t inv;
371 mp_limb_t q0;
372 mp_limb_t r0;
373 mp_limb_t mask;
374 mp_double_limb_t res;
375
376 count_leading_zeros (ncnt, n0);
377 count_leading_zeros (dcnt, d0);
378
379 dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
380 inv = tabp[dbi];
381 q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
382 r0 = n0 - q0 * d0;
383 mask = -(mp_limb_t) (r0 >= d0);
384 q0 -= mask;
385 r0 -= d0 & mask;
386
387 if (UNLIKELY (r0 >= d0))
388 {
389 q0 = n0 / d0;
390 r0 = n0 - q0 * d0;
391 }
392
393 res.d1 = q0;
394 res.d0 = r0;
395 return res;
396 }
397
398 #else
399 #error Unknown HGCD2_DIV1_METHOD
400 #endif
401
402 #if HAVE_NATIVE_mpn_div_22
403
404 #define div2 mpn_div_22
405 /* Two-limb division optimized for small quotients. */
406 mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
407
408 #elif HGCD2_DIV2_METHOD == 1
409
410 static mp_limb_t
div2(mp_ptr rp,mp_limb_t n1,mp_limb_t n0,mp_limb_t d1,mp_limb_t d0)411 div2 (mp_ptr rp,
412 mp_limb_t n1, mp_limb_t n0,
413 mp_limb_t d1, mp_limb_t d0)
414 {
415 mp_double_limb_t rq = div1 (n1, d1);
416 if (UNLIKELY (rq.d1 > d1))
417 {
418 mp_limb_t n2, q, t1, t0;
419 int c;
420
421 /* Normalize */
422 count_leading_zeros (c, d1);
423 ASSERT (c > 0);
424
425 n2 = n1 >> (GMP_LIMB_BITS - c);
426 n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
427 n0 <<= c;
428 d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
429 d0 <<= c;
430
431 udiv_qrnnd (q, n1, n2, n1, d1);
432 umul_ppmm (t1, t0, q, d0);
433 if (t1 > n1 || (t1 == n1 && t0 > n0))
434 {
435 ASSERT (q > 0);
436 q--;
437 sub_ddmmss (t1, t0, t1, t0, d1, d0);
438 }
439 sub_ddmmss (n1, n0, n1, n0, t1, t0);
440
441 /* Undo normalization */
442 rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
443 rp[1] = n1 >> c;
444
445 return q;
446 }
447 else
448 {
449 mp_limb_t q, t1, t0;
450 n1 = rq.d0;
451 q = rq.d1;
452 umul_ppmm (t1, t0, q, d0);
453 if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
454 {
455 ASSERT (q > 0);
456 q--;
457 sub_ddmmss (t1, t0, t1, t0, d1, d0);
458 }
459 sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
460 return q;
461 }
462 }
463
464 #elif HGCD2_DIV2_METHOD == 2
465
466 /* Bit-wise div2. Relies on fast count_leading_zeros. */
467 static mp_limb_t
div2(mp_ptr rp,mp_limb_t n1,mp_limb_t n0,mp_limb_t d1,mp_limb_t d0)468 div2 (mp_ptr rp,
469 mp_limb_t n1, mp_limb_t n0,
470 mp_limb_t d1, mp_limb_t d0)
471 {
472 mp_limb_t q = 0;
473 int ncnt;
474 int dcnt;
475
476 count_leading_zeros (ncnt, n1);
477 count_leading_zeros (dcnt, d1);
478 dcnt -= ncnt;
479
480 d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
481 d0 <<= dcnt;
482
483 do
484 {
485 mp_limb_t mask;
486 q <<= 1;
487 if (UNLIKELY (n1 == d1))
488 mask = -(n0 >= d0);
489 else
490 mask = -(n1 > d1);
491
492 q -= mask;
493
494 sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
495
496 d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
497 d1 = d1 >> 1;
498 }
499 while (dcnt--);
500
501 rp[0] = n0;
502 rp[1] = n1;
503
504 return q;
505 }
506 #else
507 #error Unknown HGCD2_DIV2_METHOD
508 #endif
509
510 /* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs
511 matrix M. Returns 1 if we make progress, i.e. can perform at least
512 one subtraction. Otherwise returns zero. */
513
514 /* FIXME: Possible optimizations:
515
516 The div2 function starts with checking the most significant bit of
517 the numerator. We can maintained normalized operands here, call
518 hgcd with normalized operands only, which should make the code
519 simpler and possibly faster.
520
521 Experiment with table lookups on the most significant bits.
522
523 This function is also a candidate for assembler implementation.
524 */
525 int
mpn_hgcd2(mp_limb_t ah,mp_limb_t al,mp_limb_t bh,mp_limb_t bl,struct hgcd_matrix1 * M)526 mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl,
527 struct hgcd_matrix1 *M)
528 {
529 mp_limb_t u00, u01, u10, u11;
530
531 if (ah < 2 || bh < 2)
532 return 0;
533
534 if (ah > bh || (ah == bh && al > bl))
535 {
536 sub_ddmmss (ah, al, ah, al, bh, bl);
537 if (ah < 2)
538 return 0;
539
540 u00 = u01 = u11 = 1;
541 u10 = 0;
542 }
543 else
544 {
545 sub_ddmmss (bh, bl, bh, bl, ah, al);
546 if (bh < 2)
547 return 0;
548
549 u00 = u10 = u11 = 1;
550 u01 = 0;
551 }
552
553 if (ah < bh)
554 goto subtract_a;
555
556 for (;;)
557 {
558 ASSERT (ah >= bh);
559 if (ah == bh)
560 goto done;
561
562 if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
563 {
564 ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
565 bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
566
567 break;
568 }
569
570 /* Subtract a -= q b, and multiply M from the right by (1 q ; 0
571 1), affecting the second column of M. */
572 ASSERT (ah > bh);
573 sub_ddmmss (ah, al, ah, al, bh, bl);
574
575 if (ah < 2)
576 goto done;
577
578 if (ah <= bh)
579 {
580 /* Use q = 1 */
581 u01 += u00;
582 u11 += u10;
583 }
584 else
585 {
586 mp_limb_t r[2];
587 mp_limb_t q = div2 (r, ah, al, bh, bl);
588 al = r[0]; ah = r[1];
589 if (ah < 2)
590 {
591 /* A is too small, but q is correct. */
592 u01 += q * u00;
593 u11 += q * u10;
594 goto done;
595 }
596 q++;
597 u01 += q * u00;
598 u11 += q * u10;
599 }
600 subtract_a:
601 ASSERT (bh >= ah);
602 if (ah == bh)
603 goto done;
604
605 if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2)))
606 {
607 ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2));
608 bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2));
609
610 goto subtract_a1;
611 }
612
613 /* Subtract b -= q a, and multiply M from the right by (1 0 ; q
614 1), affecting the first column of M. */
615 sub_ddmmss (bh, bl, bh, bl, ah, al);
616
617 if (bh < 2)
618 goto done;
619
620 if (bh <= ah)
621 {
622 /* Use q = 1 */
623 u00 += u01;
624 u10 += u11;
625 }
626 else
627 {
628 mp_limb_t r[2];
629 mp_limb_t q = div2 (r, bh, bl, ah, al);
630 bl = r[0]; bh = r[1];
631 if (bh < 2)
632 {
633 /* B is too small, but q is correct. */
634 u00 += q * u01;
635 u10 += q * u11;
636 goto done;
637 }
638 q++;
639 u00 += q * u01;
640 u10 += q * u11;
641 }
642 }
643
644 /* NOTE: Since we discard the least significant half limb, we don't get a
645 truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */
646 /* Single precision loop */
647 for (;;)
648 {
649 ASSERT (ah >= bh);
650
651 ah -= bh;
652 if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
653 break;
654
655 if (ah <= bh)
656 {
657 /* Use q = 1 */
658 u01 += u00;
659 u11 += u10;
660 }
661 else
662 {
663 mp_double_limb_t rq = div1 (ah, bh);
664 mp_limb_t q = rq.d1;
665 ah = rq.d0;
666
667 if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
668 {
669 /* A is too small, but q is correct. */
670 u01 += q * u00;
671 u11 += q * u10;
672 break;
673 }
674 q++;
675 u01 += q * u00;
676 u11 += q * u10;
677 }
678 subtract_a1:
679 ASSERT (bh >= ah);
680
681 bh -= ah;
682 if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1)))
683 break;
684
685 if (bh <= ah)
686 {
687 /* Use q = 1 */
688 u00 += u01;
689 u10 += u11;
690 }
691 else
692 {
693 mp_double_limb_t rq = div1 (bh, ah);
694 mp_limb_t q = rq.d1;
695 bh = rq.d0;
696
697 if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1)))
698 {
699 /* B is too small, but q is correct. */
700 u00 += q * u01;
701 u10 += q * u11;
702 break;
703 }
704 q++;
705 u00 += q * u01;
706 u10 += q * u11;
707 }
708 }
709
710 done:
711 M->u[0][0] = u00; M->u[0][1] = u01;
712 M->u[1][0] = u10; M->u[1][1] = u11;
713
714 return 1;
715 }
716
717 /* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must
718 * have space for n + 1 limbs. Uses three buffers to avoid a copy*/
719 mp_size_t
mpn_hgcd_mul_matrix1_vector(const struct hgcd_matrix1 * M,mp_ptr rp,mp_srcptr ap,mp_ptr bp,mp_size_t n)720 mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M,
721 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n)
722 {
723 mp_limb_t ah, bh;
724
725 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as
726
727 r = u00 * a
728 r += u10 * b
729 b *= u11
730 b += u01 * a
731 */
732
733 #if HAVE_NATIVE_mpn_addaddmul_1msb0
734 ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]);
735 bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]);
736 #else
737 ah = mpn_mul_1 (rp, ap, n, M->u[0][0]);
738 ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]);
739
740 bh = mpn_mul_1 (bp, bp, n, M->u[1][1]);
741 bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]);
742 #endif
743 rp[n] = ah;
744 bp[n] = bh;
745
746 n += (ah | bh) > 0;
747 return n;
748 }
749