1 /* Some commonly used assembly helper macros for arithmetic on
2 unsigned 64-bit integers.
3    Defining U64ARITH_VERBOSE_ASM emits a comment line into the asm output
4    for each function call, showing which registers/memory locations hold
5    the operands.
6 Defining U64ARITH_NO_ASM avoids asm macros and uses the C fallback code
7 where available. This can be used for testing the fallback code.
8 */
9
10 #ifndef U64_ARITH_H__
11 #define U64_ARITH_H__
12
13 #include <stdint.h>
14 #include <stdio.h>
15 #include "macros.h"
16
17 #if defined(HAVE_INT128)
18 typedef union {uint64_t x[2]; unsigned __int128 y;} _u64arith_union2_t;
19 #endif
20
21 #ifdef __cplusplus
22 extern "C" {
23 #endif
24
25 /** Test whether a1:a2 > b1:b2
26 *
27 * Let a = a1 + 2^64 * a2, b = b1 + 2^64 * b2. Return 1 if a > b,
28 * and 0 otherwise.
29 */
30 static inline int
31 u64arith_gt_2_2(uint64_t a1, uint64_t a2, uint64_t b1, uint64_t b2) ATTRIBUTE((const));
32 static inline int
33 u64arith_gt_2_2(const uint64_t a1, const uint64_t a2,
34 const uint64_t b1, const uint64_t b2)
35 {
36 #if defined(GENUINE_GCC) && __GNUC__ >= 8 && defined(HAVE_INT128)
37 _u64arith_union2_t a = {{a1, a2}}, b = {{b1, b2}};
38 return a.y > b.y;
39 #else
40 return a2 > b2 || (a2 == b2 && a1 > b1);
41 #endif
42 }
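
/* Illustrative usage sketch (not part of the original header; the example
   function name is hypothetical). The first argument of each pair is the
   low word, so the pair (0, 1) represents 2^64. */
#if 0
static inline int u64arith_gt_2_2_example(void)
{
    /* 2^64 > 2^64 - 1, so this returns 1 */
    return u64arith_gt_2_2(0, 1, UINT64_MAX, 0);
}
#endif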
43
44 /** Test whether a1:a2 >= b1:b2
45 *
46 * Let a = a1 + 2^64 * a2, b = b1 + 2^64 * b2. Return 1 if a >= b,
47 * and 0 otherwise.
48 */
49 static inline int
50 u64arith_ge_2_2(uint64_t a1, uint64_t a2, uint64_t b1, uint64_t b2) ATTRIBUTE((const));
51 static inline int
52 u64arith_ge_2_2(const uint64_t a1, const uint64_t a2,
53 const uint64_t b1, const uint64_t b2)
54 {
55 #if defined(GENUINE_GCC) && __GNUC__ >= 8 && defined(HAVE_INT128)
56 _u64arith_union2_t a = {{a1, a2}}, b = {{b1, b2}};
57 return a.y >= b.y;
58 #else
59 return a2 > b2 || (a2 == b2 && a1 >= b1);
60 #endif
61
62 }
63
64 /** Test whether a1:a2 < b1:b2
65 *
66 * Let a = a1 + 2^64 * a2, b = b1 + 2^64 * b2. Return 1 if a < b,
67 * and 0 otherwise.
68 */
69 static inline int
70 u64arith_lt_2_2(uint64_t a1, uint64_t a2, uint64_t b1, uint64_t b2) ATTRIBUTE((const));
71 static inline int
72 u64arith_lt_2_2(const uint64_t a1, const uint64_t a2,
73 const uint64_t b1, const uint64_t b2)
74 {
75 #if defined(GENUINE_GCC) && __GNUC__ >= 8 && defined(HAVE_INT128)
76 _u64arith_union2_t a = {{a1, a2}}, b = {{b1, b2}};
77 return a.y < b.y;
78 #else
79 return a2 < b2 || (a2 == b2 && a1 < b1);
80 #endif
81
82 }
83
84 /** Test whether a1:a2 <= b1:b2
85 *
86 * Let a = a1 + 2^64 * a2, b = b1 + 2^64 * b2. Return 1 if a <= b,
87 * and 0 otherwise.
88 */
89 static inline int
90 u64arith_le_2_2(uint64_t a1, uint64_t a2, uint64_t b1, uint64_t b2) ATTRIBUTE((const));
91 static inline int
92 u64arith_le_2_2(const uint64_t a1, const uint64_t a2,
93 const uint64_t b1, const uint64_t b2)
94 {
95 #if defined(GENUINE_GCC) && __GNUC__ >= 8 && defined(HAVE_INT128)
96 _u64arith_union2_t a = {{a1, a2}}, b = {{b1, b2}};
97 return a.y <= b.y;
98 #else
99 return a2 < b2 || (a2 == b2 && a1 <= b1);
100 #endif
101
102 }
103
104 /** Add r1:r2 += a1:a2 with carry propagation
105 *
106 * Let r = r1 + 2^64 * r2, a = a1 + 2^64 * a2. Set r1:r2 := (r+a) mod 2^128.
107 */
108 static inline void
109 u64arith_add_2_2 (uint64_t *r1, uint64_t *r2,
110 const uint64_t a1, const uint64_t a2)
111 {
112 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) \
113 && ( defined(GENUINE_GNUC) && __GNUC__ <= 7 || defined(__INTEL_COMPILER) )
114 #ifdef U64ARITH_VERBOSE_ASM
115 __asm__ ("# u64arith_add_2_2 (%0, %1, %2, %3)\n" : :
116 "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
117 #endif
118 __asm__ __VOLATILE (
119 "addq %2, %0\n\t"
120 "adcq %3, %1\n"
121 : "+&r" (*r1), "+r" (*r2)
122 : "rme" (a1), "rme" (a2)
123 : "cc");
124 #else
125 /* On x86_64, Clang 6.0, 7.0, 8.0, 9.0, gcc 8.0 and 9.0 produce good code
126 * from this. Intel icc however produces terrible code. */
127 *r1 += a1;
128 *r2 += a2 + (*r1 < a1);
129 #endif
130 }
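
/* Illustrative sketch (hypothetical example, not part of the original
   header): the carry out of the low word propagates into the high word. */
#if 0
static inline void u64arith_add_2_2_example(void)
{
    uint64_t lo = UINT64_MAX, hi = 0;
    u64arith_add_2_2(&lo, &hi, 1, 0);
    /* Now lo == 0 and hi == 1, i.e., lo:hi represents 2^64 */
}
#endif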
131
132 /** Add r1:r2 += a with carry propagation
133 *
134 * Let r = r1 + 2^64 * r2. Set r1:r2 := (r+a) mod 2^128.
135 */
136 static inline void
137 u64arith_add_1_2 (uint64_t *r1, uint64_t *r2, const uint64_t a)
138 {
139 /* We have assembly only for x86_64 and on that architecture, this function
140 would generate the same code as u64arith_add_2_2() with an immediate
141 zero value for the high word, so we fall back to that. */
142 u64arith_add_2_2(r1, r2, a, 0);
143 }
144
145
146 /* Adds the two-word integer a2:a1 to r2:r1 with carry propagation
147 from low word (r1) to high word (r2). Returns 1 if there was a carry out
148 from high word, otherwise returns 0. */
149
150 static inline char
151 u64arith_add_2_2_cy (uint64_t *r1, uint64_t *r2,
152 const uint64_t a1, const uint64_t a2)
153 {
154 char cy;
155 #ifdef U64ARITH_VERBOSE_ASM
156 __asm__ ("# u64arith_add_2_2_cy (%0, %1, %2, %3)\n" : :
157 "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
158 #endif
159 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
160 __asm__ __VOLATILE (
161 "addq %3, %0\n\t"
162 "adcq %4, %1\n\t"
163 "setc %2\n"
164 : "+&r" (*r1), "+r" (*r2), "=r" (cy)
165 : "rme" (a1), "rme" (a2)
166 : "cc");
167 #else
168 uint64_t u1 = *r1 + a1,
169 u2 = *r2 + a2 + (u1 < a1);
170 /* Overflow occurred iff the sum is smaller than one of the summands */
171 cy = u64arith_gt_2_2(a1, a2, u1, u2);
172 *r1 = u1;
173 *r2 = u2;
174 #endif
175 return cy;
176 }
177
178
179 /** Compute r = (a + b) % m
180 *
181 * Requires a < m and b <= m, then r == a+b (mod m) and r < m.
182 */
183 static inline void
184 u64arith_addmod_1_1 (uint64_t *r, const uint64_t a,
185 const uint64_t b, const uint64_t m)
186 {
187 ASSERT_EXPENSIVE (a < m && b <= m);
188
189 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
190 {
191 uint64_t t = a + b, tr = a - m;
192
193 __asm__ __VOLATILE (
194 "addq %2, %0\n\t" /* tr += b */
195 "cmovnc %1, %0\n\t" /* if (!cy) tr = t */
196 : "+&r" (tr)
197 : "rm" (t), "rme" (b)
198 : "cc"
199 );
200 ASSERT_EXPENSIVE (tr == ((a >= m - b) ? (a - (m - b)) : (a + b)));
201 r[0] = tr;
202 }
203 #else
204 r[0] = (b >= m - a) ? (b - (m - a)) : (a + b);
205 #endif
206
207 ASSERT_EXPENSIVE (r[0] < m);
208 }
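
/* The asm variant above is branch-free: it speculatively computes a - m + b,
   which wraps around (produces a carry) exactly when a + b >= m, and keeps
   a + b otherwise. A hypothetical plain-C rendering of the same idea
   (illustrative sketch only, not part of the original header): */
#if 0
static inline uint64_t u64arith_addmod_1_1_sketch(const uint64_t a,
                                                  const uint64_t b,
                                                  const uint64_t m)
{
    const uint64_t t = a + b;  /* correct result when a + b < m */
    uint64_t tr = a - m;       /* wraps around, since a < m */
    tr += b;                   /* carries exactly when a + b >= m */
    return (tr < b) ? tr : t;  /* tr < b detects that carry */
}
#endif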
209
210
211 /** Subtract r1:r2 -= a1:a2 with borrow propagation
212 *
213 * Let r = r1 + 2^64 * r2, a = a1 + 2^64 * a2. Set r1:r2 := (r-a) mod 2^128.
214 */
215 static inline void
216 u64arith_sub_2_2 (uint64_t *r1, uint64_t *r2,
217 const uint64_t a1, const uint64_t a2)
218 {
219 #ifdef U64ARITH_VERBOSE_ASM
220 __asm__ ("# u64arith_sub_2_2 (%0, %1, %2, %3)\n" : :
221 "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
222 #endif
223 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
224 __asm__ __VOLATILE (
225 "subq %2, %0\n\t"
226 "sbbq %3, %1\n"
227 : "+&r" (*r1), "+r" (*r2)
228 : "rme" (a1), "rme" (a2)
229 : "cc");
230 #else
231 uint64_t u = *r1;
232 *r1 -= a1;
233 *r2 -= a2 + (*r1 > u);
234 #endif
235 }
236
237
238 /** Subtract r1:r2 -= a with borrow propagation
239 *
240 * Let r = r1 + 2^64 * r2. Set r1:r2 := (r-a) mod 2^128.
241 */
242 static inline void
243 u64arith_sub_1_2 (uint64_t *r1, uint64_t *r2,
244 const uint64_t a)
245 {
246 u64arith_sub_2_2(r1, r2, a, 0);
247 }
248
249
250 /* Subtract two uint64_t from two uint64_t with borrow propagation
251 from low word (r1) to high word (r2). Returns 1 if there was a borrow out
252 from high word, otherwise returns 0. */
253
254 static inline char
255 u64arith_sub_2_2_cy (uint64_t *r1, uint64_t *r2,
256 const uint64_t a1, const uint64_t a2)
257 {
258 char cy;
259 #ifdef U64ARITH_VERBOSE_ASM
260 __asm__ ("# u64arith_sub_2_2_cy (%0, %1, %2, %3)\n" : :
261 "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
262 #endif
263 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
264 __asm__ __VOLATILE (
265 "subq %3, %0\n\t"
266 "sbbq %4, %1\n\t"
267 "setc %2\n"
268 : "+&r" (*r1), "+r" (*r2), "=r" (cy)
269 : "rme" (a1), "rme" (a2)
270 : "cc");
271 #else
272 uint64_t u1 = *r1 - a1, u2 = *r2 - a2;
273 if (a1 > *r1)
274 u2--;
275 cy = u64arith_gt_2_2(a1, a2, *r1, *r2);
276 *r1 = u1;
277 *r2 = u2;
278 #endif
279 return cy;
280 }
281
282
283 /* Subtract only if result is non-negative */
284 static inline void
285 u64arith_sub_1_1_ge (uint64_t *r, const uint64_t a)
286 {
287 uint64_t t = *r;
288 #ifdef U64ARITH_VERBOSE_ASM
289 __asm__ ("# u64arith_sub_1_1_ge (%0, %1, %2, %3)\n" : :
290 "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
291 #endif
292 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
293 __asm__ __VOLATILE (
294 "subq %2, %0\n\t" /* r -= a */
295 "cmovc %1, %0\n\t" /* If there's a borrow, restore r from t */
296 : "+&r" (*r)
297 : "r" (t), "rme" (a)
298 : "cc");
299 #else
300 t -= a;
301 if (*r >= a)
302 *r = t;
303 #endif
304 }
305
306 /* Subtract only if result is non-negative */
307 static inline void
308 u64arith_sub_2_2_ge (uint64_t *r1, uint64_t *r2,
309 const uint64_t a1, const uint64_t a2)
310 {
311 uint64_t t1 = *r1, t2 = *r2;
312 #ifdef U64ARITH_VERBOSE_ASM
313 __asm__ ("# u64arith_sub_2_2_ge (%0, %1, %2, %3)\n" : :
314 "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
315 #endif
316 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
317 __asm__ __VOLATILE (
318 "subq %4, %0\n\t" /* r1 -= a1 */
319 "sbbq %5, %1\n\t" /* r2 -= a2 + cy */
320 "cmovc %2, %0\n\t" /* If there's a borrow, restore r1 from t1 */
321 "cmovc %3, %1\n\t" /* and r2 from t2 */
322 : "+&r" (*r1), "+&r" (*r2)
323 : "r" (t1), "r" (t2), "rme" (a1), "rme" (a2)
324 : "cc");
325 #else
326 if (!u64arith_gt_2_2(a1, a2, *r1, *r2))
327 {
328 *r1 = t1 - a1;
329 *r2 = t2 - a2 - (a1 > t1);
330 }
331 #endif
332 }
333
334
335 /** Compute r = (a - b) % m
336 *
337 * Requires a < m and b < m, then r == a-b (mod m) and r < m.
338 */
339 static inline void
340 u64arith_submod_1_1 (uint64_t *r, const uint64_t a,
341 const uint64_t b, const uint64_t m)
342 {
343 ASSERT_EXPENSIVE (a < m && b < m);
344 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
345 /* Could do tr = a+m-b; t = a-b; if (!carry) tr = t; */
346 uint64_t tr, t = a;
347 __asm__ __VOLATILE (
348 "sub %2, %1\n\t" /* t -= b ( = a - b) */
349 "lea (%1,%3,1), %0\n\t" /* tr = t + m ( = a - b + m) */
350 "cmovnc %1, %0\n\t" /* if (a >= b) tr = t */
351 : "=&r" (tr), "+&r" (t)
352 : "rme" (b), "r" (m)
353 : "cc"
354 );
355 r[0] = tr;
356 #elif 1
357 /* Seems to be faster than the one below */
358 uint64_t t = 0, tr;
359 if ((tr = a - b) > a)
360 t = m;
361 r[0] = tr + t;
362 #else
363 r[0] = (a < b) ? (a - b + m) : (a - b);
364 #endif
365
366 ASSERT_EXPENSIVE (r[0] < m);
367 }
368
369
370 /* Multiply two uint64_t "a" and "b" and put the result as
371 r2:r1 (r2 being the high word) */
372
373 static inline void
374 u64arith_mul_1_1_2 (uint64_t *r1, uint64_t *r2,
375 const uint64_t a, const uint64_t b)
376 {
377 #ifdef U64ARITH_VERBOSE_ASM
378 __asm__ ("# u64arith_mul_1_1_2 (%0, %1, %2, %3)\n" : :
379 "X" (*r1), "X" (*r2), "X" (a), "X" (b));
380 #endif
381 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
382 __asm__ __VOLATILE (
383 "mulq %3"
384 : "=a" (*r1), "=d" (*r2)
385 : "%0" (a), "rm" (b)
386 : "cc");
387 #elif defined(HAVE_INT128)
388 unsigned __int128 r = (unsigned __int128) a * b;
389 *r1 = r;
390 *r2 = r >> 64;
391 #else
392 const uint64_t mask = ((uint64_t)1 << 32) - 1;
393 const uint64_t ah = a >> 32, al = a & mask, bh = b >> 32, bl = b & mask;
394 uint64_t t1, t2, p1, p2;
395
396 t1 = al * bl;
397 t2 = 0;
398 p1 = ah * bl;
399 p2 = p1 >> 32;
400 p1 = (p1 & mask) << 32;
401 u64arith_add_2_2 (&t1, &t2, p1, p2);
402 p1 = al * bh;
403 p2 = p1 >> 32;
404 p1 = (p1 & mask) << 32;
405 u64arith_add_2_2 (&t1, &t2, p1, p2);
406 t2 += ah * bh;
407 *r1 = t1;
408 *r2 = t2;
409 #endif
410 }
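
/* Illustrative sketch (hypothetical example, not part of the original
   header): the portable fallback uses the schoolbook identity
   a*b = ah*bh*2^64 + (ah*bl + al*bh)*2^32 + al*bl on 32-bit halves. */
#if 0
static inline void u64arith_mul_1_1_2_example(void)
{
    uint64_t lo, hi;
    u64arith_mul_1_1_2(&lo, &hi, UINT64_MAX, UINT64_MAX);
    /* (2^64-1)^2 = 2^128 - 2^65 + 1, so lo == 1 and hi == UINT64_MAX - 1 */
}
#endif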
411
412
413 static inline void
414 u64arith_sqr_1_2 (uint64_t *r1, uint64_t *r2,
415 const uint64_t a)
416 {
417 #ifdef U64ARITH_VERBOSE_ASM
418 __asm__ ("# u64arith_sqr_1_2 (%0, %1, %2)\n" : :
419 "X" (*r1), "X" (*r2), "X" (a));
420 #endif
421 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
422 __asm__ __VOLATILE (
423 "mulq %%rax"
424 : "=a" (*r1), "=d" (*r2)
425 : "0" (a)
426 : "cc");
427 #elif defined(HAVE_INT128)
428 unsigned __int128 r = (unsigned __int128) a * a;
429 *r1 = r;
430 *r2 = r >> 64;
431 #else
432 const uint64_t mask = ((uint64_t)1 << 32) - 1;
433 const uint64_t ah = a >> 32, al = a & mask;
434 uint64_t t1, t2, p1, p2;
435
436 t1 = al * al;
437 t2 = 0;
438 p1 = ah * al;
439 p2 = p1 >> 32;
440 p1 = (p1 & mask) << 32;
441 u64arith_add_2_2 (&t1, &t2, p1, p2);
442 u64arith_add_2_2 (&t1, &t2, p1, p2);
443 t2 += ah * ah;
444 *r1 = t1;
445 *r2 = t2;
446 #endif
447 }
448
449
450 /* Set *r to lo shifted right by i bits, filling in the low bits from hi into the high
451 bits of *r. I.e., *r = (hi * 2^64 + lo) / 2^i. Assumes 0 <= i < 64. */
452 static inline void
453 u64arith_shrd (uint64_t *r, const uint64_t hi, const uint64_t lo,
454 const unsigned char i)
455 {
456 ASSERT_EXPENSIVE (i < 64);
457 #ifdef U64ARITH_VERBOSE_ASM
458 /* Disable the "uninitialized" warning here, as *r is only written to and
459 does not need to be initialized, but we need to write (*r) here so the
460 "X" constraint can be resolved even when r does not have an address, e.g.,
461 when it is passed around in a register. It seems that "X" is assumed by
462 gcc as possibly referring to an input, and since "X" matches anything,
463 that's probably a necessary assumption to make. */
464 #if GNUC_VERSION_ATLEAST(4,4,0)
465 #if GNUC_VERSION_ATLEAST(4,6,0)
466 #pragma GCC diagnostic push
467 #endif
468 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
469 #endif
470 __asm__ ("# u64arith_shrd (*r=%0, hi=%1, lo=%2, i=%3)\n" : :
471 "X" (*r), "X" (hi), "X" (lo), "X" (i));
472 #if GNUC_VERSION_ATLEAST(4,6,0)
473 #pragma GCC diagnostic pop
474 #endif
475 #endif
476
477 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
478 __asm__ __VOLATILE (
479 "shrdq %3, %1, %0\n"
480 : "=rm" (*r)
481 : "r" (hi), "0" (lo), "cJ" (i) /* i can be in %cl or a literal constant < 64 */
482 : "cc");
483 #else
484 if (i > 0) /* shl by 64 is no-op on x86! */
485 *r = (lo >> i) | (hi << (64 - i));
486 else
487 *r = lo;
488 #endif
489 }
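
/* Illustrative sketch (hypothetical example, not part of the original
   header): extracting the low word of a right-shifted 128-bit value. */
#if 0
static inline void u64arith_shrd_example(void)
{
    uint64_t r;
    /* hi = 1, lo = 0 represents 2^64; shifted right by one bit it is 2^63 */
    u64arith_shrd(&r, 1, 0, 1);
    /* r == UINT64_C(1) << 63 */
}
#endif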
490
491
492 /* Shift the 128-bit integer in lo:hi right by i bits, 0 <= i < 64 */
493
494 static inline void
495 u64arith_shr_2 (uint64_t *lo, uint64_t *hi, const unsigned char i)
496 {
497 ASSERT_EXPENSIVE (i < 64);
498 u64arith_shrd(lo, *hi, *lo, i);
499 *hi >>= i;
500 }
501
502 /* Set *r to hi shifted left by i bits, filling in the high bits from lo into the low
503 bits of *r. I.e., *r = (hi + lo*2^-64) * 2^i. Assumes 0 <= i < 64. */
504 static inline void
505 u64arith_shld (uint64_t *r, const uint64_t lo, const uint64_t hi,
506 const unsigned char i)
507 {
508 ASSERT_EXPENSIVE (i < 64);
509 #ifdef U64ARITH_VERBOSE_ASM
510 #if GNUC_VERSION_ATLEAST(4,4,0)
511 #if GNUC_VERSION_ATLEAST(4,6,0)
512 #pragma GCC diagnostic push
513 #endif
514 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
515 #endif
516 __asm__ ("# u64arith_shld (*r=%0, lo=%1, hi=%2, i=%3)\n" : :
517 "X" (*r), "X" (lo), "X" (hi), "X" (i));
518 #if GNUC_VERSION_ATLEAST(4,6,0)
519 #pragma GCC diagnostic pop
520 #endif
521 #endif
522
523 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
524 __asm__ __VOLATILE (
525 "shldq %3, %1, %0\n"
526 : "=rm" (*r)
527 : "r" (lo), "0" (hi), "cJ" (i) /* i can be in %cl or a literal constant < 64 */
528 : "cc");
529 #else
530 if (i > 0) /* shr by 64 is no-op on x86! */
531 *r = (hi << i) | (lo >> (64 - i));
532 else
533 *r = hi;
534 #endif
535 }
536
537
538 /* Shift the 128-bit integer in lo:hi left by i bits, 0 <= i < 64 */
539
540 static inline void
541 u64arith_shl_2 (uint64_t *lo, uint64_t *hi, const unsigned char i)
542 {
543 ASSERT_EXPENSIVE (i < 64);
544 u64arith_shld(hi, *lo, *hi, i);
545 *lo <<= i;
546 }
547
548 /* Returns number of trailing zeros in a. a must not be zero */
549 static inline int
550 u64arith_ctz (const uint64_t a)
551 {
552 #if !defined (U64ARITH_NO_ASM) && GNUC_VERSION_ATLEAST(3, 4, 0)
553 ASSERT_EXPENSIVE (a != 0);
554 return __builtin_ctzll(a);
555 #else
556 static const unsigned char trailing_zeros[256] =
557 {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
558 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
559 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
560 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
561 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
562 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
563 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
564 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
565 char lsh, t = 0;
566 uint64_t y = a;
567 ASSERT_EXPENSIVE (a != 0);
568 do {
569 lsh = trailing_zeros [(unsigned char) y];
570 y >>= lsh;
571 t += lsh;
572 } while (lsh == 8);
573 return (int) t;
574 #endif
575 }
576
577 /* Returns number of leading zeros in a. a must not be zero */
578 static inline int
579 u64arith_clz (const uint64_t a)
580 {
581 #if !defined (U64ARITH_NO_ASM) && GNUC_VERSION_ATLEAST(3, 4, 0)
582 ASSERT_EXPENSIVE (a != 0);
583 STATIC_ASSERT(sizeof(uint64_t) == sizeof(unsigned long) || sizeof(uint64_t) == sizeof(unsigned long long), please_implement_me);
584 if (sizeof(uint64_t) == sizeof(unsigned long))
585 return __builtin_clzl(a);
586 else
587 return __builtin_clzll(a);
588 #else
589 uint64_t t = (uint64_t)1 << (64 - 1);
590 int i;
591 ASSERT_EXPENSIVE (a != 0);
592 for (i = 0; (a & t) == 0; i++)
593 t >>= 1;
594 return i;
595 #endif
596 }
597
598
599 /* Let A = a1 + a2*2^64. Returns q = floor(A / b), r = A mod b. Requires
600 a2 < b. Slow, simple binary grammar-school division. Used as reference
601 for testing faster code on machines without hardware division. */
602
603 static inline void
604 u64arith_divqr_2_1_1_slow (uint64_t *q, uint64_t *r,
605 const uint64_t a1, const uint64_t a2,
606 const uint64_t b)
607 {
608 uint64_t R1 = a1, R2 = a2, D1 = 0, D2 = b, M = (uint64_t)1 << 63, Q = 0;
609
610 ASSERT(a2 < b); /* Or there will be quotient overflow */
611
612 for (int i = 0; i < 64; i++) {
613 u64arith_shr_2(&D2, &D1, 1);
614 /* if R >= D */
615 if (!u64arith_gt_2_2(D1, D2, R1, R2)) {
616 u64arith_sub_2_2(&R1, &R2, D1, D2); /* R := R - D */
617 Q += M;
618 }
619 M >>= 1;
620 }
621
622 #ifdef WANT_ASSERT_EXPENSIVE
623 ASSERT_EXPENSIVE(R2 == 0 && R1 < b);
624 uint64_t P1, P2;
625 u64arith_mul_1_1_2(&P1, &P2, Q, b);
626 u64arith_add_2_2(&P1, &P2, R1, R2);
627 ASSERT_EXPENSIVE(P1 == a1 && P2 == a2);
628 #endif
629 *q = Q;
630 *r = R1;
631 }
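
/* Illustrative usage sketch (hypothetical example, not part of the original
   header): dividing A = 2^64 + 1 by 3. */
#if 0
static inline void u64arith_divqr_2_1_1_slow_example(void)
{
    uint64_t q, r;
    u64arith_divqr_2_1_1_slow(&q, &r, 1, 1, 3); /* a1 = 1 (low), a2 = 1 (high) */
    /* 2^64 + 1 = 3 * 6148914691236517205 + 2, so q == 6148914691236517205
       (= 0x5555555555555555) and r == 2 */
}
#endif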
632
633
634 /* Computes (2^128-1) / d - 2^64 for 2^63 <= d < 2^64. */
635
636 static inline uint64_t
637 u64arith_reciprocal_for_div(const uint64_t d)
638 {
639 /* Assumes 2^63 <= d <= 2^64-1 */
640 const uint64_t one = 1;
641 ASSERT(d >= (one << 63));
642 /* lut[i] = (2^19 - 3 * 256) / (i + 256) */
643 static const uint16_t lut[256] = {2045, 2037, 2029, 2021, 2013, 2005, 1998,
644 1990, 1983, 1975, 1968, 1960, 1953, 1946, 1938, 1931, 1924, 1917, 1910,
645 1903, 1896, 1889, 1883, 1876, 1869, 1863, 1856, 1849, 1843, 1836, 1830,
646 1824, 1817, 1811, 1805, 1799, 1792, 1786, 1780, 1774, 1768, 1762, 1756,
647 1750, 1745, 1739, 1733, 1727, 1722, 1716, 1710, 1705, 1699, 1694, 1688,
648 1683, 1677, 1672, 1667, 1661, 1656, 1651, 1646, 1641, 1636, 1630, 1625,
649 1620, 1615, 1610, 1605, 1600, 1596, 1591, 1586, 1581, 1576, 1572, 1567,
650 1562, 1558, 1553, 1548, 1544, 1539, 1535, 1530, 1526, 1521, 1517, 1513,
651 1508, 1504, 1500, 1495, 1491, 1487, 1483, 1478, 1474, 1470, 1466, 1462,
652 1458, 1454, 1450, 1446, 1442, 1438, 1434, 1430, 1426, 1422, 1418, 1414,
653 1411, 1407, 1403, 1399, 1396, 1392, 1388, 1384, 1381, 1377, 1374, 1370,
654 1366, 1363, 1359, 1356, 1352, 1349, 1345, 1342, 1338, 1335, 1332, 1328,
655 1325, 1322, 1318, 1315, 1312, 1308, 1305, 1302, 1299, 1295, 1292, 1289,
656 1286, 1283, 1280, 1276, 1273, 1270, 1267, 1264, 1261, 1258, 1255, 1252,
657 1249, 1246, 1243, 1240, 1237, 1234, 1231, 1228, 1226, 1223, 1220, 1217,
658 1214, 1211, 1209, 1206, 1203, 1200, 1197, 1195, 1192, 1189, 1187, 1184,
659 1181, 1179, 1176, 1173, 1171, 1168, 1165, 1163, 1160, 1158, 1155, 1153,
660 1150, 1148, 1145, 1143, 1140, 1138, 1135, 1133, 1130, 1128, 1125, 1123,
661 1121, 1118, 1116, 1113, 1111, 1109, 1106, 1104, 1102, 1099, 1097, 1095,
662 1092, 1090, 1088, 1086, 1083, 1081, 1079, 1077, 1074, 1072, 1070, 1068,
663 1066, 1064, 1061, 1059, 1057, 1055, 1053, 1051, 1049, 1047, 1044, 1042,
664 1040, 1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024};
665
666 const uint64_t
667 d0 = d % 2,
668 d9 = d >> 55, /* 2^8 <= d9 < 2^9 */
669 d40 = (d >> 24) + 1, /* 2^39 + 1 <= d40 < 2^40+1 */
670 d63 = d / 2 + d0; /* ceil(d/2), 2^62 <= d63 <= 2^63 */
671
672 uint64_t
673 v0 = lut[d9 - 256],
674 v1 = (1<<11)*v0 - (v0*v0*d40 >> 40) - 1,
675 v2 = (v1 << 13) + (v1 * ((one << 60) - v1 * d40) >> 47),
676 p1, p2;
677
678 u64arith_mul_1_1_2(&p1, &p2, v2, ((v2 / 2) & (-d0)) - v2 * d63);
679 uint64_t v3 = (v2 << 31) + p2 / 2;
680
681 u64arith_mul_1_1_2(&p1, &p2, v3, d);
682 u64arith_add_2_2(&p1, &p2, d, d);
683 uint64_t v4 = v3 - p2;
684
685 return v4;
686 }
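
/* Illustrative sketch (assumption: the value computed above equals
   floor((2^128 - 1) / d) - 2^64, the 2/1 reciprocal of Moeller and Granlund;
   the reference function below is hypothetical and requires HAVE_INT128). */
#if 0
static inline uint64_t u64arith_reciprocal_for_div_reference(const uint64_t d)
{
    /* requires 2^63 <= d < 2^64 */
    const unsigned __int128 num = ~(unsigned __int128) 0;  /* 2^128 - 1 */
    return (uint64_t) (num / d - ((unsigned __int128) 1 << 64));
}
#endif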
687
688 static inline uint64_t
689 u64arith_reciprocal_for_div_3by2(const uint64_t d0, const uint64_t d1)
690 {
691 uint64_t v = u64arith_reciprocal_for_div(d1);
692 uint64_t p = d1*v + d0;
693 if (p < d0) {
694 v--;
695 if (p >= d1) {
696 v--;
697 p -= d1;
698 }
699 p -= d1;
700 }
701 uint64_t t0, t1;
702 u64arith_mul_1_1_2(&t0, &t1, v, d0);
703 p += t1;
704 if (p < t1) {
705 v--;
706 if (u64arith_ge_2_2(t0, p, d0, d1)) {
707 v--;
708 }
709 }
710 return v;
711 }
712
713
714 static inline void
715 u64arith_divqr_2_1_1_recip_precomp (uint64_t *q, uint64_t *r,
716 const uint64_t a1, const uint64_t a2,
717 const uint64_t d, const uint64_t v,
718 const int s)
719 {
720 uint64_t u0 = a1, u1 = a2;
721 /* Adjust dividend to match divisor */
722 ASSERT_EXPENSIVE(0 <= s && s < 64);
723 ASSERT_EXPENSIVE((u1 & ~(UINT64_MAX >> s)) == 0);
724 u64arith_shl_2(&u0, &u1, s);
725
726 uint64_t q0, q1;
727 u64arith_mul_1_1_2(&q0, &q1, v, u1);
728 u64arith_add_2_2(&q0, &q1, u0, u1);
729 q1++;
730 uint64_t r0 = u0 - q1*d;
731 if (r0 > q0) {
732 q1--;
733 r0 += d;
734 }
735 if (r0 >= d) {
736 q1++;
737 r0 -= d;
738 }
739 r0 >>= s;
740
741 *q = q1;
742 *r = r0;
743 }
744
745 /* Integer division of two uint64_t values a2:a1 by a uint64_t divisor b.
746 Returns quotient and remainder. Uses the algorithm described in
747 Niels Möller and Torbjörn Granlund, Improved Division by Invariant
748 Integers, IEEE Transactions on Computers (Volume: 60, Issue: 2,
749 Feb. 2011) */
750
751 static inline void
752 u64arith_divqr_2_1_1_recip (uint64_t *q, uint64_t *r,
753 const uint64_t a1, const uint64_t a2,
754 const uint64_t b)
755 {
756 if (a2 == 0) {
757 *q = a1 / b;
758 *r = a1 % b;
759 return;
760 }
761
762 const int s = u64arith_clz(b);
763 uint64_t d = b << s;
764 /* Left-adjust divisor */
765 const uint64_t v = u64arith_reciprocal_for_div(d);
766 u64arith_divqr_2_1_1_recip_precomp(q, r, a1, a2, d, v, s);
767
768 #ifdef WANT_ASSERT_EXPENSIVE
769 uint64_t P1, P2;
770 u64arith_mul_1_1_2(&P1, &P2, *q, b);
771 unsigned char cy = u64arith_add_2_2_cy(&P1, &P2, *r, 0);
772 // printf("a1=%lu, a2=%lu, b=%lu, s=%d, q1=%lu, r0=%lu\n", a1, a2, b, s, q1, r0);
773 ASSERT_EXPENSIVE(P1 == a1 && P2 == a2 && cy == 0);
774 #endif
775 }
776
777
778 /* Integer division of two uint64_t values a2:a1 by a uint64_t divisor b.
779 Returns quotient and remainder. */
780
781 static inline void
782 u64arith_divqr_2_1_1 (uint64_t *q, uint64_t *r,
783 const uint64_t a1, const uint64_t a2,
784 const uint64_t b)
785 {
786 ASSERT(a2 < b); /* Or there will be quotient overflow */
787 #ifdef U64ARITH_VERBOSE_ASM
788 __asm__ ("# u64arith_divqr_2_1_1 (%0, %1, %2, %3, %4)\n" : :
789 "X" (*q), "X" (*r), "X" (a1), "X" (a2), "X" (b));
790 #endif
791 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
792 __asm__ __VOLATILE (
793 "divq %4"
794 : "=a" (*q), "=d" (*r)
795 : "0" (a1), "1" (a2), "rm" (b)
796 : "cc");
797 #else
798 u64arith_divqr_2_1_1_recip(q, r, a1, a2, b);
799 #endif
800 }
801
802
803 /* Integer division of two uint64_t values by a uint64_t divisor. Returns
804 only remainder. */
805
806 static inline void
807 u64arith_divr_2_1_1 (uint64_t *r, uint64_t a1, const uint64_t a2,
808 const uint64_t b)
809 {
810 ASSERT(a2 < b); /* Or there will be quotient overflow */
811 #ifdef U64ARITH_VERBOSE_ASM
812 __asm__ ("# u64arith_divr_2_1_1 (%0, %1, %2, %3)\n" : :
813 "X" (*r), "X" (a1), "X" (a2), "X" (b));
814 #endif
815 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
816 __asm__ __VOLATILE (
817 "divq %3"
818 : "+a" (a1), "=d" (*r)
819 : "1" (a2), "rm" (b)
820 : "cc");
821 #else
822 uint64_t dummy;
823 u64arith_divqr_2_1_1(&dummy, r, a1, a2, b);
824 #endif
825 }
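
/* Illustrative sketch (hypothetical helper, not part of the original header):
   combining u64arith_mul_1_1_2() and u64arith_divr_2_1_1() gives a 64-bit
   modular multiplication; reducing the inputs first guarantees the
   precondition a2 < b. */
#if 0
static inline uint64_t u64arith_mulmod_sketch(const uint64_t x,
                                              const uint64_t y,
                                              const uint64_t m)
{
    uint64_t lo, hi, r;
    u64arith_mul_1_1_2(&lo, &hi, x % m, y % m); /* hi < m since (m-1)^2 < m*2^64 */
    u64arith_divr_2_1_1(&r, lo, hi, m);
    return r;
}
#endif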
826
827 static inline void
828 u64arith_divqr_3_2_1_recip_precomp (
829 uint64_t *q, uint64_t *R0, uint64_t *R1,
830 uint64_t u0, uint64_t u1, uint64_t u2,
831 const uint64_t d0, const uint64_t d1, const uint64_t v,
832 const int s)
833 {
834 uint64_t q0, q1, t0, t1, r0, r1;
835 ASSERT((d1 & (UINT64_C(1) << 63)) != 0);
836 ASSERT(u64arith_lt_2_2(u1, u2, d0, d1));
837 ASSERT(0 <= s && s < 64);
838 ASSERT((u2 & ~(UINT64_MAX >> s)) == 0);
839
840 u64arith_shld(&u2, u1, u2, s);
841 u64arith_shld(&u1, u0, u1, s);
842 u0 <<= s;
843
844 u64arith_mul_1_1_2(&q0, &q1, u2, v);
845 u64arith_add_2_2(&q0, &q1, u1, u2);
846 r1 = u1 - q1*d1;
847 u64arith_mul_1_1_2(&t0, &t1, d0, q1);
848 r0 = u0;
849 u64arith_sub_2_2(&r0, &r1, t0, t1);
850 u64arith_sub_2_2(&r0, &r1, d0, d1);
851 q1++;
852 if (r1 >= q0) {
853 q1--;
854 u64arith_add_2_2(&r0, &r1, d0, d1);
855 }
856 if (u64arith_ge_2_2(r0, r1, d0, d1)) {
857 q1++;
858 u64arith_sub_2_2(&r0, &r1, d0, d1);
859 }
860 u64arith_shr_2(&r0, &r1, s);
861 *R0 = r0;
862 *R1 = r1;
863 *q = q1;
864 }
865
866 static inline void
867 u64arith_divqr_3_2_1 (uint64_t *q, uint64_t *R0, uint64_t *R1,
868 uint64_t u0, uint64_t u1, uint64_t u2,
869 uint64_t d0, uint64_t d1)
870 {
871 ASSERT(d1 != 0);
872 ASSERT(u64arith_lt_2_2(u1, u2, d0, d1));
873 int s = u64arith_clz(d1);
874 u64arith_shl_2(&d0, &d1, s);
875 const uint64_t v = u64arith_reciprocal_for_div_3by2(d0, d1);
876 u64arith_divqr_3_2_1_recip_precomp(q, R0, R1, u0, u1, u2, d0, d1, v, s);
877 }
878
879 /* Compute 1/n (mod 2^wordsize) */
880 static inline uint64_t
881 u64arith_invmod (const uint64_t n)
882 {
883 uint64_t r;
884
885 ASSERT (n % 2 != 0);
886
887 /* Suggestion from PLM: initializing the inverse to (3*n) XOR 2 gives the
888 correct inverse modulo 32, then 3 (for 32 bit) or 4 (for 64 bit)
889 Newton iterations are enough. */
890 r = (3 * n) ^ 2;
891 /* Newton iteration */
892 r = 2 * r - (uint32_t) r * (uint32_t) r * (uint32_t) n;
893 r = 2 * r - (uint32_t) r * (uint32_t) r * (uint32_t) n;
894 r = 2 * r - (uint32_t) r * (uint32_t) r * (uint32_t) n;
895 #if 0
896 r = 2 * r - r * r * n;
897 #else
898 /*
899 r*n = k*2^32 + 1
900
901 r' = 2 * r - r * r * n
902 r' = 2 * r - r * (k*2^32 + 1)
903 r' = 2 * r - r * k*2^32 - r
904 r' = r - r * k*2^32
905 r' = r - ((r * k) % 2^32) * 2^32
906 */
907 uint32_t k = (uint32_t)(r * n >> 32);
908 k *= (uint32_t) r;
909 r = r - ((uint64_t)k << 32);
910 #endif
911
912 ASSERT_EXPENSIVE(n * r == 1);
913 return r;
914 }
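
/* Illustrative usage sketch (hypothetical example, not part of the original
   header): the result is the inverse of n modulo 2^64. */
#if 0
static inline void u64arith_invmod_example(void)
{
    const uint64_t n = 0x10001;            /* any odd n works */
    const uint64_t r = u64arith_invmod(n);
    /* (uint64_t)(n * r) == 1, i.e., n * r == 1 (mod 2^64) */
    (void) r;
}
#endif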
915
916 /* Compute n/2 (mod m), where m must be odd. */
917 static inline uint64_t
918 u64arith_div2mod (const uint64_t n, const uint64_t m)
919 {
920 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
921 uint64_t N = n, M = m/2, t = 0;
922 ASSERT_EXPENSIVE (m % 2 != 0);
923 __asm__ __VOLATILE(
924 "shr $1, %1\n\t" /* N /= 2 */
925 "cmovc %0, %2\n\t" /* if (cy) {t = M;} */
926 "adc %2, %1\n\t" /* N += t + cy */
927 : "+&r" (M), "+&r" (N), "+&r" (t)
928 : : "cc"
929 );
930 return N;
931 #else
932 ASSERT_EXPENSIVE (m % 2 != 0);
933 if (n % 2 == 0)
934 return n / 2;
935 else
936 return n / 2 + m / 2 + 1;
937 #endif
938 }
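
/* Illustrative sketch (hypothetical example, not part of the original
   header): for odd n, n/2 (mod m) is n/2 + m/2 + 1, which equals (n + m)/2
   computed without overflow. */
#if 0
static inline void u64arith_div2mod_example(void)
{
    const uint64_t t = u64arith_div2mod(3, 7);
    /* t == 5, since 2 * 5 == 10 == 3 (mod 7) */
    (void) t;
}
#endif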
939
940
941 /* Integer (truncated) square root of n */
942 static inline uint64_t
943 u64arith_sqrt (const uint64_t n)
944 {
945 /* TODO: use Hensel lifting instead */
946 unsigned int i;
947 uint64_t xs, c, d, s2;
948 const unsigned int l = 63 - (unsigned int) u64arith_clz(n);
949
950 d = n; /* d = n - x^2 */
951 xs = 0;
952 s2 = (uint64_t)1 << (l - l % 2);
953
954 for (i = l / 2; i != 0; i--)
955 {
956 /* Here, s2 = 1 << (2*i) */
957 /* xs = x << (i + 1), the value of x shifted left i+1 bits */
958
959 c = xs + s2; /* c = (x + 2^i) ^ 2 - x^2 = 2^(i+1) * x + 2^(2*i) */
960 xs >>= 1; /* Now xs is shifted only i positions */
961 if (d >= c)
962 {
963 d -= c;
964 xs |= s2; /* x |= 1 << i <=> xs |= 1 << (2*i) */
965 }
966 s2 >>= 2;
967 }
968
969 c = xs + s2;
970 xs >>= 1;
971 if (d >= c)
972 xs |= s2;
973
974 return xs;
975 }
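
/* Illustrative usage sketch (hypothetical example, not part of the original
   header): the result is the floor of the real square root. */
#if 0
static inline void u64arith_sqrt_example(void)
{
    const uint64_t s = u64arith_sqrt(10);          /* s == 3 */
    const uint64_t t = u64arith_sqrt(UINT64_MAX);  /* t == 4294967295 (= 2^32 - 1) */
    (void) s; (void) t;
}
#endif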
976
977 /* Given r = -rem/p (mod den), we want num/(den*2^k) (mod p) ==
978 (ratio + rem/den)/2^k (mod p).
979 Using (a variant of) Bezout's identity, we have, for some non-negative
980 integer t,
981 r * p - t * den = -rem, or
982 r * p + rem = t * den,
983 thus den | (r * p + rem), and thus
984 t = (r * p + rem) / den is an integer and satisfies
985 t = rem/den (mod p).
986
987 We have 0 <= r <= den-1 and rem <= den-1, and thus
988 0 <= t = p * r/den + rem/den <=
989 p (1 - 1/den) + 1 - 1/den =
990 p + 1 - (p + 1)/den < p + 1.
991 Thus t is almost a properly reduced residue for rem/den (mod p).
992 As p fits in uint64_t, so does t, and we can compute t modulo
993 2^64; since den is odd, we can multiply by den^{-1} mod 2^64
994 to effect division by den.
995
996 Finally we compute (t + ratio)/2^k mod p = num/(den*2^k) mod p. */
997
998 static inline uint64_t
999 u64arith_post_process_inverse(const uint64_t r, const uint64_t p,
1000 const uint64_t rem, const uint64_t den_inv,
1001 const uint64_t ratio, const uint64_t k)
1002 {
1003 uint64_t t = (r * p + rem) * den_inv;
1004 const uint64_t ratio_p = (ratio >= p) ? ratio % p : ratio;
1005 ASSERT_ALWAYS(t <= p); /* Cheap and fairly strong test */
1006 /* u64arith_addmod_1_1() accepts third operand == p and still produces
1007 a properly reduced sum mod p. */
1008 u64arith_addmod_1_1 (&t, ratio_p, t, p);
1009
1010 ASSERT_EXPENSIVE(t < p);
1011 ASSERT_EXPENSIVE(k == 0 || p % 2 == 1);
1012 for (uint64_t j = 0; j < k; j++) {
1013 t = u64arith_div2mod(t, p);
1014 }
1015 return t;
1016 }
1017
1018
1019 /* Computes r = ((phigh * 2^64 + plow) / 2^64) % m
1020 Requires phigh < m and invm = -1/m (mod 2^64). */
1021
1022 static inline void
1023 u64arith_redc(uint64_t *r, const uint64_t plow,
1024 const uint64_t phigh, const uint64_t m,
1025 const uint64_t invm)
1026 {
1027 uint64_t t = phigh;
1028
1029 ASSERT_EXPENSIVE (phigh < m);
1030
1031 #if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
1032
1033 /* TODO: are the register constraints watertight?
1034 %rax gets modified but putting tlow as an output constraint with "+"
1035 will keep r from getting allocated in %rax, which is a shame
1036 since we often want the result in %rax for the next multiply. */
1037
1038 __asm__ __VOLATILE (
1039 "imulq %[invm], %%rax\n\t"
1040 "cmpq $1, %%rax \n\t" /* if plow != 0, increase t */
1041 "sbbq $-1, %[t]\n\t"
1042 "mulq %[m]\n\t"
1043 "lea (%[t],%%rdx,1), %[r]\n\t" /* compute (rdx + thigh) mod m */
1044 "subq %[m], %[t]\n\t"
1045 "addq %%rdx, %[t]\n\t"
1046 "cmovcq %[t], %[r]\n\t"
1047 : [t] "+&r" (t), [r] "=&r" (r[0])
1048 : [invm] "rm" (invm), [m] "rm" (m), "a" (plow)
1049 : "%rdx", "cc"
1050 );
1051 #else
1052 uint64_t tlow, thigh;
1053 tlow = plow * invm;
1054 u64arith_mul_1_1_2 (&tlow, &thigh, tlow, m);
1055 /* Let w = 2^wordsize. We know (phigh * w + plow) + (thigh * w + tlow)
1056 == 0 (mod w) so either plow == tlow == 0, or plow !=0 and tlow != 0.
1057 In the former case we want phigh + thigh + 1, in the latter
1058 phigh + thigh. Since t = phigh < m, and u64arith_addmod_1_1() can handle the
1059 case where the second operand is equal to m, adding 1 is safe */
1060
1061 t += (plow != 0) ? 1 : 0; /* Does not depend on the mul */
1062
1063 u64arith_addmod_1_1(r, t, thigh, m);
1064 #endif
1065 ASSERT_EXPENSIVE (r[0] < m);
1066 }
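
/* Illustrative sketch (hypothetical helper, not part of the original header):
   a Montgomery multiplication (x * y / 2^64) mod m built from
   u64arith_mul_1_1_2() and u64arith_redc(). Here invm = -1/m (mod 2^64),
   which can be obtained as 0 - u64arith_invmod(m); x, y < m guarantees
   phigh < m as required. */
#if 0
static inline uint64_t u64arith_mulredc_sketch(const uint64_t x,
                                               const uint64_t y,
                                               const uint64_t m,
                                               const uint64_t invm)
{
    uint64_t plow, phigh, r;
    u64arith_mul_1_1_2(&plow, &phigh, x, y);
    u64arith_redc(&r, plow, phigh, m, invm);
    return r;
}
#endif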
1067
1068
1069 #ifdef __cplusplus
1070 }
1071 #endif
1072
1073
1074 #endif /* ifndef U64_ARITH_H__ */
1075