/* Some commonly used assembly helper macros for arithmetic on
   unsigned 64-bit integers.
   Defining U64ARITH_VERBOSE_ASM puts in the asm output a line for each
   function call that shows what registers/memory locations the operands
   are in.
   Defining U64ARITH_NO_ASM avoids asm macros and uses the C fallback code
   where available. This can be used for testing the fallback code.
*/

#ifndef U64_ARITH_H__
#define U64_ARITH_H__

#include <stdint.h>
#include <stdio.h>
#include "macros.h"

#if defined(HAVE_INT128)
typedef union {uint64_t x[2]; unsigned __int128 y;} _u64arith_union2_t;
#endif

#ifdef __cplusplus
extern "C" {
#endif

/** Test whether a1:a2 > b1:b2
 *
 * Let a = a1 + 2^64 * a2, b = b1 + 2^64 * b2. Return 1 if a > b,
 * and 0 otherwise.
 */
static inline int
u64arith_gt_2_2(uint64_t a1, uint64_t a2, uint64_t b1, uint64_t b2) ATTRIBUTE((const));
static inline int
u64arith_gt_2_2(const uint64_t a1, const uint64_t a2,
                const uint64_t b1, const uint64_t b2)
{
#if defined(GENUINE_GNUC) && __GNUC__ >= 8 && defined(HAVE_INT128)
    _u64arith_union2_t a = {{a1, a2}}, b = {{b1, b2}};
    return a.y > b.y;
#else
    return a2 > b2 || (a2 == b2 && a1 > b1);
#endif
}

/** Test whether a1:a2 >= b1:b2
 *
 * Let a = a1 + 2^64 * a2, b = b1 + 2^64 * b2. Return 1 if a >= b,
 * and 0 otherwise.
 */
static inline int
u64arith_ge_2_2(uint64_t a1, uint64_t a2, uint64_t b1, uint64_t b2) ATTRIBUTE((const));
static inline int
u64arith_ge_2_2(const uint64_t a1, const uint64_t a2,
                const uint64_t b1, const uint64_t b2)
{
#if defined(GENUINE_GNUC) && __GNUC__ >= 8 && defined(HAVE_INT128)
    _u64arith_union2_t a = {{a1, a2}}, b = {{b1, b2}};
    return a.y >= b.y;
#else
    return a2 > b2 || (a2 == b2 && a1 >= b1);
#endif
}

/** Test whether a1:a2 < b1:b2
 *
 * Let a = a1 + 2^64 * a2, b = b1 + 2^64 * b2. Return 1 if a < b,
 * and 0 otherwise.
 */
static inline int
u64arith_lt_2_2(uint64_t a1, uint64_t a2, uint64_t b1, uint64_t b2) ATTRIBUTE((const));
static inline int
u64arith_lt_2_2(const uint64_t a1, const uint64_t a2,
                const uint64_t b1, const uint64_t b2)
{
#if defined(GENUINE_GNUC) && __GNUC__ >= 8 && defined(HAVE_INT128)
    _u64arith_union2_t a = {{a1, a2}}, b = {{b1, b2}};
    return a.y < b.y;
#else
    return a2 < b2 || (a2 == b2 && a1 < b1);
#endif
}

/** Test whether a1:a2 <= b1:b2
 *
 * Let a = a1 + 2^64 * a2, b = b1 + 2^64 * b2. Return 1 if a <= b,
 * and 0 otherwise.
 */
static inline int
u64arith_le_2_2(uint64_t a1, uint64_t a2, uint64_t b1, uint64_t b2) ATTRIBUTE((const));
static inline int
u64arith_le_2_2(const uint64_t a1, const uint64_t a2,
                const uint64_t b1, const uint64_t b2)
{
#if defined(GENUINE_GNUC) && __GNUC__ >= 8 && defined(HAVE_INT128)
    _u64arith_union2_t a = {{a1, a2}}, b = {{b1, b2}};
    return a.y <= b.y;
#else
    return a2 < b2 || (a2 == b2 && a1 <= b1);
#endif
}
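
/* Illustrative usage sketch (the function name is hypothetical and not part
   of the original interface): with the low-word-first convention used above,
   picking the larger of two 128-bit values (low words a1, b1; high words
   a2, b2) takes a single comparison call. */
static inline void
u64arith_example_max_2_2 (uint64_t *r1, uint64_t *r2,
                          const uint64_t a1, const uint64_t a2,
                          const uint64_t b1, const uint64_t b2)
{
  if (u64arith_ge_2_2 (a1, a2, b1, b2)) {
    *r1 = a1; /* a is the larger (or equal) value */
    *r2 = a2;
  } else {
    *r1 = b1;
    *r2 = b2;
  }
}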

/** Add r1:r2 += a1:a2 with carry propagation
 *
 * Let r = r1 + 2^64 * r2, a = a1 + 2^64 * a2. Set r1:r2 := (r+a) mod 2^128.
 */
static inline void
u64arith_add_2_2 (uint64_t *r1, uint64_t *r2,
                  const uint64_t a1, const uint64_t a2)
{
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) \
    && ( defined(GENUINE_GNUC) && __GNUC__ <= 7 || defined(__INTEL_COMPILER) )
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_add_2_2 (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
  __asm__ __VOLATILE (
    "addq %2, %0\n\t"
    "adcq %3, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "rme" (a1), "rme" (a2)
    : "cc");
#else
  /* On x86_64, Clang 6.0, 7.0, 8.0, 9.0, gcc 8.0 and 9.0 produce good code
   * from this. Intel icc however produces terrible code. */
  *r1 += a1;
  *r2 += a2 + (*r1 < a1);
#endif
}

/** Add r1:r2 += a with carry propagation
 *
 * Let r = r1 + 2^64 * r2. Set r1:r2 := (r+a) mod 2^128.
 */
static inline void
u64arith_add_1_2 (uint64_t *r1, uint64_t *r2, const uint64_t a)
{
  /* We have assembly only for x86_64 and on that architecture, this function
     would generate the same code as u64arith_add_2_2() with an immediate
     zero value for the high word, so we fall back to that. */
  u64arith_add_2_2(r1, r2, a, 0);
}


/* Add two uint64_t to two uint64_t with carry propagation from the low
   word (r1) to the high word (r2). Returns 1 if there was a carry out
   of the high word, otherwise returns 0. */

static inline char
u64arith_add_2_2_cy (uint64_t *r1, uint64_t *r2,
                     const uint64_t a1, const uint64_t a2)
{
  char cy;
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_add_2_2_cy (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "addq %3, %0\n\t"
    "adcq %4, %1\n\t"
    "setc %2\n"
    : "+&r" (*r1), "+r" (*r2), "=r" (cy)
    : "rme" (a1), "rme" (a2)
    : "cc");
#else
  uint64_t u1 = *r1 + a1,
           u2 = *r2 + a2 + (u1 < a1);
  /* Overflow occurred iff the sum is smaller than one of the summands */
  cy = u64arith_gt_2_2(a1, a2, u1, u2);
  *r1 = u1;
  *r2 = u2;
#endif
  return cy;
}
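
/* Illustrative usage sketch (hypothetical helper, not in the original
   header): the carry returned by u64arith_add_2_2_cy() lets the addition be
   chained into a third word, i.e., add a1 + 2^64*a2 to the three-word
   integer r1 + 2^64*r2 + 2^128*r3. */
static inline void
u64arith_example_add_2_3 (uint64_t *r1, uint64_t *r2, uint64_t *r3,
                          const uint64_t a1, const uint64_t a2)
{
  const char cy = u64arith_add_2_2_cy (r1, r2, a1, a2);
  *r3 += (uint64_t) cy; /* propagate the carry into the third word */
}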


/** Compute r = (a + b) % m
 *
 * Requires a < m and b <= m, then r == a+b (mod m) and r < m.
 */
static inline void
u64arith_addmod_1_1 (uint64_t *r, const uint64_t a,
                     const uint64_t b, const uint64_t m)
{
  ASSERT_EXPENSIVE (a < m && b <= m);

#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  {
    uint64_t t = a + b, tr = a - m;

    __asm__ __VOLATILE (
      "addq %2, %0\n\t"   /* tr += b */
      "cmovnc %1, %0\n\t"  /* if (!cy) tr = t */
      : "+&r" (tr)
      : "rm" (t), "rme" (b)
      : "cc"
    );
    ASSERT_EXPENSIVE (tr == ((a >= m - b) ? (a - (m - b)) : (a + b)));
    r[0] = tr;
  }
#else
  r[0] = (b >= m - a) ? (b - (m - a)) : (a + b);
#endif

  ASSERT_EXPENSIVE (r[0] < m);
}
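
/* Usage sketch (hypothetical function name): doubling a residue modulo m.
   Both summands are the same value a, which must be < m as required by
   u64arith_addmod_1_1(). */
static inline uint64_t
u64arith_example_double_mod (const uint64_t a, const uint64_t m)
{
  uint64_t r;
  u64arith_addmod_1_1 (&r, a, a, m); /* r = 2*a mod m, assuming a < m */
  return r;
}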


/** Subtract r1:r2 -= a1:a2 with borrow propagation
 *
 * Let r = r1 + 2^64 * r2, a = a1 + 2^64 * a2. Set r1:r2 := (r-a) mod 2^128.
 */
static inline void
u64arith_sub_2_2 (uint64_t *r1, uint64_t *r2,
                  const uint64_t a1, const uint64_t a2)
{
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_sub_2_2 (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "subq %2, %0\n\t"
    "sbbq %3, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "rme" (a1), "rme" (a2)
    : "cc");
#else
  uint64_t u = *r1;
  *r1 -= a1;
  *r2 -= a2 + (*r1 > u);
#endif
}


/** Subtract r1:r2 -= a with borrow propagation
 *
 * Let r = r1 + 2^64 * r2. Set r1:r2 := (r-a) mod 2^128.
 */
static inline void
u64arith_sub_1_2 (uint64_t *r1, uint64_t *r2,
                  const uint64_t a)
{
    u64arith_sub_2_2(r1, r2, a, 0);
}


/* Subtract two uint64_t from two uint64_t with borrow propagation
   from low word (r1) to high word (r2). Returns 1 if there was a borrow out
   from high word, otherwise returns 0. */

static inline char
u64arith_sub_2_2_cy (uint64_t *r1, uint64_t *r2,
                     const uint64_t a1, const uint64_t a2)
{
  char cy;
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_sub_2_2_cy (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "subq %3, %0\n\t"
    "sbbq %4, %1\n\t"
    "setc %2\n"
    : "+&r" (*r1), "+r" (*r2), "=r" (cy)
    : "rme" (a1), "rme" (a2)
    : "cc");
#else
  uint64_t u1 = *r1 - a1, u2 = *r2 - a2;
  if (a1 > *r1)
    u2--;
  cy = u64arith_gt_2_2(a1, a2, *r1, *r2);
  *r1 = u1;
  *r2 = u2;
#endif
  return cy;
}


/* Subtract only if result is non-negative */
static inline void
u64arith_sub_1_1_ge (uint64_t *r, const uint64_t a)
{
  uint64_t t = *r;
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_sub_1_1_ge (%0, %1)\n" : :
           "X" (*r), "X" (a));
#endif
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "subq %2, %0\n\t" /* r -= a */
    "cmovc %1, %0\n\t" /* If there's a borrow, restore r from t */
    : "+&r" (*r)
    : "r" (t), "rme" (a)
    : "cc");
#else
  t -= a;
  if (*r >= a)
    *r = t;
#endif
}

/* Subtract only if result is non-negative */
static inline void
u64arith_sub_2_2_ge (uint64_t *r1, uint64_t *r2,
                     const uint64_t a1, const uint64_t a2)
{
  uint64_t t1 = *r1, t2 = *r2;
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_sub_2_2_ge (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "subq %4, %0\n\t" /* r1 -= a1 */
    "sbbq %5, %1\n\t" /* r2 -= a2 + cy */
    "cmovc %2, %0\n\t" /* If there's a borrow, restore r1 from t1 */
    "cmovc %3, %1\n\t" /* and r2 from t2 */
    : "+&r" (*r1), "+&r" (*r2)
    : "r" (t1), "r" (t2), "rme" (a1), "rme" (a2)
    : "cc");
#else
  if (!u64arith_gt_2_2(a1, a2, *r1, *r2))
    {
      *r1 = t1 - a1;
      *r2 = t2 - a2 - (a1 > t1);
    }
#endif
}


/** Compute r = (a - b) % m
 *
 * Requires a < m and b < m, then r == a-b (mod m) and r < m.
 */
static inline void
u64arith_submod_1_1 (uint64_t *r, const uint64_t a,
                     const uint64_t b, const uint64_t m)
{
    ASSERT_EXPENSIVE (a < m && b < m);
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
    /* Could do tr = a+m-b; t = a-b; if (!carry) tr = t; */
    uint64_t tr, t = a;
    __asm__ __VOLATILE (
      "sub %2, %1\n\t"  /* t -= b ( = a - b) */
      "lea (%1,%3,1), %0\n\t" /* tr = t + m ( = a - b + m) */
      "cmovnc %1, %0\n\t" /* if (a >= b) tr = t */
      : "=&r" (tr), "+&r" (t)
      : "rme" (b), "r" (m)
      : "cc"
    );
    r[0] = tr;
#elif 1
    /* Seems to be faster than the one below */
    uint64_t t = 0, tr;
    if ((tr = a - b) > a)
      t = m;
    r[0] = tr + t;
#else
    r[0] = (a < b) ? (a - b + m) : (a - b);
#endif

    ASSERT_EXPENSIVE (r[0] < m);
}
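
/* Usage sketch (hypothetical function name): modular negation via
   u64arith_submod_1_1() with a zero minuend. For a = 0 the result is 0,
   otherwise m - a; requires a < m. */
static inline uint64_t
u64arith_example_neg_mod (const uint64_t a, const uint64_t m)
{
  uint64_t r;
  u64arith_submod_1_1 (&r, 0, a, m); /* r = -a mod m, assuming a < m */
  return r;
}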


/* Multiply two uint64_t "a" and "b" and put the result as
   r2:r1 (r2 being the high word) */

static inline void
u64arith_mul_1_1_2 (uint64_t *r1, uint64_t *r2,
                    const uint64_t a, const uint64_t b)
{
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_mul_1_1_2 (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a), "X" (b));
#endif
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "mulq %3"
    : "=a" (*r1), "=d" (*r2)
    : "%0" (a), "rm" (b)
    : "cc");
#elif defined(HAVE_INT128)
    unsigned __int128 r = (unsigned __int128) a * b;
    *r1 = r;
    *r2 = r >> 64;
#else
  const uint64_t mask = ((uint64_t)1 << 32) - 1;
  const uint64_t ah = a >> 32, al = a & mask, bh = b >> 32, bl = b & mask;
  uint64_t t1, t2, p1, p2;

  t1 = al * bl;
  t2 = 0;
  p1 = ah * bl;
  p2 = p1 >> 32;
  p1 = (p1 & mask) << 32;
  u64arith_add_2_2 (&t1, &t2, p1, p2);
  p1 = al * bh;
  p2 = p1 >> 32;
  p1 = (p1 & mask) << 32;
  u64arith_add_2_2 (&t1, &t2, p1, p2);
  t2 += ah * bh;
  *r1 = t1;
  *r2 = t2;
#endif
}
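
/* Usage sketch (hypothetical function name): a 64x64->64 multiplication with
   overflow detection. The product overflows 64 bits exactly when the high
   word returned by u64arith_mul_1_1_2() is non-zero. */
static inline int
u64arith_example_mul_overflows (uint64_t *r, const uint64_t a, const uint64_t b)
{
  uint64_t lo, hi;
  u64arith_mul_1_1_2 (&lo, &hi, a, b);
  *r = lo;
  return hi != 0;
}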


/* Square the uint64_t "a" and put the result as r2:r1
   (r2 being the high word) */

static inline void
u64arith_sqr_1_2 (uint64_t *r1, uint64_t *r2,
                  const uint64_t a)
{
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_sqr_1_2 (%0, %1, %2)\n" : :
           "X" (*r1), "X" (*r2), "X" (a));
#endif
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "mulq %%rax"
    : "=a" (*r1), "=d" (*r2)
    : "0" (a)
    : "cc");
#elif defined(HAVE_INT128)
    unsigned __int128 r = (unsigned __int128) a * a;
    *r1 = r;
    *r2 = r >> 64;
#else
  const uint64_t mask = ((uint64_t)1 << 32) - 1;
  const uint64_t ah = a >> 32, al = a & mask;
  uint64_t t1, t2, p1, p2;

  t1 = al * al;
  t2 = 0;
  p1 = ah * al;
  p2 = p1 >> 32;
  p1 = (p1 & mask) << 32;
  u64arith_add_2_2 (&t1, &t2, p1, p2);
  u64arith_add_2_2 (&t1, &t2, p1, p2);
  t2 += ah * ah;
  *r1 = t1;
  *r2 = t2;
#endif
}


/* Set *r to lo shifted right by i bits, filling in the low bits from hi into
   the high bits of *r. I.e., *r = ((hi * 2^64 + lo) / 2^i) mod 2^64.
   Assumes 0 <= i < 64. */
static inline void
u64arith_shrd (uint64_t *r, const uint64_t hi, const uint64_t lo,
               const unsigned char i)
{
  ASSERT_EXPENSIVE (i < 64);
#ifdef U64ARITH_VERBOSE_ASM
/* Disable the "uninitialized" warning here, as *r is only written to and
   does not need to be initialized, but we need to write (*r) here so the
   "X" constraint can be resolved even when r does not have an address, e.g.,
   when it is passed around in a register. It seems that "X" is assumed by
   gcc as possibly referring to an input, and since "X" matches anything,
   that's probably a necessary assumption to make. */
#if GNUC_VERSION_ATLEAST(4,4,0)
#if GNUC_VERSION_ATLEAST(4,6,0)
#pragma GCC diagnostic push
#endif
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
  __asm__ ("# u64arith_shrd (*r=%0, hi=%1, lo=%2, i=%3)\n" : :
           "X" (*r), "X" (hi), "X" (lo), "X" (i));
#if GNUC_VERSION_ATLEAST(4,6,0)
#pragma GCC diagnostic pop
#endif
#endif

#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "shrdq %3, %1, %0\n"
    : "=rm" (*r)
    : "r" (hi), "0" (lo), "cJ" (i) /* i can be in %cl or a literal constant < 64 */
    : "cc");
#else
  if (i > 0) /* shl by 64 is no-op on x86! */
    *r = (lo >> i) | (hi << (64 - i));
  else
    *r = lo;
#endif
}


/* Shift the 128-bit integer in lo:hi right by i bits, 0 <= i < 64 */

static inline void
u64arith_shr_2 (uint64_t *lo, uint64_t *hi, const unsigned char i)
{
  ASSERT_EXPENSIVE (i < 64);
  u64arith_shrd(lo, *hi, *lo, i);
  *hi >>= i;
}

/* Set *r to hi shifted left by i bits, filling in the high bits from lo into
   the low bits of *r. I.e., *r = (hi + lo*2^-64) * 2^i. Assumes 0 <= i < 64. */
static inline void
u64arith_shld (uint64_t *r, const uint64_t lo, const uint64_t hi,
               const unsigned char i)
{
  ASSERT_EXPENSIVE (i < 64);
#ifdef U64ARITH_VERBOSE_ASM
#if GNUC_VERSION_ATLEAST(4,4,0)
#if GNUC_VERSION_ATLEAST(4,6,0)
#pragma GCC diagnostic push
#endif
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
  __asm__ ("# u64arith_shld (*r=%0, lo=%1, hi=%2, i=%3)\n" : :
           "X" (*r), "X" (lo), "X" (hi), "X" (i));
#if GNUC_VERSION_ATLEAST(4,6,0)
#pragma GCC diagnostic pop
#endif
#endif

#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "shldq %3, %1, %0\n"
    : "=rm" (*r)
    : "r" (lo), "0" (hi), "cJ" (i) /* i can be in %cl or a literal constant < 64 */
    : "cc");
#else
  if (i > 0) /* shr by 64 is no-op on x86! */
    *r = (hi << i) | (lo >> (64 - i));
  else
    *r = hi;
#endif
}


/* Shift the 128-bit integer in lo:hi left by i bits, 0 <= i < 64 */

static inline void
u64arith_shl_2 (uint64_t *lo, uint64_t *hi, const unsigned char i)
{
  ASSERT_EXPENSIVE (i < 64);
  u64arith_shld(hi, *lo, *hi, i);
  *lo <<= i;
}
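
/* Usage sketch (hypothetical function name): extract the 64-bit window
   starting at bit position i (0 <= i < 64) of the 128-bit value hi:lo,
   i.e. bits i..i+63, with a single double-word shift. */
static inline uint64_t
u64arith_example_extract_64 (const uint64_t lo, const uint64_t hi,
                             const unsigned char i)
{
  uint64_t r;
  u64arith_shrd (&r, hi, lo, i); /* r = (hi:lo >> i) mod 2^64 */
  return r;
}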

/* Returns number of trailing zeros in a. a must not be zero */
static inline int
u64arith_ctz (const uint64_t a)
{
#if !defined (U64ARITH_NO_ASM) && GNUC_VERSION_ATLEAST(3, 4, 0)
  ASSERT_EXPENSIVE (a != 0);
  return __builtin_ctzll(a);
#else
  static const unsigned char trailing_zeros[256] =
    {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
  char lsh, t = 0;
  uint64_t y = a;
  ASSERT_EXPENSIVE (a != 0);
  do {
    lsh = trailing_zeros [(unsigned char) y];
    y >>= lsh;
    t += lsh;
  } while (lsh == 8);
  return (int) t;
#endif
}

/* Returns number of leading zeros in a. a must not be zero */
static inline int
u64arith_clz (const uint64_t a)
{
#if !defined (U64ARITH_NO_ASM) && GNUC_VERSION_ATLEAST(3, 4, 0)
  ASSERT_EXPENSIVE (a != 0);
  STATIC_ASSERT(sizeof(uint64_t) == sizeof(unsigned long) || sizeof(uint64_t) == sizeof(unsigned long long), please_implement_me);
  if (sizeof(uint64_t) == sizeof(unsigned long))
    return __builtin_clzl(a);
  else
    return __builtin_clzll(a);
#else
  uint64_t t = (uint64_t)1 << (64 - 1);
  int i;
  ASSERT_EXPENSIVE (a != 0);
  for (i = 0; (a & t) == 0; i++)
    t >>= 1;
  return i;
#endif
}
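
/* Usage sketch (hypothetical function name): strip the 2-valuation of a
   non-zero value, as done for instance at the start of a binary gcd.
   Returns the odd part of a and optionally reports the exponent of 2. */
static inline uint64_t
u64arith_example_odd_part (const uint64_t a, int *e)
{
  const int v = u64arith_ctz (a); /* a must be non-zero */
  if (e != NULL)
    *e = v;
  return a >> v;
}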


/* Let A = a1 + a2*2^64. Returns q = floor(A / b), r = A mod b. Requires
   a2 < b. Slow, simple binary grammar-school division. Used as reference
   for testing faster code on machines without hardware division. */

static inline void
u64arith_divqr_2_1_1_slow (uint64_t *q, uint64_t *r,
                      const uint64_t a1, const uint64_t a2,
                      const uint64_t b)
{
  uint64_t R1 = a1, R2 = a2, D1 = 0, D2 = b, M = (uint64_t)1 << 63, Q = 0;

  ASSERT(a2 < b); /* Or there will be quotient overflow */

  for (int i = 0; i < 64; i++) {
    u64arith_shr_2(&D1, &D2, 1); /* D := D / 2, with D = D1 + 2^64 * D2 */
    /* if R >= D */
    if (!u64arith_gt_2_2(D1, D2, R1, R2)) {
      u64arith_sub_2_2(&R1, &R2, D1, D2); /* R := R - D */
      Q += M;
    }
    M >>= 1;
  }

#ifdef WANT_ASSERT_EXPENSIVE
  ASSERT_EXPENSIVE(R2 == 0 && R1 < b);
  uint64_t P1, P2;
  u64arith_mul_1_1_2(&P1, &P2, Q, b);
  u64arith_add_2_2(&P1, &P2, R1, R2);
  ASSERT_EXPENSIVE(P1 == a1 && P2 == a2);
#endif
  *q = Q;
  *r = R1;
}


/* Computes floor((2^128-1) / d) - 2^64, for 2^63 <= d < 2^64. This is the
   reciprocal used by the reciprocal-based division functions below. */

static inline uint64_t
u64arith_reciprocal_for_div(const uint64_t d)
{
  /* Assumes 2^63 <= d <= 2^64-1 */
  const uint64_t one = 1;
  ASSERT(d >= (one << 63));
  /* lut[i] = (2^19 - 3 * 256) / (i + 256) */
  static const uint16_t lut[256] = {2045, 2037, 2029, 2021, 2013, 2005, 1998,
    1990, 1983, 1975, 1968, 1960, 1953, 1946, 1938, 1931, 1924, 1917, 1910,
    1903, 1896, 1889, 1883, 1876, 1869, 1863, 1856, 1849, 1843, 1836, 1830,
    1824, 1817, 1811, 1805, 1799, 1792, 1786, 1780, 1774, 1768, 1762, 1756,
    1750, 1745, 1739, 1733, 1727, 1722, 1716, 1710, 1705, 1699, 1694, 1688,
    1683, 1677, 1672, 1667, 1661, 1656, 1651, 1646, 1641, 1636, 1630, 1625,
    1620, 1615, 1610, 1605, 1600, 1596, 1591, 1586, 1581, 1576, 1572, 1567,
    1562, 1558, 1553, 1548, 1544, 1539, 1535, 1530, 1526, 1521, 1517, 1513,
    1508, 1504, 1500, 1495, 1491, 1487, 1483, 1478, 1474, 1470, 1466, 1462,
    1458, 1454, 1450, 1446, 1442, 1438, 1434, 1430, 1426, 1422, 1418, 1414,
    1411, 1407, 1403, 1399, 1396, 1392, 1388, 1384, 1381, 1377, 1374, 1370,
    1366, 1363, 1359, 1356, 1352, 1349, 1345, 1342, 1338, 1335, 1332, 1328,
    1325, 1322, 1318, 1315, 1312, 1308, 1305, 1302, 1299, 1295, 1292, 1289,
    1286, 1283, 1280, 1276, 1273, 1270, 1267, 1264, 1261, 1258, 1255, 1252,
    1249, 1246, 1243, 1240, 1237, 1234, 1231, 1228, 1226, 1223, 1220, 1217,
    1214, 1211, 1209, 1206, 1203, 1200, 1197, 1195, 1192, 1189, 1187, 1184,
    1181, 1179, 1176, 1173, 1171, 1168, 1165, 1163, 1160, 1158, 1155, 1153,
    1150, 1148, 1145, 1143, 1140, 1138, 1135, 1133, 1130, 1128, 1125, 1123,
    1121, 1118, 1116, 1113, 1111, 1109, 1106, 1104, 1102, 1099, 1097, 1095,
    1092, 1090, 1088, 1086, 1083, 1081, 1079, 1077, 1074, 1072, 1070, 1068,
    1066, 1064, 1061, 1059, 1057, 1055, 1053, 1051, 1049, 1047, 1044, 1042,
    1040, 1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024};

  const uint64_t
      d0 = d % 2,
      d9 = d >> 55, /* 2^8 <= d9 < 2^9 */
      d40 = (d >> 24) + 1, /* 2^39 + 1 <= d40 < 2^40+1 */
      d63 = d / 2 + d0; /* ceil(d/2), 2^62 <= d63 <= 2^63 */

  uint64_t
    v0 = lut[d9 - 256],
    v1 = (1<<11)*v0 - (v0*v0*d40 >> 40) - 1,
    v2 = (v1 << 13) + (v1 * ((one << 60) - v1 * d40) >> 47),
    p1, p2;

  u64arith_mul_1_1_2(&p1, &p2, v2, ((v2 / 2) & (-d0)) - v2 * d63);
  uint64_t v3 = (v2 << 31) + p2 / 2;

  u64arith_mul_1_1_2(&p1, &p2, v3, d);
  u64arith_add_2_2(&p1, &p2, d, d);
  uint64_t v4 = v3 - p2;

  return v4;
}

/* Computes the "3/2" reciprocal of the two-word divisor d1:d0 (d1 being the
   high word, which must be normalized, i.e., have its most significant bit
   set), as needed by u64arith_divqr_3_2_1(). */
static inline uint64_t
u64arith_reciprocal_for_div_3by2(const uint64_t d0, const uint64_t d1)
{
    uint64_t v = u64arith_reciprocal_for_div(d1);
    uint64_t p = d1*v + d0;
    if (p < d0) {
        v--;
        if (p >= d1) {
            v--;
            p -= d1;
        }
        p -= d1;
    }
    uint64_t t0, t1;
    u64arith_mul_1_1_2(&t0, &t1, v, d0);
    p += t1;
    if (p < t1) {
        v--;
        if (u64arith_ge_2_2(t0, p, d0, d1)) {
            v--;
        }
    }
    return v;
}


/* Divide the two-word integer a2:a1 by the pre-shifted divisor d, given the
   precomputed reciprocal v = u64arith_reciprocal_for_div(d) and the shift
   count s by which the original divisor was normalized. Requires the s most
   significant bits of a2 to be zero. Returns the quotient in *q and the
   (un-shifted) remainder in *r. */
static inline void
u64arith_divqr_2_1_1_recip_precomp (uint64_t *q, uint64_t *r,
                      const uint64_t a1, const uint64_t a2,
                      const uint64_t d, const uint64_t v,
                      const int s)
{
  uint64_t u0 = a1, u1 = a2;
  /* Adjust dividend to match divisor */
  ASSERT_EXPENSIVE(0 <= s && s < 64);
  ASSERT_EXPENSIVE((u1 & ~(UINT64_MAX >> s)) == 0);
  u64arith_shl_2(&u0, &u1, s);

  uint64_t q0, q1;
  u64arith_mul_1_1_2(&q0, &q1, v, u1);
  u64arith_add_2_2(&q0, &q1, u0, u1);
  q1++;
  uint64_t r0 = u0 - q1*d;
  if (r0 > q0) {
    q1--;
    r0 += d;
  }
  if (r0 >= d) {
    q1++;
    r0 -= d;
  }
  r0 >>= s;

  *q = q1;
  *r = r0;
}

/* Integer division of two uint64_t values a2:a1 by a uint64_t divisor b.
   Returns quotient and remainder. Uses the algorithm described in
   Niels Möller and Torbjörn Granlund, Improved Division by Invariant
   Integers, IEEE Transactions on Computers (Volume: 60, Issue: 2,
   Feb. 2011) */

static inline void
u64arith_divqr_2_1_1_recip (uint64_t *q, uint64_t *r,
                      const uint64_t a1, const uint64_t a2,
                      const uint64_t b)
{
  if (a2 == 0) {
    *q = a1 / b;
    *r = a1 % b;
    return;
  }

  const int s = u64arith_clz(b);
  /* Left-adjust divisor */
  uint64_t d = b << s;
  const uint64_t v = u64arith_reciprocal_for_div(d);
  u64arith_divqr_2_1_1_recip_precomp(q, r, a1, a2, d, v, s);

#ifdef WANT_ASSERT_EXPENSIVE
  uint64_t P1, P2;
  u64arith_mul_1_1_2(&P1, &P2, *q, b);
  unsigned char cy = u64arith_add_2_2_cy(&P1, &P2, *r, 0);
  ASSERT_EXPENSIVE(P1 == a1 && P2 == a2 && cy == 0);
#endif
}
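
/* Usage sketch (hypothetical function name and loop): the point of the
   reciprocal-based division is that the divisor-dependent work is done only
   once. When many two-word values are divided by the same b, compute s, d
   and v up front and call the _precomp variant inside the loop. */
static inline void
u64arith_example_divqr_many (uint64_t *q, uint64_t *r,
                             const uint64_t *a1, const uint64_t *a2,
                             const size_t n, const uint64_t b)
{
  const int s = u64arith_clz (b);        /* b must be non-zero */
  const uint64_t d = b << s;             /* normalized divisor */
  const uint64_t v = u64arith_reciprocal_for_div (d);
  for (size_t i = 0; i < n; i++) {
    /* requires a2[i] < b to avoid quotient overflow */
    u64arith_divqr_2_1_1_recip_precomp (&q[i], &r[i], a1[i], a2[i], d, v, s);
  }
}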


/* Integer division of two uint64_t values a2:a1 by a uint64_t divisor b.
   Returns quotient and remainder. */

static inline void
u64arith_divqr_2_1_1 (uint64_t *q, uint64_t *r,
                      const uint64_t a1, const uint64_t a2,
                      const uint64_t b)
{
  ASSERT(a2 < b); /* Or there will be quotient overflow */
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_divqr_2_1_1 (%0, %1, %2, %3, %4)\n" : :
           "X" (*q), "X" (*r), "X" (a1), "X" (a2), "X" (b));
#endif
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "divq %4"
    : "=a" (*q), "=d" (*r)
    : "0" (a1), "1" (a2), "rm" (b)
    : "cc");
#else
  u64arith_divqr_2_1_1_recip(q, r, a1, a2, b);
#endif
}


/* Integer division of two uint64_t values a2:a1 by a uint64_t divisor b.
   Returns only the remainder. */

static inline void
u64arith_divr_2_1_1 (uint64_t *r, uint64_t a1, const uint64_t a2,
                     const uint64_t b)
{
  ASSERT(a2 < b); /* Or there will be quotient overflow */
#ifdef U64ARITH_VERBOSE_ASM
  __asm__ ("# u64arith_divr_2_1_1 (%0, %1, %2, %3)\n" : :
           "X" (*r), "X" (a1), "X" (a2), "X" (b));
#endif
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "divq %3"
    : "+a" (a1), "=d" (*r)
    : "1" (a2), "rm" (b)
    : "cc");
#else
  uint64_t dummy;
  u64arith_divqr_2_1_1(&dummy, r, a1, a2, b);
#endif
}

/* Divide the three-word integer u2:u1:u0 (not yet shifted) by the two-word
   divisor d1:d0, which must already be shifted left by s bits so that the
   most significant bit of d1 is set; v is the reciprocal precomputed by
   u64arith_reciprocal_for_div_3by2(d0, d1). Requires u2:u1 < d1:d0.
   Returns the one-word quotient in *q and the (un-shifted) two-word
   remainder in *R1:*R0. */
static inline void
u64arith_divqr_3_2_1_recip_precomp (
    uint64_t *q, uint64_t *R0, uint64_t *R1,
    uint64_t u0, uint64_t u1, uint64_t u2,
    const uint64_t d0, const uint64_t d1, const uint64_t v,
    const int s)
{
    uint64_t q0, q1, t0, t1, r0, r1;
    ASSERT((d1 & (UINT64_C(1) << 63)) != 0);
    ASSERT(u64arith_lt_2_2(u1, u2, d0, d1));
    ASSERT(0 <= s && s < 64);
    ASSERT((u2 & ~(UINT64_MAX >> s)) == 0);

    u64arith_shld(&u2, u1, u2, s);
    u64arith_shld(&u1, u0, u1, s);
    u0 <<= s;

    u64arith_mul_1_1_2(&q0, &q1, u2, v);
    u64arith_add_2_2(&q0, &q1, u1, u2);
    r1 = u1 - q1*d1;
    u64arith_mul_1_1_2(&t0, &t1, d0, q1);
    r0 = u0;
    u64arith_sub_2_2(&r0, &r1, t0, t1);
    u64arith_sub_2_2(&r0, &r1, d0, d1);
    q1++;
    if (r1 >= q0) {
        q1--;
        u64arith_add_2_2(&r0, &r1, d0, d1);
    }
    if (u64arith_ge_2_2(r0, r1, d0, d1)) {
        q1++;
        u64arith_sub_2_2(&r0, &r1, d0, d1);
    }
    u64arith_shr_2(&r0, &r1, s);
    *R0 = r0;
    *R1 = r1;
    *q = q1;
}

/* Integer division of the three-word integer u2:u1:u0 by the two-word
   divisor d1:d0 (d1 != 0). Returns the quotient in *q and the remainder in
   *R1:*R0. Requires u2:u1 < d1:d0, or the quotient would overflow. Uses the
   3/2 division of Möller and Granlund. */
static inline void
u64arith_divqr_3_2_1 (uint64_t *q, uint64_t *R0, uint64_t *R1,
    uint64_t u0, uint64_t u1, uint64_t u2,
    uint64_t d0, uint64_t d1)
{
    ASSERT(d1 != 0);
    ASSERT(u64arith_lt_2_2(u1, u2, d0, d1));
    int s = u64arith_clz(d1);
    u64arith_shl_2(&d0, &d1, s);
    const uint64_t v = u64arith_reciprocal_for_div_3by2(d0, d1);
    u64arith_divqr_3_2_1_recip_precomp(q, R0, R1, u0, u1, u2, d0, d1, v, s);
}

/* Compute 1/n (mod 2^64). n must be odd. */
static inline uint64_t
u64arith_invmod (const uint64_t n)
{
  uint64_t r;

  ASSERT (n % 2 != 0);

  /* Suggestion from PLM: initializing the inverse to (3*n) XOR 2 gives the
     correct inverse modulo 32, then 3 (for 32 bit) or 4 (for 64 bit)
     Newton iterations are enough. */
  r = (3 * n) ^ 2;
  /* Newton iteration */
  r = 2 * r - (uint32_t) r * (uint32_t) r * (uint32_t) n;
  r = 2 * r - (uint32_t) r * (uint32_t) r * (uint32_t) n;
  r = 2 * r - (uint32_t) r * (uint32_t) r * (uint32_t) n;
#if 0
  r = 2 * r - r * r * n;
#else
  /*
    r*n = k*2^32 + 1

    r' = 2 * r - r * r * n
    r' = 2 * r - r * (k*2^32 + 1)
    r' = 2 * r - r * k*2^32 - r
    r' = r - r * k*2^32
    r' = r - ((r * k) % 2^32) * 2^32
  */
  uint32_t k = (uint32_t)(r * n >> 32);
  k *= (uint32_t) r;
  r = r - ((uint64_t)k << 32);
#endif

  ASSERT_EXPENSIVE(n * r == 1);
  return r;
}
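
/* Usage sketch (hypothetical function name): the REDC code further below
   needs invm = -1/m (mod 2^64); it can be obtained from u64arith_invmod()
   by negating the result. */
static inline uint64_t
u64arith_example_neg_invmod (const uint64_t m)
{
  return -u64arith_invmod (m); /* m must be odd */
}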

/* Compute n/2 (mod m), where m must be odd. */
static inline uint64_t
u64arith_div2mod (const uint64_t n, const uint64_t m)
{
#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
    uint64_t N = n, M = m/2, t = 0;
    ASSERT_EXPENSIVE (m % 2 != 0);
    __asm__ __VOLATILE(
        "shr $1, %1\n\t" /* N /= 2 */
        "cmovc %0, %2\n\t" /* if (cy) {t = M;} */
        "adc %2, %1\n\t" /* N += t + cy */
        : "+&r" (M), "+&r" (N), "+&r" (t)
        : : "cc"
    );
    return N;
#else
  ASSERT_EXPENSIVE (m % 2 != 0);
  if (n % 2 == 0)
    return n / 2;
  else
    return n / 2 + m / 2 + 1;
#endif
}


/* Integer (truncated) square root of n. n must not be zero. */
static inline uint64_t
u64arith_sqrt (const uint64_t n)
{
  /* TODO: use Hensel lifting instead */
  unsigned int i;
  uint64_t xs, c, d, s2;
  const unsigned int l = 63 - (unsigned int) u64arith_clz(n);

  d = n; /* d = n - x^2 */
  xs = 0;
  s2 = (uint64_t)1 << (l - l % 2);

  for (i = l / 2; i != 0; i--)
    {
      /* Here, s2 = 1 << (2*i) */
      /* xs = x << (i + 1), the value of x shifted left i+1 bits */

      c = xs + s2; /* c = (x + 2^i) ^ 2 - x^2 = 2^(i+1) * x + 2^(2*i) */
      xs >>= 1; /* Now xs is shifted only i positions */
      if (d >= c)
        {
          d -= c;
          xs |= s2; /* x |= 1 << i <=> xs |= 1 << (2*i) */
        }
      s2 >>= 2;
    }

  c = xs + s2;
  xs >>= 1;
  if (d >= c)
    xs |= s2;

  return xs;
}
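
/* Usage sketch (hypothetical function name): check the defining property of
   the truncated square root, s^2 <= n < (s+1)^2, guarding the upper test
   against overflow of (s+1)^2 for n close to 2^64. */
static inline int
u64arith_example_sqrt_is_correct (const uint64_t n)
{
  const uint64_t s = u64arith_sqrt (n); /* n must be non-zero */
  uint64_t lo, hi;
  u64arith_sqr_1_2 (&lo, &hi, s + 1); /* (s+1)^2 as a two-word value */
  return s * s <= n && u64arith_gt_2_2 (lo, hi, n, 0);
}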

/* Given r = -rem/p (mod den), we want num/(den*2^k) (mod p) ==
   (ratio + rem/den)/2^k (mod p).
   Using (a variant of) Bezout's identity, we have, for some non-negative
   integer t,
   r * p - t * den = -rem, or
   r * p + rem = t * den,
   thus den | (r * p + rem), and thus
   t = (r * p + rem) / den is an integer and satisfies
   t = rem/den (mod p).

   We have 0 <= r <= den-1 and rem <= den-1, and thus
   0 <= t = p * r/den + rem/den <=
   p (1 - 1/den) + 1 - 1/den =
   p + 1 - (p + 1)/den < p + 1.
   Thus t is almost a properly reduced residue for rem/den (mod p).
   As p fits in uint64_t, so does t, and we can compute t modulo
   2^64; since den is odd, we can multiply by den^{-1} mod 2^64
   to effect division by den.

   Finally we compute (t + ratio)/2^k mod p = num/(den*2^k) mod p.  */

static inline uint64_t
u64arith_post_process_inverse(const uint64_t r, const uint64_t p,
  const uint64_t rem, const uint64_t den_inv,
  const uint64_t ratio, const uint64_t k)
{
  uint64_t t = (r * p + rem) * den_inv;
  const uint64_t ratio_p = (ratio >= p) ? ratio % p : ratio;
  ASSERT_ALWAYS(t <= p); /* Cheap and fairly strong test */
  /* u64arith_addmod_1_1() accepts third operand == p and still produces
     a properly reduced sum mod p. */
  u64arith_addmod_1_1 (&t, ratio_p, t, p);

  ASSERT_EXPENSIVE(t < p);
  ASSERT_EXPENSIVE(k == 0 || p % 2 == 1);
  for (uint64_t j = 0; j < k; j++) {
    t = u64arith_div2mod(t, p);
  }
  return t;
}


/* Computes r = ((phigh * 2^64 + plow) / 2^64) % m
   Requires phigh < m and invm = -1/m (mod 2^64). */

static inline void
u64arith_redc(uint64_t *r, const uint64_t plow,
              const uint64_t phigh, const uint64_t m,
              const uint64_t invm)
{
  uint64_t t = phigh;

  ASSERT_EXPENSIVE (phigh < m);

#if !defined (U64ARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)

  /* TODO: are the register constraints watertight?
     %rax gets modified but putting tlow as an output constraint with "+"
     will keep r from getting allocated in %rax, which is a shame
     since we often want the result in %rax for the next multiply. */

  __asm__ __VOLATILE (
    "imulq %[invm], %%rax\n\t"
    "cmpq $1, %%rax \n\t"                /* if plow != 0, increase t */
    "sbbq $-1, %[t]\n\t"
    "mulq %[m]\n\t"
    "lea (%[t],%%rdx,1), %[r]\n\t"  /* compute (rdx + thigh) mod m */
    "subq %[m], %[t]\n\t"
    "addq %%rdx, %[t]\n\t"
    "cmovcq %[t], %[r]\n\t"
    : [t] "+&r" (t), [r] "=&r" (r[0])
    : [invm] "rm" (invm), [m] "rm" (m), "a" (plow)
    : "%rdx", "cc"
  );
#else
  uint64_t tlow, thigh;
  tlow = plow * invm;
  u64arith_mul_1_1_2 (&tlow, &thigh, tlow, m);
  /* Let w = 2^wordsize. We know (phigh * w + plow) + (thigh * w + tlow)
     == 0 (mod w) so either plow == tlow == 0, or plow != 0 and tlow != 0.
     In the former case we want phigh + thigh, in the latter
     phigh + thigh + 1. Since t = phigh < m, and u64arith_addmod_1_1()
     can handle the case where one operand is equal to m, adding 1 is
     safe */

  t += (plow != 0) ? 1 : 0; /* Does not depend on the mul */

  u64arith_addmod_1_1(r, t, thigh, m);
#endif
  ASSERT_EXPENSIVE (r[0] < m);
}
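
/* Usage sketch (hypothetical function name): a Montgomery multiplication
   modulo an odd m. Operands x, y are assumed to be in Montgomery
   representation and < m; invm is -1/m (mod 2^64), e.g. the negated result
   of u64arith_invmod(m). The result is x*y/2^64 mod m, again < m. */
static inline uint64_t
u64arith_example_mulredc (const uint64_t x, const uint64_t y,
                          const uint64_t m, const uint64_t invm)
{
  uint64_t plow, phigh, r;
  u64arith_mul_1_1_2 (&plow, &phigh, x, y); /* phigh:plow = x*y < m^2 */
  u64arith_redc (&r, plow, phigh, m, invm); /* r = x*y / 2^64 mod m */
  return r;
}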


#ifdef __cplusplus
}
#endif


#endif /* ifndef U64_ARITH_H__ */