/* Some commonly used assembly helper macros for arithmetic on
   unsigned long.
   Defining ULARITH_VERBOSE_ASM puts in the asm output a line for each
   function call that shows what registers/memory locations the operands
   are in.
   Defining ULARITH_NO_ASM avoids asm macros and uses the C fallback code
   where available.
*/

#ifndef UL_ARITH_H__

#define UL_ARITH_H__

#include <limits.h>
#include <stdint.h>   /* uint64_t, used in the C fallback code */
#include <gmp.h>
#include "macros.h"

/* <limits.h> defines LONG_BIT only with _XOPEN_SOURCE defined, but if
   another header (such as <stdio.h>) already included <features.h> before
   _XOPEN_SOURCE was set to 1, future includes of <features.h> are
   short-circuited and _XOPEN_SOURCE is ignored. */

#ifndef LONG_BIT
#ifdef LONG_MAX /* ISO C99 requires LONG_MAX in limits.h */
#if LONG_MAX == 2147483647L
#define LONG_BIT 32
#else
#define LONG_BIT 64
#endif /* if LONG_MAX == 2147483647L */
#elif defined __LONG_MAX__
#if __LONG_MAX__ == 2147483647L
#define LONG_BIT 32
#else
#define LONG_BIT 64
#endif /* if __LONG_MAX__ == 2147483647L */
#else /* elif defined __LONG_MAX__ */
#error Cannot guess LONG_BIT, please define
#endif /* elif defined __LONG_MAX__ else */
#endif /* ifndef LONG_BIT */

/* On 32 bit x86, the general constraint for, e.g., the source operand
   of add is "g". For x86_64, it is "rme", since immediate constants
   must be 32 bit. */
#if defined(__i386__) && defined(__GNUC__)
#define ULARITH_CONSTRAINT_G "g"
#elif defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
#define ULARITH_CONSTRAINT_G "rme"
#endif

#ifdef __cplusplus
extern "C" {
#endif
#ifdef DEAD_CODE /* Unused and untested. Here be dragons. */
/* Increases r if a != 0 */
static inline void
ularith_inc_nz (unsigned long *r, const unsigned long a)
{
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_inc_nz (%0, %1)\n" : : "X" (*r), "X" (a));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "cmpq $1, %1\n\t"
    "sbbq $-1, %0\n\t"
    : "+r" (*r)
    : "rm" (a)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "cmpl $1, %1\n\t"
    "sbbl $-1, %0\n\t"
    : "+r" (*r)
    : "rm" (a)
    : "cc");
#else
  if (a != 0)
    *r += 1;
#endif
}
#endif

/* Let a = a1 + 2^k * a2, b = b1 + 2^k * b2, where k is the number of bits
   in an unsigned long. Return 1 if a > b, and 0 if a <= b. */
static inline int
ularith_gt_2ul_2ul(unsigned long, unsigned long,
                   unsigned long, unsigned long) ATTRIBUTE((const));
static inline int
ularith_gt_2ul_2ul(const unsigned long a1, const unsigned long a2,
                   const unsigned long b1, const unsigned long b2)
{
  return a2 > b2 || (a2 == b2 && a1 > b1);
}

/* Add an unsigned long to two unsigned longs with carry propagation from
   low word (r1) to high word (r2). Any carry out from high word is lost. */

static inline void
ularith_add_ul_2ul (unsigned long *r1, unsigned long *r2,
                    const unsigned long a)
{
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_add_ul_2ul (%0, %1, %2)\n" : :
           "X" (*r1), "X" (*r2), "X" (a));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "addq %2, %0\n\t"
    "adcq $0, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "rme" (a)
    : "cc"); /* TODO: add commutativity and alternative for add to
                memory */
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "addl %2, %0\n\t"
    "adcl $0, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "g" (a)
    : "cc");
#else
  *r1 += a;
  if (*r1 < a)
    (*r2)++;
#endif
}


/* Add two unsigned longs to two unsigned longs with carry propagation from
   low word (r1) to high word (r2). Any carry out from high word is lost. */

static inline void
ularith_add_2ul_2ul (unsigned long *r1, unsigned long *r2,
                     const unsigned long a1, const unsigned long a2)
{
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_add_2ul_2ul (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "addq %2, %0\n\t"
    "adcq %3, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "rme" (a1), "rme" (a2)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "addl %2, %0\n\t"
    "adcl %3, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "g" (a1), "g" (a2)
    : "cc");
#else
  *r1 += a1;
  *r2 += a2 + (*r1 < a1);
#endif
}

/* Add two unsigned longs to two unsigned longs with carry propagation
   from low word (r1) to high word (r2). Returns 1 if there was a carry out
   from high word, otherwise returns 0. */

static inline char
ularith_add_2ul_2ul_cy (unsigned long *r1, unsigned long *r2,
                        const unsigned long a1, const unsigned long a2)
{
  char cy;
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_add_2ul_2ul_cy (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "addq %3, %0\n\t"
    "adcq %4, %1\n\t"
    "setc %2\n"
    : "+&r" (*r1), "+r" (*r2), "=r" (cy)
    : "rme" (a1), "rme" (a2)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "addl %3, %0\n\t"
    "adcl %4, %1\n\t"
    "setc %2\n"
    : "+&r" (*r1), "+r" (*r2), "=q" (cy) /* setc needs a byte register */
    : "g" (a1), "g" (a2)
    : "cc");
#else
  unsigned long u1 = *r1 + a1, u2 = *r2 + a2;
  if (u1 < *r1)
    u2++;
  /* Overflow occurred iff the sum is smaller than one of the summands */
  cy = ularith_gt_2ul_2ul(a1, a2, u1, u2);
  *r1 = u1;
  *r2 = u2;
#endif
  return cy;
}


/* Requires a < m and b <= m, then r == a+b (mod m) and r < m */
MAYBE_UNUSED
static inline void
ularith_addmod_ul_ul (unsigned long *r, const unsigned long a,
                      const unsigned long b, const unsigned long m)
{
  ASSERT_EXPENSIVE (a < m && b <= m);

#if (defined(__i386__) && defined(__GNUC__)) || defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  {
    unsigned long t = a + b, tr = a - m;

    __asm__ __VOLATILE (
      "add %2, %0\n\t"   /* tr += b */
      "cmovnc %1, %0\n\t"  /* if (!cy) tr = t */
      : "+&r" (tr)
      : "rm" (t), ULARITH_CONSTRAINT_G (b)
      : "cc"
    );
    ASSERT_EXPENSIVE (tr == ((a >= m - b) ? (a - (m - b)) : (a + b)));
    r[0] = tr;
  }
#else
  r[0] = (b >= m - a) ? (b - (m - a)) : (a + b);
#endif

  ASSERT_EXPENSIVE (r[0] < m);
}


/* Subtract an unsigned long from two unsigned longs with borrow propagation
   from low word (r1) to high word (r2). Any borrow out from high word is
   lost. */

static inline void
ularith_sub_ul_2ul (unsigned long *r1, unsigned long *r2,
                    const unsigned long a)
{
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_sub_ul_2ul (%0, %1, %2)\n" : :
           "X" (*r1), "X" (*r2), "X" (a));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "subq %2, %0\n\t"
    "sbbq $0, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "rme" (a)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "subl %2, %0\n\t"
    "sbbl $0, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "g" (a)
    : "cc");
#else
  unsigned long u = *r1;
  *r1 -= a;
  if (*r1 > u)
    (*r2)--;
#endif
}


/* Subtract two unsigned longs from two unsigned longs with borrow propagation
   from low word (r1) to high word (r2). Any borrow out from high word is
   lost. */

static inline void
ularith_sub_2ul_2ul (unsigned long *r1, unsigned long *r2,
                     const unsigned long a1, const unsigned long a2)
{
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_sub_2ul_2ul (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "subq %2, %0\n\t"
    "sbbq %3, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "rme" (a1), "rme" (a2)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "subl %2, %0\n\t"
    "sbbl %3, %1\n"
    : "+&r" (*r1), "+r" (*r2)
    : "g" (a1), "g" (a2)
    : "cc");
#else
  unsigned long u = *r1;
  *r1 -= a1;
  *r2 -= a2;
  if (*r1 > u)
    (*r2)--;
#endif
}

/* Subtract two unsigned longs from two unsigned longs with borrow propagation
   from low word (r1) to high word (r2). Returns 1 if there was a borrow out
   from high word, otherwise returns 0. */

static inline char
ularith_sub_2ul_2ul_cy (unsigned long *r1, unsigned long *r2,
                        const unsigned long a1, const unsigned long a2)
{
  char cy;
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_sub_2ul_2ul_cy (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "subq %3, %0\n\t"
    "sbbq %4, %1\n\t"
    "setc %2\n"
    : "+&r" (*r1), "+r" (*r2), "=r" (cy)
    : "rme" (a1), "rme" (a2)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "subl %3, %0\n\t"
    "sbbl %4, %1\n\t"
    "setc %2\n"
    : "+&r" (*r1), "+r" (*r2), "=q" (cy)
    : "g" (a1), "g" (a2)
    : "cc");
#else
  unsigned long u1 = *r1 - a1, u2 = *r2 - a2;
  if (a1 > *r1)
    u2--;
  cy = ularith_gt_2ul_2ul(a1, a2, *r1, *r2);
  *r1 = u1;
  *r2 = u2;
#endif
  return cy;
}

/* Subtract only if result is non-negative */

static inline void
ularith_sub_ul_ul_ge (unsigned long *r, const unsigned long a)
{
  unsigned long t = *r;
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_sub_ul_ul_ge (%0, %1)\n" : :
           "X" (*r), "X" (a));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "subq %2, %0\n\t" /* r -= a */
    "cmovc %1, %0\n\t" /* If there's a borrow, restore r from t */
    : "+&r" (*r)
    : "r" (t), "rme" (a)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "subl %2, %0\n\t"
    "cmovc %1, %0\n\t"
    : "+&r" (*r)
    : "r" (t), "g" (a)
    : "cc");
#else
  t -= a;
  if (*r >= a)
    *r = t;
#endif
}


static inline void
ularith_sub_2ul_2ul_ge (unsigned long *r1, unsigned long *r2,
                        const unsigned long a1, const unsigned long a2)
{
  unsigned long t1 = *r1, t2 = *r2;
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_sub_2ul_2ul_ge (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a1), "X" (a2));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "subq %4, %0\n\t" /* r1 -= a1 */
    "sbbq %5, %1\n\t" /* r2 -= a2 + cy */
    "cmovc %2, %0\n\t" /* If there's a borrow, restore r1 from t1 */
    "cmovc %3, %1\n\t" /* and r2 from t2 */
    : "+&r" (*r1), "+&r" (*r2)
    : "r" (t1), "r" (t2), "rme" (a1), "rme" (a2)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "subl %4, %0\n\t"
    "sbbl %5, %1\n\t"
    "cmovc %2, %0\n\t"
    "cmovc %3, %1\n\t"
    : "+&r" (*r1), "+&r" (*r2)
    : "r" (t1), "r" (t2), "g" (a1), "g" (a2)
    : "cc");
#else
  if (!ularith_gt_2ul_2ul(a1, a2, *r1, *r2))
    {
      *r1 = t1 - a1;
      *r2 = t2 - a2 - (a1 > t1);
    }
#endif
}


MAYBE_UNUSED
static inline void
ularith_submod_ul_ul (unsigned long *r, const unsigned long a,
                      const unsigned long b, const unsigned long m)
{
  ASSERT_EXPENSIVE (a < m && b < m);
#if (defined(__i386__) && defined(__GNUC__)) || defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  {
    unsigned long tr, t = a;
    __asm__ __VOLATILE (
      "sub %2, %1\n\t"  /* t -= b ( = a - b) */
      "lea (%1,%3,1), %0\n\t" /* tr = t + m ( = a - b + m) */
      "cmovnc %1, %0\n\t" /* if (a >= b) tr = t */
      : "=&r" (tr), "+&r" (t)
      : ULARITH_CONSTRAINT_G (b), "r" (m)
      : "cc"
    );
    r[0] = tr;
  }
#elif 1
  /* Seems to be faster than the one below */
  {
    unsigned long t = 0UL, tr;
    if ((tr = a - b) > a)
      t = m;
    r[0] = tr + t;
  }
#else
  r[0] = (a < b) ? (a - b + m) : (a - b);
#endif

  ASSERT_EXPENSIVE (r[0] < m);
}


/* Multiply the two unsigned longs "a" and "b" and put the result in
   r2:r1 (r2 being the high word) */

static inline void
ularith_mul_ul_ul_2ul (unsigned long *r1, unsigned long *r2,
                       const unsigned long a, const unsigned long b)
{
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_mul_ul_ul_2ul (%0, %1, %2, %3)\n" : :
           "X" (*r1), "X" (*r2), "X" (a), "X" (b));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "mulq %3"
    : "=a" (*r1), "=d" (*r2)
    : "%0" (a), "rm" (b)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "mull %3"
    : "=a" (*r1), "=d" (*r2)
    : "%0" (a), "rm" (b)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_ARM_INLINE_ASM)
  __asm__ __VOLATILE(
    "umull   %[r1], %[r2], %[a], %[b]\n\t"
    : [r1] "=&r" (*r1), [r2] "=&r" (*r2)
    : [a] "r" (a), [b] "r" (b)
  );
#elif LONG_BIT == 32
  uint64_t r = (uint64_t) a * b;
  *r1 = (unsigned long) r;
  *r2 = (unsigned long) (r >> 32);
#elif LONG_BIT == 64 && defined(HAVE_INT128)
  /* this code is useful for example on ARM processors (Raspberry Pi) */
  unsigned __int128 r = (unsigned __int128) a * b;
  *r1 = (unsigned long) r;
  *r2 = (unsigned long) (r >> 64);
#else
  const int half = LONG_BIT / 2;
  const unsigned long mask = (1UL << half) - 1UL;
  unsigned long t1, t2, p1, p2;

  t1 = (a & mask) * (b & mask);
  t2 = 0UL;
  p1 = (a >> half) * (b & mask);
  p2 = p1 >> half;
  p1 = (p1 & mask) << half;
  ularith_add_2ul_2ul (&t1, &t2, p1, p2);
  p1 = (a & mask) * (b >> half);
  p2 = p1 >> half;
  p1 = (p1 & mask) << half;
  ularith_add_2ul_2ul (&t1, &t2, p1, p2);
  t2 += (a >> half) * (b >> half);
  *r1 = t1;
  *r2 = t2;
#endif
}
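
/* Illustrative sketch, not part of the original interface (the name
   ularith_example_mul_2ul_ul_3ul is made up here): multiply the two-word
   value a2:a1 by the one-word value b, producing the three-word result
   r3:r2:r1. It shows one way the single-word primitives above are meant to
   compose; carry handling is left to ularith_add_ul_2ul(). */
MAYBE_UNUSED
static inline void
ularith_example_mul_2ul_ul_3ul (unsigned long *r1, unsigned long *r2,
                                unsigned long *r3, const unsigned long a1,
                                const unsigned long a2, const unsigned long b)
{
  unsigned long p1, p2;
  ularith_mul_ul_ul_2ul (r1, r2, a1, b);   /* r2:r1 = a1 * b */
  ularith_mul_ul_ul_2ul (&p1, &p2, a2, b); /* p2:p1 = a2 * b */
  *r3 = p2;
  ularith_add_ul_2ul (r2, r3, p1); /* add p1 at weight 2^LONG_BIT, with carry */
}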


/* Square the unsigned long "a" and put the result in r2:r1
   (r2 being the high word) */

static inline void
ularith_sqr_ul_2ul (unsigned long *r1, unsigned long *r2,
                    const unsigned long a)
{
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_sqr_ul_2ul (%0, %1, %2)\n" : :
           "X" (*r1), "X" (*r2), "X" (a));
#endif
#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "mulq %%rax"
    : "=a" (*r1), "=d" (*r2)
    : "0" (a)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "mull %%eax"
    : "=a" (*r1), "=d" (*r2)
    : "0" (a)
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_ARM_INLINE_ASM)
  __asm__ __VOLATILE(
    "umull   %[r1], %[r2], %[a], %[a]\n\t"
    : [r1] "=&r" (*r1), [r2] "=&r" (*r2)
    : [a] "r" (a)
  );
#elif LONG_BIT == 32
  uint64_t r = (uint64_t) a * a;
  *r1 = r;
  *r2 = r >> 32;
#elif LONG_BIT == 64 && defined(HAVE_INT128)
  /* this code is useful for example on ARM processors (Raspberry Pi) */
  /* Unfortunately, gcc does not seem to recognize that the two input
   * operands to MUL are identical and can therefore go in %rax. This
   * increases register pressure and leads to less efficient code than
   * the explicit __asm__ statement above. */
  unsigned __int128 r = (unsigned __int128) a * a;
  *r1 = r;
  *r2 = r >> 64;
#else
  const int half = LONG_BIT / 2;
  const unsigned long mask = (1UL << half) - 1UL;
  unsigned long t1, t2, p1, p2;

  t1 = (a & mask) * (a & mask);
  t2 = 0UL;
  p1 = (a >> half) * (a & mask);
  p2 = p1 >> half;
  p1 = (p1 & mask) << half;
  /* The cross product a_hi * a_lo occurs twice in a^2, so add it twice */
  ularith_add_2ul_2ul (&t1, &t2, p1, p2);
  ularith_add_2ul_2ul (&t1, &t2, p1, p2);
  t2 += (a >> half) * (a >> half);
  *r1 = t1;
  *r2 = t2;
#endif
}


/* Integer division of a two ulong value a2:a1 by a ulong divisor. Returns
   quotient and remainder. */

static inline void
ularith_div_2ul_ul_ul (unsigned long *q, unsigned long *r,
                       const unsigned long a1, const unsigned long a2,
                       const unsigned long b)
{
  ASSERT(a2 < b); /* Or there will be quotient overflow */
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_div_2ul_ul_ul (%0, %1, %2, %3, %4)\n" : :
           "X" (*q), "X" (*r), "X" (a1), "X" (a2), "X" (b));
#endif
#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
  __asm__ __VOLATILE (
    "divq %4"
    : "=a" (*q), "=d" (*r)
    : "0" (a1), "1" (a2), "rm" (b)
    : "cc");
#elif defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "divl %4"
    : "=a" (*q), "=d" (*r)
    : "0" (a1), "1" (a2), "rm" (b)
    : "cc");
#else
  mp_limb_t A[2] = {a1, a2};
  ASSERT(sizeof(unsigned long) == sizeof(mp_limb_t));
  r[0] = mpn_divmod_1 (A, A, 2, b);
  q[0] = A[0];
#endif
}


/* Integer division of a two ulong value a2:a1 by a ulong divisor. Returns
   only the remainder. */

static inline void
ularith_div_2ul_ul_ul_r (unsigned long *r, unsigned long a1,
                         const unsigned long a2, const unsigned long b)
{
  ASSERT(a2 < b); /* Or there will be quotient overflow */
#ifdef ULARITH_VERBOSE_ASM
  __asm__ ("# ularith_div_2ul_ul_ul_r (%0, %1, %2, %3)\n" : :
           "X" (*r), "X" (a1), "X" (a2), "X" (b));
#endif
#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
  __asm__ __VOLATILE (
    "divq %3"
    : "+a" (a1), "=d" (*r)
    : "1" (a2), "rm" (b)
    : "cc");
#elif defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "divl %3"
    : "+a" (a1), "=d" (*r)
    : "1" (a2), "rm" (b)
    : "cc");
#else
  mp_limb_t A[2] = {a1, a2};
  ASSERT(sizeof(unsigned long) == sizeof(mp_limb_t));
  r[0] = mpn_divmod_1 (A, A, 2, b);
#endif
}


/* Set *r to lo shifted right by i bits, filling in the low bits from hi into
   the high bits of *r. I.e., *r = (hi * 2^LONG_BIT + lo) / 2^i.
   Assumes 0 <= i < LONG_BIT. */
MAYBE_UNUSED
static inline void
ularith_shrd (unsigned long *r, const unsigned long hi, const unsigned long lo,
              const unsigned char i)
{
  ASSERT_EXPENSIVE (i < LONG_BIT);
#ifdef ULARITH_VERBOSE_ASM
/* Disable the "uninitialized" warning here, as *r is only written to and
   does not need to be initialized, but we need to write (*r) here so the
   "X" constraint can be resolved even when r does not have an address, e.g.,
   when it is passed around in a register. It seems that "X" is assumed by
   gcc as possibly referring to an input, and since "X" matches anything,
   that's probably a necessary assumption to make. */
#if GNUC_VERSION_ATLEAST(4,4,0)
#if GNUC_VERSION_ATLEAST(4,6,0)
#pragma GCC diagnostic push
#endif
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
  __asm__ ("# ularith_shrd (*r=%0, hi=%1, lo=%2, i=%3)\n" : :
           "X" (*r), "X" (hi), "X" (lo), "X" (i));
#if GNUC_VERSION_ATLEAST(4,6,0)
#pragma GCC diagnostic pop
#endif
#endif

#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "shrdq %3, %1, %0\n"
    : "=rm" (*r)
    : "r" (hi), "0" (lo), "cJ" (i) /* i can be in %cl or a literal constant < 64 */
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "shrdl %3, %1, %0\n"
    : "=rm" (*r)
    : "r" (hi), "0" (lo), "cI" (i) /* i can be in %cl or a literal constant < 32 */
    : "cc");
#else
  if (i > 0) /* shl by LONG_BIT is no-op on x86! */
    *r = (lo >> i) | (hi << (LONG_BIT - i));
  else
    *r = lo;
#endif
}

/* Set *r to hi shifted left by i bits, filling in the high bits from lo into
   the low bits of *r. I.e., *r = (hi + lo*2^-LONG_BIT) * 2^i.
   Assumes 0 <= i < LONG_BIT. */
MAYBE_UNUSED
static inline void
ularith_shld (unsigned long *r, const unsigned long lo, const unsigned long hi,
              const unsigned char i)
{
  ASSERT_EXPENSIVE (i < LONG_BIT);
#ifdef ULARITH_VERBOSE_ASM
#if GNUC_VERSION_ATLEAST(4,4,0)
#if GNUC_VERSION_ATLEAST(4,6,0)
#pragma GCC diagnostic push
#endif
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
  __asm__ ("# ularith_shld (*r=%0, lo=%1, hi=%2, i=%3)\n" : :
           "X" (*r), "X" (lo), "X" (hi), "X" (i));
#if GNUC_VERSION_ATLEAST(4,6,0)
#pragma GCC diagnostic pop
#endif
#endif

#if !defined (ULARITH_NO_ASM) && defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  __asm__ __VOLATILE (
    "shldq %3, %1, %0\n"
    : "=rm" (*r)
    : "r" (lo), "0" (hi), "cJ" (i) /* i can be in %cl or a literal constant < 64 */
    : "cc");
#elif !defined (ULARITH_NO_ASM) && defined(__i386__) && defined(__GNUC__)
  __asm__ __VOLATILE (
    "shldl %3, %1, %0\n"
    : "=rm" (*r)
    : "r" (lo), "0" (hi), "cI" (i) /* i can be in %cl or a literal constant < 32 */
    : "cc");
#else
  if (i > 0) /* shr by LONG_BIT is no-op on x86! */
    *r = (hi << i) | (lo >> (LONG_BIT - i));
  else
    *r = hi;
#endif
}
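
/* Illustrative sketch, not part of the original interface (the name
   ularith_example_shl_2ul is made up here): shift the two-word value r2:r1
   left by i bits, 0 <= i < LONG_BIT, using ularith_shld() for the high word
   and a plain shift for the low word. */
MAYBE_UNUSED
static inline void
ularith_example_shl_2ul (unsigned long *r1, unsigned long *r2,
                         const unsigned char i)
{
  ularith_shld (r2, *r1, *r2, i); /* r2 = (r2 << i) | (r1 >> (LONG_BIT-i)) */
  *r1 <<= i; /* the bits shifted out were already copied into r2 above */
}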

/* Returns number of trailing zeros in a. a must not be zero */
MAYBE_UNUSED
static inline int
ularith_ctz (const unsigned long a)
{
#if !defined (ULARITH_NO_ASM) && defined(__GNUC__) && \
    (__GNUC__ >= 4 || __GNUC__ >= 3 && __GNUC_MINOR__ >= 4)
  ASSERT_EXPENSIVE (a != 0UL);
  return __builtin_ctzl(a);
#else
  static const unsigned char trailing_zeros[256] =
    {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,
     5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0};
  char lsh, t = 0;
  unsigned long y = a;
  ASSERT_EXPENSIVE (a != 0UL);
  do {
    lsh = trailing_zeros [(unsigned char) y];
    y >>= lsh;
    t += lsh;
  } while (lsh == 8);
  return (int) t;
#endif
}

/* Returns number of leading zeros in a. a must not be zero */
MAYBE_UNUSED
static inline int
ularith_clz (const unsigned long a)
{
#if !defined (ULARITH_NO_ASM) && defined(__GNUC__) && \
    (__GNUC__ >= 4 || __GNUC__ >= 3 && __GNUC_MINOR__ >= 4)
  ASSERT_EXPENSIVE (a != 0UL);
  return __builtin_clzl(a);
#else
  unsigned long t = 1UL << (LONG_BIT - 1);
  int i;
  ASSERT_EXPENSIVE (a != 0UL);
  for (i = 0; (a & t) == 0UL; i++)
    t >>= 1;
  return i;
#endif
}


/* Compute 1/n (mod 2^wordsize) */
MAYBE_UNUSED
static inline unsigned long
ularith_invmod (const unsigned long n)
{
  /* T[i] = 1/(2i+1) mod 2^8 */
  static unsigned char T[128] = {
    1, 171, 205, 183, 57, 163, 197, 239, 241, 27, 61, 167, 41, 19, 53, 223,
    225, 139, 173, 151, 25, 131, 165, 207, 209, 251, 29, 135, 9, 243, 21, 191,
    193, 107, 141, 119, 249, 99, 133, 175, 177, 219, 253, 103, 233, 211, 245, 159,
    161, 75, 109, 87, 217, 67, 101, 143, 145, 187, 221, 71, 201, 179, 213, 127,
    129, 43, 77, 55, 185, 35, 69, 111, 113, 155, 189, 39, 169, 147, 181, 95,
    97, 11, 45, 23, 153, 3, 37, 79, 81, 123, 157, 7, 137, 115, 149, 63,
    65, 235, 13, 247, 121, 227, 5, 47, 49, 91, 125, 231, 105, 83, 117, 31,
    33, 203, 237, 215, 89, 195, 229, 15, 17, 59, 93, 199, 73, 51, 85, 255};
  unsigned long r;

  ASSERT (n % 2UL != 0UL);

  r = T[(n & 255)>>1];
  /* Perform 2 Newton iterations for LONG_BIT=32, 3 for LONG_BIT=64 */
  r = 2UL * r - r * r * n;
  r = 2UL * r - r * r * n;
#if LONG_BIT == 64
  r = 2UL * r - r * r * n;
#endif

  return r;
}
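
/* A minimal sanity-check sketch for ularith_invmod(), relying only on the
   property stated above: each Newton step r' = 2r - r^2*n doubles the number
   of correct low bits of 1/n, so after the iterations r*n == 1
   (mod 2^LONG_BIT). The function name is made up for this example and is not
   part of the original interface. */
MAYBE_UNUSED
static inline int
ularith_example_invmod_ok (const unsigned long n)
{
  /* n must be odd, as required by ularith_invmod() */
  return ularith_invmod (n) * n == 1UL; /* multiplication wraps mod 2^LONG_BIT */
}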

/* Compute n/2 (mod m), where m must be odd. */
static inline unsigned long
ularith_div2mod (const unsigned long n, const unsigned long m)
{
#if (defined(__i386__) && defined(__GNUC__)) || defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  unsigned long N = n, M = m/2, t = 0;
  ASSERT_EXPENSIVE (m % 2 != 0);
  __asm__ __VOLATILE(
    "shr $1, %1\n\t" /* N /= 2 */
    "cmovc %0, %2\n\t" /* if (cy) {t = M;} */
    "adc %2, %1\n\t" /* N += t + cy */
    : "+&r" (M), "+&r" (N), "+&r" (t)
    : : "cc"
  );
  return N;
#else
  ASSERT_EXPENSIVE (m % 2UL != 0UL);
  if (n % 2UL == 0UL)
    return n / 2UL;
  else
    return n / 2UL + m / 2UL + 1UL;
#endif
}
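
/* A minimal sanity-check sketch for ularith_div2mod(), assuming m is odd and
   n < m: doubling the result modulo m must give back n. The function name is
   made up for this example and is not part of the original interface. */
MAYBE_UNUSED
static inline int
ularith_example_div2mod_ok (const unsigned long n, const unsigned long m)
{
  unsigned long t = ularith_div2mod (n, m); /* t = n/2 mod m, with t < m */
  ularith_addmod_ul_ul (&t, t, t, m);       /* t = 2 * (n/2) mod m */
  return t == n;
}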


/* Integer (truncated) square root of n */
static inline unsigned long
ularith_sqrt (const unsigned long n)
{
  unsigned int i;
  unsigned long xs, c, d, s2;
  const unsigned int l = (unsigned int)sizeof (unsigned long) * 8 - 1
                       - (unsigned int)__builtin_clzl(n);

  d = n; /* d = n - x^2 */
  xs = 0UL;
  s2 = 1UL << (l - l % 2);

  for (i = l / 2; i != 0; i--)
    {
      /* Here, s2 = 1 << (2*i) */
      /* xs = x << (i + 1), the value of x shifted left i+1 bits */

      c = xs + s2; /* c = (x + 2^i) ^ 2 - x^2 = 2^(i+1) * x + 2^(2*i) */
      xs >>= 1; /* Now xs is shifted only i positions */
      if (d >= c)
        {
          d -= c;
          xs |= s2; /* x |= 1UL << i <=> xs |= 1UL << (2*i) */
        }
      s2 >>= 2;
    }

  c = xs + s2;
  xs >>= 1;
  if (d >= c)
    xs |= s2;

  return xs;
}
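
/* A minimal sanity-check sketch for ularith_sqrt(), assuming n != 0 (the
   function above uses __builtin_clzl, which requires a nonzero argument):
   the truncated square root s must satisfy s^2 <= n < (s+1)^2. The square of
   s+1 is taken with ularith_sqr_ul_2ul() so the comparison cannot overflow.
   The function name is made up for this example. */
MAYBE_UNUSED
static inline int
ularith_example_sqrt_ok (const unsigned long n)
{
  const unsigned long s = ularith_sqrt (n);
  unsigned long lo, hi;
  ularith_sqr_ul_2ul (&lo, &hi, s + 1UL); /* hi:lo = (s+1)^2, may need two words */
  return s * s <= n && (hi != 0UL || lo > n); /* s^2 <= n itself fits one word */
}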

/* Given r = -rem/p (mod den), we want num/(den*2^k) (mod p) ==
   (ratio + rem/den)/2^k (mod p).
   Using (a variant of) Bezout's identity, we have, for some non-negative
   integer t,
   r * p - t * den = -rem, or
   r * p + rem = t * den,
   thus den | (r * p + rem), and thus
   t = (r * p + rem) / den is an integer and satisfies
   t = rem/den (mod p).

   We have 0 <= r <= den-1 and rem <= den-1, and thus
   0 <= t = p * r/den + rem/den <=
   p (1 - 1/den) + 1 - 1/den =
   p + 1 - (p + 1)/den < p + 1.
   Thus t is almost a properly reduced residue for rem/den (mod p).
   As p fits in unsigned long, so does t, and we can compute t modulo
   2^LONG_BIT; since den is odd, we can multiply by den^{-1} mod 2^LONG_BIT
   to effect division by den.

   Finally we compute (t + ratio)/2^k mod p = num/(den*2^k) mod p.  */

static inline unsigned long
ularith_post_process_inverse(const unsigned long r, const unsigned long p,
  const unsigned long rem, const unsigned long den_inv,
  const unsigned long ratio, const unsigned long k)
{
  unsigned long t = (r * p + rem) * den_inv;
  const unsigned long ratio_p = (ratio >= p) ? ratio % p : ratio;
  ASSERT_ALWAYS(t <= p); /* Cheap and fairly strong test */
  /* ularith_addmod_ul_ul() accepts third operand == p and still produces
     a properly reduced sum mod p. */
  ularith_addmod_ul_ul (&t, ratio_p, t, p);

  ASSERT_EXPENSIVE(t < p);
  ASSERT_EXPENSIVE(k == 0 || p % 2 == 1);
  for (unsigned long j = 0; j < k; j++) {
    t = ularith_div2mod(t, p);
  }
  return t;
}


/* Computes r = ((phigh * 2^LONG_BIT + plow) / 2^LONG_BIT) % m
   Requires phigh < m and invm = -1/m (mod 2^LONG_BIT). */

static inline void
ularith_redc(unsigned long *r, const unsigned long plow,
             const unsigned long phigh, const unsigned long m,
             const unsigned long invm)
{
  unsigned long t = phigh;
#ifndef HAVE_GCC_STYLE_AMD64_INLINE_ASM
  unsigned long tlow, thigh;
#endif

  ASSERT_EXPENSIVE (phigh < m);

#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM

  /* TODO: are the register constraints watertight?
     %rax gets modified but putting tlow as an output constraint with "+"
     will keep r from getting allocated in %rax, which is a shame
     since we often want the result in %rax for the next multiply. */

  __asm__ __VOLATILE (
    "imulq %[invm], %%rax\n\t"
    "cmpq $1, %%rax \n\t"                /* if plow != 0, increase t */
    "sbbq $-1, %[t]\n\t"
    "mulq %[m]\n\t"
    "lea (%[t],%%rdx,1), %[r]\n\t"  /* compute (rdx + thigh) mod m */
    "subq %[m], %[t]\n\t"
    "addq %%rdx, %[t]\n\t"
    "cmovcq %[t], %[r]\n\t"
    : [t] "+&r" (t), [r] "=&r" (r[0])
    : [invm] "rm" (invm), [m] "rm" (m), "a" (plow)
    : "%rdx", "cc"
  );
#else
  tlow = plow * invm;
  ularith_mul_ul_ul_2ul (&tlow, &thigh, tlow, m);
  /* Let w = 2^wordsize. We know (phigh * w + plow) + (thigh * w + tlow)
     == 0 (mod w) so either plow == tlow == 0, or plow != 0 and tlow != 0.
     In the former case we want phigh + thigh, in the latter
     phigh + thigh + 1. Since t = phigh < m, and modredcul_add can handle the
     case where the second operand is equal to m, adding 1 is safe */

  t += (plow != 0UL) ? 1UL : 0UL; /* Does not depend on the mul */

  ularith_addmod_ul_ul(r, t, thigh, m);
#endif
  ASSERT_EXPENSIVE (r[0] < m);
}
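
/* Illustrative sketch of how the pieces above combine into a Montgomery
   multiplication; the helper name is made up here and it is not part of the
   original interface. For a, b < m, m odd, and invm == -ularith_invmod(m)
   (i.e., invm = -1/m mod 2^LONG_BIT), this returns a*b/2^LONG_BIT mod m.
   Since a*b < m^2 <= m*2^LONG_BIT, the high product word is < m, as
   ularith_redc() requires. */
MAYBE_UNUSED
static inline unsigned long
ularith_example_mulredc (const unsigned long a, const unsigned long b,
                         const unsigned long m, const unsigned long invm)
{
  unsigned long plow, phigh, r;
  ularith_mul_ul_ul_2ul (&plow, &phigh, a, b); /* phigh:plow = a*b */
  ularith_redc (&r, plow, phigh, m, invm);     /* r = a*b / 2^LONG_BIT mod m */
  return r;
}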


#ifdef __cplusplus
}
#endif


#endif /* ifndef UL_ARITH_H__ */