/* Some functions for modular arithmetic with residues and modulus in
   unsigned long variables. The modulus can be up to 2 unsigned longs
   in size with the two most significant bits zero (meaning 62 bits if
   unsigned long has 32 bits, or 126 bits if unsigned long has 64 bits).
   Moduli must be odd and have the upper word non-zero. Residues are stored
   in Montgomery form, reduction after multiplication is done with REDC.
   Due to inlining, this file must be included in the caller's source code
   with #include */

/* Naming convention: all functions start with modredc2ul2, for
   MODulus REDC 2 Unsigned Longs minus 2 bits, followed by underscore,
   functionality of function (add, mul, etc), and possibly underscore and
   specification of what argument types the function takes (_ul, etc). */
14 
15 #ifndef MODREDC_2UL2_H
16 #define MODREDC_2UL2_H
17 
18 /**********************************************************************/
19 #include "cado_config.h"  // for HAVE_GCC_STYLE_AMD64_INLINE_ASM
20 #if defined(MODTRACE)
21 #include <stdio.h>
22 #endif
23 #include <limits.h>
24 #include <stdint.h>
25 #include <stdlib.h>       // for size_t, abort
26 #include "macros.h"
27 #include "ularith.h"
28 
29 // #define MODTRACE 1
30 
31 /*********************************************************************/
32 /* Helper macros */
33 
34 /* A macro for function renaming. All functions here start with
35    modredc2ul2_ */
36 #define MODREDC2UL2_RENAME(x) modredc2ul2_##x
37 
38 #define MODREDC2UL2_SIZE 2
39 #define MODREDC2UL2_MINBITS LONG_BIT
40 #define MODREDC2UL2_MAXBITS (2 * LONG_BIT - 2)
41 
typedef unsigned long residueredc2ul2_t[MODREDC2UL2_SIZE];
typedef unsigned long modintredc2ul2_t[MODREDC2UL2_SIZE];
struct __modulusredc2ul2_t {
  modintredc2ul2_t m;     /* The modulus; odd, high word non-zero,
                             m < 2^(2*LONG_BIT - 2) (see initmod_int) */
  residueredc2ul2_t one;  /* 1 in Montgomery form, i.e.
                             2^(2*LONG_BIT) mod m (set up in initmod_int) */
  unsigned long invm;     /* -1/m (mod 2^LONG_BIT), presumably from
                             ularith_invmod — used by the REDC steps */
};
typedef struct __modulusredc2ul2_t modulusredc2ul2_t[1];
50 
51 
52 /* ==================== Functions used internally ==================== */
53 
54 static inline int
55 modredc2ul2_intlt (const modintredc2ul2_t a, const modintredc2ul2_t b);
56 
57 static inline void
58 modredc2ul2_add (residueredc2ul2_t r, const residueredc2ul2_t a,
59 		 const residueredc2ul2_t b, const modulusredc2ul2_t m);
60 static inline void
61 modredc2ul2_get_int (modintredc2ul2_t r, const residueredc2ul2_t s,
62 		     const modulusredc2ul2_t m MAYBE_UNUSED);
63 static inline void
64 modredc2ul2_intset (modintredc2ul2_t r, const modintredc2ul2_t s);
65 
66 MAYBE_UNUSED
67 static inline void
modredc2ul2_tomontgomery(residueredc2ul2_t r,const residueredc2ul2_t s,const modulusredc2ul2_t m)68 modredc2ul2_tomontgomery (residueredc2ul2_t r, const residueredc2ul2_t s,
69 			  const modulusredc2ul2_t m)
70 {
71   int i;
72 
73   modredc2ul2_intset (r, s);
74   /* TODO FIXME: ridiculously slow */
75   for (i = 0; i < (MODREDC2UL2_SIZE * LONG_BIT); i++)
76     modredc2ul2_add (r, r, r, m);
77 }
78 
79 
/* Do a one-word REDC, i.e., r == s / w (mod m), w = 2^LONG_BIT.
   If m > w, r < 2m. If s<m, then r<m */
MAYBE_UNUSED
static inline void
modredc2ul2_redc1 (residueredc2ul2_t r, const residueredc2ul2_t s,
		   const modulusredc2ul2_t m)
{
  unsigned long t[4], k;

  /* invm == -1/m[0] (mod w), so k*m[0] == -s[0] (mod w) and
     s[0] + k*m[0] is divisible by w. */
  k = s[0] * m[0].invm;
  ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), k, m[0].m[0]);
  /* t[1]:t[0] = k*m[0]. Adding s[0] makes the low word exactly 0 (mod w)
     by choice of k; the addition carries into t[1] iff s[0] != 0. */
  if (s[0] != 0UL)
    t[1]++;
  t[2] = 0;
  ularith_add_ul_2ul (&(t[1]), &(t[2]), s[1]); /* t[2] <= 1 */
  ularith_mul_ul_ul_2ul (&(t[0]), &(t[3]), k, m[0].m[1]); /* t[3] < 2^w-1 */
  ularith_add_2ul_2ul (&(t[1]), &(t[2]), t[0], t[3]);     /* t[2] < 2^w */

  /* r = (k*m + s) / w, k <= w-1. If s < m, then r < m */
  r[0] = t[1];
  r[1] = t[2];
}
102 
/* Converts s out of Montgomery form by dividing by 2^(2*LONG_BIT).
   Requires s < m. */
MAYBE_UNUSED
static inline void
modredc2ul2_frommontgomery (residueredc2ul2_t r, const residueredc2ul2_t s,
			    const modulusredc2ul2_t m)
{
  /* Do two REDC steps: each divides by 2^LONG_BIT (mod m), so two of
     them remove the full Montgomery factor. Since s < m, each step's
     result stays < m (see modredc2ul2_redc1). */
  modredc2ul2_redc1 (r, s, m);
  modredc2ul2_redc1 (r, r, m);
}
114 
115 
116 #ifdef WANT_ASSERT_EXPENSIVE
117 #if defined(__x86_64__)
118 #define ABORT_IF_CY "jnc 1f\n\tlea _GLOBAL_OFFSET_TABLE_(%%rip), %%rbx\n\tcall abort@plt\n1:\n\t"
119 #elif defined(__i386__)
120 #define ABORT_IF_CY "jnc 1f\n\tcall abort\n1:\n\t"
121 #endif
122 #else
123 #define ABORT_IF_CY
124 #endif
125 
/* r = a * b / 2^LONG_BIT (mod m), with a a 2-word residue < m and b a
   single word. One REDC step folds in the Montgomery factor of b, so
   the result is the correct Montgomery product when b is a plain word.
   Requires a < m; ensures r < m. */
MAYBE_UNUSED
static inline void
_modredc2ul2_mul_ul (residueredc2ul2_t r, const residueredc2ul2_t a,
                     const unsigned long b, const modulusredc2ul2_t m)
{
  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
#if defined(MODTRACE)
  printf ("((%lu * 2^%d + %lu) * %lu / 2^%d) %% (%lu * 2^%d + %lu)",
          a[1], LONG_BIT, a[0], b, LONG_BIT, m[0].m[1], LONG_BIT, m[0].m[0]);
#endif

#if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  unsigned long u = m[0].invm, a0 = a[0];
  __asm__ __VOLATILE (
    /* Product of low words */
    "mulq %[b]\n\t"          /* rdx:rax = a0*b <= (2^64-1)^2 */
    "movq %%rdx, %[t0]\n\t"  /* t0:rax = a0*b <= (2^64-1)^2 */
    /* Compute u such that a0*b + u*m == 0 (mod 2^64), compute u*m, add to t0:rax */
    "imulq %[u], %%rax\n\t"
    "movq %%rax, %[u]\n\t"  /* u <= 2^64-1 */
    "xorl %k[t1], %k[t1]\n\t"
    "mulq %[m0]\n\t"         /* rdx:rax = u*m0 <= (2^64-1)^2 */
    "negq %%rax\n\t"         /* if low word != 0, carry to high word */
    "movq %[u], %%rax\n\t"  /* rax = u, independent, goes in pipe 0 */
    "adcq %%rdx, %[t0]\n\t"
    "setc %b[t1]\n\t"        /* t1:t0 = (a0*b + u*m0) / 2^64 <= 2*2^64 - 4 */
    "mulq %[m1]\n\t"         /* rdx:rax = u*m1 */
    "addq %%rax, %[t0]\n\t"
    "movq %[a1], %%rax\n\t"  /* independent, goes in pipe 0 */
    "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0*b+u*m)/2^64 <= 2^126 - 2^62 */
    /* Free slot in pipe 2 here */
    ABORT_IF_CY

    /* Product of low and high word */
    "mulq %[b]\n\t"          /* rdx:rax = a1*b <= (2^63-2^32-1)*(2^64-1) */
    "addq %%rax, %[t0]\n\t"
    "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = ((a1*2^64 + a0)*b + u*m) / 2^64
                                     <= ((2^126-1)*(2^64-1) + (2^64-1)*(2^126-1)) / 2^64
                                      < 2^127 - 2^63 - 1, thus no carry */
    ABORT_IF_CY
                             /* t1:t0 = ((a1*2^64 + a0)*b + u*m) / 2^64
                                     <= ((m-1)*(2^64-1) + (2^64-1)*m) / 2^64
                                      = 2*m*(1-1/2^64) - 1*(1-1/2^64). May need to subtract m. */
    "movq %[t0], %%rax\n\t" /* See if result > m */
    "movq %[t1], %%rdx\n\t"
    "subq %[m0], %[t0]\n\t" /* Try subtracting m, see if there's a carry */
    "sbbq %[m1], %[t1]\n\t"
    "cmovc %%rax, %[t0]\n\t" /* Carry -> restore old result */
    "cmovc %%rdx, %[t1]\n\t"
    : [u] "+&r" (u), [t0] "=&r" (r[0]), [t1] "=&r" (r[1]), [a0] "+&a" (a0)
    : [a1] "g" (a[1]), [b] "rm" (b),
      [m0] "rm" (m[0].m[0]), [m1] "rm" (m[0].m[1])
    : "%rdx", "cc"
  );
#else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */
  unsigned long pl, ph, t[2];

  /* m < 1/4 W^2,  a < m, b < W */

  /* Product of b and low word */
  ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), a[0], b); /* t1:t0 = a0*b < W^2 */

  /* One REDC step */
  modredc2ul2_redc1 (t, t, m); /* t1:t0 = (a0*b + k*m) / W < m + W < 1/4 W^2 + W */

  /* Product of b and high word  */
  ularith_mul_ul_ul_2ul (&pl, &ph, a[1], b);   /* ph:pl < 1/4 W^2 */
  ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 1/2 W^2 + W */

  /* Conditional subtraction of m brings the result back below m. */
  ularith_sub_2ul_2ul_ge (&(t[0]), &(t[1]), m[0].m[0], m[0].m[1]);

  r[0] = t[0];
  r[1] = t[1];
#endif
#if defined(MODTRACE)
  printf (" == (%lu * 2^%d + %lu) /* PARI */ \n", r[1], LONG_BIT, r[0]);
#endif
  ASSERT_EXPENSIVE (modredc2ul2_intlt (r, m[0].m));
}
205 
206 
207 /* ================= Functions that are part of the API ================= */
208 
209 /* Some functions for integers of the same width as the modulus */
210 
211 MAYBE_UNUSED
212 static inline void
modredc2ul2_intinit(modintredc2ul2_t r)213 modredc2ul2_intinit (modintredc2ul2_t r)
214 {
215   r[0] = 0;
216   r[1] = 0;
217 }
218 
219 
220 MAYBE_UNUSED
221 static inline void
modredc2ul2_intclear(modintredc2ul2_t r MAYBE_UNUSED)222 modredc2ul2_intclear (modintredc2ul2_t r MAYBE_UNUSED)
223 {
224   return;
225 }
226 
227 
228 MAYBE_UNUSED
229 static inline void
modredc2ul2_intset(modintredc2ul2_t r,const modintredc2ul2_t s)230 modredc2ul2_intset (modintredc2ul2_t r, const modintredc2ul2_t s)
231 {
232   r[0] = s[0];
233   r[1] = s[1];
234 }
235 
236 MAYBE_UNUSED
237 static inline void
modredc2ul2_intset_ul(modintredc2ul2_t r,const unsigned long s)238 modredc2ul2_intset_ul (modintredc2ul2_t r, const unsigned long s)
239 {
240   r[0] = s;
241   r[1] = 0UL;
242 }
243 
244 MAYBE_UNUSED
245 static inline void
modredc2ul2_intset_uls(modintredc2ul2_t r,const unsigned long * s,const size_t n)246 modredc2ul2_intset_uls (modintredc2ul2_t r, const unsigned long *s,
247                         const size_t n)
248 {
249   if (n == 0) {
250     r[0] = 0;
251     r[1] = 0;
252   } else if (n == 1) {
253     r[0] = s[0];
254     r[1] = 0;
255   } else if (n == 2) {
256     r[0] = s[0];
257     r[1] = s[1];
258   } else
259     abort();
260 }
261 
262 /* Get the least significant unsigned long of r */
263 MAYBE_UNUSED
264 static inline unsigned long
modredc2ul2_intget_ul(const modintredc2ul2_t r)265 modredc2ul2_intget_ul (const modintredc2ul2_t r)
266 {
267   return r[0];
268 }
269 
270 MAYBE_UNUSED
271 static inline size_t
modredc2ul2_intget_uls(unsigned long * r,const modintredc2ul2_t s)272 modredc2ul2_intget_uls (unsigned long *r, const modintredc2ul2_t s)
273 {
274   r[0] = s[0];
275   if (s[1] != 0) {
276     r[1] = s[1];
277     return 2;
278   }
279   return 1;
280 }
281 
282 MAYBE_UNUSED
283 static inline double
modredc2ul2_intget_double(const modintredc2ul2_t s)284 modredc2ul2_intget_double (const modintredc2ul2_t s)
285 {
286   double d = (double) s[1];
287 #if (LONG_BIT == 32)
288   d *= 4294967296.0;
289 #elif (LONG_BIT == 64)
290   d *= 18446744073709551616.0;
291 #else
292 #error "unsupported value of LONG_BIT"
293 #endif
294   return d + (double) s[0];
295 }
296 
297 MAYBE_UNUSED
298 static inline int
modredc2ul2_intequal(const modintredc2ul2_t a,const modintredc2ul2_t b)299 modredc2ul2_intequal (const modintredc2ul2_t a, const modintredc2ul2_t b)
300 {
301   return (a[0] == b[0] && a[1] == b[1]);
302 }
303 
304 MAYBE_UNUSED
305 static inline int
modredc2ul2_intequal_ul(const modintredc2ul2_t a,const unsigned long b)306 modredc2ul2_intequal_ul (const modintredc2ul2_t a, const unsigned long b)
307 {
308   return (a[0] == b && a[1] == 0UL);
309 }
310 
311 /* Returns 1 if a < b, 0 otherwise */
312 MAYBE_UNUSED
313 static inline int
modredc2ul2_intlt(const modintredc2ul2_t a,const modintredc2ul2_t b)314 modredc2ul2_intlt (const modintredc2ul2_t a, const modintredc2ul2_t b)
315 {
316     modintredc2ul2_t t;
317 
318     modredc2ul2_intset (t, a);
319     return ularith_sub_2ul_2ul_cy (&(t[0]), &(t[1]), b[0], b[1]);
320 }
321 
322 MAYBE_UNUSED
323 static inline int
modredc2ul2_intcmp(const modintredc2ul2_t a,const modintredc2ul2_t b)324 modredc2ul2_intcmp (const modintredc2ul2_t a, const modintredc2ul2_t b)
325 {
326   if (a[1] < b[1])
327     return -1;
328   if (a[1] > b[1])
329     return 1;
330   return (a[0] < b[0]) ? -1 : (a[0] == b[0]) ? 0 : 1;
331 }
332 
333 MAYBE_UNUSED
334 static inline int
modredc2ul2_intcmp_ul(const modintredc2ul2_t a,const unsigned long b)335 modredc2ul2_intcmp_ul (const modintredc2ul2_t a, const unsigned long b)
336 {
337   if (a[1] > 0UL)
338     return 1;
339   return (a[0] < b) ? -1 : (a[0] == b) ? 0 : 1;
340 }
341 
342 MAYBE_UNUSED
343 static inline int
modredc2ul2_intcmp_uint64(const modintredc2ul2_t a,const uint64_t b)344 modredc2ul2_intcmp_uint64 (const modintredc2ul2_t a, const uint64_t b)
345 {
346   ASSERT(ULONG_MAX == UINT32_MAX || ULONG_MAX == UINT64_MAX);
347   if (ULONG_MAX == UINT32_MAX) {
348     uint64_t t = ((uint64_t) a[1] << 32) + a[0];
349     return (t < b) ? -1 : (t == b) ? 0 : 1;
350   } else {
351     if (a[1] > 0UL)
352       return 1;
353     return (a[0] < b) ? -1 : (a[0] == b) ? 0 : 1;
354   }
355 }
356 
357 MAYBE_UNUSED
358 static inline int
modredc2ul2_intfits_ul(const modintredc2ul2_t a)359 modredc2ul2_intfits_ul (const modintredc2ul2_t a)
360 {
361   return (a[1] == 0UL);
362 }
363 
364 MAYBE_UNUSED
365 static inline void
modredc2ul2_intadd(modintredc2ul2_t r,const modintredc2ul2_t a,const modintredc2ul2_t b)366 modredc2ul2_intadd (modintredc2ul2_t r, const modintredc2ul2_t a,
367 		    const modintredc2ul2_t b)
368 {
369   modintredc2ul2_t t;
370   modredc2ul2_intset (t, a);
371   ularith_add_2ul_2ul (&(t[0]), &(t[1]), b[0], b[1]);
372   modredc2ul2_intset (r, t);
373 }
374 
375 MAYBE_UNUSED
376 static inline void
modredc2ul2_intsub(modintredc2ul2_t r,const modintredc2ul2_t a,const modintredc2ul2_t b)377 modredc2ul2_intsub (modintredc2ul2_t r, const modintredc2ul2_t a,
378 		    const modintredc2ul2_t b)
379 {
380   modintredc2ul2_t t;
381   modredc2ul2_intset (t, a);
382   ularith_sub_2ul_2ul (&(t[0]), &(t[1]), b[0], b[1]);
383   modredc2ul2_intset (r, t);
384 }
385 
386 /* Returns the number of bits in a, that is, floor(log_2(a))+1.
387    For a == 0 returns 0. */
388 MAYBE_UNUSED
389 static inline size_t
modredc2ul2_intbits(const modintredc2ul2_t a)390 modredc2ul2_intbits (const modintredc2ul2_t a)
391 {
392   if (a[1] > 0UL)
393     return 2*LONG_BIT - ularith_clz (a[1]);
394 
395   if (a[0] > 0UL)
396     return LONG_BIT - ularith_clz (a[0]);
397 
398   return 0;
399 }
400 
401 
402 /* r = trunc(s / 2^i) */
403 MAYBE_UNUSED
404 static inline void
modredc2ul2_intshr(modintredc2ul2_t r,const modintredc2ul2_t s,const int i)405 modredc2ul2_intshr (modintredc2ul2_t r, const modintredc2ul2_t s, const int i)
406 {
407   if (i >= 2 * LONG_BIT) {
408     r[1] = r[0] = 0UL;
409   } else if (i >= LONG_BIT) {
410     r[0] = s[1] >> (i - LONG_BIT);
411     r[1] = 0UL; /* May overwrite s[1] */
412   } else { /* i < LONG_BIT */
413     ularith_shrd (&(r[0]), s[1], s[0], i);
414     r[1] = s[1] >> i;
415   }
416 }
417 
418 
419 /* r = (s * 2^i) % (2^(2 * LONG_BIT)) */
420 MAYBE_UNUSED
421 static inline void
modredc2ul2_intshl(modintredc2ul2_t r,const modintredc2ul2_t s,const int i)422 modredc2ul2_intshl (modintredc2ul2_t r, const modintredc2ul2_t s, const int i)
423 {
424   if (i >= 2 * LONG_BIT) {
425     r[1] = r[0] = 0UL;
426   } else if (i >= LONG_BIT) {
427     r[1] = s[0] << (i - LONG_BIT);
428     r[0] = 0UL;
429   } else { /* i < LONG_BIT */
430     ularith_shld (&(r[1]), s[0], s[1], i);
431     r[0] = s[0] << i;
432   }
433 }
434 
435 
/* r = n/d. We require d|n */
MAYBE_UNUSED
static inline void
modredc2ul2_intdivexact (modintredc2ul2_t r, const modintredc2ul2_t n,
                         const modintredc2ul2_t d)
{
  modintredc2ul2_t n1, d1;
  unsigned long invf, r0, k0, k1;
  int i;
#ifdef WANT_ASSERT_EXPENSIVE
  unsigned long s0 = n[0], s1 = n[1];
#endif

  /* Work on copies; r may alias n or d. */
  modredc2ul2_intset (n1, n);
  modredc2ul2_intset (d1, d);

  /* Make d odd */
  if (d1[0] == 0UL)
    {
      /* d is a multiple of 2^LONG_BIT; since d|n, so is n. Shift both
         down by a full word. */
      ASSERT (n1[0] == 0UL);
      d1[0] = d1[1];
      d1[1] = 0UL;
      n1[0] = n1[1];
      n1[1] = 0UL;
    }
  ASSERT(d1[0] != 0UL);
  /* Strip the remaining common factor of 2 (d|n, so n has at least as
     many trailing zero bits as d). */
  i = ularith_ctz (d1[0]);
  ularith_shrd (&(d1[0]), d1[1], d1[0], i);
  d1[1] >>= i;
  ASSERT((n1[0] & ((1UL << i) - 1UL)) == 0UL);
  ularith_shrd (&(n1[0]), n1[1], n1[0], i);
  n1[1] >>= i;

  /* d1 is now odd, so it has an inverse mod 2^LONG_BIT. The exact
     quotient's low word is n1[0] * (1/d1[0]) (mod 2^LONG_BIT). */
  invf = ularith_invmod (d1[0]);
  r0 = invf * n1[0];
  /* Subtract r0*d1 from n1; the low word cancels exactly, leaving the
     remaining quotient word in n1[1]/d1[0]. */
  ularith_mul_ul_ul_2ul (&k0, &k1, r0, d1[0]);
  ularith_sub_2ul_2ul (&(n1[0]), &(n1[1]), k0, k1);
  n1[1] -= r0 * d1[1];
  ASSERT (n1[0] == 0UL);
  r[0] = r0;
  r[1] = invf * n1[1];
#ifdef WANT_ASSERT_EXPENSIVE
  /* Verify r*d == n (mod 2^(2*LONG_BIT)) word by word. */
  ASSERT_EXPENSIVE (d[1] == 0UL || r[1] == 0UL);
  ularith_mul_ul_ul_2ul (&k0, &k1, r[0], d[0]);
  ularith_sub_2ul_2ul (&s0, &s1, k0, k1);
  ASSERT_EXPENSIVE (s0 == 0UL);
  ularith_mul_ul_ul_2ul (&k0, &k1, r[0], d[1]);
  ASSERT_EXPENSIVE (k1 == 0UL);
  s1 -= k0;
  ularith_mul_ul_ul_2ul (&k0, &k1, r[1], d[0]);
  ASSERT_EXPENSIVE (k1 == 0UL);
  s1 -= k0;
  ASSERT_EXPENSIVE (s1 == 0UL);
#endif
}
491 
492 MAYBE_UNUSED
493 static inline unsigned long
modredc2ul2_intmod_ul(const modintredc2ul2_t n,const unsigned long d)494 modredc2ul2_intmod_ul (const modintredc2ul2_t n, const unsigned long d)
495 {
496   unsigned long r;
497   if (n[1] < d)
498     {
499       ularith_div_2ul_ul_ul_r (&r, n[0], n[1], d);
500     }
501   else
502     {
503       unsigned long t;
504       ularith_div_2ul_ul_ul_r (&t, n[1], 0UL, d);
505       ularith_div_2ul_ul_ul_r (&r, n[0], t, d);
506     }
507   return r;
508 }
509 
/* r = n % d */
MAYBE_UNUSED
static inline void
modredc2ul2_intmod (modintredc2ul2_t r, const modintredc2ul2_t n,
                    const modintredc2ul2_t d)
{
  if (d[1] == 0UL)
    {
      /* Single-word divisor: delegate to the 2-word/1-word reduction. */
      r[0] = modredc2ul2_intmod_ul (n, d[0]);
      r[1] = 0;
    }
  else
    {
      modintredc2ul2_t t1, t2;
      unsigned long q, dummy;
      int i;

      /* Divide both by 2^k s.t. 2^(LONG_BIT-1) <= d/2^k < 2^LONG_BIT */
      modredc2ul2_intshr (t1, n, 1);
      modredc2ul2_intshr (t2, d, 1);
      if (t2[1] != 0UL)
        {
          i = LONG_BIT - ularith_clz (t2[1]);
          modredc2ul2_intshr (t1, t1, i);
          modredc2ul2_intshr (t2, t2, i);
        }
      ASSERT(t2[1] == 0);
      ASSERT((t2[0] & (1UL << (LONG_BIT - 1))) != 0UL);
      ASSERT (t1[1] < t2[0]);

      /* Estimate the quotient from the normalized operands, then form
         q*d in t1. The estimate may be one too large. */
      ularith_div_2ul_ul_ul (&q, &dummy, t1[0], t1[1], t2[0]);
      ularith_mul_ul_ul_2ul (&(t1[0]), &(t1[1]), q, d[0]);
      t1[1] += q * d[1];
      /* printf ("n=%lu*2^%d + %lu; d=%lu*2^%d + %lu; q=%lu; "
              "t1=%lu*2^%d + %lu;\n",
              n[1], LONG_BIT, n[0], d[1], LONG_BIT, d[0], q,
              t1[1], LONG_BIT, t1[0]); */

      /* If q*d overshoots n, correct by one multiple of d. */
      if (modredc2ul2_intlt (n, t1))
        modredc2ul2_intsub (t1, t1, d);
      ASSERT(!modredc2ul2_intlt (n, t1));
      modredc2ul2_intsub(r, n, t1);
    }
}
554 
555 
556 /* Functions for the modulus */
557 
/* Init the modulus from modintredc2ul2_t. */
MAYBE_UNUSED
static inline void
modredc2ul2_initmod_int (modulusredc2ul2_t m, const modintredc2ul2_t s)
{
  /* The modulus must have a non-zero high word and fit in
     2*LONG_BIT - 2 bits. It must also be odd for ularith_invmod;
     the div2 code asserts oddness separately. */
  ASSERT (s[1] > 0UL);
  ASSERT (s[1] < (1UL << (LONG_BIT - 2)));
  modredc2ul2_intset (m[0].m, s);
  /* invm = -1/m (mod 2^LONG_BIT), the REDC constant. */
  m[0].invm = -ularith_invmod (s[0]);

  /* Compute 1 in Montgomery form, i.e. 2^(2*LONG_BIT) mod m:
     0 - m wraps to 2^(2*LONG_BIT) - m, which is then reduced mod m. */
  modredc2ul2_intset_ul (m[0].one, 0UL);
  modredc2ul2_intsub (m[0].one, m[0].one, m[0].m); /* 2^(2*LONG_BIT) - m */
  modredc2ul2_intmod (m[0].one, m[0].one, m[0].m);

#ifdef WANT_ASSERT_EXPENSIVE
  {
    /* Converting "one" out of Montgomery form must yield 1. */
    modintredc2ul2_t t;
    modredc2ul2_get_int (t, m[0].one, m);
    ASSERT_EXPENSIVE (modredc2ul2_intequal_ul (t, 1UL));
  }
#endif
}
580 
581 
582 /* Returns the modulus as a modintredc2ul2_t. */
583 MAYBE_UNUSED
584 static inline void
modredc2ul2_getmod_int(modintredc2ul2_t r,const modulusredc2ul2_t m)585 modredc2ul2_getmod_int (modintredc2ul2_t r, const modulusredc2ul2_t m)
586 {
587   modredc2ul2_intset (r, m[0].m);
588 }
589 
590 
591 MAYBE_UNUSED
592 static inline void
modredc2ul2_clearmod(modulusredc2ul2_t m MAYBE_UNUSED)593 modredc2ul2_clearmod (modulusredc2ul2_t m MAYBE_UNUSED)
594 {
595   return;
596 }
597 
598 
599 /* Functions for residues */
600 
601 /* Initialises a residueredc2ul2_t and sets it to zero */
602 MAYBE_UNUSED
603 static inline void
modredc2ul2_init(residueredc2ul2_t r,const modulusredc2ul2_t m MAYBE_UNUSED)604 modredc2ul2_init (residueredc2ul2_t r, const modulusredc2ul2_t m MAYBE_UNUSED)
605 {
606   modredc2ul2_intset_ul (r, 0UL);
607 }
608 
609 
610 /* Initialises a residueredc2ul2_t, but does not set it to zero. For fixed
611    length residueredc2ul2_t, that leaves nothing to do at all. */
612 MAYBE_UNUSED
613 static inline void
modredc2ul2_init_noset0(residueredc2ul2_t r MAYBE_UNUSED,const modulusredc2ul2_t m MAYBE_UNUSED)614 modredc2ul2_init_noset0 (residueredc2ul2_t r MAYBE_UNUSED,
615 			 const modulusredc2ul2_t m MAYBE_UNUSED)
616 {
617   return;
618 }
619 
620 
621 MAYBE_UNUSED
622 static inline void
modredc2ul2_clear(residueredc2ul2_t r MAYBE_UNUSED,const modulusredc2ul2_t m MAYBE_UNUSED)623 modredc2ul2_clear (residueredc2ul2_t r MAYBE_UNUSED,
624 		   const modulusredc2ul2_t m MAYBE_UNUSED)
625 {
626   return;
627 }
628 
629 
630 MAYBE_UNUSED
631 static inline void
modredc2ul2_set(residueredc2ul2_t r,const residueredc2ul2_t s,const modulusredc2ul2_t m MAYBE_UNUSED)632 modredc2ul2_set (residueredc2ul2_t r, const residueredc2ul2_t s,
633 		 const modulusredc2ul2_t m MAYBE_UNUSED)
634 {
635   ASSERT_EXPENSIVE (modredc2ul2_intlt (s, m[0].m));
636   modredc2ul2_intset (r, s);
637 }
638 
639 
640 MAYBE_UNUSED
641 static inline void
modredc2ul2_set_ul(residueredc2ul2_t r,const unsigned long s,const modulusredc2ul2_t m)642 modredc2ul2_set_ul (residueredc2ul2_t r, const unsigned long s,
643 		    const modulusredc2ul2_t m)
644 {
645   modredc2ul2_intset_ul (r, s);
646   modredc2ul2_tomontgomery (r, r, m);
647 #ifdef WANT_ASSERT_EXPENSIVE
648   {
649     modintredc2ul2_t t;
650     modredc2ul2_get_int (t, r, m);
651     ASSERT_EXPENSIVE (t[0] == s && t[1] == 0UL);
652   }
653 #endif
654 }
655 
656 
657 /* Sets the residueredc2ul2_t to the class represented by the integer s.
658    Assumes that s is reduced (mod m), i.e. 0 <= s < m */
659 
660 MAYBE_UNUSED
661 static inline void
modredc2ul2_set_ul_reduced(residueredc2ul2_t r,const unsigned long s,const modulusredc2ul2_t m MAYBE_UNUSED)662 modredc2ul2_set_ul_reduced (residueredc2ul2_t r, const unsigned long s,
663 			    const modulusredc2ul2_t m MAYBE_UNUSED)
664 {
665   modredc2ul2_set_ul (r, s, m);
666 }
667 
668 
669 MAYBE_UNUSED
670 static inline void
modredc2ul2_set_int(residueredc2ul2_t r,const modintredc2ul2_t s,const modulusredc2ul2_t m)671 modredc2ul2_set_int (residueredc2ul2_t r, const modintredc2ul2_t s,
672 		     const modulusredc2ul2_t m)
673 {
674   if (!modredc2ul2_intlt (s, m[0].m))
675     modredc2ul2_intmod (r, s, m[0].m);
676   else
677     modredc2ul2_intset (r, s);
678 
679   modredc2ul2_tomontgomery (r, r, m);
680 }
681 
682 
683 MAYBE_UNUSED
684 static inline void
modredc2ul2_set_int_reduced(residueredc2ul2_t r,const modintredc2ul2_t s,const modulusredc2ul2_t m)685 modredc2ul2_set_int_reduced (residueredc2ul2_t r, const modintredc2ul2_t s,
686 			     const modulusredc2ul2_t m)
687 {
688   ASSERT (modredc2ul2_intlt (s, m[0].m));
689   modredc2ul2_intset (r, s);
690   modredc2ul2_tomontgomery (r, r, m);
691 }
692 
693 
694 MAYBE_UNUSED
695 static inline void
modredc2ul2_set0(residueredc2ul2_t r,const modulusredc2ul2_t m MAYBE_UNUSED)696 modredc2ul2_set0 (residueredc2ul2_t r, const modulusredc2ul2_t m MAYBE_UNUSED)
697 {
698   modredc2ul2_intset_ul (r, 0UL);
699 }
700 
701 
702 MAYBE_UNUSED
703 static inline void
modredc2ul2_set1(residueredc2ul2_t r,const modulusredc2ul2_t m)704 modredc2ul2_set1 (residueredc2ul2_t r, const modulusredc2ul2_t m)
705 {
706   modredc2ul2_intset (r, m[0].one);
707 }
708 
709 
710 /* Exchanges the values of the two arguments */
711 
712 MAYBE_UNUSED
713 static inline void
modredc2ul2_swap(residueredc2ul2_t a,residueredc2ul2_t b,const modulusredc2ul2_t m MAYBE_UNUSED)714 modredc2ul2_swap (residueredc2ul2_t a, residueredc2ul2_t b,
715 		  const modulusredc2ul2_t m MAYBE_UNUSED)
716 {
717   modintredc2ul2_t t;
718   ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
719   ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));
720   modredc2ul2_intset (t, a);
721   modredc2ul2_intset (a, b);
722   modredc2ul2_intset (b, t);
723 }
724 
725 
726 /* Returns the least significant unsigned long of the residue. How to signal
727    if the residue does not fit in one unsigned long? */
728 
729 MAYBE_UNUSED
730 static inline unsigned long
modredc2ul2_get_ul(const residueredc2ul2_t s,const modulusredc2ul2_t m MAYBE_UNUSED)731 modredc2ul2_get_ul (const residueredc2ul2_t s,
732 		    const modulusredc2ul2_t m MAYBE_UNUSED)
733 {
734   residueredc2ul2_t t;
735   ASSERT_EXPENSIVE (modredc2ul2_intlt (s, m[0].m));
736   modredc2ul2_frommontgomery (t, s, m);
737   ASSERT (t[1] == 0UL);
738   return t[0];
739 }
740 
741 
742 /* Returns the residue as a modintredc2ul2_t */
743 
744 MAYBE_UNUSED
745 static inline void
modredc2ul2_get_int(modintredc2ul2_t r,const residueredc2ul2_t s,const modulusredc2ul2_t m MAYBE_UNUSED)746 modredc2ul2_get_int (modintredc2ul2_t r, const residueredc2ul2_t s,
747 		     const modulusredc2ul2_t m MAYBE_UNUSED)
748 {
749   ASSERT_EXPENSIVE (modredc2ul2_intlt (s, m[0].m));
750   modredc2ul2_frommontgomery (r, s, m);
751 }
752 
753 
754 MAYBE_UNUSED
755 static inline int
modredc2ul2_equal(const residueredc2ul2_t a,const residueredc2ul2_t b,const modulusredc2ul2_t m MAYBE_UNUSED)756 modredc2ul2_equal (const residueredc2ul2_t a, const residueredc2ul2_t b,
757 		   const modulusredc2ul2_t m MAYBE_UNUSED)
758 {
759   ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
760   ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));
761   return (modredc2ul2_intequal(a, b));
762 }
763 
764 
765 MAYBE_UNUSED
766 static inline int
modredc2ul2_is0(const residueredc2ul2_t a,const modulusredc2ul2_t m MAYBE_UNUSED)767 modredc2ul2_is0 (const residueredc2ul2_t a, const modulusredc2ul2_t m MAYBE_UNUSED)
768 {
769   ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
770   return (modredc2ul2_intequal_ul(a, 0UL));
771 }
772 
773 
774 MAYBE_UNUSED
775 static inline int
modredc2ul2_is1(const residueredc2ul2_t a,const modulusredc2ul2_t m MAYBE_UNUSED)776 modredc2ul2_is1 (const residueredc2ul2_t a, const modulusredc2ul2_t m MAYBE_UNUSED)
777 {
778   ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
779   return (modredc2ul2_intequal(a, m[0].one));
780 }
781 
782 
783 MAYBE_UNUSED
784 static inline void
modredc2ul2_add(residueredc2ul2_t r,const residueredc2ul2_t a,const residueredc2ul2_t b,const modulusredc2ul2_t m)785 modredc2ul2_add (residueredc2ul2_t r, const residueredc2ul2_t a,
786 		 const residueredc2ul2_t b, const modulusredc2ul2_t m)
787 {
788   const unsigned long t0 = b[0], t1 = b[1]; /* r, a, and/or b may overlap */
789   ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
790   ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));
791 
792   modredc2ul2_intset (r, a);
793   ularith_add_2ul_2ul (&(r[0]), &(r[1]), t0, t1);
794   ularith_sub_2ul_2ul_ge (&(r[0]), &(r[1]), m[0].m[0], m[0].m[1]);
795   ASSERT_EXPENSIVE (modredc2ul2_intlt (r, m[0].m));
796 }
797 
798 
/* r = a - b (mod m). Requires a < m and b < m; ensures r < m. */
MAYBE_UNUSED
static inline void
modredc2ul2_sub (residueredc2ul2_t r, const residueredc2ul2_t a,
		 const residueredc2ul2_t b, const modulusredc2ul2_t m)
{
  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
  ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));

#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
  {
    /* Branch-free: compute t = a - b, and s = m if the subtraction
       borrowed, else s = 0; then r = t + s. */
    unsigned long s1 = m[0].m[0], s2 = m[0].m[1], t1 = a[0], t2 = a[1];

    __asm__ __VOLATILE (
	     "subq %4, %0\n\t"
	     "sbbq %5, %1\n\t"    /* t -= b */
	     "cmovncq %6, %2\n\t" /* If !carry, s = 0 */
	     "cmovncq %6, %3\n"
	     : "+&r" (t1), "+&r" (t2), "+&r" (s1), "+r" (s2)
	     : "g" (b[0]), "g" (b[1]), "rm" (0UL)
	     : "cc"
	     );
    ularith_add_2ul_2ul (&t1, &t2, s1, s2);
    r[0] = t1;
    r[1] = t2;
  }
#else
  {
    unsigned long t1 = a[0], t2 = a[1];
    ularith_sub_2ul_2ul (&t1, &t2, b[0], b[1]);

    /* The comparison of the (wrapped) difference against a detects a
       borrow, i.e. a < b; in that case add m back. */
    if (t2 > a[1] || (t2 == a[1] && t1 > a[0]))
      ularith_add_2ul_2ul (&t1, &t2, m[0].m[0], m[0].m[1]);

    r[0] = t1;
    r[1] = t2;
  }
#endif
}
837 
838 
839 MAYBE_UNUSED
840 static inline void
modredc2ul2_add1(residueredc2ul2_t r,const residueredc2ul2_t a,const modulusredc2ul2_t m)841 modredc2ul2_add1 (residueredc2ul2_t r, const residueredc2ul2_t a,
842 		  const modulusredc2ul2_t m)
843 {
844   modredc2ul2_add(r, a, m[0].one, m);
845 }
846 
847 
848 MAYBE_UNUSED
849 static inline void
modredc2ul2_add_ul(residueredc2ul2_t r,const residueredc2ul2_t a,const unsigned long b,const modulusredc2ul2_t m)850 modredc2ul2_add_ul (residueredc2ul2_t r, const residueredc2ul2_t a,
851 		    const unsigned long b, const modulusredc2ul2_t m)
852 {
853   residueredc2ul2_t t;
854 
855   /* TODO: speed up */
856   ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
857   modredc2ul2_init_noset0 (t, m);
858   modredc2ul2_set_ul (t, b, m);
859   modredc2ul2_add (r, a, t, m);
860   modredc2ul2_clear (t, m);
861 }
862 
863 
864 MAYBE_UNUSED
865 static inline void
modredc2ul2_sub_ul(residueredc2ul2_t r,const residueredc2ul2_t a,const unsigned long b,const modulusredc2ul2_t m)866 modredc2ul2_sub_ul (residueredc2ul2_t r, const residueredc2ul2_t a,
867 		    const unsigned long b, const modulusredc2ul2_t m)
868 {
869   residueredc2ul2_t t;
870 
871   /* TODO: speed up */
872   ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
873   modredc2ul2_init_noset0 (t, m);
874   modredc2ul2_set_ul (t, b, m);
875   modredc2ul2_sub (r, a, t, m);
876   modredc2ul2_clear (t, m);
877 }
878 
879 
880 MAYBE_UNUSED
881 static inline void
modredc2ul2_neg(residueredc2ul2_t r,const residueredc2ul2_t a,const modulusredc2ul2_t m)882 modredc2ul2_neg (residueredc2ul2_t r, const residueredc2ul2_t a,
883 		 const modulusredc2ul2_t m)
884 {
885   ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
886   if (modredc2ul2_is0 (a, m))
887     modredc2ul2_set (r, a, m);
888   else
889     modredc2ul2_intsub (r, m[0].m, a);
890 }
891 
892 
893 MAYBE_UNUSED
894 static inline void
modredc2ul2_div2(residueredc2ul2_t r,const residueredc2ul2_t a,const modulusredc2ul2_t m)895 modredc2ul2_div2 (residueredc2ul2_t r, const residueredc2ul2_t a,
896 		  const modulusredc2ul2_t m)
897 {
898   ASSERT_EXPENSIVE (m[0].m[0] % 2UL != 0UL);
899   ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
900   modredc2ul2_intset (r, a);
901   if (r[0] % 2UL == 1UL)
902     modredc2ul2_intadd (r, r, m[0].m);
903   modredc2ul2_intshr (r, r, 1);
904 }
905 
906 
/* Montgomery multiplication: r := a * b / 2^(2*LONG_BIT) (mod m).
   Computes the 4-word product a*b interleaved with two REDC steps (one
   per low word of the accumulator), then performs one conditional
   subtraction so the result is the canonical representative < m.
   Requires a, b < m. */
MAYBE_UNUSED
static inline void
modredc2ul2_mul (residueredc2ul2_t r, const residueredc2ul2_t a,
               const residueredc2ul2_t b, const modulusredc2ul2_t m)
{
#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM

  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
  ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));
#if defined(MODTRACE)
  printf ("((%lu * 2^%d + %lu) * (%lu * 2^%d + %lu) / 2^%d) %% "
	  "(%lu * 2^%d + %lu)",
          a[1], LONG_BIT, a[0], b[1], LONG_BIT, b[0], 2 * LONG_BIT,
	  m[0].m[1], LONG_BIT, m[0].m[0]);
#endif

  unsigned long dummy;
  __asm__ __VOLATILE (
    /* Product of low words */
    "movq %[a0], %%rax\n\t"
    "mulq %[b0]\n\t"         /* rdx:rax = a0*b0 <= (2^64-1)^2 */
    "movq %%rdx, %[t0]\n\t"
    /* Compute u0*m, add to t0:rax */
    "imulq %[invm], %%rax\n\t"
    "movq %%rax, %[t2]\n\t"  /* t2 = u0 */
    "xorl %k[t1], %k[t1]\n\t"
    "mulq %[m0]\n\t"         /* rdx:rax = u0*m0 <= (2^64-1)^2 */
    "negq %%rax\n\t"         /* if low word != 0, carry to high word */
    "movq %[t2], %%rax\n\t"  /* independent, goes in pipe 0 */
    "adcq %%rdx, %[t0]\n\t"
    "setc %b[t1]\n\t"        /* t1:t0 = (a0*b0 + u0*m0) / 2^64 <= 2*2^64 - 4 */
    "mulq %[m1]\n\t"
    "addq %%rax, %[t0]\n\t"
    "movq %[a0], %%rax\n\t"  /* independent, goes in pipe 0 */
    "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0*b0+u0*m)/2^64 */
    ABORT_IF_CY              /* <= 2^126 - 2^62 */


    /* 2 products of low and high word */
    "xorl %k[t2], %k[t2]\n\t"
    "mulq %[b1]\n\t"         /* rdx:rax = a0*b1 <= (2^64-1)*(2^63-2^32-1) */
    "addq %%rax, %[t0]\n\t"
    "movq %[a1], %%rax\n\t"  /* independent, goes in pipe 0 */
    "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0*b0+u0*m)/2^64 + a0*b1 */
    ABORT_IF_CY              /* <= 2^126 - 2^62 + (2^64-1)*(2^63-2^32-1)
    			        = 2^127 + 2^126 - 2^96 ... */

    /* Free slot here */
    "mulq %[b0]\n\t"         /* rdx:rax = a1*b0 <= (2^63-2^32-1)*(2^64-1) */
    "addq %%rax, %[t0]\n\t"
    "movq %[a1], %%rax\n\t"  /* independent, goes in pipe 0 */
    "adcq %%rdx, %[t1]\n\t"
    "setc %b[t2]\n\t"        /* t2:t1:t0 = (a0*b0+u0*m)/2^64 + a0*b1 + a1*b0 */
    			     /* <= 2^126 - 2^62 + 2*(2^64-1)*(2^63-2^32-1)
                                = 2^128 + 2^126 - 2*2^96 ... */
    /* Product of high words */
    "mulq %[b1]\n\t"         /* rdx:rax = a1*b1 <= (2^63-2^32-1)^2 */
    "addq %%rax, %[t1]\n\t"
    "movq %[t0], %%rax\n\t"
    "adcq %%rdx, %[t2]\n\t"  /* t2:t1:t0 = (a*b+u0*m)/2^64 */
    ABORT_IF_CY              /* <= ((2^127-2^96-1)^2+(2^64-1)*(2^126-2^64+1))/2^64
                                = 2^190 - 2^160 ... */
    /* Free slot here */
    /* Compute u1*m, add to t2:t1:t0 */
    "imulq %[invm], %%rax\n\t"
    "movq %%rax, %[t0]\n\t" /* t0 = u1 */
    /* Free slot here */
    "mulq %[m0]\n\t"        /* rdx:rax = m0*u1 <= (2^64-1)^2 */
    "negq %%rax\n\t"        /* if low word != 0, carry to high word */
    "movq %[t0], %%rax\n\t"
    "adcq %%rdx, %[t1]\n\t"
    "adcq $0,%[t2]\n\t"     /* t2:t1:0 = (a*b+u0*m)/2^64 + u1*m0 */
    ABORT_IF_CY             /* <= 2^190 - 2^160 + 2*2^128 + 2^126 ... */

    "mulq %[m1]\n\t"        /* rdx:rax = u1*m1 */
    "addq %%rax, %[t1]\n\t"
    "adcq %%rdx, %[t2]\n\t" /* t2:t1 = ((a*b+u0*m)/2^64 + u1*m)/2^64 */
    ABORT_IF_CY             /* <= 2^127 - 2^96 - 1 */

    "movq %[t1], %%rax\n\t" /* See if result > m */
    "movq %[t2], %%rdx\n\t"
    "subq %[m0], %[t1]\n\t"
    "sbbq %[m1], %[t2]\n\t"
    "cmovc %%rax, %[t1]\n\t" /* Carry -> restore old result */
    "cmovc %%rdx, %[t2]\n\t"
    : [t0] "=&r" (dummy), [t1] "=&r" (r[0]), [t2] "=&r" (r[1])
    : [a0] "g" (a[0]), [a1] "g" (a[1]), [b0] "rm" (b[0]), [b1] "rm" (b[1]),
      [m0] "rm" (m[0].m[0]), [m1] "rm" (m[0].m[1]), [invm] "rm" (m[0].invm)
    : "%rax", "%rdx", "cc"
  );
#else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */

  unsigned long pl, ph, t[4], k;

  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
  ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));
#if defined(MODTRACE)
  printf ("((%lu * 2^%d + %lu) * (%lu * 2^%d + %lu) / 2^%d) %% "
	  "(%lu * 2^%d + %lu)",
          a[1], LONG_BIT, a[0], b[1], LONG_BIT, b[0], 2 * LONG_BIT,
	  m[0].m[1], LONG_BIT, m[0].m[0]);
#endif

  /* m < 1/4 W^2,  a,b < m */

  /* Product of the two low words */
  ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), a[0], b[0]);

  /* One REDC step */
  modredc2ul2_redc1 (t, t, m); /* t < 2m < 1/2 W^2 */

  /* Products of one low and one high word  */
  ularith_mul_ul_ul_2ul (&pl, &ph, a[1], b[0]);   /* ph:pl < 1/4 W^2 */
  ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 3/4 W^2 */
  ularith_mul_ul_ul_2ul (&pl, &ph, a[0], b[1]);   /* ph:pl < 1/4 W^2 */
  ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < W^2 */

  /* Product of the two high words */
  ularith_mul_ul_ul_2ul (&pl, &(t[2]), a[1], b[1]); /* t2:pl < 1/16 W^2 */
  ularith_add_ul_2ul (&(t[1]), &(t[2]), pl); /* t2:t1:t0 < 1/16 W^3 + W^2 */

  /* Compute t2:t1:t0 := t2:t1:t0 + km, km < Wm < 1/4 W^3 */
  k = t[0] * m[0].invm;
  ularith_mul_ul_ul_2ul (&pl, &ph, k, m[0].m[0]);
  /* REDC: pl is chosen so that t[0] + pl == 0 (mod W); the add would
     carry out exactly when t[0] != 0, so fold that carry into ph
     instead of materializing the (now zero) low word. */
  if (t[0] != 0UL)
    ph++; /* t[0] = 0 */
  ularith_add_ul_2ul (&(t[1]), &(t[2]), ph);
  ularith_mul_ul_ul_2ul (&pl, &ph, k, m[0].m[1]); /* ph:pl < 1/4 W^2 */
  ularith_add_2ul_2ul (&(t[1]), &(t[2]), pl, ph);
  /* t2:t1:0 < 1/16 W^3 + W^2 + 1/4 W^3 < 5/16 W^3 + W^2 */

  /* Result may be larger than m, but is < 2*m */

  ularith_sub_2ul_2ul_ge (&(t[1]), &(t[2]), m[0].m[0], m[0].m[1]);

  r[0] = t[1];
  r[1] = t[2];
#endif
#if defined(MODTRACE)
  printf (" == (%lu * 2^%d + %lu) /* PARI */ \n", r[1], LONG_BIT, r[0]);
#endif
  ASSERT_EXPENSIVE (modredc2ul2_intlt (r, m[0].m));
}
1050 
/* Montgomery squaring: r := a^2 / 2^(2*LONG_BIT) (mod m). Same structure
   as modredc2ul2_mul(), but the two cross products a0*a1 are computed
   once and added twice. Requires a < m; the result is reduced to the
   canonical representative < m. */
MAYBE_UNUSED
static inline void
modredc2ul2_sqr (residueredc2ul2_t r, const residueredc2ul2_t a,
                 const modulusredc2ul2_t m)
{
#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM

  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
#if defined(MODTRACE)
  printf ("((%lu * 2^%d + %lu) * (%lu * 2^%d + %lu) / 2^%d) %% "
	  "(%lu * 2^%d + %lu)",
          a[1], LONG_BIT, a[0], a[1], LONG_BIT, a[0], 2 * LONG_BIT,
	  m[0].m[1], LONG_BIT, m[0].m[0]);
#endif

/* m <= 2^126-1
   Since m1>0, m*u is maximal for m0=1 and u=2^64-1, so
   u*m is bounded by (2^126 - 2^64 + 1)*(2^64 - 1) =
   2^190 - 2^128 - 2^126 + 2*2^64 - 1.
   If a,b <= 2^127-2^96-1, then
   ((a*b+u0*m)/2^64 + u1*m)/2^64 <=  2^127-2^96-1
   If we allow non-canonical residues up to 2^127-2^96-1, we can skip
   the final conditional subtraction. These residues are still < 2^127,
   so an addition does not overflow */

  unsigned long dummy;
  __asm__ __VOLATILE (
    /* Product of low words */
    "movq %[a0], %%rax\n\t"
    "mulq %[a0]\n\t"         /* rdx:rax = a0^2 <= (2^64-1)^2 */
    "movq %%rdx, %[t0]\n\t"
    /* Compute u0*m, add to t0:rax */
    "imulq %[invm], %%rax\n\t"
    "movq %%rax, %[t2]\n\t"  /* t2 = u0 */
    "xorl %k[t1], %k[t1]\n\t"
    "mulq %[m0]\n\t"         /* rdx:rax = u0*m0 <= (2^64-1)^2 */
    "negq %%rax\n\t"         /* if low word != 0, carry to high word */
    "movq %[t2], %%rax\n\t"  /* independent, goes in pipe 0 */
    "adcq %%rdx, %[t0]\n\t"
    "setc %b[t1]\n\t"        /* t1:t0 = (a0^2 + u0*m0) / 2^64 <= 2*2^64 - 4 */
    "mulq %[m1]\n\t"
    "addq %%rax, %[t0]\n\t"
    "movq %[a0], %%rax\n\t"  /* independent, goes in pipe 0 */
    "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0^2+u0*m)/2^64 */
    ABORT_IF_CY              /* <= 2^126 - 2^62 */

    /* 2 products of low and high word */
    "xorl %k[t2], %k[t2]\n\t"
    "mulq %[a1]\n\t"         /* rdx:rax = a0*a1 <= (2^64-1)*(2^63-2^32-1) */
    "addq %%rax, %[t0]\n\t"
    "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0^2+u0*m)/2^64 + a0*a1 */
    ABORT_IF_CY              /* <= 2^126 - 2^62 + (2^64-1)*(2^63-2^32-1)
    			        = 2^127 + 2^126 - 2^96 ... */
    /* Add the cross product a second time (squaring: 2*a0*a1) */
    "addq %%rax, %[t0]\n\t"
    "adcq %%rdx, %[t1]\n\t"
    "movq %[a1], %%rax\n\t"  /* independent, goes in pipe 0 */
    "setc %b[t2]\n\t"        /* t2:t1:t0 = (a0*a0+u0*m)/2^64 + 2*a0*a1 */
    			     /* <= 2^126 - 2^62 + 2*(2^64-1)*(2^63-2^32-1)
                                = 2^128 + 2^126 - 2*2^96 ... */

    /* Product of high words */
    "mulq %[a1]\n\t"         /* rdx:rax = a1^2 <= (2^63-2^32-1)^2 */
    "addq %%rax, %[t1]\n\t"
    "movq %[t0], %%rax\n\t"
    "adcq %%rdx, %[t2]\n\t"  /* t2:t1:t0 = (a^2+u0*m)/2^64 */
    ABORT_IF_CY              /* <= ((2^127-2^96-1)^2+(2^64-1)*(2^126-2^64+1))/2^64
                                = 2^190 - 2^160 ... */
    /* Free slot here */
    /* Compute u1*m, add to t2:t1:t0 */
    "imulq %[invm], %%rax\n\t"
    "movq %%rax, %[t0]\n\t" /* t0 = u1 */
    /* Free slot here */
    "mulq %[m0]\n\t"        /* rdx:rax = m0*u1 <= (2^64-1)^2 */
    "negq %%rax\n\t"        /* if low word != 0, carry to high word */
    "movq %[t0], %%rax\n\t"
    "adcq %%rdx, %[t1]\n\t"
    "adcq $0,%[t2]\n\t"     /* t2:t1:0 = (a*a+u0*m)/2^64 + u1*m0 */
    ABORT_IF_CY             /* <= 2^190 - 2^160 + 2*2^128 + 2^126 ... */

    "mulq %[m1]\n\t"        /* rdx:rax = u1*m1 */
    "addq %%rax, %[t1]\n\t"
    "adcq %%rdx, %[t2]\n\t" /* t2:t1 = ((a*a+u0*m)/2^64 + u1*m)/2^64 */
    ABORT_IF_CY             /* <= 2^127 - 2^96 - 1 */

    "movq %[t1], %%rax\n\t" /* See if result > m */
    "movq %[t2], %%rdx\n\t"
    "subq %[m0], %[t1]\n\t"
    "sbbq %[m1], %[t2]\n\t"
    "cmovc %%rax, %[t1]\n\t" /* Carry -> restore old result */
    "cmovc %%rdx, %[t2]\n\t"
    : [t0] "=&r" (dummy), [t1] "=&r" (r[0]), [t2] "=&r" (r[1])
    : [a0] "g" (a[0]), [a1] "g" (a[1]),
      [m0] "rm" (m[0].m[0]), [m1] "rm" (m[0].m[1]), [invm] "rm" (m[0].invm)
    : "%rax", "%rdx", "cc"
  );
#else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */

  unsigned long pl, ph, t[4], k;

  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
#if defined(MODTRACE)
  printf ("((%lu * 2^%d + %lu)^2 %% (%lu * 2^%d + %lu)",
          a[1], LONG_BIT, a[0], 2 * LONG_BIT, m[0].m[1], LONG_BIT, m[0].m[0]);
#endif

  /* m < 1/4 W^2,  a < m */

  /* Square of the low word */
  ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), a[0], a[0]);

  /* One REDC step */
  modredc2ul2_redc1 (t, t, m); /* t < 2m < 1/2 W^2 */

  /* Products of low and high word, added twice (2*a0*a1) */
  ularith_mul_ul_ul_2ul (&pl, &ph, a[1], a[0]);   /* ph:pl < 1/4 W^2 */
  ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 3/4 W^2 */
  ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < W^2 */

  /* Square of high word */
  ularith_mul_ul_ul_2ul (&pl, &(t[2]), a[1], a[1]); /* t2:pl < 1/16 W^2 */
  ularith_add_ul_2ul (&(t[1]), &(t[2]), pl); /* t2:t1:t0 < 1/16 W^3 + W^2 */

  /* Compute t2:t1:t0 := t2:t1:t0 + km, km < Wm < 1/4 W^3 */
  k = t[0] * m[0].invm;
  ularith_mul_ul_ul_2ul (&pl, &ph, k, m[0].m[0]);
  /* REDC: pl cancels t[0] mod W; the carry of t[0] + pl exists exactly
     when t[0] != 0, so fold it into ph instead of adding the low word. */
  if (t[0] != 0UL)
    ph++; /* t[0] = 0 */
  ularith_add_ul_2ul (&(t[1]), &(t[2]), ph);
  ularith_mul_ul_ul_2ul (&pl, &ph, k, m[0].m[1]); /* ph:pl < 1/4 W^2 */
  ularith_add_2ul_2ul (&(t[1]), &(t[2]), pl, ph);
  /* t2:t1:0 < 1/16 W^3 + W^2 + 1/4 W^3 < 5/16 W^3 + W^2 */

  /* Result may be larger than m, but is < 2*m */

  ularith_sub_2ul_2ul_ge (&(t[1]), &(t[2]), m[0].m[0], m[0].m[1]);

  r[0] = t[1];
  r[1] = t[2];
#endif
#if defined(MODTRACE)
  printf (" == (%lu * 2^%d + %lu) /* PARI */ \n", r[1], LONG_BIT, r[0]);
#endif
  ASSERT_EXPENSIVE (modredc2ul2_intlt (r, m[0].m));
}
1195 
1196 
1197 MAYBE_UNUSED
1198 static inline int
modredc2ul2_next(residueredc2ul2_t r,const modulusredc2ul2_t m)1199 modredc2ul2_next (residueredc2ul2_t r, const modulusredc2ul2_t m)
1200 {
1201   ularith_add_ul_2ul (&(r[0]), &(r[1]), 1UL);
1202   return (modredc2ul2_intequal (r, m[0].m));
1203 }
1204 
1205 
1206 MAYBE_UNUSED
1207 static inline int
modredc2ul2_finished(const residueredc2ul2_t r,const modulusredc2ul2_t m)1208 modredc2ul2_finished (const residueredc2ul2_t r, const modulusredc2ul2_t m)
1209 {
1210   return (modredc2ul2_intequal (r, m[0].m));
1211 }
1212 
1213 /* Division by small integer n, where (n-1)*m may overflow the most
1214    significant word. Returns 1 if n is invertible modulo m, 0 if not.
1215 
1216    w_mod_n is word base (e.g., 2^32 or  2^64) mod n
1217    inv_n contains -1/i (mod n) if i is coprime to n, or 0 if i is not coprime
1218    to n, for 0 <= i < n
1219    c = n^(-1) (mod word base)
1220 */
1221 
1222 MAYBE_UNUSED
1223 static inline int
modredc2ul2_divn(residueredc2ul2_t r,const residueredc2ul2_t a,const unsigned long n,const unsigned long w_mod_n,const unsigned long * inv_n,const unsigned long c,const modulusredc2ul2_t m)1224 modredc2ul2_divn (residueredc2ul2_t r, const residueredc2ul2_t a,
1225 		  const unsigned long n, const unsigned long w_mod_n,
1226 		  const unsigned long *inv_n, const unsigned long c,
1227 		  const modulusredc2ul2_t m)
1228 {
1229   const unsigned long an = ((a[1] % n) * w_mod_n + a[0] % n) % n;
1230   const unsigned long mn = ((m[0].m[1] % n) * w_mod_n + m[0].m[0] % n) % n;
1231 
1232   residueredc2ul2_t t, t2;
1233   unsigned long k;
1234 #ifdef WANT_ASSERT_EXPENSIVE
1235   residueredc2ul2_t a_backup;
1236 #endif
1237 
1238   if (inv_n[mn] == 0)
1239     return 0;
1240 
1241 #ifdef WANT_ASSERT_EXPENSIVE
1242   modredc2ul2_init_noset0 (a_backup, m);
1243   modredc2ul2_set (a_backup, a, m);
1244 #endif
1245   modredc2ul2_init_noset0 (t, m);
1246   modredc2ul2_init_noset0 (t2, m);
1247   modredc2ul2_set (t, a, m);
1248 
1249   /* Make t[1]:t[0] == a+km (mod w^2) with a+km divisible by n */
1250   /* We want a+km == 0 (mod n), so k = -a*m^{-1} (mod n) */
1251   k = (inv_n[mn] * an) % n;
1252   ASSERT_EXPENSIVE ((an + k*mn) % n == 0);
1253   ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), m[0].m[0], k);
1254   t[1] += m[0].m[1] * k;
1255   ularith_add_2ul_2ul (&(t[0]), &(t[1]), a[0], a[1]);
1256 
1257   /* We want r = (a+km)/n. */
1258 
1259   /* May overwrite a */
1260   r[0] = t[0] * c;
1261 
1262   /* r0 == (a+km)/n (mod w)
1263      (r1*w + r0) * n = (a+km)
1264      (r1*w + r0) * n == t (mod w^2)
1265      r1*w*n == t - n*r0 (mod w^2)
1266                             t - n*r0 == 0 (mod w), thus
1267      r1*n == (t - n*r0)/w (mod w) */
1268 
1269   ularith_mul_ul_ul_2ul (&(t2[0]), &(t2[1]), r[0], n);
1270   ularith_sub_2ul_2ul (&(t[0]), &(t[1]), t2[0], t2[1]);
1271   ASSERT_EXPENSIVE (t[0] == 0UL);
1272   r[1] = t[1] * c;
1273 
1274 #ifdef WANT_ASSERT_EXPENSIVE
1275   {
1276     unsigned long i;
1277     modredc2ul2_set (t, r, m);
1278     for (i = 1; i < n; i++)
1279       modredc2ul2_add (t, t, r, m);
1280     ASSERT_EXPENSIVE (modredc2ul2_equal (t, a_backup, m));
1281     modredc2ul2_clear (a_backup, m);
1282   }
1283 #endif
1284 
1285   modredc2ul2_clear (t, m);
1286   modredc2ul2_clear (t2, m);
1287 
1288   return 1;
1289 }
1290 
/* Precomputed context for modredc2ul2_batch_Q_to_Fp(), created by
   modredc2ul2_batch_Q_to_Fp_init() from a numerator/denominator pair
   (see prototypes below). NOTE(review): the exact meaning of the
   scalar fields is established in the implementation file, not here —
   presumably quotient/remainder data of the fraction split across a
   2-word part (c) and single-word parts, plus the inverse of the
   denominator mod the word base; verify against the .c file. */
struct modredc2ul2_batch_Q_to_Fp_context_s {
  modintredc2ul2_t c;
  unsigned long rem_ul, ratio_ul, den_inv;
  modulusredc2ul2_t m;  /* modulus in this file's 2UL2 representation */
};
typedef struct modredc2ul2_batch_Q_to_Fp_context_s modredc2ul2_batch_Q_to_Fp_context_t;
1297 
1298 /* prototypes of non-inline functions */
1299 #ifdef __cplusplus
1300 extern "C" {
1301 #endif
1302 int modredc2ul2_div3 (residueredc2ul2_t, const residueredc2ul2_t,
1303 		      const modulusredc2ul2_t);
1304 int modredc2ul2_div5 (residueredc2ul2_t, const residueredc2ul2_t,
1305 		       const modulusredc2ul2_t);
1306 int modredc2ul2_div7 (residueredc2ul2_t, const residueredc2ul2_t,
1307 		      const modulusredc2ul2_t);
1308 int modredc2ul2_div11 (residueredc2ul2_t, const residueredc2ul2_t,
1309 		       const modulusredc2ul2_t);
1310 int modredc2ul2_div13 (residueredc2ul2_t, const residueredc2ul2_t,
1311 		       const modulusredc2ul2_t);
1312 void modredc2ul2_gcd (modintredc2ul2_t, const residueredc2ul2_t,
1313 		      const modulusredc2ul2_t);
1314 void modredc2ul2_pow_ul (residueredc2ul2_t, const residueredc2ul2_t,
1315 			 const unsigned long, const modulusredc2ul2_t);
1316 void modredc2ul2_2pow_ul (residueredc2ul2_t, const unsigned long,
1317                           const modulusredc2ul2_t);
1318 void modredc2ul2_pow_mp (residueredc2ul2_t, const residueredc2ul2_t,
1319 			 const unsigned long *, const int,
1320 			 const modulusredc2ul2_t);
1321 void modredc2ul2_2pow_mp (residueredc2ul2_t, const unsigned long *, const int,
1322 			  const modulusredc2ul2_t);
1323 void modredc2ul2_V_ul (residueredc2ul2_t, const residueredc2ul2_t,
1324 		       const unsigned long, const modulusredc2ul2_t);
1325 void modredc2ul2_V_mp (residueredc2ul2_t, const residueredc2ul2_t,
1326 		       const unsigned long *, const int,
1327 		       const modulusredc2ul2_t);
1328 int modredc2ul2_sprp (const residueredc2ul2_t, const modulusredc2ul2_t);
1329 int modredc2ul2_sprp2 (const modulusredc2ul2_t);
1330 int modredc2ul2_isprime (const modulusredc2ul2_t);
1331 int modredc2ul2_inv (residueredc2ul2_t, const residueredc2ul2_t,
1332 		     const modulusredc2ul2_t);
1333 int modredc2ul2_batchinv (residueredc2ul2_t *, const residueredc2ul2_t *,
1334                           size_t n, const residueredc2ul2_t,
1335                           const modulusredc2ul2_t);
1336 modredc2ul2_batch_Q_to_Fp_context_t *
1337 modredc2ul2_batch_Q_to_Fp_init (const modintredc2ul2_t, const modintredc2ul2_t);
1338 void modredc2ul2_batch_Q_to_Fp_clear (modredc2ul2_batch_Q_to_Fp_context_t *);
1339 
1340 int modredc2ul2_batch_Q_to_Fp (unsigned long *,
1341                                const modredc2ul2_batch_Q_to_Fp_context_t *,
1342                                unsigned long, int, const unsigned long *, size_t);
1343 int modredc2ul2_jacobi (const residueredc2ul2_t, const modulusredc2ul2_t);
1344 #ifdef __cplusplus
1345 }
1346 #endif
1347 
1348 #endif  /* MODREDC_2UL2_H */
1349