1 /* Some functions for modular arithmetic with residues and modulus in
2 unsigned long variables. The modulus can be up to 2 unsigned longs
3 in size with the two most significant bits zero (meaning 62 bits if
4 unsigned long has 32 bits, or 126 bits if unsigned long has 64 bits).
5 Moduli must be odd and have the upper word non-zero. Residues are stored
6 in Montgomery form, reduction after multiplication is done with REDC.
7 Due to inlining, this file must be included in the caller's source code
8 with #include */
9
/* Naming convention: all functions start with modredc2ul2, for
   MODulus REDC 2 Unsigned Longs minus 2 bits, followed by underscore,
   functionality of function (add, mul, etc), and possibly underscore and
   specification of what argument types the function takes (_ul, etc). */
14
15 #ifndef MODREDC_2UL2_H
16 #define MODREDC_2UL2_H
17
18 /**********************************************************************/
19 #include "cado_config.h" // for HAVE_GCC_STYLE_AMD64_INLINE_ASM
20 #if defined(MODTRACE)
21 #include <stdio.h>
22 #endif
23 #include <limits.h>
24 #include <stdint.h>
25 #include <stdlib.h> // for size_t, abort
26 #include "macros.h"
27 #include "ularith.h"
28
29 // #define MODTRACE 1
30
31 /*********************************************************************/
32 /* Helper macros */
33
34 /* A macro for function renaming. All functions here start with
35 modredc2ul2_ */
36 #define MODREDC2UL2_RENAME(x) modredc2ul2_##x
37
38 #define MODREDC2UL2_SIZE 2
39 #define MODREDC2UL2_MINBITS LONG_BIT
40 #define MODREDC2UL2_MAXBITS (2 * LONG_BIT - 2)
41
/* A residue and a modulus integer are each stored as two unsigned longs,
   least significant word first */
typedef unsigned long residueredc2ul2_t[MODREDC2UL2_SIZE];
typedef unsigned long modintredc2ul2_t[MODREDC2UL2_SIZE];
struct __modulusredc2ul2_t {
  modintredc2ul2_t m;    /* The modulus; odd, with 0 < m[1] < 2^(LONG_BIT-2) */
  residueredc2ul2_t one; /* 1 in Montgomery form, i.e. 2^(2*LONG_BIT) mod m */
  unsigned long invm;    /* -1/m (mod 2^LONG_BIT), the REDC constant */
};
typedef struct __modulusredc2ul2_t modulusredc2ul2_t[1];
50
51
52 /* ==================== Functions used internally ==================== */
53
54 static inline int
55 modredc2ul2_intlt (const modintredc2ul2_t a, const modintredc2ul2_t b);
56
57 static inline void
58 modredc2ul2_add (residueredc2ul2_t r, const residueredc2ul2_t a,
59 const residueredc2ul2_t b, const modulusredc2ul2_t m);
60 static inline void
61 modredc2ul2_get_int (modintredc2ul2_t r, const residueredc2ul2_t s,
62 const modulusredc2ul2_t m MAYBE_UNUSED);
63 static inline void
64 modredc2ul2_intset (modintredc2ul2_t r, const modintredc2ul2_t s);
65
66 MAYBE_UNUSED
67 static inline void
modredc2ul2_tomontgomery(residueredc2ul2_t r,const residueredc2ul2_t s,const modulusredc2ul2_t m)68 modredc2ul2_tomontgomery (residueredc2ul2_t r, const residueredc2ul2_t s,
69 const modulusredc2ul2_t m)
70 {
71 int i;
72
73 modredc2ul2_intset (r, s);
74 /* TODO FIXME: ridiculously slow */
75 for (i = 0; i < (MODREDC2UL2_SIZE * LONG_BIT); i++)
76 modredc2ul2_add (r, r, r, m);
77 }
78
79
/* Do a one-word REDC, i.e., r == s / w (mod m), w = 2^LONG_BIT.
   If m > w, r < 2m. If s<m, then r<m */
MAYBE_UNUSED
static inline void
modredc2ul2_redc1 (residueredc2ul2_t r, const residueredc2ul2_t s,
		   const modulusredc2ul2_t m)
{
  unsigned long t[4], k;

  /* Choose k so that s[0] + k*m[0].m[0] == 0 (mod w); invm = -1/m (mod w) */
  k = s[0] * m[0].invm;
  ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), k, m[0].m[0]);
  /* By choice of k, the low word of k*m[0].m[0] + s[0] is zero; that
     addition carries into the high word exactly when s[0] != 0 */
  if (s[0] != 0UL)
    t[1]++;
  t[2] = 0;
  ularith_add_ul_2ul (&(t[1]), &(t[2]), s[1]); /* t[2] <= 1 */
  ularith_mul_ul_ul_2ul (&(t[0]), &(t[3]), k, m[0].m[1]); /* t[3] < 2^w-1 */
  ularith_add_2ul_2ul (&(t[1]), &(t[2]), t[0], t[3]); /* t[2] < 2^w */

  /* r = (k*m + s) / w, k <= w-1. If s < m, then r < m */
  r[0] = t[1];
  r[1] = t[2];
}
102
/* Converts s out of Montgomery form by dividing by 2^(2*LONG_BIT).
   Requires s < m; then the result is < m and is the plain integer
   value that the residue s represents. */
MAYBE_UNUSED
static inline void
modredc2ul2_frommontgomery (residueredc2ul2_t r, const residueredc2ul2_t s,
			    const modulusredc2ul2_t m)
{
  /* Do two REDC steps, each dividing by 2^LONG_BIT (mod m) */
  modredc2ul2_redc1 (r, s, m);
  modredc2ul2_redc1 (r, r, m);
}
114
115
/* In expensive-assertion builds on x86, ABORT_IF_CY expands to inline-asm
   text that calls abort() if the carry flag is set; it is spliced into the
   asm blocks below to verify their "no carry here" claims. In normal
   builds it expands to nothing. */
#ifdef WANT_ASSERT_EXPENSIVE
#if defined(__x86_64__)
#define ABORT_IF_CY "jnc 1f\n\tlea _GLOBAL_OFFSET_TABLE_(%%rip), %%rbx\n\tcall abort@plt\n1:\n\t"
#elif defined(__i386__)
#define ABORT_IF_CY "jnc 1f\n\tcall abort\n1:\n\t"
#endif
#else
#define ABORT_IF_CY
#endif
125
/* Computes r = a * b / 2^LONG_BIT (mod m) for a two-word residue a and a
   one-word factor b, i.e. a Montgomery multiplication with a one-word
   REDC. Requires a < m; the result satisfies r < m. */
MAYBE_UNUSED
static inline void
_modredc2ul2_mul_ul (residueredc2ul2_t r, const residueredc2ul2_t a,
		     const unsigned long b, const modulusredc2ul2_t m)
{
  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
#if defined(MODTRACE)
  printf ("((%lu * 2^%d + %lu) * %lu / 2^%d) %% (%lu * 2^%d + %lu)",
	  a[1], LONG_BIT, a[0], b, LONG_BIT, m[0].m[1], LONG_BIT, m[0].m[0]);
#endif

#if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
  unsigned long u = m[0].invm, a0 = a[0];
  __asm__ __VOLATILE (
    /* Product of low words */
    "mulq %[b]\n\t"         /* rdx:rax = a0*b <= (2^64-1)^2 */
    "movq %%rdx, %[t0]\n\t" /* t0:rax = a0*b <= (2^64-1)^2 */
    /* Compute u such that a0*b + u*m == 0 (mod 2^64), compute u*m, add to t0:rax */
    "imulq %[u], %%rax\n\t"
    "movq %%rax, %[u]\n\t"  /* u <= 2^64-1 */
    "xorl %k[t1], %k[t1]\n\t"
    "mulq %[m0]\n\t"        /* rdx:rax = u*m0 <= (2^64-1)^2 */
    "negq %%rax\n\t"        /* if low word != 0, carry to high word */
    "movq %[u], %%rax\n\t"  /* rax = u, independent, goes in pipe 0 */
    "adcq %%rdx, %[t0]\n\t"
    "setc %b[t1]\n\t"       /* t1:t0 = (a0*b + u*m0) / 2^64 <= 2*2^64 - 4 */
    "mulq %[m1]\n\t"        /* rdx:rax = u*m1 */
    "addq %%rax, %[t0]\n\t"
    "movq %[a1], %%rax\n\t" /* independent, goes in pipe 0 */
    "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0*b+u*m)/2^64 <= 2^126 - 2^62 */
    /* Free slot in pipe 2 here */
    ABORT_IF_CY

    /* Product of low and high word */
    "mulq %[b]\n\t"         /* rdx:rax = a1*b <= (2^63-2^32-1)*(2^64-1) */
    "addq %%rax, %[t0]\n\t"
    "adcq %%rdx, %[t1]\n\t" /* t1:t0 = ((a1*2^64 + a0)*b + u*m) / 2^64
                               <= ((2^126-1)*(2^64-1) + (2^64-1)*(2^126-1)) / 2^64
                               < 2^127 - 2^63 - 1, thus no carry */
    ABORT_IF_CY
    /* t1:t0 = ((a1*2^64 + a0)*b + u*m) / 2^64
       <= ((m-1)*(2^64-1) + (2^64-1)*m) / 2^64
       = 2*m*(1-1/2^64) - 1*(1-1/2^64). May need to subtract m. */
    "movq %[t0], %%rax\n\t" /* See if result > m */
    "movq %[t1], %%rdx\n\t"
    "subq %[m0], %[t0]\n\t" /* Try subtracting m, see if there's a carry */
    "sbbq %[m1], %[t1]\n\t"
    "cmovc %%rax, %[t0]\n\t" /* Carry -> restore old result */
    "cmovc %%rdx, %[t1]\n\t"
    : [u] "+&r" (u), [t0] "=&r" (r[0]), [t1] "=&r" (r[1]), [a0] "+&a" (a0)
    : [a1] "g" (a[1]), [b] "rm" (b),
      [m0] "rm" (m[0].m[0]), [m1] "rm" (m[0].m[1])
    : "%rdx", "cc"
  );
#else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */
  unsigned long pl, ph, t[2];

  /* m < 1/4 W^2, a < m, b < W */

  /* Product of b and low word */
  ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), a[0], b); /* t1:t0 = a0*b < W^2 */

  /* One REDC step */
  modredc2ul2_redc1 (t, t, m); /* t1:t0 = (a0*b + k*m) / W < m + W < 1/4 W^2 + W */

  /* Product of b and high word */
  ularith_mul_ul_ul_2ul (&pl, &ph, a[1], b); /* ph:pl < 1/4 W^2 */
  ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 1/2 W^2 + W */

  /* Subtract m if the result is >= m, fully reducing it */
  ularith_sub_2ul_2ul_ge (&(t[0]), &(t[1]), m[0].m[0], m[0].m[1]);

  r[0] = t[0];
  r[1] = t[1];
#endif
#if defined(MODTRACE)
  printf (" == (%lu * 2^%d + %lu) /* PARI */ \n", r[1], LONG_BIT, r[0]);
#endif
  ASSERT_EXPENSIVE (modredc2ul2_intlt (r, m[0].m));
}
205
206
207 /* ================= Functions that are part of the API ================= */
208
209 /* Some functions for integers of the same width as the modulus */
210
211 MAYBE_UNUSED
212 static inline void
modredc2ul2_intinit(modintredc2ul2_t r)213 modredc2ul2_intinit (modintredc2ul2_t r)
214 {
215 r[0] = 0;
216 r[1] = 0;
217 }
218
219
220 MAYBE_UNUSED
221 static inline void
modredc2ul2_intclear(modintredc2ul2_t r MAYBE_UNUSED)222 modredc2ul2_intclear (modintredc2ul2_t r MAYBE_UNUSED)
223 {
224 return;
225 }
226
227
228 MAYBE_UNUSED
229 static inline void
modredc2ul2_intset(modintredc2ul2_t r,const modintredc2ul2_t s)230 modredc2ul2_intset (modintredc2ul2_t r, const modintredc2ul2_t s)
231 {
232 r[0] = s[0];
233 r[1] = s[1];
234 }
235
236 MAYBE_UNUSED
237 static inline void
modredc2ul2_intset_ul(modintredc2ul2_t r,const unsigned long s)238 modredc2ul2_intset_ul (modintredc2ul2_t r, const unsigned long s)
239 {
240 r[0] = s;
241 r[1] = 0UL;
242 }
243
244 MAYBE_UNUSED
245 static inline void
modredc2ul2_intset_uls(modintredc2ul2_t r,const unsigned long * s,const size_t n)246 modredc2ul2_intset_uls (modintredc2ul2_t r, const unsigned long *s,
247 const size_t n)
248 {
249 if (n == 0) {
250 r[0] = 0;
251 r[1] = 0;
252 } else if (n == 1) {
253 r[0] = s[0];
254 r[1] = 0;
255 } else if (n == 2) {
256 r[0] = s[0];
257 r[1] = s[1];
258 } else
259 abort();
260 }
261
/* Get the least significant unsigned long of r */
MAYBE_UNUSED
static inline unsigned long
modredc2ul2_intget_ul (const modintredc2ul2_t r)
{
  return r[0];
}
269
270 MAYBE_UNUSED
271 static inline size_t
modredc2ul2_intget_uls(unsigned long * r,const modintredc2ul2_t s)272 modredc2ul2_intget_uls (unsigned long *r, const modintredc2ul2_t s)
273 {
274 r[0] = s[0];
275 if (s[1] != 0) {
276 r[1] = s[1];
277 return 2;
278 }
279 return 1;
280 }
281
282 MAYBE_UNUSED
283 static inline double
modredc2ul2_intget_double(const modintredc2ul2_t s)284 modredc2ul2_intget_double (const modintredc2ul2_t s)
285 {
286 double d = (double) s[1];
287 #if (LONG_BIT == 32)
288 d *= 4294967296.0;
289 #elif (LONG_BIT == 64)
290 d *= 18446744073709551616.0;
291 #else
292 #error "unsupported value of LONG_BIT"
293 #endif
294 return d + (double) s[0];
295 }
296
297 MAYBE_UNUSED
298 static inline int
modredc2ul2_intequal(const modintredc2ul2_t a,const modintredc2ul2_t b)299 modredc2ul2_intequal (const modintredc2ul2_t a, const modintredc2ul2_t b)
300 {
301 return (a[0] == b[0] && a[1] == b[1]);
302 }
303
304 MAYBE_UNUSED
305 static inline int
modredc2ul2_intequal_ul(const modintredc2ul2_t a,const unsigned long b)306 modredc2ul2_intequal_ul (const modintredc2ul2_t a, const unsigned long b)
307 {
308 return (a[0] == b && a[1] == 0UL);
309 }
310
311 /* Returns 1 if a < b, 0 otherwise */
312 MAYBE_UNUSED
313 static inline int
modredc2ul2_intlt(const modintredc2ul2_t a,const modintredc2ul2_t b)314 modredc2ul2_intlt (const modintredc2ul2_t a, const modintredc2ul2_t b)
315 {
316 modintredc2ul2_t t;
317
318 modredc2ul2_intset (t, a);
319 return ularith_sub_2ul_2ul_cy (&(t[0]), &(t[1]), b[0], b[1]);
320 }
321
322 MAYBE_UNUSED
323 static inline int
modredc2ul2_intcmp(const modintredc2ul2_t a,const modintredc2ul2_t b)324 modredc2ul2_intcmp (const modintredc2ul2_t a, const modintredc2ul2_t b)
325 {
326 if (a[1] < b[1])
327 return -1;
328 if (a[1] > b[1])
329 return 1;
330 return (a[0] < b[0]) ? -1 : (a[0] == b[0]) ? 0 : 1;
331 }
332
333 MAYBE_UNUSED
334 static inline int
modredc2ul2_intcmp_ul(const modintredc2ul2_t a,const unsigned long b)335 modredc2ul2_intcmp_ul (const modintredc2ul2_t a, const unsigned long b)
336 {
337 if (a[1] > 0UL)
338 return 1;
339 return (a[0] < b) ? -1 : (a[0] == b) ? 0 : 1;
340 }
341
342 MAYBE_UNUSED
343 static inline int
modredc2ul2_intcmp_uint64(const modintredc2ul2_t a,const uint64_t b)344 modredc2ul2_intcmp_uint64 (const modintredc2ul2_t a, const uint64_t b)
345 {
346 ASSERT(ULONG_MAX == UINT32_MAX || ULONG_MAX == UINT64_MAX);
347 if (ULONG_MAX == UINT32_MAX) {
348 uint64_t t = ((uint64_t) a[1] << 32) + a[0];
349 return (t < b) ? -1 : (t == b) ? 0 : 1;
350 } else {
351 if (a[1] > 0UL)
352 return 1;
353 return (a[0] < b) ? -1 : (a[0] == b) ? 0 : 1;
354 }
355 }
356
357 MAYBE_UNUSED
358 static inline int
modredc2ul2_intfits_ul(const modintredc2ul2_t a)359 modredc2ul2_intfits_ul (const modintredc2ul2_t a)
360 {
361 return (a[1] == 0UL);
362 }
363
364 MAYBE_UNUSED
365 static inline void
modredc2ul2_intadd(modintredc2ul2_t r,const modintredc2ul2_t a,const modintredc2ul2_t b)366 modredc2ul2_intadd (modintredc2ul2_t r, const modintredc2ul2_t a,
367 const modintredc2ul2_t b)
368 {
369 modintredc2ul2_t t;
370 modredc2ul2_intset (t, a);
371 ularith_add_2ul_2ul (&(t[0]), &(t[1]), b[0], b[1]);
372 modredc2ul2_intset (r, t);
373 }
374
375 MAYBE_UNUSED
376 static inline void
modredc2ul2_intsub(modintredc2ul2_t r,const modintredc2ul2_t a,const modintredc2ul2_t b)377 modredc2ul2_intsub (modintredc2ul2_t r, const modintredc2ul2_t a,
378 const modintredc2ul2_t b)
379 {
380 modintredc2ul2_t t;
381 modredc2ul2_intset (t, a);
382 ularith_sub_2ul_2ul (&(t[0]), &(t[1]), b[0], b[1]);
383 modredc2ul2_intset (r, t);
384 }
385
386 /* Returns the number of bits in a, that is, floor(log_2(a))+1.
387 For a == 0 returns 0. */
388 MAYBE_UNUSED
389 static inline size_t
modredc2ul2_intbits(const modintredc2ul2_t a)390 modredc2ul2_intbits (const modintredc2ul2_t a)
391 {
392 if (a[1] > 0UL)
393 return 2*LONG_BIT - ularith_clz (a[1]);
394
395 if (a[0] > 0UL)
396 return LONG_BIT - ularith_clz (a[0]);
397
398 return 0;
399 }
400
401
/* r = trunc(s / 2^i). r and s may be the same object; statement order
   in each branch keeps the aliasing case correct. Each individual C
   shift count stays below LONG_BIT, avoiding undefined behavior. */
MAYBE_UNUSED
static inline void
modredc2ul2_intshr (modintredc2ul2_t r, const modintredc2ul2_t s, const int i)
{
  if (i >= 2 * LONG_BIT) {
    /* Everything shifted out */
    r[1] = r[0] = 0UL;
  } else if (i >= LONG_BIT) {
    r[0] = s[1] >> (i - LONG_BIT);
    r[1] = 0UL; /* May overwrite s[1] */
  } else { /* i < LONG_BIT */
    /* Low word receives bits shifted down from the high word */
    ularith_shrd (&(r[0]), s[1], s[0], i);
    r[1] = s[1] >> i;
  }
}
417
418
/* r = (s * 2^i) % (2^(2 * LONG_BIT)). r and s may be the same object;
   statement order in each branch keeps the aliasing case correct. Each
   individual C shift count stays below LONG_BIT. */
MAYBE_UNUSED
static inline void
modredc2ul2_intshl (modintredc2ul2_t r, const modintredc2ul2_t s, const int i)
{
  if (i >= 2 * LONG_BIT) {
    /* Everything shifted out */
    r[1] = r[0] = 0UL;
  } else if (i >= LONG_BIT) {
    r[1] = s[0] << (i - LONG_BIT);
    r[0] = 0UL;
  } else { /* i < LONG_BIT */
    /* High word receives bits shifted up from the low word */
    ularith_shld (&(r[1]), s[0], s[1], i);
    r[0] = s[0] << i;
  }
}
434
435
/* r = n/d. We require d|n. Works by removing the power of 2 common to
   n and d, then multiplying word-wise by the inverse of the remaining
   odd d modulo 2^LONG_BIT. */
MAYBE_UNUSED
static inline void
modredc2ul2_intdivexact (modintredc2ul2_t r, const modintredc2ul2_t n,
			 const modintredc2ul2_t d)
{
  modintredc2ul2_t n1, d1;
  unsigned long invf, r0, k0, k1;
  int i;
#ifdef WANT_ASSERT_EXPENSIVE
  unsigned long s0 = n[0], s1 = n[1];
#endif

  modredc2ul2_intset (n1, n);
  modredc2ul2_intset (d1, d);

  /* Make d odd. First, if d is a multiple of 2^LONG_BIT, shift both
     n and d down by a full word (d|n implies n is a multiple too) */
  if (d1[0] == 0UL)
    {
      ASSERT (n1[0] == 0UL);
      d1[0] = d1[1];
      d1[1] = 0UL;
      n1[0] = n1[1];
      n1[1] = 0UL;
    }
  ASSERT(d1[0] != 0UL);
  /* Shift out the remaining common factors of 2 */
  i = ularith_ctz (d1[0]);
  ularith_shrd (&(d1[0]), d1[1], d1[0], i);
  d1[1] >>= i;
  ASSERT((n1[0] & ((1UL << i) - 1UL)) == 0UL);
  ularith_shrd (&(n1[0]), n1[1], n1[0], i);
  n1[1] >>= i;

  /* invf = inverse of d1[0] (mod 2^LONG_BIT); the low result word is
     r0 = invf * n1[0]. Subtract r0*d1 from n1; what remains of n1[1]
     times invf is the high result word. */
  invf = ularith_invmod (d1[0]);
  r0 = invf * n1[0];
  ularith_mul_ul_ul_2ul (&k0, &k1, r0, d1[0]);
  ularith_sub_2ul_2ul (&(n1[0]), &(n1[1]), k0, k1);
  n1[1] -= r0 * d1[1];
  ASSERT (n1[0] == 0UL);
  r[0] = r0;
  r[1] = invf * n1[1];
#ifdef WANT_ASSERT_EXPENSIVE
  /* Verify r * d == n using word arithmetic on the saved copy of n */
  ASSERT_EXPENSIVE (d[1] == 0UL || r[1] == 0UL);
  ularith_mul_ul_ul_2ul (&k0, &k1, r[0], d[0]);
  ularith_sub_2ul_2ul (&s0, &s1, k0, k1);
  ASSERT_EXPENSIVE (s0 == 0UL);
  ularith_mul_ul_ul_2ul (&k0, &k1, r[0], d[1]);
  ASSERT_EXPENSIVE (k1 == 0UL);
  s1 -= k0;
  ularith_mul_ul_ul_2ul (&k0, &k1, r[1], d[0]);
  ASSERT_EXPENSIVE (k1 == 0UL);
  s1 -= k0;
  ASSERT_EXPENSIVE (s1 == 0UL);
#endif
}
491
/* Returns n % d for a two-word n and single-word divisor d */
MAYBE_UNUSED
static inline unsigned long
modredc2ul2_intmod_ul (const modintredc2ul2_t n, const unsigned long d)
{
  unsigned long r;
  if (n[1] < d)
    {
      /* High word already reduced: one 2-word/1-word division suffices */
      ularith_div_2ul_ul_ul_r (&r, n[0], n[1], d);
    }
  else
    {
      /* Reduce the high word first (t = n[1]*2^LONG_BIT... reduced via a
         division with zero high part), then divide t:n[0] */
      unsigned long t;
      ularith_div_2ul_ul_ul_r (&t, n[1], 0UL, d);
      ularith_div_2ul_ul_ul_r (&r, n[0], t, d);
    }
  return r;
}
509
/* r = n % d */
MAYBE_UNUSED
static inline void
modredc2ul2_intmod (modintredc2ul2_t r, const modintredc2ul2_t n,
		    const modintredc2ul2_t d)
{
  if (d[1] == 0UL)
    {
      /* Single-word divisor: delegate */
      r[0] = modredc2ul2_intmod_ul (n, d[0]);
      r[1] = 0;
    }
  else
    {
      modintredc2ul2_t t1, t2;
      unsigned long q, dummy;
      int i;

      /* Divide both by 2^k s.t. 2^(LONG_BIT-1) <= d/2^k < 2^LONG_BIT */
      modredc2ul2_intshr (t1, n, 1);
      modredc2ul2_intshr (t2, d, 1);
      if (t2[1] != 0UL)
	{
	  i = LONG_BIT - ularith_clz (t2[1]);
	  modredc2ul2_intshr (t1, t1, i);
	  modredc2ul2_intshr (t2, t2, i);
	}
      ASSERT(t2[1] == 0);
      ASSERT((t2[0] & (1UL << (LONG_BIT - 1))) != 0UL);
      ASSERT (t1[1] < t2[0]);

      /* q approximates floor(n/d); t1 = q*d may overshoot n, which the
         correction below fixes by subtracting one d */
      ularith_div_2ul_ul_ul (&q, &dummy, t1[0], t1[1], t2[0]);
      ularith_mul_ul_ul_2ul (&(t1[0]), &(t1[1]), q, d[0]);
      t1[1] += q * d[1];
      /* printf ("n=%lu*2^%d + %lu; d=%lu*2^%d + %lu; q=%lu; "
	         "t1=%lu*2^%d + %lu;\n",
	         n[1], LONG_BIT, n[0], d[1], LONG_BIT, d[0], q,
	         t1[1], LONG_BIT, t1[0]); */

      if (modredc2ul2_intlt (n, t1))
	modredc2ul2_intsub (t1, t1, d);
      ASSERT(!modredc2ul2_intlt (n, t1));
      /* Now n - t1 = n - floor(n/d)*d = n % d */
      modredc2ul2_intsub(r, n, t1);
    }
}
554
555
556 /* Functions for the modulus */
557
/* Init the modulus from modintredc2ul2_t. Stores the modulus, the REDC
   constant invm = -1/m (mod 2^LONG_BIT), and 1 in Montgomery form. */
MAYBE_UNUSED
static inline void
modredc2ul2_initmod_int (modulusredc2ul2_t m, const modintredc2ul2_t s)
{
  /* The modulus must use both words but leave the top two bits clear */
  ASSERT (s[1] > 0UL);
  ASSERT (s[1] < (1UL << (LONG_BIT - 2)));
  modredc2ul2_intset (m[0].m, s);
  m[0].invm = -ularith_invmod (s[0]);

  /* one = 2^(2*LONG_BIT) mod m, computed as (2^(2*LONG_BIT) - m) mod m
     since 2^(2*LONG_BIT) itself does not fit in two words */
  modredc2ul2_intset_ul (m[0].one, 0UL);
  modredc2ul2_intsub (m[0].one, m[0].one, m[0].m); /* 2^128 - m */
  modredc2ul2_intmod (m[0].one, m[0].one, m[0].m);

#ifdef WANT_ASSERT_EXPENSIVE
  {
    /* Converting "one" out of Montgomery form must yield the integer 1 */
    modintredc2ul2_t t;
    modredc2ul2_get_int (t, m[0].one, m);
    ASSERT_EXPENSIVE (modredc2ul2_intequal_ul (t, 1UL));
  }
#endif
}
580
581
582 /* Returns the modulus as a modintredc2ul2_t. */
583 MAYBE_UNUSED
584 static inline void
modredc2ul2_getmod_int(modintredc2ul2_t r,const modulusredc2ul2_t m)585 modredc2ul2_getmod_int (modintredc2ul2_t r, const modulusredc2ul2_t m)
586 {
587 modredc2ul2_intset (r, m[0].m);
588 }
589
590
591 MAYBE_UNUSED
592 static inline void
modredc2ul2_clearmod(modulusredc2ul2_t m MAYBE_UNUSED)593 modredc2ul2_clearmod (modulusredc2ul2_t m MAYBE_UNUSED)
594 {
595 return;
596 }
597
598
599 /* Functions for residues */
600
601 /* Initialises a residueredc2ul2_t and sets it to zero */
602 MAYBE_UNUSED
603 static inline void
modredc2ul2_init(residueredc2ul2_t r,const modulusredc2ul2_t m MAYBE_UNUSED)604 modredc2ul2_init (residueredc2ul2_t r, const modulusredc2ul2_t m MAYBE_UNUSED)
605 {
606 modredc2ul2_intset_ul (r, 0UL);
607 }
608
609
610 /* Initialises a residueredc2ul2_t, but does not set it to zero. For fixed
611 length residueredc2ul2_t, that leaves nothing to do at all. */
612 MAYBE_UNUSED
613 static inline void
modredc2ul2_init_noset0(residueredc2ul2_t r MAYBE_UNUSED,const modulusredc2ul2_t m MAYBE_UNUSED)614 modredc2ul2_init_noset0 (residueredc2ul2_t r MAYBE_UNUSED,
615 const modulusredc2ul2_t m MAYBE_UNUSED)
616 {
617 return;
618 }
619
620
621 MAYBE_UNUSED
622 static inline void
modredc2ul2_clear(residueredc2ul2_t r MAYBE_UNUSED,const modulusredc2ul2_t m MAYBE_UNUSED)623 modredc2ul2_clear (residueredc2ul2_t r MAYBE_UNUSED,
624 const modulusredc2ul2_t m MAYBE_UNUSED)
625 {
626 return;
627 }
628
629
630 MAYBE_UNUSED
631 static inline void
modredc2ul2_set(residueredc2ul2_t r,const residueredc2ul2_t s,const modulusredc2ul2_t m MAYBE_UNUSED)632 modredc2ul2_set (residueredc2ul2_t r, const residueredc2ul2_t s,
633 const modulusredc2ul2_t m MAYBE_UNUSED)
634 {
635 ASSERT_EXPENSIVE (modredc2ul2_intlt (s, m[0].m));
636 modredc2ul2_intset (r, s);
637 }
638
639
/* Sets r to the residue class of the integer s (any unsigned long value),
   converting into Montgomery form */
MAYBE_UNUSED
static inline void
modredc2ul2_set_ul (residueredc2ul2_t r, const unsigned long s,
		    const modulusredc2ul2_t m)
{
  modredc2ul2_intset_ul (r, s);
  modredc2ul2_tomontgomery (r, r, m);
#ifdef WANT_ASSERT_EXPENSIVE
  {
    /* Converting back out of Montgomery form must reproduce s */
    modintredc2ul2_t t;
    modredc2ul2_get_int (t, r, m);
    ASSERT_EXPENSIVE (t[0] == s && t[1] == 0UL);
  }
#endif
}
655
656
/* Sets the residueredc2ul2_t to the class represented by the integer s.
   Assumes that s is reduced (mod m), i.e. 0 <= s < m */

MAYBE_UNUSED
static inline void
modredc2ul2_set_ul_reduced (residueredc2ul2_t r, const unsigned long s,
			    const modulusredc2ul2_t m MAYBE_UNUSED)
{
  /* An unsigned long is always < m here (m has a non-zero high word),
     so this simply forwards to the general set function */
  modredc2ul2_set_ul (r, s, m);
}
667
668
669 MAYBE_UNUSED
670 static inline void
modredc2ul2_set_int(residueredc2ul2_t r,const modintredc2ul2_t s,const modulusredc2ul2_t m)671 modredc2ul2_set_int (residueredc2ul2_t r, const modintredc2ul2_t s,
672 const modulusredc2ul2_t m)
673 {
674 if (!modredc2ul2_intlt (s, m[0].m))
675 modredc2ul2_intmod (r, s, m[0].m);
676 else
677 modredc2ul2_intset (r, s);
678
679 modredc2ul2_tomontgomery (r, r, m);
680 }
681
682
/* Sets r to the residue class of the two-word integer s, which must
   already be reduced (0 <= s < m), converting into Montgomery form */
MAYBE_UNUSED
static inline void
modredc2ul2_set_int_reduced (residueredc2ul2_t r, const modintredc2ul2_t s,
			     const modulusredc2ul2_t m)
{
  ASSERT (modredc2ul2_intlt (s, m[0].m));
  modredc2ul2_intset (r, s);
  modredc2ul2_tomontgomery (r, r, m);
}
692
693
694 MAYBE_UNUSED
695 static inline void
modredc2ul2_set0(residueredc2ul2_t r,const modulusredc2ul2_t m MAYBE_UNUSED)696 modredc2ul2_set0 (residueredc2ul2_t r, const modulusredc2ul2_t m MAYBE_UNUSED)
697 {
698 modredc2ul2_intset_ul (r, 0UL);
699 }
700
701
702 MAYBE_UNUSED
703 static inline void
modredc2ul2_set1(residueredc2ul2_t r,const modulusredc2ul2_t m)704 modredc2ul2_set1 (residueredc2ul2_t r, const modulusredc2ul2_t m)
705 {
706 modredc2ul2_intset (r, m[0].one);
707 }
708
709
710 /* Exchanges the values of the two arguments */
711
712 MAYBE_UNUSED
713 static inline void
modredc2ul2_swap(residueredc2ul2_t a,residueredc2ul2_t b,const modulusredc2ul2_t m MAYBE_UNUSED)714 modredc2ul2_swap (residueredc2ul2_t a, residueredc2ul2_t b,
715 const modulusredc2ul2_t m MAYBE_UNUSED)
716 {
717 modintredc2ul2_t t;
718 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
719 ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));
720 modredc2ul2_intset (t, a);
721 modredc2ul2_intset (a, b);
722 modredc2ul2_intset (b, t);
723 }
724
725
/* Returns the least significant unsigned long of the residue. How to signal
   if the residue does not fit in one unsigned long? */

MAYBE_UNUSED
static inline unsigned long
modredc2ul2_get_ul (const residueredc2ul2_t s,
		    const modulusredc2ul2_t m MAYBE_UNUSED)
{
  residueredc2ul2_t t;
  ASSERT_EXPENSIVE (modredc2ul2_intlt (s, m[0].m));
  /* Convert out of Montgomery form; currently asserts the value fits in
     one word rather than signalling overflow to the caller */
  modredc2ul2_frommontgomery (t, s, m);
  ASSERT (t[1] == 0UL);
  return t[0];
}
740
741
/* Returns the residue as a modintredc2ul2_t, i.e. the plain integer
   value obtained by converting s out of Montgomery form */

MAYBE_UNUSED
static inline void
modredc2ul2_get_int (modintredc2ul2_t r, const residueredc2ul2_t s,
		     const modulusredc2ul2_t m MAYBE_UNUSED)
{
  ASSERT_EXPENSIVE (modredc2ul2_intlt (s, m[0].m));
  modredc2ul2_frommontgomery (r, s, m);
}
752
753
754 MAYBE_UNUSED
755 static inline int
modredc2ul2_equal(const residueredc2ul2_t a,const residueredc2ul2_t b,const modulusredc2ul2_t m MAYBE_UNUSED)756 modredc2ul2_equal (const residueredc2ul2_t a, const residueredc2ul2_t b,
757 const modulusredc2ul2_t m MAYBE_UNUSED)
758 {
759 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
760 ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));
761 return (modredc2ul2_intequal(a, b));
762 }
763
764
765 MAYBE_UNUSED
766 static inline int
modredc2ul2_is0(const residueredc2ul2_t a,const modulusredc2ul2_t m MAYBE_UNUSED)767 modredc2ul2_is0 (const residueredc2ul2_t a, const modulusredc2ul2_t m MAYBE_UNUSED)
768 {
769 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
770 return (modredc2ul2_intequal_ul(a, 0UL));
771 }
772
773
774 MAYBE_UNUSED
775 static inline int
modredc2ul2_is1(const residueredc2ul2_t a,const modulusredc2ul2_t m MAYBE_UNUSED)776 modredc2ul2_is1 (const residueredc2ul2_t a, const modulusredc2ul2_t m MAYBE_UNUSED)
777 {
778 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
779 return (modredc2ul2_intequal(a, m[0].one));
780 }
781
782
/* r = a + b (mod m). Requires a, b < m; ensures r < m.
   r, a and b may overlap. */
MAYBE_UNUSED
static inline void
modredc2ul2_add (residueredc2ul2_t r, const residueredc2ul2_t a,
		 const residueredc2ul2_t b, const modulusredc2ul2_t m)
{
  const unsigned long t0 = b[0], t1 = b[1]; /* r, a, and/or b may overlap */
  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
  ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));

  modredc2ul2_intset (r, a);
  ularith_add_2ul_2ul (&(r[0]), &(r[1]), t0, t1);
  /* a + b < 2m fits in two words (m has two zero top bits); reduce by
     subtracting m if the sum is >= m */
  ularith_sub_2ul_2ul_ge (&(r[0]), &(r[1]), m[0].m[0], m[0].m[1]);
  ASSERT_EXPENSIVE (modredc2ul2_intlt (r, m[0].m));
}
797
798
/* r = a - b (mod m). Requires a, b < m; ensures r < m.
   r may alias a or b. */
MAYBE_UNUSED
static inline void
modredc2ul2_sub (residueredc2ul2_t r, const residueredc2ul2_t a,
		 const residueredc2ul2_t b, const modulusredc2ul2_t m)
{
  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
  ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));

#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
  {
    /* Compute t = a - b; if that borrowed, add back m, otherwise add 0
       (s is conditionally zeroed by the cmovncq pair) */
    unsigned long s1 = m[0].m[0], s2 = m[0].m[1], t1 = a[0], t2 = a[1];

    __asm__ __VOLATILE (
      "subq %4, %0\n\t"
      "sbbq %5, %1\n\t"    /* t -= b */
      "cmovncq %6, %2\n\t" /* If !carry, s = 0 */
      "cmovncq %6, %3\n"
      : "+&r" (t1), "+&r" (t2), "+&r" (s1), "+r" (s2)
      : "g" (b[0]), "g" (b[1]), "rm" (0UL)
      : "cc"
    );
    ularith_add_2ul_2ul (&t1, &t2, s1, s2);
    r[0] = t1;
    r[1] = t2;
  }
#else
  {
    unsigned long t1 = a[0], t2 = a[1];
    ularith_sub_2ul_2ul (&t1, &t2, b[0], b[1]);

    /* The subtraction wrapped iff the result is larger than a;
       in that case add m to bring it back into [0, m) */
    if (t2 > a[1] || (t2 == a[1] && t1 > a[0]))
      ularith_add_2ul_2ul (&t1, &t2, m[0].m[0], m[0].m[1]);

    r[0] = t1;
    r[1] = t2;
  }
#endif
}
837
838
/* r = a + 1 (mod m), adding the precomputed Montgomery form of 1 */
MAYBE_UNUSED
static inline void
modredc2ul2_add1 (residueredc2ul2_t r, const residueredc2ul2_t a,
		  const modulusredc2ul2_t m)
{
  modredc2ul2_add(r, a, m[0].one, m);
}
846
847
/* r = a + b (mod m) for an unsigned long b, by converting b to
   Montgomery form first (expensive; see TODO) */
MAYBE_UNUSED
static inline void
modredc2ul2_add_ul (residueredc2ul2_t r, const residueredc2ul2_t a,
		    const unsigned long b, const modulusredc2ul2_t m)
{
  residueredc2ul2_t t;

  /* TODO: speed up */
  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
  modredc2ul2_init_noset0 (t, m);
  modredc2ul2_set_ul (t, b, m);
  modredc2ul2_add (r, a, t, m);
  modredc2ul2_clear (t, m);
}
862
863
/* r = a - b (mod m) for an unsigned long b, by converting b to
   Montgomery form first (expensive; see TODO) */
MAYBE_UNUSED
static inline void
modredc2ul2_sub_ul (residueredc2ul2_t r, const residueredc2ul2_t a,
		    const unsigned long b, const modulusredc2ul2_t m)
{
  residueredc2ul2_t t;

  /* TODO: speed up */
  ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
  modredc2ul2_init_noset0 (t, m);
  modredc2ul2_set_ul (t, b, m);
  modredc2ul2_sub (r, a, t, m);
  modredc2ul2_clear (t, m);
}
878
879
880 MAYBE_UNUSED
881 static inline void
modredc2ul2_neg(residueredc2ul2_t r,const residueredc2ul2_t a,const modulusredc2ul2_t m)882 modredc2ul2_neg (residueredc2ul2_t r, const residueredc2ul2_t a,
883 const modulusredc2ul2_t m)
884 {
885 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
886 if (modredc2ul2_is0 (a, m))
887 modredc2ul2_set (r, a, m);
888 else
889 modredc2ul2_intsub (r, m[0].m, a);
890 }
891
892
893 MAYBE_UNUSED
894 static inline void
modredc2ul2_div2(residueredc2ul2_t r,const residueredc2ul2_t a,const modulusredc2ul2_t m)895 modredc2ul2_div2 (residueredc2ul2_t r, const residueredc2ul2_t a,
896 const modulusredc2ul2_t m)
897 {
898 ASSERT_EXPENSIVE (m[0].m[0] % 2UL != 0UL);
899 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
900 modredc2ul2_intset (r, a);
901 if (r[0] % 2UL == 1UL)
902 modredc2ul2_intadd (r, r, m[0].m);
903 modredc2ul2_intshr (r, r, 1);
904 }
905
906
907 MAYBE_UNUSED
908 static inline void
modredc2ul2_mul(residueredc2ul2_t r,const residueredc2ul2_t a,const residueredc2ul2_t b,const modulusredc2ul2_t m)909 modredc2ul2_mul (residueredc2ul2_t r, const residueredc2ul2_t a,
910 const residueredc2ul2_t b, const modulusredc2ul2_t m)
911 {
912 #ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
913
914 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
915 ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));
916 #if defined(MODTRACE)
917 printf ("((%lu * 2^%d + %lu) * (%lu * 2^%d + %lu) / 2^%d) %% "
918 "(%lu * 2^%d + %lu)",
919 a[1], LONG_BIT, a[0], b[1], LONG_BIT, b[0], 2 * LONG_BIT,
920 m[0].m[1], LONG_BIT, m[0].m[0]);
921 #endif
922
923 unsigned long dummy;
924 __asm__ __VOLATILE (
925 /* Product of low words */
926 "movq %[a0], %%rax\n\t"
927 "mulq %[b0]\n\t" /* rdx:rax = a0*b0 <= (2^64-1)^2 */
928 "movq %%rdx, %[t0]\n\t"
929 /* Compute u0*m, add to t0:rax */
930 "imulq %[invm], %%rax\n\t"
931 "movq %%rax, %[t2]\n\t" /* t2 = u0 */
932 "xorl %k[t1], %k[t1]\n\t"
933 "mulq %[m0]\n\t" /* rdx:rax = u0*m0 <= (2^64-1)^2 */
934 "negq %%rax\n\t" /* if low word != 0, carry to high word */
935 "movq %[t2], %%rax\n\t" /* independent, goes in pipe 0 */
936 "adcq %%rdx, %[t0]\n\t"
937 "setc %b[t1]\n\t" /* t1:t0 = (a0*b0 + u0*m0) / 2^64 <= 2*2^64 - 4 */
938 "mulq %[m1]\n\t"
939 "addq %%rax, %[t0]\n\t"
940 "movq %[a0], %%rax\n\t" /* independent, goes in pipe 0 */
941 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0*b0+u0*m)/2^64 */
942 ABORT_IF_CY /* <= 2^126 - 2^62 */
943
944
945 /* 2 products of low and high word */
946 "xorl %k[t2], %k[t2]\n\t"
947 "mulq %[b1]\n\t" /* rdx:rax = a0*b1 <= (2^64-1)*(2^63-2^32-1) */
948 "addq %%rax, %[t0]\n\t"
949 "movq %[a1], %%rax\n\t" /* independent, goes in pipe 0 */
950 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0*b0+u0*m)/2^64 + a0*b1 */
951 ABORT_IF_CY /* <= 2^126 - 2^62 + (2^64-1)*(2^63-2^32-1)
952 = 2^127 + 2^126 - 2^96 ... */
953
954 /* Free slot here */
955 "mulq %[b0]\n\t" /* rdx:rax = a1*b0 <= (2^63-2^32-1)*(2^64-1) */
956 "addq %%rax, %[t0]\n\t"
957 "movq %[a1], %%rax\n\t" /* independent, goes in pipe 0 */
958 "adcq %%rdx, %[t1]\n\t"
959 "setc %b[t2]\n\t" /* t2:t1:t0 = (a0*b0+u0*m)/2^64 + a0*b1 + a1*b0 */
960 /* <= 2^126 - 2^62 + 2*(2^64-1)*(2^63-2^32-1)
961 = 2^128 + 2^126 - 2*2^96 ... */
962 /* Product of high words */
963 "mulq %[b1]\n\t" /* rdx:rax = a1*b1 <= (2^63-2^32-1)^2 */
964 "addq %%rax, %[t1]\n\t"
965 "movq %[t0], %%rax\n\t"
966 "adcq %%rdx, %[t2]\n\t" /* t2:t1:t0 = (a*b+u0*m)/2^64 */
967 ABORT_IF_CY /* <= ((2^127-2^96-1)^2+(2^64-1)*(2^126-2^64+1))/2^64
968 = 2^190 - 2^160 ... */
969 /* Free slot here */
970 /* Compute u1*m, add to t2:t1:t0 */
971 "imulq %[invm], %%rax\n\t"
972 "movq %%rax, %[t0]\n\t" /* t0 = u1 */
973 /* Free slot here */
974 "mulq %[m0]\n\t" /* rdx:rax = m0*u1 <= (2^64-1)^2 */
975 "negq %%rax\n\t" /* if low word != 0, carry to high word */
976 "movq %[t0], %%rax\n\t"
977 "adcq %%rdx, %[t1]\n\t"
978 "adcq $0,%[t2]\n\t" /* t2:t1:0 = (a*b+u0*m)/2^64 + u1*m0 */
979 ABORT_IF_CY /* <= 2^190 - 2^160 + 2*2^128 + 2^126 ... */
980
981 "mulq %[m1]\n\t" /* rdx:rax = u1*m1 */
982 "addq %%rax, %[t1]\n\t"
983 "adcq %%rdx, %[t2]\n\t" /* t2:t1 = ((a*b+u0*m)/2^64 + u1*m)/2^64 */
984 ABORT_IF_CY /* <= 2^127 - 2^96 - 1 */
985
986 "movq %[t1], %%rax\n\t" /* See if result > m */
987 "movq %[t2], %%rdx\n\t"
988 "subq %[m0], %[t1]\n\t"
989 "sbbq %[m1], %[t2]\n\t"
990 "cmovc %%rax, %[t1]\n\t" /* Carry -> restore old result */
991 "cmovc %%rdx, %[t2]\n\t"
992 : [t0] "=&r" (dummy), [t1] "=&r" (r[0]), [t2] "=&r" (r[1])
993 : [a0] "g" (a[0]), [a1] "g" (a[1]), [b0] "rm" (b[0]), [b1] "rm" (b[1]),
994 [m0] "rm" (m[0].m[0]), [m1] "rm" (m[0].m[1]), [invm] "rm" (m[0].invm)
995 : "%rax", "%rdx", "cc"
996 );
997 #else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */
998
999 unsigned long pl, ph, t[4], k;
1000
1001 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
1002 ASSERT_EXPENSIVE (modredc2ul2_intlt (b, m[0].m));
1003 #if defined(MODTRACE)
1004 printf ("((%lu * 2^%d + %lu) * (%lu * 2^%d + %lu) / 2^%d) %% "
1005 "(%lu * 2^%d + %lu)",
1006 a[1], LONG_BIT, a[0], b[1], LONG_BIT, b[0], 2 * LONG_BIT,
1007 m[0].m[1], LONG_BIT, m[0].m[0]);
1008 #endif
1009
1010 /* m < 1/4 W^2, a,b < m */
1011
1012 /* Product of the two low words */
1013 ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), a[0], b[0]);
1014
1015 /* One REDC step */
1016 modredc2ul2_redc1 (t, t, m); /* t < 2m < 1/2 W^2 */
1017
1018 /* Products of one low and one high word */
1019 ularith_mul_ul_ul_2ul (&pl, &ph, a[1], b[0]); /* ph:pl < 1/4 W^2 */
1020 ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 3/4 W^2 */
1021 ularith_mul_ul_ul_2ul (&pl, &ph, a[0], b[1]); /* ph:pl < 1/4 W^2 */
1022 ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < W^2 */
1023
1024 /* Product of the two high words */
1025 ularith_mul_ul_ul_2ul (&pl, &(t[2]), a[1], b[1]); /* t2:pl < 1/16 W^2 */
1026 ularith_add_ul_2ul (&(t[1]), &(t[2]), pl); /* t2:t1:t0 < 1/16 W^3 + W^2 */
1027
1028 /* Compute t2:t1:t0 := t2:t1:t0 + km, km < Wm < 1/4 W^3 */
1029 k = t[0] * m[0].invm;
1030 ularith_mul_ul_ul_2ul (&pl, &ph, k, m[0].m[0]);
1031 if (t[0] != 0UL)
1032 ph++; /* t[0] = 0 */
1033 ularith_add_ul_2ul (&(t[1]), &(t[2]), ph);
1034 ularith_mul_ul_ul_2ul (&pl, &ph, k, m[0].m[1]); /* ph:pl < 1/4 W^2 */
1035 ularith_add_2ul_2ul (&(t[1]), &(t[2]), pl, ph);
1036 /* t2:t1:0 < 1/16 W^3 + W^2 + 1/4 W^3 < 5/16 W^3 + W^2 */
1037
1038 /* Result may be larger than m, but is < 2*m */
1039
1040 ularith_sub_2ul_2ul_ge (&(t[1]), &(t[2]), m[0].m[0], m[0].m[1]);
1041
1042 r[0] = t[1];
1043 r[1] = t[2];
1044 #endif
1045 #if defined(MODTRACE)
1046 printf (" == (%lu * 2^%d + %lu) /* PARI */ \n", r[1], LONG_BIT, r[0]);
1047 #endif
1048 ASSERT_EXPENSIVE (modredc2ul2_intlt (r, m[0].m));
1049 }
1050
1051 MAYBE_UNUSED
1052 static inline void
modredc2ul2_sqr(residueredc2ul2_t r,const residueredc2ul2_t a,const modulusredc2ul2_t m)1053 modredc2ul2_sqr (residueredc2ul2_t r, const residueredc2ul2_t a,
1054 const modulusredc2ul2_t m)
1055 {
1056 #ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
1057
1058 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
1059 #if defined(MODTRACE)
1060 printf ("((%lu * 2^%d + %lu) * (%lu * 2^%d + %lu) / 2^%d) %% "
1061 "(%lu * 2^%d + %lu)",
1062 a[1], LONG_BIT, a[0], a[1], LONG_BIT, a[0], 2 * LONG_BIT,
1063 m[0].m[1], LONG_BIT, m[0].m[0]);
1064 #endif
1065
1066 /* m <= 2^126-1
1067 Since m1>0, m*u is maximal for m0=1 and u=2^64-1, so
1068 u*m is bounded by (2^126 - 2^64 + 1)*(2^64 - 1) =
1069 2^190 - 2^128 - 2^126 + 2*2^64 - 1.
1070 If a,b <= 2^127-2^96-1, then
1071 ((a*b+u0*m)/2^64 + u1*m)/2^64 <= 2^127-2^96-1
1072 If we allow non-canonical residues up to 2^127-2^96-1, we can skip
1073 the final conditional subtraction. These residues are still < 2^127,
1074 so an addition does not overflow */
1075
1076 unsigned long dummy;
1077 __asm__ __VOLATILE (
1078 /* Product of low words */
1079 "movq %[a0], %%rax\n\t"
1080 "mulq %[a0]\n\t" /* rdx:rax = a0^2 <= (2^64-1)^2 */
1081 "movq %%rdx, %[t0]\n\t"
1082 /* Compute u0*m, add to t0:rax */
1083 "imulq %[invm], %%rax\n\t"
1084 "movq %%rax, %[t2]\n\t" /* t2 = u0 */
1085 "xorl %k[t1], %k[t1]\n\t"
1086 "mulq %[m0]\n\t" /* rdx:rax = u0*m0 <= (2^64-1)^2 */
1087 "negq %%rax\n\t" /* if low word != 0, carry to high word */
1088 "movq %[t2], %%rax\n\t" /* independent, goes in pipe 0 */
1089 "adcq %%rdx, %[t0]\n\t"
1090 "setc %b[t1]\n\t" /* t1:t0 = (a0^2 + u0*m0) / 2^64 <= 2*2^64 - 4 */
1091 "mulq %[m1]\n\t"
1092 "addq %%rax, %[t0]\n\t"
1093 "movq %[a0], %%rax\n\t" /* independent, goes in pipe 0 */
1094 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0^2+u0*m)/2^64 */
1095 ABORT_IF_CY /* <= 2^126 - 2^62 */
1096
1097 /* 2 products of low and high word */
1098 "xorl %k[t2], %k[t2]\n\t"
1099 "mulq %[a1]\n\t" /* rdx:rax = a0*a1 <= (2^64-1)*(2^63-2^32-1) */
1100 "addq %%rax, %[t0]\n\t"
1101 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0^2+u0*m)/2^64 + a0*a1 */
1102 ABORT_IF_CY /* <= 2^126 - 2^62 + (2^64-1)*(2^63-2^32-1)
1103 = 2^127 + 2^126 - 2^96 ... */
1104 "addq %%rax, %[t0]\n\t"
1105 "adcq %%rdx, %[t1]\n\t"
1106 "movq %[a1], %%rax\n\t" /* independent, goes in pipe 0 */
1107 "setc %b[t2]\n\t" /* t2:t1:t0 = (a0*a0+u0*m)/2^64 + 2*a0*a1 */
1108 /* <= 2^126 - 2^62 + 2*(2^64-1)*(2^63-2^32-1)
1109 = 2^128 + 2^126 - 2*2^96 ... */
1110
1111 /* Product of high words */
1112 "mulq %[a1]\n\t" /* rdx:rax = a1^2 <= (2^63-2^32-1)^2 */
1113 "addq %%rax, %[t1]\n\t"
1114 "movq %[t0], %%rax\n\t"
1115 "adcq %%rdx, %[t2]\n\t" /* t2:t1:t0 = (a^2+u0*m)/2^64 */
1116 ABORT_IF_CY /* <= ((2^127-2^96-1)^2+(2^64-1)*(2^126-2^64+1))/2^64
1117 = 2^190 - 2^160 ... */
1118 /* Free slot here */
1119 /* Compute u1*m, add to t2:t1:t0 */
1120 "imulq %[invm], %%rax\n\t"
1121 "movq %%rax, %[t0]\n\t" /* t0 = u1 */
1122 /* Free slot here */
1123 "mulq %[m0]\n\t" /* rdx:rax = m0*u1 <= (2^64-1)^2 */
1124 "negq %%rax\n\t" /* if low word != 0, carry to high word */
1125 "movq %[t0], %%rax\n\t"
1126 "adcq %%rdx, %[t1]\n\t"
1127 "adcq $0,%[t2]\n\t" /* t2:t1:0 = (a*a+u0*m)/2^64 + u1*m0 */
1128 ABORT_IF_CY /* <= 2^190 - 2^160 + 2*2^128 + 2^126 ... */
1129
1130 "mulq %[m1]\n\t" /* rdx:rax = u1*m1 */
1131 "addq %%rax, %[t1]\n\t"
1132 "adcq %%rdx, %[t2]\n\t" /* t2:t1 = ((a*a+u0*m)/2^64 + u1*m)/2^64 */
1133 ABORT_IF_CY /* <= 2^127 - 2^96 - 1 */
1134
1135 "movq %[t1], %%rax\n\t" /* See if result > m */
1136 "movq %[t2], %%rdx\n\t"
1137 "subq %[m0], %[t1]\n\t"
1138 "sbbq %[m1], %[t2]\n\t"
1139 "cmovc %%rax, %[t1]\n\t" /* No carry -> copy new result */
1140 "cmovc %%rdx, %[t2]\n\t"
1141 : [t0] "=&r" (dummy), [t1] "=&r" (r[0]), [t2] "=&r" (r[1])
1142 : [a0] "g" (a[0]), [a1] "g" (a[1]),
1143 [m0] "rm" (m[0].m[0]), [m1] "rm" (m[0].m[1]), [invm] "rm" (m[0].invm)
1144 : "%rax", "%rdx", "cc"
1145 );
1146 #else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */
1147
1148 unsigned long pl, ph, t[4], k;
1149
1150 ASSERT_EXPENSIVE (modredc2ul2_intlt (a, m[0].m));
1151 #if defined(MODTRACE)
1152 printf ("((%lu * 2^%d + %lu)^2 %% (%lu * 2^%d + %lu)",
1153 a[1], LONG_BIT, a[0], 2 * LONG_BIT, m[0].m[1], LONG_BIT, m[0].m[0]);
1154 #endif
1155
1156 /* m < 1/4 W^2, a < m */
1157
1158 /* Square of the low word */
1159 ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), a[0], a[0]);
1160
1161 /* One REDC step */
1162 modredc2ul2_redc1 (t, t, m); /* t < 2m < 1/2 W^2 */
1163
1164 /* Products of low and high word */
1165 ularith_mul_ul_ul_2ul (&pl, &ph, a[1], a[0]); /* ph:pl < 1/4 W^2 */
1166 ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 3/4 W^2 */
1167 ularith_add_2ul_2ul (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < W^2 */
1168
1169 /* Square of high word */
1170 ularith_mul_ul_ul_2ul (&pl, &(t[2]), a[1], a[1]); /* t2:pl < 1/16 W^2 */
1171 ularith_add_ul_2ul (&(t[1]), &(t[2]), pl); /* t2:t1:t0 < 1/16 W^3 + W^2 */
1172
1173 /* Compute t2:t1:t0 := t2:t1:t0 + km, km < Wm < 1/4 W^3 */
1174 k = t[0] * m[0].invm;
1175 ularith_mul_ul_ul_2ul (&pl, &ph, k, m[0].m[0]);
1176 if (t[0] != 0UL)
1177 ph++; /* t[0] = 0 */
1178 ularith_add_ul_2ul (&(t[1]), &(t[2]), ph);
1179 ularith_mul_ul_ul_2ul (&pl, &ph, k, m[0].m[1]); /* ph:pl < 1/4 W^2 */
1180 ularith_add_2ul_2ul (&(t[1]), &(t[2]), pl, ph);
1181 /* t2:t1:0 < 1/16 W^3 + W^2 + 1/4 W^3 < 5/16 W^3 + W^2 */
1182
1183 /* Result may be larger than m, but is < 2*m */
1184
1185 ularith_sub_2ul_2ul_ge (&(t[1]), &(t[2]), m[0].m[0], m[0].m[1]);
1186
1187 r[0] = t[1];
1188 r[1] = t[2];
1189 #endif
1190 #if defined(MODTRACE)
1191 printf (" == (%lu * 2^%d + %lu) /* PARI */ \n", r[1], LONG_BIT, r[0]);
1192 #endif
1193 ASSERT_EXPENSIVE (modredc2ul2_intlt (r, m[0].m));
1194 }
1195
1196
1197 MAYBE_UNUSED
1198 static inline int
modredc2ul2_next(residueredc2ul2_t r,const modulusredc2ul2_t m)1199 modredc2ul2_next (residueredc2ul2_t r, const modulusredc2ul2_t m)
1200 {
1201 ularith_add_ul_2ul (&(r[0]), &(r[1]), 1UL);
1202 return (modredc2ul2_intequal (r, m[0].m));
1203 }
1204
1205
1206 MAYBE_UNUSED
1207 static inline int
modredc2ul2_finished(const residueredc2ul2_t r,const modulusredc2ul2_t m)1208 modredc2ul2_finished (const residueredc2ul2_t r, const modulusredc2ul2_t m)
1209 {
1210 return (modredc2ul2_intequal (r, m[0].m));
1211 }
1212
1213 /* Division by small integer n, where (n-1)*m may overflow the most
1214 significant word. Returns 1 if n is invertible modulo m, 0 if not.
1215
1216 w_mod_n is word base (e.g., 2^32 or 2^64) mod n
1217 inv_n contains -1/i (mod n) if i is coprime to n, or 0 if i is not coprime
1218 to n, for 0 <= i < n
1219 c = n^(-1) (mod word base)
1220 */
1221
/* r := a / n (mod m) by exact division: find k with a + k*m divisible
   by n, then divide the two-word value a + k*m exactly by n using the
   precomputed inverse c = 1/n (mod word base).  See the comment block
   above for the meaning of w_mod_n, inv_n and c.
   Returns 1 on success, 0 if gcd(n, m) > 1 (n not invertible mod m).
   NOTE: r may alias a; all reads of a occur before r is written. */
MAYBE_UNUSED
static inline int
modredc2ul2_divn (residueredc2ul2_t r, const residueredc2ul2_t a,
                  const unsigned long n, const unsigned long w_mod_n,
                  const unsigned long *inv_n, const unsigned long c,
                  const modulusredc2ul2_t m)
{
  /* Reduce the two-word values a and m modulo n, word by word:
     (hi * (W mod n) + lo) mod n, with W the word base. */
  const unsigned long an = ((a[1] % n) * w_mod_n + a[0] % n) % n;
  const unsigned long mn = ((m[0].m[1] % n) * w_mod_n + m[0].m[0] % n) % n;

  residueredc2ul2_t t, t2;
  unsigned long k;
#ifdef WANT_ASSERT_EXPENSIVE
  residueredc2ul2_t a_backup;
#endif

  /* inv_n[mn] == 0 marks mn not coprime to n, i.e. gcd(n, m) > 1 */
  if (inv_n[mn] == 0)
    return 0;

#ifdef WANT_ASSERT_EXPENSIVE
  /* Keep a copy of a for the self-check below, since r may alias a */
  modredc2ul2_init_noset0 (a_backup, m);
  modredc2ul2_set (a_backup, a, m);
#endif
  modredc2ul2_init_noset0 (t, m);
  modredc2ul2_init_noset0 (t2, m);
  modredc2ul2_set (t, a, m);

  /* Make t[1]:t[0] == a+km (mod w^2) with a+km divisible by n */
  /* We want a+km == 0 (mod n), so k = -a*m^{-1} (mod n)
     (inv_n[mn] holds -1/mn (mod n), see comment above) */
  k = (inv_n[mn] * an) % n;
  ASSERT_EXPENSIVE ((an + k*mn) % n == 0);
  ularith_mul_ul_ul_2ul (&(t[0]), &(t[1]), m[0].m[0], k);
  t[1] += m[0].m[1] * k;
  ularith_add_2ul_2ul (&(t[0]), &(t[1]), a[0], a[1]);

  /* We want r = (a+km)/n. */

  /* May overwrite a (r may alias a; a is not read again below) */
  r[0] = t[0] * c;  /* low word of quotient: t0 * n^{-1} (mod w) */

  /* r0 == (a+km)/n (mod w)
     (r1*w + r0) * n = (a+km)
     (r1*w + r0) * n == t (mod w^2)
     r1*w*n == t - n*r0 (mod w^2)
     t - n*r0 == 0 (mod w), thus
     r1*n == (t - n*r0)/w (mod w) */

  ularith_mul_ul_ul_2ul (&(t2[0]), &(t2[1]), r[0], n);
  ularith_sub_2ul_2ul (&(t[0]), &(t[1]), t2[0], t2[1]);
  ASSERT_EXPENSIVE (t[0] == 0UL);
  r[1] = t[1] * c;  /* high word: ((t - n*r0)/w) * n^{-1} (mod w) */

#ifdef WANT_ASSERT_EXPENSIVE
  /* Self-check: summing r n times must reproduce the original a */
  {
    unsigned long i;
    modredc2ul2_set (t, r, m);
    for (i = 1; i < n; i++)
      modredc2ul2_add (t, t, r, m);
    ASSERT_EXPENSIVE (modredc2ul2_equal (t, a_backup, m));
    modredc2ul2_clear (a_backup, m);
  }
#endif

  modredc2ul2_clear (t, m);
  modredc2ul2_clear (t2, m);

  return 1;
}
1290
/* Precomputed context for modredc2ul2_batch_Q_to_Fp() below, created by
   modredc2ul2_batch_Q_to_Fp_init() and released by _clear().
   NOTE(review): the field semantics are defined in the implementation
   file; the notes below are inferred from the names — confirm there. */
struct modredc2ul2_batch_Q_to_Fp_context_s {
  modintredc2ul2_t c;                       /* presumably a precomputed constant
                                               derived from num/den — confirm */
  unsigned long rem_ul, ratio_ul, den_inv;  /* presumably remainder, ratio and
                                               inverse of the denominator mod
                                               the word base — confirm */
  modulusredc2ul2_t m;                      /* modulus used for the REDC
                                               arithmetic of the batch */
};
typedef struct modredc2ul2_batch_Q_to_Fp_context_s modredc2ul2_batch_Q_to_Fp_context_t;
1297
/* prototypes of non-inline functions (implemented in the companion .c
   file); wrapped in extern "C" so the header is usable from C++ */
#ifdef __cplusplus
extern "C" {
#endif
/* Division by the small constants 3, 5, 7, 11, 13; per the naming
   convention above, cf. modredc2ul2_divn */
int modredc2ul2_div3 (residueredc2ul2_t, const residueredc2ul2_t,
                      const modulusredc2ul2_t);
int modredc2ul2_div5 (residueredc2ul2_t, const residueredc2ul2_t,
                      const modulusredc2ul2_t);
int modredc2ul2_div7 (residueredc2ul2_t, const residueredc2ul2_t,
                      const modulusredc2ul2_t);
int modredc2ul2_div11 (residueredc2ul2_t, const residueredc2ul2_t,
                       const modulusredc2ul2_t);
int modredc2ul2_div13 (residueredc2ul2_t, const residueredc2ul2_t,
                       const modulusredc2ul2_t);
void modredc2ul2_gcd (modintredc2ul2_t, const residueredc2ul2_t,
                      const modulusredc2ul2_t);
/* Exponentiation: _ul takes an unsigned long exponent, _mp a multi-word
   exponent; the 2pow variants use base 2 */
void modredc2ul2_pow_ul (residueredc2ul2_t, const residueredc2ul2_t,
                         const unsigned long, const modulusredc2ul2_t);
void modredc2ul2_2pow_ul (residueredc2ul2_t, const unsigned long,
                          const modulusredc2ul2_t);
void modredc2ul2_pow_mp (residueredc2ul2_t, const residueredc2ul2_t,
                         const unsigned long *, const int,
                         const modulusredc2ul2_t);
void modredc2ul2_2pow_mp (residueredc2ul2_t, const unsigned long *, const int,
                          const modulusredc2ul2_t);
/* V: presumably the Lucas V sequence (used by sprp/isprime) — confirm
   in the implementation file */
void modredc2ul2_V_ul (residueredc2ul2_t, const residueredc2ul2_t,
                       const unsigned long, const modulusredc2ul2_t);
void modredc2ul2_V_mp (residueredc2ul2_t, const residueredc2ul2_t,
                       const unsigned long *, const int,
                       const modulusredc2ul2_t);
/* Probable-prime and primality tests, modular inverses, Jacobi symbol */
int modredc2ul2_sprp (const residueredc2ul2_t, const modulusredc2ul2_t);
int modredc2ul2_sprp2 (const modulusredc2ul2_t);
int modredc2ul2_isprime (const modulusredc2ul2_t);
int modredc2ul2_inv (residueredc2ul2_t, const residueredc2ul2_t,
                     const modulusredc2ul2_t);
int modredc2ul2_batchinv (residueredc2ul2_t *, const residueredc2ul2_t *,
                          size_t n, const residueredc2ul2_t,
                          const modulusredc2ul2_t);
/* Batch conversion of rationals to residues; see context struct above */
modredc2ul2_batch_Q_to_Fp_context_t *
modredc2ul2_batch_Q_to_Fp_init (const modintredc2ul2_t, const modintredc2ul2_t);
void modredc2ul2_batch_Q_to_Fp_clear (modredc2ul2_batch_Q_to_Fp_context_t *);

int modredc2ul2_batch_Q_to_Fp (unsigned long *,
                               const modredc2ul2_batch_Q_to_Fp_context_t *,
                               unsigned long, int, const unsigned long *, size_t);
int modredc2ul2_jacobi (const residueredc2ul2_t, const modulusredc2ul2_t);
#ifdef __cplusplus
}
#endif
1347
1348 #endif /* MODREDC_2UL2_H */
1349