#ifndef MODREDC126_HPP
#define MODREDC126_HPP

/* Some functions for modular arithmetic with modulus in [2^64+1, 2^126-1].
   Moduli must be odd. Residues are stored in Montgomery form, reduction
   after multiplication is done with REDC. */
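
/* A minimal usage sketch (illustrative only, not normative; it assumes the
   cado-nfs build environment so that the headers included below are found):

       ModulusREDC126::Integer n(1, 1);           // the modulus 2^64 + 1 (low word, high word)
       ModulusREDC126 m(n);
       ModulusREDC126::Residue a(m), b(m), p(m);
       m.set(a, 17);                              // converted to Montgomery form internally
       m.set(b, 42);
       m.mul(p, a, b);                            // p = 17 * 42 (mod n), still in Montgomery form
       ModulusREDC126::Integer result;
       m.get(result, p);                          // converted back out of Montgomery form
*/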

/**********************************************************************/
#include "cado_config.h"  // for HAVE_GCC_STYLE_AMD64_INLINE_ASM
#include <cstdint>
#include <cstddef>       // for size_t, NULL
#include <new>            // for operator new
#include "macros.h"
#include "u64arith.h"
#include "modint.hpp"
#include "mod_stdop.hpp"

class ModulusREDC126 {
    /* Type definitions */
public:
    typedef Integer128 Integer;
    class Residue {
        friend class ModulusREDC126;
    protected:
        uint64_t r[2];
    public:
        typedef ModulusREDC126 Modulus;
        typedef Modulus::Integer Integer;
        typedef bool IsResidueType;
        Residue() = delete;
        Residue(const Modulus &m MAYBE_UNUSED) : r{0,0} {}
        Residue(const Modulus &m MAYBE_UNUSED, const Residue &s) : r{s.r[0], s.r[1]} {}
        Residue(const Residue &&s) : r{s.r[0], s.r[1]} {}
    protected:
        Residue &operator=(const Residue &s) {r[0] = s.r[0]; r[1] = s.r[1]; return *this;}
        Residue &operator=(const Integer &s) {r[1] = 0; s.get(r, 2); return *this;}
        Residue &operator=(const uint64_t s) {r[0] = s; r[1] = 0; return *this;}
    };

    typedef ResidueStdOp<Residue> ResidueOp;

protected:
    /* Data members */
    uint64_t m[2];
    uint64_t invm;
    uint64_t mrecip;
    Residue one;

public:
    static Integer getminmod() {return Integer(1,1);} /* 2^64+1 */
    static Integer getmaxmod() {return Integer(UINT64_MAX, UINT64_MAX >> 2);} /* 2^126-1 */
    static bool valid(const Integer &m) {
        return getminmod() <= m && m <= getmaxmod() && m % 2 == 1;
    }
    ModulusREDC126(const ModulusREDC126 &s) : m{s.m[0], s.m[1]}, invm(s.invm), mrecip(s.mrecip), one(s) {one = s.one;}
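    /* The constructor below precomputes what REDC needs (a sketch of the
       intended invariants, for orientation):
       - invm = -1/m (mod 2^64), used to choose the multiple of m that clears
         the low word in each REDC step,
       - mrecip, a reciprocal of the left-normalized modulus used by the
         3-by-2-word division in tomontgomery1(),
       - one, the Montgomery representation of 1, i.e. 2^128 mod m. */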
    ModulusREDC126 (const Integer s) : one(*this) {
        ASSERT (valid(s));
        s.get (m, 2);
        invm = -u64arith_invmod (m[0]);

        int shift = u64arith_clz(m[1]);
        uint64_t dummy, ml[2] = {m[0], m[1]};
        u64arith_shl_2(&ml[0], &ml[1], shift);
        mrecip = u64arith_reciprocal_for_div_3by2(ml[0], ml[1]);
        u64arith_divqr_3_2_1_recip_precomp(&dummy, &one.r[0], &one.r[1],
            0, 0, 1, ml[0], ml[1], mrecip, shift);
    }

    /* Returns the modulus as an Integer. */
    void getmod(Integer &r) const {
        r.set(m, 2);
    }

protected:
    void assertValid(const Residue &r MAYBE_UNUSED) const {ASSERT(u64arith_lt_2_2(r.r[0], r.r[1], m[0], m[1]));}

    /* r = s * 2^64 mod m */
    void
    tomontgomery1 (Residue &r, const Residue &s) const
    {
        int shift = u64arith_clz(m[1]);
        uint64_t dummy, ml[2] = {m[0], m[1]};
        u64arith_shl_2(&ml[0], &ml[1], shift);
        u64arith_divqr_3_2_1_recip_precomp(&dummy, &r.r[0], &r.r[1],
            0, s.r[0], s.r[1], ml[0], ml[1], mrecip, shift);
    }

    /* r = s * 2^128 mod m */
    void
    tomontgomery (Residue &r, const Residue &s) const
    {
        tomontgomery1(r, s);
        tomontgomery1(r, r);
    }

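    /* REDC background for the helpers below: with invm = -1/m (mod 2^64) and
       k = s[0] * invm, the sum s + k*m is divisible by 2^64, and
       (s + k*m) / 2^64 == s / 2^64 (mod m), which is exactly one REDC step. */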
    void redc1 (uint64_t *r, const uint64_t *s) const
    {
        uint64_t t[4], k;

        k = s[0] * invm;
        u64arith_mul_1_1_2 (&(t[0]), &(t[1]), k, m[0]);
        if (s[0] != 0)
            t[1]++;
        t[2] = 0;
        u64arith_add_1_2 (&(t[1]), &(t[2]), s[1]);     /* t[2] <= 1 */
        u64arith_mul_1_1_2 (&(t[0]), &(t[3]), k, m[1]);  /* t[3] < 2^w-1 */
        u64arith_add_2_2 (&(t[1]), &(t[2]), t[0], t[3]); /* t[2] < 2^w */

        /* r = (k*m + s) / w, k <= w-1. If s < m, then r < m */
        r[0] = t[1];
        r[1] = t[2];
    }

    /* Do a one-word REDC, i.e., r == s / w (mod m), w = 2^64.
       If m > w, r < 2m. If s < m, then r < m */
    void redc1 (Residue &r, const Residue &s) const
    {
        redc1(r.r, s.r);
    }
    void redc1 (Integer &r, const Integer &s) const
    {
        uint64_t t[2];
        s.get(t, 2);
        redc1(t, t);
        r = Integer(t[0], t[1]);
    }

    /* Converts s out of Montgomery form by dividing by 2^128.
       Requires s < m. */
    void frommontgomery (Residue &r, const Residue &s) const
    {
        /* Do two REDC steps */
        redc1 (r, s);
        redc1 (r, r);
    }

public:
    /** Allocate an array of len residues.
     *
     * Must be freed with deleteArray(), not with delete[].
     */
    Residue *newArray(const size_t len) const {
        void *t = operator new[](len * sizeof(Residue));
        if (t == NULL)
            return NULL;
        Residue *ptr = static_cast<Residue *>(t);
        for(size_t i = 0; i < len; i++) {
            new(&ptr[i]) Residue(*this);
        }
        return ptr;
    }

    void deleteArray(Residue *ptr, const size_t len) const {
        for(size_t i = len; i > 0; i--) {
            ptr[i - 1].~Residue();
        }
        operator delete[](ptr);
    }
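
    /* Illustrative use of the array helpers (a sketch only):
     *   Residue *v = m.newArray(n);   // n residues, each initialised to zero
     *   ...
     *   m.deleteArray(v, n);          // do not use delete[] on v
     */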

    void set(Residue &r, const Residue &s) const {r = s;}
    void set (Residue &r, const uint64_t s) const {
        r.r[0] = s;
        r.r[1] = 0;
        tomontgomery (r, r);
    }
    void set (Residue &r, const Integer &s) const {
        if (s < Integer(m[0], m[1])) {
            s.get(r.r, 2);
        } else {
            Integer t = s;
            t = t % Integer(m[0], m[1]);
            t.get(r.r, 2);
        }
        tomontgomery (r, r);
    }
    /* Sets the Residue to the residue class represented by the integer s.
       Assumes that s is reduced (mod m), i.e. 0 <= s < m */
    void set_reduced (Residue &r, const uint64_t s) const {set (r, s);}
    void set_reduced (Residue &r, Integer s) const {
        ASSERT (s < Integer(m[0], m[1]));
        s.get(r.r, 2);
        tomontgomery (r, r);
    }
    void set0(Residue &r) const {r.r[0] = 0; r.r[1] = 0;}
    void set1 (Residue &r) const {set (r, one);}
    /* Exchanges the values of the two arguments */

    void swap (Residue &a, Residue &b) const {
        Residue t(*this);
        set (t, a);
        set (a, b);
        set (b, t);
    }

    /* Returns the residue as an Integer */
    void get (Integer &r, const Residue &s) const {
        Residue t(*this);
        frommontgomery (t, s);
        r.set(t.r, 2);
    }

    bool equal (const Residue &a, const Residue &b) const {
        return (a.r[0] == b.r[0] && a.r[1] == b.r[1]);
    }

    bool is0 (const Residue &a) const {
        return (a.r[0] == 0 && a.r[1] == 0);
    }

    bool is1 (const Residue &a) const {
        return (equal(a, one));
    }

    void add (Residue &r, const Residue &a, const Residue &b) const {
        const uint64_t t0 = b.r[0], t1 = b.r[1]; /* r, a, and/or b may overlap */
        set (r, a);
        u64arith_add_2_2 (&(r.r[0]), &(r.r[1]), t0, t1);
        u64arith_sub_2_2_ge (&(r.r[0]), &(r.r[1]), m[0], m[1]);
    }

    void sub (Residue &r, const Residue &a, const Residue &b) const {
#if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
        {
            uint64_t s1 = m[0], s2 = m[1], t1 = a.r[0], t2 = a.r[1];

            __asm__ __VOLATILE (
                "subq %4, %0\n\t"
                "sbbq %5, %1\n\t"    /* t -= b */
                "cmovncq %6, %2\n\t" /* If !carry, s = 0 */
                "cmovncq %6, %3\n"
                : "+&r" (t1), "+&r" (t2), "+&r" (s1), "+r" (s2)
                : "g" (b.r[0]), "g" (b.r[1]), "rm" (0UL)
                : "cc"
            );
            u64arith_add_2_2 (&t1, &t2, s1, s2);
            r.r[0] = t1;
            r.r[1] = t2;
        }
#else
        {
            uint64_t t1 = a.r[0], t2 = a.r[1];
            u64arith_sub_2_2 (&t1, &t2, b.r[0], b.r[1]);

            if (t2 > a.r[1] || (t2 == a.r[1] && t1 > a.r[0]))
                u64arith_add_2_2 (&t1, &t2, m[0], m[1]);

            r.r[0] = t1;
            r.r[1] = t2;
        }
#endif
    }

    void add1 (Residue &r, const Residue &a) const {
        add(r, a, one);
    }

    void sub1 (Residue &r, const Residue &a) const {
        sub(r, a, one);
    }

    void add (Residue &r, const Residue &a, const uint64_t b) const
    {
        Residue t(*this);

        set (t, b);
        add (r, a, t);
    }

    void sub (Residue &r, const Residue &a, const uint64_t b) const {
        Residue t(*this);

        set (t, b);
        sub (r, a, t);
    }

    void neg (Residue &r, const Residue &a) const {
        if (is0 (a)) {
            set0 (r);
        } else {
            uint64_t t0 = m[0], t1 = m[1];
            u64arith_sub_2_2(&t0, &t1, a.r[0], a.r[1]);
            r.r[0] = t0;
            r.r[1] = t1;
        }
    }

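    /* Halving modulo the odd modulus m: if the representative is even it can be
       shifted right directly; if it is odd, adding m first makes it even without
       changing the residue class. */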
    bool div2 (Residue &r, const Residue &a) const {
        set (r, a);
        if (r.r[0] % 2 == 1)
            u64arith_add_2_2 (&r.r[0], &r.r[1], m[0], m[1]);
        u64arith_shr_2(&r.r[0], &r.r[1], 1);
        return true;
    }

#ifdef WANT_ASSERT_EXPENSIVE
#if defined(__x86_64__)
#define ABORT_IF_CY "jnc 1f\n\tlea _GLOBAL_OFFSET_TABLE_(%%rip), %%rbx\n\tcall abort@plt\n1:\n\t"
#elif defined(__i386__)
#define ABORT_IF_CY "jnc 1f\n\tcall abort\n1:\n\t"
#endif
#else
#define ABORT_IF_CY
#endif

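    /* Montgomery multiplication: for a, b in Montgomery form this computes
       r = a * b / 2^128 (mod m). Both the inline-asm path and the portable
       fallback interleave the 2x2-word schoolbook product with two REDC steps,
       so the intermediate values stay within three 64-bit words. */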
    void mul (Residue &r, const Residue &a, const Residue &b) const {
#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
        uint64_t dummy;
        __asm__ __VOLATILE (
            /* Product of low words */
            "movq %[a0], %%rax\n\t"
            "mulq %[b0]\n\t"         /* rdx:rax = a0*b0 <= (2^64-1)^2 */
            "movq %%rdx, %[t0]\n\t"
            /* Compute u0*m, add to t0:rax */
            "imulq %[invm], %%rax\n\t"
            "movq %%rax, %[t2]\n\t"  /* t2 = u0 */
            "xorl %k[t1], %k[t1]\n\t"
            "mulq %[m0]\n\t"         /* rdx:rax = u0*m0 <= (2^64-1)^2 */
            "negq %%rax\n\t"         /* if low word != 0, carry to high word */
            "movq %[t2], %%rax\n\t"  /* independent, goes in pipe 0 */
            "adcq %%rdx, %[t0]\n\t"
            "setc %b[t1]\n\t"        /* t1:t0 = (a0*b0 + u0*m0) / 2^64 <= 2*2^64 - 4 */
            "mulq %[m1]\n\t"
            "addq %%rax, %[t0]\n\t"
            "movq %[a0], %%rax\n\t"  /* independent, goes in pipe 0 */
            "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0*b0+u0*m)/2^64 */
            ABORT_IF_CY              /* <= 2^126 - 2^62 */


            /* 2 products of low and high word */
            "xorl %k[t2], %k[t2]\n\t"
            "mulq %[b1]\n\t"         /* rdx:rax = a0*b1 <= (2^64-1)*(2^63-2^32-1) */
            "addq %%rax, %[t0]\n\t"
            "movq %[a1], %%rax\n\t"  /* independent, goes in pipe 0 */
            "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0*b0+u0*m)/2^64 + a0*b1 */
            ABORT_IF_CY              /* <= 2^126 - 2^62 + (2^64-1)*(2^63-2^32-1)
                                        = 2^127 + 2^126 - 2^96 ... */

            /* Free slot here */
            "mulq %[b0]\n\t"         /* rdx:rax = a1*b0 <= (2^63-2^32-1)*(2^64-1) */
            "addq %%rax, %[t0]\n\t"
            "movq %[a1], %%rax\n\t"  /* independent, goes in pipe 0 */
            "adcq %%rdx, %[t1]\n\t"
            "setc %b[t2]\n\t"        /* t2:t1:t0 = (a0*b0+u0*m)/2^64 + a0*b1 + a1*b0 */
            /* <= 2^126 - 2^62 + 2*(2^64-1)*(2^63-2^32-1)
                = 2^128 + 2^126 - 2*2^96 ... */
            /* Product of high words */
            "mulq %[b1]\n\t"         /* rdx:rax = a1*b1 <= (2^63-2^32-1)^2 */
            "addq %%rax, %[t1]\n\t"
            "movq %[t0], %%rax\n\t"
            "adcq %%rdx, %[t2]\n\t"  /* t2:t1:t0 = (a*b+u0*m)/2^64 */
            ABORT_IF_CY              /* <= ((2^127-2^96-1)^2+(2^64-1)*(2^126-2^64+1))/2^64
                                        = 2^190 - 2^160 ... */
            /* Free slot here */
            /* Compute u1*m, add to t2:t1:t0 */
            "imulq %[invm], %%rax\n\t"
            "movq %%rax, %[t0]\n\t" /* t0 = u1 */
            /* Free slot here */
            "mulq %[m0]\n\t"        /* rdx:rax = m0*u1 <= (2^64-1)^2 */
            "negq %%rax\n\t"        /* if low word != 0, carry to high word */
            "movq %[t0], %%rax\n\t"
            "adcq %%rdx, %[t1]\n\t"
            "adcq $0,%[t2]\n\t"     /* t2:t1:0 = (a*b+u0*m)/2^64 + u1*m0 */
            ABORT_IF_CY             /* <= 2^190 - 2^160 + 2*2^128 + 2^126 ... */

            "mulq %[m1]\n\t"        /* rdx:rax = u1*m1 */
            "addq %%rax, %[t1]\n\t"
            "adcq %%rdx, %[t2]\n\t" /* t2:t1 = ((a*b+u0*m)/2^64 + u1*m)/2^64 */
            ABORT_IF_CY             /* <= 2^127 - 2^96 - 1 */

            "movq %[t1], %%rax\n\t" /* See if result > m */
            "movq %[t2], %%rdx\n\t"
            "subq %[m0], %[t1]\n\t"
            "sbbq %[m1], %[t2]\n\t"
            "cmovc %%rax, %[t1]\n\t" /* Carry -> restore old result */
            "cmovc %%rdx, %[t2]\n\t"
            : [t0] "=&r" (dummy), [t1] "=&r" (r.r[0]), [t2] "=&r" (r.r[1])
            : [a0] "g" (a.r[0]), [a1] "g" (a.r[1]), [b0] "rm" (b.r[0]), [b1] "rm" (b.r[1]),
            [m0] "rm" (m[0]), [m1] "rm" (m[1]), [invm] "rm" (invm)
            : "%rax", "%rdx", "cc"
        );
#else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */
        uint64_t pl, ph, t[4], k;

        /* m < 1/4 W^2,  a,b < m */

        /* Product of the two low words */
        u64arith_mul_1_1_2 (&(t[0]), &(t[1]), a.r[0], b.r[0]);

        /* One REDC step */
        redc1 (t, t); /* t < 2m < 1/2 W^2 */

        /* Products of one low and one high word  */
        u64arith_mul_1_1_2 (&pl, &ph, a.r[1], b.r[0]);   /* ph:pl < 1/4 W^2 */
        u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 3/4 W^2 */
        u64arith_mul_1_1_2 (&pl, &ph, a.r[0], b.r[1]);   /* ph:pl < 1/4 W^2 */
        u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < W^2 */

        /* Product of the two high words */
        u64arith_mul_1_1_2 (&pl, &(t[2]), a.r[1], b.r[1]); /* t2:pl < 1/16 W^2 */
        u64arith_add_1_2 (&(t[1]), &(t[2]), pl); /* t2:t1:t0 < 1/16 W^3 + W^2 */

        /* Compute t2:t1:t0 := t2:t1:t0 + km, km < Wm < 1/4 W^3 */
        k = t[0] * invm;
        u64arith_mul_1_1_2 (&pl, &ph, k, m[0]);
        if (t[0] != 0UL)
            ph++; /* t[0] = 0 */
        u64arith_add_1_2 (&(t[1]), &(t[2]), ph);
        u64arith_mul_1_1_2 (&pl, &ph, k, m[1]); /* ph:pl < 1/4 W^2 */
        u64arith_add_2_2 (&(t[1]), &(t[2]), pl, ph);
        /* t2:t1:0 < 1/16 W^3 + W^2 + 1/4 W^3 < 5/16 W^3 + W^2 */

        /* Result may be larger than m, but is < 2*m */

        u64arith_sub_2_2_ge (&(t[1]), &(t[2]), m[0], m[1]);

        r.r[0] = t[1];
        r.r[1] = t[2];
#endif
    }

    void sqr (Residue &r, const Residue &a) const {
#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM

        /* m <= 2^126-1
           Since m1>0, m*u is maximal for m0=1 and u=2^64-1, so
           u*m is bounded by (2^126 - 2^64 + 1)*(2^64 - 1) =
           2^190 - 2^128 - 2^126 + 2*2^64 - 1.
           If a,b <= 2^127-2^96-1, then
           ((a*b+u0*m)/2^64 + u1*m)/2^64 <= 2^127-2^96-1
           If we allow non-canonical residues up to 2^127-2^96-1, we can skip
           the final conditional subtraction. These residues are still < 2^127,
           so an addition does not overflow */

        uint64_t dummy;
        __asm__ __VOLATILE (
            /* Product of low words */
            "movq %[a0], %%rax\n\t"
            "mulq %[a0]\n\t"         /* rdx:rax = a0^2 <= (2^64-1)^2 */
            "movq %%rdx, %[t0]\n\t"
            /* Compute u0*m, add to t0:rax */
            "imulq %[invm], %%rax\n\t"
            "movq %%rax, %[t2]\n\t"  /* t2 = u0 */
            "xorl %k[t1], %k[t1]\n\t"
            "mulq %[m0]\n\t"         /* rdx:rax = u0*m0 <= (2^64-1)^2 */
            "negq %%rax\n\t"         /* if low word != 0, carry to high word */
            "movq %[t2], %%rax\n\t"  /* independent, goes in pipe 0 */
            "adcq %%rdx, %[t0]\n\t"
            "setc %b[t1]\n\t"        /* t1:t0 = (a0^2 + u0*m0) / 2^64 <= 2*2^64 - 4 */
            "mulq %[m1]\n\t"
            "addq %%rax, %[t0]\n\t"
            "movq %[a0], %%rax\n\t"  /* independent, goes in pipe 0 */
            "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0^2+u0*m)/2^64 */
            ABORT_IF_CY              /* <= 2^126 - 2^62 */

            /* 2 products of low and high word */
            "xorl %k[t2], %k[t2]\n\t"
            "mulq %[a1]\n\t"         /* rdx:rax = a0*a1 <= (2^64-1)*(2^63-2^32-1) */
            "addq %%rax, %[t0]\n\t"
            "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0^2+u0*m)/2^64 + a0*a1 */
            ABORT_IF_CY              /* <= 2^126 - 2^62 + (2^64-1)*(2^63-2^32-1)
                                        = 2^127 + 2^126 - 2^96 ... */
            "addq %%rax, %[t0]\n\t"
            "adcq %%rdx, %[t1]\n\t"
            "movq %[a1], %%rax\n\t"  /* independent, goes in pipe 0 */
            "setc %b[t2]\n\t"        /* t2:t1:t0 = (a0*a0+u0*m)/2^64 + 2*a0*a1 */
            /* <= 2^126 - 2^62 + 2*(2^64-1)*(2^63-2^32-1)
                = 2^128 + 2^126 - 2*2^96 ... */

            /* Product of high words */
            "mulq %[a1]\n\t"         /* rdx:rax = a1^2 <= (2^63-2^32-1)^2 */
            "addq %%rax, %[t1]\n\t"
            "movq %[t0], %%rax\n\t"
            "adcq %%rdx, %[t2]\n\t"  /* t2:t1:t0 = (a^2+u0*m)/2^64 */
            ABORT_IF_CY              /* <= ((2^127-2^96-1)^2+(2^64-1)*(2^126-2^64+1))/2^64
                                        = 2^190 - 2^160 ... */
            /* Free slot here */
            /* Compute u1*m, add to t2:t1:t0 */
            "imulq %[invm], %%rax\n\t"
            "movq %%rax, %[t0]\n\t" /* t0 = u1 */
            /* Free slot here */
            "mulq %[m0]\n\t"        /* rdx:rax = m0*u1 <= (2^64-1)^2 */
            "negq %%rax\n\t"        /* if low word != 0, carry to high word */
            "movq %[t0], %%rax\n\t"
            "adcq %%rdx, %[t1]\n\t"
            "adcq $0,%[t2]\n\t"     /* t2:t1:0 = (a*a+u0*m)/2^64 + u1*m0 */
            ABORT_IF_CY             /* <= 2^190 - 2^160 + 2*2^128 + 2^126 ... */

            "mulq %[m1]\n\t"        /* rdx:rax = u1*m1 */
            "addq %%rax, %[t1]\n\t"
            "adcq %%rdx, %[t2]\n\t" /* t2:t1 = ((a*a+u0*m)/2^64 + u1*m)/2^64 */
            ABORT_IF_CY             /* <= 2^127 - 2^96 - 1 */

            "movq %[t1], %%rax\n\t" /* See if result > m */
            "movq %[t2], %%rdx\n\t"
            "subq %[m0], %[t1]\n\t"
            "sbbq %[m1], %[t2]\n\t"
            "cmovc %%rax, %[t1]\n\t" /* Carry -> restore old result */
            "cmovc %%rdx, %[t2]\n\t"
            : [t0] "=&r" (dummy), [t1] "=&r" (r.r[0]), [t2] "=&r" (r.r[1])
            : [a0] "g" (a.r[0]), [a1] "g" (a.r[1]),
            [m0] "rm" (m[0]), [m1] "rm" (m[1]), [invm] "rm" (invm)
            : "%rax", "%rdx", "cc"
        );
#else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */

        uint64_t pl, ph, t[4], k;

        /* m < 1/4 W^2,  a < m */

        /* Square of the low word */
        u64arith_mul_1_1_2 (&(t[0]), &(t[1]), a.r[0], a.r[0]);

        /* One REDC step */
        redc1 (t, t); /* t < 2m < 1/2 W^2 */

        /* Products of low and high word  */
        u64arith_mul_1_1_2 (&pl, &ph, a.r[1], a.r[0]);   /* ph:pl < 1/4 W^2 */
        u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 3/4 W^2 */
        u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < W^2 */

        /* Square of high word */
        u64arith_mul_1_1_2 (&pl, &(t[2]), a.r[1], a.r[1]); /* t2:pl < 1/16 W^2 */
        u64arith_add_1_2 (&(t[1]), &(t[2]), pl); /* t2:t1:t0 < 1/16 W^3 + W^2 */

        /* Compute t2:t1:t0 := t2:t1:t0 + km, km < Wm < 1/4 W^3 */
        k = t[0] * invm;
        u64arith_mul_1_1_2 (&pl, &ph, k, m[0]);
        if (t[0] != 0UL)
            ph++; /* t[0] = 0 */
        u64arith_add_1_2 (&(t[1]), &(t[2]), ph);
        u64arith_mul_1_1_2 (&pl, &ph, k, m[1]); /* ph:pl < 1/4 W^2 */
        u64arith_add_2_2 (&(t[1]), &(t[2]), pl, ph);
        /* t2:t1:0 < 1/16 W^3 + W^2 + 1/4 W^3 < 5/16 W^3 + W^2 */

        /* Result may be larger than m, but is < 2*m */

        u64arith_sub_2_2_ge (&(t[1]), &(t[2]), m[0], m[1]);

        r.r[0] = t[1];
        r.r[1] = t[2];
#endif
    }

    bool next (Residue &r) const {
        u64arith_add_1_2 (&(r.r[0]), &(r.r[1]), 1);
        return (finished (r));
    }

    bool finished (const Residue &r) const {
        return (r.r[0] == m[0] && r.r[1] == m[1]);
    }

    /* Division by small integer n, where (n-1)*m may overflow the most
       significant word. Returns true if n is invertible modulo m, false if not.

       w_mod_n is the word base (2^64 here) mod n
       inv_n[i] contains -1/i (mod n) if i is coprime to n, or 0 if i is not
       coprime to n, for 0 <= i < n
       c = n^(-1) (mod word base)
    */

    bool divn (Residue &r, const Residue &a,
                      const uint64_t n, const uint64_t w_mod_n,
                      const uint64_t *inv_n, const uint64_t c) const {
        const uint64_t an = ((a.r[1] % n) * w_mod_n + a.r[0] % n) % n;
        const uint64_t mn = ((m[1] % n) * w_mod_n + m[0] % n) % n;

        Residue t(*this), t2(*this);
        uint64_t k;
#ifdef WANT_ASSERT_EXPENSIVE
        Residue a_backup(*this);
#endif

        if (inv_n[mn] == 0)
            return false;

#ifdef WANT_ASSERT_EXPENSIVE
        set (a_backup, a);
#endif
        set (t, a);

        /* Make t[1]:t[0] == a+km (mod w^2) with a+km divisible by n */
        /* We want a+km == 0 (mod n), so k = -a*m^{-1} (mod n) */
        k = (inv_n[mn] * an) % n;
        ASSERT_EXPENSIVE ((an + k*mn) % n == 0);
        u64arith_mul_1_1_2 (&(t.r[0]), &(t.r[1]), m[0], k);
        t.r[1] += m[1] * k;
        u64arith_add_2_2 (&(t.r[0]), &(t.r[1]), a.r[0], a.r[1]);

        /* We want r = (a+km)/n. */

        /* May overwrite a */
        r.r[0] = t.r[0] * c;

        /* r0 == (a+km)/n (mod w)
           (r1*w + r0) * n = (a+km)
           (r1*w + r0) * n == t (mod w^2)
           r1*w*n == t - n*r0 (mod w^2)
           t - n*r0 == 0 (mod w), thus
           r1*n == (t - n*r0)/w (mod w) */

        u64arith_mul_1_1_2 (&(t2.r[0]), &(t2.r[1]), r.r[0], n);
        u64arith_sub_2_2 (&(t.r[0]), &(t.r[1]), t2.r[0], t2.r[1]);
        ASSERT_EXPENSIVE (t.r[0] == 0);
        r.r[1] = t.r[1] * c;

#ifdef WANT_ASSERT_EXPENSIVE
        {
            uint64_t i;
            set (t, r);
            for (i = 1; i < n; i++)
                add (t, t, r);
            ASSERT_EXPENSIVE (equal (t, a_backup));
        }
#endif

        return true;
    }
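
    /* For illustration only: the small-prime helpers (div3(), div5(), ...) are
       defined in the accompanying .cpp file and may lay out their tables
       differently, but a divn() call for n = 3 could look like this:

           const uint64_t w_mod_3 = 1;                        // 2^64 mod 3
           const uint64_t inv_3[3] = {0, 2, 1};               // -1/i (mod 3), 0 if not coprime
           const uint64_t c3 = UINT64_C(0xAAAAAAAAAAAAAAAB);  // 1/3 (mod 2^64)
           bool ok = divn(r, a, 3, w_mod_3, inv_3, c3);
    */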

    /* Given a = V_n (x), b = V_m (x) and d = V_{n-m} (x), compute V_{m+n} (x).
     * r can be the same variable as a or b but must not be the same variable as d.
     */
    void V_dadd (Residue &r, const Residue &a, const Residue &b,
                     const Residue &d) const {
        ASSERT (&r != &d);
        mul (r, a, b);
        sub (r, r, d);
    }

    /* Given a = V_n (x) and two = 2, compute V_{2n} (x).
     * r can be the same variable as a but must not be the same variable as two.
     */
    void V_dbl (Residue &r, const Residue &a, const Residue &two) const {
        ASSERT (&r != &two);
        sqr (r, a);
        sub (r, r, two);
    }
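
    /* Example chain (a sketch): with x = V_1 and two = 2 held as residues,
       V_dbl gives V_2 = x^2 - 2, and V_dadd(r, V_2, V_1, V_1) gives
       V_3 = V_2 * V_1 - V_1, following V_{m+n} = V_m * V_n - V_{m-n}. */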


    /* prototypes of non-inline functions */
    bool div3 (Residue &, const Residue &) const;
    bool div5 (Residue &, const Residue &) const;
    bool div7 (Residue &, const Residue &) const;
    bool div11 (Residue &, const Residue &) const;
    bool div13 (Residue &, const Residue &) const;
    void gcd (Integer &, const Residue &) const;
    void pow (Residue &, const Residue &, uint64_t) const;
    void pow (Residue &, const Residue &, const uint64_t *, size_t) const;
    void pow (Residue &r, const Residue &b, const Integer &e) const;
    void pow2 (Residue &, uint64_t) const;
    void pow2 (Residue &, const uint64_t *, size_t) const;
    void pow2 (Residue &r, const Integer &e) const;
    void V (Residue &, const Residue &, const uint64_t) const;
    void V (Residue &, const Residue &, const uint64_t *, size_t) const;
    void V (Residue &r, const Residue &b, const Integer &e) const;
    void V (Residue &r, Residue *rp1, const Residue &b,
            const uint64_t k) const;
    bool sprp (const Residue &) const;
    bool sprp2 () const;
    bool isprime () const;
    bool inv (Residue &, const Residue &) const;
    bool batchinv (Residue *, const Residue *, size_t, const Residue *) const;
    bool batchinv (Residue *, const uint64_t *, size_t, const Integer *) const;
    struct batch_Q_to_Fp_context_s {
        Integer c;
        uint64_t rem_ul, ratio_ul, den_inv;
        ModulusREDC126 &m;
    };
    typedef struct batch_Q_to_Fp_context_s batch_Q_to_Fp_context_t;

    batch_Q_to_Fp_context_t *
    batch_Q_to_Fp_init (const Integer &, const Integer &) const;
    void batch_Q_to_Fp_clear (batch_Q_to_Fp_context_t *) const;

    bool batch_Q_to_Fp (uint64_t *, const batch_Q_to_Fp_context_t *,
                        uint64_t, int, const uint64_t *, size_t) const;
    int jacobi (const Residue &) const;
protected:
    bool find_minus1 (Residue &, const Residue &, const int) const;
    /* Computes r = (a * b * 2^-64) mod m, where a is in REDC
     * representation */
    void
    mul_ul (Residue &r, const Residue &a, const uint64_t b) const
    {
        assertValid(a);

#if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
        uint64_t u = invm, a0 = a.r[0];
        __asm__ __VOLATILE (
            /* Product of low words */
            "mulq %[b]\n\t"          /* rdx:rax = a0*b <= (2^64-1)^2 */
            "movq %%rdx, %[t0]\n\t"  /* t0:rax = a0*b <= (2^64-1)^2 */
            /* Compute u such that a0*b + u*m == 0 (mod 2^64), compute u*m, add to t0:rax */
            "imulq %[u], %%rax\n\t"
            "movq %%rax, %[u]\n\t"  /* u <= 2^64-1 */
            "xorl %k[t1], %k[t1]\n\t"
            "mulq %[m0]\n\t"         /* rdx:rax = u*m0 <= (2^64-1)^2 */
            "negq %%rax\n\t"         /* if low word != 0, carry to high word */
            "movq %[u], %%rax\n\t"  /* rax = u, independent, goes in pipe 0 */
            "adcq %%rdx, %[t0]\n\t"
            "setc %b[t1]\n\t"        /* t1:t0 = (a0*b + u*m0) / 2^64 <= 2*2^64 - 4 */
            "mulq %[m1]\n\t"         /* rdx:rax = u*m1 */
            "addq %%rax, %[t0]\n\t"
            "movq %[a1], %%rax\n\t"  /* independent, goes in pipe 0 */
            "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = (a0*b+u*m)/2^64 <= 2^126 - 2^62 */
            /* Free slot in pipe 2 here */
            ABORT_IF_CY

            /* Product of low and high word */
            "mulq %[b]\n\t"          /* rdx:rax = a1*b <= (2^63-2^32-1)*(2^64-1) */
            "addq %%rax, %[t0]\n\t"
            "adcq %%rdx, %[t1]\n\t"  /* t1:t0 = ((a1*2^64 + a0)*b + u*m) / 2^64
                                        <= ((2^126-1)*(2^64-1) + (2^64-1)*(2^126-1)) / 2^64
                                         < 2^127 - 2^63 - 1, thus no carry */
            ABORT_IF_CY
            /* t1:t0 = ((a1*2^64 + a0)*b + u*m) / 2^64
                    <= ((m-1)*(2^64-1) + (2^64-1)*m) / 2^64
                     = 2*m*(1-1/2^64) - 1*(1-1/2^64). May need to subtract m. */
            "movq %[t0], %%rax\n\t" /* See if result > m */
            "movq %[t1], %%rdx\n\t"
            "subq %[m0], %[t0]\n\t" /* Try subtracting m, see if there's a carry */
            "sbbq %[m1], %[t1]\n\t"
            "cmovc %%rax, %[t0]\n\t" /* Carry -> restore old result */
            "cmovc %%rdx, %[t1]\n\t"
            : [u] "+&r" (u), [t0] "=&r" (r.r[0]), [t1] "=&r" (r.r[1]), [a0] "+&a" (a0)
            : [a1] "g" (a.r[1]), [b] "rm" (b),
            [m0] "rm" (m[0]), [m1] "rm" (m[1])
            : "%rdx", "cc"
        );
#else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */
        uint64_t pl, ph, t[2];

        /* m < 1/4 W^2,  a < m, b < W */

        /* Product of b and low word */
        u64arith_mul_1_1_2 (&(t[0]), &(t[1]), a.r[0], b); /* t1:t0 = a0*b < W^2 */

        /* One REDC step */
        redc1 (t, t); /* t1:t0 = (a0*b + k*m) / W < m + W < 1/4 W^2 + W */

        /* Product of b and high word  */
        u64arith_mul_1_1_2 (&pl, &ph, a.r[1], b);   /* ph:pl < 1/4 W^2 */
        u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 1/2 W^2 + W */

        u64arith_sub_2_2_ge (&(t[0]), &(t[1]), m[0], m[1]);

        r.r[0] = t[0];
        r.r[1] = t[1];
#endif
        assertValid(r);
    }

};


#endif  /* MODREDC126_HPP */