1 #ifndef MODREDC126_HPP 2 #define MODREDC126_HPP 3 4 /* Some functions for modular arithmetic with modulus in [2^64+1, 2^126-1]. 5 Moduli must be odd. Residues are stored in Montgomery form, reduction 6 after multiplication is done with REDC. */ 7 8 /**********************************************************************/ 9 #include "cado_config.h" // for HAVE_GCC_STYLE_AMD64_INLINE_ASM 10 #include <cstdint> 11 #include <cstddef> // for size_t, NULL 12 #include <new> // for operator new 13 #include "macros.h" 14 #include "u64arith.h" 15 #include "modint.hpp" 16 #include "mod_stdop.hpp" 17 18 class ModulusREDC126 { 19 /* Type definitions */ 20 public: 21 typedef Integer128 Integer; 22 class Residue { 23 friend class ModulusREDC126; 24 protected: 25 uint64_t r[2]; 26 public: 27 typedef ModulusREDC126 Modulus; 28 typedef Modulus::Integer Integer; 29 typedef bool IsResidueType; 30 Residue() = delete; Residue(const Modulus & m MAYBE_UNUSED)31 Residue(const Modulus &m MAYBE_UNUSED) : r{0,0} {} Residue(const Modulus & m MAYBE_UNUSED,const Residue & s)32 Residue(const Modulus &m MAYBE_UNUSED, const Residue &s) : r{s.r[0], s.r[1]} {} Residue(const Residue && s)33 Residue(const Residue &&s) : r{s.r[0], s.r[1]} {} 34 protected: operator =(const Residue & s)35 Residue &operator=(const Residue &s) {r[0] = s.r[0]; r[1] = s.r[1]; return *this;} operator =(const Integer & s)36 Residue &operator=(const Integer &s) {r[1] = 0; s.get(r, 2); return *this;} operator =(const uint64_t s)37 Residue &operator=(const uint64_t s) {r[0] = s; r[1] = 0; return *this;} 38 }; 39 40 typedef ResidueStdOp<Residue> ResidueOp; 41 42 protected: 43 /* Data members */ 44 uint64_t m[2]; 45 uint64_t invm; 46 uint64_t mrecip; 47 Residue one; 48 49 public: getminmod()50 static Integer getminmod() {return Integer(1,1);} /* 2^64+1 */ getmaxmod()51 static Integer getmaxmod() {return Integer(UINT64_MAX, UINT64_MAX >> 2);} /* 2^126-1 */ valid(const Integer & m)52 static bool valid(const Integer &m) { 53 return getminmod() <= m && m <= getmaxmod() && m % 2 == 1; 54 } ModulusREDC126(const ModulusREDC126 & s)55 ModulusREDC126(const ModulusREDC126 &s) : m{s.m[0], s.m[1]}, invm(s.invm), mrecip(s.mrecip), one(s) {one = s.one;} ModulusREDC126(const Integer s)56 ModulusREDC126 (const Integer s) : one(*this) { 57 ASSERT (valid(s)); 58 s.get (m, 2); 59 invm = -u64arith_invmod (m[0]); 60 61 int shift = u64arith_clz(m[1]); 62 uint64_t dummy, ml[2] = {m[0], m[1]}; 63 u64arith_shl_2(&ml[0], &ml[1], shift); 64 mrecip = u64arith_reciprocal_for_div_3by2(ml[0], ml[1]); 65 u64arith_divqr_3_2_1_recip_precomp(&dummy, &one.r[0], &one.r[1], 66 0, 0, 1, ml[0], ml[1], mrecip, shift); 67 } 68 69 /* Returns the modulus as an Integer. */ getmod(Integer & r) const70 void getmod(Integer &r) const { 71 r.set(m, 2); 72 } 73 74 protected: assertValid(const Residue & r MAYBE_UNUSED) const75 void assertValid(const Residue &r MAYBE_UNUSED) const {ASSERT(u64arith_lt_2_2(r.r[0], r.r[1], m[0], m[1]));} 76 77 /* r = s * 2^64 mod m */ 78 void tomontgomery1(Residue & r,const Residue & s) const79 tomontgomery1 (Residue &r, const Residue &s) const 80 { 81 int shift = u64arith_clz(m[1]); 82 uint64_t dummy, ml[2] = {m[0], m[1]}; 83 u64arith_shl_2(&ml[0], &ml[1], shift); 84 u64arith_divqr_3_2_1_recip_precomp(&dummy, &r.r[0], &r.r[1], 85 0, s.r[0], s.r[1], ml[0], ml[1], mrecip, shift); 86 } 87 88 /* r = s * 2^128 mod m */ 89 void tomontgomery(Residue & r,const Residue & s) const90 tomontgomery (Residue &r, const Residue &s) const 91 { 92 tomontgomery1(r, s); 93 tomontgomery1(r, r); 94 } 95 redc1(uint64_t * r,const uint64_t * s) const96 void redc1 (uint64_t *r, const uint64_t *s) const 97 { 98 uint64_t t[4], k; 99 100 k = s[0] * invm; 101 u64arith_mul_1_1_2 (&(t[0]), &(t[1]), k, m[0]); 102 if (s[0] != 0) 103 t[1]++; 104 t[2] = 0; 105 u64arith_add_1_2 (&(t[1]), &(t[2]), s[1]); /* t[2] <= 1 */ 106 u64arith_mul_1_1_2 (&(t[0]), &(t[3]), k, m[1]); /* t[3] < 2^w-1 */ 107 u64arith_add_2_2 (&(t[1]), &(t[2]), t[0], t[3]); /* t[2] < 2^w */ 108 109 /* r = (k*m + s) / w, k <= w-1. If s < m, then r < m */ 110 r[0] = t[1]; 111 r[1] = t[2]; 112 } 113 114 /* Do a one-word REDC, i.e., r == s / w (mod m), w = 2^64. 115 If m > w, r < 2m. If s < m, then r < m */ redc1(Residue & r,const Residue & s) const116 void redc1 (Residue &r, const Residue &s) const 117 { 118 redc1(r.r, s.r); 119 } redc1(Integer & r,const Integer & s) const120 void redc1 (Integer &r, const Integer &s) const 121 { 122 uint64_t t[2]; 123 s.get(t, 2); 124 redc1(t, t); 125 r = Integer(t[0], t[1]); 126 } 127 128 /* Converts s out of Montgomery form by dividing by 2^(2*LONG_BIT). 129 Requires s < m. */ frommontgomery(Residue & r,const Residue & s) const130 void frommontgomery (Residue &r, const Residue &s) const 131 { 132 /* Do two REDC steps */ 133 redc1 (r, s); 134 redc1 (r, r); 135 } 136 137 public: 138 /** Allocate an array of len residues. 139 * 140 * Must be freed with deleteArray(), not with delete[]. 141 */ newArray(const size_t len) const142 Residue *newArray(const size_t len) const { 143 void *t = operator new[](len * sizeof(Residue)); 144 if (t == NULL) 145 return NULL; 146 Residue *ptr = static_cast<Residue *>(t); 147 for(size_t i = 0; i < len; i++) { 148 new(&ptr[i]) Residue(*this); 149 } 150 return ptr; 151 } 152 deleteArray(Residue * ptr,const size_t len) const153 void deleteArray(Residue *ptr, const size_t len) const { 154 for(size_t i = len; i > 0; i++) { 155 ptr[i - 1].~Residue(); 156 } 157 operator delete[](ptr); 158 } 159 set(Residue & r,const Residue & s) const160 void set(Residue &r, const Residue &s) const {r = s;} set(Residue & r,const uint64_t s) const161 void set (Residue &r, const uint64_t s) const { 162 r.r[0] = s; 163 r.r[1] = 0; 164 tomontgomery (r, r); 165 } set(Residue & r,const Integer & s) const166 void set (Residue &r, const Integer &s) const { 167 if (s < Integer(m[0], m[1])) { 168 s.get(r.r, 2); 169 } else { 170 Integer t = s; 171 t = t % Integer(m[0], m[1]); 172 t.get(r.r, 2); 173 } 174 tomontgomery (r, r); 175 } 176 /* Sets the residueredc2ul2_t to the class represented by the integer s. 177 Assumes that s is reduced (mod m), i.e. 0 <= s < m */ set_reduced(Residue & r,const uint64_t s) const178 void set_reduced (Residue &r, const uint64_t s) const {set (r, s);} set_reduced(Residue & r,Integer s) const179 void set_reduced (Residue &r, Integer s) const { 180 ASSERT (s < Integer(m[0], m[1])); 181 s.get(r.r, 2); 182 tomontgomery (r, r); 183 } set0(Residue & r) const184 void set0(Residue &r) const {r.r[0] = 0; r.r[1] = 0;} set1(Residue & r) const185 void set1 (Residue &r) const {set (r, one);} 186 /* Exchanges the values of the two arguments */ 187 swap(Residue & a,Residue & b) const188 void swap (Residue &a, Residue &b) const { 189 Residue t(*this); 190 set (t, a); 191 set (a, b); 192 set (b, t); 193 } 194 195 /* Returns the residue as a modintredc2ul2_t */ get(Integer & r,const Residue & s) const196 void get (Integer &r, const Residue &s) const { 197 Residue t(*this); 198 frommontgomery (t, s); 199 r.set(t.r, 2); 200 } 201 equal(const Residue & a,const Residue & b) const202 bool equal (const Residue &a, const Residue &b) const { 203 return (a.r[0] == b.r[0] && a.r[1] == b.r[1]); 204 } 205 is0(const Residue & a) const206 bool is0 (const Residue &a) const { 207 return (a.r[0] == 0 && a.r[1] == 0); 208 } 209 is1(const Residue & a) const210 bool is1 (const Residue &a) const { 211 return (equal(a, one)); 212 } 213 add(Residue & r,const Residue & a,const Residue & b) const214 void add (Residue &r, const Residue &a, const Residue &b) const { 215 const uint64_t t0 = b.r[0], t1 = b.r[1]; /* r, a, and/or b may overlap */ 216 set (r, a); 217 u64arith_add_2_2 (&(r.r[0]), &(r.r[1]), t0, t1); 218 u64arith_sub_2_2_ge (&(r.r[0]), &(r.r[1]), m[0], m[1]); 219 } 220 sub(Residue & r,const Residue & a,const Residue & b) const221 void sub (Residue &r, const Residue &a, const Residue &b) const { 222 #if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) 223 { 224 uint64_t s1 = m[0], s2 = m[1], t1 = a.r[0], t2 = a.r[1]; 225 226 __asm__ __VOLATILE ( 227 "subq %4, %0\n\t" 228 "sbbq %5, %1\n\t" /* t -= b */ 229 "cmovncq %6, %2\n\t" /* If !carry, s = 0 */ 230 "cmovncq %6, %3\n" 231 : "+&r" (t1), "+&r" (t2), "+&r" (s1), "+r" (s2) 232 : "g" (b.r[0]), "g" (b.r[1]), "rm" (0UL) 233 : "cc" 234 ); 235 u64arith_add_2_2 (&t1, &t2, s1, s2); 236 r.r[0] = t1; 237 r.r[1] = t2; 238 } 239 #else 240 { 241 uint64_t t1 = a.r[0], t2 = a.r[1]; 242 u64arith_sub_2_2 (&t1, &t2, b.r[0], b.r[1]); 243 244 if (t2 > a.r[1] || (t2 == a.r[1] && t1 > a.r[0])) 245 u64arith_add_2_2 (&t1, &t2, m[0], m[1]); 246 247 r.r[0] = t1; 248 r.r[1] = t2; 249 } 250 #endif 251 } 252 add1(Residue & r,const Residue & a) const253 void add1 (Residue &r, const Residue &a) const { 254 add(r, a, one); 255 } 256 sub1(Residue & r,const Residue & a) const257 void sub1 (Residue &r, const Residue &a) const { 258 sub(r, a, one); 259 } 260 add(Residue & r,const Residue & a,const uint64_t b) const261 void add (Residue &r, const Residue &a, const uint64_t b) const 262 { 263 Residue t(*this); 264 265 set (t, b); 266 add (r, a, t); 267 } 268 sub(Residue & r,const Residue & a,const uint64_t b) const269 void sub (Residue &r, const Residue &a, const uint64_t b) const { 270 Residue t(*this); 271 272 set (t, b); 273 sub (r, a, t); 274 } 275 neg(Residue & r,const Residue & a) const276 void neg (Residue &r, const Residue &a) const { 277 if (is0 (a)) { 278 set0 (r); 279 } else { 280 uint64_t t0 = m[0], t1 = m[1]; 281 u64arith_sub_2_2(&t0, &t1, a.r[0], a.r[1]); 282 r.r[0] = t0; 283 r.r[1] = t1; 284 } 285 } 286 div2(Residue & r,const Residue & a) const287 bool div2 (Residue &r, const Residue &a) const { 288 set (r, a); 289 if (r.r[0] % 2 == 1) 290 u64arith_add_2_2 (&r.r[0], &r.r[1], m[0], m[1]); 291 u64arith_shr_2(&r.r[0], &r.r[1], 1); 292 return true; 293 } 294 295 #ifdef WANT_ASSERT_EXPENSIVE 296 #if defined(__x86_64__) 297 #define ABORT_IF_CY "jnc 1f\n\tlea _GLOBAL_OFFSET_TABLE_(%%rip), %%rbx\n\tcall abort@plt\n1:\n\t" 298 #elif defined(__i386__) 299 #define ABORT_IF_CY "jnc 1f\n\tcall abort\n1:\n\t" 300 #endif 301 #else 302 #define ABORT_IF_CY 303 #endif 304 mul(Residue & r,const Residue & a,const Residue & b) const305 void mul (Residue &r, const Residue &a, const Residue &b) const { 306 #ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM 307 uint64_t dummy; 308 __asm__ __VOLATILE ( 309 /* Product of low words */ 310 "movq %[a0], %%rax\n\t" 311 "mulq %[b0]\n\t" /* rdx:rax = a0*b0 <= (2^64-1)^2 */ 312 "movq %%rdx, %[t0]\n\t" 313 /* Compute u0*m, add to t0:rax */ 314 "imulq %[invm], %%rax\n\t" 315 "movq %%rax, %[t2]\n\t" /* t2 = u0 */ 316 "xorl %k[t1], %k[t1]\n\t" 317 "mulq %[m0]\n\t" /* rdx:rax = u0*m0 <= (2^64-1)^2 */ 318 "negq %%rax\n\t" /* if low word != 0, carry to high word */ 319 "movq %[t2], %%rax\n\t" /* independent, goes in pipe 0 */ 320 "adcq %%rdx, %[t0]\n\t" 321 "setc %b[t1]\n\t" /* t1:t0 = (a0*b0 + u0*m0) / 2^64 <= 2*2^64 - 4 */ 322 "mulq %[m1]\n\t" 323 "addq %%rax, %[t0]\n\t" 324 "movq %[a0], %%rax\n\t" /* independent, goes in pipe 0 */ 325 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0*b0+u0*m)/2^64 */ 326 ABORT_IF_CY /* <= 2^126 - 2^62 */ 327 328 329 /* 2 products of low and high word */ 330 "xorl %k[t2], %k[t2]\n\t" 331 "mulq %[b1]\n\t" /* rdx:rax = a0*b1 <= (2^64-1)*(2^63-2^32-1) */ 332 "addq %%rax, %[t0]\n\t" 333 "movq %[a1], %%rax\n\t" /* independent, goes in pipe 0 */ 334 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0*b0+u0*m)/2^64 + a0*b1 */ 335 ABORT_IF_CY /* <= 2^126 - 2^62 + (2^64-1)*(2^63-2^32-1) 336 = 2^127 + 2^126 - 2^96 ... */ 337 338 /* Free slot here */ 339 "mulq %[b0]\n\t" /* rdx:rax = a1*b0 <= (2^63-2^32-1)*(2^64-1) */ 340 "addq %%rax, %[t0]\n\t" 341 "movq %[a1], %%rax\n\t" /* independent, goes in pipe 0 */ 342 "adcq %%rdx, %[t1]\n\t" 343 "setc %b[t2]\n\t" /* t2:t1:t0 = (a0*b0+u0*m)/2^64 + a0*b1 + a1*b0 */ 344 /* <= 2^126 - 2^62 + 2*(2^64-1)*(2^63-2^32-1) 345 = 2^128 + 2^126 - 2*2^96 ... */ 346 /* Product of high words */ 347 "mulq %[b1]\n\t" /* rdx:rax = a1*b1 <= (2^63-2^32-1)^2 */ 348 "addq %%rax, %[t1]\n\t" 349 "movq %[t0], %%rax\n\t" 350 "adcq %%rdx, %[t2]\n\t" /* t2:t1:t0 = (a*b+u0*m)/2^64 */ 351 ABORT_IF_CY /* <= ((2^127-2^96-1)^2+(2^64-1)*(2^126-2^64+1))/2^64 352 = 2^190 - 2^160 ... */ 353 /* Free slot here */ 354 /* Compute u1*m, add to t2:t1:t0 */ 355 "imulq %[invm], %%rax\n\t" 356 "movq %%rax, %[t0]\n\t" /* t0 = u1 */ 357 /* Free slot here */ 358 "mulq %[m0]\n\t" /* rdx:rax = m0*u1 <= (2^64-1)^2 */ 359 "negq %%rax\n\t" /* if low word != 0, carry to high word */ 360 "movq %[t0], %%rax\n\t" 361 "adcq %%rdx, %[t1]\n\t" 362 "adcq $0,%[t2]\n\t" /* t2:t1:0 = (a*b+u0*m)/2^64 + u1*m0 */ 363 ABORT_IF_CY /* <= 2^190 - 2^160 + 2*2^128 + 2^126 ... */ 364 365 "mulq %[m1]\n\t" /* rdx:rax = u1*m1 */ 366 "addq %%rax, %[t1]\n\t" 367 "adcq %%rdx, %[t2]\n\t" /* t2:t1 = ((a*b+u0*m)/2^64 + u1*m)/2^64 */ 368 ABORT_IF_CY /* <= 2^127 - 2^96 - 1 */ 369 370 "movq %[t1], %%rax\n\t" /* See if result > m */ 371 "movq %[t2], %%rdx\n\t" 372 "subq %[m0], %[t1]\n\t" 373 "sbbq %[m1], %[t2]\n\t" 374 "cmovc %%rax, %[t1]\n\t" /* Carry -> restore old result */ 375 "cmovc %%rdx, %[t2]\n\t" 376 : [t0] "=&r" (dummy), [t1] "=&r" (r.r[0]), [t2] "=&r" (r.r[1]) 377 : [a0] "g" (a.r[0]), [a1] "g" (a.r[1]), [b0] "rm" (b.r[0]), [b1] "rm" (b.r[1]), 378 [m0] "rm" (m[0]), [m1] "rm" (m[1]), [invm] "rm" (invm) 379 : "%rax", "%rdx", "cc" 380 ); 381 #else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */ 382 uint64_t pl, ph, t[4], k; 383 384 /* m < 1/4 W^2, a,b < m */ 385 386 /* Product of the two low words */ 387 u64arith_mul_1_1_2 (&(t[0]), &(t[1]), a.r[0], b.r[0]); 388 389 /* One REDC step */ 390 redc1 (t, t); /* t < 2m < 1/2 W^2 */ 391 392 /* Products of one low and one high word */ 393 u64arith_mul_1_1_2 (&pl, &ph, a.r[1], b.r[0]); /* ph:pl < 1/4 W^2 */ 394 u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 3/4 W^2 */ 395 u64arith_mul_1_1_2 (&pl, &ph, a.r[0], b.r[1]); /* ph:pl < 1/4 W^2 */ 396 u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < W^2 */ 397 398 /* Product of the two high words */ 399 u64arith_mul_1_1_2 (&pl, &(t[2]), a.r[1], b.r[1]); /* t2:pl < 1/16 W^2 */ 400 u64arith_add_1_2 (&(t[1]), &(t[2]), pl); /* t2:t1:t0 < 1/16 W^3 + W^2 */ 401 402 /* Compute t2:t1:t0 := t2:t1:t0 + km, km < Wm < 1/4 W^3 */ 403 k = t[0] * invm; 404 u64arith_mul_1_1_2 (&pl, &ph, k, m[0]); 405 if (t[0] != 0UL) 406 ph++; /* t[0] = 0 */ 407 u64arith_add_1_2 (&(t[1]), &(t[2]), ph); 408 u64arith_mul_1_1_2 (&pl, &ph, k, m[1]); /* ph:pl < 1/4 W^2 */ 409 u64arith_add_2_2 (&(t[1]), &(t[2]), pl, ph); 410 /* t2:t1:0 < 1/16 W^3 + W^2 + 1/4 W^3 < 5/16 W^3 + W^2 */ 411 412 /* Result may be larger than m, but is < 2*m */ 413 414 u64arith_sub_2_2_ge (&(t[1]), &(t[2]), m[0], m[1]); 415 416 r.r[0] = t[1]; 417 r.r[1] = t[2]; 418 #endif 419 } 420 sqr(Residue & r,const Residue & a) const421 void sqr (Residue &r, const Residue &a) const { 422 #ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM 423 424 /* m <= 2^126-1 425 Since m1>0, m*u is maximal for m0=1 and u=2^64-1, so 426 u*m is bounded by (2^126 - 2^64 + 1)*(2^64 - 1) = 427 2^190 - 2^128 - 2^126 + 2*2^64 - 1. 428 If a,b <= 2^127-2^96-1, then 429 ((a*b+u0*m)/2^64 + u1*m)/2^64 <= 2^127-2^96-1 430 If we allow non-canonical residues up to 2^127-2^96-1, we can skip 431 the final conditional subtraction. These residues are still < 2^127, 432 so an addition does not overflow */ 433 434 uint64_t dummy; 435 __asm__ __VOLATILE ( 436 /* Product of low words */ 437 "movq %[a0], %%rax\n\t" 438 "mulq %[a0]\n\t" /* rdx:rax = a0^2 <= (2^64-1)^2 */ 439 "movq %%rdx, %[t0]\n\t" 440 /* Compute u0*m, add to t0:rax */ 441 "imulq %[invm], %%rax\n\t" 442 "movq %%rax, %[t2]\n\t" /* t2 = u0 */ 443 "xorl %k[t1], %k[t1]\n\t" 444 "mulq %[m0]\n\t" /* rdx:rax = u0*m0 <= (2^64-1)^2 */ 445 "negq %%rax\n\t" /* if low word != 0, carry to high word */ 446 "movq %[t2], %%rax\n\t" /* independent, goes in pipe 0 */ 447 "adcq %%rdx, %[t0]\n\t" 448 "setc %b[t1]\n\t" /* t1:t0 = (a0^2 + u0*m0) / 2^64 <= 2*2^64 - 4 */ 449 "mulq %[m1]\n\t" 450 "addq %%rax, %[t0]\n\t" 451 "movq %[a0], %%rax\n\t" /* independent, goes in pipe 0 */ 452 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0^2+u0*m)/2^64 */ 453 ABORT_IF_CY /* <= 2^126 - 2^62 */ 454 455 /* 2 products of low and high word */ 456 "xorl %k[t2], %k[t2]\n\t" 457 "mulq %[a1]\n\t" /* rdx:rax = a0*a1 <= (2^64-1)*(2^63-2^32-1) */ 458 "addq %%rax, %[t0]\n\t" 459 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0^2+u0*m)/2^64 + a0*a1 */ 460 ABORT_IF_CY /* <= 2^126 - 2^62 + (2^64-1)*(2^63-2^32-1) 461 = 2^127 + 2^126 - 2^96 ... */ 462 "addq %%rax, %[t0]\n\t" 463 "adcq %%rdx, %[t1]\n\t" 464 "movq %[a1], %%rax\n\t" /* independent, goes in pipe 0 */ 465 "setc %b[t2]\n\t" /* t2:t1:t0 = (a0*a0+u0*m)/2^64 + 2*a0*a1 */ 466 /* <= 2^126 - 2^62 + 2*(2^64-1)*(2^63-2^32-1) 467 = 2^128 + 2^126 - 2*2^96 ... */ 468 469 /* Product of high words */ 470 "mulq %[a1]\n\t" /* rdx:rax = a1^2 <= (2^63-2^32-1)^2 */ 471 "addq %%rax, %[t1]\n\t" 472 "movq %[t0], %%rax\n\t" 473 "adcq %%rdx, %[t2]\n\t" /* t2:t1:t0 = (a^2+u0*m)/2^64 */ 474 ABORT_IF_CY /* <= ((2^127-2^96-1)^2+(2^64-1)*(2^126-2^64+1))/2^64 475 = 2^190 - 2^160 ... */ 476 /* Free slot here */ 477 /* Compute u1*m, add to t2:t1:t0 */ 478 "imulq %[invm], %%rax\n\t" 479 "movq %%rax, %[t0]\n\t" /* t0 = u1 */ 480 /* Free slot here */ 481 "mulq %[m0]\n\t" /* rdx:rax = m0*u1 <= (2^64-1)^2 */ 482 "negq %%rax\n\t" /* if low word != 0, carry to high word */ 483 "movq %[t0], %%rax\n\t" 484 "adcq %%rdx, %[t1]\n\t" 485 "adcq $0,%[t2]\n\t" /* t2:t1:0 = (a*a+u0*m)/2^64 + u1*m0 */ 486 ABORT_IF_CY /* <= 2^190 - 2^160 + 2*2^128 + 2^126 ... */ 487 488 "mulq %[m1]\n\t" /* rdx:rax = u1*m1 */ 489 "addq %%rax, %[t1]\n\t" 490 "adcq %%rdx, %[t2]\n\t" /* t2:t1 = ((a*a+u0*m)/2^64 + u1*m)/2^64 */ 491 ABORT_IF_CY /* <= 2^127 - 2^96 - 1 */ 492 493 "movq %[t1], %%rax\n\t" /* See if result > m */ 494 "movq %[t2], %%rdx\n\t" 495 "subq %[m0], %[t1]\n\t" 496 "sbbq %[m1], %[t2]\n\t" 497 "cmovc %%rax, %[t1]\n\t" /* No carry -> copy new result */ 498 "cmovc %%rdx, %[t2]\n\t" 499 : [t0] "=&r" (dummy), [t1] "=&r" (r.r[0]), [t2] "=&r" (r.r[1]) 500 : [a0] "g" (a.r[0]), [a1] "g" (a.r[1]), 501 [m0] "rm" (m[0]), [m1] "rm" (m[1]), [invm] "rm" (invm) 502 : "%rax", "%rdx", "cc" 503 ); 504 #else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */ 505 506 uint64_t pl, ph, t[4], k; 507 508 /* m < 1/4 W^2, a < m */ 509 510 /* Square of the low word */ 511 u64arith_mul_1_1_2 (&(t[0]), &(t[1]), a.r[0], a.r[0]); 512 513 /* One REDC step */ 514 redc1 (t, t); /* t < 2m < 1/2 W^2 */ 515 516 /* Products of low and high word */ 517 u64arith_mul_1_1_2 (&pl, &ph, a.r[1], a.r[0]); /* ph:pl < 1/4 W^2 */ 518 u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 3/4 W^2 */ 519 u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < W^2 */ 520 521 /* Square of high word */ 522 u64arith_mul_1_1_2 (&pl, &(t[2]), a.r[1], a.r[1]); /* t2:pl < 1/16 W^2 */ 523 u64arith_add_1_2 (&(t[1]), &(t[2]), pl); /* t2:t1:t0 < 1/16 W^3 + W^2 */ 524 525 /* Compute t2:t1:t0 := t2:t1:t0 + km, km < Wm < 1/4 W^3 */ 526 k = t[0] * invm; 527 u64arith_mul_1_1_2 (&pl, &ph, k, m[0]); 528 if (t[0] != 0UL) 529 ph++; /* t[0] = 0 */ 530 u64arith_add_1_2 (&(t[1]), &(t[2]), ph); 531 u64arith_mul_1_1_2 (&pl, &ph, k, m[1]); /* ph:pl < 1/4 W^2 */ 532 u64arith_add_2_2 (&(t[1]), &(t[2]), pl, ph); 533 /* t2:t1:0 < 1/16 W^3 + W^2 + 1/4 W^3 < 5/16 W^3 + W^2 */ 534 535 /* Result may be larger than m, but is < 2*m */ 536 537 u64arith_sub_2_2_ge (&(t[1]), &(t[2]), m[0], m[1]); 538 539 r.r[0] = t[1]; 540 r.r[1] = t[2]; 541 #endif 542 } 543 next(Residue & r) const544 bool next (Residue &r) const { 545 u64arith_add_1_2 (&(r.r[0]), &(r.r[1]), 1); 546 return (finished (r)); 547 } 548 finished(const Residue & r) const549 bool finished (const Residue &r) const { 550 return (r.r[0] == m[0] && r.r[1] == m[1]); 551 } 552 553 /* Division by small integer n, where (n-1)*m may overflow the most 554 significant word. Returns 1 if n is invertible modulo m, 0 if not. 555 556 w_mod_n is word base (e.g., 2^32 or 2^64) mod n 557 inv_n contains -1/i (mod n) if i is coprime to n, or 0 if i is not coprime 558 to n, for 0 <= i < n 559 c = n^(-1) (mod word base) 560 */ 561 divn(Residue & r,const Residue & a,const uint64_t n,const uint64_t w_mod_n,const uint64_t * inv_n,const uint64_t c) const562 bool divn (Residue &r, const Residue &a, 563 const uint64_t n, const uint64_t w_mod_n, 564 const uint64_t *inv_n, const uint64_t c) const { 565 const uint64_t an = ((a.r[1] % n) * w_mod_n + a.r[0] % n) % n; 566 const uint64_t mn = ((m[1] % n) * w_mod_n + m[0] % n) % n; 567 568 Residue t(*this), t2(*this); 569 uint64_t k; 570 #ifdef WANT_ASSERT_EXPENSIVE 571 Residue a_backup(*this); 572 #endif 573 574 if (inv_n[mn] == 0) 575 return false; 576 577 #ifdef WANT_ASSERT_EXPENSIVE 578 set (a_backup, a); 579 #endif 580 set (t, a); 581 582 /* Make t[1]:t[0] == a+km (mod w^2) with a+km divisible by n */ 583 /* We want a+km == 0 (mod n), so k = -a*m^{-1} (mod n) */ 584 k = (inv_n[mn] * an) % n; 585 ASSERT_EXPENSIVE ((an + k*mn) % n == 0); 586 u64arith_mul_1_1_2 (&(t.r[0]), &(t.r[1]), m[0], k); 587 t.r[1] += m[1] * k; 588 u64arith_add_2_2 (&(t.r[0]), &(t.r[1]), a.r[0], a.r[1]); 589 590 /* We want r = (a+km)/n. */ 591 592 /* May overwrite a */ 593 r.r[0] = t.r[0] * c; 594 595 /* r0 == (a+km)/n (mod w) 596 (r1*w + r0) * n = (a+km) 597 (r1*w + r0) * n == t (mod w^2) 598 r1*w*n == t - n*r0 (mod w^2) 599 t - n*r0 == 0 (mod w), thus 600 r1*n == (t - n*r0)/w (mod w) */ 601 602 u64arith_mul_1_1_2 (&(t2.r[0]), &(t2.r[1]), r.r[0], n); 603 u64arith_sub_2_2 (&(t.r[0]), &(t.r[1]), t2.r[0], t2.r[1]); 604 ASSERT_EXPENSIVE (t.r[0] == 0); 605 r.r[1] = t.r[1] * c; 606 607 #ifdef WANT_ASSERT_EXPENSIVE 608 { 609 uint64_t i; 610 set (t, r); 611 for (i = 1; i < n; i++) 612 add (t, t, r); 613 ASSERT_EXPENSIVE (equal (t, a_backup)); 614 } 615 #endif 616 617 return true; 618 } 619 620 /* Given a = V_n (x), b = V_m (x) and d = V_{n-m} (x), compute V_{m+n} (x). 621 * r can be the same variable as a or b but must not be the same variable as d. 622 */ V_dadd(Residue & r,const Residue & a,const Residue & b,const Residue & d) const623 void V_dadd (Residue &r, const Residue &a, const Residue &b, 624 const Residue &d) const { 625 ASSERT (&r != &d); 626 mul (r, a, b); 627 sub (r, r, d); 628 } 629 630 /* Given a = V_n (x) and two = 2, compute V_{2n} (x). 631 * r can be the same variable as a but must not be the same variable as two. 632 */ V_dbl(Residue & r,const Residue & a,const Residue & two) const633 void V_dbl (Residue &r, const Residue &a, const Residue &two) const { 634 ASSERT (&r != &two); 635 sqr (r, a); 636 sub (r, r, two); 637 } 638 639 640 /* prototypes of non-inline functions */ 641 bool div3 (Residue &, const Residue &) const; 642 bool div5 (Residue &, const Residue &) const; 643 bool div7 (Residue &, const Residue &) const; 644 bool div11 (Residue &, const Residue &) const; 645 bool div13 (Residue &, const Residue &) const; 646 void gcd (Integer &, const Residue &) const; 647 void pow (Residue &, const Residue &, uint64_t) const; 648 void pow (Residue &, const Residue &, const uint64_t *, size_t) const; 649 void pow (Residue &r, const Residue &b, const Integer &e) const; 650 void pow2 (Residue &, uint64_t) const; 651 void pow2 (Residue &, const uint64_t *, size_t) const; 652 void pow2 (Residue &r, const Integer &e) const; 653 void V (Residue &, const Residue &, const uint64_t) const; 654 void V (Residue &, const Residue &, const uint64_t *, size_t) const; 655 void V (Residue &r, const Residue &b, const Integer &e) const; 656 void V (Residue &r, Residue *rp1, const Residue &b, 657 const uint64_t k) const; 658 bool sprp (const Residue &) const; 659 bool sprp2 () const; 660 bool isprime () const; 661 bool inv (Residue &, const Residue &) const; 662 bool batchinv (Residue *, const Residue *, size_t, const Residue *) const; 663 bool batchinv (Residue *, const uint64_t *, size_t, const Integer *) const; 664 struct batch_Q_to_Fp_context_s { 665 Integer c; 666 uint64_t rem_ul, ratio_ul, den_inv; 667 ModulusREDC126 &m; 668 }; 669 typedef struct batch_Q_to_Fp_context_s batch_Q_to_Fp_context_t; 670 671 batch_Q_to_Fp_context_t * 672 batch_Q_to_Fp_init (const Integer &, const Integer &) const; 673 void batch_Q_to_Fp_clear (batch_Q_to_Fp_context_t *) const; 674 675 bool batch_Q_to_Fp (uint64_t *, const batch_Q_to_Fp_context_t *, 676 uint64_t, int, const uint64_t *, size_t) const; 677 int jacobi (const Residue &) const; 678 protected: 679 bool find_minus1 (Residue &, const Residue &, const int) const; 680 /* Computes r = (a * b * 2^-64) mod m, where a is in REDC 681 * representation */ 682 void mul_ul(Residue & r,const Residue & a,const uint64_t b) const683 mul_ul (Residue &r, const Residue &a, const uint64_t b) const 684 { 685 assertValid(a); 686 687 #if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) 688 uint64_t u = invm, a0 = a.r[0]; 689 __asm__ __VOLATILE ( 690 /* Product of low words */ 691 "mulq %[b]\n\t" /* rdx:rax = a0*b <= (2^64-1)^2 */ 692 "movq %%rdx, %[t0]\n\t" /* t0:rax = a0*b <= (2^64-1)^2 */ 693 /* Compute u such that a0*b + u*m == 0 (mod 2^64), compute u*m, add to t0:rax */ 694 "imulq %[u], %%rax\n\t" 695 "movq %%rax, %[u]\n\t" /* u <= 2^64-1 */ 696 "xorl %k[t1], %k[t1]\n\t" 697 "mulq %[m0]\n\t" /* rdx:rax = u*m0 <= (2^64-1)^2 */ 698 "negq %%rax\n\t" /* if low word != 0, carry to high word */ 699 "movq %[u], %%rax\n\t" /* rax = u, independent, goes in pipe 0 */ 700 "adcq %%rdx, %[t0]\n\t" 701 "setc %b[t1]\n\t" /* t1:t0 = (a0*b + u*m0) / 2^64 <= 2*2^64 - 4 */ 702 "mulq %[m1]\n\t" /* rdx:rax = u*m1 */ 703 "addq %%rax, %[t0]\n\t" 704 "movq %[a1], %%rax\n\t" /* independent, goes in pipe 0 */ 705 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = (a0*b+u*m)/2^64 <= 2^126 - 2^62 */ 706 /* Free slot in pipe 2 here */ 707 ABORT_IF_CY 708 709 /* Product of low and high word */ 710 "mulq %[b]\n\t" /* rdx:rax = a1*b <= (2^63-2^32-1)*(2^64-1) */ 711 "addq %%rax, %[t0]\n\t" 712 "adcq %%rdx, %[t1]\n\t" /* t1:t0 = ((a1*2^64 + a0)*b + u*m) / 2^64 713 <= ((2^126-1)*(2^64-1) + (2^64-1)*(2^126-1)) / 2^64 714 < 2^127 - 2^63 - 1, thus no carry */ 715 ABORT_IF_CY 716 /* t1:t0 = ((a1*2^64 + a0)*b + u*m) / 2^64 717 <= ((m-1)*(2^64-1) + (2^64-1)*m) / 2^64 718 = 2*m*(1-1/2^64) - 1*(1-1/2^64). May need to subtract m. */ 719 "movq %[t0], %%rax\n\t" /* See if result > m */ 720 "movq %[t1], %%rdx\n\t" 721 "subq %[m0], %[t0]\n\t" /* Try subtracting m, see if there's a carry */ 722 "sbbq %[m1], %[t1]\n\t" 723 "cmovc %%rax, %[t0]\n\t" /* Carry -> restore old result */ 724 "cmovc %%rdx, %[t1]\n\t" 725 : [u] "+&r" (u), [t0] "=&r" (r.r[0]), [t1] "=&r" (r.r[1]), [a0] "+&a" (a0) 726 : [a1] "g" (a.r[1]), [b] "rm" (b), 727 [m0] "rm" (m[0]), [m1] "rm" (m[1]) 728 : "%rdx", "cc" 729 ); 730 #else /* HAVE_GCC_STYLE_AMD64_INLINE_ASM */ 731 uint64_t pl, ph, t[2]; 732 733 /* m < 1/4 W^2, a < m, b < W */ 734 735 /* Product of b and low word */ 736 u64arith_mul_1_1_2 (&(t[0]), &(t[1]), a.r[0], b); /* t1:t0 = a0*b < W^2 */ 737 738 /* One REDC step */ 739 redc1 (t, t); /* t1:t0 = (a0*b + k*m) / W < m + W < 1/4 W^2 + W */ 740 741 /* Product of b and high word */ 742 u64arith_mul_1_1_2 (&pl, &ph, a.r[1], b); /* ph:pl < 1/4 W^2 */ 743 u64arith_add_2_2 (&(t[0]), &(t[1]), pl, ph); /* t1:t0 < 1/2 W^2 + W */ 744 745 u64arith_sub_2_2 (&(t[0]), &(t[1]), m[0], m[1]); 746 747 r.r[0] = t[0]; 748 r.r[1] = t[1]; 749 #endif 750 assertValid(r); 751 } 752 753 }; 754 755 756 #endif /* MODREDC126_HPP */ 757