1 #ifndef ARITH_MODP_HPP_ 2 #define ARITH_MODP_HPP_ 3 4 #include <gmp.h> 5 6 #include "gmp-hacks.h" 7 #include "gmp_aux.h" 8 #include "macros.h" 9 #include "memory.h" 10 11 #if defined(HAVE_AVX2) || defined(HAVE_SSSE3) 12 #include <x86intrin.h> 13 #endif 14 15 16 #define xxxDEBUG_INFINITE_LOOPS 17 18 namespace arith_modp { 19 namespace details { 20 /* some of this is provided by <type_traits>, but that is post C++98 21 */ 22 template<bool x> struct is_true {}; 23 template<> struct is_true<true> { typedef int type; }; 24 template<typename T> struct make_signed {}; 25 template<> struct make_signed<unsigned long> { typedef long type; }; 26 template<> struct make_signed<unsigned long long> { typedef long long type; }; 27 typedef make_signed<mp_limb_t>::type signed_mp_limb_t; 28 29 template<int n> struct mpn { 30 static const int alignment = sizeof(mp_limb_t); 31 typedef mpn<n> self; 32 mp_limb_t x[n]; mpnarith_modp::details::mpn33 mpn() { memset(x, 0, n * sizeof(mp_limb_t)); } mpnarith_modp::details::mpn34 mpn(mpn const& a) { memcpy(x, a.x, n * sizeof(mp_limb_t)); } operator =arith_modp::details::mpn35 mpn& operator=(mpn const& a) { 36 memcpy(x, a.x, n * sizeof(mp_limb_t)); 37 return *this; 38 } mpnarith_modp::details::mpn39 mpn(mpz_srcptr a) { MPN_SET_MPZ(x, n, a); } operator =arith_modp::details::mpn40 self& operator=(mpz_srcptr a) { MPN_SET_MPZ(x, n, a); return *this; } zeroarith_modp::details::mpn41 void zero() { memset(x, 0, n * sizeof(mp_limb_t)); } operator mp_limb_t*arith_modp::details::mpn42 operator mp_limb_t * () { return x; } operator const mp_limb_t*arith_modp::details::mpn43 operator const mp_limb_t * () const { return x; } zeroarith_modp::details::mpn44 static void zero(self * x, int N) { 45 // see bug 21663 46 memset(x->x, 0, n * N * sizeof(mp_limb_t)); 47 } copyarith_modp::details::mpn48 static void copy(self * y, const self * x, int N) { 49 // see bug 21663 50 memcpy(y->x, x->x, n * N * sizeof(mp_limb_t)); 51 } operator ==arith_modp::details::mpn52 bool operator==(self const& a) { 53 return memcmp(x, a.x, n * sizeof(mp_limb_t)) == 0; 54 } is_zeroarith_modp::details::mpn55 bool is_zero() const { 56 for(int i = 0 ; i < n ; i++) if (x[i]) return false; 57 return true; 58 } 59 60 /* 61 bool operator<(self const& a) { 62 return memcmp(x, a.x, n * sizeof(mp_limb_t)) < 0; 63 } 64 bool operator>(self const& a) { 65 return memcmp(x, a.x, n * sizeof(mp_limb_t)) > 0; 66 } 67 */ 68 }; 69 70 template<int n_, int extra_, typename T> 71 struct gfp_base { 72 /* This is only example code. The version used is the one which is 73 * specialized for extra == 1. 74 */ 75 static const int n = n_; 76 static const typename is_true<n_>= extra_>::type extra = extra_; 77 78 typedef mpn<n> elt; 79 struct elt_ur: public mpn<n + extra> { operator =arith_modp::details::gfp_base::elt_ur80 elt_ur& operator=(elt const& a) { 81 mpn_copyi(mpn<n + extra>::x, a.x, n); 82 memset(mpn<n + extra>::x + n, 0, extra * sizeof(mp_limb_t)); 83 return *this; 84 } 85 }; 86 struct preinv : public mpn<extra> { int shift; }; 87 propagate_carryarith_modp::details::gfp_base88 static void propagate_carry(mp_limb_t * dst, mp_limb_t cy) { 89 mpn_add_1(dst, dst, extra, cy); 90 } propagate_borrowarith_modp::details::gfp_base91 static void propagate_borrow(mp_limb_t * dst, mp_limb_t cy) { 92 mpn_sub_1(dst, dst, extra, cy); 93 } upperlimbs_are_zeroarith_modp::details::gfp_base94 static bool upperlimbs_are_zero(mp_limb_t * dst) { 95 for(int i = extra ; i-- ; ) 96 if (dst[i]) return false; 97 return true; 98 } 99 is_zeroarith_modp::details::gfp_base100 static inline bool is_zero(elt const & x) { return x.is_zero(); } is_zeroarith_modp::details::gfp_base101 static inline bool is_zero(elt_ur const & x) { return x.is_zero(); } 102 103 stream_storearith_modp::details::gfp_base104 static inline void stream_store(elt * dst, elt const& src) { *dst = src; } addarith_modp::details::gfp_base105 static inline void add(elt_ur & dst, elt const & src) 106 { 107 mp_limb_t cy = mpn_add_n(dst, dst, src, n); 108 T::propagate_carry(dst + n, cy); 109 } 110 subarith_modp::details::gfp_base111 static inline void sub(elt_ur & dst, elt const & src) 112 { 113 mp_limb_t cy = mpn_sub_n(dst, dst, src, n); 114 T::propagate_borrow(dst + n, cy); 115 } 116 add_urarith_modp::details::gfp_base117 static inline void add_ur(elt_ur & dst, elt_ur const & src) 118 { 119 mpn_add_n(dst, dst, src, n + extra); 120 } 121 sub_urarith_modp::details::gfp_base122 static inline void sub_ur(elt_ur & dst, elt_ur const & src) 123 { 124 mpn_sub_n(dst, dst, src, n + extra); 125 } 126 addmul_uiarith_modp::details::gfp_base127 static inline void addmul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) 128 { 129 mp_limb_t cy = mpn_addmul_1(dst, src, n, x); 130 T::propagate_carry(dst + n, cy); 131 } submul_uiarith_modp::details::gfp_base132 static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) 133 { 134 mp_limb_t cy = mpn_submul_1(dst, src, n, x); 135 T::propagate_borrow(dst + n, cy); 136 } 137 138 /* Preinverse for Barrett reduction. See also the code for reduction, 139 * which is further below. 140 * 141 * We want to apply this to reduce a mod p, with the following 142 * constraints. 143 * 144 * 2^(m-1) < p < 2^m 145 * -2^(ell-1)*p <= a < 2^(ell-1)*p 146 * 147 * Let I=floor(2^(m+ell)/p). Because of the bound on p, we have 2^ell 148 * < I < 2^(ell+1), so that 0<J=I-2^ell<2^ell (which actually fits 149 * within ell bits). The preinverse we compute is this J. 150 */ 151 compute_preinvarith_modp::details::gfp_base152 static void compute_preinv(preinv & j, elt const & p) 153 { 154 mpz_t big; 155 mpz_t pz; 156 mpz_init_set_ui(big,1); 157 mpz_init(pz); 158 MPZ_SET_MPN(pz, p, n); 159 size_t m = mpz_sizeinbase(pz, 2); 160 ASSERT_ALWAYS(m <= (size_t) n * mp_bits_per_limb); 161 size_t ell = extra * mp_bits_per_limb; 162 mpz_mul_2exp(big, big, m+ell); 163 mpz_fdiv_q(big, big, pz); 164 ASSERT_ALWAYS(mpz_sizeinbase(big, 2) == (ell + 1)); 165 mpz_fdiv_r_2exp(big, big, ell); 166 MPN_SET_MPZ(j, extra, big); 167 j.shift = (mp_bits_per_limb - m) % mp_bits_per_limb; 168 mpz_clear(big); 169 mpz_clear(pz); 170 } 171 172 /* Signed Barrett reduction (extended from Brent-Zimmermann 2010, 173 * theorem 2.4) 174 */ 175 176 /* input: a = a0 + a1 * 2^m, with 2^(m-1) < p < 2^m 177 * -2^(ell-1)*p <= a < 2^(ell-1)*p 178 * 0 <= a0 < 2^m 179 * which imply in particular: 180 * -2^(ell-1) <= a1 < 2^(ell-1) 181 * 182 * Case a1 >= 0. 183 * 184 * Let q0 = floor(a1*I/2^ell) = floor(a1*J/2^ell) + a1. 185 * We have 0 <= q0 < 2^ell. 186 * 187 * Moreover: q0 <= a1*I/2^ell <= a1*2^m/p <= a/p, so that r0=a-p*q0>=0. 188 * use p*I >= 2^(m+ell)-p and 2^ell*q0 >= a1*I-2^ell 189 * 190 * compute 2^ell*p*q0 >= 2^(m+ell)*a1-a1*p-2^ell*p 191 * >= 2^ell*(a-a0)-p*(a1+2^ell) 192 * > 2^ell*a - 4*2^ell*p 193 * a-p*q0 < 4p 194 * where the third line used a1 < 2^(ell-1) and a0 <= 2^m <= 2*p. 195 * 196 * Case a1 < 0. 197 * 198 * We let b1 = a1 + 2^ell, which is the unsigned limb used to 199 * represent a1. 200 * 201 * Let q0 = floor(a1*I/2^ell) = floor(b1*J/2^ell) + b1 - 2^ell - J. 202 * 203 * Since a1 < 0, we have q0 < 0. With a1 >= -2^(ell-1) and 204 * I<2^(ell+1), we obtaib q0 > -2^ell. Therefore q0 is well 205 * represented by the machine word 206 * q'0 = q0+2^ell = floor(b1*J/2^ell) + b1 - J 207 * 208 * We have p*q0 <= p*a1*I/2^ell < p*a1/2^ell*(2^(m+ell)/p-1) 209 * < a1*2^m - a1/2^ell * p 210 * < p*q+r-a0-a1/2^ell * p 211 * q-q0 > (a0-r)/p + a1/2^ell 212 * > -1.5 since a0>0, r<p, and a1>-2^(ell-1). 213 * >= -1 since q and q0 are integers. 214 * So that q-(q0-1) >= 0. 215 * 216 * Note that because we have -2^ell < q0 < 0, then q0-1 is properly 217 * represented by the unsigned machine word 2^ell-1+q0. 218 * 219 * Also, we have p*q0 >= p*a1*I/2^ell-p 220 * >= a1*2^m-p 221 * >= a-a0-p 222 * a-p*(q0-1) <= a0 + 2p < 4p 223 * 224 * To compute a-p*(q0-1), we actually compute 225 * a-p*(q0+2^ell-1)+2^ell*p, which is a submul_ui followed by one 226 * addition. 227 */ 228 229 /* this reduces a in place, and copies the result to r */ reducearith_modp::details::gfp_base230 static void reduce(elt & r, elt_ur & a, elt const & p, preinv const & j) 231 { 232 mp_limb_t tmp[extra + 1]; 233 if (j.shift) { 234 mpn_lshift(tmp, a + n - 1, extra + 1, j.shift); 235 } else { 236 mpn_copyi(tmp + 1, a + n, extra); 237 } 238 mp_limb_t a1I[2*extra]; 239 mpn_mul_n(a1I, tmp + 1, j, extra); 240 mpn_add_n(a1I + extra, a1I + extra, tmp + 1, extra); 241 mp_limb_t * q0 = a1I + extra; 242 typename make_signed<mp_limb_t>::type sa1 = (tmp+1)[extra-1]; 243 if (sa1 < 0) { 244 mpn_sub_n(q0, q0, j, extra); 245 mpn_sub_1(q0, q0, extra, 1); 246 mpn_add_n(a + extra, a + extra, p, n); 247 } 248 /* emulate a submul_n ; need to do mul first, then sub... */ 249 mp_limb_t scratch[n + extra]; 250 mpn_mul(scratch, p, n, q0, extra); 251 mpn_sub_n(a, a, scratch, n + extra); 252 #if !defined(NDEBUG) && !defined(DEBUG_INFINITE_LOOPS) 253 int spin=0; 254 #endif 255 while (!upperlimbs_are_zero(a + n) || mpn_cmp(a, p, n) >= 0) { 256 T::sub(a, p); 257 /* 258 { 259 mp_limb_t cy = mpn_sub_n(a, a, p, n); 260 propagate_borrow(a + n, cy); 261 } 262 */ 263 #if !defined(NDEBUG) && !defined(DEBUG_INFINITE_LOOPS) 264 spin++; 265 ASSERT_ALWAYS(spin < 4); 266 #endif 267 } 268 mpn_copyi(r, a, n); 269 } 270 }; 271 272 template<int n_, int extra_, typename T> 273 struct gfp_middle: public gfp_base<n_, extra_, T> {}; 274 275 template<int n_, typename T> 276 class gfp_middle<n_, 1, T>: public gfp_base<n_, 1, T> 277 { 278 typedef gfp_base<n_, 1, T> super; 279 public: propagate_carry(mp_limb_t * dst,mp_limb_t cy)280 static inline void propagate_carry(mp_limb_t * dst, mp_limb_t cy) { 281 *dst += cy; 282 } propagate_borrow(mp_limb_t * dst,mp_limb_t cy)283 static inline void propagate_borrow(mp_limb_t * dst, mp_limb_t cy) { 284 *dst -= cy; 285 } upperlimbs_are_zero(mp_limb_t * dst)286 static inline bool upperlimbs_are_zero(mp_limb_t * dst) { 287 return !dst[0]; 288 } 289 290 using super::n; 291 /* We would prefer write "using typename super::elt" below, 292 * however this is only ok with recent c++ */ 293 typedef typename super::elt elt; 294 typedef typename super::elt_ur elt_ur; 295 typedef typename super::preinv preinv; 296 297 /* this reduces a in place, and copies the result to r */ reduce(elt & r,elt_ur & a,elt const & p,preinv const & j)298 static void reduce(elt & r, elt_ur & a, elt const & p, preinv const & j) { 299 mp_limb_t a1 = a[n] << j.shift; 300 if (j.shift) { 301 a1 |= a[n-1] >> (mp_bits_per_limb - j.shift); 302 } 303 signed_mp_limb_t sa1 = a1; 304 mp_limb_t tmp[2]; 305 #ifdef umul_ppmm 306 umul_ppmm(tmp[1], tmp[0], a1, j[0]); 307 #elif defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) 308 __asm__ ("mulq %3" : "=a"(tmp[0]), "=d" (tmp[1]) : "0" (a1), "rm" (j[0])); 309 #else 310 mpn_mul_n(tmp, &a1, j, 1); 311 #endif 312 mp_limb_t q0 = tmp[1] + a1; 313 if (sa1 < 0) { 314 /* see above for the specificities of the negative case */ 315 q0 -= j[0] + 1; 316 mpn_add_n(a + 1, a + 1, p, n); 317 } 318 T::submul_ui(a, p, q0, p, j); 319 /* 320 { 321 mp_limb_t cy = mpn_submul_1(a, p, n, q0); 322 super::propagate_borrow(a + n, cy); 323 } 324 */ 325 #if !defined(NDEBUG) && !defined(DEBUG_INFINITE_LOOPS) 326 int spin=0; 327 #endif 328 while (a[n] || mpn_cmp(a, p, n) >= 0) { 329 T::sub(a, p); 330 /* 331 { 332 mp_limb_t cy = mpn_sub_n(a, a, p, n); 333 super::propagate_borrow(a + n, cy); 334 } 335 */ 336 #if !defined(NDEBUG) && !defined(DEBUG_INFINITE_LOOPS) 337 spin++; 338 ASSERT_ALWAYS(spin < 4); 339 #endif 340 } 341 mpn_copyi(r, a, n); 342 } 343 }; 344 345 template<int n, int extra=1> 346 struct gfp : public gfp_middle<n, extra, gfp<n, extra> > 347 { 348 }; 349 350 351 /* Now for some sizes, we see a clear interest in using auxiliary 352 * vector types. We call these "fast" types. The general compromise 353 * is that we accept types which may be a little wider, but generally 354 * allow for better performance. The specs go typically as follows. 355 * 356 * - conversion to the "fast" types must be done for both operands 357 * (say, source vector as well as destination vector). We don't 358 * intend to go with the same kind of arithmetic that what we do 359 * with elt and elt_ur above, where a "mixed" add/sub function 360 * exists. 361 * 362 * - "fast" types are amenable to vector instructions 363 * 364 * - up to some number of additions or subtractions may be performed 365 * on the fast type before reduction. 366 * 367 * - type may be ambiguous (so comparison entails conversion). 368 * 369 * We have two natural choices: 370 * 371 * - RNS representation 372 * - carry-save (aka nails). 373 * 374 * The specializations below work with nails. The idea is as follows. 375 * For a p spanning three 64-bit words, we spread data into four 376 * 48-bit words in an avx register. Then we can accumulate up to 2^16 377 * of these at little cost. 378 */ 379 380 /* the default version is just making no difference, so that we'll 381 * use the simple elt / elt_ur mechanism */ 382 template<typename T> struct fast_type : public T { }; 383 384 385 #ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM 386 /* Now some specializations */ 387 388 /* {{{ gfp<1,1> */ 389 template<> struct gfp<1, 1> : public gfp_middle<1,1,gfp<1,1> > { addarith_modp::details::gfp390 static inline void add(elt_ur & dst, elt const & src) 391 { 392 asm("# gfp<1, 1>::add\n" 393 "addq %q2, %q0\n" 394 "adcq $0x0, %q1\n" 395 : "+r"(dst[0]), "+r"(dst[1]) 396 : "rm"(src[0]) 397 ); 398 } 399 subarith_modp::details::gfp400 static inline void sub(elt_ur & dst, elt const & src) 401 { 402 asm("# gfp<1, 1>::sub\n" 403 "subq %q2, %q0\n" 404 "sbbq $0x0, %q1\n" 405 : "+r"(dst[0]), "+r"(dst[1]) 406 : "rm"(src[0]) 407 ); 408 } 409 addmul_uiarith_modp::details::gfp410 static inline void addmul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) 411 { 412 mp_limb_t foo, bar; 413 asm("# gfp<1, 1>::addmul_ui\n" 414 "mulq %[mult]\n" 415 "addq %%rax, %[z0]\n" 416 "adcq $0, %%rdx\n" 417 "addq %%rdx, %[z1]\n" 418 : "=a"(foo), "=&d"(bar), [z0]"+rm"(dst[0]), [z1]"+rm"(dst[1]) 419 : "0"(src[0]), [mult]"r1m"(x) 420 ); 421 } submul_uiarith_modp::details::gfp422 static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) 423 { 424 mp_limb_t foo, bar; 425 asm("# gfp<1, 1>::submul_ui\n" 426 "mulq %[mult]\n" 427 "subq %%rax, %[z0]\n" 428 "adcq $0, %%rdx\n" 429 "subq %%rdx, %[z1]\n" 430 : "=a"(foo), "=&d"(bar), [z0]"+rm"(dst[0]), [z1]"+rm"(dst[1]) 431 : "0"(src[0]), [mult]"r1m"(x) 432 ); 433 } 434 }; 435 /* }}} */ 436 437 /* {{{ gfp<2,1> */ 438 template<> struct gfp<2, 1> : public gfp_middle<2,1,gfp<2,1> > { addarith_modp::details::gfp439 static inline void add(elt_ur & dst, elt const & src) { 440 asm("# gfp<2, 1>::add\n" 441 "addq %q[s0], %q[d0]\n" 442 "adcq %q[s1], %q[d1]\n" 443 "adcq $0x0, %q[d2]\n" 444 : [d0]"+rm"(dst[0]), [d1]"+rm"(dst[1]), [d2]"+rm"(dst[2]) 445 : [s0]"r"(src[0]), [s1]"r"(src[1]) 446 ); 447 } 448 subarith_modp::details::gfp449 static inline void sub(elt_ur & dst, elt const & src) { 450 asm("# gfp<2, 1>::sub\n" 451 "subq %q[s0], %q[d0]\n" 452 "sbbq %q[s1], %q[d1]\n" 453 "sbbq $0x0, %q[d2]\n" 454 : [d0]"+rm"(dst[0]), [d1]"+rm"(dst[1]), [d2]"+rm"(dst[2]) 455 : [s0]"r"(src[0]), [s1]"r"(src[1]) 456 ); 457 } 458 addmul_uiarith_modp::details::gfp459 static inline void addmul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) 460 { 461 mp_limb_t foo, bar; 462 asm("# gfp<2, 1>::addmul_ui\n" 463 "mulq %[mult]\n" 464 "addq %%rax, %[z0]\n" 465 "adcq $0, %%rdx\n" 466 "movq %%rdx, %%rcx\n" 467 "movq %[s1], %%rax\n" 468 "mulq %[mult]\n" 469 "addq %%rcx, %%rax\n" 470 "adcq $0, %%rdx\n" 471 "addq %%rax, %[z1]\n" 472 "adcq $0, %%rdx\n" 473 "addq %%rdx, %[z2]\n" 474 : "=&a"(foo), "=&d"(bar), 475 [z0]"+rm"(dst[0]), 476 [z1]"+rm"(dst[1]), 477 [z2]"+rm"(dst[2]) 478 : [s0]"0"(src[0]), [s1]"rm"(src[1]), [mult]"rm"(x) 479 : "rcx" 480 ); 481 } 482 submul_uiarith_modp::details::gfp483 static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) { 484 mp_limb_t foo, bar; 485 asm("# gfp<2, 1>::submul_ui\n" 486 "mulq %[mult]\n" 487 "subq %%rax, %[z0]\n" 488 "adcq $0, %%rdx\n" 489 "movq %%rdx, %%rcx\n" 490 "movq %[s1], %%rax\n" 491 "mulq %[mult]\n" 492 "addq %%rcx, %%rax\n" 493 "adcq $0, %%rdx\n" 494 "subq %%rax, %[z1]\n" 495 "adcq $0, %%rdx\n" 496 "subq %%rdx, %[z2]\n" 497 : "=&a"(foo), "=&d"(bar), 498 [z0]"+rm"(dst[0]), 499 [z1]"+rm"(dst[1]), 500 [z2]"+rm"(dst[2]) 501 : [s0]"0"(src[0]), [s1]"rm"(src[1]), [mult]"rm"(x) 502 : "rcx" 503 ); 504 } 505 }; 506 /* }}} */ 507 508 /* {{{ macros for assembly for further specializations */ 509 #define FEED_IN_WITH_S0_IN_RAX(in1, r0, r1) \ 510 /* status: s0 in rax */ \ 511 "mulq %[mult]\n" /* rdx:rax = s0 * v */ \ 512 "movq %%rax, %%" #r0 "\n" /* lo contrib to d1 */ \ 513 "movq " in1 ", %%rax\n" /* load s1 */ \ 514 "movq %%rdx, %%" #r1 "\n" /* hi contrib to d1 */ 515 #define FEED_IN(in0, in1, r0, r1) \ 516 "movq " in0 ", %%rax\n" \ 517 FEED_IN_WITH_S0_IN_RAX(in1, r0, r1) 518 #define INNER_MUL(op, out, in, r0, r1, r2) \ 519 /* status: r1:r0 to be added to d_{i+1}:d_i, rax = s_{i+1} */ \ 520 "xorq %%" #r2 ", %%" #r2 "\n" \ 521 "mulq %[mult]\n" /* rdx:rax = s_{i+1} * v */ \ 522 "" #op "q %%" #r0 ", " out "\n" /* store d_i */ \ 523 "adcq %%rax, %%" #r1 "\n" /* lo contrib to d_{i+1} */ \ 524 "adcq %%rdx, %%" #r2 "\n" /* hi contrib to d_{i+2} */ \ 525 "movq " in ", %%rax\n" /* load s_{i+2} */ 526 #define FINISH(op, opc, out0, out1, out2, r0, r1) \ 527 /* r1:r0 to be added to d_{i+1}:d_i ; rax = s_{i+2} */ \ 528 "mulq %[mult]\n" \ 529 "" #op "q %%" #r0 ", " out0 "\n" \ 530 "adcq %%rax, %%" #r1 "\n" \ 531 "adcq $0x0, %%rdx\n" \ 532 "" #op "q %%" #r1 ", " out1 "\n" \ 533 "" #opc "q %%rdx, " out2 "\n" 534 /* }}} */ 535 /* {{{ this macro actually exposes the specialization in itself */ 536 #define EXPOSE_SPECIALIZATION(n) \ 537 template<> struct gfp<n, 1> : public gfp_middle<n,1,gfp<n,1> > { \ 538 using gfp_middle<n,1,gfp<n,1> >::add_ur; \ 539 using gfp_middle<n,1,gfp<n,1> >::sub_ur; \ 540 static inline void add(elt_ur & dst, elt const & src) { \ 541 asm("# gfp<" #n ", 1>::add\n" \ 542 ADDSUB_CODE ## n(add, adc) \ 543 ); \ 544 } \ 545 static inline void sub(elt_ur & dst, elt const & src) { \ 546 asm("# gfp<" #n ", 1>::sub\n" \ 547 ADDSUB_CODE ## n(sub, sbb) \ 548 ); \ 549 } \ 550 static inline void addmul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) {\ 551 mp_limb_t foo MAYBE_UNUSED; \ 552 asm ("# gfp<" #n ", 1>::addmul_ui\n" \ 553 ADDSUBMUL_CODE ## n(add, adc) \ 554 ); \ 555 } \ 556 static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) {\ 557 mp_limb_t foo MAYBE_UNUSED; \ 558 asm("# gfp<" #n ", 1>::submul_ui\n" \ 559 ADDSUBMUL_CODE ## n(sub, sbb) \ 560 ); \ 561 } \ 562 } 563 /* }}} */ 564 565 /* {{{ code for gfp<3, 1> */ 566 #define ADDSUBMUL_CODE3(op, opc) \ 567 FEED_IN_WITH_S0_IN_RAX("%[s1]", r8, r9) \ 568 INNER_MUL(op, "%[z0]", "%[s2]", r8, r9, r10) \ 569 FINISH(op, opc, "%[z1]", "%[z2]", "%[z3]", r9, r10) \ 570 : "=&a"(foo), \ 571 [z0]"+rm"(dst[0]), \ 572 [z1]"+rm"(dst[1]), \ 573 [z2]"+rm"(dst[2]), \ 574 [z3]"+rm"(dst[3]) \ 575 : \ 576 [s0]"0"(src[0]), \ 577 [s1]"rm"(src[1]), \ 578 [s2]"rm"(src[2]), \ 579 [mult]"rm"(x) \ 580 : "r8", "r9", "r10", "rdx" 581 582 #define ADDSUB_CODE3(op, opc) \ 583 "" #op "q %q[s0], %q[d0]\n" \ 584 "" #opc "q %q[s1], %q[d1]\n" \ 585 "" #opc "q %q[s2], %q[d2]\n" \ 586 "" #opc "q $0x0, %q[d3]\n" \ 587 : \ 588 [d0]"+rm"(dst[0]), \ 589 [d1]"+rm"(dst[1]), \ 590 [d2]"+rm"(dst[2]), \ 591 [d3]"+rm"(dst[3]) \ 592 : \ 593 [s0]"r"(src[0]), \ 594 [s1]"r"(src[1]), \ 595 [s2]"r"(src[2]) 596 597 EXPOSE_SPECIALIZATION(3); 598 /* }}} */ 599 600 /* {{{ code for gfp<4, 1> */ 601 /* 602 #define ADDSUBMUL_CODE4(op, opc) \ 603 FEED_IN_WITH_S0_IN_RAX("%[s1]", r8, r9) \ 604 INNER_MUL(op, "%[z0]", "%[s2]", r8, r9, r10) \ 605 INNER_MUL(op, "%[z1]", "%[s3]", r9, r10, r11) \ 606 FINISH(op, opc, "%[z2]", "%[z3]", "%[z4]", r10, r11) \ 607 : "=&a"(foo), \ 608 [z0]"+rm"(dst[0]), \ 609 [z1]"+rm"(dst[1]), \ 610 [z2]"+rm"(dst[2]), \ 611 [z3]"+rm"(dst[3]), \ 612 [z4]"+rm"(dst[4]) \ 613 : \ 614 [s0]"0"(src[0]), \ 615 [s1]"rm"(src[1]), \ 616 [s2]"rm"(src[2]), \ 617 [s3]"rm"(src[3]), \ 618 [mult]"rm"(x) \ 619 : "r8", "r9", "r10", "r11", "rdx" 620 621 #define xADDSUB_CODE4(op, opc) \ 622 "" #op "q %q[s0], (%[z])\n" \ 623 "" #opc "q %q[s1], 0x8(%[z])\n" \ 624 "" #opc "q %q[s2], 0x10(%[z])\n" \ 625 "" #opc "q %q[s3], 0x18(%[z])\n" \ 626 "" #opc "q $0x0, 0x20(%[z])\n" \ 627 : \ 628 : \ 629 [z]"r"(&dst[0]), \ 630 [s0]"r"(src[0]), \ 631 [s1]"r"(src[1]), \ 632 [s2]"r"(src[2]), \ 633 [s3]"r"(src[3]) \ 634 : "memory" 635 636 637 */ 638 #define ADDSUB_CODE4(op, opc) \ 639 "" #op "q %q[s0], %q[d0]\n" \ 640 "" #opc "q %q[s1], %q[d1]\n" \ 641 "" #opc "q %q[s2], %q[d2]\n" \ 642 "" #opc "q %q[s3], %q[d3]\n" \ 643 "" #opc "q $0x0, %q[d4]\n" \ 644 : \ 645 [d0]"+rm"(dst[0]), \ 646 [d1]"+rm"(dst[1]), \ 647 [d2]"+rm"(dst[2]), \ 648 [d3]"+rm"(dst[3]), \ 649 [d4]"+rm"(dst[4]) \ 650 : \ 651 [s0]"r"(src[0]), \ 652 [s1]"r"(src[1]), \ 653 [s2]"r"(src[2]), \ 654 [s3]"r"(src[3]) 655 656 657 #define ADDSUBMUL_CODE4(op, opc) \ 658 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9) \ 659 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10) \ 660 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11) \ 661 FINISH(op, opc, \ 662 "0x10(%[z])", "0x18(%[z])", "0x20(%[z])", \ 663 r10, r11) \ 664 : \ 665 : \ 666 [z]"D"(&dst[0]), \ 667 [s]"S"(&src[0]), \ 668 [mult]"rm"(x) \ 669 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory" 670 671 EXPOSE_SPECIALIZATION(4); 672 /* }}} */ 673 674 /* {{{ code for gfp<5, 1> */ 675 676 #define ADDSUBMUL_CODE5(op, opc) \ 677 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9) \ 678 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10) \ 679 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11) \ 680 INNER_MUL(op, "0x10(%[z])", "0x20(%[s])", r10, r11, r8) \ 681 FINISH(op, opc, \ 682 "0x18(%[z])", "0x20(%[z])", "0x28(%[z])", \ 683 r11, r8) \ 684 : \ 685 : \ 686 [z]"D"(&dst[0]), \ 687 [s]"S"(&src[0]), \ 688 [mult]"rm"(x) \ 689 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory" 690 691 #define ADDSUB_CODE5(op, opc) \ 692 "" #op "q %q[s0], (%[z])\n" \ 693 "" #opc "q %q[s1], 0x8(%[z])\n" \ 694 "" #opc "q %q[s2], 0x10(%[z])\n" \ 695 "" #opc "q %q[s3], 0x18(%[z])\n" \ 696 "" #opc "q %q[s4], 0x20(%[z])\n" \ 697 "" #opc "q $0x0, 0x28(%[z])\n" \ 698 : \ 699 : \ 700 [z]"r"(&dst[0]), \ 701 [s0]"r"(src[0]), \ 702 [s1]"r"(src[1]), \ 703 [s2]"r"(src[2]), \ 704 [s3]"r"(src[3]), \ 705 [s4]"r"(src[4]) \ 706 : "memory" 707 708 EXPOSE_SPECIALIZATION(5); 709 /* }}} */ 710 711 /* {{{ code for gfp<6, 1> */ 712 713 #define ADDSUBMUL_CODE6(op, opc) \ 714 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9) \ 715 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10) \ 716 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11) \ 717 INNER_MUL(op, "0x10(%[z])", "0x20(%[s])", r10, r11, r8) \ 718 INNER_MUL(op, "0x18(%[z])", "0x28(%[s])", r11, r8, r9) \ 719 FINISH(op, opc, \ 720 "0x20(%[z])", "0x28(%[z])", "0x30(%[z])", \ 721 r8, r9) \ 722 : \ 723 : \ 724 [z]"D"(&dst[0]), \ 725 [s]"S"(&src[0]), \ 726 [mult]"rm"(x) \ 727 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory" 728 729 #define ADDSUB_CODE6(op, opc) \ 730 "" #op "q %q[s0], (%[z])\n" \ 731 "" #opc "q %q[s1], 0x8(%[z])\n" \ 732 "" #opc "q %q[s2], 0x10(%[z])\n" \ 733 "" #opc "q %q[s3], 0x18(%[z])\n" \ 734 "" #opc "q %q[s4], 0x20(%[z])\n" \ 735 "" #opc "q %q[s5], 0x28(%[z])\n" \ 736 "" #opc "q $0x0, 0x30(%[z])\n" \ 737 : \ 738 : \ 739 [z]"r"(&dst[0]), \ 740 [s0]"r"(src[0]), \ 741 [s1]"r"(src[1]), \ 742 [s2]"r"(src[2]), \ 743 [s3]"r"(src[3]), \ 744 [s4]"r"(src[4]), \ 745 [s5]"r"(src[5]) \ 746 : "memory" 747 748 EXPOSE_SPECIALIZATION(6); 749 /* }}} */ 750 751 /* {{{ code for gfp<7, 1> */ 752 #define ADDSUBMUL_CODE7(op, opc) \ 753 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9) \ 754 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10) \ 755 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11) \ 756 INNER_MUL(op, "0x10(%[z])", "0x20(%[s])", r10, r11, r8) \ 757 INNER_MUL(op, "0x18(%[z])", "0x28(%[s])", r11, r8, r9) \ 758 INNER_MUL(op, "0x20(%[z])", "0x30(%[s])", r8, r9, r10) \ 759 FINISH(op, opc, \ 760 "0x28(%[z])", "0x30(%[z])", "0x38(%[z])", \ 761 r9, r10) \ 762 : \ 763 : \ 764 [z]"D"(&dst[0]), \ 765 [s]"S"(&src[0]), \ 766 [mult]"rm"(x) \ 767 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory" 768 769 #define ADDSUB_CODE7(op, opc) \ 770 "" #op "q %q[s0], (%[z])\n" \ 771 "" #opc "q %q[s1], 0x8(%[z])\n" \ 772 "" #opc "q %q[s2], 0x10(%[z])\n" \ 773 "" #opc "q %q[s3], 0x18(%[z])\n" \ 774 "" #opc "q %q[s4], 0x20(%[z])\n" \ 775 "" #opc "q %q[s5], 0x28(%[z])\n" \ 776 "" #opc "q %q[s6], 0x30(%[z])\n" \ 777 "" #opc "q $0x0, 0x38(%[z])\n" \ 778 : \ 779 : \ 780 [z]"r"(&dst[0]), \ 781 [s0]"r"(src[0]), \ 782 [s1]"r"(src[1]), \ 783 [s2]"r"(src[2]), \ 784 [s3]"r"(src[3]), \ 785 [s4]"r"(src[4]), \ 786 [s5]"r"(src[5]), \ 787 [s6]"r"(src[6]) \ 788 : "memory" 789 790 EXPOSE_SPECIALIZATION(7); 791 /* }}} */ 792 793 /* {{{ code for gfp<8, 1> */ 794 #define ADDSUBMUL_CODE8(op, opc) \ 795 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9) \ 796 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10) \ 797 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11) \ 798 INNER_MUL(op, "0x10(%[z])", "0x20(%[s])", r10, r11, r8) \ 799 INNER_MUL(op, "0x18(%[z])", "0x28(%[s])", r11, r8, r9) \ 800 INNER_MUL(op, "0x20(%[z])", "0x30(%[s])", r8, r9, r10) \ 801 INNER_MUL(op, "0x28(%[z])", "0x38(%[s])", r9, r10, r11) \ 802 FINISH(op, opc, \ 803 "0x30(%[z])", "0x38(%[z])", "0x40(%[z])", \ 804 r10, r11) \ 805 : \ 806 : \ 807 [z]"D"(&dst[0]), \ 808 [s]"S"(&src[0]), \ 809 [mult]"rm"(x) \ 810 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory" 811 812 #define ADDSUB_CODE8(op, opc) \ 813 "" #op "q %q[s0], (%[z])\n" \ 814 "" #opc "q %q[s1], 0x8(%[z])\n" \ 815 "" #opc "q %q[s2], 0x10(%[z])\n" \ 816 "" #opc "q %q[s3], 0x18(%[z])\n" \ 817 "" #opc "q %q[s4], 0x20(%[z])\n" \ 818 "" #opc "q %q[s5], 0x28(%[z])\n" \ 819 "" #opc "q %q[s6], 0x30(%[z])\n" \ 820 "" #opc "q %q[s7], 0x38(%[z])\n" \ 821 "" #opc "q $0x0, 0x40(%[z])\n" \ 822 : \ 823 : \ 824 [z]"r"(&dst[0]), \ 825 [s0]"r"(src[0]), \ 826 [s1]"r"(src[1]), \ 827 [s2]"r"(src[2]), \ 828 [s3]"r"(src[3]), \ 829 [s4]"r"(src[4]), \ 830 [s5]"r"(src[5]), \ 831 [s6]"r"(src[6]), \ 832 [s7]"r"(src[7]) \ 833 : "memory" 834 835 EXPOSE_SPECIALIZATION(8); 836 /* }}} */ 837 838 /* further specialization only seem to bring very marginal 839 * improvements. */ 840 841 /* AVX/SSE code is a choice which is mostly unrelated to the C++ 842 * dialect we use, except for a fine point below. We use a union to 843 * protect an off-bounds write. And pre-C++11 does not really like 844 * unions with contructors for members. Given that anyway we have 845 * little use for this code at the moment, since it is suboptimal, 846 * let's protect it with HAVE_CXX11 847 */ 848 #ifdef HAVE_CXX11 849 #if defined(HAVE_AVX2) || defined(HAVE_SSSE3) 850 template<> struct fast_type<gfp<3, 1> > { 851 typedef gfp<3, 1> super; 852 struct elt; 853 typedef elt elt_ur; 854 struct elt { 855 #ifdef HAVE_AVX2 856 static const int alignment = 32; 857 #else 858 static const int alignment = 16; 859 #endif 860 typedef elt self; 861 #ifdef HAVE_AVX2 862 __m256i data[1]; 863 #else 864 __m128i data[2]; 865 #endif eltarith_modp::details::fast_type::elt866 elt() { zero(); } 867 elt(elt const& a) = default; 868 elt& operator=(elt const& a) = default; 869 870 /* we do not construct (nor affect) from mpz, because we're not 871 * positional */ zeroarith_modp::details::fast_type::elt872 void zero() { 873 #ifdef HAVE_AVX2 874 data[0] = _mm256_setzero_si256(); 875 #else 876 data[0] = _mm_setzero_si128(); 877 data[1] = _mm_setzero_si128(); 878 #endif 879 } zeroarith_modp::details::fast_type::elt880 static void zero(elt * x, int N) { 881 // see bug 21663 882 memset(x->data, 0, N * sizeof(data)); 883 } copyarith_modp::details::fast_type::elt884 static void copy(elt * y, const elt * x, int N) { 885 // see bug 21663 886 memcpy(y->data, x->data, N * sizeof(data)); 887 } operator ==arith_modp::details::fast_type::elt888 bool operator==(elt const& a) { 889 return memcmp(data, a.data, sizeof(data)) == 0; 890 } eltarith_modp::details::fast_type::elt891 elt(super::elt const& a) { 892 convert(*this, a); 893 } 894 operator super::elt_urarith_modp::details::fast_type::elt895 operator super::elt_ur() const { 896 super::elt_ur carries(conv_backend_get_carries(*this)); 897 super::add(carries, conv_backend_get_main(*this)); 898 return carries; 899 } 900 901 /* same, but we assume carry is zero */ operator super::eltarith_modp::details::fast_type::elt902 operator super::elt() const { 903 ASSERT(conv_backend_get_carries(*this).is_zero()); 904 return conv_backend_get_main(*this); 905 } 906 }; 907 stream_storearith_modp::details::fast_type908 static inline void stream_store(elt * dst, elt const& src) { 909 /* Do we want to stream that or not ? In fact it's slower 910 * when streaming... */ 911 #if 0 912 #ifdef HAVE_AVX2 913 _mm256_stream_si256(dst->data+0, src.data[0]); 914 #else 915 _mm_stream_si128(dst->data+0, src.data[0]); 916 _mm_stream_si128(dst->data+1, src.data[1]); 917 #endif 918 #else 919 #ifdef HAVE_AVX2 920 _mm256_storeu_si256(dst->data+0, src.data[0]); 921 #else 922 _mm_storeu_si128(dst->data+0, src.data[0]); 923 _mm_storeu_si128(dst->data+1, src.data[1]); 924 #endif 925 #endif 926 } addarith_modp::details::fast_type927 static inline void add(elt & dst, elt const & src) 928 { 929 #ifdef HAVE_AVX2 930 dst.data[0] = _mm256_add_epi64 (dst.data[0], src.data[0]); 931 #else 932 dst.data[0] = _mm_add_epi64 (dst.data[0], src.data[0]); 933 dst.data[1] = _mm_add_epi64 (dst.data[1], src.data[1]); 934 #endif 935 } 936 subarith_modp::details::fast_type937 static inline void sub(elt & dst, elt const & src) 938 { 939 #ifdef HAVE_AVX2 940 dst.data[0] = _mm256_sub_epi64 (dst.data[0], src.data[0]); 941 #else 942 dst.data[0] = _mm_sub_epi64 (dst.data[0], src.data[0]); 943 dst.data[1] = _mm_sub_epi64 (dst.data[1], src.data[1]); 944 #endif 945 } 946 sub_urarith_modp::details::fast_type947 static inline void sub_ur(elt_ur & dst, elt_ur const & src) 948 { 949 #ifdef HAVE_AVX2 950 dst.data[0] = _mm256_sub_epi64 (dst.data[0], src.data[0]); 951 #else 952 dst.data[0] = _mm_sub_epi64 (dst.data[0], src.data[0]); 953 dst.data[1] = _mm_sub_epi64 (dst.data[1], src.data[1]); 954 #endif 955 } 956 957 /* conversions are done as a combination of blend & shuffle */ 958 959 #ifdef HAVE_AVX2 960 /* We grok only values for w_i which are integer immediates 961 * within {-1} \cup {0..15} 962 */ 963 #define shuffle_16bit_words(out, in, \ 964 w0, w1, w2, w3, \ 965 w4, w5, w6, w7, \ 966 w8, w9, wa, wb, \ 967 wc, wd, we, wf) \ 968 do { \ 969 *out = _mm256_xor_si256( \ 970 _mm256_shuffle_epi8( \ 971 in, \ 972 _mm256_setr_epi8( \ 973 (w0 < 0) ? -1 : ((w0 < 8) ? 2*(w0&7) + 0 : -1), \ 974 (w0 < 0) ? -1 : ((w0 < 8) ? 2*(w0&7) + 1 : -1), \ 975 (w1 < 0) ? -1 : ((w1 < 8) ? 2*(w1&7) + 0 : -1), \ 976 (w1 < 0) ? -1 : ((w1 < 8) ? 2*(w1&7) + 1 : -1), \ 977 (w2 < 0) ? -1 : ((w2 < 8) ? 2*(w2&7) + 0 : -1), \ 978 (w2 < 0) ? -1 : ((w2 < 8) ? 2*(w2&7) + 1 : -1), \ 979 (w3 < 0) ? -1 : ((w3 < 8) ? 2*(w3&7) + 0 : -1), \ 980 (w3 < 0) ? -1 : ((w3 < 8) ? 2*(w3&7) + 1 : -1), \ 981 (w4 < 0) ? -1 : ((w4 < 8) ? 2*(w4&7) + 0 : -1), \ 982 (w4 < 0) ? -1 : ((w4 < 8) ? 2*(w4&7) + 1 : -1), \ 983 (w5 < 0) ? -1 : ((w5 < 8) ? 2*(w5&7) + 0 : -1), \ 984 (w5 < 0) ? -1 : ((w5 < 8) ? 2*(w5&7) + 1 : -1), \ 985 (w6 < 0) ? -1 : ((w6 < 8) ? 2*(w6&7) + 0 : -1), \ 986 (w6 < 0) ? -1 : ((w6 < 8) ? 2*(w6&7) + 1 : -1), \ 987 (w7 < 0) ? -1 : ((w7 < 8) ? 2*(w7&7) + 0 : -1), \ 988 (w7 < 0) ? -1 : ((w7 < 8) ? 2*(w7&7) + 1 : -1), \ 989 (w8 < 0) ? -1 : ((w8 >= 8) ? 2*(w8&7) + 0 : -1), \ 990 (w8 < 0) ? -1 : ((w8 >= 8) ? 2*(w8&7) + 1 : -1), \ 991 (w9 < 0) ? -1 : ((w9 >= 8) ? 2*(w9&7) + 0 : -1), \ 992 (w9 < 0) ? -1 : ((w9 >= 8) ? 2*(w9&7) + 1 : -1), \ 993 (wa < 0) ? -1 : ((wa >= 8) ? 2*(wa&7) + 0 : -1), \ 994 (wa < 0) ? -1 : ((wa >= 8) ? 2*(wa&7) + 1 : -1), \ 995 (wb < 0) ? -1 : ((wb >= 8) ? 2*(wb&7) + 0 : -1), \ 996 (wb < 0) ? -1 : ((wb >= 8) ? 2*(wb&7) + 1 : -1), \ 997 (wc < 0) ? -1 : ((wc >= 8) ? 2*(wc&7) + 0 : -1), \ 998 (wc < 0) ? -1 : ((wc >= 8) ? 2*(wc&7) + 1 : -1), \ 999 (wd < 0) ? -1 : ((wd >= 8) ? 2*(wd&7) + 0 : -1), \ 1000 (wd < 0) ? -1 : ((wd >= 8) ? 2*(wd&7) + 1 : -1), \ 1001 (we < 0) ? -1 : ((we >= 8) ? 2*(we&7) + 0 : -1), \ 1002 (we < 0) ? -1 : ((we >= 8) ? 2*(we&7) + 1 : -1), \ 1003 (wf < 0) ? -1 : ((wf >= 8) ? 2*(wf&7) + 0 : -1), \ 1004 (wf < 0) ? -1 : ((wf >= 8) ? 2*(wf&7) + 1 : -1))), \ 1005 _mm256_shuffle_epi8( \ 1006 /* 0x4E is 0b01001110 aka (1,0,3,2) */ \ 1007 _mm256_permute4x64_epi64 (in, _MM_SHUFFLE(1,0,3,2)), \ 1008 _mm256_setr_epi8( \ 1009 (w0 < 0) ? -1 : ((w0 >= 8) ? 2*(w0&7) + 0 : -1), \ 1010 (w0 < 0) ? -1 : ((w0 >= 8) ? 2*(w0&7) + 1 : -1), \ 1011 (w1 < 0) ? -1 : ((w1 >= 8) ? 2*(w1&7) + 0 : -1), \ 1012 (w1 < 0) ? -1 : ((w1 >= 8) ? 2*(w1&7) + 1 : -1), \ 1013 (w2 < 0) ? -1 : ((w2 >= 8) ? 2*(w2&7) + 0 : -1), \ 1014 (w2 < 0) ? -1 : ((w2 >= 8) ? 2*(w2&7) + 1 : -1), \ 1015 (w3 < 0) ? -1 : ((w3 >= 8) ? 2*(w3&7) + 0 : -1), \ 1016 (w3 < 0) ? -1 : ((w3 >= 8) ? 2*(w3&7) + 1 : -1), \ 1017 (w4 < 0) ? -1 : ((w4 >= 8) ? 2*(w4&7) + 0 : -1), \ 1018 (w4 < 0) ? -1 : ((w4 >= 8) ? 2*(w4&7) + 1 : -1), \ 1019 (w5 < 0) ? -1 : ((w5 >= 8) ? 2*(w5&7) + 0 : -1), \ 1020 (w5 < 0) ? -1 : ((w5 >= 8) ? 2*(w5&7) + 1 : -1), \ 1021 (w6 < 0) ? -1 : ((w6 >= 8) ? 2*(w6&7) + 0 : -1), \ 1022 (w6 < 0) ? -1 : ((w6 >= 8) ? 2*(w6&7) + 1 : -1), \ 1023 (w7 < 0) ? -1 : ((w7 >= 8) ? 2*(w7&7) + 0 : -1), \ 1024 (w7 < 0) ? -1 : ((w7 >= 8) ? 2*(w7&7) + 1 : -1), \ 1025 (w8 < 0) ? -1 : ((w8 < 8) ? 2*(w8&7) + 0 : -1), \ 1026 (w8 < 0) ? -1 : ((w8 < 8) ? 2*(w8&7) + 1 : -1), \ 1027 (w9 < 0) ? -1 : ((w9 < 8) ? 2*(w9&7) + 0 : -1), \ 1028 (w9 < 0) ? -1 : ((w9 < 8) ? 2*(w9&7) + 1 : -1), \ 1029 (wa < 0) ? -1 : ((wa < 8) ? 2*(wa&7) + 0 : -1), \ 1030 (wa < 0) ? -1 : ((wa < 8) ? 2*(wa&7) + 1 : -1), \ 1031 (wb < 0) ? -1 : ((wb < 8) ? 2*(wb&7) + 0 : -1), \ 1032 (wb < 0) ? -1 : ((wb < 8) ? 2*(wb&7) + 1 : -1), \ 1033 (wc < 0) ? -1 : ((wc < 8) ? 2*(wc&7) + 0 : -1), \ 1034 (wc < 0) ? -1 : ((wc < 8) ? 2*(wc&7) + 1 : -1), \ 1035 (wd < 0) ? -1 : ((wd < 8) ? 2*(wd&7) + 0 : -1), \ 1036 (wd < 0) ? -1 : ((wd < 8) ? 2*(wd&7) + 1 : -1), \ 1037 (we < 0) ? -1 : ((we < 8) ? 2*(we&7) + 0 : -1), \ 1038 (we < 0) ? -1 : ((we < 8) ? 2*(we&7) + 1 : -1), \ 1039 (wf < 0) ? -1 : ((wf < 8) ? 2*(wf&7) + 0 : -1), \ 1040 (wf < 0) ? -1 : ((wf < 8) ? 2*(wf&7) + 1 : -1))) \ 1041 ); \ 1042 } while (0) 1043 #else 1044 #define shuffle_16bit_words(out, lo, hi, \ 1045 w0, w1, w2, w3, \ 1046 w4, w5, w6, w7, \ 1047 w8, w9, wa, wb, \ 1048 wc, wd, we, wf) \ 1049 do { \ 1050 out[0] = _mm_xor_si128( \ 1051 _mm_shuffle_epi8(lo, _mm_setr_epi8( \ 1052 (w0 < 0) ? -1 : ((w0 < 8) ? 2*(w0&7) + 0 : -1), \ 1053 (w0 < 0) ? -1 : ((w0 < 8) ? 2*(w0&7) + 1 : -1), \ 1054 (w1 < 0) ? -1 : ((w1 < 8) ? 2*(w1&7) + 0 : -1), \ 1055 (w1 < 0) ? -1 : ((w1 < 8) ? 2*(w1&7) + 1 : -1), \ 1056 (w2 < 0) ? -1 : ((w2 < 8) ? 2*(w2&7) + 0 : -1), \ 1057 (w2 < 0) ? -1 : ((w2 < 8) ? 2*(w2&7) + 1 : -1), \ 1058 (w3 < 0) ? -1 : ((w3 < 8) ? 2*(w3&7) + 0 : -1), \ 1059 (w3 < 0) ? -1 : ((w3 < 8) ? 2*(w3&7) + 1 : -1), \ 1060 (w4 < 0) ? -1 : ((w4 < 8) ? 2*(w4&7) + 0 : -1), \ 1061 (w4 < 0) ? -1 : ((w4 < 8) ? 2*(w4&7) + 1 : -1), \ 1062 (w5 < 0) ? -1 : ((w5 < 8) ? 2*(w5&7) + 0 : -1), \ 1063 (w5 < 0) ? -1 : ((w5 < 8) ? 2*(w5&7) + 1 : -1), \ 1064 (w6 < 0) ? -1 : ((w6 < 8) ? 2*(w6&7) + 0 : -1), \ 1065 (w6 < 0) ? -1 : ((w6 < 8) ? 2*(w6&7) + 1 : -1), \ 1066 (w7 < 0) ? -1 : ((w7 < 8) ? 2*(w7&7) + 0 : -1), \ 1067 (w7 < 0) ? -1 : ((w7 < 8) ? 2*(w7&7) + 1 : -1))), \ 1068 _mm_shuffle_epi8(hi, _mm_setr_epi8( \ 1069 (w0 < 0) ? -1 : ((w0 >= 8) ? 2*(w0&7) + 0 : -1), \ 1070 (w0 < 0) ? -1 : ((w0 >= 8) ? 2*(w0&7) + 1 : -1), \ 1071 (w1 < 0) ? -1 : ((w1 >= 8) ? 2*(w1&7) + 0 : -1), \ 1072 (w1 < 0) ? -1 : ((w1 >= 8) ? 2*(w1&7) + 1 : -1), \ 1073 (w2 < 0) ? -1 : ((w2 >= 8) ? 2*(w2&7) + 0 : -1), \ 1074 (w2 < 0) ? -1 : ((w2 >= 8) ? 2*(w2&7) + 1 : -1), \ 1075 (w3 < 0) ? -1 : ((w3 >= 8) ? 2*(w3&7) + 0 : -1), \ 1076 (w3 < 0) ? -1 : ((w3 >= 8) ? 2*(w3&7) + 1 : -1), \ 1077 (w4 < 0) ? -1 : ((w4 >= 8) ? 2*(w4&7) + 0 : -1), \ 1078 (w4 < 0) ? -1 : ((w4 >= 8) ? 2*(w4&7) + 1 : -1), \ 1079 (w5 < 0) ? -1 : ((w5 >= 8) ? 2*(w5&7) + 0 : -1), \ 1080 (w5 < 0) ? -1 : ((w5 >= 8) ? 2*(w5&7) + 1 : -1), \ 1081 (w6 < 0) ? -1 : ((w6 >= 8) ? 2*(w6&7) + 0 : -1), \ 1082 (w6 < 0) ? -1 : ((w6 >= 8) ? 2*(w6&7) + 1 : -1), \ 1083 (w7 < 0) ? -1 : ((w7 >= 8) ? 2*(w7&7) + 0 : -1), \ 1084 (w7 < 0) ? -1 : ((w7 >= 8) ? 2*(w7&7) + 1 : -1)))); \ 1085 out[1] = _mm_xor_si128( \ 1086 _mm_shuffle_epi8(lo, _mm_setr_epi8( \ 1087 (w8 < 0) ? -1 : ((w8 < 8) ? 2*(w8&7) + 0 : -1), \ 1088 (w8 < 0) ? -1 : ((w8 < 8) ? 2*(w8&7) + 1 : -1), \ 1089 (w9 < 0) ? -1 : ((w9 < 8) ? 2*(w9&7) + 0 : -1), \ 1090 (w9 < 0) ? -1 : ((w9 < 8) ? 2*(w9&7) + 1 : -1), \ 1091 (wa < 0) ? -1 : ((wa < 8) ? 2*(wa&7) + 0 : -1), \ 1092 (wa < 0) ? -1 : ((wa < 8) ? 2*(wa&7) + 1 : -1), \ 1093 (wb < 0) ? -1 : ((wb < 8) ? 2*(wb&7) + 0 : -1), \ 1094 (wb < 0) ? -1 : ((wb < 8) ? 2*(wb&7) + 1 : -1), \ 1095 (wc < 0) ? -1 : ((wc < 8) ? 2*(wc&7) + 0 : -1), \ 1096 (wc < 0) ? -1 : ((wc < 8) ? 2*(wc&7) + 1 : -1), \ 1097 (wd < 0) ? -1 : ((wd < 8) ? 2*(wd&7) + 0 : -1), \ 1098 (wd < 0) ? -1 : ((wd < 8) ? 2*(wd&7) + 1 : -1), \ 1099 (we < 0) ? -1 : ((we < 8) ? 2*(we&7) + 0 : -1), \ 1100 (we < 0) ? -1 : ((we < 8) ? 2*(we&7) + 1 : -1), \ 1101 (wf < 0) ? -1 : ((wf < 8) ? 2*(wf&7) + 0 : -1), \ 1102 (wf < 0) ? -1 : ((wf < 8) ? 2*(wf&7) + 1 : -1))), \ 1103 _mm_shuffle_epi8(hi, _mm_setr_epi8( \ 1104 (w8 < 0) ? -1 : ((w8 >= 8) ? 2*(w8&7) + 0 : -1), \ 1105 (w8 < 0) ? -1 : ((w8 >= 8) ? 2*(w8&7) + 1 : -1), \ 1106 (w9 < 0) ? -1 : ((w9 >= 8) ? 2*(w9&7) + 0 : -1), \ 1107 (w9 < 0) ? -1 : ((w9 >= 8) ? 2*(w9&7) + 1 : -1), \ 1108 (wa < 0) ? -1 : ((wa >= 8) ? 2*(wa&7) + 0 : -1), \ 1109 (wa < 0) ? -1 : ((wa >= 8) ? 2*(wa&7) + 1 : -1), \ 1110 (wb < 0) ? -1 : ((wb >= 8) ? 2*(wb&7) + 0 : -1), \ 1111 (wb < 0) ? -1 : ((wb >= 8) ? 2*(wb&7) + 1 : -1), \ 1112 (wc < 0) ? -1 : ((wc >= 8) ? 2*(wc&7) + 0 : -1), \ 1113 (wc < 0) ? -1 : ((wc >= 8) ? 2*(wc&7) + 1 : -1), \ 1114 (wd < 0) ? -1 : ((wd >= 8) ? 2*(wd&7) + 0 : -1), \ 1115 (wd < 0) ? -1 : ((wd >= 8) ? 2*(wd&7) + 1 : -1), \ 1116 (we < 0) ? -1 : ((we >= 8) ? 2*(we&7) + 0 : -1), \ 1117 (we < 0) ? -1 : ((we >= 8) ? 2*(we&7) + 1 : -1), \ 1118 (wf < 0) ? -1 : ((wf >= 8) ? 2*(wf&7) + 0 : -1), \ 1119 (wf < 0) ? -1 : ((wf >= 8) ? 2*(wf&7) + 1 : -1)))); \ 1120 } while (0) 1121 #endif 1122 1123 /* case of 192 bits within 256 bits. Three 64-bit words 1124 * split into four 48-bit words. 1125 */ convertarith_modp::details::fast_type1126 static void convert(elt& dst, const super::elt& a) { 1127 /* index of 16-bit word in destination, fetched from 1128 * which index of 16-bit word in the gfp::elt. This is 1129 * given for the 256-bit registers 1130 * 1131 * 0 0 1132 * 1 1 1133 * 2 2 1134 * 3 <empty> 1135 * 4 3 1136 * 5 4 1137 * 6 5 1138 * 7 <empty> 1139 * 8 6 1140 * 9 7 1141 * 10 8 1142 * 11 <empty> 1143 * 12 9 1144 * 13 10 1145 * 14 11 1146 * 15 <empty> 1147 */ 1148 #ifdef HAVE_AVX2 1149 /* I'm really upset here. _mm256_shuffle_epi8, aka VPSHUFB, 1150 * reads only 4-byte immediates (and discards the rest). As a 1151 * consequnence, the following does not work: the indices 1152 * 12,13,14,15 read off bounds, while the 16,17, etc actually 1153 * do what they want, but based on the fact that they're 1154 * reduced mod 16 + implicitly considered wrt the high part 1155 * of the operand... 1156 dst.data[0] = _mm256_shuffle_epi8( 1157 _mm256_loadu_si256((__m256i*) a.x), 1158 _mm256_setr_epi8( 1159 0,1,2,3,4,5,-1,-1, 1160 6,7,8,9,10,11,-1,-1, 1161 12,13,14,15,16,17,-1,-1, 1162 18,19,20,21,22,23,-1,-1)); 1163 */ 1164 1165 #if 0 1166 __m256i in = _mm256_loadu_si256((__m256i*) a.x); 1167 dst.data[0] = 1168 _mm256_xor_si256( 1169 _mm256_shuffle_epi8( 1170 in, 1171 _mm256_setr_epi8( 1172 0,1,2,3,4,5,-1,-1, 1173 6,7,8,9,10,11,-1,-1, 1174 -1,-1,-1,-1,0,1,-1,-1, 1175 2,3,4,5,6,7,-1,-1)), 1176 _mm256_shuffle_epi8( 1177 /* 0x4E is 0b01001110 aka (1,0,3,2) */ 1178 _mm256_permute4x64_epi64 (in, _MM_SHUFFLE(1,0,3,2)), 1179 _mm256_setr_epi8( 1180 -1,-1,-1,-1,-1,-1,-1,-1, 1181 -1,-1,-1,-1,-1,-1,-1,-1, 1182 12,13,14,15,-1,-1,-1,-1, 1183 -1,-1,-1,-1,-1,-1,-1,-1))); 1184 #endif 1185 __m256i in = _mm256_loadu_si256((__m256i*) a.x); 1186 shuffle_16bit_words(dst.data, in, 1187 0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1); 1188 #else /* SSSE3 !! */ 1189 __m128i lo = _mm_loadu_si128((__m128i*) a.x); 1190 __m128i hi = _mm_loadu_si128((__m128i*) (a.x + 2)); 1191 shuffle_16bit_words(dst.data, lo, hi, 1192 0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1); 1193 /* note that 16bit-wide shuffles use an 8-bit immediate, 1194 * but do not offer the option to selectively insert 1195 * zeroes. So we're probably better off shuffling bytes. 1196 */ 1197 #endif 1198 } 1199 conv_backend_get_mainarith_modp::details::fast_type1200 static super::elt conv_backend_get_main(elt const& src) { 1201 /* This is needed because we knowingly write off bounds */ 1202 union station { 1203 super::elt e; 1204 #ifdef HAVE_AVX2 1205 __m256i v[1]; 1206 #else 1207 __m128i v[2]; 1208 #endif 1209 station() {}; 1210 } main; 1211 #ifdef HAVE_AVX2 1212 shuffle_16bit_words(main.v, src.data[0], 1213 0,1,2, 4,5,6, 8,9,10, 12,13,14, -1,-1,-1,-1); 1214 #else 1215 shuffle_16bit_words(main.v, src.data[0], src.data[1], 1216 0,1,2, 4,5,6, 8,9,10, 12,13,14, -1,-1,-1,-1); 1217 #endif 1218 return main.e; 1219 } conv_backend_get_carriesarith_modp::details::fast_type1220 static super::elt_ur conv_backend_get_carries(elt const& src) { 1221 union station { 1222 super::elt_ur e; 1223 #ifdef HAVE_AVX2 1224 __m256i v[1]; 1225 #else 1226 __m128i v[2]; 1227 #endif 1228 station() {}; 1229 } carries, ncarries; 1230 1231 /* It's slightly more complicated than it seems. The carry 1232 * words may be negative. So we must sign-extend them to the 1233 * full unreduced element size. 1234 */ 1235 #ifdef HAVE_AVX2 1236 shuffle_16bit_words(carries.v, src.data[0], 1237 -1,-1,-1,3, 1238 -1,-1,7,-1, 1239 -1,11,-1,-1, 1240 15,-1,-1,-1); 1241 __m256i zero = _mm256_setzero_si256(); 1242 shuffle_16bit_words(ncarries.v, 1243 _mm256_sub_epi16(zero, 1244 _mm256_cmpgt_epi16(zero, carries.v[0])), 1245 -1,-1,-1,-1, 1246 3,-1,-1,6, 1247 -1,-1,9,-1, 1248 -1,12,-1,-1); 1249 #else 1250 shuffle_16bit_words(carries.v, src.data[0], src.data[1], 1251 -1,-1,-1,3, 1252 -1,-1,7,-1, 1253 -1,11,-1,-1, 1254 15,-1,-1,-1); 1255 __m128i zero = _mm_setzero_si128(); 1256 shuffle_16bit_words(ncarries.v, 1257 _mm_sub_epi16(zero, _mm_cmpgt_epi16(zero, carries.v[0])), 1258 _mm_sub_epi16(zero, _mm_cmpgt_epi16(zero, carries.v[1])), 1259 -1,-1,-1,-1, 1260 3,-1,-1,6, 1261 -1,-1,9,-1, 1262 -1,12,-1,-1); 1263 #endif 1264 super::sub_ur(carries.e, ncarries.e); 1265 return carries.e; 1266 } 1267 1268 1269 1270 /* (add|sub)mul_ui go through convert, do naively and convert 1271 * back. Yes, it's slightly painful. Here we assume that src 1272 * has undergone little to no accumulated additions, so that 1273 * it can basically be converted lossless to a gfp::elt 1274 */ addmul_uiarith_modp::details::fast_type1275 static inline void addmul_ui(elt & dst, elt const & src, mp_limb_t x, super::elt const & p, super::preinv const & j) 1276 { 1277 super::elt zr; 1278 super::elt_ur z(dst); 1279 super::addmul_ui(z, (super::elt) src, x, p, j); 1280 super::reduce(zr, z, p, j); 1281 dst = zr; 1282 } submul_uiarith_modp::details::fast_type1283 static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, super::elt const & p, super::preinv const & j) 1284 { 1285 super::elt zr; 1286 super::elt_ur z(dst); 1287 super::submul_ui(z, (super::elt) src, x, p, j); 1288 super::reduce(zr, z, p, j); 1289 dst = zr; 1290 } 1291 1292 /* we have *TWO* reduction functions here. One which assigns to a 1293 * standard gfp::elt, and one which assigns to fast_type::elt */ reducearith_modp::details::fast_type1294 static void reduce(super::elt & r, elt const & a, super::elt const & p, super::preinv const & j) 1295 { 1296 super::elt_ur z(a); 1297 super::reduce(r, z, p, j); 1298 } reducearith_modp::details::fast_type1299 static void reduce(elt & r, elt const & a, super::elt const & p, super::preinv const & j) 1300 { 1301 super::elt zr; 1302 reduce(zr, a, p, j); 1303 r = zr; 1304 } 1305 }; 1306 #endif /* defined(HAVE_AVX2) || defined(HAVE_SSSE3) */ 1307 #endif 1308 #endif 1309 } 1310 1311 /* expose only what we have in our public interface */ 1312 using details::gfp; 1313 using details::fast_type; 1314 } 1315 1316 1317 1318 #endif 1319