1 #ifndef ARITH_MODP_HPP_
2 #define ARITH_MODP_HPP_
3 
4 #include <gmp.h>
5 
6 #include "gmp-hacks.h"
7 #include "gmp_aux.h"
8 #include "macros.h"
9 #include "memory.h"
10 
11 #if defined(HAVE_AVX2) || defined(HAVE_SSSE3)
12 #include <x86intrin.h>
13 #endif
14 
15 
16 #define  xxxDEBUG_INFINITE_LOOPS
17 
18 namespace arith_modp {
19 namespace details {
20     /* some of this is provided by <type_traits>, but that is post C++98
21      */
22     template<bool x> struct is_true {};
23     template<> struct is_true<true> { typedef int type; };
24     template<typename T> struct make_signed {};
25     template<> struct make_signed<unsigned long> { typedef long type; };
26     template<> struct make_signed<unsigned long long> { typedef long long type; };
27     typedef make_signed<mp_limb_t>::type signed_mp_limb_t;
28 
29     template<int n> struct mpn {
30         static const int alignment = sizeof(mp_limb_t);
31         typedef mpn<n> self;
32         mp_limb_t x[n];
33         mpn() { memset(x, 0, n * sizeof(mp_limb_t)); }
34         mpn(mpn const& a) { memcpy(x, a.x, n * sizeof(mp_limb_t)); }
35         mpn& operator=(mpn const& a) {
36                 memcpy(x, a.x, n * sizeof(mp_limb_t));
37                 return *this;
38         }
39         mpn(mpz_srcptr a) { MPN_SET_MPZ(x, n, a); }
40         self& operator=(mpz_srcptr a) { MPN_SET_MPZ(x, n, a); return *this; }
41         void zero() { memset(x, 0, n * sizeof(mp_limb_t)); }
42         operator mp_limb_t * () { return x; }
43         operator const mp_limb_t * () const { return x; }
44         static void zero(self * x, int N) {
45             // see bug 21663
46             memset(x->x, 0, n * N * sizeof(mp_limb_t));
47         }
48         static void copy(self * y, const self * x, int N) {
49             // see bug 21663
50             memcpy(y->x, x->x, n * N * sizeof(mp_limb_t));
51         }
52         bool operator==(self const& a) {
53             return memcmp(x, a.x, n * sizeof(mp_limb_t)) == 0;
54         }
55         bool is_zero() const {
56             for(int i = 0 ; i < n ; i++) if (x[i]) return false;
57             return true;
58         }
59 
60         /*
61         bool operator<(self const& a) {
62             return memcmp(x, a.x, n * sizeof(mp_limb_t)) < 0;
63         }
64         bool operator>(self const& a) {
65             return memcmp(x, a.x, n * sizeof(mp_limb_t)) > 0;
66         }
67         */
68     };
69 
70     template<int n_, int extra_, typename T>
71         struct gfp_base {
72             /* This is only example code. The version used is the one which is
73              * specialized for extra == 1.
74              */
75             static const int n = n_;
76             static const typename is_true<n_ >= extra_>::type extra = extra_;
77 
78             typedef mpn<n> elt;
79             struct elt_ur: public mpn<n + extra> {
80                 elt_ur& operator=(elt const& a) {
81                     mpn_copyi(mpn<n + extra>::x, a.x, n);
82                     memset(mpn<n + extra>::x + n, 0, extra * sizeof(mp_limb_t));
83                     return *this;
84                 }
85             };
86             struct preinv : public mpn<extra> { int shift; };
87 
88             static void propagate_carry(mp_limb_t * dst, mp_limb_t cy) {
89                 mpn_add_1(dst, dst, extra, cy);
90             }
91             static void propagate_borrow(mp_limb_t * dst, mp_limb_t cy) {
92                 mpn_sub_1(dst, dst, extra, cy);
93             }
94             static bool upperlimbs_are_zero(mp_limb_t * dst) {
95                 for(int i = extra ; i-- ; )
96                     if (dst[i]) return false;
97                 return true;
98             }
99 
100             static inline bool is_zero(elt const & x) { return x.is_zero(); }
101             static inline bool is_zero(elt_ur const & x) { return x.is_zero(); }
102 
103 
104             static inline void stream_store(elt * dst, elt const& src) { *dst = src; }
105             static inline void add(elt_ur & dst, elt const & src)
106             {
107                 mp_limb_t cy = mpn_add_n(dst, dst, src, n);
108                 T::propagate_carry(dst + n, cy);
109             }
110 
111             static inline void sub(elt_ur & dst, elt const & src)
112             {
113                 mp_limb_t cy = mpn_sub_n(dst, dst, src, n);
114                 T::propagate_borrow(dst + n, cy);
115             }
116 
117             static inline void add_ur(elt_ur & dst, elt_ur const & src)
118             {
119                 mpn_add_n(dst, dst, src, n + extra);
120             }
121 
122             static inline void sub_ur(elt_ur & dst, elt_ur const & src)
123             {
124                 mpn_sub_n(dst, dst, src, n + extra);
125             }
126 
127             static inline void addmul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&)
128             {
129                 mp_limb_t cy = mpn_addmul_1(dst, src, n, x);
130                 T::propagate_carry(dst + n, cy);
131             }
132             static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&)
133             {
134                 mp_limb_t cy = mpn_submul_1(dst, src, n, x);
135                 T::propagate_borrow(dst + n, cy);
136             }
137 
138             /* Preinverse for Barrett reduction. See also the code for reduction,
139              * which is further below.
140              *
141              * We want to apply this to reduce a mod p, with the following
142              * constraints.
143              *
144              *             2^(m-1) <  p  < 2^m
145              *        -2^(ell-1)*p <= a  < 2^(ell-1)*p
146              *
147              * Let I=floor(2^(m+ell)/p). Because of the bound on p, we have 2^ell
148              * < I < 2^(ell+1), so that 0<J=I-2^ell<2^ell (which actually fits
149              * within ell bits). The preinverse we compute is this J.
150              */
151 
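            /* For concreteness, a toy example with unrealistically small
             * sizes: take p = 11, so that m = 4, and pretend that ell = 4.
             * Then I = floor(2^(m+ell)/p) = floor(256/11) = 23, and indeed
             * 2^ell = 16 < I < 32 = 2^(ell+1). The preinverse we store is
             * J = I - 2^ell = 7, which fits within ell bits.
             */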
152             static void compute_preinv(preinv & j, elt const & p)
153             {
154                 mpz_t big;
155                 mpz_t pz;
156                 mpz_init_set_ui(big,1);
157                 mpz_init(pz);
158                 MPZ_SET_MPN(pz, p, n);
159                 size_t m = mpz_sizeinbase(pz, 2);
160                 ASSERT_ALWAYS(m <= (size_t) n * mp_bits_per_limb);
161                 size_t ell = extra * mp_bits_per_limb;
162                 mpz_mul_2exp(big, big, m+ell);
163                 mpz_fdiv_q(big, big, pz);
164                 ASSERT_ALWAYS(mpz_sizeinbase(big, 2) == (ell + 1));
165                 mpz_fdiv_r_2exp(big, big, ell);
166                 MPN_SET_MPZ(j, extra, big);
167                 j.shift = (mp_bits_per_limb - m) % mp_bits_per_limb;
168                 mpz_clear(big);
169                 mpz_clear(pz);
170             }
171 
172             /* Signed Barrett reduction (extended from Brent-Zimmermann 2010,
173              * theorem 2.4)
174              */
175 
176             /* input: a = a0 + a1 * 2^m, with         2^(m-1) <  p  < 2^m
177              *                                   -2^(ell-1)*p <= a  < 2^(ell-1)*p
178              *                                              0 <= a0 < 2^m
179              * which imply in particular:
180              *                                     -2^(ell-1) <= a1 < 2^(ell-1)
181              *
182              * Case a1 >= 0.
183              *
184              * Let q0 = floor(a1*I/2^ell) = floor(a1*J/2^ell) + a1.
185              * We have 0 <= q0 < 2^ell.
186              *
187              * Moreover: q0 <= a1*I/2^ell <= a1*2^m/p <= a/p, so that r0=a-p*q0>=0.
188              * use p*I >= 2^(m+ell)-p and 2^ell*q0 >= a1*I-2^ell
189              *
190              * compute 2^ell*p*q0 >= 2^(m+ell)*a1-a1*p-2^ell*p
191              *                    >= 2^ell*(a-a0)-p*(a1+2^ell)
192              *                    >  2^ell*a - 4*2^ell*p
193              *             a-p*q0 <  4p
194              * where the third line used a1 < 2^(ell-1) and a0 <= 2^m <= 2*p.
195              *
196              * Case a1 < 0.
197              *
198              * We let b1 = a1 + 2^ell, which is the unsigned limb used to
199              * represent a1.
200              *
201              * Let q0 = floor(a1*I/2^ell) = floor(b1*J/2^ell) + b1 - 2^ell - J.
202              *
203              * Since a1 < 0, we have q0 < 0. With a1 >= -2^(ell-1) and
204              * I<2^(ell+1), we obtain q0 > -2^ell. Therefore q0 is well
205              * represented by the machine word
206              *  q'0 = q0+2^ell = floor(b1*J/2^ell) + b1 - J
207              *
208              * We have p*q0 <= p*a1*I/2^ell < p*a1/2^ell*(2^(m+ell)/p-1)
209              *              <  a1*2^m - a1/2^ell * p
210              *              <  p*q+r-a0-a1/2^ell * p
211              *         q-q0 >  (a0-r)/p + a1/2^ell
212              *              >  -1.5   since a0>0, r<p, and a1>-2^(ell-1).
213              *              >= -1     since q and q0 are integers.
214              * So that q-(q0-1) >= 0.
215              *
216              * Note that because we have -2^ell < q0 < 0, then q0-1 is properly
217              * represented by the unsigned machine word 2^ell-1+q0.
218              *
219              * Also, we have p*q0 >= p*a1*I/2^ell-p
220              *                    >= a1*2^m-p
221              *                    >= a-a0-p
222              *         a-p*(q0-1) <= a0 + 2p < 4p
223              *
224              * To compute a-p*(q0-1), we actually compute
225              * a-p*(q0+2^ell-1)+2^ell*p, which is a submul_ui followed by one
226              * addition.
227              */
228 
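            /* Continuing the toy example above (p = 11, m = 4, ell = 4,
             * J = 7): reduce a = 75 = 11 + 4*16, i.e. a0 = 11 and a1 = 4.
             * Then q0 = floor(a1*J/2^ell) + a1 = floor(28/16) + 4 = 5,
             * and a - p*q0 = 75 - 55 = 20 < 4p. A single conditional
             * subtraction of p then yields 9 = 75 mod 11.
             */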
229             /* this reduces a in place, and copies the result to r */
230             static void reduce(elt & r, elt_ur & a, elt const & p, preinv const & j)
231             {
232                 mp_limb_t tmp[extra + 1];
233                 if (j.shift) {
234                     mpn_lshift(tmp, a + n - 1, extra + 1, j.shift);
235                 } else {
236                     mpn_copyi(tmp + 1, a + n, extra);
237                 }
238                 mp_limb_t a1I[2*extra];
239                 mpn_mul_n(a1I, tmp + 1, j, extra);
240                 mpn_add_n(a1I + extra, a1I + extra, tmp + 1, extra);
241                 mp_limb_t * q0 = a1I + extra;
242                 typename make_signed<mp_limb_t>::type sa1 = (tmp+1)[extra-1];
243                 if (sa1 < 0) {
244                     mpn_sub_n(q0, q0, j, extra);
245                     mpn_sub_1(q0, q0, extra, 1);
246                     mpn_add_n(a + extra, a + extra, p, n);
247                 }
248                 /* emulate a submul_n ; need to do mul first, then sub... */
249                 mp_limb_t scratch[n + extra];
250                 mpn_mul(scratch, p, n, q0, extra);
251                 mpn_sub_n(a, a, scratch, n + extra);
252 #if !defined(NDEBUG) && !defined(DEBUG_INFINITE_LOOPS)
253                 int spin=0;
254 #endif
255                 while (!upperlimbs_are_zero(a + n) || mpn_cmp(a, p, n) >= 0) {
256                     T::sub(a, p);
257                     /*
258                        {
259                        mp_limb_t cy = mpn_sub_n(a, a, p, n);
260                        propagate_borrow(a + n, cy);
261                        }
262                        */
263 #if !defined(NDEBUG) && !defined(DEBUG_INFINITE_LOOPS)
264                     spin++;
265                     ASSERT_ALWAYS(spin < 4);
266 #endif
267                 }
268                 mpn_copyi(r, a, n);
269             }
270         };
271 
272     template<int n_, int extra_, typename T>
273         struct gfp_middle: public gfp_base<n_, extra_, T> {};
274 
275     template<int n_, typename T>
276         class gfp_middle<n_, 1, T>: public gfp_base<n_, 1, T>
277         {
278             typedef gfp_base<n_, 1, T> super;
279             public:
280             static inline void propagate_carry(mp_limb_t * dst, mp_limb_t cy) {
281                 *dst += cy;
282             }
283             static inline void propagate_borrow(mp_limb_t * dst, mp_limb_t cy) {
284                 *dst -= cy;
285             }
286             static inline bool upperlimbs_are_zero(mp_limb_t * dst) {
287                 return !dst[0];
288             }
289 
290             using super::n;
291             /* We would prefer to write "using typename super::elt" below,
292              * but this is only ok with recent c++ */
293             typedef typename super::elt elt;
294             typedef typename super::elt_ur elt_ur;
295             typedef typename super::preinv preinv;
296 
297             /* this reduces a in place, and copies the result to r */
298             static void reduce(elt & r, elt_ur & a, elt const & p, preinv const & j) {
299                 mp_limb_t a1 = a[n] << j.shift;
300                 if (j.shift) {
301                     a1 |= a[n-1] >> (mp_bits_per_limb - j.shift);
302                 }
303                 signed_mp_limb_t sa1 = a1;
304                 mp_limb_t tmp[2];
305 #ifdef  umul_ppmm
306                 umul_ppmm(tmp[1], tmp[0], a1, j[0]);
307 #elif defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM)
308                 __asm__ ("mulq %3" : "=a"(tmp[0]), "=d" (tmp[1]) : "0" (a1), "rm" (j[0]));
309 #else
310                 mpn_mul_n(tmp, &a1, j, 1);
311 #endif
312                 mp_limb_t q0 = tmp[1] + a1;
313                 if (sa1 < 0) {
314                     /* see above for the specificities of the negative case */
315                     q0 -= j[0] + 1;
316                     mpn_add_n(a + 1, a + 1, p, n);
317                 }
318                 T::submul_ui(a, p, q0, p, j);
319                 /*
320                    {
321                    mp_limb_t cy = mpn_submul_1(a, p, n, q0);
322                    super::propagate_borrow(a + n, cy);
323                    }
324                    */
325 #if !defined(NDEBUG) && !defined(DEBUG_INFINITE_LOOPS)
326                 int spin=0;
327 #endif
328                 while (a[n] || mpn_cmp(a, p, n) >= 0) {
329                     T::sub(a, p);
330                     /*
331                        {
332                        mp_limb_t cy = mpn_sub_n(a, a, p, n);
333                        super::propagate_borrow(a + n, cy);
334                        }
335                        */
336 #if !defined(NDEBUG) && !defined(DEBUG_INFINITE_LOOPS)
337                     spin++;
338                     ASSERT_ALWAYS(spin < 4);
339 #endif
340                 }
341                 mpn_copyi(r, a, n);
342             }
343         };
344 
345     template<int n, int extra=1>
346         struct gfp : public gfp_middle<n, extra, gfp<n, extra> >
347     {
348     };
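
    /* A sketch of how these pieces are meant to be combined (illustrative
     * only; pz, az are placeholder mpz_t values and c a placeholder limb):
     *
     *     typedef gfp<2> F;              // p spans two limbs
     *     F::elt p(pz), a(az), r;        // elements, set from mpz values
     *     F::preinv j;
     *     F::compute_preinv(j, p);       // Barrett preinverse of p
     *     F::elt_ur acc;                 // unreduced accumulator
     *     acc = a;
     *     F::addmul_ui(acc, a, c, p, j); // acc += c*a, carries kept in acc
     *     F::reduce(r, acc, p, j);       // r = acc mod p (acc clobbered)
     */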
349 
350 
351     /* Now for some sizes, we see a clear interest in using auxiliary
352      * vector types. We call these "fast" types. The general compromise
353      * is that we accept types which may be a little wider, but generally
354      * allow for better performance. The specs go typically as follows.
355      *
356      * - conversion to the "fast" types must be done for both operands
357      *   (say, source vector as well as destination vector). We don't
358      *   intend to go with the same kind of arithmetic that what we do
359      *   with elt and elt_ur above, where a "mixed" add/sub function
360      *   exists.
361      *
362      * - "fast" types are amenable to vector instructions
363      *
364      * - up to some number of additions or subtractions may be performed
365      *   on the fast type before reduction.
366      *
367      * - type may be ambiguous (so comparison entails conversion).
368      *
369      * We have two natural choices:
370      *
371      *  - RNS representation
372      *  - carry-save (aka nails).
373      *
374      * The specializations below work with nails. The idea is as follows.
375      * For a p spanning three 64-bit words, we spread data into four
376      * 48-bit words in an avx register. Then we can accumulate up to 2^16
377      * of these at little cost.
378      */
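    /* Concretely, for the gfp<3, 1> specialization further down: an
     * element x = x0 + x1*2^64 + x2*2^128 (three 64-bit limbs, 192 bits)
     * is spread over four 64-bit lanes holding the 48-bit slices
     *     x[0..48), x[48..96), x[96..144), x[144..192),
     * so each lane keeps 16 spare high bits. Up to 2^16 such values can
     * therefore be accumulated lane-wise before any carry between lanes
     * has to be resolved.
     */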
379 
380     /* the default version is just making no difference, so that we'll
381      * use the simple elt / elt_ur mechanism */
382     template<typename T> struct fast_type : public T { };
383 
384 
385 #ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
386     /* Now some specializations */
387 
388     /* {{{ gfp<1,1> */
389     template<> struct gfp<1, 1> : public gfp_middle<1,1,gfp<1,1> > {
390         static inline void add(elt_ur & dst, elt const & src)
391         {
392             asm("# gfp<1, 1>::add\n"
393                 "addq %q2, %q0\n"
394                 "adcq $0x0, %q1\n"
395                 : "+r"(dst[0]), "+r"(dst[1])
396                 : "rm"(src[0])
397                );
398         }
399 
400         static inline void sub(elt_ur & dst, elt const & src)
401         {
402             asm("# gfp<1, 1>::sub\n"
403                 "subq %q2, %q0\n"
404                 "sbbq $0x0, %q1\n"
405                 : "+r"(dst[0]), "+r"(dst[1])
406                 : "rm"(src[0])
407                );
408         }
409 
410         static inline void addmul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&)
411         {
412             mp_limb_t foo, bar;
413             asm("# gfp<1, 1>::addmul_ui\n"
414                 "mulq   %[mult]\n"
415                 "addq   %%rax, %[z0]\n"
416                 "adcq   $0, %%rdx\n"
417                 "addq   %%rdx, %[z1]\n"
418             : "=a"(foo), "=&d"(bar), [z0]"+rm"(dst[0]), [z1]"+rm"(dst[1])
419             : "0"(src[0]), [mult]"r1m"(x)
420             );
421         }
422         static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&)
423         {
424             mp_limb_t foo, bar;
425             asm("# gfp<1, 1>::submul_ui\n"
426                 "mulq   %[mult]\n"
427                 "subq   %%rax, %[z0]\n"
428                 "adcq   $0, %%rdx\n"
429                 "subq   %%rdx, %[z1]\n"
430             : "=a"(foo), "=&d"(bar), [z0]"+rm"(dst[0]), [z1]"+rm"(dst[1])
431             : "0"(src[0]), [mult]"r1m"(x)
432             );
433         }
434     };
435     /* }}} */
436 
437     /* {{{ gfp<2,1> */
438     template<> struct gfp<2, 1> : public gfp_middle<2,1,gfp<2,1> > {
439         static inline void add(elt_ur & dst, elt const & src) {
440             asm("# gfp<2, 1>::add\n"
441                 "addq %q[s0], %q[d0]\n"
442                 "adcq %q[s1], %q[d1]\n"
443                 "adcq $0x0, %q[d2]\n"
444                 : [d0]"+rm"(dst[0]), [d1]"+rm"(dst[1]), [d2]"+rm"(dst[2])
445                 : [s0]"r"(src[0]), [s1]"r"(src[1])
446                );
447         }
448 
449         static inline void sub(elt_ur & dst, elt const & src) {
450             asm("# gfp<2, 1>::sub\n"
451                 "subq %q[s0], %q[d0]\n"
452                 "sbbq %q[s1], %q[d1]\n"
453                 "sbbq $0x0, %q[d2]\n"
454                 : [d0]"+rm"(dst[0]), [d1]"+rm"(dst[1]), [d2]"+rm"(dst[2])
455                 : [s0]"r"(src[0]), [s1]"r"(src[1])
456                );
457         }
458 
459         static inline void addmul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&)
460         {
461             mp_limb_t foo, bar;
462             asm("# gfp<2, 1>::addmul_ui\n"
463                 "mulq   %[mult]\n"
464                 "addq   %%rax, %[z0]\n"
465                 "adcq   $0, %%rdx\n"
466                 "movq   %%rdx, %%rcx\n"
467                 "movq   %[s1], %%rax\n"
468                 "mulq   %[mult]\n"
469                 "addq   %%rcx, %%rax\n"
470                 "adcq   $0, %%rdx\n"
471                 "addq   %%rax, %[z1]\n"
472                 "adcq   $0, %%rdx\n"
473                 "addq   %%rdx, %[z2]\n"
474             : "=&a"(foo), "=&d"(bar),
475             [z0]"+rm"(dst[0]),
476             [z1]"+rm"(dst[1]),
477             [z2]"+rm"(dst[2])
478             : [s0]"0"(src[0]), [s1]"rm"(src[1]), [mult]"rm"(x)
479             : "rcx"
480             );
481         }
482 
483         static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) {
484             mp_limb_t foo, bar;
485             asm("# gfp<2, 1>::submul_ui\n"
486                 "mulq   %[mult]\n"
487                 "subq   %%rax, %[z0]\n"
488                 "adcq   $0, %%rdx\n"
489                 "movq   %%rdx, %%rcx\n"
490                 "movq   %[s1], %%rax\n"
491                 "mulq   %[mult]\n"
492                 "addq   %%rcx, %%rax\n"
493                 "adcq   $0, %%rdx\n"
494                 "subq   %%rax, %[z1]\n"
495                 "adcq   $0, %%rdx\n"
496                 "subq   %%rdx, %[z2]\n"
497             : "=&a"(foo), "=&d"(bar),
498             [z0]"+rm"(dst[0]),
499             [z1]"+rm"(dst[1]),
500             [z2]"+rm"(dst[2])
501             : [s0]"0"(src[0]), [s1]"rm"(src[1]), [mult]"rm"(x)
502             : "rcx"
503             );
504         }
505     };
506     /* }}} */
507 
508     /* {{{ macros for assembly for further specializations */
509 #define FEED_IN_WITH_S0_IN_RAX(in1, r0, r1) \
510         /* status: s0 in rax */                                \
511         "mulq   %[mult]\n"             /* rdx:rax = s0 * v */  \
512         "movq   %%rax, %%" #r0 "\n"    /* lo contrib to d1 */  \
513         "movq   " in1 ", %%rax\n"        /* load s1          */  \
514         "movq   %%rdx, %%" #r1 "\n"    /* hi contrib to d1 */
515 #define FEED_IN(in0, in1, r0, r1) \
516         "movq   " in0 ", %%rax\n"       \
517         FEED_IN_WITH_S0_IN_RAX(in1, r0, r1)
518 #define INNER_MUL(op, out, in, r0, r1, r2)   \
519         /* status: r1:r0 to be added to d_{i+1}:d_i, rax = s_{i+1} */     \
520         "xorq   %%" #r2 ", %%" #r2 "\n"                                   \
521         "mulq   %[mult]\n"                   /* rdx:rax = s_{i+1} * v */  \
522         "" #op "q %%" #r0 ", " out "\n" /* store d_i             */   \
523         "adcq   %%rax, %%" #r1 "\n"         /* lo contrib to d_{i+1} */   \
524         "adcq   %%rdx, %%" #r2 "\n"         /* hi contrib to d_{i+2} */   \
525         "movq   " in ", %%rax\n"       /* load s_{i+2}          */
526 #define FINISH(op, opc, out0, out1, out2, r0, r1) \
527         /* r1:r0 to be added to d_{i+1}:d_i ; rax = s_{i+2} */	\
528         "mulq   %[mult]\n"                   			\
529         "" #op "q   %%" #r0 ", " out0 "\n"  			\
530         "adcq   %%rax, %%" #r1 "\n"				\
531         "adcq   $0x0, %%rdx\n"					\
532         "" #op "q   %%" #r1 ", " out1 "\n" 			\
533         "" #opc "q   %%rdx, " out2 "\n"
534     /* }}} */
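    /* What the ADDSUBMUL_CODE<n> blocks below compute, expressed with
     * plain GMP calls (reference sketch only; the asm versions schedule
     * the per-limb multiplications by hand):
     *
     *     mp_limb_t cy = mpn_addmul_1(dst, src, n, x);  // or mpn_submul_1
     *     dst[n] += cy;                                 // resp. dst[n] -= cy
     *
     * i.e. exactly what the generic gfp_base::addmul_ui/submul_ui do, with
     * the single carry limb folded in as in gfp_middle<n, 1>.
     */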
535     /* {{{ this macro actually exposes the specialization in itself */
536 #define EXPOSE_SPECIALIZATION(n)					\
537     template<> struct gfp<n, 1> : public gfp_middle<n,1,gfp<n,1> > {	\
538         using gfp_middle<n,1,gfp<n,1> >::add_ur;   \
539         using gfp_middle<n,1,gfp<n,1> >::sub_ur;   \
540         static inline void add(elt_ur & dst, elt const & src) {		\
541             asm("# gfp<" #n ", 1>::add\n"				\
542                     ADDSUB_CODE ## n(add, adc)				\
543                );							\
544         }								\
545         static inline void sub(elt_ur & dst, elt const & src) {		\
546             asm("# gfp<" #n ", 1>::sub\n"					\
547                     ADDSUB_CODE ## n(sub, sbb)				\
548                );							\
549         }								\
550         static inline void addmul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) {\
551             mp_limb_t foo MAYBE_UNUSED;					\
552             asm ("# gfp<" #n ", 1>::addmul_ui\n"				\
553                     ADDSUBMUL_CODE ## n(add, adc)			\
554             );								\
555         }								\
556         static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, elt const&, preinv const&) {\
557             mp_limb_t foo MAYBE_UNUSED;					\
558             asm("# gfp<" #n ", 1>::submul_ui\n"				\
559                     ADDSUBMUL_CODE ## n(sub, sbb)			\
560             );								\
561         }								\
562     }
563     /* }}} */
564 
565     /* {{{ code for gfp<3, 1> */
566 #define ADDSUBMUL_CODE3(op, opc)					\
567                 FEED_IN_WITH_S0_IN_RAX("%[s1]", r8, r9)			\
568                 INNER_MUL(op, "%[z0]", "%[s2]", r8, r9, r10)		\
569                 FINISH(op, opc, "%[z1]", "%[z2]", "%[z3]", r9, r10)	\
570                 : "=&a"(foo),                                           \
571                     [z0]"+rm"(dst[0]),				\
572                     [z1]"+rm"(dst[1]),				\
573                     [z2]"+rm"(dst[2]),				\
574                     [z3]"+rm"(dst[3])					\
575                 :							\
576                     [s0]"0"(src[0]),					\
577                     [s1]"rm"(src[1]),					\
578                     [s2]"rm"(src[2]),					\
579                     [mult]"rm"(x)					\
580                 : "r8", "r9", "r10", "rdx"
581 
582 #define ADDSUB_CODE3(op, opc)   \
583         "" #op  "q %q[s0], %q[d0]\n"					\
584         "" #opc "q %q[s1], %q[d1]\n"					\
585         "" #opc "q %q[s2], %q[d2]\n"					\
586         "" #opc "q $0x0, %q[d3]\n"					\
587                 :							\
588                 [d0]"+rm"(dst[0]),					\
589                 [d1]"+rm"(dst[1]),					\
590                 [d2]"+rm"(dst[2]),					\
591                 [d3]"+rm"(dst[3])					\
592                 :							\
593                 [s0]"r"(src[0]),					\
594                 [s1]"r"(src[1]),					\
595                 [s2]"r"(src[2])
596 
597     EXPOSE_SPECIALIZATION(3);
598     /* }}} */
599 
600     /* {{{ code for gfp<4, 1> */
601     /*
602 #define ADDSUBMUL_CODE4(op, opc)					\
603                 FEED_IN_WITH_S0_IN_RAX("%[s1]", r8, r9)			\
604                 INNER_MUL(op, "%[z0]", "%[s2]", r8, r9, r10)		\
605                 INNER_MUL(op, "%[z1]", "%[s3]", r9, r10, r11)		\
606                 FINISH(op, opc, "%[z2]", "%[z3]", "%[z4]", r10, r11)	\
607                 : "=&a"(foo),                                           \
608                     [z0]"+rm"(dst[0]),				\
609                     [z1]"+rm"(dst[1]),				\
610                     [z2]"+rm"(dst[2]),				\
611                     [z3]"+rm"(dst[3]),				\
612                     [z4]"+rm"(dst[4])					\
613                 :							\
614                     [s0]"0"(src[0]),					\
615                     [s1]"rm"(src[1]),					\
616                     [s2]"rm"(src[2]),					\
617                     [s3]"rm"(src[3]),					\
618                     [mult]"rm"(x)					\
619                 : "r8", "r9", "r10", "r11", "rdx"
620 
621 #define xADDSUB_CODE4(op, opc)   \
622 "" #op  "q %q[s0], (%[z])\n"					\
623 "" #opc "q %q[s1], 0x8(%[z])\n"					\
624 "" #opc "q %q[s2], 0x10(%[z])\n"				\
625 "" #opc "q %q[s3], 0x18(%[z])\n"				\
626 "" #opc "q $0x0, 0x20(%[z])\n"					\
627         :							\
628         :							\
629             [z]"r"(&dst[0]),				        \
630             [s0]"r"(src[0]),					\
631             [s1]"r"(src[1]),					\
632             [s2]"r"(src[2]),					\
633             [s3]"r"(src[3])                                   \
634         : "memory"
635 
636 
637                 */
638 #define ADDSUB_CODE4(op, opc)   \
639         "" #op  "q %q[s0], %q[d0]\n"					\
640         "" #opc "q %q[s1], %q[d1]\n"					\
641         "" #opc "q %q[s2], %q[d2]\n"					\
642         "" #opc "q %q[s3], %q[d3]\n"					\
643         "" #opc "q $0x0, %q[d4]\n"					\
644                 :							\
645                 [d0]"+rm"(dst[0]),					\
646                 [d1]"+rm"(dst[1]),					\
647                 [d2]"+rm"(dst[2]),					\
648                 [d3]"+rm"(dst[3]),					\
649                 [d4]"+rm"(dst[4])					\
650                 :							\
651                 [s0]"r"(src[0]),					\
652                 [s1]"r"(src[1]),					\
653                 [s2]"r"(src[2]),					\
654                 [s3]"r"(src[3])
655 
656 
657 #define ADDSUBMUL_CODE4(op, opc)					\
658                 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9)		\
659                 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10)	\
660                 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11)	\
661                 FINISH(op, opc,                                         \
662                         "0x10(%[z])", "0x18(%[z])", "0x20(%[z])",       \
663                         r10, r11)                               	\
664                 :							\
665                 :							\
666                     [z]"D"(&dst[0]),				        \
667                     [s]"S"(&src[0]),					\
668                     [mult]"rm"(x)					\
669                 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory"
670 
671     EXPOSE_SPECIALIZATION(4);
672     /* }}} */
673 
674     /* {{{ code for gfp<5, 1> */
675 
676 #define ADDSUBMUL_CODE5(op, opc)					\
677                 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9)		\
678                 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10)	\
679                 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11)	\
680                 INNER_MUL(op, "0x10(%[z])", "0x20(%[s])", r10, r11, r8)	\
681                 FINISH(op, opc,                                         \
682                         "0x18(%[z])", "0x20(%[z])", "0x28(%[z])",       \
683                         r11, r8)                               	\
684                 :							\
685                 :							\
686                     [z]"D"(&dst[0]),				        \
687                     [s]"S"(&src[0]),					\
688                     [mult]"rm"(x)					\
689                 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory"
690 
691 #define ADDSUB_CODE5(op, opc)   \
692         "" #op  "q %q[s0], (%[z])\n"					\
693         "" #opc "q %q[s1], 0x8(%[z])\n"					\
694         "" #opc "q %q[s2], 0x10(%[z])\n"				\
695         "" #opc "q %q[s3], 0x18(%[z])\n"				\
696         "" #opc "q %q[s4], 0x20(%[z])\n"				\
697         "" #opc "q $0x0, 0x28(%[z])\n"					\
698                 :							\
699                 :							\
700                     [z]"r"(&dst[0]),				        \
701                     [s0]"r"(src[0]),					\
702                     [s1]"r"(src[1]),					\
703                     [s2]"r"(src[2]),					\
704                     [s3]"r"(src[3]),					\
705                     [s4]"r"(src[4])                                   \
706                 : "memory"
707 
708     EXPOSE_SPECIALIZATION(5);
709     /* }}} */
710 
711     /* {{{ code for gfp<6, 1> */
712 
713 #define ADDSUBMUL_CODE6(op, opc)					\
714                 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9)		\
715                 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10)	\
716                 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11)	\
717                 INNER_MUL(op, "0x10(%[z])", "0x20(%[s])", r10, r11, r8)	\
718                 INNER_MUL(op, "0x18(%[z])", "0x28(%[s])", r11, r8, r9)	\
719                 FINISH(op, opc,                                         \
720                         "0x20(%[z])", "0x28(%[z])", "0x30(%[z])",       \
721                         r8, r9)                               	\
722                 :							\
723                 :							\
724                     [z]"D"(&dst[0]),				        \
725                     [s]"S"(&src[0]),					\
726                     [mult]"rm"(x)					\
727                 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory"
728 
729 #define ADDSUB_CODE6(op, opc)   \
730         "" #op  "q %q[s0], (%[z])\n"					\
731         "" #opc "q %q[s1], 0x8(%[z])\n"					\
732         "" #opc "q %q[s2], 0x10(%[z])\n"				\
733         "" #opc "q %q[s3], 0x18(%[z])\n"				\
734         "" #opc "q %q[s4], 0x20(%[z])\n"				\
735         "" #opc "q %q[s5], 0x28(%[z])\n"				\
736         "" #opc "q $0x0, 0x30(%[z])\n"					\
737                 :							\
738                 :							\
739                     [z]"r"(&dst[0]),				        \
740                     [s0]"r"(src[0]),					\
741                     [s1]"r"(src[1]),					\
742                     [s2]"r"(src[2]),					\
743                     [s3]"r"(src[3]),					\
744                     [s4]"r"(src[4]),                                  \
745                     [s5]"r"(src[5])                                   \
746                 : "memory"
747 
748     EXPOSE_SPECIALIZATION(6);
749     /* }}} */
750 
751     /* {{{ code for gfp<7, 1> */
752 #define ADDSUBMUL_CODE7(op, opc)					\
753                 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9)		\
754                 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10)	\
755                 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11)	\
756                 INNER_MUL(op, "0x10(%[z])", "0x20(%[s])", r10, r11, r8)	\
757                 INNER_MUL(op, "0x18(%[z])", "0x28(%[s])", r11, r8, r9)	\
758                 INNER_MUL(op, "0x20(%[z])", "0x30(%[s])", r8, r9, r10)	\
759                 FINISH(op, opc,                                         \
760                         "0x28(%[z])", "0x30(%[z])", "0x38(%[z])",       \
761                         r9, r10)                               	\
762                 :							\
763                 :							\
764                     [z]"D"(&dst[0]),				        \
765                     [s]"S"(&src[0]),					\
766                     [mult]"rm"(x)					\
767                 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory"
768 
769 #define ADDSUB_CODE7(op, opc)   \
770         "" #op  "q %q[s0], (%[z])\n"					\
771         "" #opc "q %q[s1], 0x8(%[z])\n"					\
772         "" #opc "q %q[s2], 0x10(%[z])\n"				\
773         "" #opc "q %q[s3], 0x18(%[z])\n"				\
774         "" #opc "q %q[s4], 0x20(%[z])\n"				\
775         "" #opc "q %q[s5], 0x28(%[z])\n"				\
776         "" #opc "q %q[s6], 0x30(%[z])\n"				\
777         "" #opc "q $0x0, 0x38(%[z])\n"					\
778                 :							\
779                 :							\
780                     [z]"r"(&dst[0]),				        \
781                     [s0]"r"(src[0]),					\
782                     [s1]"r"(src[1]),					\
783                     [s2]"r"(src[2]),					\
784                     [s3]"r"(src[3]),					\
785                     [s4]"r"(src[4]),                                  \
786                     [s5]"r"(src[5]),                                  \
787                     [s6]"r"(src[6])                                   \
788                 : "memory"
789 
790     EXPOSE_SPECIALIZATION(7);
791     /* }}} */
792 
793     /* {{{ code for gfp<8, 1> */
794 #define ADDSUBMUL_CODE8(op, opc)					\
795                 FEED_IN("0x0(%[s])", "0x8(%[s])", r8, r9)		\
796                 INNER_MUL(op, "0x0(%[z])", "0x10(%[s])", r8, r9, r10)	\
797                 INNER_MUL(op, "0x8(%[z])", "0x18(%[s])", r9, r10, r11)	\
798                 INNER_MUL(op, "0x10(%[z])", "0x20(%[s])", r10, r11, r8)	\
799                 INNER_MUL(op, "0x18(%[z])", "0x28(%[s])", r11, r8, r9)	\
800                 INNER_MUL(op, "0x20(%[z])", "0x30(%[s])", r8, r9, r10)	\
801                 INNER_MUL(op, "0x28(%[z])", "0x38(%[s])", r9, r10, r11)	\
802                 FINISH(op, opc,                                         \
803                         "0x30(%[z])", "0x38(%[z])", "0x40(%[z])",       \
804                         r10, r11)                               	\
805                 :							\
806                 :							\
807                     [z]"D"(&dst[0]),				        \
808                     [s]"S"(&src[0]),					\
809                     [mult]"rm"(x)					\
810                 : "r8", "r9", "r10", "r11", "rax", "rdx", "memory"
811 
812 #define ADDSUB_CODE8(op, opc)   \
813         "" #op  "q %q[s0], (%[z])\n"					\
814         "" #opc "q %q[s1], 0x8(%[z])\n"					\
815         "" #opc "q %q[s2], 0x10(%[z])\n"				\
816         "" #opc "q %q[s3], 0x18(%[z])\n"				\
817         "" #opc "q %q[s4], 0x20(%[z])\n"				\
818         "" #opc "q %q[s5], 0x28(%[z])\n"				\
819         "" #opc "q %q[s6], 0x30(%[z])\n"				\
820         "" #opc "q %q[s7], 0x38(%[z])\n"				\
821         "" #opc "q $0x0, 0x40(%[z])\n"					\
822                 :							\
823                 :							\
824                     [z]"r"(&dst[0]),				        \
825                     [s0]"r"(src[0]),					\
826                     [s1]"r"(src[1]),					\
827                     [s2]"r"(src[2]),					\
828                     [s3]"r"(src[3]),					\
829                     [s4]"r"(src[4]),                                  \
830                     [s5]"r"(src[5]),                                  \
831                     [s6]"r"(src[6]),                                   \
832                     [s7]"r"(src[7])                                   \
833                 : "memory"
834 
835     EXPOSE_SPECIALIZATION(8);
836     /* }}} */
837 
838     /* further specializations only seem to bring very marginal
839      * improvements. */
840 
841     /* AVX/SSE code is a choice which is mostly unrelated to the C++
842      * dialect we use, except for a fine point below. We use a union to
843      * protect an off-bounds write, and pre-C++11 does not really like
844      * unions with constructors for members. Given that we have little
845      * use for this code at the moment anyway, since it is suboptimal,
846      * let's protect it with HAVE_CXX11.
847      */
848 #ifdef HAVE_CXX11
849 #if defined(HAVE_AVX2) || defined(HAVE_SSSE3)
850     template<> struct fast_type<gfp<3, 1> > {
851         typedef gfp<3, 1> super;
852         struct elt;
853         typedef elt elt_ur;
854         struct elt {
855 #ifdef  HAVE_AVX2
856         static const int alignment = 32;
857 #else
858         static const int alignment = 16;
859 #endif
860             typedef elt self;
861 #ifdef  HAVE_AVX2
862             __m256i data[1];
863 #else
864             __m128i data[2];
865 #endif
866             elt() { zero(); }
867             elt(elt const& a) = default;
868             elt& operator=(elt const& a) = default;
869 
870             /* we do not construct (nor assign) from mpz, because we're not
871              * positional */
872             void zero() {
873 #ifdef  HAVE_AVX2
874                 data[0] = _mm256_setzero_si256();
875 #else
876                 data[0] = _mm_setzero_si128();
877                 data[1] = _mm_setzero_si128();
878 #endif
879             }
880             static void zero(elt * x, int N) {
881                 // see bug 21663
882                 memset(x->data, 0, N * sizeof(data));
883             }
884             static void copy(elt * y, const elt * x, int N) {
885                 // see bug 21663
886                 memcpy(y->data, x->data, N * sizeof(data));
887             }
888             bool operator==(elt const& a) {
889                 return memcmp(data, a.data, sizeof(data)) == 0;
890             }
891             elt(super::elt const& a) {
892                 convert(*this, a);
893             }
894 
895             operator super::elt_ur() const {
896                 super::elt_ur carries(conv_backend_get_carries(*this));
897                 super::add(carries, conv_backend_get_main(*this));
898                 return carries;
899             }
900 
901             /* same, but we assume carry is zero */
902             operator super::elt() const {
903                 ASSERT(conv_backend_get_carries(*this).is_zero());
904                 return conv_backend_get_main(*this);
905             }
906         };
907 
908         static inline void stream_store(elt * dst, elt const& src) {
909             /* Do we want to stream that or not ? In fact it's slower
910              * when streaming... */
911 #if 0
912 #ifdef  HAVE_AVX2
913             _mm256_stream_si256(dst->data+0, src.data[0]);
914 #else
915             _mm_stream_si128(dst->data+0, src.data[0]);
916             _mm_stream_si128(dst->data+1, src.data[1]);
917 #endif
918 #else
919 #ifdef  HAVE_AVX2
920             _mm256_storeu_si256(dst->data+0, src.data[0]);
921 #else
922             _mm_storeu_si128(dst->data+0, src.data[0]);
923             _mm_storeu_si128(dst->data+1, src.data[1]);
924 #endif
925 #endif
926         }
927         static inline void add(elt & dst, elt const & src)
928         {
929 #ifdef  HAVE_AVX2
930             dst.data[0] = _mm256_add_epi64 (dst.data[0], src.data[0]);
931 #else
932             dst.data[0] = _mm_add_epi64 (dst.data[0], src.data[0]);
933             dst.data[1] = _mm_add_epi64 (dst.data[1], src.data[1]);
934 #endif
935         }
936 
937         static inline void sub(elt & dst, elt const & src)
938         {
939 #ifdef  HAVE_AVX2
940             dst.data[0] = _mm256_sub_epi64 (dst.data[0], src.data[0]);
941 #else
942             dst.data[0] = _mm_sub_epi64 (dst.data[0], src.data[0]);
943             dst.data[1] = _mm_sub_epi64 (dst.data[1], src.data[1]);
944 #endif
945         }
946 
947         static inline void sub_ur(elt_ur & dst, elt_ur const & src)
948         {
949 #ifdef  HAVE_AVX2
950             dst.data[0] = _mm256_sub_epi64 (dst.data[0], src.data[0]);
951 #else
952             dst.data[0] = _mm_sub_epi64 (dst.data[0], src.data[0]);
953             dst.data[1] = _mm_sub_epi64 (dst.data[1], src.data[1]);
954 #endif
955         }
956 
957         /* conversions are done as a combination of blend & shuffle */
958 
959 #ifdef  HAVE_AVX2
960         /* We grok only values for w_i which are integer immediates
961          * within {-1} \cup {0..15}
962          */
963 #define shuffle_16bit_words(out, in,        		        	\
964             w0, w1, w2, w3,						\
965             w4, w5, w6, w7,						\
966             w8, w9, wa, wb,						\
967             wc, wd, we, wf)						\
968     do {                                                                \
969         *out = _mm256_xor_si256(				        \
970             _mm256_shuffle_epi8(					\
971             in,								\
972             _mm256_setr_epi8( 						\
973                 (w0 < 0) ? -1 : ((w0 < 8)  ? 2*(w0&7) + 0 : -1),	\
974                 (w0 < 0) ? -1 : ((w0 < 8)  ? 2*(w0&7) + 1 : -1),	\
975                 (w1 < 0) ? -1 : ((w1 < 8)  ? 2*(w1&7) + 0 : -1),	\
976                 (w1 < 0) ? -1 : ((w1 < 8)  ? 2*(w1&7) + 1 : -1),	\
977                 (w2 < 0) ? -1 : ((w2 < 8)  ? 2*(w2&7) + 0 : -1),	\
978                 (w2 < 0) ? -1 : ((w2 < 8)  ? 2*(w2&7) + 1 : -1),	\
979                 (w3 < 0) ? -1 : ((w3 < 8)  ? 2*(w3&7) + 0 : -1),	\
980                 (w3 < 0) ? -1 : ((w3 < 8)  ? 2*(w3&7) + 1 : -1),	\
981                 (w4 < 0) ? -1 : ((w4 < 8)  ? 2*(w4&7) + 0 : -1),	\
982                 (w4 < 0) ? -1 : ((w4 < 8)  ? 2*(w4&7) + 1 : -1),	\
983                 (w5 < 0) ? -1 : ((w5 < 8)  ? 2*(w5&7) + 0 : -1),	\
984                 (w5 < 0) ? -1 : ((w5 < 8)  ? 2*(w5&7) + 1 : -1),	\
985                 (w6 < 0) ? -1 : ((w6 < 8)  ? 2*(w6&7) + 0 : -1),	\
986                 (w6 < 0) ? -1 : ((w6 < 8)  ? 2*(w6&7) + 1 : -1),	\
987                 (w7 < 0) ? -1 : ((w7 < 8)  ? 2*(w7&7) + 0 : -1),	\
988                 (w7 < 0) ? -1 : ((w7 < 8)  ? 2*(w7&7) + 1 : -1),	\
989                 (w8 < 0) ? -1 : ((w8 >= 8) ? 2*(w8&7) + 0 : -1),	\
990                 (w8 < 0) ? -1 : ((w8 >= 8) ? 2*(w8&7) + 1 : -1),	\
991                 (w9 < 0) ? -1 : ((w9 >= 8) ? 2*(w9&7) + 0 : -1),	\
992                 (w9 < 0) ? -1 : ((w9 >= 8) ? 2*(w9&7) + 1 : -1),	\
993                 (wa < 0) ? -1 : ((wa >= 8) ? 2*(wa&7) + 0 : -1),	\
994                 (wa < 0) ? -1 : ((wa >= 8) ? 2*(wa&7) + 1 : -1),	\
995                 (wb < 0) ? -1 : ((wb >= 8) ? 2*(wb&7) + 0 : -1),	\
996                 (wb < 0) ? -1 : ((wb >= 8) ? 2*(wb&7) + 1 : -1),	\
997                 (wc < 0) ? -1 : ((wc >= 8) ? 2*(wc&7) + 0 : -1),	\
998                 (wc < 0) ? -1 : ((wc >= 8) ? 2*(wc&7) + 1 : -1),	\
999                 (wd < 0) ? -1 : ((wd >= 8) ? 2*(wd&7) + 0 : -1),	\
1000                 (wd < 0) ? -1 : ((wd >= 8) ? 2*(wd&7) + 1 : -1),	\
1001                 (we < 0) ? -1 : ((we >= 8) ? 2*(we&7) + 0 : -1),	\
1002                 (we < 0) ? -1 : ((we >= 8) ? 2*(we&7) + 1 : -1),	\
1003                 (wf < 0) ? -1 : ((wf >= 8) ? 2*(wf&7) + 0 : -1),	\
1004                 (wf < 0) ? -1 : ((wf >= 8) ? 2*(wf&7) + 1 : -1))),	\
1005             _mm256_shuffle_epi8(					\
1006                 /* 0x4E is 0b01001110 aka (1,0,3,2) */			\
1007                 _mm256_permute4x64_epi64 (in, _MM_SHUFFLE(1,0,3,2)), 	\
1008             _mm256_setr_epi8( 						\
1009                 (w0 < 0) ? -1 : ((w0 >= 8) ? 2*(w0&7) + 0 : -1),	\
1010                 (w0 < 0) ? -1 : ((w0 >= 8) ? 2*(w0&7) + 1 : -1),	\
1011                 (w1 < 0) ? -1 : ((w1 >= 8) ? 2*(w1&7) + 0 : -1),	\
1012                 (w1 < 0) ? -1 : ((w1 >= 8) ? 2*(w1&7) + 1 : -1),	\
1013                 (w2 < 0) ? -1 : ((w2 >= 8) ? 2*(w2&7) + 0 : -1),	\
1014                 (w2 < 0) ? -1 : ((w2 >= 8) ? 2*(w2&7) + 1 : -1),	\
1015                 (w3 < 0) ? -1 : ((w3 >= 8) ? 2*(w3&7) + 0 : -1),	\
1016                 (w3 < 0) ? -1 : ((w3 >= 8) ? 2*(w3&7) + 1 : -1),	\
1017                 (w4 < 0) ? -1 : ((w4 >= 8) ? 2*(w4&7) + 0 : -1),	\
1018                 (w4 < 0) ? -1 : ((w4 >= 8) ? 2*(w4&7) + 1 : -1),	\
1019                 (w5 < 0) ? -1 : ((w5 >= 8) ? 2*(w5&7) + 0 : -1),	\
1020                 (w5 < 0) ? -1 : ((w5 >= 8) ? 2*(w5&7) + 1 : -1),	\
1021                 (w6 < 0) ? -1 : ((w6 >= 8) ? 2*(w6&7) + 0 : -1),	\
1022                 (w6 < 0) ? -1 : ((w6 >= 8) ? 2*(w6&7) + 1 : -1),	\
1023                 (w7 < 0) ? -1 : ((w7 >= 8) ? 2*(w7&7) + 0 : -1),	\
1024                 (w7 < 0) ? -1 : ((w7 >= 8) ? 2*(w7&7) + 1 : -1),	\
1025                 (w8 < 0) ? -1 : ((w8 < 8)  ? 2*(w8&7) + 0 : -1),	\
1026                 (w8 < 0) ? -1 : ((w8 < 8)  ? 2*(w8&7) + 1 : -1),	\
1027                 (w9 < 0) ? -1 : ((w9 < 8)  ? 2*(w9&7) + 0 : -1),	\
1028                 (w9 < 0) ? -1 : ((w9 < 8)  ? 2*(w9&7) + 1 : -1),	\
1029                 (wa < 0) ? -1 : ((wa < 8)  ? 2*(wa&7) + 0 : -1),	\
1030                 (wa < 0) ? -1 : ((wa < 8)  ? 2*(wa&7) + 1 : -1),	\
1031                 (wb < 0) ? -1 : ((wb < 8)  ? 2*(wb&7) + 0 : -1),	\
1032                 (wb < 0) ? -1 : ((wb < 8)  ? 2*(wb&7) + 1 : -1),	\
1033                 (wc < 0) ? -1 : ((wc < 8)  ? 2*(wc&7) + 0 : -1),	\
1034                 (wc < 0) ? -1 : ((wc < 8)  ? 2*(wc&7) + 1 : -1),	\
1035                 (wd < 0) ? -1 : ((wd < 8)  ? 2*(wd&7) + 0 : -1),	\
1036                 (wd < 0) ? -1 : ((wd < 8)  ? 2*(wd&7) + 1 : -1),	\
1037                 (we < 0) ? -1 : ((we < 8)  ? 2*(we&7) + 0 : -1),	\
1038                 (we < 0) ? -1 : ((we < 8)  ? 2*(we&7) + 1 : -1),	\
1039                 (wf < 0) ? -1 : ((wf < 8)  ? 2*(wf&7) + 0 : -1),	\
1040                 (wf < 0) ? -1 : ((wf < 8)  ? 2*(wf&7) + 1 : -1)))	\
1041         );                                                              \
1042     } while (0)
1043 #else
1044 #define shuffle_16bit_words(out, lo, hi,       		        	\
1045             w0, w1, w2, w3,						\
1046             w4, w5, w6, w7,						\
1047             w8, w9, wa, wb,						\
1048             wc, wd, we, wf)						\
1049     do {                                                                \
1050         out[0] = _mm_xor_si128(			        	        \
1051             _mm_shuffle_epi8(lo, _mm_setr_epi8( 			\
1052                 (w0 < 0) ? -1 : ((w0 < 8)  ? 2*(w0&7) + 0 : -1),	\
1053                 (w0 < 0) ? -1 : ((w0 < 8)  ? 2*(w0&7) + 1 : -1),	\
1054                 (w1 < 0) ? -1 : ((w1 < 8)  ? 2*(w1&7) + 0 : -1),	\
1055                 (w1 < 0) ? -1 : ((w1 < 8)  ? 2*(w1&7) + 1 : -1),	\
1056                 (w2 < 0) ? -1 : ((w2 < 8)  ? 2*(w2&7) + 0 : -1),	\
1057                 (w2 < 0) ? -1 : ((w2 < 8)  ? 2*(w2&7) + 1 : -1),	\
1058                 (w3 < 0) ? -1 : ((w3 < 8)  ? 2*(w3&7) + 0 : -1),	\
1059                 (w3 < 0) ? -1 : ((w3 < 8)  ? 2*(w3&7) + 1 : -1),	\
1060                 (w4 < 0) ? -1 : ((w4 < 8)  ? 2*(w4&7) + 0 : -1),	\
1061                 (w4 < 0) ? -1 : ((w4 < 8)  ? 2*(w4&7) + 1 : -1),	\
1062                 (w5 < 0) ? -1 : ((w5 < 8)  ? 2*(w5&7) + 0 : -1),	\
1063                 (w5 < 0) ? -1 : ((w5 < 8)  ? 2*(w5&7) + 1 : -1),	\
1064                 (w6 < 0) ? -1 : ((w6 < 8)  ? 2*(w6&7) + 0 : -1),	\
1065                 (w6 < 0) ? -1 : ((w6 < 8)  ? 2*(w6&7) + 1 : -1),	\
1066                 (w7 < 0) ? -1 : ((w7 < 8)  ? 2*(w7&7) + 0 : -1),	\
1067                 (w7 < 0) ? -1 : ((w7 < 8)  ? 2*(w7&7) + 1 : -1))),	\
1068             _mm_shuffle_epi8(hi, _mm_setr_epi8( 			\
1069                 (w0 < 0) ? -1 : ((w0 >= 8) ? 2*(w0&7) + 0 : -1),	\
1070                 (w0 < 0) ? -1 : ((w0 >= 8) ? 2*(w0&7) + 1 : -1),	\
1071                 (w1 < 0) ? -1 : ((w1 >= 8) ? 2*(w1&7) + 0 : -1),	\
1072                 (w1 < 0) ? -1 : ((w1 >= 8) ? 2*(w1&7) + 1 : -1),	\
1073                 (w2 < 0) ? -1 : ((w2 >= 8) ? 2*(w2&7) + 0 : -1),	\
1074                 (w2 < 0) ? -1 : ((w2 >= 8) ? 2*(w2&7) + 1 : -1),	\
1075                 (w3 < 0) ? -1 : ((w3 >= 8) ? 2*(w3&7) + 0 : -1),	\
1076                 (w3 < 0) ? -1 : ((w3 >= 8) ? 2*(w3&7) + 1 : -1),	\
1077                 (w4 < 0) ? -1 : ((w4 >= 8) ? 2*(w4&7) + 0 : -1),	\
1078                 (w4 < 0) ? -1 : ((w4 >= 8) ? 2*(w4&7) + 1 : -1),	\
1079                 (w5 < 0) ? -1 : ((w5 >= 8) ? 2*(w5&7) + 0 : -1),	\
1080                 (w5 < 0) ? -1 : ((w5 >= 8) ? 2*(w5&7) + 1 : -1),	\
1081                 (w6 < 0) ? -1 : ((w6 >= 8) ? 2*(w6&7) + 0 : -1),	\
1082                 (w6 < 0) ? -1 : ((w6 >= 8) ? 2*(w6&7) + 1 : -1),	\
1083                 (w7 < 0) ? -1 : ((w7 >= 8) ? 2*(w7&7) + 0 : -1),	\
1084                 (w7 < 0) ? -1 : ((w7 >= 8) ? 2*(w7&7) + 1 : -1))));	\
1085         out[1] = _mm_xor_si128(			        	        \
1086             _mm_shuffle_epi8(lo, _mm_setr_epi8( 			\
1087                 (w8 < 0) ? -1 : ((w8 < 8)  ? 2*(w8&7) + 0 : -1),	\
1088                 (w8 < 0) ? -1 : ((w8 < 8)  ? 2*(w8&7) + 1 : -1),	\
1089                 (w9 < 0) ? -1 : ((w9 < 8)  ? 2*(w9&7) + 0 : -1),	\
1090                 (w9 < 0) ? -1 : ((w9 < 8)  ? 2*(w9&7) + 1 : -1),	\
1091                 (wa < 0) ? -1 : ((wa < 8)  ? 2*(wa&7) + 0 : -1),	\
1092                 (wa < 0) ? -1 : ((wa < 8)  ? 2*(wa&7) + 1 : -1),	\
1093                 (wb < 0) ? -1 : ((wb < 8)  ? 2*(wb&7) + 0 : -1),	\
1094                 (wb < 0) ? -1 : ((wb < 8)  ? 2*(wb&7) + 1 : -1),	\
1095                 (wc < 0) ? -1 : ((wc < 8)  ? 2*(wc&7) + 0 : -1),	\
1096                 (wc < 0) ? -1 : ((wc < 8)  ? 2*(wc&7) + 1 : -1),	\
1097                 (wd < 0) ? -1 : ((wd < 8)  ? 2*(wd&7) + 0 : -1),	\
1098                 (wd < 0) ? -1 : ((wd < 8)  ? 2*(wd&7) + 1 : -1),	\
1099                 (we < 0) ? -1 : ((we < 8)  ? 2*(we&7) + 0 : -1),	\
1100                 (we < 0) ? -1 : ((we < 8)  ? 2*(we&7) + 1 : -1),	\
1101                 (wf < 0) ? -1 : ((wf < 8)  ? 2*(wf&7) + 0 : -1),	\
1102                 (wf < 0) ? -1 : ((wf < 8)  ? 2*(wf&7) + 1 : -1))),	\
1103             _mm_shuffle_epi8(hi, _mm_setr_epi8(                         \
1104                 (w8 < 0) ? -1 : ((w8 >= 8) ? 2*(w8&7) + 0 : -1),	\
1105                 (w8 < 0) ? -1 : ((w8 >= 8) ? 2*(w8&7) + 1 : -1),	\
1106                 (w9 < 0) ? -1 : ((w9 >= 8) ? 2*(w9&7) + 0 : -1),	\
1107                 (w9 < 0) ? -1 : ((w9 >= 8) ? 2*(w9&7) + 1 : -1),	\
1108                 (wa < 0) ? -1 : ((wa >= 8) ? 2*(wa&7) + 0 : -1),	\
1109                 (wa < 0) ? -1 : ((wa >= 8) ? 2*(wa&7) + 1 : -1),	\
1110                 (wb < 0) ? -1 : ((wb >= 8) ? 2*(wb&7) + 0 : -1),	\
1111                 (wb < 0) ? -1 : ((wb >= 8) ? 2*(wb&7) + 1 : -1),	\
1112                 (wc < 0) ? -1 : ((wc >= 8) ? 2*(wc&7) + 0 : -1),	\
1113                 (wc < 0) ? -1 : ((wc >= 8) ? 2*(wc&7) + 1 : -1),	\
1114                 (wd < 0) ? -1 : ((wd >= 8) ? 2*(wd&7) + 0 : -1),	\
1115                 (wd < 0) ? -1 : ((wd >= 8) ? 2*(wd&7) + 1 : -1),	\
1116                 (we < 0) ? -1 : ((we >= 8) ? 2*(we&7) + 0 : -1),	\
1117                 (we < 0) ? -1 : ((we >= 8) ? 2*(we&7) + 1 : -1),	\
1118                 (wf < 0) ? -1 : ((wf >= 8) ? 2*(wf&7) + 0 : -1),	\
1119                 (wf < 0) ? -1 : ((wf >= 8) ? 2*(wf&7) + 1 : -1))));	\
1120     } while (0)
1121 #endif
1122 
1123         /* case of 192 bits within 256 bits. Three 64-bit words
1124          * split into four 48-bit words.
1125          */
1126         static void convert(elt& dst, const super::elt& a) {
1127             /* index of 16-bit word in destination, fetched from
1128              * which index of 16-bit word in the gfp::elt. This is
1129              * given for the 256-bit registers
1130              *
1131              * 0    0
1132              * 1    1
1133              * 2    2
1134              * 3    <empty>
1135              * 4    3
1136              * 5    4
1137              * 6    5
1138              * 7    <empty>
1139              * 8    6
1140              * 9    7
1141              * 10   8
1142              * 11   <empty>
1143              * 12   9
1144              * 13   10
1145              * 14   11
1146              * 15   <empty>
1147              */
1148 #ifdef  HAVE_AVX2
1149             /* I'm really upset here. _mm256_shuffle_epi8, aka VPSHUFB,
1150              * reads only 4-byte immediates (and discards the rest). As a
1151              * consequence, the following does not work: the indices
1152              * 12,13,14,15 read off bounds, while the 16,17, etc actually
1153              * do what they want, but based on the fact that they're
1154              * reduced mod 16 + implicitly considered wrt the high part
1155              * of the operand...
1156             dst.data[0] = _mm256_shuffle_epi8(
1157                     _mm256_loadu_si256((__m256i*) a.x),
1158                     _mm256_setr_epi8(
1159                         0,1,2,3,4,5,-1,-1,
1160                         6,7,8,9,10,11,-1,-1,
1161                         12,13,14,15,16,17,-1,-1,
1162                         18,19,20,21,22,23,-1,-1));
1163             */
1164 
1165 #if 0
1166             __m256i in = _mm256_loadu_si256((__m256i*) a.x);
1167             dst.data[0] =
1168                     _mm256_xor_si256(
1169                         _mm256_shuffle_epi8(
1170                         in,
1171                         _mm256_setr_epi8(
1172                             0,1,2,3,4,5,-1,-1,
1173                             6,7,8,9,10,11,-1,-1,
1174                             -1,-1,-1,-1,0,1,-1,-1,
1175                             2,3,4,5,6,7,-1,-1)),
1176                         _mm256_shuffle_epi8(
1177                             /* 0x4E is 0b01001110 aka (1,0,3,2) */
1178                             _mm256_permute4x64_epi64 (in, _MM_SHUFFLE(1,0,3,2)),
1179                         _mm256_setr_epi8(
1180                             -1,-1,-1,-1,-1,-1,-1,-1,
1181                             -1,-1,-1,-1,-1,-1,-1,-1,
1182                             12,13,14,15,-1,-1,-1,-1,
1183                             -1,-1,-1,-1,-1,-1,-1,-1)));
1184 #endif
1185             __m256i in = _mm256_loadu_si256((__m256i*) a.x);
1186             shuffle_16bit_words(dst.data, in,
1187                     0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1);
1188 #else   /* SSSE3 !! */
1189             __m128i lo = _mm_loadu_si128((__m128i*) a.x);
1190             __m128i hi = _mm_loadu_si128((__m128i*) (a.x + 2));
1191             shuffle_16bit_words(dst.data, lo, hi,
1192                     0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1);
1193             /* note that 16bit-wide shuffles use an 8-bit immediate,
1194              * but do not offer the option to selectively insert
1195              * zeroes. So we're probably better off shuffling bytes.
1196              */
1197 #endif
1198         }
1199 
1200         static super::elt conv_backend_get_main(elt const& src) {
1201             /* This is needed because we knowingly write off bounds */
1202             union station {
1203                 super::elt e;
1204 #ifdef  HAVE_AVX2
1205                 __m256i v[1];
1206 #else
1207                 __m128i v[2];
1208 #endif
1209                 station() {};
1210             } main;
1211 #ifdef  HAVE_AVX2
1212             shuffle_16bit_words(main.v, src.data[0],
1213                     0,1,2, 4,5,6, 8,9,10, 12,13,14, -1,-1,-1,-1);
1214 #else
1215             shuffle_16bit_words(main.v, src.data[0], src.data[1],
1216                     0,1,2, 4,5,6, 8,9,10, 12,13,14, -1,-1,-1,-1);
1217 #endif
1218             return main.e;
1219         }
1220         static super::elt_ur conv_backend_get_carries(elt const& src) {
1221             union station {
1222                 super::elt_ur e;
1223 #ifdef  HAVE_AVX2
1224                 __m256i v[1];
1225 #else
1226                 __m128i v[2];
1227 #endif
1228                 station() {};
1229             } carries, ncarries;
1230 
1231             /* It's slightly more complicated than it seems. The carry
1232              * words may be negative. So we must sign-extend them to the
1233              * full unreduced element size.
1234              */
1235 #ifdef  HAVE_AVX2
1236             shuffle_16bit_words(carries.v, src.data[0],
1237                     -1,-1,-1,3,
1238                     -1,-1,7,-1,
1239                     -1,11,-1,-1,
1240                     15,-1,-1,-1);
1241             __m256i zero = _mm256_setzero_si256();
1242             shuffle_16bit_words(ncarries.v,
1243                     _mm256_sub_epi16(zero,
1244                         _mm256_cmpgt_epi16(zero, carries.v[0])),
1245                     -1,-1,-1,-1,
1246                     3,-1,-1,6,
1247                     -1,-1,9,-1,
1248                     -1,12,-1,-1);
1249 #else
1250             shuffle_16bit_words(carries.v, src.data[0], src.data[1],
1251                     -1,-1,-1,3,
1252                     -1,-1,7,-1,
1253                     -1,11,-1,-1,
1254                     15,-1,-1,-1);
1255             __m128i zero = _mm_setzero_si128();
1256             shuffle_16bit_words(ncarries.v,
1257                     _mm_sub_epi16(zero, _mm_cmpgt_epi16(zero, carries.v[0])),
1258                     _mm_sub_epi16(zero, _mm_cmpgt_epi16(zero, carries.v[1])),
1259                     -1,-1,-1,-1,
1260                     3,-1,-1,6,
1261                     -1,-1,9,-1,
1262                     -1,12,-1,-1);
1263 #endif
1264             super::sub_ur(carries.e, ncarries.e);
1265             return carries.e;
1266         }
1267 
1268 
1269 
1270         /* (add|sub)mul_ui go through convert, do naively and convert
1271          * back. Yes, it's slightly painful. Here we assume that src
1272          * has undergone little to no accumulated additions, so that
1273          * it can basically be converted lossless to a gfp::elt
1274          */
1275         static inline void addmul_ui(elt & dst, elt const & src, mp_limb_t x, super::elt const & p, super::preinv const & j)
1276         {
1277             super::elt zr;
1278             super::elt_ur z(dst);
1279             super::addmul_ui(z, (super::elt) src, x, p, j);
1280             super::reduce(zr, z, p, j);
1281             dst = zr;
1282         }
1283         static inline void submul_ui(elt_ur & dst, elt const & src, mp_limb_t x, super::elt const & p, super::preinv const & j)
1284         {
1285             super::elt zr;
1286             super::elt_ur z(dst);
1287             super::submul_ui(z, (super::elt) src, x, p, j);
1288             super::reduce(zr, z, p, j);
1289             dst = zr;
1290         }
1291 
1292         /* we have *TWO* reduction functions here. One which assigns to a
1293          * standard gfp::elt, and one which assigns to fast_type::elt */
1294         static void reduce(super::elt & r, elt const & a, super::elt const & p, super::preinv const & j)
1295         {
1296             super::elt_ur z(a);
1297             super::reduce(r, z, p, j);
1298         }
1299         static void reduce(elt & r, elt const & a, super::elt const & p, super::preinv const & j)
1300         {
1301             super::elt zr;
1302             reduce(zr, a, p, j);
1303             r = zr;
1304         }
1305     };
1306 #endif  /* defined(HAVE_AVX2) || defined(HAVE_SSSE3) */
1307 #endif
1308 #endif
1309     }
1310 
1311 /* expose only what we have in our public interface */
1312 using details::gfp;
1313 using details::fast_type;
1314 }
1315 
1316 
1317 
1318 #endif
1319