1 #ifndef LAS_SIEVE2357_HPP
2 #define LAS_SIEVE2357_HPP
3 
4 #include "cado_config.h"  // for HAVE_AVX2, HAVE_SSSE3
5 
6 #include <cstddef>       // for size_t
7 #include <cstdint>       // for uint8_t
8 #ifdef HAVE_SSE2
9 /* The base type that we'll use is good ol sse2 __m128i, but we need
10  * SSSE3 instructions to deal with them in the .cpp file */
11 #include <emmintrin.h>  // IWYU pragma: keep
12 #endif
13 #ifdef HAVE_AVX2
14 #include <immintrin.h>
15 #endif
16 #ifdef HAVE_ARM_NEON
17 #include <arm_neon.h>
18 #endif
19 
20 #include "fb-types.h"     // for fbprime_t
21 struct where_am_I;
22 
23 
24 class sieve2357base {
25 public:
26 #if defined(HAVE_AVX2)
27     typedef __m256i preferred_simd_type;
28 #elif defined(HAVE_SSSE3)
29     typedef __m128i preferred_simd_type;
30 #elif defined(HAVE_ARM_NEON)
31     typedef uint8x16_t preferred_simd_type;
32 #elif LONG_BIT == 64 /* FIXME: this is false on, e.g., MinGW. What's a better condition here? */
33     typedef uint64_t preferred_simd_type;
34 #else
35     typedef uint32_t preferred_simd_type;
36 #endif
37 
38     /* Define ordering on q which sieve2357 expect: first q=1, then powers of 2,
39        then odd primes in increasing order */
order_lt(fbprime_t q1,fbprime_t q2)40     static inline bool order_lt(fbprime_t q1, fbprime_t q2) {
41         if (q1 == 1 && q2 != 1)
42           return true;
43         if (q1 != 1 && q2 == 1)
44           return false;
45         if (q1 % 2 == 0 && q2 % 2 == 1)
46           return true;
47         if (q1 % 2 == 1 && q2 % 2 == 0)
48           return false;
49         return q1 < q2;
50     }
51 
52     /* We hit sievearray[x] for x = idx + i*q with 0 <= x < arraylen */
53     struct prime_t {
54         fbprime_t q, idx;
55         unsigned char logp;
56     };
57 
58     /* The sieve() function can either write to the sieve array, overwriting any
59        data that was previously there, or add to the sieve array.
60        If sieve2357::sieve() is to write to a sieve array that contains only
61        zeroes, then the former is faster (don't have to zero out the array and
62        sieve() saves one read and add per word).
63        If the sieve array contains non-zero data, then obviously update_add is
64        required. */
65     enum {
66         update_set,
67         update_add
68     };
69 };
70 
71 template <typename SIMDTYPE, typename ELEMTYPE>
72 class sieve2357: public sieve2357base {
73 private:
74     static inline void
75     sieve_odd_prime(SIMDTYPE * const result, const ELEMTYPE logp,
76         const fbprime_t stride, const fbprime_t idx, const SIMDTYPE even_mask);
77     static void
78     big_loop_set(SIMDTYPE * __restrict__ sievearray, const SIMDTYPE * const __restrict__ pattern235,
79         const SIMDTYPE * const __restrict__ pattern7, size_t l7);
80     static void
81     big_loop_add(SIMDTYPE * __restrict__ sievearray, const SIMDTYPE * const __restrict__ pattern235,
82         const SIMDTYPE * const __restrict__ pattern7, size_t l7);
83     static SIMDTYPE
84     sieve2(const fbprime_t q, const fbprime_t idx, const uint8_t logp);
85     static SIMDTYPE
86     get_even_mask(const int skip_mod_2);
87 
88 public:
89     /* A predicate that tells whether a prime power q = p^k can be sieved by
90        sieve2357 with a given SIMD and element data type */
can_sieve(const fbprime_t q)91     static inline bool can_sieve(const fbprime_t q)
92     {
93         const size_t N = sizeof(SIMDTYPE) / sizeof(ELEMTYPE);
94         return q == 1 || (q % 2 == 0 && N % q  == 0) || q == 3 || q == 5
95             || q == 7;
96     }
97 
98     static void sieve(SIMDTYPE *sievearray, size_t arraylen, const prime_t *primes,
99         int skip_mod_2, int update_operation, where_am_I & w);
100 
101 };
102 #endif
103