#ifndef LAS_SMALLSIEVE_LOWLEVEL_HPP_
#define LAS_SMALLSIEVE_LOWLEVEL_HPP_

#include <stdlib.h>

#include "las-config.h"
#include "macros.h"

/* About row0_is_oddj: in order to check whether a j coordinate is even,
 * we need to take into account the bucket number, especially in case
 * buckets are as large as the sieve region. The row number corresponding
 * to a given i0 is i0/I, but we also need to add bucket_nr*bucket_size/I
 * to this, which is what this flag is for.  Sublat must also be taken
 * into account.
 */

/* This is copied from LOGNORM_FILL_COMMON_DEFS in las-norms.cpp; from
 * logI, N, and LOG_BUCKET_REGION, define the integers i0, i1, j0, j1,
 * and I.
 */

#define SMALLSIEVE_COMMON_DEFS()                                         \
    const unsigned int log_lines_per_region = MAX(0, LOG_BUCKET_REGION - logI);\
    const unsigned int log_regions_per_line = MAX(0, logI - LOG_BUCKET_REGION);\
    const unsigned int regions_per_line = 1 << log_regions_per_line;           \
    const unsigned int region_rank_in_line = N & (regions_per_line - 1);       \
    const bool last_region_in_line MAYBE_UNUSED = region_rank_in_line == (regions_per_line - 1); \
    const unsigned int j0 = (N >> log_regions_per_line) << log_lines_per_region;    \
    const unsigned int j1 MAYBE_UNUSED = j0 + (1 << log_lines_per_region);    \
    const int I = 1 << logI;                                            \
    const int i0 = (region_rank_in_line << LOG_BUCKET_REGION) - I/2;          \
    const int i1 MAYBE_UNUSED = i0 + (1 << MIN(LOG_BUCKET_REGION, logI));     \
    /* those are (1,0,0) in the standard case */                        \
    const int sublatm MAYBE_UNUSED = sl.m ? sl.m : 1; \
    const unsigned int sublati0 MAYBE_UNUSED = sl.i0;       \
    const unsigned int sublatj0 MAYBE_UNUSED = sl.j0;       \
    const int row0_is_oddj MAYBE_UNUSED = (j0*sublatm + sublatj0) & 1;  \
    bool has_haxis = !j0;                                               \
    bool has_vaxis = region_rank_in_line == ((regions_per_line-1)/2);   \
    bool has_origin MAYBE_UNUSED = has_haxis && has_vaxis;              \
    do {} while (0)
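
/* Worked example (illustration only, not used by the code): with logI = 16
 * and LOG_BUCKET_REGION = 16, a bucket region is exactly one line, so
 * regions_per_line = 1, region_rank_in_line = 0, j0 = N, j1 = N + 1,
 * i0 = -I/2 = -32768 and i1 = 32768.  With logI = 17 and
 * LOG_BUCKET_REGION = 16 we get regions_per_line = 2: even-numbered
 * regions cover i in [-65536,0) and odd-numbered regions cover [0,65536),
 * both on line j0 = N/2.
 */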

#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
/* {{{ preprocessor macro to generate assembly code for the small sieve */
/* {{{ Comments on the _OLDLOOP version (AF)
   0. The C code and the asm X86 code implement the same algorithm.
   Read the C code first; it makes the asm easier to follow.
   1. If there are fewer than 12 "T" in the line, the goal is to issue
   only one jump (jumps break the pipeline) and, of course, as few
   instructions as possible.
   2. 12 "T" seems best in the critical loop. With fewer, gcc tries to
   optimize the loop in counter-productive ways. For gcc-generated code,
   the best here is systematic code (12*2 instructions), like the C code.
   3. The x86 asm optimization uses addb logp,(pi+p_or_2p*[0,1,2]) with a
   precomputed three_p_or_2p = 3 * p_or_2p; interleaving the lea (which
   simulates an add) with the real adds helps a bit.
   The resulting loop is smaller & faster (19 instructions versus 27 for
   gcc's best x86 asm).
   4. Of course, the gain between the two versions is small, because the
   main cost is the access time of the L0 cache: a read + write per hit
   with sieve_increase(pi,logp,w), i.e. *pi += logp.
   }}} */
/* define sequences of operations rather systematically.
 *
 * ADD_NOINCR   *pi += logp.
 * ADD_INCR     *pi += logp; pi += incr
 * XADD_NOINCR  if (pi >= fence) goto exit; *pi += logp;
 * XADD_INCR    if (pi >= fence) goto exit; *pi += logp; pi += incr;
 */
/* {{{ *_ADD_NOINCR */
#define ONE_ADD_NOINCR(pi, logp)                                     	\
    "addb " logp ",(" pi ")\n"                /* pi[0] += logp */
#define TWO_ADDS_NOINCR(pi, incr, logp)                 		\
    ONE_ADD_NOINCR(pi, logp)               /* pi[0] += logp */       	\
    ONE_ADD_NOINCR(pi "," incr ",1", logp) /* pi[incr] += logp */
#define THREE_ADDS_NOINCR(pi, incr, logp)                 		\
    ONE_ADD_NOINCR(pi, logp)               /* pi[0] += logp */       	\
    ONE_ADD_NOINCR(pi "," incr ",1", logp) /* pi[incr] += logp */    	\
    ONE_ADD_NOINCR(pi "," incr ",2", logp) /* pi[2*incr] += logp */
/* }}} */
/* {{{ *_ADDS_INCR, including _PRECOMP variant */
#define ONE_ADD_INCR(pi, incr, logp)                             	\
    ONE_ADD_NOINCR(pi, logp)                                         	\
    "lea (" pi "," incr ",1)," pi "\n"        /* pi += incr */
#define TWO_ADDS_INCR(pi, incr, logp)                 			\
    ONE_ADD_NOINCR(pi, logp)               /* pi[0] += logp */       	\
    ONE_ADD_NOINCR(pi "," incr ",1", logp) /* pi[incr] += logp */    	\
    "lea (" pi "," incr ",2)," pi "\n"     /* pi += 2*incr */
/* The _PRECOMP variant uses a precomputed bigincr = 3 * incr */
#define THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)                \
     THREE_ADDS_NOINCR(pi, incr, logp)                                  \
    "lea (" pi "," bigincr ",1)," pi "\n"     /* pi += 3*incr */
/* {{{ expand to define new lengths */
#define FOUR_ADDS_INCR(pi, incr, logp)                                  \
    TWO_ADDS_INCR(pi, incr, logp)                                       \
    TWO_ADDS_INCR(pi, incr, logp)
#define EIGHT_ADDS_INCR(pi, incr, logp)                                 \
    FOUR_ADDS_INCR(pi, incr, logp)                                      \
    FOUR_ADDS_INCR(pi, incr, logp)
#define TWELVE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)               \
    THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)                    \
    THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)                    \
    THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)                    \
    THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)
#define SIXTEEN_ADDS_INCR(pi, incr, logp)                               \
    EIGHT_ADDS_INCR(pi, incr, logp)                                     \
    EIGHT_ADDS_INCR(pi, incr, logp)
/* }}} */
/* }}} */
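/* Illustration (not part of the original macros): by string concatenation,
 * ONE_ADD_INCR("%[pi]","%[inc]","%[logp]") expands to the asm template
 *      "addb %[logp],(%[pi])\n"
 *      "lea (%[pi],%[inc],1),%[pi]\n"
 * i.e. *pi += logp; pi += incr, with logp in a byte register ("q"
 * constraint) and pi, incr in general-purpose registers.
 */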
/* {{{ caution: *_XADDS_INCR */
#define ONE_XADD_NOINCR(pi, fence, logp, exit)               	\
    "cmp " fence ", " pi "\n"                 /* if (pi >= S1) break; */\
    "jae " exit "\n"                                                    \
    ONE_ADD_NOINCR(pi, logp)
#define ONE_XADD_INCR(pi, incr, fence, logp, exit)       		\
    ONE_XADD_NOINCR(pi, fence, logp, exit)                   		\
    "lea (" pi "," incr ",1)," pi "\n"        /* pi += incr */
#define TWO_XADDS_INCR(pi, incr, fence, logp, exit)             	\
    ONE_XADD_INCR(pi, incr, fence, logp, exit)                  	\
    ONE_XADD_INCR(pi, incr, fence, logp, exit)
/* no need for THREE_XADDs */
/* {{{ expand to define new lengths */
#define FOUR_XADDS_INCR(pi, incr, fence, logp, exit)     		\
    TWO_XADDS_INCR(pi, incr, fence, logp, exit)          		\
    TWO_XADDS_INCR(pi, incr, fence, logp, exit)
#define EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)    		\
    FOUR_XADDS_INCR(pi, incr, fence, logp, exit)         		\
    FOUR_XADDS_INCR(pi, incr, fence, logp, exit)
#define TWELVE_XADDS_INCR(pi, incr, fence, logp, exit) 			\
    EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)        		\
    FOUR_XADDS_INCR(pi, incr, fence, logp, exit)
#define SIXTEEN_XADDS_INCR(pi, incr, fence, logp, exit)    		\
    EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)         		\
    EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)
/* }}} */
/* }}} */
/* {{{ We also have _NOLASTINCR variants. Those are used in the old
 * assembly loop. As a matter of fact, it is not a good idea to optimize
 * like this: we do need the end value of the pointer, and that is
 * crucially important in the case where I > B. So using these functions
 * is part of the reason why the old code will not do in the longer term.
 */
#define TWO_XADDS_NOLASTINCR(pi, incr, fence, logp, exit)       	\
    ONE_XADD_INCR(pi, incr, fence, logp, exit)                  	\
    ONE_XADD_NOINCR(pi, fence, logp, exit)
#define FOUR_XADDS_NOLASTINCR(pi, incr, fence, logp, exit)   		\
    TWO_XADDS_INCR(pi, incr, fence, logp, exit)          		\
    TWO_XADDS_NOLASTINCR(pi, incr, fence, logp, exit)
#define TWELVE_XADDS_NOLASTINCR(pi, incr, fence, logp, exit) 		\
    EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)        		\
    FOUR_XADDS_NOLASTINCR(pi, incr, fence, logp, exit)
#define SMALLSIEVE_ASSEMBLY_OLD(_pi, _incr, _fence, _logp) do {         \
    unsigned char *dummy;                                               \
    size_t three_p_or_2p;                                               \
    __asm__ __volatile__ (                                              \
            "lea (%[incr],%[incr],2), %[three_p_or_2p]\n"     /* three_p_or_2p = p_or_2p * 3 */ \
            "lea (%[pi],%[three_p_or_2p],4), %[dummy]\n"     /* pi_end = pi + p_or_2p*12 */    \
            "cmp %[fence], %[dummy]\n"            /* if (pi_end > S1) no loop */    \
            "ja 1f\n"                                                   \
            ".balign 8\n"                                               \
            "0:\n"                                                      \
            TWELVE_ADDS_INCR_PRECOMP("%[pi]","%[incr]","%[three_p_or_2p]","%[logp]")               \
            "lea (%[pi],%[three_p_or_2p],4), %[dummy]\n"    /* if (pi+p_or_2p*12 > S1) break */\
            "cmp %[fence], %[dummy]\n"                                              \
            "jbe 0b\n"                                                  \
            "1:\n"                                                      \
            TWELVE_XADDS_INCR("%[pi]","%[incr]","%[fence]","%[logp]","2f")    \
            "2:\n"                                                      \
            : [dummy] "=&r"(dummy), [pi] "+r"(_pi), [three_p_or_2p]"=&r"(three_p_or_2p)            \
            : [incr]"r"(_incr), [logp]"q"(_logp), [fence]"r"(_fence) : "cc");                 \
} while (0)
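/* For reference, a C-shaped sketch of the control flow above (illustration
 * only; manual_oldloop further down is the actual C counterpart):
 *
 *      while (pi + 12 * p_or_2p <= fence) {
 *          twelve times: { *pi += logp; pi += p_or_2p; }
 *      }
 *      up to twelve times: { if (pi >= fence) break; *pi += logp; pi += p_or_2p; }
 */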
/* }}} */
#if 0 /* {{{ unused GCC trick */
/* there is technically a way to drop the "volatile" qualifier, so that
 * gcc understands that we're touching memory, and which memory exactly.
 * Such a way can be found in the snippet below.
 * Alas, I don't know how it is possible to do so and at the same time
 * make sure that: 1 - the memory reference that is passed to the inline asm
 * is strictly of the form (%[register]), and 2 - we can actually get the
 * register back, or emit code that is similar to (%[register],%[displ])
 * from there.
 * And anyway, I don't really believe that it's important: this block is
 * really the expensive operation, what happens to the glue around it is
 * less important.
 */
#define SMALLSIEVE_ASSEMBLY_NEW(S0, pos, incr, fence, logp) do {        \
    /* see gcc 6.45.2.6 Clobbers */                                     \
    typedef unsigned char (*ss_area_t)[1<<16];                          \
    ss_area_t S0x = (ss_area_t) S0;                                     \
    asm (                                                               \
            "leaq (%[S0],%[pos]), %[pi]\n"                              \
            ONE_ADD_INCR("%[pi]","%[inc]", "%[logp]")            \
            ONE_XADD_INCR("%[pi]", "%[inc]", "%[ff]", "%[logp]", "0f") \
            ONE_XADD_INCR("%[pi]", "%[inc]", "%[ff]", "%[logp]", "0f") \
            ONE_XADD_INCR("%[pi]", "%[inc]", "%[ff]", "%[logp]", "0f") \
            "0:\n"                                                      \
            : [S0]"=m"(*S0x), [pi]"=&r"(pi)                             \
            : [pos]"r"(pos),                                            \
            [inc]"r"(incr), [logp]"q"(logp), [ff] "r" (fence));         \
} while (0)
#endif /* }}} */
/* {{{ asm volatile statements for all the variants */
/* {{{ unrolled statements, limited length */
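/* Note on the naming (added for clarity): the X_TO_Y variants below handle a
 * line that receives between X and Y hits from this prime: X unconditional
 * updates followed by Y-X updates guarded by a comparison against the fence.
 */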
#define SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp)          \
            "0:\n"                                                      \
            : [pi]"+r"(_pi)                                             \
            : [inc]"r"(_incr), [logp]"q"(_logp), [ff] "r" (_fence)
#define SMALLSIEVE_ASSEMBLY_NEW_1_TO_2(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            ONE_ADD_INCR("%[pi]","%[inc]","%[logp]")                    \
            ONE_XADD_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")      \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_2_TO_4(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            TWO_ADDS_INCR("%[pi]","%[inc]","%[logp]")                   \
            TWO_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")     \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_4_TO_8(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            FOUR_ADDS_INCR("%[pi]","%[inc]","%[logp]")                  \
            FOUR_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")    \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_8_TO_16(_pi, _incr, _fence, _logp)      \
    asm volatile (                                                      \
            EIGHT_ADDS_INCR("%[pi]","%[inc]","%[logp]")                 \
            EIGHT_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")   \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_16_TO_32(_pi, _incr, _fence, _logp)     \
    asm volatile (                                                      \
            SIXTEEN_ADDS_INCR("%[pi]","%[inc]","%[logp]")               \
            SIXTEEN_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f") \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
/* this is the point where the complete unrolling starts being excessive */
#define SMALLSIEVE_ASSEMBLY_NEW_32_TO_64(_pi, _incr, _fence, _logp)     \
    asm volatile (                                                      \
            SIXTEEN_ADDS_INCR("%[pi]","%[inc]","%[logp]")               \
            SIXTEEN_ADDS_INCR("%[pi]","%[inc]","%[logp]")               \
            SIXTEEN_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f") \
            SIXTEEN_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f") \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
/* }}} */
/* {{{ loop8 */
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP8_COMMON(_pi, _incr, _fence, _logp, _PP) \
    asm volatile (                                                      \
            "leaq (%[pi], %[inc], 8), %%rdi\n"                          \
            "cmpq %[ff], %%rdi\n"                                       \
            "ja 1f\n"           /* if rdi > fence, no loop */           \
            ".balign 8\n"                                               \
            "2:\n"                                                      \
            _PP                                                         \
            EIGHT_ADDS_INCR("%[pi]","%[inc]","%[logp]")                 \
            "leaq (%%rdi, %[inc], 8), %%rdi\n"                          \
            "cmpq %[ff], %%rdi\n"                                       \
            "jbe 2b\n"                                                  \
            "1:\n"                                                      \
            EIGHT_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")   \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp)      \
            : "rdi")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP8(_pi, _incr, _fence, _logp)        \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP8_COMMON(_pi, _incr, _fence, _logp, \
                "")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP8P0(_pi, _incr, _fence, _logp)      \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP8_COMMON(_pi, _incr, _fence, _logp, \
                "prefetch  (%%rdi)\n")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP8P1(_pi, _incr, _fence, _logp)      \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP8_COMMON(_pi, _incr, _fence, _logp, \
                "prefetch  (%%rdi, %[inc], 8)\n")
/* }}} */
/* {{{ loop12 */
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp, _PP) \
    asm volatile (                                                      \
            "leaq (%[inc], %[inc], 2), %%r10\n"                         \
            "leaq (%[pi], %%r10, 4), %%rdi\n"                           \
            "cmpq %[ff], %%rdi\n"                                       \
            "ja 1f\n"           /* if rdi > fence, no loop */           \
            ".balign 8\n"                                               \
            "2:\n"                                                      \
            _PP                                                         \
            TWELVE_ADDS_INCR_PRECOMP("%[pi]","%[inc]","%%r10","%[logp]")\
            "leaq (%%rdi, %%r10, 4), %%rdi\n"                           \
            "cmpq %[ff], %%rdi\n"                                       \
            "jbe 2b\n"                                                  \
            "1:\n"                                                      \
            TWELVE_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")  \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp)      \
            : "rdi", "r10")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12(_pi, _incr, _fence, _logp)       \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp,\
                "")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12P0(_pi, _incr, _fence, _logp)     \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp,\
                "prefetch  (%%rdi)\n")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12P1(_pi, _incr, _fence, _logp)     \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp,\
                "prefetch  (%%rdi, %%r10, 4)\n")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12P2(_pi, _incr, _fence, _logp)     \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp,\
                "prefetch  (%%rdi, %%r10, 8)\n")
/* }}} */
/* {{{ loop16 */
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP16_COMMON(_pi, _incr, _fence, _logp, _PP) \
    asm volatile (                                                      \
            "leaq (%[pi], %[inc2], 8), %%rdi\n"                         \
            "cmpq %[ff], %%rdi\n"                                       \
            "ja 1f\n"           /* if rdi > fence, no loop */           \
            ".balign 8\n"                                               \
            "2:\n"                                                      \
            _PP                                                         \
            SIXTEEN_ADDS_INCR("%[pi]","%[inc]","%[logp]")               \
            "leaq (%%rdi, %[inc2], 8), %%rdi\n"                         \
            "cmpq %[ff], %%rdi\n"                                       \
            "jbe 2b\n"                                                  \
            "1:\n"                                                      \
            SIXTEEN_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f") \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp)      \
            , [inc2]"r"(2*(_incr)) : "rdi")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP16(_pi, _incr, _fence, _logp)        \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP16_COMMON(_pi, _incr, _fence, _logp, \
                "")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP16P0(_pi, _incr, _fence, _logp)      \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP16_COMMON(_pi, _incr, _fence, _logp, \
                "prefetch  (%%rdi)\n")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP16P1(_pi, _incr, _fence, _logp)      \
        SMALLSIEVE_ASSEMBLY_NEW_LOOP16_COMMON(_pi, _incr, _fence, _logp, \
                "prefetch  (%%rdi, %[inc2], 8)\n")
/*}}}*/
/* {{{ the routines below are unused. They might be used if we insisted on
 * using the same code for both even and odd lines (but then it would
 * probably be a loop anyway) */
#define SMALLSIEVE_ASSEMBLY_NEW_0_TO_1(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            ONE_XADD_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")      \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_0_TO_2(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            TWO_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")     \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_0_TO_4(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            FOUR_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")    \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_1_TO_4(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            ONE_ADD_INCR("%[pi]","%[inc]","%[logp]")                    \
            TWO_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")     \
            ONE_XADD_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")      \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
/*}}}*/
/*}}}*/
/* }}} */
#endif

/*{{{ function objects for all the routines that we have */
#define BEGIN_FOBJ(name_, compatibility_condition)       		\
    struct name_ {							\
        template<int b> struct is_compatible {                          \
            static_assert(compatibility_condition,                      \
                    # name_ " requires " # compatibility_condition);    \
            static const bool value = compatibility_condition;          \
        };                                                              \
        static constexpr const char * name = # name_;			\
        inline size_t operator()(					\
                unsigned char * S0,					\
                unsigned char * S1,					\
                size_t x0 MAYBE_UNUSED,					\
                size_t pos,						\
                size_t p_or_2p,						\
                unsigned char logp,					\
                where_am_I w MAYBE_UNUSED) const			\
        {								\
            unsigned char * pi = S0 + pos;                              \
            INTERMEDIARY_FOBJ()

#ifndef TRACE_K
#define INTERMEDIARY_FOBJ()       do {} while (0)
#else
#define INTERMEDIARY_FOBJ() do {					\
            if (trace_on_range_Nx(w->N, x0 + pos, x0 + S1 - S0)) {	\
                if ((trace_Nx.x - x0 - pos) % p_or_2p == 0) {           \
                    WHERE_AM_I_UPDATE(w, x, trace_Nx.x);		\
                    sieve_increase_logging(S0 + w->x - x0, logp, w);	\
                }							\
            }								\
} while (0)
#endif

#define END_FOBJ()							\
    return pi - S1;							\
}}
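/* Usage sketch (hypothetical caller, for illustration only): each function
 * object below sieves one line of the region [S0, S1) for one prime and
 * returns pi - S1, the distance by which the final pointer passed the
 * fence, which the caller can use to derive the starting offset on the
 * next line.  For instance:
 *
 *      size_t overshoot = assembly_generic_loop12()(S0, S1, x0, pos,
 *                                                   p_or_2p, logp, w);
 */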
#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
/* {{{ function objects for assembly code */
BEGIN_FOBJ(assembly_generic_loop8, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP8(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop12, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP12(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop16, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP16(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop8p0, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP8P0(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop12p0, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP12P0(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop16p0, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP16P0(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop8p1, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP8P1(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop12p1, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP12P1(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop16p1, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP16P1(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop12p2, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP12P2(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_oldloop, true);
SMALLSIEVE_ASSEMBLY_OLD(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly0, b <= 0);
SMALLSIEVE_ASSEMBLY_NEW_0_TO_1(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly1, b == 1);
SMALLSIEVE_ASSEMBLY_NEW_1_TO_2(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly1x, b <= 1);
SMALLSIEVE_ASSEMBLY_NEW_0_TO_2(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly2, b == 2);
SMALLSIEVE_ASSEMBLY_NEW_2_TO_4(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly2x, b <= 2);
SMALLSIEVE_ASSEMBLY_NEW_0_TO_4(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly3, b == 3);
SMALLSIEVE_ASSEMBLY_NEW_4_TO_8(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly4, b == 4);
SMALLSIEVE_ASSEMBLY_NEW_8_TO_16(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly5, b == 5);
SMALLSIEVE_ASSEMBLY_NEW_16_TO_32(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly6, b == 6);
SMALLSIEVE_ASSEMBLY_NEW_32_TO_64(pi, p_or_2p, S1, logp);
END_FOBJ();
/* }}} */
#endif
/* {{{ Function objects for C code, manually unrolled except for the
 * _loop versions. */
BEGIN_FOBJ(manual0, b <= 0);
if (pi < S1) { *pi += logp; pi += p_or_2p; }
END_FOBJ();
BEGIN_FOBJ(manual1, b == 1);
do {
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; pi += p_or_2p;
} while (0);
END_FOBJ();
BEGIN_FOBJ(manual2, b == 2);
do {
    *pi += logp; pi += p_or_2p;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; pi += p_or_2p;
} while (0);
END_FOBJ();
BEGIN_FOBJ(manual3, b == 3);
do {
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; pi += p_or_2p;
} while (0);
END_FOBJ();
BEGIN_FOBJ(manual4, b == 4);
do {
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; pi += p_or_2p;
} while (0);
END_FOBJ();
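
/* Note (added for clarity): the manual<b> objects mirror the
 * SMALLSIEVE_ASSEMBLY_NEW_X_TO_Y variants above, for b as used in the
 * is_compatible conditions (cf. the ceil(log2(I/p)) index below): 2^(b-1)
 * unconditional updates followed by guarded ones, valid when the line
 * receives between 2^(b-1) and 2^b hits (manual0 handles at most one hit).
 */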

/* Gotcha: we're now preventively tracing the small sieve events outside
 * the asm code, which causes sieve_increase_logging_backend to be
 * called, and in turn test_divisible, which modifies traced norms (in
 * debug mode). So for consistency, we mustn't use sieve_increase here,
 * or the norm would be divided by p twice.
 */
BEGIN_FOBJ(manual_oldloop, true);
#define T do {                                                          \
    WHERE_AM_I_UPDATE(w, x, x0 + pi - S0);                              \
    *pi += logp; /* sieve_increase (pi, logp, w); */ pi += p_or_2p;     \
} while(0)
while (UNLIKELY(pi + p_or_2p * 12 <= S1))
{ T; T; T; T; T; T; T; T; T; T; T; T; }
do {
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
} while (0);
#undef T
END_FOBJ();
BEGIN_FOBJ(manual_oldloop_nounroll, true);
for ( ; pi < S1 ; pi += p_or_2p) {
    WHERE_AM_I_UPDATE(w, x, x0 + pi - S0);
    *pi += logp; // sieve_increase (pi, logp, w);
}
END_FOBJ();
/*}}}*/
/*}}}*/

/* {{{ our selection of best routines indexed by ceil(log2(I/p)) */
template<int bit> struct best_evenline;
template<int bit> struct best_oddline ;
#if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) && !defined(TRACK_CODE_PATH)
template<int bit> struct best_evenline {
    typedef int is_default;
    typedef manual_oldloop type;
};
template<int bit> struct best_oddline  {
    typedef int is_default;
    typedef manual_oldloop type;
};
template<> struct best_evenline<1> { typedef manual0 type; };
template<> struct best_evenline<2> { typedef assembly1 type; };
template<> struct best_evenline<3> { typedef assembly2 type; };
template<> struct best_evenline<4> { typedef assembly3 type; };
template<> struct best_evenline<5> { typedef assembly4 type; };
template<> struct best_evenline<6> { typedef assembly5 type; };

template<> struct best_oddline<1> { typedef assembly1 type; };
template<> struct best_oddline<2> { typedef assembly2 type; };
template<> struct best_oddline<3> { typedef assembly3 type; };
template<> struct best_oddline<4> { typedef assembly4 type; };
template<> struct best_oddline<5> { typedef assembly_generic_loop8 type; };
template<> struct best_oddline<6> { typedef assembly_generic_loop8 type; };

#if 1
template<> struct best_evenline<7> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<8> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<9> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<10> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<11> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<12> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<13> { typedef assembly_generic_loop16p0 type; };
template<> struct best_evenline<14> { typedef assembly_generic_loop16p1 type; };
template<> struct best_evenline<15> { typedef assembly_generic_loop16p0 type; };

template<> struct best_oddline<7> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<8> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<9> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<10> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<11> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<12> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<13> { typedef assembly_generic_loop16p0 type; };
template<> struct best_oddline<14> { typedef assembly_generic_loop16p1 type; };
template<> struct best_oddline<15> { typedef assembly_generic_loop16p0 type; };
typedef assembly_generic_oldloop default_smallsieve_inner_loop;
#endif
/* TODO: we could perhaps provide hints so that the generated code can
 * merge some rounds of the loop. Or maybe the compiler will do that? */
#else
template<int bit> struct best_evenline { typedef manual_oldloop type; };
template<int bit> struct best_oddline  { typedef manual_oldloop type; };
typedef manual_oldloop default_smallsieve_inner_loop;
#endif
/* }}} */
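
/* Dispatch sketch (hypothetical, for illustration only): a caller that
 * knows bit at compile time would select and invoke a routine as
 *
 *      typedef typename best_oddline<5>::type F;
 *      size_t overshoot = F()(S0, S1, x0, pos, p_or_2p, logp, w);
 *
 * (assembly_generic_loop8 in the asm-enabled branch above); values of bit
 * without a specialization fall back to the is_default instantiation,
 * which uses manual_oldloop.
 */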


#endif	/* LAS_SMALLSIEVE_LOWLEVEL_HPP_ */