#include <stdlib.h>
#ifndef LAS_SMALLSIEVE_LOWLEVEL_HPP_
#define LAS_SMALLSIEVE_LOWLEVEL_HPP_

#include "las-config.h"
#include "macros.h"

/* About row0_is_oddj: in order to check whether a j coordinate is even,
 * we need to take into account the bucket number, especially in case
 * buckets are as large as the sieve region. The row number corresponding
 * to a given i0 is i0/I, but we also need to add bucket_nr*bucket_size/I
 * to this, which is what this flag is for. Sublat must also be taken
 * into account.
 */

/* This is copied from LOGNORM_FILL_COMMON_DEFS in las-norms.cpp; from
 * logI, N, and LOG_BUCKET_REGION, define the integers i0, i1, j0, j1,
 * and I.
 */

#define SMALLSIEVE_COMMON_DEFS()                                              \
    const unsigned int log_lines_per_region = MAX(0, LOG_BUCKET_REGION - logI);\
    const unsigned int log_regions_per_line = MAX(0, logI - LOG_BUCKET_REGION);\
    const unsigned int regions_per_line = 1 << log_regions_per_line;          \
    const unsigned int region_rank_in_line = N & (regions_per_line - 1);      \
    const bool last_region_in_line MAYBE_UNUSED = region_rank_in_line == (regions_per_line - 1); \
    const unsigned int j0 = (N >> log_regions_per_line) << log_lines_per_region; \
    const unsigned int j1 MAYBE_UNUSED = j0 + (1 << log_lines_per_region);    \
    const int I = 1 << logI;                                                  \
    const int i0 = (region_rank_in_line << LOG_BUCKET_REGION) - I/2;          \
    const int i1 MAYBE_UNUSED = i0 + (1 << MIN(LOG_BUCKET_REGION, logI));     \
    /* those are (1,0,0) in the standard case */                              \
    const int sublatm MAYBE_UNUSED = sl.m ? sl.m : 1;                         \
    const unsigned int sublati0 MAYBE_UNUSED = sl.i0;                         \
    const unsigned int sublatj0 MAYBE_UNUSED = sl.j0;                         \
    const int row0_is_oddj MAYBE_UNUSED = (j0*sublatm + sublatj0) & 1;        \
    bool has_haxis = !j0;                                                     \
    bool has_vaxis = region_rank_in_line == ((regions_per_line-1)/2);         \
    bool has_origin MAYBE_UNUSED = has_haxis && has_vaxis;                    \
    do {} while (0)
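
/* Worked example (for illustration only; the parameter values below are
 * assumed, not dictated by this file): with LOG_BUCKET_REGION = 16 and
 * logI = 14, a bucket region spans several lines: log_lines_per_region = 2,
 * regions_per_line = 1, and for N = 5 the macro yields j0 = 20, j1 = 24,
 * I = 16384, i0 = -8192, i1 = 8192. Conversely with logI = 18 (a line is
 * larger than a bucket region) and N = 6, it yields regions_per_line = 4,
 * region_rank_in_line = 2, j0 = 1, j1 = 2, i0 = 0, i1 = 65536.
 */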

#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
/* {{{ preprocessor macro to generate assembly code for the small sieve */
/* {{{ Comments on the _OLDLOOP version (AF)
   0. The C code and the X86 asm code implement the same algorithm.
      Read the C code first in order to understand the asm code easily.
   1. If there are fewer than 12 "T" in the line, the goal is to do
      only one jump (pipeline breaks) and, of course, a minimal number
      of instructions.
   2. 12 "T" seems best in the critical loop. With fewer, gcc tends to
      optimize the loop badly. For gcc-generated code, the best here is
      systematic code (12*2 instructions), like the C code.
   3. The X86 asm optimization uses addb logp,(pi+p_or_2p*[0,1,2])
      & three_p_or_2p = 3 * p_or_2p; interleaving the lea (simulated add)
      with the real add seems somewhat worthwhile.
      So the loop is smaller & faster (19 instructions versus 27 for
      gcc's best X86 asm).
   4. Of course, the gain between the 2 versions is slight, because
      the main cost is the access time of the L0 cache: read + write
      with sieve_increase(pi,logp,w), or *pi += logp in fact.
}}} */
/* define sequences of operations rather systematically.
 *
 * ADD_NOINCR   *pi += logp.
 * ADD_INCR     *pi += logp; pi += incr
 * XADD_NOINCR  if (pi >= fence) goto exit; *pi += logp;
 * XADD_INCR    if (pi >= fence) goto exit; *pi += logp; pi += incr;
 */
/* {{{ *_ADD_NOINCR */
#define ONE_ADD_NOINCR(pi, logp)                                        \
    "addb " logp ",(" pi ")\n"                /* pi[0] += logp */
#define TWO_ADDS_NOINCR(pi, incr, logp)                                 \
    ONE_ADD_NOINCR(pi, logp)                  /* pi[0] += logp */       \
    ONE_ADD_NOINCR(pi "," incr ",1", logp)    /* pi[incr] += logp */
#define THREE_ADDS_NOINCR(pi, incr, logp)                               \
    ONE_ADD_NOINCR(pi, logp)                  /* pi[0] += logp */       \
    ONE_ADD_NOINCR(pi "," incr ",1", logp)    /* pi[incr] += logp */    \
    ONE_ADD_NOINCR(pi "," incr ",2", logp)    /* pi[2*incr] += logp */
/* }}} */
/* {{{ *_ADDS_INCR, including _PRECOMP variant */
#define ONE_ADD_INCR(pi, incr, logp)                                    \
    ONE_ADD_NOINCR(pi, logp)                                            \
    "lea (" pi "," incr ",1)," pi "\n"        /* pi += incr */
#define TWO_ADDS_INCR(pi, incr, logp)                                   \
    ONE_ADD_NOINCR(pi, logp)                  /* pi[0] += logp */       \
    ONE_ADD_NOINCR(pi "," incr ",1", logp)    /* pi[incr] += logp */    \
    "lea (" pi "," incr ",2)," pi "\n"        /* pi += 2*incr */
/* The _PRECOMP variant uses a precomputed bigincr = 3 * incr */
#define THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)                \
    THREE_ADDS_NOINCR(pi, incr, logp)                                   \
    "lea (" pi "," bigincr ",1)," pi "\n"     /* pi += 3*incr */
/* {{{ expand to define new lengths */
#define FOUR_ADDS_INCR(pi, incr, logp)                                  \
    TWO_ADDS_INCR(pi, incr, logp)                                       \
    TWO_ADDS_INCR(pi, incr, logp)
#define EIGHT_ADDS_INCR(pi, incr, logp)                                 \
    FOUR_ADDS_INCR(pi, incr, logp)                                      \
    FOUR_ADDS_INCR(pi, incr, logp)
#define TWELVE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)               \
    THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)                    \
    THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)                    \
    THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)                    \
    THREE_ADDS_INCR_PRECOMP(pi, incr, bigincr, logp)
#define SIXTEEN_ADDS_INCR(pi, incr, logp)                               \
    EIGHT_ADDS_INCR(pi, incr, logp)                                     \
    EIGHT_ADDS_INCR(pi, incr, logp)
/* }}} */
/* }}} */
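/* For instance (an illustration of the expansion only, nothing new is
 * defined here): TWO_ADDS_INCR("%[pi]","%[inc]","%[logp]") produces the
 * string
 *     "addb %[logp],(%[pi])\n"
 *     "addb %[logp],(%[pi],%[inc],1)\n"
 *     "lea (%[pi],%[inc],2),%[pi]\n"
 * i.e. two byte additions at pi and pi+incr, followed by a single pointer
 * update by 2*incr.
 */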
/* {{{ caution: *_XADDS_INCR */
#define ONE_XADD_NOINCR(pi, fence, logp, exit)                          \
    "cmp " fence ", " pi "\n"                 /* if (pi >= S1) break; */\
    "jae " exit "\n"                                                    \
    ONE_ADD_NOINCR(pi, logp)
#define ONE_XADD_INCR(pi, incr, fence, logp, exit)                      \
    ONE_XADD_NOINCR(pi, fence, logp, exit)                              \
    "lea (" pi "," incr ",1)," pi "\n"        /* pi += incr */
#define TWO_XADDS_INCR(pi, incr, fence, logp, exit)                     \
    ONE_XADD_INCR(pi, incr, fence, logp, exit)                          \
    ONE_XADD_INCR(pi, incr, fence, logp, exit)
/* no need for THREE_XADDs */
/* {{{ expand to define new lengths */
#define FOUR_XADDS_INCR(pi, incr, fence, logp, exit)                    \
    TWO_XADDS_INCR(pi, incr, fence, logp, exit)                         \
    TWO_XADDS_INCR(pi, incr, fence, logp, exit)
#define EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)                   \
    FOUR_XADDS_INCR(pi, incr, fence, logp, exit)                        \
    FOUR_XADDS_INCR(pi, incr, fence, logp, exit)
#define TWELVE_XADDS_INCR(pi, incr, fence, logp, exit)                  \
    EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)                       \
    FOUR_XADDS_INCR(pi, incr, fence, logp, exit)
#define SIXTEEN_XADDS_INCR(pi, incr, fence, logp, exit)                 \
    EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)                       \
    EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)
/* }}} */
/* }}} */
/* {{{ We also have _NOLASTINCR variants. Those are used in the old
 * assembly loop. As a matter of fact, it is not a good idea to optimize
 * like this: we do need the end value of the pointer, and that is
 * crucially important in the case where I>B. So using these functions
 * is part of the reason why the old code will not do in the longer term.
 */
#define TWO_XADDS_NOLASTINCR(pi, incr, fence, logp, exit)               \
    ONE_XADD_INCR(pi, incr, fence, logp, exit)                          \
    ONE_XADD_NOINCR(pi, fence, logp, exit)
#define FOUR_XADDS_NOLASTINCR(pi, incr, fence, logp, exit)              \
    TWO_XADDS_INCR(pi, incr, fence, logp, exit)                         \
    TWO_XADDS_NOLASTINCR(pi, incr, fence, logp, exit)
#define TWELVE_XADDS_NOLASTINCR(pi, incr, fence, logp, exit)            \
    EIGHT_XADDS_INCR(pi, incr, fence, logp, exit)                       \
    FOUR_XADDS_NOLASTINCR(pi, incr, fence, logp, exit)
#define SMALLSIEVE_ASSEMBLY_OLD(_pi, _incr, _fence, _logp) do {         \
    unsigned char *dummy;                                               \
    size_t three_p_or_2p;                                               \
    __asm__ __volatile__ (                                              \
            "lea (%[incr],%[incr],2), %[three_p_or_2p]\n" /* three_p_or_2p = p_or_2p * 3 */ \
            "lea (%[pi],%[three_p_or_2p],4), %[dummy]\n"  /* pi_end = pi + p_or_2p*12 */ \
            "cmp %[fence], %[dummy]\n"                    /* if (pi_end > S1) no loop */ \
            "ja 1f\n"                                                   \
            ".balign 8\n"                                               \
            "0:\n"                                                      \
            TWELVE_ADDS_INCR_PRECOMP("%[pi]","%[incr]","%[three_p_or_2p]","%[logp]") \
            "lea (%[pi],%[three_p_or_2p],4), %[dummy]\n"  /* if (pi+p_or_2p*12 > S1) break */ \
            "cmp %[fence], %[dummy]\n"                                  \
            "jbe 0b\n"                                                  \
            "1:\n"                                                      \
            TWELVE_XADDS_INCR("%[pi]","%[incr]","%[fence]","%[logp]","2f") \
            "2:\n"                                                      \
            : [dummy] "=&r"(dummy), [pi] "+r"(_pi), [three_p_or_2p]"=&r"(three_p_or_2p) \
            : [incr]"r"(_incr), [logp]"q"(_logp), [fence]"r"(_fence) : "cc"); \
} while (0)
/* }}} */
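/* In C terms, SMALLSIEVE_ASSEMBLY_OLD above amounts roughly to the
 * following sketch (its real C counterpart is manual_oldloop further
 * down in this file):
 *     while (pi + 12 * incr <= fence)
 *         do { *pi += logp; pi += incr; } twelve times;
 *     then do at most 12 more updates, each one checking pi < fence first.
 */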
#if 0 /* {{{ unused GCC trick */
/* there is technically a way to drop the "volatile" qualifier, so that
 * gcc understands that we're touching memory, and which memory exactly.
 * Such a way can be found in the snippet below.
 * Alas, I don't know how it is possible to do so and at the same time
 * make sure that: 1 - the memory reference that is passed to the inline asm
 * is strictly of the form (%[register]), and 2 - we can actually get the
 * register back, or emit code that is similar to (%[register],%[displ])
 * from there.
 * And anyway, I don't really believe that it's important: this block is
 * really the expensive operation, what happens to the glue around it is
 * less important.
 */
#define SMALLSIEVE_ASSEMBLY_NEW(S0, pos, incr, fence, logp) do {        \
    /* see gcc 6.45.2.6 Clobbers */                                     \
    typedef unsigned char (*ss_area_t)[1<<16];                          \
    ss_area_t S0x = (ss_area_t) S0;                                     \
    asm (                                                               \
            "leaq (%[S0],%[pos]), %[pi]\n"                              \
            ONE_ADD_INCR("%[pi]","%[inc]", "%[logp]")                   \
            ONE_XADD_INCR("%[pi]", "%[inc]", "%[ff]", "%[logp]", "0f")  \
            ONE_XADD_INCR("%[pi]", "%[inc]", "%[ff]", "%[logp]", "0f")  \
            ONE_XADD_INCR("%[pi]", "%[inc]", "%[ff]", "%[logp]", "0f")  \
            "0:\n"                                                      \
            : [S0]"=m"(*S0x), [pi]"=&r"(pi)                             \
            : [pos]"r"(pos),                                            \
            [inc]"r"(incr), [logp]"q"(logp), [ff] "r" (fence));         \
} while (0)
#endif /* }}} */
/* {{{ asm volatile statements for all the variants */
/* {{{ unrolled statements, limited length */
#define SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp)          \
    "0:\n"                                                              \
    : [pi]"+r"(_pi)                                                     \
    : [inc]"r"(_incr), [logp]"q"(_logp), [ff] "r" (_fence)
#define SMALLSIEVE_ASSEMBLY_NEW_1_TO_2(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            ONE_ADD_INCR("%[pi]","%[inc]","%[logp]")                    \
            ONE_XADD_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")      \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_2_TO_4(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            TWO_ADDS_INCR("%[pi]","%[inc]","%[logp]")                   \
            TWO_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")     \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_4_TO_8(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            FOUR_ADDS_INCR("%[pi]","%[inc]","%[logp]")                  \
            FOUR_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")    \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_8_TO_16(_pi, _incr, _fence, _logp)      \
    asm volatile (                                                      \
            EIGHT_ADDS_INCR("%[pi]","%[inc]","%[logp]")                 \
            EIGHT_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")   \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_16_TO_32(_pi, _incr, _fence, _logp)     \
    asm volatile (                                                      \
            SIXTEEN_ADDS_INCR("%[pi]","%[inc]","%[logp]")               \
            SIXTEEN_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f") \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
/* this is the point where the complete unrolling starts being excessive */
#define SMALLSIEVE_ASSEMBLY_NEW_32_TO_64(_pi, _incr, _fence, _logp)     \
    asm volatile (                                                      \
            SIXTEEN_ADDS_INCR("%[pi]","%[inc]","%[logp]")               \
            SIXTEEN_ADDS_INCR("%[pi]","%[inc]","%[logp]")               \
            SIXTEEN_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f") \
            SIXTEEN_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f") \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
/* }}} */
/* {{{ loop8 */
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP8_COMMON(_pi, _incr, _fence, _logp, _PP) \
    asm volatile (                                                      \
            "leaq (%[pi], %[inc], 8), %%rdi\n"                          \
            "cmpq %[ff], %%rdi\n"                                       \
            "ja 1f\n"           /* if rdi > fence, no loop */           \
            ".balign 8\n"                                               \
            "2:\n"                                                      \
            _PP                                                         \
            EIGHT_ADDS_INCR("%[pi]","%[inc]","%[logp]")                 \
            "leaq (%%rdi, %[inc], 8), %%rdi\n"                          \
            "cmpq %[ff], %%rdi\n"                                       \
            "jbe 2b\n"                                                  \
            "1:\n"                                                      \
            EIGHT_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")   \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp)      \
            : "rdi")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP8(_pi, _incr, _fence, _logp)        \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP8_COMMON(_pi, _incr, _fence, _logp,     \
            "")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP8P0(_pi, _incr, _fence, _logp)      \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP8_COMMON(_pi, _incr, _fence, _logp,     \
            "prefetch (%%rdi)\n")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP8P1(_pi, _incr, _fence, _logp)      \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP8_COMMON(_pi, _incr, _fence, _logp,     \
            "prefetch (%%rdi, %[inc], 8)\n")
/* }}} */
/* {{{ loop12 */
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp, _PP) \
    asm volatile (                                                      \
            "leaq (%[inc], %[inc], 2), %%r10\n"                         \
            "leaq (%[pi], %%r10, 4), %%rdi\n"                           \
            "cmpq %[ff], %%rdi\n"                                       \
            "ja 1f\n"           /* if rdi > fence, no loop */           \
            ".balign 8\n"                                               \
            "2:\n"                                                      \
            _PP                                                         \
            TWELVE_ADDS_INCR_PRECOMP("%[pi]","%[inc]","%%r10","%[logp]")\
            "leaq (%%rdi, %%r10, 4), %%rdi\n"                           \
            "cmpq %[ff], %%rdi\n"                                       \
            "jbe 2b\n"                                                  \
            "1:\n"                                                      \
            TWELVE_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")  \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp)      \
            : "rdi", "r10")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12(_pi, _incr, _fence, _logp)       \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp,    \
            "")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12P0(_pi, _incr, _fence, _logp)     \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp,    \
            "prefetch (%%rdi)\n")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12P1(_pi, _incr, _fence, _logp)     \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp,    \
            "prefetch (%%rdi, %%r10, 4)\n")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP12P2(_pi, _incr, _fence, _logp)     \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP12_COMMON(_pi, _incr, _fence, _logp,    \
            "prefetch (%%rdi, %%r10, 8)\n")
/* }}} */
/* {{{ loop16 */
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP16_COMMON(_pi, _incr, _fence, _logp, _PP) \
    asm volatile (                                                      \
            "leaq (%[pi], %[inc2], 8), %%rdi\n"                         \
            "cmpq %[ff], %%rdi\n"                                       \
            "ja 1f\n"           /* if rdi > fence, no loop */           \
            ".balign 8\n"                                               \
            "2:\n"                                                      \
            _PP                                                         \
            SIXTEEN_ADDS_INCR("%[pi]","%[inc]","%[logp]")               \
            "leaq (%%rdi, %[inc2], 8), %%rdi\n"                         \
            "cmpq %[ff], %%rdi\n"                                       \
            "jbe 2b\n"                                                  \
            "1:\n"                                                      \
            SIXTEEN_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f") \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp)      \
            , [inc2]"r"(2*(_incr)) : "rdi")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP16(_pi, _incr, _fence, _logp)       \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP16_COMMON(_pi, _incr, _fence, _logp,    \
            "")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP16P0(_pi, _incr, _fence, _logp)     \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP16_COMMON(_pi, _incr, _fence, _logp,    \
            "prefetch (%%rdi)\n")
#define SMALLSIEVE_ASSEMBLY_NEW_LOOP16P1(_pi, _incr, _fence, _logp)     \
    SMALLSIEVE_ASSEMBLY_NEW_LOOP16_COMMON(_pi, _incr, _fence, _logp,    \
            "prefetch (%%rdi, %[inc2], 8)\n")
/*}}}*/
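/* Note on the P0/P1/P2 suffixes above: the variants differ only in the
 * prefetch hint _PP placed at the top of the unrolled body. At that point
 * %%rdi holds the end of the chunk about to be sieved, so P0 prefetches
 * the start of the next chunk, while P1 (and P2 for loop12) prefetch one
 * (resp. two) chunks further ahead.
 */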
/* {{{ the routines below are unused. They might be, in case we insist on
 * using the same code for both even and odd lines (but then it would
 * probably be a loop anyway) */
#define SMALLSIEVE_ASSEMBLY_NEW_0_TO_1(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            ONE_XADD_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")      \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_0_TO_2(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            TWO_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")     \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_0_TO_4(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            FOUR_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")    \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
#define SMALLSIEVE_ASSEMBLY_NEW_1_TO_4(_pi, _incr, _fence, _logp)       \
    asm volatile (                                                      \
            ONE_ADD_INCR("%[pi]","%[inc]","%[logp]")                    \
            TWO_XADDS_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")     \
            ONE_XADD_INCR("%[pi]","%[inc]","%[ff]","%[logp]","0f")      \
            SMALLSIEVE_ASSEMBLY_TRAILER(_pi, _incr, _fence, _logp))
/*}}}*/
/*}}}*/
/* }}} */
#endif

/*{{{ function objects for all the routines that we have */
#define BEGIN_FOBJ(name_, compatibility_condition)                      \
    struct name_ {                                                       \
        template<int b> struct is_compatible {                           \
            static_assert(compatibility_condition,                       \
                    # name_ " requires " # compatibility_condition);     \
            static const bool value = compatibility_condition;           \
        };                                                               \
        static constexpr const char * name = # name_;                    \
        inline size_t operator()(                                        \
                unsigned char * S0,                                      \
                unsigned char * S1,                                      \
                size_t x0 MAYBE_UNUSED,                                  \
                size_t pos,                                              \
                size_t p_or_2p,                                          \
                unsigned char logp,                                      \
                where_am_I w MAYBE_UNUSED) const                         \
        {                                                                \
            unsigned char * pi = S0 + pos;                               \
            INTERMEDIARY_FOBJ()

#ifndef TRACE_K
#define INTERMEDIARY_FOBJ() do {} while (0)
#else
#define INTERMEDIARY_FOBJ() do {                                         \
    if (trace_on_range_Nx(w->N, x0 + pos, x0 + S1 - S0)) {               \
        if ((trace_Nx.x - x0 - pos) % p_or_2p == 0) {                    \
            WHERE_AM_I_UPDATE(w, x, trace_Nx.x);                         \
            sieve_increase_logging(S0 + w->x - x0, logp, w);             \
        }                                                                \
    }                                                                    \
} while (0)
#endif

#define END_FOBJ()                                                       \
    return pi - S1;                                                      \
}}
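
/* Usage sketch (illustration only): each BEGIN_FOBJ/END_FOBJ pair below
 * defines a function object whose operator() adds logp to S0[pos],
 * S0[pos + p_or_2p], ... as long as the pointer stays below S1, and
 * returns pi - S1, i.e. by how much the final pointer overshoots the end
 * of the region. A call looks like
 *     size_t overshoot = assembly_generic_loop12()(S0, S1, x0, pos, p_or_2p, logp, w);
 * The is_compatible<b> member is a compile-time check that a routine is
 * only selected for the unrolling indices b it supports.
 */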
#ifdef HAVE_GCC_STYLE_AMD64_INLINE_ASM
/* {{{ function objects for assembly code */
BEGIN_FOBJ(assembly_generic_loop8, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP8(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop12, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP12(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop16, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP16(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop8p0, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP8P0(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop12p0, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP12P0(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop16p0, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP16P0(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop8p1, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP8P1(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop12p1, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP12P1(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop16p1, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP16P1(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_loop12p2, true);
SMALLSIEVE_ASSEMBLY_NEW_LOOP12P2(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly_generic_oldloop, true);
SMALLSIEVE_ASSEMBLY_OLD(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly0, b <= 0);
SMALLSIEVE_ASSEMBLY_NEW_0_TO_1(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly1, b == 1);
SMALLSIEVE_ASSEMBLY_NEW_1_TO_2(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly1x, b <= 1);
SMALLSIEVE_ASSEMBLY_NEW_0_TO_2(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly2, b == 2);
SMALLSIEVE_ASSEMBLY_NEW_2_TO_4(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly2x, b <= 2);
SMALLSIEVE_ASSEMBLY_NEW_0_TO_4(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly3, b == 3);
SMALLSIEVE_ASSEMBLY_NEW_4_TO_8(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly4, b == 4);
SMALLSIEVE_ASSEMBLY_NEW_8_TO_16(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly5, b == 5);
SMALLSIEVE_ASSEMBLY_NEW_16_TO_32(pi, p_or_2p, S1, logp);
END_FOBJ();
BEGIN_FOBJ(assembly6, b == 6);
SMALLSIEVE_ASSEMBLY_NEW_32_TO_64(pi, p_or_2p, S1, logp);
END_FOBJ();
/* }}} */
#endif
/* {{{ Function objects for C code, manually unrolled except for the
 * _loop versions. */
BEGIN_FOBJ(manual0, b <= 0);
if (pi < S1) { *pi += logp; pi += p_or_2p; }
END_FOBJ();
BEGIN_FOBJ(manual1, b == 1);
do {
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; pi += p_or_2p;
} while (0);
END_FOBJ();
BEGIN_FOBJ(manual2, b == 2);
do {
    *pi += logp; pi += p_or_2p;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; pi += p_or_2p;
} while (0);
END_FOBJ();
BEGIN_FOBJ(manual3, b == 3);
do {
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; pi += p_or_2p;
} while (0);
END_FOBJ();
BEGIN_FOBJ(manual4, b == 4);
do {
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; pi += p_or_2p;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; if ((pi += p_or_2p) >= S1) break;
    *pi += logp; pi += p_or_2p;
} while (0);
END_FOBJ();

/* Gotcha: we're now preventively tracing the small sieve events outside
 * the asm code, which causes sieve_increase_logging_backend to be
 * called, and in turn test_divisible, which modifies traced norms (in
 * debug mode). So for consistency, we mustn't use sieve_increase here,
 * or the norm would be divided by p twice.
 */
BEGIN_FOBJ(manual_oldloop, true);
#define T do {                                                           \
    WHERE_AM_I_UPDATE(w, x, x0 + pi - S0);                               \
    *pi += logp; /* sieve_increase (pi, logp, w); */ pi += p_or_2p;      \
} while(0)
while (UNLIKELY(pi + p_or_2p * 12 <= S1))
{ T; T; T; T; T; T; T; T; T; T; T; T; }
do {
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
    if (pi >= S1) break;
    T;
} while (0);
#undef T
END_FOBJ();
BEGIN_FOBJ(manual_oldloop_nounroll, true);
for ( ; pi < S1 ; pi += p_or_2p) {
    WHERE_AM_I_UPDATE(w, x, x0 + pi - S0);
    *pi += logp; // sieve_increase (pi, logp, w);
}
END_FOBJ();
/*}}}*/
/*}}}*/

/* {{{ our selection of best routines indexed by ceil(log2(I/p)) */
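/* Example (illustration only): with I = 2^14 and a prime p = 5 we have
 * I/p of about 3277, hence ceil(log2(I/p)) = 12, so odd lines for that
 * prime would be sieved with best_oddline<12>::type, i.e.
 * assembly_generic_loop12 in the table below.
 */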
template<int bit> struct best_evenline;
template<int bit> struct best_oddline;
#if defined(HAVE_GCC_STYLE_AMD64_INLINE_ASM) && !defined(TRACK_CODE_PATH)
template<int bit> struct best_evenline {
    typedef int is_default;
    typedef manual_oldloop type;
};
template<int bit> struct best_oddline {
    typedef int is_default;
    typedef manual_oldloop type;
};
template<> struct best_evenline<1> { typedef manual0 type; };
template<> struct best_evenline<2> { typedef assembly1 type; };
template<> struct best_evenline<3> { typedef assembly2 type; };
template<> struct best_evenline<4> { typedef assembly3 type; };
template<> struct best_evenline<5> { typedef assembly4 type; };
template<> struct best_evenline<6> { typedef assembly5 type; };

template<> struct best_oddline<1> { typedef assembly1 type; };
template<> struct best_oddline<2> { typedef assembly2 type; };
template<> struct best_oddline<3> { typedef assembly3 type; };
template<> struct best_oddline<4> { typedef assembly4 type; };
template<> struct best_oddline<5> { typedef assembly_generic_loop8 type; };
template<> struct best_oddline<6> { typedef assembly_generic_loop8 type; };

#if 1
template<> struct best_evenline<7> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<8> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<9> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<10> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<11> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<12> { typedef assembly_generic_loop12 type; };
template<> struct best_evenline<13> { typedef assembly_generic_loop16p0 type; };
template<> struct best_evenline<14> { typedef assembly_generic_loop16p1 type; };
template<> struct best_evenline<15> { typedef assembly_generic_loop16p0 type; };

template<> struct best_oddline<7> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<8> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<9> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<10> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<11> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<12> { typedef assembly_generic_loop12 type; };
template<> struct best_oddline<13> { typedef assembly_generic_loop16p0 type; };
template<> struct best_oddline<14> { typedef assembly_generic_loop16p1 type; };
template<> struct best_oddline<15> { typedef assembly_generic_loop16p0 type; };
typedef assembly_generic_oldloop default_smallsieve_inner_loop;
#endif
/* TODO: we could perhaps provide hints so that the generated code can
 * merge some rounds of the loop. Or maybe the compiler will do that? */
#else
template<int bit> struct best_evenline { typedef manual_oldloop type; };
template<int bit> struct best_oddline { typedef manual_oldloop type; };
typedef manual_oldloop default_smallsieve_inner_loop;
#endif
/* }}} */


#endif  /* LAS_SMALLSIEVE_LOWLEVEL_HPP_ */