1 #ifndef __PLINK2_BASE_H__
2 #define __PLINK2_BASE_H__
3
4 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
5 // Christopher Chang.
6 //
7 // This library is free software: you can redistribute it and/or modify it
8 // under the terms of the GNU Lesser General Public License as published by the
9 // Free Software Foundation; either version 3 of the License, or (at your
10 // option) any later version.
11 //
12 // This library is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
15 // for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public License
18 // along with this library. If not, see <http://www.gnu.org/licenses/>.
19
20
21 // Low-level C99/C++03/C++11 library covering basic I/O, SWAR/SIMD, and
22 // Windows/OS X/Linux portability. We try to benefit from as much C++ type
23 // safety as we can without either breaking compatibility with C-only codebases
24 // or making extension of pgenlib/plink2 code more difficult than the old
25 // type-unsafe style.
26 //
27 // Parameter conventions:
28 // - Input parameters, then in/out, then pure outputs, then temporary buffers.
29 // Reference-style input parameters tend to go in the very front, to make it
30 // more obvious that they aren't in/out.
31 // - "bitarr" indicates a word-aligned, packed array of bits, while "bitvec"
32 // indicates vector-alignment in 64-bit builds. ("vector" always means SIMD
33 // inputs/outputs here; C++ std::vector is not used in this codebase.)
34 // - Most pointers are stationary; moving pointers have an _iter suffix.
35 //
36 // Type-choice guidelines:
37 // - Integers are unsigned by default, signed only when necessary.
38 // It's necessary to choose one or the other to avoid drowning in a sea of
39 // casts and unexpected behavior. Each choice has its own surprising
40 // pitfalls that the developer had better be aware of; and I definitely do
41 // not take the position that unsigned is the better default *in all C and/or
42 // C++ code*. However, for this codebase, the extremely high frequency of
43 // bitwise operations makes unsigned-default the only sane choice.
44 // Some consequences of this choice:
45 // - All pointer differences that are part of a larger arithmetic or
46 // comparison expression are explicitly casted to uintptr_t.
47 // - Since uint64_t -> double conversion is frequently slower than int64_t ->
48 // double conversion, u63tod() should be used when the integer is known to
49 // be less than 2^63. If we also know it's less than 2^31, u31tod() can
50 // provide a performance improvement on Win32.
51 // - Integers that can be >= 2^32 in some of the largest existing datasets, but
52 // are usually smaller, should be defined as uintptr_t, to strike a good
53 // balance between 32-bit performance and 64-bit scaling. Exhaustive
54 // overflow checking in the 32-bit build is a non-goal; but I do aim for very
55 // high statistical reliability, by inserting checks whenever it occurs to me
56 // that overflow is especially likely (e.g. when multiplying two potentially
57 // large 32-bit numbers).
58 // - Bitarrays and 'nyparrays' (packed arrays of 2-bit elements, such as a row
59 // of a plink 1.x .bed file) are usually uintptr_t*, to encourage
60 // word-at-a-time iteration without requiring vector-alignment. Quite a few
61 // low-level library functions cast them to VecW*. As mentioned above, the
62 // affected function parameter names must end in 'vec' when this creates an
63 // alignment requirement.
64 // - A buffer/iterator expected to contain only UTF-8 text should be char*.
65 // unsigned char* should be reserved for byte-array buffers and iterators
66 // which are expected to interact with some non-text bytes, and generic
67 // memory-location pointers which will be subject to pointer arithmetic.
68 // (Note that this creates some clutter in low-level parsing code: since the
69 // signedness of char is platform-dependent, it becomes necessary to use e.g.
70 // ctou32() a fair bit.)
71 // - unsigned char is an acceptable argument type for functions intended to
72 // process a single text character, thanks to implicit char -> unsigned char
73 // conversion; it's just unsigned char* that should be avoided.
74 // - void* return values should be restricted to generic pointers which are
75 // *not* expected to be subject to pointer arithmetic. void* as input
76 // parameter type should only be used when there are at least two equally
77 // valid input types, NOT counting VecW*.
78
79
80 #if (__GNUC__ < 4)
81 // may eventually add MSVC support to gain access to MKL on Windows, but can't
82 // justify doing that before all major features are implemented.
83 # error "gcc 4.x+ or clang equivalent required."
84 #endif
85
86 // The -Wshorten-64-to-32 diagnostic forces the code to be cluttered with
87 // meaningless uintptr_t -> uint32_t static casts (number known to be < 2^32,
88 // just stored in a uintptr_t because there's no speed penalty and we generally
89 // want to think in terms of word-based operations). The code is more readable
90 // if S_CAST(uint32_t, <potentially wider value>) is reserved for situations
91 // where a higher bit may actually be set. This pragma can always be commented
92 // out on the few occasions where inappropriate silent truncation is suspected.
93 #ifdef __clang__
94 # pragma clang diagnostic ignored "-Wshorten-64-to-32"
95 #endif
96
97 // 10000 * major + 100 * minor + patch
98 // Exception to CONSTI32, since we want the preprocessor to have access
99 // to this value. Named with all caps as a consequence.
100 #define PLINK2_BASE_VERNUM 702
101
102
103 #define _FILE_OFFSET_BITS 64
104
105 #include <stdio.h>
106 #include <stdlib.h>
107 #include <string.h>
108 #include <stddef.h> // offsetof()
109 #include <stdint.h>
110 #ifndef __STDC_FORMAT_MACROS
111 # define __STDC_FORMAT_MACROS 1
112 #endif
113 #include <inttypes.h>
114 #include <limits.h> // CHAR_BIT, PATH_MAX
115
116 // #define NDEBUG
117 #include <assert.h>
118
119 #ifdef _WIN32
120 // needed for EnterCriticalSection, etc.
121 # ifndef _WIN64
122 # define WINVER 0x0501
123 # else
124 # define __LP64__
125 # endif
126 # ifndef WIN32_LEAN_AND_MEAN
127 # define WIN32_LEAN_AND_MEAN
128 # endif
129 # include <windows.h>
130 #endif
131
132 #if __cplusplus >= 201103L
133 # include <array>
134 #endif
135
136 #ifdef __LP64__
137 # ifndef __SSE2__
138 // possible todo: remove this requirement, the 32-bit VecW-using code does most
139 // of what we need. But little point in doing this before we have e.g. an
140 // ARM-based machine to test with that scientists would plausibly want to run
141 // plink2 on.
142 # error "64-bit builds currently require SSE2. Try producing a 32-bit build instead."
143 # endif
144 # include <emmintrin.h>
145 # ifdef __SSE4_2__
146 # define USE_SSE42
147 # include <smmintrin.h>
148 # ifdef __AVX2__
149 # include <immintrin.h>
150 # ifndef __BMI__
151 # error "AVX2 builds require -mbmi as well."
152 # endif
153 # ifndef __BMI2__
154 # error "AVX2 builds require -mbmi2 as well."
155 # endif
156 # ifndef __LZCNT__
157 # error "AVX2 builds require -mlzcnt as well."
158 # endif
159 # define USE_AVX2
160 # endif
161 # endif
162 # define ALIGNV16 __attribute__ ((aligned (16)))
163 #else
164 # define ALIGNV16
165 #endif
166
167 // done with #includes, can start C++ namespace...
168 #ifdef __cplusplus
169 namespace plink2 {
170 #endif
171
172 // ...though a bunch of symbols remain to be #defined; try to reduce the number
173 // over time.
174
175 #ifndef UINT32_MAX
176 // can theoretically be undefined in C++03
177 # define UINT32_MAX 0xffffffffU
178 #endif
179
180 #define UINT32_MAXM1 0xfffffffeU
181
182 #ifdef __cplusplus
183 # define HEADER_INLINE inline
184 // Previously went on a wild constexpr spree, but now these are mostly unused.
185 // Reserve for cases where (i) there's a clear constant-initialization use case
186 // for an imaginable downstream program (I'm looking at you, DivUp() and
187 // RoundUpPow2()...), or (ii) it allows a useful static_assert to be inserted
188 // for a hardcoded constant.
189 # if __cplusplus >= 201103L
190 # define HEADER_CINLINE constexpr
191 # define CSINLINE static constexpr
192 # if __cplusplus > 201103L
193 # define HEADER_CINLINE2 constexpr
194 # define CSINLINE2 static constexpr
195 # else
196 # define HEADER_CINLINE2 inline
197 # define CSINLINE2 static inline
198 # endif
199 # else
200 # define HEADER_CINLINE inline
201 # define HEADER_CINLINE2 inline
202 # define CSINLINE static inline
203 # define CSINLINE2 static inline
204 # endif
205 # if __cplusplus <= 199711L
206 // this may be defined anyway, at least on OS X
207 # ifndef static_assert
208 // todo: check other cases
209 # define static_assert(cond, msg)
210 # endif
211 # endif
212 #else
213 # define HEADER_INLINE static inline
214 # define HEADER_CINLINE static inline
215 # define HEADER_CINLINE2 static inline
216 # define CSINLINE static inline
217 # define CSINLINE2 static inline
218 // _Static_assert() should work in gcc 4.6+
219 # if (__GNUC__ == 4) && (__GNUC_MINOR__ < 6)
220 # if defined(__clang__) && defined(__has_feature) && defined(__has_extension)
221 # if __has_feature(c_static_assert) || __has_extension(c_static_assert)
222 # define static_assert _Static_assert
223 # else
224 # define static_assert(cond, msg)
225 # endif
226 # else
227 # define static_assert(cond, msg)
228 # endif
229 # else
230 # define static_assert _Static_assert
231 # endif
232 #endif
233
234 #define __maybe_unused __attribute__((unused))
235
236 // Rule of thumb: Use these macros if, and only if, the condition would always
237 // trigger exit-from-program. As a side effect, this makes it more
238 // straightforward, if still tedious, to make global changes to error-handling
239 // strategy (always dump backtrace and exit immediately?), though provision
240 // must still be made for sometimes-error-sometimes-not return paths which
241 // don't get an unlikely annotation.
242 #ifndef likely
243 # define likely(expr) __builtin_expect(!!(expr), 1)
244 # define unlikely(expr) __builtin_expect(!!(expr), 0)
245 #endif
246
247 #ifdef __cplusplus
248 # define K_CAST(type, val) (const_cast<type>(val))
249 # define R_CAST(type, val) (reinterpret_cast<type>(val))
250 # define S_CAST(type, val) (static_cast<type>(val))
251 #else
252 # define K_CAST(type, val) ((type)(val))
253 # define R_CAST(type, val) ((type)(val))
254 # define S_CAST(type, val) ((type)(val))
255 #endif
256
257 // (from Linux kernel)
258 // container_of - cast a member of a structure out to the containing structure
259 // @ptr: the pointer to the member.
260 // @type: the type of the container struct this is embedded in.
261 // @member: the name of the member within the struct.
262 #define container_of(ptr, type, member) \
263 (R_CAST(type*, R_CAST(char*, ptr) - offsetof(type, member)))
264
265 // original macro doesn't work in C++ when e.g. ptr is a const char*, and the
266 // quick workaround of casting away the const is unsafe.
267 #define const_container_of(ptr, type, member) \
268 (R_CAST(const type*, R_CAST(const char*, ptr) - offsetof(type, member)))
269
u31tod(uint32_t uii)270 HEADER_INLINE double u31tod(uint32_t uii) {
271 const int32_t ii = uii;
272 assert(ii >= 0);
273 return S_CAST(double, ii);
274 }
275
swtod(intptr_t lii)276 HEADER_INLINE double swtod(intptr_t lii) {
277 return S_CAST(double, lii);
278 }
279
u63tod(uint64_t ullii)280 HEADER_INLINE double u63tod(uint64_t ullii) {
281 const int64_t llii = ullii;
282 assert(llii >= 0);
283 return S_CAST(double, llii);
284 }
285
u31tof(uint32_t uii)286 HEADER_INLINE float u31tof(uint32_t uii) {
287 const int32_t ii = uii;
288 assert(ii >= 0);
289 return S_CAST(float, ii);
290 }
291
ctou32(char cc)292 HEADER_INLINE uint32_t ctou32(char cc) {
293 return S_CAST(unsigned char, cc);
294 }
295
ctow(char cc)296 HEADER_INLINE uintptr_t ctow(char cc) {
297 return S_CAST(unsigned char, cc);
298 }
299
ctou64(char cc)300 HEADER_INLINE uint64_t ctou64(char cc) {
301 return S_CAST(unsigned char, cc);
302 }
303
304 // Error return types. All of these evaluate to true on error and false on
305 // success, but otherwise they have slightly different semantics:
306 // * PglErr is the general-purpose enum. Unlike an enum, implicit conversion
307 // *to* int, not just from int, is prevented by the C++11 compiler (and the
308 // C++11-compiler-validated code still works under C99). (To achieve this
309 // additional safety, we engage in a bit of code duplication which would be
310 // unreasonable for flagsets.)
311 // (Previously, explicit cast to uint32_t, but not int32_t, was supported, to
312 // reflect the fact that all error codes are positive. This was deemed
313 // silly.)
314 // * BoolErr allows implicit conversion from int, but conversion back to
315 // uint32_t requires an explicit cast. (It should always be 0/1-valued, but
316 // this isn't enforced by the compiler.)
317 // * IntErr allows implicit conversion from int, but conversion back to
318 // int32_t requires an explicit cast. It mainly serves as a holding pen for
319 // C standard library error return values, which can be negative.
#if __cplusplus >= 201103L
// C++11 build: wrap the enum in a struct so implicit conversion *to* int is
// blocked (see the error-return-type discussion above); the enum body below
// is shared verbatim with the plain C typedef branch.
struct PglErr {
  enum class ec
#else
typedef enum
#endif
  {
  kPglRetSuccess,
  kPglRetSkipped,
  kPglRetNomem,
  kPglRetOpenFail,
  kPglRetReadFail,
  kPglRetWriteFail,
  // MalformedInput should be returned on low-level file format violations,
  // while InconsistentInput should be returned for higher-level logical
  // problems like mismatched files (generally solvable by fixing the command
  // line), and DegenerateData for properly-formatted-and-matched files that
  // yield degenerate computational results due to e.g. divide by zero or
  // insufficient rank.
  kPglRetMalformedInput,
  kPglRetInconsistentInput,
  kPglRetInvalidCmdline,
  kPglRetThreadCreateFail,
  kPglRetNetworkFail,
  kPglRetVarRecordTooLarge,
  kPglRetUnsupportedInstructions,
  kPglRetDegenerateData,
  kPglRetDecompressFail,  // also distinguish this from MalformedInput
  kPglRetRewindFail,
  // Gaps in the numbering below are deliberate: these codes are fixed so
  // that they remain stable across library versions.
  kPglRetSampleMajorBed = 32,
  kPglRetNomemCustomMsg = 59,
  kPglRetInternalError = 60,
  kPglRetWarningErrcode = 61,
  kPglRetImproperFunctionCall = 62,
  kPglRetNotYetSupported = 63,

  // These are only for internal use. If any of these reach the top level
  // instead of being handled or converted to another error code, that's a bug,
  // and plink2 prints a message to that effect.
  kPglRetHelp = 125,
  kPglRetLongLine = 126,
  kPglRetEof = 127}
#if __cplusplus >= 201103L
;

  PglErr() {}

  PglErr(const PglErr& source) : value_(source.value_) {}

  // Implicit construction from the enum, so 'return kPglRetNomem;' works.
  PglErr(ec source) : value_(source) {}

  // Allow explicit conversion from uint64_t, and NOT uint32_t, to this error
  // type, to support reproducible multithreaded error reporting (where
  // multiple threads may atomically attempt to modify a single uint64_t with
  // the error code in the low 32 bits and a priority number in the high bits).
  explicit PglErr(uint64_t source) : value_(static_cast<ec>(source)) {}

  PglErr& operator=(const PglErr&) = default;

  // Implicit conversion back to the raw enum, so switch statements work.
  operator ec() const {
    return value_;
  }

  explicit operator uint32_t() const {
    return static_cast<uint32_t>(value_);
  }

  explicit operator int32_t() const {
    return static_cast<int32_t>(value_);
  }

  // True iff this holds a nonzero (i.e. error) code.
  explicit operator bool() const {
    return (static_cast<uint32_t>(value_) != 0);
  }

 private:
  ec value_;
};

// Mirror each enumerator at namespace scope so C and C++ code can spell the
// same unqualified kPglRet... names.
const PglErr kPglRetSuccess = PglErr::ec::kPglRetSuccess;
const PglErr kPglRetSkipped = PglErr::ec::kPglRetSkipped;
const PglErr kPglRetNomem = PglErr::ec::kPglRetNomem;
const PglErr kPglRetOpenFail = PglErr::ec::kPglRetOpenFail;
const PglErr kPglRetReadFail = PglErr::ec::kPglRetReadFail;
const PglErr kPglRetWriteFail = PglErr::ec::kPglRetWriteFail;
const PglErr kPglRetMalformedInput = PglErr::ec::kPglRetMalformedInput;
const PglErr kPglRetInconsistentInput = PglErr::ec::kPglRetInconsistentInput;
const PglErr kPglRetInvalidCmdline = PglErr::ec::kPglRetInvalidCmdline;
const PglErr kPglRetHelp = PglErr::ec::kPglRetHelp;
const PglErr kPglRetThreadCreateFail = PglErr::ec::kPglRetThreadCreateFail;
const PglErr kPglRetNetworkFail = PglErr::ec::kPglRetNetworkFail;
const PglErr kPglRetVarRecordTooLarge = PglErr::ec::kPglRetVarRecordTooLarge;
const PglErr kPglRetUnsupportedInstructions = PglErr::ec::kPglRetUnsupportedInstructions;
const PglErr kPglRetDegenerateData = PglErr::ec::kPglRetDegenerateData;
const PglErr kPglRetDecompressFail = PglErr::ec::kPglRetDecompressFail;
const PglErr kPglRetRewindFail = PglErr::ec::kPglRetRewindFail;
const PglErr kPglRetSampleMajorBed = PglErr::ec::kPglRetSampleMajorBed;
const PglErr kPglRetWarningErrcode = PglErr::ec::kPglRetWarningErrcode;
const PglErr kPglRetNomemCustomMsg = PglErr::ec::kPglRetNomemCustomMsg;
const PglErr kPglRetInternalError = PglErr::ec::kPglRetInternalError;
const PglErr kPglRetImproperFunctionCall = PglErr::ec::kPglRetImproperFunctionCall;
const PglErr kPglRetNotYetSupported = PglErr::ec::kPglRetNotYetSupported;
const PglErr kPglRetLongLine = PglErr::ec::kPglRetLongLine;
const PglErr kPglRetEof = PglErr::ec::kPglRetEof;
#else
PglErr;
#endif
427
428 #if __cplusplus >= 201103L
429 // allow efficient arithmetic on these, but force them to require explicit
430 // int32_t/uint32_t casts; only permit implicit assignment from
431 // int32_t/uint32_t by default.
432 // built-in bool type does too many things we don't want...
433
434 // expected to be integer-valued, but not necessarily 0/1 or positive
// Expected to be integer-valued, but not necessarily 0/1 or positive; mainly
// a holding pen for C standard library error return values.  Implicit
// construction from int32_t is allowed, but conversion back to int32_t
// requires an explicit cast.
struct IntErr {
  IntErr() {}

  IntErr(int32_t source) : errval_(source) {}

  explicit operator int32_t() const {
    return errval_;
  }

  // True iff a nonzero (error) value is stored.
  explicit operator bool() const {
    return errval_ != 0;
  }

 private:
  int32_t errval_;
};
451
452 // expected to be 0/1-valued
// Expected to be 0/1-valued (not enforced by the compiler).  Implicit
// construction from uint32_t is allowed, but conversion back to uint32_t
// requires an explicit cast.
struct BoolErr {
  BoolErr() {}

  BoolErr(uint32_t source) : errval_(source) {}

  explicit operator uint32_t() const {
    return errval_;
  }

  // True iff a nonzero (error) value is stored.
  explicit operator bool() const {
    return errval_ != 0;
  }

 private:
  uint32_t errval_;
};
469 #else
470 typedef int32_t IntErr;
471 typedef uint32_t BoolErr;
472 #endif
473
474 // make this work on 32-bit as well as 64-bit systems, across
475 // Windows/OS X/Linux
476 // (todo: clean this up a bit. it's inherently a baling-wire-and-duct-tape
477 // sort of thing, though...)
478 #ifdef _WIN32
479 // must compile with -std=gnu++11, not c++11, on 32-bit Windows since
480 // otherwise fseeko64 not defined...
481 # define fseeko fseeko64
482 # define ftello ftello64
483 # define FOPEN_RB "rb"
484 # define FOPEN_WB "wb"
485 # define FOPEN_AB "ab"
486 # define ferror_unlocked ferror
487 # define feof_unlocked feof
488 # ifdef __LP64__
489 # define getc_unlocked _fgetc_nolock
490 # define putc_unlocked _fputc_nolock
491 // todo: find mingw-w64 build which properly links _fread_nolock, and
492 // conditional-compile
493 # define fread_unlocked fread
494 # define fwrite_unlocked fwrite
495 # else
496 # define getc_unlocked getc
497 # define putc_unlocked putc
498 # define fread_unlocked fread
499 # define fwrite_unlocked fwrite
500 # endif
501 # if __cplusplus < 201103L
502 # define uint64_t unsigned long long
503 # define int64_t long long
504 # endif
505 #else // Linux or OS X
506 # define FOPEN_RB "r"
507 # define FOPEN_WB "w"
508 # define FOPEN_AB "a"
509 # if defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__)
510 # define fread_unlocked fread
511 # define fwrite_unlocked fwrite
512 # endif
513 # if defined(__NetBSD__)
514 # define ferror_unlocked ferror
515 # define feof_unlocked feof
516 # endif
517 #endif
518
519 #ifdef _WIN32
520 # undef PRId64
521 # undef PRIu64
522 # define PRId64 "I64d"
523 # define PRIu64 "I64u"
524 #else
525 # ifdef __cplusplus
526 # ifndef PRId64
527 # define PRId64 "lld"
528 # endif
529 # endif
530 #endif
531
532 // These are useful for defending against base-pointer integer overflow on bad
533 // input.
PtrAddCk(const unsigned char * end,intptr_t incr,const unsigned char ** basep)534 HEADER_INLINE BoolErr PtrAddCk(const unsigned char* end, intptr_t incr, const unsigned char** basep) {
535 *basep += incr;
536 return unlikely((end - (*basep)) < 0);
537 }
538
539 // 'W' for writable
PtrWSubCk(unsigned char * base,intptr_t decr,unsigned char ** endp)540 HEADER_INLINE BoolErr PtrWSubCk(unsigned char* base, intptr_t decr, unsigned char** endp) {
541 *endp -= decr;
542 return unlikely(((*endp) - base) < 0);
543 }
544
PtrCheck(const void * end,const void * base,intptr_t req)545 HEADER_INLINE BoolErr PtrCheck(const void* end, const void* base, intptr_t req) {
546 const unsigned char* end_uc = S_CAST(const unsigned char*, end);
547 const unsigned char* base_uc = S_CAST(const unsigned char*, base);
548 return unlikely((end_uc - base_uc) < req);
549 }
550
551 // We want this to return an uint32_t, not an int32_t.
ctzu32(uint32_t uii)552 HEADER_INLINE uint32_t ctzu32(uint32_t uii) {
553 return __builtin_ctz(uii);
554 }
555
556 // this should always compile down to bsr.
bsru32(uint32_t uii)557 HEADER_INLINE uint32_t bsru32(uint32_t uii) {
558 return 31 - __builtin_clz(uii);
559 }
560
561 #ifdef _WIN64
ctzw(unsigned long long ullii)562 HEADER_INLINE uint32_t ctzw(unsigned long long ullii) {
563 return __builtin_ctzll(ullii);
564 }
565
bsrw(unsigned long long ullii)566 HEADER_INLINE uint32_t bsrw(unsigned long long ullii) {
567 // Note that this actually compiles to a single instruction on x86; it's
568 // naked __builtin_clzll which requires an additional subtraction.
569 return 63 - __builtin_clzll(ullii);
570 }
571 #else
ctzw(unsigned long ulii)572 HEADER_INLINE uint32_t ctzw(unsigned long ulii) {
573 return __builtin_ctzl(ulii);
574 }
575
bsrw(unsigned long ulii)576 HEADER_INLINE uint32_t bsrw(unsigned long ulii) {
577 return (8 * sizeof(intptr_t) - 1) - __builtin_clzl(ulii);
578 }
579 # ifndef __LP64__
580 // needed to prevent GCC 6 build failure
581 # if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8)
582 # if (__cplusplus < 201103L) && !defined(__APPLE__)
583 # ifndef uintptr_t
584 # define uintptr_t unsigned long
585 # endif
586 # ifndef intptr_t
587 # define intptr_t long
588 # endif
589 # endif
590 # endif
591 # endif
592 #endif
593
594 #ifdef __LP64__
595 # ifdef _WIN32 // i.e. Win64
596
597 # undef PRIuPTR
598 # undef PRIdPTR
599 # define PRIuPTR PRIu64
600 # define PRIdPTR PRId64
601 # define PRIxPTR2 "016I64x"
602
603 # else // not _WIN32
604
605 # ifndef PRIuPTR
606 # define PRIuPTR "lu"
607 # endif
608 # ifndef PRIdPTR
609 # define PRIdPTR "ld"
610 # endif
611 # define PRIxPTR2 "016lx"
612
613 # endif // Win64
614
615 #else // not __LP64__
616
617 // without this, we get ridiculous warning spew...
618 // not 100% sure this is the right cutoff, but this has been tested on 4.7
619 // and 4.8 build machines, so it plausibly is.
620 # if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8) && (__cplusplus < 201103L)
621 # undef PRIuPTR
622 # undef PRIdPTR
623 # define PRIuPTR "lu"
624 # define PRIdPTR "ld"
625 # endif
626
627 # define PRIxPTR2 "08lx"
628
629 #endif
630
631 #ifndef HAVE_NULLPTR
632 # ifndef __cplusplus
633 # define nullptr NULL
634 # else
635 # if __cplusplus <= 199711L
636 # ifndef nullptr
637 # define nullptr NULL
638 # endif
639 # endif
640 # endif
641 #endif
642
643 // Checked a bunch of alternatives to #define constants. For integer constants
644 // in [-2^31, 2^31), enum {} avoids macro expansion issues that actually
645 // matter, and that more than cancels out any tiny increase in binary size due
646 // to additional debugger information (which has value, anyway). However, we
647 // don't want to use this under C++ due to enumeral/non-enumeral conditional
648 // expression warnings, so this isn't one-size-fits-all; and plain old const
649 // int has all the functionality we want under C++ (including internal linkage,
650 // so it's fine to define them in header files). Thus we wrap the
651 // implementation in a macro.
652 //
653 // Otherwise, the macro expansion thing is still annoying but we suck it up due
654 // to the need for too much duplicate C vs. C++ code ("initializer element is
655 // not constant" when using const <type> in C99...)
656 //
657 // We start most plink2- and pgenlib-specific numeric constant names here with
658 // "kPgl", which should have a vanishingly small chance of colliding with
659 // anything in C99. Note that stuff like kBytesPerWord is not considered
660 // library-specific, so it's exempt from having "Pgl" in the name. Also, the
661 // few string literals here are of the FOPEN_WB sort, which have similar usage
662 // patterns to e.g. PRIuPTR which shouldn't be renamed, so those remain
663 // all-caps.
664 //
665 // (Update, May 2018: CONSTU31 was renamed to CONSTI32 and changed to type
666 // int32_t, to prevent C vs. C++ differences. This almost never makes a
667 // difference, since when int32_t and uint32_t are mixed in the same
668 // expression, the former gets converted to unsigned. However, unpleasant
669 // surprises are occasionally possible when mixing these constants with
670 // uint16_t or unsigned char values, since then the unsigned values are
671 // promoted to int32_t. Also, this essentially forces use of -Wno-sign-compare
672 // when using gcc 4.4.
673 //
674 // Biggest thing to watch out for is mixing of Halfword with these constants in
675 // 32-bit builds. Dosage and Vec8thUint are also relevant.)
676 #ifdef __cplusplus
677 # define CONSTI32(name, expr) const int32_t name = (expr)
678 #else
679 # define CONSTI32(name, expr) enum {name = (expr)}
680 #endif
681
682 // useful because of its bitwise complement: ~k0LU is a word with all 1 bits,
683 // while ~0 is always 32 1 bits.
684 // LLU is used over ULL for searchability (no conflict with NULL).
685 static const uintptr_t k0LU = S_CAST(uintptr_t, 0);
686
687 // mainly useful for bitshifts: (k1LU << 32) works in 64-bit builds, while
688 // (1 << 32) is undefined. also used as a quicker-to-type way of casting
689 // numbers/expressions to uintptr_t (via multiplication).
690 static const uintptr_t k1LU = S_CAST(uintptr_t, 1);
691
692
693 #ifdef __LP64__
694 # ifdef USE_AVX2
695 CONSTI32(kBytesPerVec, 32);
696
697 // 16 still seems to noticeably outperform 32 on my Mac test machine, and
698 // is about equal on my Linux test machine, probably due to reduced clock
699 // frequency when 32-byte floating point vector operations are used (as in, ALL
700 // operations, sometimes on ALL cores, become slower when a single core
701 // performs a 32-byte fp vector operation).
702 // However, processor power management, numeric libraries, and my AVX2 code
703 // should improve over time. There will probably come a time where switching
704 // to 32-byte fp is worthwhile.
705 // #define FVEC_32
706
707 // bleah, have to define these here, vector_size doesn't see enum values
708 typedef uintptr_t VecW __attribute__ ((vector_size (32)));
709 typedef uint32_t VecU32 __attribute__ ((vector_size (32)));
710 typedef int32_t VecI32 __attribute__ ((vector_size (32)));
711 typedef unsigned short VecU16 __attribute__ ((vector_size (32)));
712 typedef short VecI16 __attribute__ ((vector_size (32)));
713 // documentation says 'char', but int8_t works fine under gcc 4.4 and conveys
714 // intent better (char not guaranteed to be signed)
715 typedef int8_t VecI8 __attribute__ ((vector_size (32)));
716 typedef unsigned char VecUc __attribute__ ((vector_size (32)));
717 # else
718 CONSTI32(kBytesPerVec, 16);
// 16-byte (SSE2) vector types, mirroring the 32-byte AVX2 set above.
// Consistency fix: VecU32/VecI32 previously used the nonstandard
// '__attribute' spelling (accepted by gcc/clang, but inconsistent with every
// other typedef in this header); all now use '__attribute__'.
typedef uintptr_t VecW __attribute__ ((vector_size (16)));
typedef uint32_t VecU32 __attribute__ ((vector_size (16)));
typedef int32_t VecI32 __attribute__ ((vector_size (16)));
typedef unsigned short VecU16 __attribute__ ((vector_size (16)));
typedef short VecI16 __attribute__ ((vector_size (16)));
typedef int8_t VecI8 __attribute__ ((vector_size (16)));
typedef unsigned char VecUc __attribute__ ((vector_size (16)));
726 # endif
727 CONSTI32(kBitsPerWord, 64);
728 CONSTI32(kBitsPerWordLog2, 6);
729
730 typedef uint32_t Halfword;
731 typedef uint16_t Quarterword;
732
733 # ifdef USE_AVX2
734
735 // _mm256_set1_... seems to have the same performance; could use that instead.
736 # define VCONST_W(xx) {xx, xx, xx, xx}
737 # define VCONST_S(xx) {xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx}
738 # define VCONST_C(xx) {xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx}
739 # define VCONST_UC VCONST_C
740
741 // vv = VCONST_W(k0LU) doesn't work (only ok for initialization)
// Zeroed-vector constructors, one per lane interpretation; all are thin
// wrappers around _mm256_setzero_si256().
HEADER_INLINE VecW vecw_setzero() {
  return R_CAST(VecW, _mm256_setzero_si256());
}

HEADER_INLINE VecU32 vecu32_setzero() {
  return R_CAST(VecU32, _mm256_setzero_si256());
}

HEADER_INLINE VecU16 vecu16_setzero() {
  return R_CAST(VecU16, _mm256_setzero_si256());
}

HEADER_INLINE VecI16 veci16_setzero() {
  return R_CAST(VecI16, _mm256_setzero_si256());
}

HEADER_INLINE VecUc vecuc_setzero() {
  return R_CAST(VecUc, _mm256_setzero_si256());
}

HEADER_INLINE VecI8 veci8_setzero() {
  return R_CAST(VecI8, _mm256_setzero_si256());
}
765
766 // "vv >> ct" doesn't work, and Scientific Linux gcc 4.4 might not optimize
767 // VCONST_W shift properly (todo: test this)
// Per-lane logical shifts by ct bits: vecw_* operates on 64-bit lanes,
// vecu32_* on 32-bit lanes, vecu16_* on 16-bit lanes.  srli = shift right,
// slli = shift left, both zero-filling.
HEADER_INLINE VecW vecw_srli(VecW vv, uint32_t ct) {
  return R_CAST(VecW, _mm256_srli_epi64(R_CAST(__m256i, vv), ct));
}

HEADER_INLINE VecW vecw_slli(VecW vv, uint32_t ct) {
  return R_CAST(VecW, _mm256_slli_epi64(R_CAST(__m256i, vv), ct));
}

HEADER_INLINE VecU32 vecu32_srli(VecU32 vv, uint32_t ct) {
  return R_CAST(VecU32, _mm256_srli_epi32(R_CAST(__m256i, vv), ct));
}

HEADER_INLINE VecU32 vecu32_slli(VecU32 vv, uint32_t ct) {
  return R_CAST(VecU32, _mm256_slli_epi32(R_CAST(__m256i, vv), ct));
}

HEADER_INLINE VecU16 vecu16_srli(VecU16 vv, uint32_t ct) {
  return R_CAST(VecU16, _mm256_srli_epi16(R_CAST(__m256i, vv), ct));
}

HEADER_INLINE VecU16 vecu16_slli(VecU16 vv, uint32_t ct) {
  return R_CAST(VecU16, _mm256_slli_epi16(R_CAST(__m256i, vv), ct));
}
791
792 // Compiler still doesn't seem to be smart enough to use andnot properly.
// and_notfirst(excl, main) returns (~excl) & main in a single andnot
// operation; the 'notfirst' suffix emphasizes which operand is complemented.
HEADER_INLINE VecW vecw_and_notfirst(VecW excl, VecW main) {
  return R_CAST(VecW, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
}

HEADER_INLINE VecU32 vecu32_and_notfirst(VecU32 excl, VecU32 main) {
  return R_CAST(VecU32, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
}

HEADER_INLINE VecI32 veci32_and_notfirst(VecI32 excl, VecI32 main) {
  return R_CAST(VecI32, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
}

HEADER_INLINE VecU16 vecu16_and_notfirst(VecU16 excl, VecU16 main) {
  return R_CAST(VecU16, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
}

HEADER_INLINE VecI16 veci16_and_notfirst(VecI16 excl, VecI16 main) {
  return R_CAST(VecI16, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
}

HEADER_INLINE VecI8 veci8_and_notfirst(VecI8 excl, VecI8 main) {
  return R_CAST(VecI8, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
}
816
vecw_set1(uintptr_t ulii)817 HEADER_INLINE VecW vecw_set1(uintptr_t ulii) {
818 return R_CAST(VecW, _mm256_set1_epi64x(ulii));
819 }
820
vecu32_set1(uint32_t uii)821 HEADER_INLINE VecU32 vecu32_set1(uint32_t uii) {
822 return R_CAST(VecU32, _mm256_set1_epi32(uii));
823 }
824
veci32_set1(int32_t ii)825 HEADER_INLINE VecI32 veci32_set1(int32_t ii) {
826 return R_CAST(VecI32, _mm256_set1_epi32(ii));
827 }
828
vecu16_set1(unsigned short usi)829 HEADER_INLINE VecU16 vecu16_set1(unsigned short usi) {
830 return R_CAST(VecU16, _mm256_set1_epi16(usi));
831 }
832
veci16_set1(short si)833 HEADER_INLINE VecI16 veci16_set1(short si) {
834 return R_CAST(VecI16, _mm256_set1_epi16(si));
835 }
836
vecuc_set1(unsigned char ucc)837 HEADER_INLINE VecUc vecuc_set1(unsigned char ucc) {
838 return R_CAST(VecUc, _mm256_set1_epi8(ucc));
839 }
840
veci8_set1(char cc)841 HEADER_INLINE VecI8 veci8_set1(char cc) {
842 return R_CAST(VecI8, _mm256_set1_epi8(cc));
843 }
844
vecw_movemask(VecW vv)845 HEADER_INLINE uint32_t vecw_movemask(VecW vv) {
846 return _mm256_movemask_epi8(R_CAST(__m256i, vv));
847 }
848
vecu32_movemask(VecU32 vv)849 HEADER_INLINE uint32_t vecu32_movemask(VecU32 vv) {
850 return _mm256_movemask_epi8(R_CAST(__m256i, vv));
851 }
852
veci32_movemask(VecI32 vv)853 HEADER_INLINE uint32_t veci32_movemask(VecI32 vv) {
854 return _mm256_movemask_epi8(R_CAST(__m256i, vv));
855 }
856
vecu16_movemask(VecU16 vv)857 HEADER_INLINE uint32_t vecu16_movemask(VecU16 vv) {
858 return _mm256_movemask_epi8(R_CAST(__m256i, vv));
859 }
860
veci16_movemask(VecI16 vv)861 HEADER_INLINE uint32_t veci16_movemask(VecI16 vv) {
862 return _mm256_movemask_epi8(R_CAST(__m256i, vv));
863 }
864
veci8_movemask(VecI8 vv)865 HEADER_INLINE uint32_t veci8_movemask(VecI8 vv) {
866 return _mm256_movemask_epi8(R_CAST(__m256i, vv));
867 }
868
vecuc_movemask(VecUc vv)869 HEADER_INLINE uint32_t vecuc_movemask(VecUc vv) {
870 return _mm256_movemask_epi8(R_CAST(__m256i, vv));
871 }
872
873 // Repeats elements in second lane in AVX2 case.
vecw_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)874 HEADER_INLINE VecW vecw_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
875 return R_CAST(VecW, _mm256_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
876 }
877
vecu16_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)878 HEADER_INLINE VecU16 vecu16_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
879 return R_CAST(VecU16, _mm256_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
880 }
881
vecuc_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)882 HEADER_INLINE VecUc vecuc_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
883 return R_CAST(VecUc, _mm256_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
884 }
885
886 // Discards last 16 arguments in SSE2/SSE4.2 case.
vecw_setr8x(char e31,char e30,char e29,char e28,char e27,char e26,char e25,char e24,char e23,char e22,char e21,char e20,char e19,char e18,char e17,char e16,char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)887 HEADER_INLINE VecW vecw_setr8x(char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
888 return R_CAST(VecW, _mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
889 }
890
vecw_unpacklo8(VecW evens,VecW odds)891 HEADER_INLINE VecW vecw_unpacklo8(VecW evens, VecW odds) {
892 return R_CAST(VecW, _mm256_unpacklo_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
893 }
894
vecw_unpackhi8(VecW evens,VecW odds)895 HEADER_INLINE VecW vecw_unpackhi8(VecW evens, VecW odds) {
896 return R_CAST(VecW, _mm256_unpackhi_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
897 }
898
veci8_unpacklo8(VecI8 evens,VecI8 odds)899 HEADER_INLINE VecI8 veci8_unpacklo8(VecI8 evens, VecI8 odds) {
900 return R_CAST(VecI8, _mm256_unpacklo_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
901 }
902
veci8_unpackhi8(VecI8 evens,VecI8 odds)903 HEADER_INLINE VecI8 veci8_unpackhi8(VecI8 evens, VecI8 odds) {
904 return R_CAST(VecI8, _mm256_unpackhi_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
905 }
906
vecuc_unpacklo8(VecUc evens,VecUc odds)907 HEADER_INLINE VecUc vecuc_unpacklo8(VecUc evens, VecUc odds) {
908 return R_CAST(VecUc, _mm256_unpacklo_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
909 }
910
vecuc_unpackhi8(VecUc evens,VecUc odds)911 HEADER_INLINE VecUc vecuc_unpackhi8(VecUc evens, VecUc odds) {
912 return R_CAST(VecUc, _mm256_unpackhi_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
913 }
914
vecw_unpacklo16(VecW evens,VecW odds)915 HEADER_INLINE VecW vecw_unpacklo16(VecW evens, VecW odds) {
916 return R_CAST(VecW, _mm256_unpacklo_epi16(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
917 }
918
vecw_unpackhi16(VecW evens,VecW odds)919 HEADER_INLINE VecW vecw_unpackhi16(VecW evens, VecW odds) {
920 return R_CAST(VecW, _mm256_unpackhi_epi16(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
921 }
922
vecw_unpacklo32(VecW evens,VecW odds)923 HEADER_INLINE VecW vecw_unpacklo32(VecW evens, VecW odds) {
924 return R_CAST(VecW, _mm256_unpacklo_epi32(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
925 }
926
vecw_unpackhi32(VecW evens,VecW odds)927 HEADER_INLINE VecW vecw_unpackhi32(VecW evens, VecW odds) {
928 return R_CAST(VecW, _mm256_unpackhi_epi32(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
929 }
930
vecw_permute0xd8_if_avx2(VecW vv)931 HEADER_INLINE VecW vecw_permute0xd8_if_avx2(VecW vv) {
932 return R_CAST(VecW, _mm256_permute4x64_epi64(R_CAST(__m256i, vv), 0xd8));
933 }
934
veci8_permute0xd8_if_avx2(VecI8 vv)935 HEADER_INLINE VecI8 veci8_permute0xd8_if_avx2(VecI8 vv) {
936 return R_CAST(VecI8, _mm256_permute4x64_epi64(R_CAST(__m256i, vv), 0xd8));
937 }
938
vecuc_permute0xd8_if_avx2(VecUc vv)939 HEADER_INLINE VecUc vecuc_permute0xd8_if_avx2(VecUc vv) {
940 return R_CAST(VecUc, _mm256_permute4x64_epi64(R_CAST(__m256i, vv), 0xd8));
941 }
942
vecw_shuffle8(VecW table,VecW indexes)943 HEADER_INLINE VecW vecw_shuffle8(VecW table, VecW indexes) {
944 return R_CAST(VecW, _mm256_shuffle_epi8(R_CAST(__m256i, table), R_CAST(__m256i, indexes)));
945 }
946
vecu16_shuffle8(VecU16 table,VecU16 indexes)947 HEADER_INLINE VecU16 vecu16_shuffle8(VecU16 table, VecU16 indexes) {
948 return R_CAST(VecU16, _mm256_shuffle_epi8(R_CAST(__m256i, table), R_CAST(__m256i, indexes)));
949 }
950
vecuc_shuffle8(VecUc table,VecUc indexes)951 HEADER_INLINE VecUc vecuc_shuffle8(VecUc table, VecUc indexes) {
952 return R_CAST(VecUc, _mm256_shuffle_epi8(R_CAST(__m256i, table), R_CAST(__m256i, indexes)));
953 }
954
vecw_extract64_0(VecW vv)955 HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) {
956 return _mm256_extract_epi64(R_CAST(__m256i, vv), 0);
957 }
958
vecw_extract64_1(VecW vv)959 HEADER_INLINE uintptr_t vecw_extract64_1(VecW vv) {
960 return _mm256_extract_epi64(R_CAST(__m256i, vv), 1);
961 }
962
963 // *** AVX2-only section ***
vecw_extract64_2(VecW vv)964 HEADER_INLINE uintptr_t vecw_extract64_2(VecW vv) {
965 return _mm256_extract_epi64(R_CAST(__m256i, vv), 2);
966 }
967
vecw_extract64_3(VecW vv)968 HEADER_INLINE uintptr_t vecw_extract64_3(VecW vv) {
969 return _mm256_extract_epi64(R_CAST(__m256i, vv), 3);
970 }
971
// todo: permute

// *** end AVX2-only section ***

# define kVec8thUintMax UINT32_MAX

// Unsigned integer types spanning 1/16, 1/8, and 1/4 of a 32-byte vector.
typedef uint16_t Vec16thUint;
typedef uint32_t Vec8thUint;
typedef uint64_t Vec4thUint;

vecw_load(const void * mem_addr)982 HEADER_INLINE VecW vecw_load(const void* mem_addr) {
983 return R_CAST(VecW, _mm256_load_si256(S_CAST(const __m256i*, mem_addr)));
984 }
985
986 // There may be some value in adding a 4-consecutive-vector load function when
987 // addresses are expected to be unaligned: see
988 // https://www.agner.org/optimize/blog/read.php?i=627&v=t
989
vecw_loadu(const void * mem_addr)990 HEADER_INLINE VecW vecw_loadu(const void* mem_addr) {
991 return R_CAST(VecW, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
992 }
993
vecu32_loadu(const void * mem_addr)994 HEADER_INLINE VecU32 vecu32_loadu(const void* mem_addr) {
995 return R_CAST(VecU32, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
996 }
997
veci32_loadu(const void * mem_addr)998 HEADER_INLINE VecI32 veci32_loadu(const void* mem_addr) {
999 return R_CAST(VecI32, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1000 }
1001
vecu16_loadu(const void * mem_addr)1002 HEADER_INLINE VecU16 vecu16_loadu(const void* mem_addr) {
1003 return R_CAST(VecU16, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1004 }
1005
veci16_loadu(const void * mem_addr)1006 HEADER_INLINE VecI16 veci16_loadu(const void* mem_addr) {
1007 return R_CAST(VecI16, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1008 }
1009
vecuc_loadu(const void * mem_addr)1010 HEADER_INLINE VecUc vecuc_loadu(const void* mem_addr) {
1011 return R_CAST(VecUc, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1012 }
1013
veci8_loadu(const void * mem_addr)1014 HEADER_INLINE VecI8 veci8_loadu(const void* mem_addr) {
1015 return R_CAST(VecI8, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1016 }
1017
vecw_storeu(void * mem_addr,VecW vv)1018 HEADER_INLINE void vecw_storeu(void* mem_addr, VecW vv) {
1019 _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1020 }
1021
vecu32_storeu(void * mem_addr,VecU32 vv)1022 HEADER_INLINE void vecu32_storeu(void* mem_addr, VecU32 vv) {
1023 _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1024 }
1025
veci32_storeu(void * mem_addr,VecI32 vv)1026 HEADER_INLINE void veci32_storeu(void* mem_addr, VecI32 vv) {
1027 _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1028 }
1029
vecu16_storeu(void * mem_addr,VecU16 vv)1030 HEADER_INLINE void vecu16_storeu(void* mem_addr, VecU16 vv) {
1031 _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1032 }
1033
veci16_storeu(void * mem_addr,VecI16 vv)1034 HEADER_INLINE void veci16_storeu(void* mem_addr, VecI16 vv) {
1035 _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1036 }
1037
vecuc_storeu(void * mem_addr,VecUc vv)1038 HEADER_INLINE void vecuc_storeu(void* mem_addr, VecUc vv) {
1039 _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1040 }
1041
veci32_max(VecI32 v1,VecI32 v2)1042 HEADER_INLINE VecI32 veci32_max(VecI32 v1, VecI32 v2) {
1043 return R_CAST(VecI32, _mm256_max_epi32(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1044 }
1045
veci16_max(VecI16 v1,VecI16 v2)1046 HEADER_INLINE VecI16 veci16_max(VecI16 v1, VecI16 v2) {
1047 return R_CAST(VecI16, _mm256_max_epi16(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1048 }
1049
vecw_sad(VecW v1,VecW v2)1050 HEADER_INLINE VecW vecw_sad(VecW v1, VecW v2) {
1051 return R_CAST(VecW, _mm256_sad_epu8(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1052 }
1053
vecuc_adds(VecUc v1,VecUc v2)1054 HEADER_INLINE VecUc vecuc_adds(VecUc v1, VecUc v2) {
1055 return R_CAST(VecUc, _mm256_adds_epu8(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1056 }
1057
vecu16_min8(VecU16 v1,VecU16 v2)1058 HEADER_INLINE VecU16 vecu16_min8(VecU16 v1, VecU16 v2) {
1059 return R_CAST(VecU16, _mm256_min_epu8(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1060 }
1061
vecuc_min(VecUc v1,VecUc v2)1062 HEADER_INLINE VecUc vecuc_min(VecUc v1, VecUc v2) {
1063 return R_CAST(VecUc, _mm256_min_epu8(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1064 }
1065
vecw_blendv(VecW aa,VecW bb,VecW mask)1066 HEADER_INLINE VecW vecw_blendv(VecW aa, VecW bb, VecW mask) {
1067 return R_CAST(VecW, _mm256_blendv_epi8(R_CAST(__m256i, aa), R_CAST(__m256i, bb), R_CAST(__m256i, mask)));
1068 }
1069
vecu32_blendv(VecU32 aa,VecU32 bb,VecU32 mask)1070 HEADER_INLINE VecU32 vecu32_blendv(VecU32 aa, VecU32 bb, VecU32 mask) {
1071 return R_CAST(VecU32, _mm256_blendv_epi8(R_CAST(__m256i, aa), R_CAST(__m256i, bb), R_CAST(__m256i, mask)));
1072 }
1073
vecu16_blendv(VecU16 aa,VecU16 bb,VecU16 mask)1074 HEADER_INLINE VecU16 vecu16_blendv(VecU16 aa, VecU16 bb, VecU16 mask) {
1075 return R_CAST(VecU16, _mm256_blendv_epi8(R_CAST(__m256i, aa), R_CAST(__m256i, bb), R_CAST(__m256i, mask)));
1076 }
1077
1078 # else // !USE_AVX2
1079
// Brace-initializer forms for 16-byte vector constants.
# define VCONST_W(xx) {xx, xx}
# define VCONST_S(xx) {xx, xx, xx, xx, xx, xx, xx, xx}
# define VCONST_C(xx) {xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx}
# define VCONST_UC VCONST_C

vecw_setzero()1085 HEADER_INLINE VecW vecw_setzero() {
1086 return R_CAST(VecW, _mm_setzero_si128());
1087 }
1088
vecu32_setzero()1089 HEADER_INLINE VecU32 vecu32_setzero() {
1090 return R_CAST(VecU32, _mm_setzero_si128());
1091 }
1092
vecu16_setzero()1093 HEADER_INLINE VecU16 vecu16_setzero() {
1094 return R_CAST(VecU16, _mm_setzero_si128());
1095 }
1096
veci16_setzero()1097 HEADER_INLINE VecI16 veci16_setzero() {
1098 return R_CAST(VecI16, _mm_setzero_si128());
1099 }
1100
vecuc_setzero()1101 HEADER_INLINE VecUc vecuc_setzero() {
1102 return R_CAST(VecUc, _mm_setzero_si128());
1103 }
1104
veci8_setzero()1105 HEADER_INLINE VecI8 veci8_setzero() {
1106 return R_CAST(VecI8, _mm_setzero_si128());
1107 }
1108
vecw_srli(VecW vv,uint32_t ct)1109 HEADER_INLINE VecW vecw_srli(VecW vv, uint32_t ct) {
1110 return R_CAST(VecW, _mm_srli_epi64(R_CAST(__m128i, vv), ct));
1111 }
1112
vecw_slli(VecW vv,uint32_t ct)1113 HEADER_INLINE VecW vecw_slli(VecW vv, uint32_t ct) {
1114 return R_CAST(VecW, _mm_slli_epi64(R_CAST(__m128i, vv), ct));
1115 }
1116
vecu32_srli(VecU32 vv,uint32_t ct)1117 HEADER_INLINE VecU32 vecu32_srli(VecU32 vv, uint32_t ct) {
1118 return R_CAST(VecU32, _mm_srli_epi32(R_CAST(__m128i, vv), ct));
1119 }
1120
vecu32_slli(VecU32 vv,uint32_t ct)1121 HEADER_INLINE VecU32 vecu32_slli(VecU32 vv, uint32_t ct) {
1122 return R_CAST(VecU32, _mm_slli_epi32(R_CAST(__m128i, vv), ct));
1123 }
1124
vecu16_srli(VecU16 vv,uint32_t ct)1125 HEADER_INLINE VecU16 vecu16_srli(VecU16 vv, uint32_t ct) {
1126 return R_CAST(VecU16, _mm_srli_epi16(R_CAST(__m128i, vv), ct));
1127 }
1128
vecu16_slli(VecU16 vv,uint32_t ct)1129 HEADER_INLINE VecU16 vecu16_slli(VecU16 vv, uint32_t ct) {
1130 return R_CAST(VecU16, _mm_slli_epi16(R_CAST(__m128i, vv), ct));
1131 }
1132
vecw_and_notfirst(VecW excl,VecW main)1133 HEADER_INLINE VecW vecw_and_notfirst(VecW excl, VecW main) {
1134 return R_CAST(VecW, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1135 }
1136
vecu32_and_notfirst(VecU32 excl,VecU32 main)1137 HEADER_INLINE VecU32 vecu32_and_notfirst(VecU32 excl, VecU32 main) {
1138 return R_CAST(VecU32, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1139 }
1140
veci32_and_notfirst(VecI32 excl,VecI32 main)1141 HEADER_INLINE VecI32 veci32_and_notfirst(VecI32 excl, VecI32 main) {
1142 return R_CAST(VecI32, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1143 }
1144
vecu16_and_notfirst(VecU16 excl,VecU16 main)1145 HEADER_INLINE VecU16 vecu16_and_notfirst(VecU16 excl, VecU16 main) {
1146 return R_CAST(VecU16, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1147 }
1148
veci16_and_notfirst(VecI16 excl,VecI16 main)1149 HEADER_INLINE VecI16 veci16_and_notfirst(VecI16 excl, VecI16 main) {
1150 return R_CAST(VecI16, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1151 }
1152
veci8_and_notfirst(VecI8 excl,VecI8 main)1153 HEADER_INLINE VecI8 veci8_and_notfirst(VecI8 excl, VecI8 main) {
1154 return R_CAST(VecI8, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1155 }
1156
vecw_set1(uintptr_t ulii)1157 HEADER_INLINE VecW vecw_set1(uintptr_t ulii) {
1158 return R_CAST(VecW, _mm_set1_epi64x(ulii));
1159 }
1160
vecu32_set1(uint32_t uii)1161 HEADER_INLINE VecU32 vecu32_set1(uint32_t uii) {
1162 return R_CAST(VecU32, _mm_set1_epi32(uii));
1163 }
1164
veci32_set1(int32_t ii)1165 HEADER_INLINE VecI32 veci32_set1(int32_t ii) {
1166 return R_CAST(VecI32, _mm_set1_epi32(ii));
1167 }
1168
vecu16_set1(unsigned short usi)1169 HEADER_INLINE VecU16 vecu16_set1(unsigned short usi) {
1170 return R_CAST(VecU16, _mm_set1_epi16(usi));
1171 }
1172
veci16_set1(short si)1173 HEADER_INLINE VecI16 veci16_set1(short si) {
1174 return R_CAST(VecI16, _mm_set1_epi16(si));
1175 }
1176
vecuc_set1(unsigned char ucc)1177 HEADER_INLINE VecUc vecuc_set1(unsigned char ucc) {
1178 return R_CAST(VecUc, _mm_set1_epi8(ucc));
1179 }
1180
veci8_set1(char cc)1181 HEADER_INLINE VecI8 veci8_set1(char cc) {
1182 return R_CAST(VecI8, _mm_set1_epi8(cc));
1183 }
1184
vecw_movemask(VecW vv)1185 HEADER_INLINE uint32_t vecw_movemask(VecW vv) {
1186 return _mm_movemask_epi8(R_CAST(__m128i, vv));
1187 }
1188
vecu32_movemask(VecU32 vv)1189 HEADER_INLINE uint32_t vecu32_movemask(VecU32 vv) {
1190 return _mm_movemask_epi8(R_CAST(__m128i, vv));
1191 }
1192
veci32_movemask(VecI32 vv)1193 HEADER_INLINE uint32_t veci32_movemask(VecI32 vv) {
1194 return _mm_movemask_epi8(R_CAST(__m128i, vv));
1195 }
1196
vecu16_movemask(VecU16 vv)1197 HEADER_INLINE uint32_t vecu16_movemask(VecU16 vv) {
1198 return _mm_movemask_epi8(R_CAST(__m128i, vv));
1199 }
1200
veci16_movemask(VecI16 vv)1201 HEADER_INLINE uint32_t veci16_movemask(VecI16 vv) {
1202 return _mm_movemask_epi8(R_CAST(__m128i, vv));
1203 }
1204
veci8_movemask(VecI8 vv)1205 HEADER_INLINE uint32_t veci8_movemask(VecI8 vv) {
1206 return _mm_movemask_epi8(R_CAST(__m128i, vv));
1207 }
1208
vecuc_movemask(VecUc vv)1209 HEADER_INLINE uint32_t vecuc_movemask(VecUc vv) {
1210 return _mm_movemask_epi8(R_CAST(__m128i, vv));
1211 }
1212
1213 CONSTI32(kVec8thUintMax, 65535);
1214
1215 // # define kVec8thUintMax 65535
1216
1217 typedef unsigned char Vec16thUint;
1218 typedef uint16_t Vec8thUint;
1219 typedef uint32_t Vec4thUint;
1220
vecw_load(const void * mem_addr)1221 HEADER_INLINE VecW vecw_load(const void* mem_addr) {
1222 return R_CAST(VecW, _mm_load_si128(S_CAST(const __m128i*, mem_addr)));
1223 }
1224
vecw_loadu(const void * mem_addr)1225 HEADER_INLINE VecW vecw_loadu(const void* mem_addr) {
1226 return R_CAST(VecW, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1227 }
1228
vecu32_loadu(const void * mem_addr)1229 HEADER_INLINE VecU32 vecu32_loadu(const void* mem_addr) {
1230 return R_CAST(VecU32, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1231 }
1232
veci32_loadu(const void * mem_addr)1233 HEADER_INLINE VecI32 veci32_loadu(const void* mem_addr) {
1234 return R_CAST(VecI32, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1235 }
1236
vecu16_loadu(const void * mem_addr)1237 HEADER_INLINE VecU16 vecu16_loadu(const void* mem_addr) {
1238 return R_CAST(VecU16, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1239 }
1240
veci16_loadu(const void * mem_addr)1241 HEADER_INLINE VecI16 veci16_loadu(const void* mem_addr) {
1242 return R_CAST(VecI16, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1243 }
1244
vecuc_loadu(const void * mem_addr)1245 HEADER_INLINE VecUc vecuc_loadu(const void* mem_addr) {
1246 return R_CAST(VecUc, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1247 }
1248
veci8_loadu(const void * mem_addr)1249 HEADER_INLINE VecI8 veci8_loadu(const void* mem_addr) {
1250 return R_CAST(VecI8, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1251 }
1252
vecw_storeu(void * mem_addr,VecW vv)1253 HEADER_INLINE void vecw_storeu(void* mem_addr, VecW vv) {
1254 _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1255 }
1256
vecu32_storeu(void * mem_addr,VecU32 vv)1257 HEADER_INLINE void vecu32_storeu(void* mem_addr, VecU32 vv) {
1258 _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1259 }
1260
veci32_storeu(void * mem_addr,VecI32 vv)1261 HEADER_INLINE void veci32_storeu(void* mem_addr, VecI32 vv) {
1262 _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1263 }
1264
vecu16_storeu(void * mem_addr,VecU16 vv)1265 HEADER_INLINE void vecu16_storeu(void* mem_addr, VecU16 vv) {
1266 _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1267 }
1268
veci16_storeu(void * mem_addr,VecI16 vv)1269 HEADER_INLINE void veci16_storeu(void* mem_addr, VecI16 vv) {
1270 _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1271 }
1272
vecuc_storeu(void * mem_addr,VecUc vv)1273 HEADER_INLINE void vecuc_storeu(void* mem_addr, VecUc vv) {
1274 _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1275 }
1276
1277 // Repeats arguments in AVX2 case.
vecw_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)1278 HEADER_INLINE VecW vecw_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
1279 return R_CAST(VecW, _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
1280 }
1281
vecu16_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)1282 HEADER_INLINE VecU16 vecu16_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
1283 return R_CAST(VecU16, _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
1284 }
1285
vecuc_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)1286 HEADER_INLINE VecUc vecuc_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
1287 return R_CAST(VecUc, _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
1288 }
1289
1290 // Discards last 16 arguments in SSE2/SSE4.2 case.
vecw_setr8x(char e31,char e30,char e29,char e28,char e27,char e26,char e25,char e24,char e23,char e22,char e21,char e20,char e19,char e18,char e17,char e16,__maybe_unused char e15,__maybe_unused char e14,__maybe_unused char e13,__maybe_unused char e12,__maybe_unused char e11,__maybe_unused char e10,__maybe_unused char e9,__maybe_unused char e8,__maybe_unused char e7,__maybe_unused char e6,__maybe_unused char e5,__maybe_unused char e4,__maybe_unused char e3,__maybe_unused char e2,__maybe_unused char e1,__maybe_unused char e0)1291 HEADER_INLINE VecW vecw_setr8x(
1292 char e31, char e30, char e29, char e28,
1293 char e27, char e26, char e25, char e24,
1294 char e23, char e22, char e21, char e20,
1295 char e19, char e18, char e17, char e16,
1296 __maybe_unused char e15, __maybe_unused char e14,
1297 __maybe_unused char e13, __maybe_unused char e12,
1298 __maybe_unused char e11, __maybe_unused char e10,
1299 __maybe_unused char e9, __maybe_unused char e8,
1300 __maybe_unused char e7, __maybe_unused char e6,
1301 __maybe_unused char e5, __maybe_unused char e4,
1302 __maybe_unused char e3, __maybe_unused char e2,
1303 __maybe_unused char e1, __maybe_unused char e0) {
1304 return R_CAST(VecW, _mm_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16));
1305 }
1306
vecw_unpacklo8(VecW evens,VecW odds)1307 HEADER_INLINE VecW vecw_unpacklo8(VecW evens, VecW odds) {
1308 return R_CAST(VecW, _mm_unpacklo_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1309 }
1310
vecw_unpackhi8(VecW evens,VecW odds)1311 HEADER_INLINE VecW vecw_unpackhi8(VecW evens, VecW odds) {
1312 return R_CAST(VecW, _mm_unpackhi_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1313 }
1314
veci8_unpacklo8(VecI8 evens,VecI8 odds)1315 HEADER_INLINE VecI8 veci8_unpacklo8(VecI8 evens, VecI8 odds) {
1316 return R_CAST(VecI8, _mm_unpacklo_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1317 }
1318
veci8_unpackhi8(VecI8 evens,VecI8 odds)1319 HEADER_INLINE VecI8 veci8_unpackhi8(VecI8 evens, VecI8 odds) {
1320 return R_CAST(VecI8, _mm_unpackhi_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1321 }
1322
vecuc_unpacklo8(VecUc evens,VecUc odds)1323 HEADER_INLINE VecUc vecuc_unpacklo8(VecUc evens, VecUc odds) {
1324 return R_CAST(VecUc, _mm_unpacklo_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1325 }
1326
vecuc_unpackhi8(VecUc evens,VecUc odds)1327 HEADER_INLINE VecUc vecuc_unpackhi8(VecUc evens, VecUc odds) {
1328 return R_CAST(VecUc, _mm_unpackhi_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1329 }
1330
vecw_unpacklo16(VecW evens,VecW odds)1331 HEADER_INLINE VecW vecw_unpacklo16(VecW evens, VecW odds) {
1332 return R_CAST(VecW, _mm_unpacklo_epi16(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1333 }
1334
vecw_unpackhi16(VecW evens,VecW odds)1335 HEADER_INLINE VecW vecw_unpackhi16(VecW evens, VecW odds) {
1336 return R_CAST(VecW, _mm_unpackhi_epi16(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1337 }
1338
vecw_unpacklo32(VecW evens,VecW odds)1339 HEADER_INLINE VecW vecw_unpacklo32(VecW evens, VecW odds) {
1340 return R_CAST(VecW, _mm_unpacklo_epi32(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1341 }
1342
vecw_unpackhi32(VecW evens,VecW odds)1343 HEADER_INLINE VecW vecw_unpackhi32(VecW evens, VecW odds) {
1344 return R_CAST(VecW, _mm_unpackhi_epi32(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1345 }
1346
vecw_unpacklo64(VecW evens,VecW odds)1347 HEADER_INLINE VecW vecw_unpacklo64(VecW evens, VecW odds) {
1348 return R_CAST(VecW, _mm_unpacklo_epi64(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1349 }
1350
vecw_unpackhi64(VecW evens,VecW odds)1351 HEADER_INLINE VecW vecw_unpackhi64(VecW evens, VecW odds) {
1352 return R_CAST(VecW, _mm_unpackhi_epi64(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1353 }
1354
vecw_permute0xd8_if_avx2(VecW vv)1355 HEADER_INLINE VecW vecw_permute0xd8_if_avx2(VecW vv) {
1356 return vv;
1357 }
1358
veci8_permute0xd8_if_avx2(VecI8 vv)1359 HEADER_INLINE VecI8 veci8_permute0xd8_if_avx2(VecI8 vv) {
1360 return vv;
1361 }
1362
vecuc_permute0xd8_if_avx2(VecUc vv)1363 HEADER_INLINE VecUc vecuc_permute0xd8_if_avx2(VecUc vv) {
1364 return vv;
1365 }
1366
1367 # ifdef USE_SSE42
veci32_max(VecI32 v1,VecI32 v2)1368 HEADER_INLINE VecI32 veci32_max(VecI32 v1, VecI32 v2) {
1369 return R_CAST(VecI32, _mm_max_epi32(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1370 }
1371
vecw_shuffle8(VecW table,VecW indexes)1372 HEADER_INLINE VecW vecw_shuffle8(VecW table, VecW indexes) {
1373 return R_CAST(VecW, _mm_shuffle_epi8(R_CAST(__m128i, table), R_CAST(__m128i, indexes)));
1374 }
1375
vecu16_shuffle8(VecU16 table,VecU16 indexes)1376 HEADER_INLINE VecU16 vecu16_shuffle8(VecU16 table, VecU16 indexes) {
1377 return R_CAST(VecU16, _mm_shuffle_epi8(R_CAST(__m128i, table), R_CAST(__m128i, indexes)));
1378 }
1379
vecuc_shuffle8(VecUc table,VecUc indexes)1380 HEADER_INLINE VecUc vecuc_shuffle8(VecUc table, VecUc indexes) {
1381 return R_CAST(VecUc, _mm_shuffle_epi8(R_CAST(__m128i, table), R_CAST(__m128i, indexes)));
1382 }
1383
vecw_extract64_0(VecW vv)1384 HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) {
1385 return _mm_extract_epi64(R_CAST(__m128i, vv), 0);
1386 }
1387
vecw_extract64_1(VecW vv)1388 HEADER_INLINE uintptr_t vecw_extract64_1(VecW vv) {
1389 return _mm_extract_epi64(R_CAST(__m128i, vv), 1);
1390 }
1391
vecw_blendv(VecW aa,VecW bb,VecW mask)1392 HEADER_INLINE VecW vecw_blendv(VecW aa, VecW bb, VecW mask) {
1393 return R_CAST(VecW, _mm_blendv_epi8(R_CAST(__m128i, aa), R_CAST(__m128i, bb), R_CAST(__m128i, mask)));
1394 }
1395
vecu32_blendv(VecU32 aa,VecU32 bb,VecU32 mask)1396 HEADER_INLINE VecU32 vecu32_blendv(VecU32 aa, VecU32 bb, VecU32 mask) {
1397 return R_CAST(VecU32, _mm_blendv_epi8(R_CAST(__m128i, aa), R_CAST(__m128i, bb), R_CAST(__m128i, mask)));
1398 }
1399
vecu16_blendv(VecU16 aa,VecU16 bb,VecU16 mask)1400 HEADER_INLINE VecU16 vecu16_blendv(VecU16 aa, VecU16 bb, VecU16 mask) {
1401 return R_CAST(VecU16, _mm_blendv_epi8(R_CAST(__m128i, aa), R_CAST(__m128i, bb), R_CAST(__m128i, mask)));
1402 }
1403 # else
vecw_extract64_0(VecW vv)1404 HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) {
1405 return R_CAST(uintptr_t, _mm_movepi64_pi64(R_CAST(__m128i, vv)));
1406 }
1407
vecw_extract64_1(VecW vv)1408 HEADER_INLINE uintptr_t vecw_extract64_1(VecW vv) {
1409 const __m128i v0 = _mm_srli_si128(R_CAST(__m128i, vv), 8);
1410 return R_CAST(uintptr_t, _mm_movepi64_pi64(v0));
1411 }
1412
1413 // N.B. we do *not* enforce the low bits of each mask byte matching the high
1414 // bit.
vecw_blendv(VecW aa,VecW bb,VecW mask)1415 HEADER_INLINE VecW vecw_blendv(VecW aa, VecW bb, VecW mask) {
1416 return vecw_and_notfirst(mask, aa) | (mask & bb);
1417 }
1418
vecu32_blendv(VecU32 aa,VecU32 bb,VecU32 mask)1419 HEADER_INLINE VecU32 vecu32_blendv(VecU32 aa, VecU32 bb, VecU32 mask) {
1420 return vecu32_and_notfirst(mask, aa) | (mask & bb);
1421 }
1422
vecu16_blendv(VecU16 aa,VecU16 bb,VecU16 mask)1423 HEADER_INLINE VecU16 vecu16_blendv(VecU16 aa, VecU16 bb, VecU16 mask) {
1424 return vecu16_and_notfirst(mask, aa) | (mask & bb);
1425 }
1426 # endif
1427
veci16_max(VecI16 v1,VecI16 v2)1428 HEADER_INLINE VecI16 veci16_max(VecI16 v1, VecI16 v2) {
1429 return R_CAST(VecI16, _mm_max_epi16(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1430 }
1431
vecw_sad(VecW v1,VecW v2)1432 HEADER_INLINE VecW vecw_sad(VecW v1, VecW v2) {
1433 return R_CAST(VecW, _mm_sad_epu8(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1434 }
1435
vecuc_adds(VecUc v1,VecUc v2)1436 HEADER_INLINE VecUc vecuc_adds(VecUc v1, VecUc v2) {
1437 return R_CAST(VecUc, _mm_adds_epu8(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1438 }
1439
vecu16_min8(VecU16 v1,VecU16 v2)1440 HEADER_INLINE VecU16 vecu16_min8(VecU16 v1, VecU16 v2) {
1441 return R_CAST(VecU16, _mm_min_epu8(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1442 }
1443
vecuc_min(VecUc v1,VecUc v2)1444 HEADER_INLINE VecUc vecuc_min(VecUc v1, VecUc v2) {
1445 return R_CAST(VecUc, _mm_min_epu8(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1446 }
1447 # endif // !USE_AVX2
1448
vecw_bytesum(VecW src,VecW m0)1449 HEADER_INLINE VecW vecw_bytesum(VecW src, VecW m0) {
1450 return vecw_sad(src, m0);
1451 }
1452
1453 CONSTI32(kVec8thUintPerWord, sizeof(intptr_t) / sizeof(Vec8thUint));
1454
# ifdef FVEC_32

# ifndef __FMA__
#   error "32-byte-float-vector builds require FMA3 as well."
# endif

// 32-byte (8-float) vectors.
CONSTI32(kBytesPerFVec, 32);
typedef float VecF __attribute__ ((vector_size (32)));

# define VCONST_F(xx) {xx, xx, xx, xx, xx, xx, xx, xx}

HEADER_INLINE VecF vecf_setzero() {
  return R_CAST(VecF, _mm256_setzero_ps());
}

# else  // !FVEC_32

// 16-byte (4-float) vectors.
CONSTI32(kBytesPerFVec, 16);
typedef float VecF __attribute__ ((vector_size (16)));

# define VCONST_F(xx) {xx, xx, xx, xx}

HEADER_INLINE VecF vecf_setzero() {
  return R_CAST(VecF, _mm_setzero_ps());
}

# endif  // !FVEC_32
1482
1483 #else // not __LP64__
CONSTI32(kBytesPerVec, 4);
CONSTI32(kBytesPerFVec, 4);
CONSTI32(kBitsPerWord, 32);
CONSTI32(kBitsPerWordLog2, 5);

typedef uint16_t Halfword;
typedef uint8_t Quarterword;

// In 32-bit builds, "vectors" are just plain machine words; only the
// operations actually used by 32-bit code paths are defined below.
typedef uintptr_t VecW;
typedef uintptr_t VecU32;
typedef float VecF;
// VecI16 and VecI8 aren't worth the trouble of scaling down to 32-bit

# define VCONST_W(xx) (xx)

HEADER_INLINE VecW vecw_setzero() {
  return k0LU;
}

// Logical right-shift.
HEADER_INLINE VecW vecw_srli(VecW vv, uint32_t ct) {
  return vv >> ct;
}

// Left-shift.
HEADER_INLINE VecW vecw_slli(VecW vv, uint32_t ct) {
  return vv << ct;
}

// Broadcast: with 1 word per "vector", this is the identity function.
HEADER_INLINE VecW vecw_set1(uintptr_t ulii) {
  return ulii;
}

HEADER_INLINE VecW vecw_loadu(const void* mem_addr) {
  return *S_CAST(const VecW*, mem_addr);
}

// Sum of the 4 bytes of src.  m0 exists only to match the 64-bit interface.
HEADER_INLINE VecW vecw_bytesum(VecW src, __maybe_unused VecW m0) {
  // Pairwise byte sums into the two 16-bit halves, then fold the halves.
  src = (src & 0x00ff00ff) + ((src >> 8) & 0x00ff00ff);
  return (src + (src >> 16)) & 0xffff;
}

// "andnot": clear the bits of main that are set in excl.
HEADER_INLINE VecW vecw_and_notfirst(VecW excl, VecW main) {
  return (~excl) & main;
}

HEADER_INLINE VecU32 vecu32_and_notfirst(VecU32 excl, VecU32 main) {
  return (~excl) & main;
}
1531 #endif // !__LP64__
1532
1533 // debug
PrintVec(const void * vv)1534 HEADER_INLINE void PrintVec(const void* vv) {
1535 const unsigned char* vv_alias = S_CAST(const unsigned char*, vv);
1536 for (uint32_t uii = 0; uii != kBytesPerVec; ++uii) {
1537 printf("%u ", vv_alias[uii]);
1538 }
1539 printf("\n");
1540 }
1541
// Unfortunately, we need to spell out S_CAST(uintptr_t, 0) instead of just
// typing k0LU in C99.
// (64-bit values shown in the comments; 32-bit builds get the low halves.)
static const uintptr_t kMask5555 = (~S_CAST(uintptr_t, 0)) / 3;  // 0x5555555555555555
static const uintptr_t kMaskAAAA = ((~S_CAST(uintptr_t, 0)) / 3) * 2;  // 0xaaaaaaaaaaaaaaaa
static const uintptr_t kMask3333 = (~S_CAST(uintptr_t, 0)) / 5;  // 0x3333333333333333
static const uintptr_t kMask1111 = (~S_CAST(uintptr_t, 0)) / 15;  // 0x1111111111111111
static const uintptr_t kMask0F0F = (~S_CAST(uintptr_t, 0)) / 17;  // 0x0f0f0f0f0f0f0f0f
static const uintptr_t kMask0101 = (~S_CAST(uintptr_t, 0)) / 255;  // 0x0101010101010101
static const uintptr_t kMask00FF = (~S_CAST(uintptr_t, 0)) / 257;  // 0x00ff00ff00ff00ff
static const uintptr_t kMask0001 = (~S_CAST(uintptr_t, 0)) / 65535;  // 0x0001000100010001
static const uintptr_t kMask0000FFFF = (~S_CAST(uintptr_t, 0)) / 65537;  // 0x0000ffff0000ffff
static const uintptr_t kMask00000001 = (~S_CAST(uintptr_t, 0)) / 4294967295U;  // 0x0000000100000001

static const uintptr_t kMask000000FF = (~S_CAST(uintptr_t, 0)) / 16843009;  // 0x000000ff000000ff
static const uintptr_t kMask000F = (~S_CAST(uintptr_t, 0)) / 4369;  // 0x000f000f000f000f
static const uintptr_t kMask0303 = (~S_CAST(uintptr_t, 0)) / 85;  // 0x0303030303030303
1558
CONSTI32(kBitsPerVec, kBytesPerVec * CHAR_BIT);

// We now use Knuth's Nyp/Nybble vocabulary for 2-bit and 4-bit elements,
// respectively.
CONSTI32(kNypsPerVec, kBytesPerVec * 4);

CONSTI32(kBitsPerWordD2, kBitsPerWord / 2);
CONSTI32(kBitsPerWordD4, kBitsPerWord / 4);

// number of bytes in a word
CONSTI32(kBytesPerWord, kBitsPerWord / CHAR_BIT);

CONSTI32(kInt16PerWord, kBytesPerWord / 2);

// Sanity checks for assumptions baked into the rest of the library.
static_assert(CHAR_BIT == 8, "plink2_base requires CHAR_BIT == 8.");
static_assert(sizeof(int8_t) == 1, "plink2_base requires sizeof(int8_t) == 1.");
static_assert(sizeof(int16_t) == 2, "plink2_base requires sizeof(int16_t) == 2.");
static_assert(sizeof(int32_t) == 4, "plink2_base requires sizeof(int32_t) == 4.");
static_assert(sizeof(int) >= 4, "plink2_base requires sizeof(int) >= 4.");
static_assert(sizeof(intptr_t) == kBytesPerWord, "plink2_base requires sizeof(intptr_t) == kBytesPerWord.");
static_assert(sizeof(int64_t) == 8, "plink2_base requires sizeof(int64_t) == 8.");

CONSTI32(kWordsPerVec, kBytesPerVec / kBytesPerWord);
CONSTI32(kInt32PerVec, kBytesPerVec / 4);
CONSTI32(kInt16PerVec, kBytesPerVec / 2);

CONSTI32(kFloatPerFVec, kBytesPerFVec / 4);

// Assumed cacheline size; not queried at runtime.
CONSTI32(kCacheline, 64);

CONSTI32(kBitsPerCacheline, kCacheline * CHAR_BIT);
CONSTI32(kNypsPerCacheline, kCacheline * 4);
CONSTI32(kInt16PerCacheline, kCacheline / sizeof(int16_t));
CONSTI32(kInt32PerCacheline, kCacheline / sizeof(int32_t));
CONSTI32(kInt64PerCacheline, kCacheline / sizeof(int64_t));
CONSTI32(kWordsPerCacheline, kCacheline / kBytesPerWord);
CONSTI32(kDoublesPerCacheline, kCacheline / sizeof(double));
CONSTI32(kVecsPerCacheline, kCacheline / kBytesPerVec);

// could use ioctl, etc. to dynamically determine this later, and pass it as a
// parameter to e.g. PgfiMultiread()
CONSTI32(kDiskBlockSize, 4096);

CONSTI32(kPglFwriteBlockSize, 131072);

// unsafe to fread or fwrite more bytes than this on e.g. OS X
CONSTI32(kMaxBytesPerIO, 0x7ffff000);

// Maximum size of "dynamically" allocated line load buffer. (This is the
// limit that applies to .vcf and similar files.) Inconvenient to go higher
// since fgets() takes a int32_t size argument.
CONSTI32(kMaxLongLine, 0x7fffffc0);
static_assert(!(kMaxLongLine % kCacheline), "kMaxLongLine must be a multiple of kCacheline.");

#ifdef __APPLE__
// OS X is limited to 256?
CONSTI32(kMaxOpenFiles, 252);
#else
// Can't assume more than 512 are allowed on Windows, with current compilation
// settings.
CONSTI32(kMaxOpenFiles, 504);
#endif

// note that this is NOT foolproof: see e.g.
// http://insanecoding.blogspot.com/2007/11/pathmax-simply-isnt.html . (This
// is why I haven't bothered with OS-based #ifdefs here.) But it should be
// good enough in practice. And PATH_MAX itself is still relevant due to use
// of realpath().
CONSTI32(kPglFnamesize, 4096);
#if defined(PATH_MAX) && !defined(_WIN32)
static_assert(kPglFnamesize >= PATH_MAX, "plink2_base assumes PATH_MAX <= 4096. (Safe to increase kPglFnamesize to address this, up to 131072.)");
#endif
1631
1632
#if __cplusplus >= 201103L
// Main application of std::array in this codebase is enforcing length when
// passing references between functions. Conversely, if the array type has
// different lengths in different functions (e.g. col_skips[]/col_types[]), we
// actively want to avoid &arr[0] clutter.
// When neither applies, it doesn't really matter whether we use this or not;
// I normally won't use it unless it plausibly makes sense to pass
// fixed-length-array-references in the future.
# define STD_ARRAY_DECL(tt, nn, vv) std::array<tt, nn> vv
# define STD_ARRAY_REF(tt, nn) std::array<tt, nn>&

// necessary if tt is a pointer type, otherwise optional
# define STD_ARRAY_KREF(tt, nn) const std::array<tt, nn>&

// Compile-time size check, then whole-array assignment.
# define STD_ARRAY_COPY(src, nn, dst) static_assert(sizeof(dst) == sizeof((dst)[0]) * nn, "Invalid STD_ARRAY_COPY() invocation."); (dst) = (src)

# define STD_ARRAY_PTR_TYPE(tt, nn) std::array<tt, nn>*
# define STD_ARRAY_PTR_DECL(tt, nn, vv) std::array<tt, nn>* vv

// argh, need double-braces for C++11 std::array and single-braces for C
# define STD_ARRAY_INIT_START() {
# define STD_ARRAY_INIT_END() }

// Zero-fill an entire std::array.
template <class T, std::size_t N> void STD_ARRAY_FILL0(std::array<T, N>& arr) {
  arr.fill(0);
}

// plain STD_ARRAY_FILL0() can't be used on array-references due to fallback
// code.
// this macro ensures that we *only* use it with uint32_t array-references
# define STD_ARRAY_REF_FILL0(ct, aref) static_assert(ct * sizeof(aref[0]) == sizeof(aref), "invalid STD_ARRAY_REF_FILL0() invocation"); aref.fill(0)

// Deletes copy-constructor/copy-assignment while keeping default
// construction; no-op in C builds.
# define NONCOPYABLE(struct_name) \
  struct_name() = default; \
  struct_name(const struct_name&) = delete; \
  struct_name& operator=(const struct_name&) = delete

#else
// C fallbacks: plain arrays and pointers with matching usage syntax.
# define STD_ARRAY_DECL(tt, nn, vv) tt vv[nn]
# define STD_ARRAY_REF(tt, nn) tt* const
# define STD_ARRAY_KREF(tt, nn) tt const* const
# define STD_ARRAY_COPY(src, nn, dst) memcpy(dst, src, nn * sizeof(dst[0]));
# define STD_ARRAY_PTR_TYPE(tt, nn) tt(*)[nn]
# define STD_ARRAY_PTR_DECL(tt, nn, vv) tt(*vv)[nn]
# define STD_ARRAY_INIT_START()
# define STD_ARRAY_INIT_END()
# define STD_ARRAY_FILL0(arr) memset(arr, 0, sizeof(arr))
# define STD_ARRAY_REF_FILL0(ct, aref) memset(aref, 0, ct * sizeof(*aref))

# define NONCOPYABLE(struct_name)
#endif
1684
// Word/uint32 views of a vector, for horizontal reductions.
typedef union {
  VecW vw;

  STD_ARRAY_DECL(uintptr_t, kWordsPerVec, w);

  STD_ARRAY_DECL(uint32_t, kInt32PerVec, u32);
} UniVec;

// Scalar-float view of a float vector.
typedef union {
  VecF vf;
  STD_ARRAY_DECL(float, kFloatPerFVec, f4);
} UniVecF;
1697
// sum must fit in 16 bits
HEADER_INLINE uintptr_t UniVecHsum16(UniVec uv) {
  // The word-additions first reduce the vector to one word; multiplying by
  // kMask0001 (0x0001000100010001) then places the sum of that word's 16-bit
  // fields in its top 16 bits.  Requires the running sums to never carry
  // across a 16-bit boundary, hence the fits-in-16-bits precondition.
#ifdef __LP64__
# ifdef USE_AVX2
  return ((uv.w[0] + uv.w[1] + uv.w[2] + uv.w[3]) * kMask0001) >> 48;
# else
  return ((uv.w[0] + uv.w[1]) * kMask0001) >> 48;
# endif
#else
  return (uv.w[0] * kMask0001) >> 16;
#endif
}
1710
// sum must fit in 32 bits
HEADER_INLINE uintptr_t UniVecHsum32(UniVec uv) {
  // Same trick as UniVecHsum16(), with 32-bit fields: multiplying by
  // kMask00000001 (0x0000000100000001) places the sum of a word's two 32-bit
  // fields in its top 32 bits.
#ifdef __LP64__
# ifdef USE_AVX2
  return ((uv.w[0] + uv.w[1] + uv.w[2] + uv.w[3]) * kMask00000001) >> 32;
# else
  return ((uv.w[0] + uv.w[1]) * kMask00000001) >> 32;
# endif
#else
  return uv.w[0];
#endif
}
1723
// Horizontal sum of the float lanes of vecf, via a plain scalar reduction.
HEADER_INLINE float VecFHsum(VecF vecf) {
  UniVecF uvf;
  uvf.vf = vecf;
#ifdef __LP64__
# ifdef FVEC_32
  // tested various uses of _mm256_hadd_ps, couldn't get them to be faster
  return uvf.f4[0] + uvf.f4[1] + uvf.f4[2] + uvf.f4[3] + uvf.f4[4] + uvf.f4[5] + uvf.f4[6] + uvf.f4[7];
# else
  return uvf.f4[0] + uvf.f4[1] + uvf.f4[2] + uvf.f4[3];
# endif
#else
  return uvf.f4[0];
#endif
}
1738
#ifdef USE_AVX2
// BMI2-based pack/unpack: _pdep_u64(x, m) scatters the low bits of x into
// the set-bit positions of m; _pext_u64(x, m) is the inverse gather.

// Spread the low halfword of hw to the even bit positions.
HEADER_INLINE uintptr_t UnpackHalfwordToWord(uintptr_t hw) {
  return _pdep_u64(hw, kMask5555);
}

// Spread the low halfword of hw to the odd bit positions.
HEADER_INLINE uintptr_t UnpackHalfwordToWordShift1(uintptr_t hw) {
  return _pdep_u64(hw, kMaskAAAA);
}

HEADER_INLINE Vec4thUint UnpackVec8thUintTo4th(Vec8thUint hw) {
  return _pdep_u64(hw, kMask5555);
}

// Gather the even bits of ww into a halfword.
HEADER_INLINE Halfword PackWordToHalfword(uintptr_t ww) {
  return _pext_u64(ww, kMask5555);
}

// The Mask5555/MaskAAAA variants are identical to the above here, since
// _pext_u64 ignores bits outside the mask; they only differ in the non-AVX2
// fallback, where PackWordToHalfword() assumes the irrelevant bits are clear.
HEADER_INLINE Halfword PackWordToHalfwordMask5555(uintptr_t ww) {
  return _pext_u64(ww, kMask5555);
}

HEADER_INLINE Halfword PackWordToHalfwordMaskAAAA(uintptr_t ww) {
  return _pext_u64(ww, kMaskAAAA);
}

HEADER_INLINE Vec8thUint PackVec4thUintTo8th(Vec4thUint ww) {
  return _pext_u64(ww, kMask5555);
}

HEADER_INLINE Vec16thUint PackVec8thUintTo16th(Vec8thUint ww) {
  return _pext_u64(ww, kMask5555);
}

// Spread the nybbles of hw to the low nybble of each byte.
HEADER_INLINE uintptr_t Unpack0F0F(uintptr_t hw) {
  return _pdep_u64(hw, kMask0F0F);
}

HEADER_INLINE Halfword Pack0F0F(uintptr_t ww) {
  return _pext_u64(ww, kMask0F0F);
}

HEADER_INLINE Halfword Pack0F0FMask(uintptr_t ww) {
  return _pext_u64(ww, kMask0F0F);
}

// Spread the 2-bit groups of qw to the low 2 bits of each byte.
HEADER_INLINE uintptr_t Unpack0303(uintptr_t qw) {
  return _pdep_u64(qw, kMask0303);
}

HEADER_INLINE Quarterword Pack0303(uintptr_t ww) {
  return _pext_u64(ww, kMask0303);
}

HEADER_INLINE Quarterword Pack0303Mask(uintptr_t ww) {
  return _pext_u64(ww, kMask0303);
}

// See https://stackoverflow.com/questions/21622212/how-to-perform-the-inverse-of-mm256-movemask-epi8-vpmovmskb .
// Inverse of _mm256_movemask_epi8(): expands each bit of mask into a 0xff
// (set) or 0x00 (clear) byte.
HEADER_INLINE VecUc InverseMovemaskFF(Vec8thUint mask) {
  __m256i vmask = _mm256_set1_epi32(mask);
  // Replicate each mask byte across the corresponding 8-byte group...
  const __m256i byte_gather = _mm256_setr_epi64x(0, kMask0101, 2 * kMask0101, 3 * kMask0101);
  vmask = _mm256_shuffle_epi8(vmask, byte_gather);
  // ...then test one distinct bit per byte: a byte becomes all-ones after the
  // OR iff its assigned mask bit was set.
  const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfeLL);
  vmask = _mm256_or_si256(vmask, bit_mask);
  return R_CAST(VecUc, _mm256_cmpeq_epi8(vmask, _mm256_set1_epi64x(-1)));
}

// If we're only interested in the even bits of mask. No need to mask out odd
// bits before calling.
HEADER_INLINE VecUc InverseMovespreadmaskFF(Vec4thUint mask) {
  __m256i vmask = _mm256_set1_epi64x(mask);
  const __m256i byte_gather = _mm256_setr_epi32(0, 0x01010101, 0x02020202, 0x03030303, 0x04040404, 0x05050505, 0x06060606, 0x07070707);
  vmask = _mm256_shuffle_epi8(vmask, byte_gather);
  const __m256i bit_mask = _mm256_set1_epi32(0xbfeffbfeU);
  vmask = _mm256_or_si256(vmask, bit_mask);
  return R_CAST(VecUc, _mm256_cmpeq_epi8(vmask, _mm256_set1_epi64x(-1)));
}
1816
1817 #else // !USE_AVX2
// Spread the low halfword of hw to the even bit positions, via standard SWAR
// doubling steps.
HEADER_INLINE uintptr_t UnpackHalfwordToWord(uintptr_t hw) {
# ifdef __LP64__
  hw = (hw | (hw << 16)) & kMask0000FFFF;
# endif
  hw = (hw | (hw << 8)) & kMask00FF;
  hw = (hw | (hw << 4)) & kMask0F0F;
  hw = (hw | (hw << 2)) & kMask3333;
  return ((hw | (hw << 1)) & kMask5555);
}

// Same, targeting the odd bit positions instead.
HEADER_INLINE uintptr_t UnpackHalfwordToWordShift1(uintptr_t hw) {
  return UnpackHalfwordToWord(hw) << 1;
}
1831
// Inverse of UnpackHalfwordToWord(): gather the even bits of ww into a
// halfword.
HEADER_INLINE Halfword PackWordToHalfword(uintptr_t ww) {
  // assumes only even bits of ww can be set
  ww = (ww | (ww >> 1)) & kMask3333;
  ww = (ww | (ww >> 2)) & kMask0F0F;
  ww = (ww | (ww >> 4)) & kMask00FF;
# ifdef __LP64__
  ww = (ww | (ww >> 8)) & kMask0000FFFF;
# endif
  return S_CAST(Halfword, ww | (ww >> kBitsPerWordD4));
}

// Variants which don't require the caller to pre-clear the irrelevant bits.
HEADER_INLINE Halfword PackWordToHalfwordMask5555(uintptr_t ww) {
  return PackWordToHalfword(ww & kMask5555);
}

// Gathers the odd bits of ww instead.
HEADER_INLINE Halfword PackWordToHalfwordMaskAAAA(uintptr_t ww) {
  return PackWordToHalfword((ww >> 1) & kMask5555);
}
1850
// Spread the nybbles of hw to the low nybble of each byte.
HEADER_INLINE uintptr_t Unpack0F0F(uintptr_t hw) {
#ifdef __LP64__
  hw = (hw | (hw << 16)) & kMask0000FFFF;
#endif
  hw = (hw | (hw << 8)) & kMask00FF;
  return ((hw | (hw << 4)) & kMask0F0F);
}

// Inverse of Unpack0F0F(); assumes only kMask0F0F bits of ww can be set.
HEADER_INLINE Halfword Pack0F0F(uintptr_t ww) {
  ww = (ww | (ww >> 4)) & kMask00FF;
# ifdef __LP64__
  ww = (ww | (ww >> 8)) & kMask0000FFFF;
# endif
  return S_CAST(Halfword, ww | (ww >> kBitsPerWordD4));
}

// Same, without the precondition on ww.
HEADER_INLINE Halfword Pack0F0FMask(uintptr_t ww) {
  return Pack0F0F(ww & kMask0F0F);
}
1870
// Spread the 2-bit groups of qw to the low 2 bits of each byte.  (The 'X'
// positions in the diagrams below are garbage bits removed by the final
// mask.)
HEADER_INLINE uintptr_t Unpack0303(uintptr_t qw) {
  // ................................................fedcba9876543210
#ifdef __LP64__
  qw = (qw | (qw << 24)) & kMask000000FF;
  // ........................fedcba98........................76543210
#endif
  qw = qw | (qw << 12);
  // ............fedcba98....fedcba98............76543210....76543210

  qw = qw | (qw << 6);
  // ......fedcbaXXdcbaXXdcbaXXdcba98......765432XX5432XX5432XX543210

  return (qw & kMask0303);
  // ......fe......dc......ba......98......76......54......32......10
}

// Inverse of Unpack0303(); assumes only kMask0303 bits of ww can be set.
HEADER_INLINE Quarterword Pack0303(uintptr_t ww) {
  // ......fe......dc......ba......98......76......54......32......10

  ww = ww | (ww >> 6);
  // ......fe....fedc....dcba....ba98....9876....7654....5432....3210

  ww = ww | (ww >> 12);
  // ......fe....fedc..fedcbafedcba98dcba9876ba9876549876543276543210

#ifdef __LP64__
  ww = ww & kMask000000FF;
  // ........................fedcba98........................76543210

  return S_CAST(Quarterword, ww | (ww >> 24));
#else
  return S_CAST(Quarterword, ww);
#endif
}
1905
Pack0303Mask(uintptr_t ww)1906 HEADER_INLINE uintptr_t Pack0303Mask(uintptr_t ww) {
1907 return Pack0303(ww & kMask0303);
1908 }
1909
# ifdef __LP64__
// 32-bit-operand analogues of the word-level pack/unpack functions above.
HEADER_INLINE Vec4thUint UnpackVec8thUintTo4th(Vec8thUint hw) {
  hw = (hw | (hw << 8)) & 0x00ff00ffU;
  hw = (hw | (hw << 4)) & 0x0f0f0f0fU;
  hw = (hw | (hw << 2)) & 0x33333333U;
  return (hw | (hw << 1)) & 0x55555555U;
}

// assumes only even bits of ww can be set
HEADER_INLINE Vec8thUint PackVec4thUintTo8th(Vec4thUint ww) {
  ww = (ww | (ww >> 1)) & kMask3333;
  ww = (ww | (ww >> 2)) & kMask0F0F;
  ww = (ww | (ww >> 4)) & kMask00FF;
  return S_CAST(Vec8thUint, ww | (ww >> 8));
}

// assumes only even bits of ww can be set
HEADER_INLINE Vec16thUint PackVec8thUintTo16th(Vec8thUint ww) {
  ww = (ww | (ww >> 1)) & 0x3333;
  ww = (ww | (ww >> 2)) & 0x0f0f;
  return S_CAST(Vec16thUint, ww | (ww >> 4));
}

# ifdef USE_SSE42
// Inverse of _mm_movemask_epi8(): expands each bit of mask into a 0xff (set)
// or 0x00 (clear) byte.  Same approach as the AVX2 version above.
HEADER_INLINE VecUc InverseMovemaskFF(Vec8thUint mask) {
  __m128i vmask = _mm_set1_epi16(mask);
  const __m128i byte_gather = _mm_setr_epi32(0, 0, 0x01010101, 0x01010101);
  vmask = _mm_shuffle_epi8(vmask, byte_gather);
  const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfeLL);
  vmask = _mm_or_si128(vmask, bit_mask);
  return R_CAST(VecUc, _mm_cmpeq_epi8(vmask, _mm_set1_epi64x(-1)));
}

// If we're only interested in the even bits of mask. No need to mask out odd
// bits before calling.
HEADER_INLINE VecUc InverseMovespreadmaskFF(Vec4thUint mask) {
  __m128i vmask = _mm_set1_epi32(mask);
  const __m128i byte_gather = _mm_setr_epi32(0, 0x01010101, 0x02020202, 0x03030303);
  vmask = _mm_shuffle_epi8(vmask, byte_gather);
  const __m128i bit_mask = _mm_set1_epi32(0xbfeffbfeU);
  vmask = _mm_or_si128(vmask, bit_mask);
  return R_CAST(VecUc, _mm_cmpeq_epi8(vmask, _mm_set1_epi64x(-1)));
}
# endif  // USE_SSE42

# endif  // __LP64__
1952 #endif // !USE_AVX2
1953
1954 // alignment must be a power of 2
1955 // tried splitting out RoundDownPow2U32() and RoundUpPow2U32() functions, no
1956 // practical difference
RoundDownPow2(uintptr_t val,uintptr_t alignment)1957 HEADER_CINLINE uintptr_t RoundDownPow2(uintptr_t val, uintptr_t alignment) {
1958 return val & (~(alignment - 1));
1959 }
1960
RoundDownPow2U64(uint64_t val,uint64_t alignment)1961 HEADER_CINLINE uint64_t RoundDownPow2U64(uint64_t val, uint64_t alignment) {
1962 return val & (~(alignment - 1));
1963 }
1964
RoundUpPow2(uintptr_t val,uintptr_t alignment)1965 HEADER_CINLINE uintptr_t RoundUpPow2(uintptr_t val, uintptr_t alignment) {
1966 return (val + alignment - 1) & (~(alignment - 1));
1967 }
1968
1969
1970 // This is best when the divisor is constant (so (divisor - 1) can be
1971 // collapsed), and handles val == 0 properly. If the divisor isn't constant
1972 // and val is guaranteed to be nonzero, go with explicit
1973 // "1 + (val - 1) / divisor".
1974 //
1975 // Note that this fails if (val + divisor - 1) overflows the widest integer
1976 // type on the left.
1977 //
1978 // Since forced-uint32_t RoundDownPow2 was pointless, it stands to reason that
1979 // the same applies to DivUp. With that said, we may as well make divisor a
1980 // uint32_t just in case this ever gets used on a not-known-at-compile-time
1981 // divisor, since 64/64 can be slower than 64/32.
DivUp(uintptr_t val,uint32_t divisor)1982 HEADER_CINLINE uintptr_t DivUp(uintptr_t val, uint32_t divisor) {
1983 return (val + divisor - 1) / divisor;
1984 }
1985
DivUpU64(uint64_t val,uint32_t divisor)1986 HEADER_CINLINE uint64_t DivUpU64(uint64_t val, uint32_t divisor) {
1987 return (val + divisor - 1) / divisor;
1988 }
1989
// "Nz" means nonzero in two ways:
// * result is in [1, modulus], not [0, modulus - 1]
// * val should not be zero (though this expression still works if val is zero
//   and modulus is a hardcoded power of 2)
HEADER_INLINE uint32_t ModNz(uintptr_t val, uint32_t modulus) {
  // val == 0 wraps (val - 1) to all-ones; for a power-of-2 modulus the
  // remainder is then modulus - 1, so the result is modulus.
  return (1 + ((val - 1) % modulus));
}
1997
1998 // No need for ModNzU64 in practice, since high bits don't affect result when
1999 // modulus is a power of 2.
2000
2001 // Equivalent to (static_cast<int32_t>(uii) < 0). Most frequently used on
2002 // possibly-error chromosome indexes.
IsI32Neg(uint32_t uii)2003 HEADER_INLINE uint32_t IsI32Neg(uint32_t uii) {
2004 return uii >> 31;
2005 }
2006
// Returns |ii| as a uint32_t; ii == INT32_MIN maps to 0x80000000U.
HEADER_INLINE uint32_t abs_i32(int32_t ii) {
  // Arithmetic right shift. 0xffffffffU when ii is negative, 0 otherwise.
  // (Right-shifting a negative value is implementation-defined in ISO C, but
  // all compilers targeted by this codebase perform an arithmetic shift.)
  const uint32_t neg_sign_bit = S_CAST(uint32_t, ii >> 31);

  // Branchless conditional negation: (x ^ 0) - 0 or (x ^ -1) - (-1).
  return (S_CAST(uint32_t, ii) ^ neg_sign_bit) - neg_sign_bit;
}
2013
// Size of the most recent failed pgl_malloc() request, for error reporting.
extern uintptr_t g_failed_alloc_attempt_size;
// with NDEBUG undefined, may want to define a bunch of macros so that line
// number is printed as well; see e.g.
// https://stackoverflow.com/questions/15884793/how-to-get-the-name-or-file-and-line-of-caller-method

#if (__GNUC__ == 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
// putting this in the header file caused a bunch of gcc 4.4 strict-aliasing
// warnings, while not doing so seems to inhibit some malloc-related compiler
// optimizations, bleah
// compromise: header-inline iff gcc version >= 4.7 (might not be the right
// cutoff?)
BoolErr pgl_malloc(uintptr_t size, void* pp);
#else
// malloc() wrapper: stores the allocation through pp, returns 0 on success.
// On failure, records size in g_failed_alloc_attempt_size and returns 1.
// Unfortunately, defining the second parameter to be of type void** doesn't do
// the right thing.
HEADER_INLINE BoolErr pgl_malloc(uintptr_t size, void* pp) {
  *S_CAST(unsigned char**, pp) = S_CAST(unsigned char*, malloc(size));
  if (likely(*S_CAST(unsigned char**, pp))) {
    return 0;
  }
  g_failed_alloc_attempt_size = size;
  return 1;
}
#endif
2038
// This must be used for all fwrite() calls where len could be >= 2^31, since
// OS X raw fwrite() doesn't work in that case.
static_assert(sizeof(size_t) == sizeof(intptr_t), "plink2_base assumes size_t and intptr_t are synonymous.");
BoolErr fwrite_checked(const void* buf, uintptr_t len, FILE* outfile);

// Only use this if loading < len bytes is not an error.
// IntErr fread_checked2(void* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr);

BoolErr fread_checked(void* buf, uintptr_t len, FILE* infile);

// Closes *fptr_ptr and sets it to nullptr.  Returns nonzero if the stream's
// error flag was already set, or if fclose() itself failed.
HEADER_INLINE BoolErr fclose_null(FILE** fptr_ptr) {
  int32_t ii = ferror_unlocked(*fptr_ptr);
  int32_t jj = fclose(*fptr_ptr);
  *fptr_ptr = nullptr;
  return ii || jj;
}
2055
2056
#ifdef __LP64__
// Reads an integer in [1, cap].
// * Errors out unless first character is a digit, or is '+' followed by a
//   digit.  Initial whitespace is not permitted.
// * Like atoi(), this considers the number to be terminated by *any*
//   nondigit character.  E.g. "1000genomes" is treated as a valid instance of
//   1000 rather than a nonnumeric token, and "98.6" is treated as 98.  (See
//   ScanmovPosintCapped(), ScanmovUintCapped(), etc. in plink2_string if
//   you want strtol-like semantics, where the pointer is moved.)
// * Errors out on overflow.  This may be the biggest advantage over atoi().
BoolErr ScanPosintCapped(const char* str_iter, uint64_t cap, uint32_t* valp);

// [0, cap]
BoolErr ScanUintCapped(const char* str_iter, uint64_t cap, uint32_t* valp);

// [-bound, bound]
BoolErr ScanIntAbsBounded(const char* str_iter, uint64_t bound, int32_t* valp);
#else  // not __LP64__
// Need to be more careful in 32-bit case due to overflow.
// A funny-looking div_10/mod_10 interface is used since the cap will usually
// be a constant, and we want the integer division/modulus to occur at compile
// time.
BoolErr ScanPosintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp);

BoolErr ScanUintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp);

BoolErr ScanIntAbsBounded32(const char* str_iter, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp);

// 32-bit wrappers matching the 64-bit interfaces above.
HEADER_INLINE BoolErr ScanPosintCapped(const char* str, uint32_t cap, uint32_t* valp) {
  return ScanPosintCapped32(str, cap / 10, cap % 10, valp);
}

HEADER_INLINE BoolErr ScanUintCapped(const char* str, uint32_t cap, uint32_t* valp) {
  return ScanUintCapped32(str, cap / 10, cap % 10, valp);
}

HEADER_INLINE BoolErr ScanIntAbsBounded(const char* str, uint32_t bound, int32_t* valp) {
  return ScanIntAbsBounded32(str, bound / 10, bound % 10, valp);
}
#endif
2097
2098
// intentionally rejects -2^31 for now
// (that's a reason why this doesn't have the shorter name 'ScanI32')
HEADER_INLINE BoolErr ScanInt32(const char* str, int32_t* valp) {
  return ScanIntAbsBounded(str, 0x7fffffff, valp);
}

// default cap = 0x7ffffffe (2^31 - 2)
HEADER_INLINE BoolErr ScanPosintDefcap(const char* str, uint32_t* valp) {
  return ScanPosintCapped(str, 0x7ffffffe, valp);
}

HEADER_INLINE BoolErr ScanUintDefcap(const char* str, uint32_t* valp) {
  return ScanUintCapped(str, 0x7ffffffe, valp);
}

HEADER_INLINE BoolErr ScanIntAbsDefcap(const char* str, int32_t* valp) {
  return ScanIntAbsBounded(str, 0x7ffffffe, valp);
}

// "Icap" = INT32_MAX cap (2^31 - 1).
HEADER_INLINE BoolErr ScanUintIcap(const char* str, uint32_t* valp) {
  return ScanUintCapped(str, 0x7fffffff, valp);
}
2121
2122
2123 // memcpya() tends to be used to copy known-length text strings, while
2124 // memseta() has more mixed usage but char* type is also at least as common as
2125 // unsigned char*; append comes up less when working with raw byte arrays. So
2126 // give the shortest-name forms char* return types.
memseta(void * target,unsigned char val,uintptr_t ct)2127 HEADER_INLINE char* memseta(void* target, unsigned char val, uintptr_t ct) {
2128 memset(target, val, ct);
2129 return &(S_CAST(char*, target)[ct]);
2130 }
2131
memsetua(void * target,unsigned char val,uintptr_t ct)2132 HEADER_INLINE unsigned char* memsetua(void* target, unsigned char val, uintptr_t ct) {
2133 memset(target, val, ct);
2134 return &(S_CAST(unsigned char*, target)[ct]);
2135 }
2136
// Round-up unit conversions between bits, words, vectors, and cachelines.

// Number of vectors needed to store val bits.
HEADER_CINLINE uintptr_t BitCtToVecCt(uintptr_t val) {
  return DivUp(val, kBitsPerVec);
}

// Number of words needed to store val bits.
HEADER_CINLINE uintptr_t BitCtToWordCt(uintptr_t val) {
  return DivUp(val, kBitsPerWord);
}

// Number of words needed to store val bits, rounded up to a whole vector.
HEADER_CINLINE uintptr_t BitCtToAlignedWordCt(uintptr_t val) {
  return kWordsPerVec * BitCtToVecCt(val);
}

HEADER_CINLINE uintptr_t BitCtToCachelineCt(uintptr_t val) {
  return DivUp(val, kBitsPerCacheline);
}

HEADER_CINLINE uintptr_t Int32CtToVecCt(uintptr_t val) {
  return DivUp(val, kInt32PerVec);
}

HEADER_CINLINE uintptr_t Int32CtToCachelineCt(uintptr_t val) {
  return DivUp(val, kInt32PerCacheline);
}

HEADER_CINLINE uintptr_t WordCtToVecCt(uintptr_t val) {
  return DivUp(val, kWordsPerVec);
}

HEADER_CINLINE uint64_t WordCtToCachelineCtU64(uint64_t val) {
  return DivUpU64(val, kWordsPerCacheline);
}

#ifdef __LP64__
HEADER_CINLINE uintptr_t Int64CtToVecCt(uintptr_t val) {
  return DivUp(val, kBytesPerVec / 8);
}
#else
// 32-bit build: "vectors" are 4 bytes, so each int64 spans two of them.
HEADER_CINLINE uintptr_t Int64CtToVecCt(uintptr_t val) {
  return val * 2;
}
#endif

HEADER_CINLINE uintptr_t Int64CtToCachelineCt(uintptr_t val) {
  return DivUp(val, kInt64PerCacheline);
}

// doubles and int64s have the same size, so reuse the conversion.
HEADER_CINLINE uintptr_t DblCtToVecCt(uintptr_t val) {
  return Int64CtToVecCt(val);
}

HEADER_CINLINE uintptr_t VecCtToCachelineCt(uintptr_t val) {
  return DivUp(val, kVecsPerCacheline);
}

HEADER_CINLINE uint64_t VecCtToCachelineCtU64(uint64_t val) {
  return DivUpU64(val, kVecsPerCacheline);
}
2194
// C++11 standard guarantees std::min and std::max return leftmost minimum in
// case of equality; best to adhere to that
// We don't actually use std::min/max since casting one argument when comparing
// e.g. a uint32_t with a uintptr_t is pointlessly verbose. Compiler will
// still warn against comparison of signed with unsigned.
// Note that, like any function-style macro, these may evaluate an argument
// more than once; don't pass expressions with side effects.
#define MAXV(aa, bb) (((bb) > (aa))? (bb) : (aa))
#define MINV(aa, bb) (((bb) < (aa))? (bb) : (aa))
2202
2203
// don't use PglErr here since there's only one failure mode, it's
// obvious what it is, and stacking multiple aligned_mallocs in a single
// if-statement is useful.
// (Nonzero return = allocation failure, consistent with pgl_malloc above.)
BoolErr aligned_malloc(uintptr_t size, uintptr_t alignment, void* aligned_pp);
2208
#ifdef USE_SSE42
// Sum of the 2-bit fields ("nyps") of val: each field contributes its low
// bit once and its high bit twice, hence the second popcount over the
// high-bit mask.
HEADER_CINLINE uint32_t NypsumWord(uintptr_t val) {
  return __builtin_popcountll(val) + __builtin_popcountll(val & kMaskAAAA);
}
#else
// SWAR fallback: widen nyp sums to 4-bit then 8-bit fields, then total the
// bytes into the top byte via the kMask0101 multiply.
HEADER_CINLINE2 uint32_t NypsumWord(uintptr_t val) {
  val = (val & kMask3333) + ((val >> 2) & kMask3333);
  return (((val + (val >> 4)) & kMask0F0F) * kMask0101) >> (kBitsPerWord - 8);
}
#endif
2219
// the simple version, good enough for all non-time-critical stuff
// (without SSE4.2, PopcountWords() tends to be >3x as fast on arrays. with
// SSE4.2 but no AVX2, there's no noticeable difference. with AVX2,
// PopcountWords() gains another factor of 1.5-2x.)
#ifdef USE_SSE42
HEADER_CINLINE uint32_t PopcountWord(uintptr_t val) {
  return __builtin_popcountll(val);
}
#else
HEADER_CINLINE2 uint32_t PopcountWord(uintptr_t val) {
  // Sadly, this was still faster than the LLVM implementation of the intrinsic
  // as of 2016.
  // First step of the classic SWAR popcount (pairwise bit sums), then reuse
  // NypsumWord() for the rest.
  return NypsumWord(val - ((val >> 1) & kMask5555));
}
#endif
2235
// Total number of set bits across val0 and val1.
#ifdef USE_SSE42
HEADER_INLINE uint32_t Popcount2Words(uintptr_t val0, uintptr_t val1) {
  return __builtin_popcountll(val0) + __builtin_popcountll(val1);
}
#else
HEADER_INLINE uint32_t Popcount2Words(uintptr_t val0, uintptr_t val1) {
  // Shared SWAR reduction: collapse both words to 2-bit popcounts, merge into
  // a single word of 4-bit partial sums, then sum bytes with a multiply.
  val0 -= (val0 >> 1) & kMask5555;
  val1 -= (val1 >> 1) & kMask5555;
  const uintptr_t four_bit = (val0 & kMask3333) + ((val0 >> 2) & kMask3333) + (val1 & kMask3333) + ((val1 >> 2) & kMask3333);
  // up to 16 values in 0..12; sum fits in 8 bits
  return (((four_bit & kMask0F0F) + ((four_bit >> 4) & kMask0F0F)) * kMask0101) >> (kBitsPerWord - 8);
}
#endif
2249
2250 #ifndef __LP64__
Popcount4Words(uintptr_t val0,uintptr_t val1,uintptr_t val2,uintptr_t val3)2251 HEADER_INLINE uint32_t Popcount4Words(uintptr_t val0, uintptr_t val1, uintptr_t val2, uintptr_t val3) {
2252 val0 -= (val0 >> 1) & kMask5555;
2253 val1 -= (val1 >> 1) & kMask5555;
2254 val2 -= (val2 >> 1) & kMask5555;
2255 val3 -= (val3 >> 1) & kMask5555;
2256 const uintptr_t four_bit_0 = (val0 & kMask3333) + ((val0 >> 2) & kMask3333) + (val1 & kMask3333) + ((val1 >> 2) & kMask3333);
2257 const uintptr_t four_bit_1 = (val2 & kMask3333) + ((val2 >> 2) & kMask3333) + (val3 & kMask3333) + ((val3 >> 2) & kMask3333);
2258 return (((four_bit_0 & kMask0F0F) + ((four_bit_0 >> 4) & kMask0F0F) + (four_bit_1 & kMask0F0F) + ((four_bit_1 >> 4) & kMask0F0F)) * kMask0101) >> (kBitsPerWord - 8);
2259 }
2260 #endif
2261
// Popcount specialized for a "Vec8th"-sized integer (e.g. a vector movemask
// result; per the comment below, only the low 15 bits may be set).
#ifdef __LP64__
# ifdef USE_SSE42
HEADER_INLINE uint32_t PopcountVec8thUint(uint32_t val) {
  return __builtin_popcount(val);
}
# else
HEADER_INLINE uint32_t PopcountVec8thUint(uint32_t val) {
  // May as well exploit the fact that only the low 15 bits may be set.
  val = val - ((val >> 1) & 0x5555);
  val = (val & 0x3333) + ((val >> 2) & 0x3333);
  val = (val + (val >> 4)) & 0x0f0f;
  return (val + (val >> 8)) & 0xff;
}
# endif
#endif
2277
VecIsAligned(const void * ptr)2278 HEADER_INLINE uint32_t VecIsAligned(const void* ptr) {
2279 return !(R_CAST(uintptr_t, ptr) % kBytesPerVec);
2280 }
2281
// Rounds the pointer-sized value stored at pp up to the next vector-alignment
// boundary, in place.
HEADER_INLINE void VecAlignUp(void* pp) {
  uintptr_t addr = *S_CAST(uintptr_t*, pp);
#if (__GNUC__ == 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
  // bleah, need to write this way to avoid gcc 4.4 strict-aliasing warning
  addr = RoundUpPow2(addr, kBytesPerVec);
  memcpy(pp, &addr, sizeof(intptr_t));
#else
  *S_CAST(uintptr_t*, pp) = RoundUpPow2(addr, kBytesPerVec);
#endif
}
2292
// VecAlignUp() in 64-bit builds, no-op in 32-bit builds.
#ifdef __LP64__
HEADER_INLINE void VecAlignUp64(void* pp) {
  VecAlignUp(pp);
}
#else
HEADER_INLINE void VecAlignUp64(__maybe_unused void* pp) {
}
#endif
2301
// Turns out memcpy(&cur_word, bytearr, ct) can't be trusted to be fast when ct
// isn't known at compile time.
//
// ct must be less than sizeof(intptr_t).  ct == 0 handled correctly, albeit
// inefficiently.
// (Unaligned little-endian loads assumed, consistent with the x86-specific
// code elsewhere in this file.)
HEADER_INLINE uintptr_t ProperSubwordLoad(const void* bytearr, uint32_t ct) {
  const unsigned char* bytearr_uc = S_CAST(const unsigned char*, bytearr);
#ifdef __LP64__
  if (ct >= 4) {
    // ct in 4..7: two possibly-overlapping 4-byte loads, the second starting
    // at offset (ct - 4) so it ends exactly at byte ct.
    const uint32_t remainder = ct - 4;
    bytearr_uc = &(bytearr_uc[remainder]);
    uintptr_t cur_word = *R_CAST(const uint32_t*, bytearr_uc);
    if (remainder) {
      cur_word <<= remainder * CHAR_BIT;
      cur_word |= *S_CAST(const uint32_t*, bytearr);
    }
    return cur_word;
  }
#endif
  if (ct >= 2) {
    // ct in 2..3: same overlapping-load trick with 2-byte loads.
    const uint32_t remainder = ct & 1;
    uintptr_t cur_word = *R_CAST(const uint16_t*, &(bytearr_uc[remainder]));
    if (remainder) {
      cur_word <<= 8;
      cur_word |= bytearr_uc[0];
    }
    return cur_word;
  }
  return ct? bytearr_uc[0] : 0;
}
2332
SubwordLoad(const void * bytearr,uint32_t ct)2333 HEADER_INLINE uintptr_t SubwordLoad(const void* bytearr, uint32_t ct) {
2334 if (ct == S_CAST(uint32_t, kBytesPerWord)) {
2335 return *S_CAST(const uintptr_t*, bytearr);
2336 }
2337 return ProperSubwordLoad(bytearr, ct);
2338 }
2339
// ct must be in 1..4.
// Loads the first ct bytes of bytearr into the low bytes of a uint32
// (little-endian).
HEADER_INLINE uint32_t SubU32Load(const void* bytearr, uint32_t ct) {
  if (ct & 1) {
    // ct == 1 or 3: low byte first, then an unaligned u16 for bytes 1..2.
    const unsigned char* bytearr_iter = S_CAST(const unsigned char*, bytearr);
    uint32_t cur_uint = *bytearr_iter;
    if (ct == 3) {
      ++bytearr_iter;
      cur_uint |= S_CAST(uint32_t, *R_CAST(const uint16_t*, bytearr_iter)) << 8;
    }
    return cur_uint;
  }
  if (ct == 2) {
    return *S_CAST(const uint16_t*, bytearr);
  }
  return *S_CAST(const uint32_t*, bytearr);
}
2356
// tried making this non-inline, loop took more than 50% longer
// Stores the low byte_ct bytes of cur_word to target; byte_ct is expected to
// be less than kBytesPerWord (the full-word case is handled by SubwordStore()
// below).
HEADER_INLINE void ProperSubwordStore(uintptr_t cur_word, uint32_t byte_ct, void* target) {
  unsigned char* target_iter = S_CAST(unsigned char*, target);
#ifdef __LP64__
  if (byte_ct >= 4) {
    // byte_ct in 4..7: write the low 4 bytes, then (if needed) an overlapping
    // 4-byte store ending exactly at byte_ct.
    *R_CAST(uint32_t*, target_iter) = cur_word;
    if (byte_ct == 4) {
      return;
    }
    const uint32_t remainder = byte_ct - 4;
    target_iter = &(target_iter[remainder]);
    cur_word >>= remainder * CHAR_BIT;
    *R_CAST(uint32_t*, target_iter) = cur_word;
    return;
  }
#endif
  // byte_ct in 0..3: one optional byte store followed by one optional u16
  // store.
  if (byte_ct & 1) {
    *target_iter++ = cur_word;
    cur_word >>= 8;
  }
  if (byte_ct & 2) {
    *R_CAST(uint16_t*, target_iter) = cur_word;
  }
}
2381
ProperSubwordStoreMov(uintptr_t cur_word,uint32_t byte_ct,unsigned char ** targetp)2382 HEADER_INLINE void ProperSubwordStoreMov(uintptr_t cur_word, uint32_t byte_ct, unsigned char** targetp) {
2383 ProperSubwordStore(cur_word, byte_ct, *targetp);
2384 *targetp += byte_ct;
2385 }
2386
SubwordStore(uintptr_t cur_word,uint32_t byte_ct,void * target)2387 HEADER_INLINE void SubwordStore(uintptr_t cur_word, uint32_t byte_ct, void* target) {
2388 if (byte_ct == kBytesPerWord) {
2389 *S_CAST(uintptr_t*, target) = cur_word;
2390 return;
2391 }
2392 ProperSubwordStore(cur_word, byte_ct, target);
2393 }
2394
SubwordStoreMov(uintptr_t cur_word,uint32_t byte_ct,unsigned char ** targetp)2395 HEADER_INLINE void SubwordStoreMov(uintptr_t cur_word, uint32_t byte_ct, unsigned char** targetp) {
2396 SubwordStore(cur_word, byte_ct, *targetp);
2397 *targetp += byte_ct;
2398 }
2399
2400 // byte_ct must be in 1..4.
SubU32Store(uint32_t cur_uint,uint32_t byte_ct,void * target)2401 HEADER_INLINE void SubU32Store(uint32_t cur_uint, uint32_t byte_ct, void* target) {
2402 if (byte_ct & 1) {
2403 unsigned char* target_iter = S_CAST(unsigned char*, target);
2404 *target_iter = cur_uint;
2405 if (byte_ct == 3) {
2406 ++target_iter;
2407 *R_CAST(uint16_t*, target_iter) = cur_uint >> 8;
2408 }
2409 return;
2410 }
2411 if (byte_ct == 2) {
2412 *S_CAST(uint16_t*, target) = cur_uint;
2413 return;
2414 }
2415 *S_CAST(uint32_t*, target) = cur_uint;
2416 return;
2417 }
2418
SubU32StoreMov(uint32_t cur_uint,uint32_t byte_ct,unsigned char ** targetp)2419 HEADER_INLINE void SubU32StoreMov(uint32_t cur_uint, uint32_t byte_ct, unsigned char** targetp) {
2420 SubU32Store(cur_uint, byte_ct, *targetp);
2421 *targetp += byte_ct;
2422 }
2423
// Stores the low byte_ct bytes of cur_u64 at *targetp and advances *targetp.
#ifdef __LP64__
HEADER_INLINE void SubU64StoreMov(uint64_t cur_u64, uint32_t byte_ct, unsigned char** targetp) {
  return SubwordStoreMov(cur_u64, byte_ct, targetp);
}
#else
HEADER_INLINE void SubU64StoreMov(uint64_t cur_u64, uint32_t byte_ct, unsigned char** targetp) {
  // 32-bit build: emit the low 4 bytes directly when byte_ct > 4, then defer
  // to the 32-bit routine for the remaining high bytes.
  if (byte_ct > 4) {
    *R_CAST(uint32_t*, *targetp) = cur_u64;
    *targetp += 4;
    byte_ct -= 4;
    cur_u64 >>= 32;
  }
  return SubU32StoreMov(cur_u64, byte_ct, targetp);
}
#endif
2439
2440
// Vector-aligned allocation through *aligned_pp; nonzero return on failure.
// Pair with vecaligned_free().
HEADER_INLINE BoolErr vecaligned_malloc(uintptr_t size, void* aligned_pp) {
#ifdef USE_AVX2
  return aligned_malloc(size, kBytesPerVec, aligned_pp);
#else
# if defined(__APPLE__) || !defined(__LP64__)
  // On these platforms the plain allocator already provides sufficient
  // alignment for the (16-byte-or-smaller) vector type, so skip the
  // pointer-stashing wrapper; the assert double-checks that assumption.
  const BoolErr ret_boolerr = pgl_malloc(size, aligned_pp);
  assert(VecIsAligned(*S_CAST(uintptr_t**, aligned_pp)));
  return ret_boolerr;
# else
  return aligned_malloc(size, kBytesPerVec, aligned_pp);
# endif
#endif
}
2454
// Cacheline-aligned allocation through *aligned_pp; nonzero return on
// failure.  Pair with aligned_free().
HEADER_INLINE BoolErr cachealigned_malloc(uintptr_t size, void* aligned_pp) {
  return aligned_malloc(size, kCacheline, aligned_pp);
}
2458
// Frees an aligned_malloc()/cachealigned_malloc() result: the raw allocation
// pointer lives in the word immediately preceding the aligned block
// (stashed there by aligned_malloc()), so retrieve it and free that.
HEADER_INLINE void aligned_free(void* aligned_ptr) {
  free(R_CAST(void*, S_CAST(uintptr_t*, aligned_ptr)[-1]));
}
2462
aligned_free_cond(void * aligned_ptr)2463 HEADER_INLINE void aligned_free_cond(void* aligned_ptr) {
2464 if (aligned_ptr) {
2465 free(R_CAST(void*, S_CAST(uintptr_t*, aligned_ptr)[-1]));
2466 }
2467 }
2468
// C spec is slightly broken here: free() demands a non-const pointer, so we
// strip constness explicitly.
HEADER_INLINE void free_const(const void* memptr) {
  free(K_CAST(void*, memptr));
}
2473
free_cond(const void * memptr)2474 HEADER_INLINE void free_cond(const void* memptr) {
2475 if (memptr) {
2476 free_const(memptr);
2477 }
2478 }
2479
// Matching deallocators for vecaligned_malloc().  The Apple/32-bit non-AVX2
// configuration got its memory straight from pgl_malloc(), so plain free()
// applies there; the other configurations route through aligned_free().
#ifdef USE_AVX2
HEADER_INLINE void vecaligned_free(void* aligned_ptr) {
  aligned_free(aligned_ptr);
}

HEADER_INLINE void vecaligned_free_cond(void* aligned_ptr) {
  aligned_free_cond(aligned_ptr);
}
#else
# if defined(__APPLE__) || !defined(__LP64__)
HEADER_INLINE void vecaligned_free(void* aligned_ptr) {
  free(aligned_ptr);
}

HEADER_INLINE void vecaligned_free_cond(void* aligned_ptr) {
  free_cond(aligned_ptr);
}
# else
HEADER_INLINE void vecaligned_free(void* aligned_ptr) {
  aligned_free(aligned_ptr);
}

HEADER_INLINE void vecaligned_free_cond(void* aligned_ptr) {
  aligned_free_cond(aligned_ptr);
}
# endif
#endif
2507
2508
// Nonzero iff the two byte ranges are identical.  64-bit builds use an
// out-of-line custom implementation; 32-bit builds just wrap memcmp().
#ifdef __LP64__
int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct);
#else
HEADER_INLINE int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) {
  return !memcmp(m1, m2, byte_ct);
}
#endif
2516
memcpya(void * __restrict target,const void * __restrict source,uintptr_t ct)2517 HEADER_INLINE char* memcpya(void* __restrict target, const void* __restrict source, uintptr_t ct) {
2518 memcpy(target, source, ct);
2519 return &(S_CAST(char*, target)[ct]);
2520 }
2521
memcpyua(void * __restrict target,const void * __restrict source,uintptr_t ct)2522 HEADER_INLINE unsigned char* memcpyua(void* __restrict target, const void* __restrict source, uintptr_t ct) {
2523 memcpy(target, source, ct);
2524 return &(S_CAST(unsigned char*, target)[ct]);
2525 }
2526
AppendU16(uint32_t usii,unsigned char ** targetp)2527 HEADER_INLINE void AppendU16(uint32_t usii, unsigned char** targetp) {
2528 memcpy(*targetp, &usii, sizeof(int16_t));
2529 *targetp += sizeof(int16_t);
2530 }
2531
AppendU32(uint32_t uii,unsigned char ** targetp)2532 HEADER_INLINE void AppendU32(uint32_t uii, unsigned char** targetp) {
2533 memcpy(*targetp, &uii, sizeof(int32_t));
2534 *targetp += sizeof(int32_t);
2535 }
2536
// Tried beating memcpy for usually-small strings not known to have length <=
// 8, gave up.

#if defined(__LP64__) && defined(__cplusplus)
// See https://stackoverflow.com/questions/9510514/integer-range-based-template-specialisation .

template <bool> struct TRange;

// This makes MemequalKImpl<byte_ct> expand to
// MemequalKImpl<byte_ct, TRange<true> >.
// If a later single-parameter template defines the same thing, that takes
// precedence.
template <uint32_t N, typename = TRange<true> > struct MemequalKImpl {
  // Generic fallback: defer to the runtime memequal().
  static int32_t MemequalK(const void* m1, const void* m2) {
    return memequal(m1, m2, N);
  }
};
2554
// Power-of-2(-ish) sizes use single unaligned scalar/SSE2 compares;
// in-between sizes use two overlapping loads, the second ending exactly at
// byte N.
template <> struct MemequalKImpl<1> {
  static int32_t MemequalK(const void* m1, const void* m2) {
    const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
    const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
    return (m1_uc[0] == m2_uc[0]);
  }
};

template <> struct MemequalKImpl<2> {
  static int32_t MemequalK(const void* m1, const void* m2) {
    return ((*R_CAST(const uint16_t*, m1)) == (*R_CAST(const uint16_t*, m2)));
  }
};

template <> struct MemequalKImpl<3> {
  static int32_t MemequalK(const void* m1, const void* m2) {
    const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
    const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
    return
      ((*R_CAST(const uint16_t*, m1)) == (*R_CAST(const uint16_t*, m2))) &&
      (m1_uc[2] == m2_uc[2]);
  }
};

template <> struct MemequalKImpl<4> {
  static int32_t MemequalK(const void* m1, const void* m2) {
    return ((*R_CAST(const uint32_t*, m1)) == (*R_CAST(const uint32_t*, m2)));
  }
};

// 5..7: two overlapping 4-byte compares.
template <uint32_t N> struct MemequalKImpl<N, TRange<(5 <= N) && (N <= 7)> > {
  static int32_t MemequalK(const void* m1, const void* m2) {
    const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
    const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
    return
      ((*R_CAST(const uint32_t*, m1)) == (*R_CAST(const uint32_t*, m2))) &&
      ((*R_CAST(const uint32_t*, &(m1_uc[N - 4]))) == (*R_CAST(const uint32_t*, &(m2_uc[N - 4]))));
  }
};

template <> struct MemequalKImpl<8> {
  static int32_t MemequalK(const void* m1, const void* m2) {
    return ((*R_CAST(const uint64_t*, m1)) == (*R_CAST(const uint64_t*, m2)));
  }
};

// 9..15: two overlapping 8-byte compares.
template <uint32_t N> struct MemequalKImpl<N, TRange<(9 <= N) && (N <= 15)> > {
  static int32_t MemequalK(const void* m1, const void* m2) {
    const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
    const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
    return
      ((*R_CAST(const uint64_t*, m1)) == (*R_CAST(const uint64_t*, m2))) &&
      ((*R_CAST(const uint64_t*, &(m1_uc[N - 8]))) == (*R_CAST(const uint64_t*, &(m2_uc[N - 8]))));
  }
};

// 16: one SSE2 compare; all bytes equal iff the equality mask is 0xffff.
template <> struct MemequalKImpl<16> {
  static int32_t MemequalK(const void* m1, const void* m2) {
    const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1));
    const __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2));
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535);
  }
};
2618
// 17..24: one SSE2 compare plus a trailing (overlapping) 8-byte compare.
template <uint32_t N> struct MemequalKImpl<N, TRange<(17 <= N) && (N <= 24)> > {
  static int32_t MemequalK(const void* m1, const void* m2) {
    const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
    const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
    const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1));
    const __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2));
    return
      (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535) &&
      ((*R_CAST(const uint64_t*, &(m1_uc[N - 8]))) == (*R_CAST(const uint64_t*, &(m2_uc[N - 8]))));
  }
};

// 25..31: two (overlapping) SSE2 compares, short-circuiting on the first.
template <uint32_t N> struct MemequalKImpl<N, TRange<(25 <= N) && (N <= 31)> > {
  static int32_t MemequalK(const void* m1, const void* m2) {
    __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1));
    __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2));
    if (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) != 65535) {
      return 0;
    }
    const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
    const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
    v1 = _mm_loadu_si128(R_CAST(const __m128i*, &(m1_uc[N - 16])));
    v2 = _mm_loadu_si128(R_CAST(const __m128i*, &(m2_uc[N - 16])));
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535);
  }
};

// byte_ct must be a compile-time constant.
# define memequal_k(m1, m2, byte_ct) plink2::MemequalKImpl<byte_ct>::MemequalK(m1, m2)
2647
// Compile-time-constant-size memcpy template.  The generic case is plain
// memcpy(); specializations below cover sizes where common compilers emit
// suboptimal code.
template <uint32_t N, typename = TRange<true> > struct MemcpyKImpl {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    memcpy(dst, src, N);
  }
};
2653
// Patch a bunch of cases where some commonly-used gcc and clang versions
// generate suboptimal code.  (Since this code is shamelessly x86-specific, we
// don't worry about the formal undefinedness of unaligned pointer dereferences
// here.)
// (todo: check if/when this has been fixed, and remove this bloat once all
// production build machines have sufficiently new compilers.)
// Pattern: one power-of-2 store covering the bulk, plus one small (or
// overlapping) store for the tail.
template <> struct MemcpyKImpl<2> {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    *S_CAST(uint16_t*, dst) = *S_CAST(const uint16_t*, src);
  }
};

template <> struct MemcpyKImpl<3> {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    *S_CAST(uint16_t*, dst) = *S_CAST(const uint16_t*, src);
    dst_uc[2] = src_uc[2];
  }
};

template <> struct MemcpyKImpl<5> {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    *S_CAST(uint32_t*, dst) = *S_CAST(const uint32_t*, src);
    dst_uc[4] = src_uc[4];
  }
};

template <> struct MemcpyKImpl<6> {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    uint16_t* dst_u16 = S_CAST(uint16_t*, dst);
    const uint16_t* src_u16 = S_CAST(const uint16_t*, src);
    *S_CAST(uint32_t*, dst) = *S_CAST(const uint32_t*, src);
    dst_u16[2] = src_u16[2];
  }
};

// 7: two overlapping 4-byte copies (bytes 0..3 and 3..6).
template <> struct MemcpyKImpl<7> {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    *S_CAST(uint32_t*, dst) = *S_CAST(const uint32_t*, src);
    *R_CAST(uint32_t*, &(dst_uc[3])) = *R_CAST(const uint32_t*, &(src_uc[3]));
  }
};

template <> struct MemcpyKImpl<9> {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
    dst_uc[8] = src_uc[8];
  }
};

template <> struct MemcpyKImpl<10> {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    uint16_t* dst_u16 = S_CAST(uint16_t*, dst);
    const uint16_t* src_u16 = S_CAST(const uint16_t*, src);
    *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
    dst_u16[4] = src_u16[4];
  }
};

// 11..12: 8-byte copy plus an overlapping 4-byte copy ending at byte N.
template <uint32_t N> struct MemcpyKImpl<N, TRange<(11 <= N) && (N <= 12)> > {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
    *R_CAST(uint32_t*, &(dst_uc[N - 4])) = *R_CAST(const uint32_t*, &(src_uc[N - 4]));
  }
};

// 13..15: two overlapping 8-byte copies.
template <uint32_t N> struct MemcpyKImpl<N, TRange<(13 <= N) && (N <= 15)> > {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
    *R_CAST(uint64_t*, &(dst_uc[N - 8])) = *R_CAST(const uint64_t*, &(src_uc[N - 8]));
  }
};
2737
// 17+: one unaligned SSE2 copy for the first 16 bytes, plus a scalar or
// overlapping store for the tail.
template <> struct MemcpyKImpl<17> {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
    _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
    dst_uc[16] = src_uc[16];
  }
};

template <> struct MemcpyKImpl<18> {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    uint16_t* dst_u16 = S_CAST(uint16_t*, dst);
    const uint16_t* src_u16 = S_CAST(const uint16_t*, src);
    const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
    _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
    dst_u16[8] = src_u16[8];
  }
};

// 19..20: SSE2 copy plus an overlapping 4-byte copy ending at byte N.
template <uint32_t N> struct MemcpyKImpl<N, TRange<(19 <= N) && (N <= 20)> > {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
    _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
    *R_CAST(uint32_t*, &(dst_uc[N - 4])) = *R_CAST(const uint32_t*, &(src_uc[N - 4]));
  }
};

// 21..24: SSE2 copy plus an overlapping 8-byte copy.
template <uint32_t N> struct MemcpyKImpl<N, TRange<(21 <= N) && (N <= 24)> > {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
    _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
    *R_CAST(uint64_t*, &(dst_uc[N - 8])) = *R_CAST(const uint64_t*, &(src_uc[N - 8]));
  }
};

// 25..31: two overlapping SSE2 copies.
template <uint32_t N> struct MemcpyKImpl<N, TRange<(25 <= N) && (N <= 31)> > {
  static void MemcpyK(void* __restrict dst, const void* __restrict src) {
    unsigned char* dst_uc = S_CAST(unsigned char*, dst);
    const unsigned char* src_uc = S_CAST(const unsigned char*, src);
    const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, src));
    const __m128i v2 = _mm_loadu_si128(R_CAST(const __m128i*, &(src_uc[N - 16])));
    _mm_storeu_si128(S_CAST(__m128i*, dst), v1);
    _mm_storeu_si128(R_CAST(__m128i*, &(dst_uc[N - 16])), v2);
  }
};
2788
// Note that there's no difference between memcpy() and memcpy_k() for common
// 'well-behaved' sizes like 1, 4, 8, and 16.  It's the funny numbers in
// between, which often arise with constant strings, which this template is
// targeting.
# define memcpy_k(dst, src, ct) plink2::MemcpyKImpl<ct>::MemcpyK(dst, src)

// memcpy_k() which returns a char* just past the copied region.
template <uint32_t N> char* MemcpyaK(void* __restrict dst, const void* __restrict src) {
  MemcpyKImpl<N>::MemcpyK(dst, src);
  char* dst_c = S_CAST(char*, dst);
  return &(dst_c[N]);
}

# define memcpya_k(dst, src, ct) plink2::MemcpyaK<ct>(dst, src)
# define memcpyua_k(dst, src, ct) R_CAST(unsigned char*, plink2::MemcpyaK<ct>(dst, src))
2803
// "Overcopy" variant: for the 3/7/15-byte cases it rounds the copy up to the
// next power-of-2 width (writing past the nominal length), so the caller must
// guarantee the extra trailing bytes are safe to clobber.  All other sizes
// defer to MemcpyKImpl.
template <uint32_t N> struct MemcpyoKImpl {
  static void MemcpyoK(void* __restrict dst, const void* __restrict src) {
    MemcpyKImpl<N>::MemcpyK(dst, src);
  }
};

template <> struct MemcpyoKImpl<3> {
  static void MemcpyoK(void* __restrict dst, const void* __restrict src) {
    *S_CAST(uint32_t*, dst) = *S_CAST(const uint32_t*, src);
  }
};

template <> struct MemcpyoKImpl<7> {
  static void MemcpyoK(void* __restrict dst, const void* __restrict src) {
    *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
  }
};

template <> struct MemcpyoKImpl<15> {
  static void MemcpyoK(void* __restrict dst, const void* __restrict src) {
    const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
    _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
  }
};

// interestingly, __m256i copy does not seem to be better in 31 byte case

# define memcpyo_k(dst, src, ct) plink2::MemcpyoKImpl<ct>::MemcpyoK(dst, src)

// memcpyo_k() which returns a char* just past the nominal ct bytes.
template <uint32_t N> char* MemcpyaoK(void* __restrict dst, const void* __restrict src) {
  MemcpyoKImpl<N>::MemcpyoK(dst, src);
  char* dst_c = S_CAST(char*, dst);
  return &(dst_c[N]);
}

# define memcpyao_k(dst, src, ct) plink2::MemcpyaoK<ct>(dst, src)
# define memcpyuao_k(dst, src, ct) R_CAST(unsigned char*, plink2::MemcpyaoK<ct>(dst, src))
2841
# else  // !(defined(__LP64__) && defined(__cplusplus))

// C (or 32-bit) fallbacks: the compile-time-constant length offers no special
// advantage here, so the _k entry points just forward to the generic
// routines.
HEADER_INLINE int32_t memequal_k(const void* m1, const void* m2, uintptr_t ct) {
  return !memcmp(m1, m2, ct);
}

HEADER_INLINE void memcpy_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
  memcpy(dst, src, ct);
}

HEADER_INLINE char* memcpya_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
  return memcpya(dst, src, ct);
}

HEADER_INLINE unsigned char* memcpyua_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
  return memcpyua(dst, src, ct);
}

// No overcopy in the fallback: these write exactly ct bytes.
HEADER_INLINE void memcpyo_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
  memcpy(dst, src, ct);
}

HEADER_INLINE char* memcpyao_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
  return memcpya(dst, src, ct);
}

HEADER_INLINE unsigned char* memcpyuao_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
  return memcpyua(dst, src, ct);
}

#endif
2873
// memcmp() replacement with the same <0 / 0 / >0 contract.
#ifdef __LP64__
// This is also better than the June 2018 OS X/LLVM stock implementation,
// especially for small values of ct.
// (gcc 7.1 and clang 6.0.0 should have better stock implementations;
// re-benchmark this once Linux build machine is upgraded to Ubuntu 18.04.)
int32_t Memcmp(const void* m1, const void* m2, uintptr_t ct);
#else
HEADER_INLINE int32_t Memcmp(const void* m1, const void* m2, uintptr_t ct) {
  return memcmp(m1, m2, ct);
}
#endif
2885
2886
// now compiling with gcc >= 4.4 (or clang equivalent) on all platforms, so
// safe to use memset everywhere
// Zero-fill helpers; entry_ct counts elements, not bytes.
HEADER_INLINE void ZeroU32Arr(uintptr_t entry_ct, uint32_t* u32arr) {
  memset(u32arr, 0, entry_ct * sizeof(int32_t));
}

HEADER_INLINE void ZeroWArr(uintptr_t entry_ct, uintptr_t* warr) {
  memset(warr, 0, entry_ct * sizeof(intptr_t));
}

HEADER_INLINE void ZeroU64Arr(uintptr_t entry_ct, uint64_t* u64arr) {
  memset(u64arr, 0, entry_ct * sizeof(int64_t));
}

// pp points to an array of entry_ct pointers.
HEADER_INLINE void ZeroPtrArr(uintptr_t entry_ct, void* pp) {
  memset(pp, 0, entry_ct * sizeof(intptr_t));
}

HEADER_INLINE void ZeroHwArr(uintptr_t entry_ct, Halfword* hwarr) {
  memset(hwarr, 0, entry_ct * sizeof(Halfword));
}
2908
2909 HEADER_INLINE void SetAllWArr(uintptr_t entry_ct, uintptr_t* warr) {
2910 // todo: test this against vecset()
2911 for (uintptr_t idx = 0; idx != entry_ct; ++idx) {
2912 warr[idx] = ~k0LU;
2913 }
2914 }
2915
2916
// tried _bzhi_u64() in AVX2 case, it was actually worse on my Mac (more
// opaque to compiler?)
// todo: check gcc behavior since it may be different: see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82298 .
//
// Zeroes out all bits of ww at position idx and above.
// This is undefined if idx == kBitsPerWord.
HEADER_INLINE uintptr_t bzhi(uintptr_t ww, uint32_t idx) {
  return ww & ((k1LU << idx) - k1LU);
}

// Same effect as bzhi(), but the mask is built by right-shifting ~0 so that
// idx == kBitsPerWord works instead of idx == 0.
// This is undefined if idx == 0.
HEADER_INLINE uintptr_t bzhi_max(uintptr_t ww, uint32_t idx) {
  return ww & ((~k0LU) >> (kBitsPerWord - idx));
}
2931
// Don't bother defining blsr(), compiler should automatically use the
// instruction under -mbmi and regular code is more readable?  (again, should
// verify this is true for gcc)

// Minimum number of bytes needed to represent uii; uii must be nonzero (per
// the "Nz" in the name — bsru32 presumably has bsr semantics, undefined on 0).
HEADER_INLINE uint32_t BytesToRepresentNzU32(uint32_t uii) {
  return 1 + (bsru32(uii) / CHAR_BIT);
}
2939
// analogous to memset()
// Fills vec_ct vectors at target_vec (which must be vector-aligned) with
// word-value ww broadcast across each vector.
// this can be slightly slower if e.g. system supports AVX2 but non-AVX2 plink2
// build is in use; fine to pay that price given the small-array advantage for
// now.  Should revisit this after next build-machine Ubuntu upgrade, though.
HEADER_INLINE void vecset(void* target_vec, uintptr_t ww, uintptr_t vec_ct) {
  VecW* target_vec_iter = S_CAST(VecW*, target_vec);
#ifdef __LP64__
  const VecW payload = VCONST_W(ww);
  for (uintptr_t vec_idx = 0; vec_idx != vec_ct; ++vec_idx) {
    *target_vec_iter++ = payload;
  }
#else
  // 32-bit builds: VecW is word-sized, so no broadcast is needed.
  for (uintptr_t vec_idx = 0; vec_idx != vec_ct; ++vec_idx) {
    *target_vec_iter++ = ww;
  }
#endif
}
2957
2958 // todo: make sure these are efficient for small ct
2959 HEADER_INLINE void u16set(void* dst, uint16_t usii, uintptr_t ct) {
2960 uint16_t* dst_u16 = S_CAST(uint16_t*, dst);
2961 for (uintptr_t ulii = 0; ulii != ct; ++ulii) {
2962 dst_u16[ulii] = usii;
2963 }
2964 }
2965
2966 HEADER_INLINE char* u16setsa(char* dst, uint16_t usii, uintptr_t ct) {
2967 u16set(dst, usii, ct);
2968 return &(dst[ct * 2]);
2969 }
2970
// Clears the lowest ct set bits of ulii.
HEADER_INLINE uintptr_t ClearBottomSetBits(uint32_t ct, uintptr_t ulii) {
#ifdef USE_AVX2
  // PDEP deposits ((~0) << ct) into the positions of ulii's set bits, so
  // exactly the bottom ct set bits receive zeroes.
  return _pdep_u64((~k0LU) << ct, ulii);
#else
  // ulii &= ulii - 1 clears the lowest remaining set bit.
  for (uint32_t uii = 0; uii != ct; ++uii) {
    ulii &= ulii - 1;
  }
  return ulii;
#endif
}
2981
// Position of the (bit_idx+1)th set bit of ulii, i.e. single-word 0-based
// rank-select.  (Result presumably undefined when ulii has <= bit_idx set
// bits, since ctzw would then see a zero argument.)
HEADER_INLINE uint32_t WordBitIdxToUidx(uintptr_t ulii, uint32_t bit_idx) {
  return ctzw(ClearBottomSetBits(bit_idx, ulii));
}
2985
// Packed-nybble (4-bit field) helpers: 2 nybbles per byte.
CONSTI32(kNybblesPerWord, 2 * kBytesPerWord);
CONSTI32(kNybblesPerCacheline, 2 * kCacheline);

// Bytes needed to hold val nybbles, rounding up.
HEADER_CINLINE uintptr_t NybbleCtToByteCt(uintptr_t val) {
  return DivUp(val, 2);
}

// Words needed to hold val nybbles, rounding up.
HEADER_CINLINE uintptr_t NybbleCtToWordCt(uintptr_t val) {
  return DivUp(val, kNybblesPerWord);
}

// Extracts the idx-th 4-bit entry from a packed nybble array.
HEADER_INLINE uintptr_t GetNybbleArrEntry(const uintptr_t* nybblearr, uint32_t idx) {
  return (nybblearr[idx / kBitsPerWordD4] >> (4 * (idx % kBitsPerWordD4))) & 15;
}
3000
// Returns zero when ww has no zero bytes, and a word where the lowest set bit
// is at position 8x + 7 when the first zero byte is [8x .. 8x+7].
// (Only the lowest set bit of the result is meaningful; higher bits may be
// garbage.)
HEADER_INLINE uintptr_t DetectFirstZeroByte(uintptr_t ww) {
  return (ww - kMask0101) & (~ww) & (kMask0101 * 0x80);
}

// From TAOCP 4a, 7.1.3, (91).
// Position 8x + 7 is always set iff byte x is zero.  All other bits are always
// zero.
HEADER_INLINE uintptr_t DetectAllZeroBytes(uintptr_t ww) {
  return (kMask0101 * 0x80) & (~(ww | ((ww | (kMask0101 * 0x80)) - kMask0101)));
}

// Nybble analogue of DetectAllZeroBytes(): bit 4x + 3 is set iff nybble x is
// zero; all other bits are zero.
HEADER_INLINE uintptr_t DetectAllZeroNybbles(uintptr_t ww) {
  return (kMask1111 * 8) & (~(ww | ((ww | (kMask1111 * 8)) - kMask1111)));
}
3017
3018 // Flagset conventions:
3019 // * Each 32-bit and 64-bit flagset has its own type, which is guaranteed to be
3020 // the appropriate width. (Todo: verify that bit 31 works properly in 32-bit
3021 // case.)
3022 // * Constant flag names start with "kf[CamelCase description]", followed by a
3023 // description that shouldn't suck too badly. The zero flagset is always
3024 // named kf[CamelCase description]0.
3025 // * The type name is always of the form [snake_case description]_flags_t.
3026 // * To gain the desired level of type-checking under C++11 without pointless
3027 // verbosity, &, |, ^, ~, &=, |=, and ^= operations are defined; [my_flags_t
3028 // variable] |= [another my_flags_t variable] & [a my_flags_t constant] works
3029 // without an explicit cast. (Defining "struct my_flags_t" separately from
3030 // the enum global-scope-constants container is necessary to make |= work
3031 // without a cast. inline is needed due to duplicate operator definitions
3032 // across multiple files.)
3033 // * To slightly reduce the chance of breakage under C99/C++03, the enum is
3034 // nameless; the flagset type is just a uint32_t/uint64_t alias. This is
3035 // because the C99 and C++03 specs do not provide enough control over the
3036 // enum base type to make it safe for the enum to serve as the flagset type.
3037 // * Implicit conversion to int is not prevented for now, since I'm trying to
3038 // keep PglErr-style code duplication to a minimum.
3039 #if __cplusplus >= 201103L
3040
// could avoid the typedef here, but that leads to a bit more verbosity.
// C++11 version: the flag constants live in an enum with a fixed uint32_t
// underlying type, so each flagset is a distinct type and accidentally mixing
// different flagsets is a compile-time error.
# define FLAGSET_DEF_START() typedef enum : uint32_t {
// FLAGSET_DEF_END(tname) closes the enum (named tname_PLINK2_BASE_DO_NOT_USE__),
// defines |, &, ^, ~ on the enum type, then defines "struct tname", which
// converts to/from the enum so that |=, &=, ^= also work without casts.
# define FLAGSET_DEF_END(tname) } tname ## _PLINK2_BASE_DO_NOT_USE__ ; \
\
inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator|(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
  return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint32_t>(aa) | static_cast<uint32_t>(bb)); \
} \
\
inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator&(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
  return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint32_t>(aa) & static_cast<uint32_t>(bb)); \
} \
\
inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator^(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
  return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint32_t>(aa) ^ static_cast<uint32_t>(bb)); \
} \
\
inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator~(tname ## _PLINK2_BASE_DO_NOT_USE__ aa) { \
  return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(~static_cast<uint32_t>(aa)); \
} \
\
struct tname { \
  tname() {} /* default ctor leaves value_ uninitialized, like a raw integer */ \
\
  tname(const tname& source) : value_(source.value_) {} \
\
  tname(const tname ## _PLINK2_BASE_DO_NOT_USE__ source) : value_(static_cast<uint32_t>(source)) {} \
\
  explicit tname(uint32_t source) : value_(source) {} \
\
  operator tname ## _PLINK2_BASE_DO_NOT_USE__() const { \
    return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(value_); \
  } \
\
  tname& operator|=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
    value_ |= rhs; \
    return *this; \
  } \
\
  tname& operator&=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
    value_ &= rhs; \
    return *this; \
  } \
\
  tname& operator^=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
    value_ ^= rhs; \
    return *this; \
  } \
\
  tname& operator=(const tname& rhs) = default; \
\
 private: \
  uint32_t value_; \
}
3094
// 64-bit analogue of FLAGSET_DEF_START/FLAGSET_DEF_END; underlying type is
// uint64_t, everything else is identical in structure.
# define FLAGSET64_DEF_START() typedef enum : uint64_t {
# define FLAGSET64_DEF_END(tname) } tname ## _PLINK2_BASE_DO_NOT_USE__ ; \
\
inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator|(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
  return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint64_t>(aa) | static_cast<uint64_t>(bb)); \
} \
\
inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator&(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
  return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint64_t>(aa) & static_cast<uint64_t>(bb)); \
} \
\
inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator^(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
  return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint64_t>(aa) ^ static_cast<uint64_t>(bb)); \
} \
\
inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator~(tname ## _PLINK2_BASE_DO_NOT_USE__ aa) { \
  return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(~static_cast<uint64_t>(aa)); \
} \
\
struct tname { \
  tname() {} /* default ctor leaves value_ uninitialized, like a raw integer */ \
\
  tname(const tname& source) : value_(source.value_) {} \
\
  tname(const tname ## _PLINK2_BASE_DO_NOT_USE__ source) : value_(static_cast<uint64_t>(source)) {} \
\
  explicit tname(uint64_t source) : value_(source) {} \
\
  operator tname ## _PLINK2_BASE_DO_NOT_USE__() const { \
    return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(value_); \
  } \
\
  tname& operator|=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
    value_ |= rhs; \
    return *this; \
  } \
\
  tname& operator&=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
    value_ &= rhs; \
    return *this; \
  } \
\
  tname& operator^=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
    value_ ^= rhs; \
    return *this; \
  } \
\
  tname& operator=(const tname& rhs) = default; \
\
 private: \
  uint64_t value_; \
}
3147
// Enum with a guaranteed-uint32_t representation; per the "U31" name, values
// are presumably expected to stay below 2^31 -- confirm at use sites.
# define ENUM_U31_DEF_START() typedef enum : uint32_t {
# define ENUM_U31_DEF_END(tname) } tname
3150
3151 #else // !__cplusplus >= 201103L
3152
// C99/C++03 fallback: the enum base type can't be controlled, so the flag
// constants live in a nameless enum and the flagset type itself is a plain
// uint32_t alias (no operator type-checking).
# define FLAGSET_DEF_START() enum {
# define FLAGSET_DEF_END(tname) } ; \
  typedef uint32_t tname
3156
// don't use a nameless enum here, since we want to be able to static_assert
// the enum size.
// best to artificially add an element to the end for now to force width to
// 64-bit, otherwise gcc actually shrinks it even when the constants are
// defined with LLU.
// (The ALL_64_SET sentinel uses the same "tname ## _PLINK2_BASE_DO_NOT_USE__"
// separator convention as every other generated identifier in this header;
// it previously pasted tname directly against "PLINK2", which was
// inconsistent with the rest of the flagset machinery.)
// NOTE(review): static_assert here requires C11/C++11 or a project-provided
// fallback macro -- presumably defined earlier in this file; confirm.
# define FLAGSET64_DEF_START() typedef enum {
# define FLAGSET64_DEF_END(tname) , \
  tname ## _PLINK2_BASE_DO_NOT_USE__ALL_64_SET__ = ~(0LLU) } tname ## _PLINK2_BASE_DO_NOT_USE__ ; \
  static_assert(sizeof(tname ## _PLINK2_BASE_DO_NOT_USE__) == 8, "64-bit flagset constants are not actually uint64_ts."); \
  typedef uint64_t tname
3167
// C99/C++03 fallback for ENUM_U31: the public type is a uint32_t alias, since
// the enum's width can't be pinned down pre-C11/C++11.
# define ENUM_U31_DEF_START() typedef enum {
# define ENUM_U31_DEF_END(tname) } tname ## _PLINK2_BASE_DO_NOT_USE__ ; \
  typedef uint32_t tname
3171
3172 #endif
3173
3174 // This supports private struct members in code that still compiles as C.
3175 //
3176 // Internal code should access these members with GET_PRIVATE(), and define a
3177 // pair of public C++-only GET_PRIVATE_...() member functions (one const and
3178 // one non-const) that each return a reference to the member; see plink2_thread
3179 // for examples. In addition, .cc API code should define a small number of
3180 // standard GET_PRIVATE() accessors at the top of the file, and practically all
3181 // private-member access should occur through those file-scope accessors; this
3182 // keeps the surface area under control.
3183 //
3184 // (Tried to define a DECLARE_PRIVATE(typ, member) macro as well, but didn't
3185 // get that to work. This is already pretty painless if it's restricted to key
3186 // APIs, though.)
3187 //
3188 // probable todo: see if the intended effect can be achieved in a simpler
3189 // manner with well-chosen explicit and implicit type conversions.
#ifdef __cplusplus
// C++ build: the member is private; route through the struct's
// GET_PRIVATE_<member>() accessor pair (see block comment above).
# define GET_PRIVATE(par, member) (par).GET_PRIVATE_ ## member()
#else
// C build: the member is an ordinary public field; access it directly.
# define GET_PRIVATE(par, member) (par).member
#endif
3195
3196 #ifdef __cplusplus
3197 } // namespace plink2
3198 #endif
3199
3200 #endif // __PLINK2_BASE_H__
3201