1 #ifndef __PLINK2_BASE_H__
2 #define __PLINK2_BASE_H__
3 
4 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
5 // Christopher Chang.
6 //
7 // This library is free software: you can redistribute it and/or modify it
8 // under the terms of the GNU Lesser General Public License as published by the
9 // Free Software Foundation; either version 3 of the License, or (at your
10 // option) any later version.
11 //
12 // This library is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
15 // for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public License
18 // along with this library.  If not, see <http://www.gnu.org/licenses/>.
19 
20 
21 // Low-level C99/C++03/C++11 library covering basic I/O, SWAR/SIMD, and
22 // Windows/OS X/Linux portability.  We try to benefit from as much C++ type
23 // safety as we can without either breaking compatibility with C-only codebases
24 // or making extension of pgenlib/plink2 code more difficult than the old
25 // type-unsafe style.
26 //
27 // Parameter conventions:
28 // - Input parameters, then in/out, then pure outputs, then temporary buffers.
29 //   Reference-style input parameters tend to go in the very front, to make it
30 //   more obvious that they aren't in/out.
31 // - "bitarr" indicates a word-aligned, packed array of bits, while "bitvec"
32 //   indicates vector-alignment in 64-bit builds.  ("vector" always means SIMD
33 //   inputs/outputs here; C++ std::vector is not used in this codebase.)
34 // - Most pointers are stationary; moving pointers have an _iter suffix.
35 //
36 // Type-choice guidelines:
37 // - Integers are unsigned by default, signed only when necessary.
38 //   It's necessary to choose one or the other to avoid drowning in a sea of
39 //   casts and unexpected behavior.  Each choice has its own surprising
40 //   pitfalls that the developer had better be aware of; and I definitely do
41 //   not take the position that unsigned is the better default *in all C and/or
42 //   C++ code*.  However, for this codebase, the extremely high frequency of
43 //   bitwise operations makes unsigned-default the only sane choice.
44 //   Some consequences of this choice:
45 //   - All pointer differences that are part of a larger arithmetic or
46 //     comparison expression are explicitly casted to uintptr_t.
47 //   - Since uint64_t -> double conversion is frequently slower than int64_t ->
48 //     double conversion, u63tod() should be used when the integer is known to
49 //     be less than 2^63.  If we also know it's less than 2^31, u31tod() can
50 //     provide a performance improvement on Win32.
51 // - Integers that can be >= 2^32 in some of the largest existing datasets, but
52 //   are usually smaller, should be defined as uintptr_t, to strike a good
53 //   balance between 32-bit performance and 64-bit scaling.  Exhaustive
54 //   overflow checking in the 32-bit build is a non-goal; but I do aim for very
55 //   high statistical reliability, by inserting checks whenever it occurs to me
56 //   that overflow is especially likely (e.g. when multiplying two potentially
57 //   large 32-bit numbers).
58 // - Bitarrays and 'nyparrays' (packed arrays of 2-bit elements, such as a row
59 //   of a plink 1.x .bed file) are usually uintptr_t*, to encourage
60 //   word-at-a-time iteration without requiring vector-alignment.  Quite a few
61 //   low-level library functions cast them to VecW*.  As mentioned above, the
62 //   affected function parameter names must end in 'vec' when this creates an
63 //   alignment requirement.
64 // - A buffer/iterator expected to contain only UTF-8 text should be char*.
65 //   unsigned char* should be reserved for byte-array buffers and iterators
66 //   which are expected to interact with some non-text bytes, and generic
67 //   memory-location pointers which will be subject to pointer arithmetic.
68 //   (Note that this creates some clutter in low-level parsing code: since the
69 //   signedness of char is platform-dependent, it becomes necessary to use e.g.
70 //   ctou32() a fair bit.)
71 // - unsigned char is an acceptable argument type for functions intended to
72 //   process a single text character, thanks to implicit char -> unsigned char
73 //   conversion; it's just unsigned char* that should be avoided.
74 // - void* return values should be restricted to generic pointers which are
75 //   *not* expected to be subject to pointer arithmetic.  void* as input
76 //   parameter type should only be used when there are at least two equally
77 //   valid input types, NOT counting VecW*.
78 
79 
80 #if (__GNUC__ < 4)
81 // may eventually add MSVC support to gain access to MKL on Windows, but can't
82 // justify doing that before all major features are implemented.
83 #  error "gcc 4.x+ or clang equivalent required."
84 #endif
85 
86 // The -Wshorten-64-to-32 diagnostic forces the code to be cluttered with
87 // meaningless uintptr_t -> uint32_t static casts (number known to be < 2^32,
88 // just stored in a uintptr_t because there's no speed penalty and we generally
89 // want to think in terms of word-based operations).  The code is more readable
90 // if S_CAST(uint32_t, <potentially wider value>) is reserved for situations
91 // where a higher bit may actually be set.  This pragma can always be commented
92 // out on the few occasions where inappropriate silent truncation is suspected.
93 #ifdef __clang__
94 #  pragma clang diagnostic ignored "-Wshorten-64-to-32"
95 #endif
96 
97 // 10000 * major + 100 * minor + patch
98 // Exception to CONSTI32, since we want the preprocessor to have access
99 // to this value.  Named with all caps as a consequence.
100 #define PLINK2_BASE_VERNUM 702
101 
102 
103 #define _FILE_OFFSET_BITS 64
104 
105 #include <stdio.h>
106 #include <stdlib.h>
107 #include <string.h>
108 #include <stddef.h>  // offsetof()
109 #include <stdint.h>
110 #ifndef __STDC_FORMAT_MACROS
111 #  define __STDC_FORMAT_MACROS 1
112 #endif
113 #include <inttypes.h>
114 #include <limits.h>  // CHAR_BIT, PATH_MAX
115 
116 // #define NDEBUG
117 #include <assert.h>
118 
119 #ifdef _WIN32
120   // needed for EnterCriticalSection, etc.
121 #  ifndef _WIN64
122 #    define WINVER 0x0501
123 #  else
124 #    define __LP64__
125 #  endif
126 #  ifndef WIN32_LEAN_AND_MEAN
127 #    define WIN32_LEAN_AND_MEAN
128 #  endif
129 #  include <windows.h>
130 #endif
131 
132 #if __cplusplus >= 201103L
133 #  include <array>
134 #endif
135 
136 #ifdef __LP64__
137 #  ifndef __SSE2__
138 // possible todo: remove this requirement, the 32-bit VecW-using code does most
139 // of what we need.  But little point in doing this before we have e.g. an
140 // ARM-based machine to test with that scientists would plausibly want to run
141 // plink2 on.
142 #    error "64-bit builds currently require SSE2.  Try producing a 32-bit build instead."
143 #  endif
144 #  include <emmintrin.h>
145 #  ifdef __SSE4_2__
146 #    define USE_SSE42
147 #    include <smmintrin.h>
148 #    ifdef __AVX2__
149 #      include <immintrin.h>
150 #      ifndef __BMI__
151 #        error "AVX2 builds require -mbmi as well."
152 #      endif
153 #      ifndef __BMI2__
154 #        error "AVX2 builds require -mbmi2 as well."
155 #      endif
156 #      ifndef __LZCNT__
157 #        error "AVX2 builds require -mlzcnt as well."
158 #      endif
159 #      define USE_AVX2
160 #    endif
161 #  endif
162 #  define ALIGNV16 __attribute__ ((aligned (16)))
163 #else
164 #  define ALIGNV16
165 #endif
166 
167 // done with #includes, can start C++ namespace...
168 #ifdef __cplusplus
169 namespace plink2 {
170 #endif
171 
172 // ...though a bunch of symbols remain to be #defined; try to reduce the number
173 // over time.
174 
175 #ifndef UINT32_MAX
176   // can theoretically be undefined in C++03
177 #  define UINT32_MAX 0xffffffffU
178 #endif
179 
180 #define UINT32_MAXM1 0xfffffffeU
181 
182 #ifdef __cplusplus
183 #  define HEADER_INLINE inline
184 // Previously went on a wild constexpr spree, but now these are mostly unused.
185 // Reserve for cases where (i) there's a clear constant-initialization use case
186 // for an imaginable downstream program (I'm looking at you, DivUp() and
187 // RoundUpPow2()...), or (ii) it allows a useful static_assert to be inserted
188 // for a hardcoded constant.
189 #  if __cplusplus >= 201103L
190 #    define HEADER_CINLINE constexpr
191 #    define CSINLINE static constexpr
192 #    if __cplusplus > 201103L
193 #      define HEADER_CINLINE2 constexpr
194 #      define CSINLINE2 static constexpr
195 #    else
196 #      define HEADER_CINLINE2 inline
197 #      define CSINLINE2 static inline
198 #    endif
199 #  else
200 #    define HEADER_CINLINE inline
201 #    define HEADER_CINLINE2 inline
202 #    define CSINLINE static inline
203 #    define CSINLINE2 static inline
204 #  endif
205 #  if __cplusplus <= 199711L
206     // this may be defined anyway, at least on OS X
207 #    ifndef static_assert
208       // todo: check other cases
209 #      define static_assert(cond, msg)
210 #    endif
211 #  endif
212 #else
213 #  define HEADER_INLINE static inline
214 #  define HEADER_CINLINE static inline
215 #  define HEADER_CINLINE2 static inline
216 #  define CSINLINE static inline
217 #  define CSINLINE2 static inline
218   // _Static_assert() should work in gcc 4.6+
219 #  if (__GNUC__ == 4) && (__GNUC_MINOR__ < 6)
220 #    if defined(__clang__) && defined(__has_feature) && defined(__has_extension)
221 #      if __has_feature(c_static_assert) || __has_extension(c_static_assert)
222 #        define static_assert _Static_assert
223 #      else
224 #        define static_assert(cond, msg)
225 #      endif
226 #    else
227 #      define static_assert(cond, msg)
228 #    endif
229 #  else
230 #    define static_assert _Static_assert
231 #  endif
232 #endif
233 
234 #define __maybe_unused __attribute__((unused))
235 
236 // Rule of thumb: Use these macros if, and only if, the condition would always
237 // trigger exit-from-program.  As a side effect, this makes it more
238 // straightforward, if still tedious, to make global changes to error-handling
239 // strategy (always dump backtrace and exit immediately?), though provision
240 // must still be made for sometimes-error-sometimes-not return paths which
241 // don't get an unlikely annotation.
242 #ifndef likely
243 #  define likely(expr) __builtin_expect(!!(expr), 1)
244 #  define unlikely(expr) __builtin_expect(!!(expr), 0)
245 #endif
246 
247 #ifdef __cplusplus
248 #  define K_CAST(type, val) (const_cast<type>(val))
249 #  define R_CAST(type, val) (reinterpret_cast<type>(val))
250 #  define S_CAST(type, val) (static_cast<type>(val))
251 #else
252 #  define K_CAST(type, val) ((type)(val))
253 #  define R_CAST(type, val) ((type)(val))
254 #  define S_CAST(type, val) ((type)(val))
255 #endif
256 
257 // (from Linux kernel)
258 // container_of - cast a member of a structure out to the containing structure
259 // @ptr: the pointer to the member.
260 // @type: the type of the container struct this is embedded in.
261 // @member: the name of the member within the struct.
262 #define container_of(ptr, type, member) \
263   (R_CAST(type*, R_CAST(char*, ptr) - offsetof(type, member)))
264 
265 // original macro doesn't work in C++ when e.g. ptr is a const char*, and the
266 // quick workaround of casting away the const is unsafe.
267 #define const_container_of(ptr, type, member) \
268   (R_CAST(const type*, R_CAST(const char*, ptr) - offsetof(type, member)))
269 
u31tod(uint32_t uii)270 HEADER_INLINE double u31tod(uint32_t uii) {
271   const int32_t ii = uii;
272   assert(ii >= 0);
273   return S_CAST(double, ii);
274 }
275 
swtod(intptr_t lii)276 HEADER_INLINE double swtod(intptr_t lii) {
277   return S_CAST(double, lii);
278 }
279 
u63tod(uint64_t ullii)280 HEADER_INLINE double u63tod(uint64_t ullii) {
281   const int64_t llii = ullii;
282   assert(llii >= 0);
283   return S_CAST(double, llii);
284 }
285 
u31tof(uint32_t uii)286 HEADER_INLINE float u31tof(uint32_t uii) {
287   const int32_t ii = uii;
288   assert(ii >= 0);
289   return S_CAST(float, ii);
290 }
291 
ctou32(char cc)292 HEADER_INLINE uint32_t ctou32(char cc) {
293   return S_CAST(unsigned char, cc);
294 }
295 
ctow(char cc)296 HEADER_INLINE uintptr_t ctow(char cc) {
297   return S_CAST(unsigned char, cc);
298 }
299 
ctou64(char cc)300 HEADER_INLINE uint64_t ctou64(char cc) {
301   return S_CAST(unsigned char, cc);
302 }
303 
304 // Error return types.  All of these evaluate to true on error and false on
305 // success, but otherwise they have slightly different semantics:
306 // * PglErr is the general-purpose enum.  Unlike an enum, implicit conversion
307 //   *to* int, not just from int, is prevented by the C++11 compiler (and the
308 //   C++11-compiler-validated code still works under C99).  (To achieve this
309 //   additional safety, we engage in a bit of code duplication which would be
310 //   unreasonable for flagsets.)
311 //   (Previously, explicit cast to uint32_t, but not int32_t, was supported, to
312 //   reflect the fact that all error codes are positive.  This was deemed
313 //   silly.)
314 // * BoolErr allows implicit conversion from int, but conversion back to
315 //   uint32_t requires an explicit cast.  (It should always be 0/1-valued, but
316 //   this isn't enforced by the compiler.)
317 // * IntErr allows implicit conversion from int, but conversion back to
318 //   int32_t requires an explicit cast.  It mainly serves as a holding pen for
319 //   C standard library error return values, which can be negative.
320 #if __cplusplus >= 201103L
321 struct PglErr {
322   enum class ec
323 #else
324 typedef enum
325 #endif
326   {
327   kPglRetSuccess,
328   kPglRetSkipped,
329   kPglRetNomem,
330   kPglRetOpenFail,
331   kPglRetReadFail,
332   kPglRetWriteFail,
333   // MalformedInput should be returned on low-level file format violations,
334   // while InconsistentInput should be returned for higher-level logical
335   // problems like mismatched files (generally solvable by fixing the command
336   // line), and DegenerateData for properly-formatted-and-matched files that
337   // yields degenerate computational results due to e.g. divide by zero or
338   // insufficient rank.
339   kPglRetMalformedInput,
340   kPglRetInconsistentInput,
341   kPglRetInvalidCmdline,
342   kPglRetThreadCreateFail,
343   kPglRetNetworkFail,
344   kPglRetVarRecordTooLarge,
345   kPglRetUnsupportedInstructions,
346   kPglRetDegenerateData,
347   kPglRetDecompressFail, // also distinguish this from MalformedInput
348   kPglRetRewindFail,
349   kPglRetSampleMajorBed = 32,
350   kPglRetNomemCustomMsg = 59,
351   kPglRetInternalError = 60,
352   kPglRetWarningErrcode = 61,
353   kPglRetImproperFunctionCall = 62,
354   kPglRetNotYetSupported = 63,
355 
356   // These are only for internal use.  If any of these reach the top level
357   // instead of being handled or converted to another error code, that's a bug,
358   // and plink2 prints a message to that effect.
359   kPglRetHelp = 125,
360   kPglRetLongLine = 126,
361   kPglRetEof = 127}
362 #if __cplusplus >= 201103L
363   ;
364 
PglErrPglErr365   PglErr() {}
366 
PglErrPglErr367   PglErr(const PglErr& source) : value_(source.value_) {}
368 
PglErrPglErr369   PglErr(ec source) : value_(source) {}
370 
371   // Allow explicit conversion from uint64_t, and NOT uint32_t, to this error
372   // type, to support reproducible multithreaded error reporting (where
373   // multiple threads may atomically attempt to modify a single uint64_t with
374   // the error code in the low 32 bits and a priority number in the high bits).
PglErrPglErr375   explicit PglErr(uint64_t source) : value_(static_cast<ec>(source)) {}
376 
377   PglErr& operator=(const PglErr&) = default;
378 
ecPglErr379   operator ec() const {
380     return value_;
381   }
382 
uint32_tPglErr383   explicit operator uint32_t() const {
384     return static_cast<uint32_t>(value_);
385   }
386 
int32_tPglErr387   explicit operator int32_t() const {
388     return static_cast<int32_t>(value_);
389   }
390 
391   explicit operator bool() const {
392     return (static_cast<uint32_t>(value_) != 0);
393   }
394 
395 private:
396   ec value_;
397 };
398 
399 const PglErr kPglRetSuccess = PglErr::ec::kPglRetSuccess;
400 const PglErr kPglRetSkipped = PglErr::ec::kPglRetSkipped;
401 const PglErr kPglRetNomem = PglErr::ec::kPglRetNomem;
402 const PglErr kPglRetOpenFail = PglErr::ec::kPglRetOpenFail;
403 const PglErr kPglRetReadFail = PglErr::ec::kPglRetReadFail;
404 const PglErr kPglRetWriteFail = PglErr::ec::kPglRetWriteFail;
405 const PglErr kPglRetMalformedInput = PglErr::ec::kPglRetMalformedInput;
406 const PglErr kPglRetInconsistentInput = PglErr::ec::kPglRetInconsistentInput;
407 const PglErr kPglRetInvalidCmdline = PglErr::ec::kPglRetInvalidCmdline;
408 const PglErr kPglRetHelp = PglErr::ec::kPglRetHelp;
409 const PglErr kPglRetThreadCreateFail = PglErr::ec::kPglRetThreadCreateFail;
410 const PglErr kPglRetNetworkFail = PglErr::ec::kPglRetNetworkFail;
411 const PglErr kPglRetVarRecordTooLarge = PglErr::ec::kPglRetVarRecordTooLarge;
412 const PglErr kPglRetUnsupportedInstructions = PglErr::ec::kPglRetUnsupportedInstructions;
413 const PglErr kPglRetDegenerateData = PglErr::ec::kPglRetDegenerateData;
414 const PglErr kPglRetDecompressFail = PglErr::ec::kPglRetDecompressFail;
415 const PglErr kPglRetRewindFail = PglErr::ec::kPglRetRewindFail;
416 const PglErr kPglRetSampleMajorBed = PglErr::ec::kPglRetSampleMajorBed;
417 const PglErr kPglRetWarningErrcode = PglErr::ec::kPglRetWarningErrcode;
418 const PglErr kPglRetNomemCustomMsg = PglErr::ec::kPglRetNomemCustomMsg;
419 const PglErr kPglRetInternalError = PglErr::ec::kPglRetInternalError;
420 const PglErr kPglRetImproperFunctionCall = PglErr::ec::kPglRetImproperFunctionCall;
421 const PglErr kPglRetNotYetSupported = PglErr::ec::kPglRetNotYetSupported;
422 const PglErr kPglRetLongLine = PglErr::ec::kPglRetLongLine;
423 const PglErr kPglRetEof = PglErr::ec::kPglRetEof;
424 #else
425   PglErr;
426 #endif
427 
428 #if __cplusplus >= 201103L
429 // allow efficient arithmetic on these, but force them to require explicit
430 // int32_t/uint32_t casts; only permit implicit assignment from
431 // int32_t/uint32_t by default.
432 // built-in bool type does too many things we don't want...
433 
434 // expected to be integer-valued, but not necessarily 0/1 or positive
435 struct IntErr {
IntErrIntErr436   IntErr() {}
437 
IntErrIntErr438   IntErr(int32_t source) : value_(source) {}
439 
int32_tIntErr440   explicit operator int32_t() const {
441     return value_;
442   }
443 
444   explicit operator bool() const {
445     return (value_ != 0);
446   }
447 
448 private:
449   int32_t value_;
450 };
451 
452 // expected to be 0/1-valued
453 struct BoolErr {
BoolErrBoolErr454   BoolErr() {}
455 
BoolErrBoolErr456   BoolErr(uint32_t source) : value_(source) {}
457 
uint32_tBoolErr458   explicit operator uint32_t() const {
459     return value_;
460   }
461 
462   explicit operator bool() const {
463     return (value_ != 0);
464   }
465 
466 private:
467   uint32_t value_;
468 };
469 #else
470 typedef int32_t IntErr;
471 typedef uint32_t BoolErr;
472 #endif
473 
474 // make this work on 32-bit as well as 64-bit systems, across
475 // Windows/OS X/Linux
476 // (todo: clean this up a bit.  it's inherently a baling-wire-and-duct-tape
477 // sort of thing, though...)
478 #ifdef _WIN32
479   // must compile with -std=gnu++11, not c++11, on 32-bit Windows since
480   // otherwise fseeko64 not defined...
481 #  define fseeko fseeko64
482 #  define ftello ftello64
483 #  define FOPEN_RB "rb"
484 #  define FOPEN_WB "wb"
485 #  define FOPEN_AB "ab"
486 #  define ferror_unlocked ferror
487 #  define feof_unlocked feof
488 #  ifdef __LP64__
489 #    define getc_unlocked _fgetc_nolock
490 #    define putc_unlocked _fputc_nolock
491     // todo: find mingw-w64 build which properly links _fread_nolock, and
492     // conditional-compile
493 #    define fread_unlocked fread
494 #    define fwrite_unlocked fwrite
495 #  else
496 #    define getc_unlocked getc
497 #    define putc_unlocked putc
498 #    define fread_unlocked fread
499 #    define fwrite_unlocked fwrite
500 #  endif
501 #  if __cplusplus < 201103L
502 #    define uint64_t unsigned long long
503 #    define int64_t long long
504 #  endif
505 #else  // Linux or OS X
506 #  define FOPEN_RB "r"
507 #  define FOPEN_WB "w"
508 #  define FOPEN_AB "a"
509 #  if defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__)
510 #    define fread_unlocked fread
511 #    define fwrite_unlocked fwrite
512 #  endif
513 #  if defined(__NetBSD__)
514 #    define ferror_unlocked ferror
515 #    define feof_unlocked feof
516 #  endif
517 #endif
518 
519 #ifdef _WIN32
520 #  undef PRId64
521 #  undef PRIu64
522 #  define PRId64 "I64d"
523 #  define PRIu64 "I64u"
524 #else
525 #  ifdef __cplusplus
526 #    ifndef PRId64
527 #      define PRId64 "lld"
528 #    endif
529 #  endif
530 #endif
531 
532 // These are useful for defending against base-pointer integer overflow on bad
533 // input.
PtrAddCk(const unsigned char * end,intptr_t incr,const unsigned char ** basep)534 HEADER_INLINE BoolErr PtrAddCk(const unsigned char* end, intptr_t incr, const unsigned char** basep) {
535   *basep += incr;
536   return unlikely((end - (*basep)) < 0);
537 }
538 
539 // 'W' for writable
PtrWSubCk(unsigned char * base,intptr_t decr,unsigned char ** endp)540 HEADER_INLINE BoolErr PtrWSubCk(unsigned char* base, intptr_t decr, unsigned char** endp) {
541   *endp -= decr;
542   return unlikely(((*endp) - base) < 0);
543 }
544 
PtrCheck(const void * end,const void * base,intptr_t req)545 HEADER_INLINE BoolErr PtrCheck(const void* end, const void* base, intptr_t req) {
546   const unsigned char* end_uc = S_CAST(const unsigned char*, end);
547   const unsigned char* base_uc = S_CAST(const unsigned char*, base);
548   return unlikely((end_uc - base_uc) < req);
549 }
550 
551 // We want this to return an uint32_t, not an int32_t.
ctzu32(uint32_t uii)552 HEADER_INLINE uint32_t ctzu32(uint32_t uii) {
553   return __builtin_ctz(uii);
554 }
555 
556 // this should always compile down to bsr.
bsru32(uint32_t uii)557 HEADER_INLINE uint32_t bsru32(uint32_t uii) {
558   return 31 - __builtin_clz(uii);
559 }
560 
561 #ifdef _WIN64
ctzw(unsigned long long ullii)562 HEADER_INLINE uint32_t ctzw(unsigned long long ullii) {
563   return __builtin_ctzll(ullii);
564 }
565 
bsrw(unsigned long long ullii)566 HEADER_INLINE uint32_t bsrw(unsigned long long ullii) {
567   // Note that this actually compiles to a single instruction on x86; it's
568   // naked __builtin_clzll which requires an additional subtraction.
569   return 63 - __builtin_clzll(ullii);
570 }
571 #else
ctzw(unsigned long ulii)572 HEADER_INLINE uint32_t ctzw(unsigned long ulii) {
573   return __builtin_ctzl(ulii);
574 }
575 
bsrw(unsigned long ulii)576 HEADER_INLINE uint32_t bsrw(unsigned long ulii) {
577   return (8 * sizeof(intptr_t) - 1) - __builtin_clzl(ulii);
578 }
579 #  ifndef __LP64__
580     // needed to prevent GCC 6 build failure
581 #    if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8)
582 #      if (__cplusplus < 201103L) && !defined(__APPLE__)
583 #        ifndef uintptr_t
584 #          define uintptr_t unsigned long
585 #        endif
586 #        ifndef intptr_t
587 #          define intptr_t long
588 #        endif
589 #      endif
590 #    endif
591 #  endif
592 #endif
593 
594 #ifdef __LP64__
595 #  ifdef _WIN32 // i.e. Win64
596 
597 #    undef PRIuPTR
598 #    undef PRIdPTR
599 #    define PRIuPTR PRIu64
600 #    define PRIdPTR PRId64
601 #    define PRIxPTR2 "016I64x"
602 
603 #  else  // not _WIN32
604 
605 #    ifndef PRIuPTR
606 #      define PRIuPTR "lu"
607 #    endif
608 #    ifndef PRIdPTR
609 #      define PRIdPTR "ld"
610 #    endif
611 #    define PRIxPTR2 "016lx"
612 
613 #  endif  // Win64
614 
615 #else  // not __LP64__
616 
617   // without this, we get ridiculous warning spew...
618   // not 100% sure this is the right cutoff, but this has been tested on 4.7
619   // and 4.8 build machines, so it plausibly is.
620 #  if (__GNUC__ == 4) && (__GNUC_MINOR__ < 8) && (__cplusplus < 201103L)
621 #    undef PRIuPTR
622 #    undef PRIdPTR
623 #    define PRIuPTR "lu"
624 #    define PRIdPTR "ld"
625 #  endif
626 
627 #  define PRIxPTR2 "08lx"
628 
629 #endif
630 
631 #ifndef HAVE_NULLPTR
632 #  ifndef __cplusplus
633 #    define nullptr NULL
634 #  else
635 #    if __cplusplus <= 199711L
636 #      ifndef nullptr
637 #        define nullptr NULL
638 #      endif
639 #    endif
640 #  endif
641 #endif
642 
643 // Checked a bunch of alternatives to #define constants.  For integer constants
644 // in [-2^31, 2^31), enum {} avoids macro expansion issues that actually
645 // matter, and that more than cancels out any tiny increase in binary size due
646 // to additional debugger information (which has value, anyway).  However, we
647 // don't want to use this under C++ due to enumeral/non-enumeral conditional
648 // expression warnings, so this isn't one-size-fits-all; and plain old const
649 // int has all the functionality we want under C++ (including internal linkage,
650 // so it's fine to define them in header files).  Thus we wrap the
651 // implementation in a macro.
652 //
653 // Otherwise, the macro expansion thing is still annoying but we suck it up due
654 // to the need for too much duplicate C vs. C++ code ("initializer element is
655 // not constant" when using const <type> in C99...)
656 //
657 // We start most plink2- and pgenlib-specific numeric constant names here with
658 // "kPgl", which should have a vanishingly small chance of colliding with
659 // anything in C99.  Note that stuff like kBytesPerWord is not considered
660 // library-specific, so it's exempt from having "Pgl" in the name.  Also, the
661 // few string literals here are of the FOPEN_WB sort, which have similar usage
662 // patterns to e.g. PRIuPTR which shouldn't be renamed, so those remain
663 // all-caps.
664 //
665 // (Update, May 2018: CONSTU31 was renamed to CONSTI32 and changed to type
666 // int32_t, to prevent C vs. C++ differences.  This almost never makes a
667 // difference, since when int32_t and uint32_t are mixed in the same
668 // expression, the former gets converted to unsigned.  However, unpleasant
669 // surprises are occasionally possible when mixing these constants with
670 // uint16_t or unsigned char values, since then the unsigned values are
671 // promoted to int32_t.  Also, this essentially forces use of -Wno-sign-compare
672 // when using gcc 4.4.
673 //
674 // Biggest thing to watch out for is mixing of Halfword with these constants in
675 // 32-bit builds.  Dosage and Vec8thUint are also relevant.)
676 #ifdef __cplusplus
677 #  define CONSTI32(name, expr) const int32_t name = (expr)
678 #else
679 #  define CONSTI32(name, expr) enum {name = (expr)}
680 #endif
681 
682 // useful because of its bitwise complement: ~k0LU is a word with all 1 bits,
683 // while ~0 is always 32 1 bits.
684 // LLU is used over ULL for searchability (no conflict with NULL).
685 static const uintptr_t k0LU = S_CAST(uintptr_t, 0);
686 
687 // mainly useful for bitshifts: (k1LU << 32) works in 64-bit builds, while
688 // (1 << 32) is undefined.  also used as a quicker-to-type way of casting
689 // numbers/expressions to uintptr_t (via multiplication).
690 static const uintptr_t k1LU = S_CAST(uintptr_t, 1);
691 
692 
693 #ifdef __LP64__
694 #  ifdef USE_AVX2
695 CONSTI32(kBytesPerVec, 32);
696 
697 // 16 still seems to noticeably outperform 32 on my Mac test machine, and
698 // is about equal on my Linux test machine, probably due to reduced clock
699 // frequency when 32-byte floating point vector operations are used (as in, ALL
700 // operations, sometimes on ALL cores, become slower when a single core
701 // performs a 32-byte fp vector operation).
702 // However, processor power management, numeric libraries, and my AVX2 code
703 // should improve over time.  There will probably come a time where switching
704 // to 32-byte fp is worthwhile.
705 // #define FVEC_32
706 
707 // bleah, have to define these here, vector_size doesn't see enum values
708 typedef uintptr_t VecW __attribute__ ((vector_size (32)));
709 typedef uint32_t VecU32 __attribute__ ((vector_size (32)));
710 typedef int32_t VecI32 __attribute__ ((vector_size (32)));
711 typedef unsigned short VecU16 __attribute__ ((vector_size (32)));
712 typedef short VecI16 __attribute__ ((vector_size (32)));
713 // documentation says 'char', but int8_t works fine under gcc 4.4 and conveys
714 // intent better (char not guaranteed to be signed)
715 typedef int8_t VecI8 __attribute__ ((vector_size (32)));
716 typedef unsigned char VecUc __attribute__ ((vector_size (32)));
717 #  else
718 CONSTI32(kBytesPerVec, 16);
719 typedef uintptr_t VecW __attribute__ ((vector_size (16)));
720 typedef uint32_t VecU32 __attribute ((vector_size (16)));
721 typedef int32_t VecI32 __attribute ((vector_size (16)));
722 typedef unsigned short VecU16 __attribute__ ((vector_size (16)));
723 typedef short VecI16 __attribute__ ((vector_size (16)));
724 typedef int8_t VecI8 __attribute__ ((vector_size (16)));
725 typedef unsigned char VecUc __attribute__ ((vector_size (16)));
726 #  endif
727 CONSTI32(kBitsPerWord, 64);
728 CONSTI32(kBitsPerWordLog2, 6);
729 
730 typedef uint32_t Halfword;
731 typedef uint16_t Quarterword;
732 
733 #  ifdef USE_AVX2
734 
735 // _mm256_set1_... seems to have the same performance; could use that instead.
736 #    define VCONST_W(xx) {xx, xx, xx, xx}
737 #    define VCONST_S(xx) {xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx}
738 #    define VCONST_C(xx) {xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx}
739 #    define VCONST_UC VCONST_C
740 
741 // vv = VCONST_W(k0LU) doesn't work (only ok for initialization)
vecw_setzero()742 HEADER_INLINE VecW vecw_setzero() {
743   return R_CAST(VecW, _mm256_setzero_si256());
744 }
745 
vecu32_setzero()746 HEADER_INLINE VecU32 vecu32_setzero() {
747   return R_CAST(VecU32, _mm256_setzero_si256());
748 }
749 
vecu16_setzero()750 HEADER_INLINE VecU16 vecu16_setzero() {
751   return R_CAST(VecU16, _mm256_setzero_si256());
752 }
753 
veci16_setzero()754 HEADER_INLINE VecI16 veci16_setzero() {
755   return R_CAST(VecI16, _mm256_setzero_si256());
756 }
757 
vecuc_setzero()758 HEADER_INLINE VecUc vecuc_setzero() {
759   return R_CAST(VecUc, _mm256_setzero_si256());
760 }
761 
veci8_setzero()762 HEADER_INLINE VecI8 veci8_setzero() {
763   return R_CAST(VecI8, _mm256_setzero_si256());
764 }
765 
766 // "vv >> ct" doesn't work, and Scientific Linux gcc 4.4 might not optimize
767 // VCONST_W shift properly (todo: test this)
vecw_srli(VecW vv,uint32_t ct)768 HEADER_INLINE VecW vecw_srli(VecW vv, uint32_t ct) {
769   return R_CAST(VecW, _mm256_srli_epi64(R_CAST(__m256i, vv), ct));
770 }
771 
vecw_slli(VecW vv,uint32_t ct)772 HEADER_INLINE VecW vecw_slli(VecW vv, uint32_t ct) {
773   return R_CAST(VecW, _mm256_slli_epi64(R_CAST(__m256i, vv), ct));
774 }
775 
vecu32_srli(VecU32 vv,uint32_t ct)776 HEADER_INLINE VecU32 vecu32_srli(VecU32 vv, uint32_t ct) {
777   return R_CAST(VecU32, _mm256_srli_epi32(R_CAST(__m256i, vv), ct));
778 }
779 
vecu32_slli(VecU32 vv,uint32_t ct)780 HEADER_INLINE VecU32 vecu32_slli(VecU32 vv, uint32_t ct) {
781   return R_CAST(VecU32, _mm256_slli_epi32(R_CAST(__m256i, vv), ct));
782 }
783 
vecu16_srli(VecU16 vv,uint32_t ct)784 HEADER_INLINE VecU16 vecu16_srli(VecU16 vv, uint32_t ct) {
785   return R_CAST(VecU16, _mm256_srli_epi16(R_CAST(__m256i, vv), ct));
786 }
787 
vecu16_slli(VecU16 vv,uint32_t ct)788 HEADER_INLINE VecU16 vecu16_slli(VecU16 vv, uint32_t ct) {
789   return R_CAST(VecU16, _mm256_slli_epi16(R_CAST(__m256i, vv), ct));
790 }
791 
792 // Compiler still doesn't seem to be smart enough to use andnot properly.
vecw_and_notfirst(VecW excl,VecW main)793 HEADER_INLINE VecW vecw_and_notfirst(VecW excl, VecW main) {
794   return R_CAST(VecW, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
795 }
796 
vecu32_and_notfirst(VecU32 excl,VecU32 main)797 HEADER_INLINE VecU32 vecu32_and_notfirst(VecU32 excl, VecU32 main) {
798   return R_CAST(VecU32, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
799 }
800 
veci32_and_notfirst(VecI32 excl,VecI32 main)801 HEADER_INLINE VecI32 veci32_and_notfirst(VecI32 excl, VecI32 main) {
802   return R_CAST(VecI32, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
803 }
804 
vecu16_and_notfirst(VecU16 excl,VecU16 main)805 HEADER_INLINE VecU16 vecu16_and_notfirst(VecU16 excl, VecU16 main) {
806   return R_CAST(VecU16, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
807 }
808 
veci16_and_notfirst(VecI16 excl,VecI16 main)809 HEADER_INLINE VecI16 veci16_and_notfirst(VecI16 excl, VecI16 main) {
810   return R_CAST(VecI16, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
811 }
812 
veci8_and_notfirst(VecI8 excl,VecI8 main)813 HEADER_INLINE VecI8 veci8_and_notfirst(VecI8 excl, VecI8 main) {
814   return R_CAST(VecI8, _mm256_andnot_si256(R_CAST(__m256i, excl), R_CAST(__m256i, main)));
815 }
816 
vecw_set1(uintptr_t ulii)817 HEADER_INLINE VecW vecw_set1(uintptr_t ulii) {
818   return R_CAST(VecW, _mm256_set1_epi64x(ulii));
819 }
820 
vecu32_set1(uint32_t uii)821 HEADER_INLINE VecU32 vecu32_set1(uint32_t uii) {
822   return R_CAST(VecU32, _mm256_set1_epi32(uii));
823 }
824 
veci32_set1(int32_t ii)825 HEADER_INLINE VecI32 veci32_set1(int32_t ii) {
826   return R_CAST(VecI32, _mm256_set1_epi32(ii));
827 }
828 
vecu16_set1(unsigned short usi)829 HEADER_INLINE VecU16 vecu16_set1(unsigned short usi) {
830   return R_CAST(VecU16, _mm256_set1_epi16(usi));
831 }
832 
veci16_set1(short si)833 HEADER_INLINE VecI16 veci16_set1(short si) {
834   return R_CAST(VecI16, _mm256_set1_epi16(si));
835 }
836 
vecuc_set1(unsigned char ucc)837 HEADER_INLINE VecUc vecuc_set1(unsigned char ucc) {
838   return R_CAST(VecUc, _mm256_set1_epi8(ucc));
839 }
840 
veci8_set1(char cc)841 HEADER_INLINE VecI8 veci8_set1(char cc) {
842   return R_CAST(VecI8, _mm256_set1_epi8(cc));
843 }
844 
vecw_movemask(VecW vv)845 HEADER_INLINE uint32_t vecw_movemask(VecW vv) {
846   return _mm256_movemask_epi8(R_CAST(__m256i, vv));
847 }
848 
vecu32_movemask(VecU32 vv)849 HEADER_INLINE uint32_t vecu32_movemask(VecU32 vv) {
850   return _mm256_movemask_epi8(R_CAST(__m256i, vv));
851 }
852 
veci32_movemask(VecI32 vv)853 HEADER_INLINE uint32_t veci32_movemask(VecI32 vv) {
854   return _mm256_movemask_epi8(R_CAST(__m256i, vv));
855 }
856 
vecu16_movemask(VecU16 vv)857 HEADER_INLINE uint32_t vecu16_movemask(VecU16 vv) {
858   return _mm256_movemask_epi8(R_CAST(__m256i, vv));
859 }
860 
veci16_movemask(VecI16 vv)861 HEADER_INLINE uint32_t veci16_movemask(VecI16 vv) {
862   return _mm256_movemask_epi8(R_CAST(__m256i, vv));
863 }
864 
veci8_movemask(VecI8 vv)865 HEADER_INLINE uint32_t veci8_movemask(VecI8 vv) {
866   return _mm256_movemask_epi8(R_CAST(__m256i, vv));
867 }
868 
vecuc_movemask(VecUc vv)869 HEADER_INLINE uint32_t vecuc_movemask(VecUc vv) {
870   return _mm256_movemask_epi8(R_CAST(__m256i, vv));
871 }
872 
873 // Repeats elements in second lane in AVX2 case.
vecw_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)874 HEADER_INLINE VecW vecw_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
875   return R_CAST(VecW, _mm256_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
876 }
877 
vecu16_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)878 HEADER_INLINE VecU16 vecu16_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
879   return R_CAST(VecU16, _mm256_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
880 }
881 
vecuc_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)882 HEADER_INLINE VecUc vecuc_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
883   return R_CAST(VecUc, _mm256_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
884 }
885 
886 // Discards last 16 arguments in SSE2/SSE4.2 case.
vecw_setr8x(char e31,char e30,char e29,char e28,char e27,char e26,char e25,char e24,char e23,char e22,char e21,char e20,char e19,char e18,char e17,char e16,char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)887 HEADER_INLINE VecW vecw_setr8x(char e31, char e30, char e29, char e28, char e27, char e26, char e25, char e24, char e23, char e22, char e21, char e20, char e19, char e18, char e17, char e16, char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
888   return R_CAST(VecW, _mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
889 }
890 
vecw_unpacklo8(VecW evens,VecW odds)891 HEADER_INLINE VecW vecw_unpacklo8(VecW evens, VecW odds) {
892   return R_CAST(VecW, _mm256_unpacklo_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
893 }
894 
vecw_unpackhi8(VecW evens,VecW odds)895 HEADER_INLINE VecW vecw_unpackhi8(VecW evens, VecW odds) {
896   return R_CAST(VecW, _mm256_unpackhi_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
897 }
898 
veci8_unpacklo8(VecI8 evens,VecI8 odds)899 HEADER_INLINE VecI8 veci8_unpacklo8(VecI8 evens, VecI8 odds) {
900   return R_CAST(VecI8, _mm256_unpacklo_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
901 }
902 
veci8_unpackhi8(VecI8 evens,VecI8 odds)903 HEADER_INLINE VecI8 veci8_unpackhi8(VecI8 evens, VecI8 odds) {
904   return R_CAST(VecI8, _mm256_unpackhi_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
905 }
906 
vecuc_unpacklo8(VecUc evens,VecUc odds)907 HEADER_INLINE VecUc vecuc_unpacklo8(VecUc evens, VecUc odds) {
908   return R_CAST(VecUc, _mm256_unpacklo_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
909 }
910 
vecuc_unpackhi8(VecUc evens,VecUc odds)911 HEADER_INLINE VecUc vecuc_unpackhi8(VecUc evens, VecUc odds) {
912   return R_CAST(VecUc, _mm256_unpackhi_epi8(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
913 }
914 
vecw_unpacklo16(VecW evens,VecW odds)915 HEADER_INLINE VecW vecw_unpacklo16(VecW evens, VecW odds) {
916   return R_CAST(VecW, _mm256_unpacklo_epi16(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
917 }
918 
vecw_unpackhi16(VecW evens,VecW odds)919 HEADER_INLINE VecW vecw_unpackhi16(VecW evens, VecW odds) {
920   return R_CAST(VecW, _mm256_unpackhi_epi16(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
921 }
922 
vecw_unpacklo32(VecW evens,VecW odds)923 HEADER_INLINE VecW vecw_unpacklo32(VecW evens, VecW odds) {
924   return R_CAST(VecW, _mm256_unpacklo_epi32(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
925 }
926 
vecw_unpackhi32(VecW evens,VecW odds)927 HEADER_INLINE VecW vecw_unpackhi32(VecW evens, VecW odds) {
928   return R_CAST(VecW, _mm256_unpackhi_epi32(R_CAST(__m256i, evens), R_CAST(__m256i, odds)));
929 }
930 
vecw_permute0xd8_if_avx2(VecW vv)931 HEADER_INLINE VecW vecw_permute0xd8_if_avx2(VecW vv) {
932   return R_CAST(VecW, _mm256_permute4x64_epi64(R_CAST(__m256i, vv), 0xd8));
933 }
934 
veci8_permute0xd8_if_avx2(VecI8 vv)935 HEADER_INLINE VecI8 veci8_permute0xd8_if_avx2(VecI8 vv) {
936   return R_CAST(VecI8, _mm256_permute4x64_epi64(R_CAST(__m256i, vv), 0xd8));
937 }
938 
vecuc_permute0xd8_if_avx2(VecUc vv)939 HEADER_INLINE VecUc vecuc_permute0xd8_if_avx2(VecUc vv) {
940   return R_CAST(VecUc, _mm256_permute4x64_epi64(R_CAST(__m256i, vv), 0xd8));
941 }
942 
vecw_shuffle8(VecW table,VecW indexes)943 HEADER_INLINE VecW vecw_shuffle8(VecW table, VecW indexes) {
944   return R_CAST(VecW, _mm256_shuffle_epi8(R_CAST(__m256i, table), R_CAST(__m256i, indexes)));
945 }
946 
vecu16_shuffle8(VecU16 table,VecU16 indexes)947 HEADER_INLINE VecU16 vecu16_shuffle8(VecU16 table, VecU16 indexes) {
948   return R_CAST(VecU16, _mm256_shuffle_epi8(R_CAST(__m256i, table), R_CAST(__m256i, indexes)));
949 }
950 
vecuc_shuffle8(VecUc table,VecUc indexes)951 HEADER_INLINE VecUc vecuc_shuffle8(VecUc table, VecUc indexes) {
952   return R_CAST(VecUc, _mm256_shuffle_epi8(R_CAST(__m256i, table), R_CAST(__m256i, indexes)));
953 }
954 
vecw_extract64_0(VecW vv)955 HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) {
956   return _mm256_extract_epi64(R_CAST(__m256i, vv), 0);
957 }
958 
vecw_extract64_1(VecW vv)959 HEADER_INLINE uintptr_t vecw_extract64_1(VecW vv) {
960   return _mm256_extract_epi64(R_CAST(__m256i, vv), 1);
961 }
962 
963 // *** AVX2-only section ***
vecw_extract64_2(VecW vv)964 HEADER_INLINE uintptr_t vecw_extract64_2(VecW vv) {
965   return _mm256_extract_epi64(R_CAST(__m256i, vv), 2);
966 }
967 
vecw_extract64_3(VecW vv)968 HEADER_INLINE uintptr_t vecw_extract64_3(VecW vv) {
969   return _mm256_extract_epi64(R_CAST(__m256i, vv), 3);
970 }
971 
972 // todo: permute
973 
974 // *** end AVX2-only section ***
975 
976 #    define kVec8thUintMax UINT32_MAX
977 
978 typedef uint16_t Vec16thUint;
979 typedef uint32_t Vec8thUint;
980 typedef uint64_t Vec4thUint;
981 
vecw_load(const void * mem_addr)982 HEADER_INLINE VecW vecw_load(const void* mem_addr) {
983   return R_CAST(VecW, _mm256_load_si256(S_CAST(const __m256i*, mem_addr)));
984 }
985 
986 // There may be some value in adding a 4-consecutive-vector load function when
987 // addresses are expected to be unaligned: see
988 //   https://www.agner.org/optimize/blog/read.php?i=627&v=t
989 
vecw_loadu(const void * mem_addr)990 HEADER_INLINE VecW vecw_loadu(const void* mem_addr) {
991   return R_CAST(VecW, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
992 }
993 
vecu32_loadu(const void * mem_addr)994 HEADER_INLINE VecU32 vecu32_loadu(const void* mem_addr) {
995   return R_CAST(VecU32, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
996 }
997 
veci32_loadu(const void * mem_addr)998 HEADER_INLINE VecI32 veci32_loadu(const void* mem_addr) {
999   return R_CAST(VecI32, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1000 }
1001 
vecu16_loadu(const void * mem_addr)1002 HEADER_INLINE VecU16 vecu16_loadu(const void* mem_addr) {
1003   return R_CAST(VecU16, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1004 }
1005 
veci16_loadu(const void * mem_addr)1006 HEADER_INLINE VecI16 veci16_loadu(const void* mem_addr) {
1007   return R_CAST(VecI16, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1008 }
1009 
vecuc_loadu(const void * mem_addr)1010 HEADER_INLINE VecUc vecuc_loadu(const void* mem_addr) {
1011   return R_CAST(VecUc, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1012 }
1013 
veci8_loadu(const void * mem_addr)1014 HEADER_INLINE VecI8 veci8_loadu(const void* mem_addr) {
1015   return R_CAST(VecI8, _mm256_loadu_si256(S_CAST(const __m256i*, mem_addr)));
1016 }
1017 
vecw_storeu(void * mem_addr,VecW vv)1018 HEADER_INLINE void vecw_storeu(void* mem_addr, VecW vv) {
1019   _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1020 }
1021 
vecu32_storeu(void * mem_addr,VecU32 vv)1022 HEADER_INLINE void vecu32_storeu(void* mem_addr, VecU32 vv) {
1023   _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1024 }
1025 
veci32_storeu(void * mem_addr,VecI32 vv)1026 HEADER_INLINE void veci32_storeu(void* mem_addr, VecI32 vv) {
1027   _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1028 }
1029 
vecu16_storeu(void * mem_addr,VecU16 vv)1030 HEADER_INLINE void vecu16_storeu(void* mem_addr, VecU16 vv) {
1031   _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1032 }
1033 
veci16_storeu(void * mem_addr,VecI16 vv)1034 HEADER_INLINE void veci16_storeu(void* mem_addr, VecI16 vv) {
1035   _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1036 }
1037 
vecuc_storeu(void * mem_addr,VecUc vv)1038 HEADER_INLINE void vecuc_storeu(void* mem_addr, VecUc vv) {
1039   _mm256_storeu_si256(S_CAST(__m256i*, mem_addr), R_CAST(__m256i, vv));
1040 }
1041 
veci32_max(VecI32 v1,VecI32 v2)1042 HEADER_INLINE VecI32 veci32_max(VecI32 v1, VecI32 v2) {
1043   return R_CAST(VecI32, _mm256_max_epi32(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1044 }
1045 
veci16_max(VecI16 v1,VecI16 v2)1046 HEADER_INLINE VecI16 veci16_max(VecI16 v1, VecI16 v2) {
1047   return R_CAST(VecI16, _mm256_max_epi16(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1048 }
1049 
vecw_sad(VecW v1,VecW v2)1050 HEADER_INLINE VecW vecw_sad(VecW v1, VecW v2) {
1051   return R_CAST(VecW, _mm256_sad_epu8(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1052 }
1053 
vecuc_adds(VecUc v1,VecUc v2)1054 HEADER_INLINE VecUc vecuc_adds(VecUc v1, VecUc v2) {
1055   return R_CAST(VecUc, _mm256_adds_epu8(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1056 }
1057 
vecu16_min8(VecU16 v1,VecU16 v2)1058 HEADER_INLINE VecU16 vecu16_min8(VecU16 v1, VecU16 v2) {
1059   return R_CAST(VecU16, _mm256_min_epu8(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1060 }
1061 
vecuc_min(VecUc v1,VecUc v2)1062 HEADER_INLINE VecUc vecuc_min(VecUc v1, VecUc v2) {
1063   return R_CAST(VecUc, _mm256_min_epu8(R_CAST(__m256i, v1), R_CAST(__m256i, v2)));
1064 }
1065 
vecw_blendv(VecW aa,VecW bb,VecW mask)1066 HEADER_INLINE VecW vecw_blendv(VecW aa, VecW bb, VecW mask) {
1067   return R_CAST(VecW, _mm256_blendv_epi8(R_CAST(__m256i, aa), R_CAST(__m256i, bb), R_CAST(__m256i, mask)));
1068 }
1069 
vecu32_blendv(VecU32 aa,VecU32 bb,VecU32 mask)1070 HEADER_INLINE VecU32 vecu32_blendv(VecU32 aa, VecU32 bb, VecU32 mask) {
1071   return R_CAST(VecU32, _mm256_blendv_epi8(R_CAST(__m256i, aa), R_CAST(__m256i, bb), R_CAST(__m256i, mask)));
1072 }
1073 
vecu16_blendv(VecU16 aa,VecU16 bb,VecU16 mask)1074 HEADER_INLINE VecU16 vecu16_blendv(VecU16 aa, VecU16 bb, VecU16 mask) {
1075   return R_CAST(VecU16, _mm256_blendv_epi8(R_CAST(__m256i, aa), R_CAST(__m256i, bb), R_CAST(__m256i, mask)));
1076 }
1077 
1078 #  else  // !USE_AVX2
1079 
1080 #    define VCONST_W(xx) {xx, xx}
1081 #    define VCONST_S(xx) {xx, xx, xx, xx, xx, xx, xx, xx}
1082 #    define VCONST_C(xx) {xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx}
1083 #    define VCONST_UC VCONST_C
1084 
vecw_setzero()1085 HEADER_INLINE VecW vecw_setzero() {
1086   return R_CAST(VecW, _mm_setzero_si128());
1087 }
1088 
vecu32_setzero()1089 HEADER_INLINE VecU32 vecu32_setzero() {
1090   return R_CAST(VecU32, _mm_setzero_si128());
1091 }
1092 
vecu16_setzero()1093 HEADER_INLINE VecU16 vecu16_setzero() {
1094   return R_CAST(VecU16, _mm_setzero_si128());
1095 }
1096 
veci16_setzero()1097 HEADER_INLINE VecI16 veci16_setzero() {
1098   return R_CAST(VecI16, _mm_setzero_si128());
1099 }
1100 
vecuc_setzero()1101 HEADER_INLINE VecUc vecuc_setzero() {
1102   return R_CAST(VecUc, _mm_setzero_si128());
1103 }
1104 
veci8_setzero()1105 HEADER_INLINE VecI8 veci8_setzero() {
1106   return R_CAST(VecI8, _mm_setzero_si128());
1107 }
1108 
vecw_srli(VecW vv,uint32_t ct)1109 HEADER_INLINE VecW vecw_srli(VecW vv, uint32_t ct) {
1110   return R_CAST(VecW, _mm_srli_epi64(R_CAST(__m128i, vv), ct));
1111 }
1112 
vecw_slli(VecW vv,uint32_t ct)1113 HEADER_INLINE VecW vecw_slli(VecW vv, uint32_t ct) {
1114   return R_CAST(VecW, _mm_slli_epi64(R_CAST(__m128i, vv), ct));
1115 }
1116 
vecu32_srli(VecU32 vv,uint32_t ct)1117 HEADER_INLINE VecU32 vecu32_srli(VecU32 vv, uint32_t ct) {
1118   return R_CAST(VecU32, _mm_srli_epi32(R_CAST(__m128i, vv), ct));
1119 }
1120 
vecu32_slli(VecU32 vv,uint32_t ct)1121 HEADER_INLINE VecU32 vecu32_slli(VecU32 vv, uint32_t ct) {
1122   return R_CAST(VecU32, _mm_slli_epi32(R_CAST(__m128i, vv), ct));
1123 }
1124 
vecu16_srli(VecU16 vv,uint32_t ct)1125 HEADER_INLINE VecU16 vecu16_srli(VecU16 vv, uint32_t ct) {
1126   return R_CAST(VecU16, _mm_srli_epi16(R_CAST(__m128i, vv), ct));
1127 }
1128 
vecu16_slli(VecU16 vv,uint32_t ct)1129 HEADER_INLINE VecU16 vecu16_slli(VecU16 vv, uint32_t ct) {
1130   return R_CAST(VecU16, _mm_slli_epi16(R_CAST(__m128i, vv), ct));
1131 }
1132 
vecw_and_notfirst(VecW excl,VecW main)1133 HEADER_INLINE VecW vecw_and_notfirst(VecW excl, VecW main) {
1134   return R_CAST(VecW, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1135 }
1136 
vecu32_and_notfirst(VecU32 excl,VecU32 main)1137 HEADER_INLINE VecU32 vecu32_and_notfirst(VecU32 excl, VecU32 main) {
1138   return R_CAST(VecU32, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1139 }
1140 
veci32_and_notfirst(VecI32 excl,VecI32 main)1141 HEADER_INLINE VecI32 veci32_and_notfirst(VecI32 excl, VecI32 main) {
1142   return R_CAST(VecI32, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1143 }
1144 
vecu16_and_notfirst(VecU16 excl,VecU16 main)1145 HEADER_INLINE VecU16 vecu16_and_notfirst(VecU16 excl, VecU16 main) {
1146   return R_CAST(VecU16, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1147 }
1148 
veci16_and_notfirst(VecI16 excl,VecI16 main)1149 HEADER_INLINE VecI16 veci16_and_notfirst(VecI16 excl, VecI16 main) {
1150   return R_CAST(VecI16, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1151 }
1152 
veci8_and_notfirst(VecI8 excl,VecI8 main)1153 HEADER_INLINE VecI8 veci8_and_notfirst(VecI8 excl, VecI8 main) {
1154   return R_CAST(VecI8, _mm_andnot_si128(R_CAST(__m128i, excl), R_CAST(__m128i, main)));
1155 }
1156 
vecw_set1(uintptr_t ulii)1157 HEADER_INLINE VecW vecw_set1(uintptr_t ulii) {
1158   return R_CAST(VecW, _mm_set1_epi64x(ulii));
1159 }
1160 
vecu32_set1(uint32_t uii)1161 HEADER_INLINE VecU32 vecu32_set1(uint32_t uii) {
1162   return R_CAST(VecU32, _mm_set1_epi32(uii));
1163 }
1164 
veci32_set1(int32_t ii)1165 HEADER_INLINE VecI32 veci32_set1(int32_t ii) {
1166   return R_CAST(VecI32, _mm_set1_epi32(ii));
1167 }
1168 
vecu16_set1(unsigned short usi)1169 HEADER_INLINE VecU16 vecu16_set1(unsigned short usi) {
1170   return R_CAST(VecU16, _mm_set1_epi16(usi));
1171 }
1172 
veci16_set1(short si)1173 HEADER_INLINE VecI16 veci16_set1(short si) {
1174   return R_CAST(VecI16, _mm_set1_epi16(si));
1175 }
1176 
vecuc_set1(unsigned char ucc)1177 HEADER_INLINE VecUc vecuc_set1(unsigned char ucc) {
1178   return R_CAST(VecUc, _mm_set1_epi8(ucc));
1179 }
1180 
veci8_set1(char cc)1181 HEADER_INLINE VecI8 veci8_set1(char cc) {
1182   return R_CAST(VecI8, _mm_set1_epi8(cc));
1183 }
1184 
vecw_movemask(VecW vv)1185 HEADER_INLINE uint32_t vecw_movemask(VecW vv) {
1186   return _mm_movemask_epi8(R_CAST(__m128i, vv));
1187 }
1188 
vecu32_movemask(VecU32 vv)1189 HEADER_INLINE uint32_t vecu32_movemask(VecU32 vv) {
1190   return _mm_movemask_epi8(R_CAST(__m128i, vv));
1191 }
1192 
veci32_movemask(VecI32 vv)1193 HEADER_INLINE uint32_t veci32_movemask(VecI32 vv) {
1194   return _mm_movemask_epi8(R_CAST(__m128i, vv));
1195 }
1196 
vecu16_movemask(VecU16 vv)1197 HEADER_INLINE uint32_t vecu16_movemask(VecU16 vv) {
1198   return _mm_movemask_epi8(R_CAST(__m128i, vv));
1199 }
1200 
veci16_movemask(VecI16 vv)1201 HEADER_INLINE uint32_t veci16_movemask(VecI16 vv) {
1202   return _mm_movemask_epi8(R_CAST(__m128i, vv));
1203 }
1204 
veci8_movemask(VecI8 vv)1205 HEADER_INLINE uint32_t veci8_movemask(VecI8 vv) {
1206   return _mm_movemask_epi8(R_CAST(__m128i, vv));
1207 }
1208 
vecuc_movemask(VecUc vv)1209 HEADER_INLINE uint32_t vecuc_movemask(VecUc vv) {
1210   return _mm_movemask_epi8(R_CAST(__m128i, vv));
1211 }
1212 
1213 CONSTI32(kVec8thUintMax, 65535);
1214 
1215 // #    define kVec8thUintMax 65535
1216 
1217 typedef unsigned char Vec16thUint;
1218 typedef uint16_t Vec8thUint;
1219 typedef uint32_t Vec4thUint;
1220 
vecw_load(const void * mem_addr)1221 HEADER_INLINE VecW vecw_load(const void* mem_addr) {
1222   return R_CAST(VecW, _mm_load_si128(S_CAST(const __m128i*, mem_addr)));
1223 }
1224 
vecw_loadu(const void * mem_addr)1225 HEADER_INLINE VecW vecw_loadu(const void* mem_addr) {
1226   return R_CAST(VecW, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1227 }
1228 
vecu32_loadu(const void * mem_addr)1229 HEADER_INLINE VecU32 vecu32_loadu(const void* mem_addr) {
1230   return R_CAST(VecU32, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1231 }
1232 
veci32_loadu(const void * mem_addr)1233 HEADER_INLINE VecI32 veci32_loadu(const void* mem_addr) {
1234   return R_CAST(VecI32, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1235 }
1236 
vecu16_loadu(const void * mem_addr)1237 HEADER_INLINE VecU16 vecu16_loadu(const void* mem_addr) {
1238   return R_CAST(VecU16, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1239 }
1240 
veci16_loadu(const void * mem_addr)1241 HEADER_INLINE VecI16 veci16_loadu(const void* mem_addr) {
1242   return R_CAST(VecI16, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1243 }
1244 
vecuc_loadu(const void * mem_addr)1245 HEADER_INLINE VecUc vecuc_loadu(const void* mem_addr) {
1246   return R_CAST(VecUc, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1247 }
1248 
veci8_loadu(const void * mem_addr)1249 HEADER_INLINE VecI8 veci8_loadu(const void* mem_addr) {
1250   return R_CAST(VecI8, _mm_loadu_si128(S_CAST(const __m128i*, mem_addr)));
1251 }
1252 
vecw_storeu(void * mem_addr,VecW vv)1253 HEADER_INLINE void vecw_storeu(void* mem_addr, VecW vv) {
1254   _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1255 }
1256 
vecu32_storeu(void * mem_addr,VecU32 vv)1257 HEADER_INLINE void vecu32_storeu(void* mem_addr, VecU32 vv) {
1258   _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1259 }
1260 
veci32_storeu(void * mem_addr,VecI32 vv)1261 HEADER_INLINE void veci32_storeu(void* mem_addr, VecI32 vv) {
1262   _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1263 }
1264 
vecu16_storeu(void * mem_addr,VecU16 vv)1265 HEADER_INLINE void vecu16_storeu(void* mem_addr, VecU16 vv) {
1266   _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1267 }
1268 
veci16_storeu(void * mem_addr,VecI16 vv)1269 HEADER_INLINE void veci16_storeu(void* mem_addr, VecI16 vv) {
1270   _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1271 }
1272 
vecuc_storeu(void * mem_addr,VecUc vv)1273 HEADER_INLINE void vecuc_storeu(void* mem_addr, VecUc vv) {
1274   _mm_storeu_si128(S_CAST(__m128i*, mem_addr), R_CAST(__m128i, vv));
1275 }
1276 
1277 // Repeats arguments in AVX2 case.
vecw_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)1278 HEADER_INLINE VecW vecw_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
1279   return R_CAST(VecW, _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
1280 }
1281 
vecu16_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)1282 HEADER_INLINE VecU16 vecu16_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
1283   return R_CAST(VecU16, _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
1284 }
1285 
vecuc_setr8(char e15,char e14,char e13,char e12,char e11,char e10,char e9,char e8,char e7,char e6,char e5,char e4,char e3,char e2,char e1,char e0)1286 HEADER_INLINE VecUc vecuc_setr8(char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0) {
1287   return R_CAST(VecUc, _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0));
1288 }
1289 
1290 // Discards last 16 arguments in SSE2/SSE4.2 case.
vecw_setr8x(char e31,char e30,char e29,char e28,char e27,char e26,char e25,char e24,char e23,char e22,char e21,char e20,char e19,char e18,char e17,char e16,__maybe_unused char e15,__maybe_unused char e14,__maybe_unused char e13,__maybe_unused char e12,__maybe_unused char e11,__maybe_unused char e10,__maybe_unused char e9,__maybe_unused char e8,__maybe_unused char e7,__maybe_unused char e6,__maybe_unused char e5,__maybe_unused char e4,__maybe_unused char e3,__maybe_unused char e2,__maybe_unused char e1,__maybe_unused char e0)1291 HEADER_INLINE VecW vecw_setr8x(
1292     char e31, char e30, char e29, char e28,
1293     char e27, char e26, char e25, char e24,
1294     char e23, char e22, char e21, char e20,
1295     char e19, char e18, char e17, char e16,
1296     __maybe_unused char e15, __maybe_unused char e14,
1297     __maybe_unused char e13, __maybe_unused char e12,
1298     __maybe_unused char e11, __maybe_unused char e10,
1299     __maybe_unused char e9, __maybe_unused char e8,
1300     __maybe_unused char e7, __maybe_unused char e6,
1301     __maybe_unused char e5, __maybe_unused char e4,
1302     __maybe_unused char e3, __maybe_unused char e2,
1303     __maybe_unused char e1, __maybe_unused char e0) {
1304   return R_CAST(VecW, _mm_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16));
1305 }
1306 
vecw_unpacklo8(VecW evens,VecW odds)1307 HEADER_INLINE VecW vecw_unpacklo8(VecW evens, VecW odds) {
1308   return R_CAST(VecW, _mm_unpacklo_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1309 }
1310 
vecw_unpackhi8(VecW evens,VecW odds)1311 HEADER_INLINE VecW vecw_unpackhi8(VecW evens, VecW odds) {
1312   return R_CAST(VecW, _mm_unpackhi_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1313 }
1314 
veci8_unpacklo8(VecI8 evens,VecI8 odds)1315 HEADER_INLINE VecI8 veci8_unpacklo8(VecI8 evens, VecI8 odds) {
1316   return R_CAST(VecI8, _mm_unpacklo_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1317 }
1318 
veci8_unpackhi8(VecI8 evens,VecI8 odds)1319 HEADER_INLINE VecI8 veci8_unpackhi8(VecI8 evens, VecI8 odds) {
1320   return R_CAST(VecI8, _mm_unpackhi_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1321 }
1322 
vecuc_unpacklo8(VecUc evens,VecUc odds)1323 HEADER_INLINE VecUc vecuc_unpacklo8(VecUc evens, VecUc odds) {
1324   return R_CAST(VecUc, _mm_unpacklo_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1325 }
1326 
vecuc_unpackhi8(VecUc evens,VecUc odds)1327 HEADER_INLINE VecUc vecuc_unpackhi8(VecUc evens, VecUc odds) {
1328   return R_CAST(VecUc, _mm_unpackhi_epi8(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1329 }
1330 
vecw_unpacklo16(VecW evens,VecW odds)1331 HEADER_INLINE VecW vecw_unpacklo16(VecW evens, VecW odds) {
1332   return R_CAST(VecW, _mm_unpacklo_epi16(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1333 }
1334 
vecw_unpackhi16(VecW evens,VecW odds)1335 HEADER_INLINE VecW vecw_unpackhi16(VecW evens, VecW odds) {
1336   return R_CAST(VecW, _mm_unpackhi_epi16(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1337 }
1338 
vecw_unpacklo32(VecW evens,VecW odds)1339 HEADER_INLINE VecW vecw_unpacklo32(VecW evens, VecW odds) {
1340   return R_CAST(VecW, _mm_unpacklo_epi32(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1341 }
1342 
vecw_unpackhi32(VecW evens,VecW odds)1343 HEADER_INLINE VecW vecw_unpackhi32(VecW evens, VecW odds) {
1344   return R_CAST(VecW, _mm_unpackhi_epi32(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1345 }
1346 
vecw_unpacklo64(VecW evens,VecW odds)1347 HEADER_INLINE VecW vecw_unpacklo64(VecW evens, VecW odds) {
1348   return R_CAST(VecW, _mm_unpacklo_epi64(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1349 }
1350 
vecw_unpackhi64(VecW evens,VecW odds)1351 HEADER_INLINE VecW vecw_unpackhi64(VecW evens, VecW odds) {
1352   return R_CAST(VecW, _mm_unpackhi_epi64(R_CAST(__m128i, evens), R_CAST(__m128i, odds)));
1353 }
1354 
vecw_permute0xd8_if_avx2(VecW vv)1355 HEADER_INLINE VecW vecw_permute0xd8_if_avx2(VecW vv) {
1356   return vv;
1357 }
1358 
veci8_permute0xd8_if_avx2(VecI8 vv)1359 HEADER_INLINE VecI8 veci8_permute0xd8_if_avx2(VecI8 vv) {
1360   return vv;
1361 }
1362 
vecuc_permute0xd8_if_avx2(VecUc vv)1363 HEADER_INLINE VecUc vecuc_permute0xd8_if_avx2(VecUc vv) {
1364   return vv;
1365 }
1366 
1367 #    ifdef USE_SSE42
veci32_max(VecI32 v1,VecI32 v2)1368 HEADER_INLINE VecI32 veci32_max(VecI32 v1, VecI32 v2) {
1369   return R_CAST(VecI32, _mm_max_epi32(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1370 }
1371 
vecw_shuffle8(VecW table,VecW indexes)1372 HEADER_INLINE VecW vecw_shuffle8(VecW table, VecW indexes) {
1373   return R_CAST(VecW, _mm_shuffle_epi8(R_CAST(__m128i, table), R_CAST(__m128i, indexes)));
1374 }
1375 
vecu16_shuffle8(VecU16 table,VecU16 indexes)1376 HEADER_INLINE VecU16 vecu16_shuffle8(VecU16 table, VecU16 indexes) {
1377   return R_CAST(VecU16, _mm_shuffle_epi8(R_CAST(__m128i, table), R_CAST(__m128i, indexes)));
1378 }
1379 
vecuc_shuffle8(VecUc table,VecUc indexes)1380 HEADER_INLINE VecUc vecuc_shuffle8(VecUc table, VecUc indexes) {
1381   return R_CAST(VecUc, _mm_shuffle_epi8(R_CAST(__m128i, table), R_CAST(__m128i, indexes)));
1382 }
1383 
vecw_extract64_0(VecW vv)1384 HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) {
1385   return _mm_extract_epi64(R_CAST(__m128i, vv), 0);
1386 }
1387 
vecw_extract64_1(VecW vv)1388 HEADER_INLINE uintptr_t vecw_extract64_1(VecW vv) {
1389   return _mm_extract_epi64(R_CAST(__m128i, vv), 1);
1390 }
1391 
vecw_blendv(VecW aa,VecW bb,VecW mask)1392 HEADER_INLINE VecW vecw_blendv(VecW aa, VecW bb, VecW mask) {
1393   return R_CAST(VecW, _mm_blendv_epi8(R_CAST(__m128i, aa), R_CAST(__m128i, bb), R_CAST(__m128i, mask)));
1394 }
1395 
vecu32_blendv(VecU32 aa,VecU32 bb,VecU32 mask)1396 HEADER_INLINE VecU32 vecu32_blendv(VecU32 aa, VecU32 bb, VecU32 mask) {
1397   return R_CAST(VecU32, _mm_blendv_epi8(R_CAST(__m128i, aa), R_CAST(__m128i, bb), R_CAST(__m128i, mask)));
1398 }
1399 
vecu16_blendv(VecU16 aa,VecU16 bb,VecU16 mask)1400 HEADER_INLINE VecU16 vecu16_blendv(VecU16 aa, VecU16 bb, VecU16 mask) {
1401   return R_CAST(VecU16, _mm_blendv_epi8(R_CAST(__m128i, aa), R_CAST(__m128i, bb), R_CAST(__m128i, mask)));
1402 }
1403 #    else
vecw_extract64_0(VecW vv)1404 HEADER_INLINE uintptr_t vecw_extract64_0(VecW vv) {
1405   return R_CAST(uintptr_t, _mm_movepi64_pi64(R_CAST(__m128i, vv)));
1406 }
1407 
vecw_extract64_1(VecW vv)1408 HEADER_INLINE uintptr_t vecw_extract64_1(VecW vv) {
1409   const __m128i v0 = _mm_srli_si128(R_CAST(__m128i, vv), 8);
1410   return R_CAST(uintptr_t, _mm_movepi64_pi64(v0));
1411 }
1412 
1413 // N.B. we do *not* enforce the low bits of each mask byte matching the high
1414 // bit.
vecw_blendv(VecW aa,VecW bb,VecW mask)1415 HEADER_INLINE VecW vecw_blendv(VecW aa, VecW bb, VecW mask) {
1416   return vecw_and_notfirst(mask, aa) | (mask & bb);
1417 }
1418 
vecu32_blendv(VecU32 aa,VecU32 bb,VecU32 mask)1419 HEADER_INLINE VecU32 vecu32_blendv(VecU32 aa, VecU32 bb, VecU32 mask) {
1420   return vecu32_and_notfirst(mask, aa) | (mask & bb);
1421 }
1422 
vecu16_blendv(VecU16 aa,VecU16 bb,VecU16 mask)1423 HEADER_INLINE VecU16 vecu16_blendv(VecU16 aa, VecU16 bb, VecU16 mask) {
1424   return vecu16_and_notfirst(mask, aa) | (mask & bb);
1425 }
1426 #    endif
1427 
veci16_max(VecI16 v1,VecI16 v2)1428 HEADER_INLINE VecI16 veci16_max(VecI16 v1, VecI16 v2) {
1429   return R_CAST(VecI16, _mm_max_epi16(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1430 }
1431 
vecw_sad(VecW v1,VecW v2)1432 HEADER_INLINE VecW vecw_sad(VecW v1, VecW v2) {
1433   return R_CAST(VecW, _mm_sad_epu8(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1434 }
1435 
vecuc_adds(VecUc v1,VecUc v2)1436 HEADER_INLINE VecUc vecuc_adds(VecUc v1, VecUc v2) {
1437   return R_CAST(VecUc, _mm_adds_epu8(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1438 }
1439 
vecu16_min8(VecU16 v1,VecU16 v2)1440 HEADER_INLINE VecU16 vecu16_min8(VecU16 v1, VecU16 v2) {
1441   return R_CAST(VecU16, _mm_min_epu8(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1442 }
1443 
vecuc_min(VecUc v1,VecUc v2)1444 HEADER_INLINE VecUc vecuc_min(VecUc v1, VecUc v2) {
1445   return R_CAST(VecUc, _mm_min_epu8(R_CAST(__m128i, v1), R_CAST(__m128i, v2)));
1446 }
1447 #  endif  // !USE_AVX2
1448 
vecw_bytesum(VecW src,VecW m0)1449 HEADER_INLINE VecW vecw_bytesum(VecW src, VecW m0) {
1450   return vecw_sad(src, m0);
1451 }
1452 
1453 CONSTI32(kVec8thUintPerWord, sizeof(intptr_t) / sizeof(Vec8thUint));
1454 
1455 #  ifdef FVEC_32
1456 
1457 #    ifndef __FMA__
1458 #      error "32-byte-float-vector builds require FMA3 as well."
1459 #    endif
1460 
1461 CONSTI32(kBytesPerFVec, 32);
1462 typedef float VecF __attribute__ ((vector_size (32)));
1463 
1464 #    define VCONST_F(xx) {xx, xx, xx, xx, xx, xx, xx, xx}
1465 
vecf_setzero()1466 HEADER_INLINE VecF vecf_setzero() {
1467   return R_CAST(VecF, _mm256_setzero_ps());
1468 }
1469 
1470 #  else  // !FVEC_32
1471 
1472 CONSTI32(kBytesPerFVec, 16);
1473 typedef float VecF __attribute__ ((vector_size (16)));
1474 
1475 #    define VCONST_F(xx) {xx, xx, xx, xx}
1476 
vecf_setzero()1477 HEADER_INLINE VecF vecf_setzero() {
1478   return R_CAST(VecF, _mm_setzero_ps());
1479 }
1480 
1481 #  endif  // !FVEC_32
1482 
1483 #else  // not __LP64__
1484 CONSTI32(kBytesPerVec, 4);
1485 CONSTI32(kBytesPerFVec, 4);
1486 CONSTI32(kBitsPerWord, 32);
1487 CONSTI32(kBitsPerWordLog2, 5);
1488 
1489 typedef uint16_t Halfword;
1490 typedef uint8_t Quarterword;
1491 
1492 typedef uintptr_t VecW;
1493 typedef uintptr_t VecU32;
1494 typedef float VecF;
1495 // VecI16 and VecI8 aren't worth the trouble of scaling down to 32-bit
1496 
1497 #  define VCONST_W(xx) (xx)
1498 
vecw_setzero()1499 HEADER_INLINE VecW vecw_setzero() {
1500   return k0LU;
1501 }
1502 
vecw_srli(VecW vv,uint32_t ct)1503 HEADER_INLINE VecW vecw_srli(VecW vv, uint32_t ct) {
1504   return vv >> ct;
1505 }
1506 
vecw_slli(VecW vv,uint32_t ct)1507 HEADER_INLINE VecW vecw_slli(VecW vv, uint32_t ct) {
1508   return vv << ct;
1509 }
1510 
vecw_set1(uintptr_t ulii)1511 HEADER_INLINE VecW vecw_set1(uintptr_t ulii) {
1512   return ulii;
1513 }
1514 
vecw_loadu(const void * mem_addr)1515 HEADER_INLINE VecW vecw_loadu(const void* mem_addr) {
1516   return *S_CAST(const VecW*, mem_addr);
1517 }
1518 
vecw_bytesum(VecW src,__maybe_unused VecW m0)1519 HEADER_INLINE VecW vecw_bytesum(VecW src, __maybe_unused VecW m0) {
1520   src = (src & 0x00ff00ff) + ((src >> 8) & 0x00ff00ff);
1521   return (src + (src >> 16)) & 0xffff;
1522 }
1523 
vecw_and_notfirst(VecW excl,VecW main)1524 HEADER_INLINE VecW vecw_and_notfirst(VecW excl, VecW main) {
1525   return (~excl) & main;
1526 }
1527 
vecu32_and_notfirst(VecU32 excl,VecU32 main)1528 HEADER_INLINE VecU32 vecu32_and_notfirst(VecU32 excl, VecU32 main) {
1529   return (~excl) & main;
1530 }
1531 #endif  // !__LP64__
1532 
1533 // debug
PrintVec(const void * vv)1534 HEADER_INLINE void PrintVec(const void* vv) {
1535   const unsigned char* vv_alias = S_CAST(const unsigned char*, vv);
1536   for (uint32_t uii = 0; uii != kBytesPerVec; ++uii) {
1537     printf("%u ", vv_alias[uii]);
1538   }
1539   printf("\n");
1540 }
1541 
1542 // Unfortunately, we need to spell out S_CAST(uintptr_t, 0) instead of just
1543 // typing k0LU in C99.
1544 static const uintptr_t kMask5555 = (~S_CAST(uintptr_t, 0)) / 3;
1545 static const uintptr_t kMaskAAAA = ((~S_CAST(uintptr_t, 0)) / 3) * 2;
1546 static const uintptr_t kMask3333 = (~S_CAST(uintptr_t, 0)) / 5;
1547 static const uintptr_t kMask1111 = (~S_CAST(uintptr_t, 0)) / 15;
1548 static const uintptr_t kMask0F0F = (~S_CAST(uintptr_t, 0)) / 17;
1549 static const uintptr_t kMask0101 = (~S_CAST(uintptr_t, 0)) / 255;
1550 static const uintptr_t kMask00FF = (~S_CAST(uintptr_t, 0)) / 257;
1551 static const uintptr_t kMask0001 = (~S_CAST(uintptr_t, 0)) / 65535;
1552 static const uintptr_t kMask0000FFFF = (~S_CAST(uintptr_t, 0)) / 65537;
1553 static const uintptr_t kMask00000001 = (~S_CAST(uintptr_t, 0)) / 4294967295U;
1554 
1555 static const uintptr_t kMask000000FF = (~S_CAST(uintptr_t, 0)) / 16843009;
1556 static const uintptr_t kMask000F = (~S_CAST(uintptr_t, 0)) / 4369;
1557 static const uintptr_t kMask0303 = (~S_CAST(uintptr_t, 0)) / 85;
1558 
1559 CONSTI32(kBitsPerVec, kBytesPerVec * CHAR_BIT);
1560 
1561 // We now use Knuth's Nyp/Nybble vocabulary for 2-bit and 4-bit elements,
1562 // respectively.
1563 CONSTI32(kNypsPerVec, kBytesPerVec * 4);
1564 
1565 CONSTI32(kBitsPerWordD2, kBitsPerWord / 2);
1566 CONSTI32(kBitsPerWordD4, kBitsPerWord / 4);
1567 
1568 // number of bytes in a word
1569 CONSTI32(kBytesPerWord, kBitsPerWord / CHAR_BIT);
1570 
1571 CONSTI32(kInt16PerWord, kBytesPerWord / 2);
1572 
1573 static_assert(CHAR_BIT == 8, "plink2_base requires CHAR_BIT == 8.");
1574 static_assert(sizeof(int8_t) == 1, "plink2_base requires sizeof(int8_t) == 1.");
1575 static_assert(sizeof(int16_t) == 2, "plink2_base requires sizeof(int16_t) == 2.");
1576 static_assert(sizeof(int32_t) == 4, "plink2_base requires sizeof(int32_t) == 4.");
1577 static_assert(sizeof(int) >= 4, "plink2_base requires sizeof(int) >= 4.");
1578 static_assert(sizeof(intptr_t) == kBytesPerWord, "plink2_base requires sizeof(intptr_t) == kBytesPerWord.");
1579 static_assert(sizeof(int64_t) == 8, "plink2_base requires sizeof(int64_t) == 8.");
1580 
1581 CONSTI32(kWordsPerVec, kBytesPerVec / kBytesPerWord);
1582 CONSTI32(kInt32PerVec, kBytesPerVec / 4);
1583 CONSTI32(kInt16PerVec, kBytesPerVec / 2);
1584 
1585 CONSTI32(kFloatPerFVec, kBytesPerFVec / 4);
1586 
1587 CONSTI32(kCacheline, 64);
1588 
1589 CONSTI32(kBitsPerCacheline, kCacheline * CHAR_BIT);
1590 CONSTI32(kNypsPerCacheline, kCacheline * 4);
1591 CONSTI32(kInt16PerCacheline, kCacheline / sizeof(int16_t));
1592 CONSTI32(kInt32PerCacheline, kCacheline / sizeof(int32_t));
1593 CONSTI32(kInt64PerCacheline, kCacheline / sizeof(int64_t));
1594 CONSTI32(kWordsPerCacheline, kCacheline / kBytesPerWord);
1595 CONSTI32(kDoublesPerCacheline, kCacheline / sizeof(double));
1596 CONSTI32(kVecsPerCacheline, kCacheline / kBytesPerVec);
1597 
1598 // could use ioctl, etc. to dynamically determine this later, and pass it as a
1599 // parameter to e.g. PgfiMultiread()
1600 CONSTI32(kDiskBlockSize, 4096);
1601 
1602 CONSTI32(kPglFwriteBlockSize, 131072);
1603 
1604 // unsafe to fread or fwrite more bytes than this on e.g. OS X
1605 CONSTI32(kMaxBytesPerIO, 0x7ffff000);
1606 
1607 // Maximum size of "dynamically" allocated line load buffer.  (This is the
1608 // limit that applies to .vcf and similar files.)  Inconvenient to go higher
1609 // since fgets() takes a int32_t size argument.
1610 CONSTI32(kMaxLongLine, 0x7fffffc0);
1611 static_assert(!(kMaxLongLine % kCacheline), "kMaxLongLine must be a multiple of kCacheline.");
1612 
1613 #ifdef __APPLE__
1614 // OS X is limited to 256?
1615 CONSTI32(kMaxOpenFiles, 252);
1616 #else
1617 // Can't assume more than 512 are allowed on Windows, with current compilation
1618 // settings.
1619 CONSTI32(kMaxOpenFiles, 504);
1620 #endif
1621 
1622 // note that this is NOT foolproof: see e.g.
1623 // http://insanecoding.blogspot.com/2007/11/pathmax-simply-isnt.html .  (This
1624 // is why I haven't bothered with OS-based #ifdefs here.)  But it should be
1625 // good enough in practice.  And PATH_MAX itself is still relevant due to use
1626 // of realpath().
1627 CONSTI32(kPglFnamesize, 4096);
1628 #if defined(PATH_MAX) && !defined(_WIN32)
1629 static_assert(kPglFnamesize >= PATH_MAX, "plink2_base assumes PATH_MAX <= 4096.  (Safe to increase kPglFnamesize to address this, up to 131072.)");
1630 #endif
1631 
1632 
1633 #if __cplusplus >= 201103L
1634 // Main application of std::array in this codebase is enforcing length when
1635 // passing references between functions.  Conversely, if the array type has
1636 // different lengths in different functions (e.g. col_skips[]/col_types[]), we
1637 // actively want to avoid &arr[0] clutter.
1638 // When neither applies, it doesn't really matter whether we use this or not;
1639 // I normally won't use it unless it plausibly makes sense to pass
1640 // fixed-length-array-references in the future.
1641 #  define STD_ARRAY_DECL(tt, nn, vv) std::array<tt, nn> vv
1642 #  define STD_ARRAY_REF(tt, nn) std::array<tt, nn>&
1643 
1644 // necessary if tt is a pointer type, otherwise optional
1645 #  define STD_ARRAY_KREF(tt, nn) const std::array<tt, nn>&
1646 
1647 #  define STD_ARRAY_COPY(src, nn, dst) static_assert(sizeof(dst) == sizeof((dst)[0]) * nn, "Invalid STD_ARRAY_COPY() invocation."); (dst) = (src)
1648 
1649 #  define STD_ARRAY_PTR_TYPE(tt, nn) std::array<tt, nn>*
1650 #  define STD_ARRAY_PTR_DECL(tt, nn, vv) std::array<tt, nn>* vv
1651 
1652 // argh, need double-braces for C++11 std::array and single-braces for C
1653 #  define STD_ARRAY_INIT_START() {
1654 #  define STD_ARRAY_INIT_END() }
1655 
STD_ARRAY_FILL0(std::array<T,N> & arr)1656 template <class T, std::size_t N> void STD_ARRAY_FILL0(std::array<T, N>& arr) {
1657   arr.fill(0);
1658 }
1659 
1660 // plain STD_ARRAY_FILL0() can't be used on array-references due to fallback
1661 // code.
1662 // this macro ensures that we *only* use it with uint32_t array-references
1663 #  define STD_ARRAY_REF_FILL0(ct, aref) static_assert(ct * sizeof(aref[0]) == sizeof(aref), "invalid STD_ARRAY_REF_FILL0() invocation"); aref.fill(0)
1664 
1665 #  define NONCOPYABLE(struct_name) \
1666   struct_name() = default; \
1667   struct_name(const struct_name&) = delete; \
1668   struct_name& operator=(const struct_name&) = delete
1669 
1670 #else
1671 #  define STD_ARRAY_DECL(tt, nn, vv) tt vv[nn]
1672 #  define STD_ARRAY_REF(tt, nn) tt* const
1673 #  define STD_ARRAY_KREF(tt, nn) tt const* const
1674 #  define STD_ARRAY_COPY(src, nn, dst) memcpy(dst, src, nn * sizeof(dst[0]));
1675 #  define STD_ARRAY_PTR_TYPE(tt, nn) tt(*)[nn]
1676 #  define STD_ARRAY_PTR_DECL(tt, nn, vv) tt(*vv)[nn]
1677 #  define STD_ARRAY_INIT_START()
1678 #  define STD_ARRAY_INIT_END()
1679 #  define STD_ARRAY_FILL0(arr) memset(arr, 0, sizeof(arr))
1680 #  define STD_ARRAY_REF_FILL0(ct, aref) memset(aref, 0, ct * sizeof(*aref))
1681 
1682 #  define NONCOPYABLE(struct_name)
1683 #endif
1684 
1685 typedef union {
1686   VecW vw;
1687 
1688   STD_ARRAY_DECL(uintptr_t, kWordsPerVec, w);
1689 
1690   STD_ARRAY_DECL(uint32_t, kInt32PerVec, u32);
1691 } UniVec;
1692 
1693 typedef union {
1694   VecF vf;
1695   STD_ARRAY_DECL(float, kFloatPerFVec, f4);
1696 } UniVecF;
1697 
1698 // sum must fit in 16 bits
UniVecHsum16(UniVec uv)1699 HEADER_INLINE uintptr_t UniVecHsum16(UniVec uv) {
1700 #ifdef __LP64__
1701 #  ifdef USE_AVX2
1702   return ((uv.w[0] + uv.w[1] + uv.w[2] + uv.w[3]) * kMask0001) >> 48;
1703 #  else
1704   return ((uv.w[0] + uv.w[1]) * kMask0001) >> 48;
1705 #  endif
1706 #else
1707   return (uv.w[0] * kMask0001) >> 16;
1708 #endif
1709 }
1710 
1711 // sum must fit in 32 bits
UniVecHsum32(UniVec uv)1712 HEADER_INLINE uintptr_t UniVecHsum32(UniVec uv) {
1713 #ifdef __LP64__
1714 #  ifdef USE_AVX2
1715   return ((uv.w[0] + uv.w[1] + uv.w[2] + uv.w[3]) * kMask00000001) >> 32;
1716 #  else
1717   return ((uv.w[0] + uv.w[1]) * kMask00000001) >> 32;
1718 #  endif
1719 #else
1720   return uv.w[0];
1721 #endif
1722 }
1723 
VecFHsum(VecF vecf)1724 HEADER_INLINE float VecFHsum(VecF vecf) {
1725   UniVecF uvf;
1726   uvf.vf = vecf;
1727 #ifdef __LP64__
1728 #  ifdef FVEC_32
1729   // tested various uses of _mm256_hadd_ps, couldn't get them to be faster
1730   return uvf.f4[0] + uvf.f4[1] + uvf.f4[2] + uvf.f4[3] + uvf.f4[4] + uvf.f4[5] + uvf.f4[6] + uvf.f4[7];
1731 #  else
1732   return uvf.f4[0] + uvf.f4[1] + uvf.f4[2] + uvf.f4[3];
1733 #  endif
1734 #else
1735   return uvf.f4[0];
1736 #endif
1737 }
1738 
1739 #ifdef USE_AVX2
UnpackHalfwordToWord(uintptr_t hw)1740 HEADER_INLINE uintptr_t UnpackHalfwordToWord(uintptr_t hw) {
1741   return _pdep_u64(hw, kMask5555);
1742 }
1743 
UnpackHalfwordToWordShift1(uintptr_t hw)1744 HEADER_INLINE uintptr_t UnpackHalfwordToWordShift1(uintptr_t hw) {
1745   return _pdep_u64(hw, kMaskAAAA);
1746 }
1747 
UnpackVec8thUintTo4th(Vec8thUint hw)1748 HEADER_INLINE Vec4thUint UnpackVec8thUintTo4th(Vec8thUint hw) {
1749   return _pdep_u64(hw, kMask5555);
1750 }
1751 
PackWordToHalfword(uintptr_t ww)1752 HEADER_INLINE Halfword PackWordToHalfword(uintptr_t ww) {
1753   return _pext_u64(ww, kMask5555);
1754 }
1755 
PackWordToHalfwordMask5555(uintptr_t ww)1756 HEADER_INLINE Halfword PackWordToHalfwordMask5555(uintptr_t ww) {
1757   return _pext_u64(ww, kMask5555);
1758 }
1759 
PackWordToHalfwordMaskAAAA(uintptr_t ww)1760 HEADER_INLINE Halfword PackWordToHalfwordMaskAAAA(uintptr_t ww) {
1761   return _pext_u64(ww, kMaskAAAA);
1762 }
1763 
PackVec4thUintTo8th(Vec4thUint ww)1764 HEADER_INLINE Vec8thUint PackVec4thUintTo8th(Vec4thUint ww) {
1765   return _pext_u64(ww, kMask5555);
1766 }
1767 
PackVec8thUintTo16th(Vec8thUint ww)1768 HEADER_INLINE Vec16thUint PackVec8thUintTo16th(Vec8thUint ww) {
1769   return _pext_u64(ww, kMask5555);
1770 }
1771 
Unpack0F0F(uintptr_t hw)1772 HEADER_INLINE uintptr_t Unpack0F0F(uintptr_t hw) {
1773   return _pdep_u64(hw, kMask0F0F);
1774 }
1775 
Pack0F0F(uintptr_t ww)1776 HEADER_INLINE Halfword Pack0F0F(uintptr_t ww) {
1777   return _pext_u64(ww, kMask0F0F);
1778 }
1779 
Pack0F0FMask(uintptr_t ww)1780 HEADER_INLINE Halfword Pack0F0FMask(uintptr_t ww) {
1781   return _pext_u64(ww, kMask0F0F);
1782 }
1783 
Unpack0303(uintptr_t qw)1784 HEADER_INLINE uintptr_t Unpack0303(uintptr_t qw) {
1785   return _pdep_u64(qw, kMask0303);
1786 }
1787 
Pack0303(uintptr_t ww)1788 HEADER_INLINE Quarterword Pack0303(uintptr_t ww) {
1789   return _pext_u64(ww, kMask0303);
1790 }
1791 
Pack0303Mask(uintptr_t ww)1792 HEADER_INLINE Quarterword Pack0303Mask(uintptr_t ww) {
1793   return _pext_u64(ww, kMask0303);
1794 }
1795 
1796 // See https://stackoverflow.com/questions/21622212/how-to-perform-the-inverse-of-mm256-movemask-epi8-vpmovmskb .
InverseMovemaskFF(Vec8thUint mask)1797 HEADER_INLINE VecUc InverseMovemaskFF(Vec8thUint mask) {
1798   __m256i vmask = _mm256_set1_epi32(mask);
1799   const __m256i byte_gather = _mm256_setr_epi64x(0, kMask0101, 2 * kMask0101, 3 * kMask0101);
1800   vmask = _mm256_shuffle_epi8(vmask, byte_gather);
1801   const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfeLL);
1802   vmask = _mm256_or_si256(vmask, bit_mask);
1803   return R_CAST(VecUc, _mm256_cmpeq_epi8(vmask, _mm256_set1_epi64x(-1)));
1804 }
1805 
1806 // If we're only interested in the even bits of mask.  No need to mask out odd
1807 // bits before calling.
InverseMovespreadmaskFF(Vec4thUint mask)1808 HEADER_INLINE VecUc InverseMovespreadmaskFF(Vec4thUint mask) {
1809   __m256i vmask = _mm256_set1_epi64x(mask);
1810   const __m256i byte_gather = _mm256_setr_epi32(0, 0x01010101, 0x02020202, 0x03030303, 0x04040404, 0x05050505, 0x06060606, 0x07070707);
1811   vmask = _mm256_shuffle_epi8(vmask, byte_gather);
1812   const __m256i bit_mask = _mm256_set1_epi32(0xbfeffbfeU);
1813   vmask = _mm256_or_si256(vmask, bit_mask);
1814   return R_CAST(VecUc, _mm256_cmpeq_epi8(vmask, _mm256_set1_epi64x(-1)));
1815 }
1816 
1817 #else  // !USE_AVX2
UnpackHalfwordToWord(uintptr_t hw)1818 HEADER_INLINE uintptr_t UnpackHalfwordToWord(uintptr_t hw) {
1819 #  ifdef __LP64__
1820   hw = (hw | (hw << 16)) & kMask0000FFFF;
1821 #  endif
1822   hw = (hw | (hw << 8)) & kMask00FF;
1823   hw = (hw | (hw << 4)) & kMask0F0F;
1824   hw = (hw | (hw << 2)) & kMask3333;
1825   return ((hw | (hw << 1)) & kMask5555);
1826 }
1827 
UnpackHalfwordToWordShift1(uintptr_t hw)1828 HEADER_INLINE uintptr_t UnpackHalfwordToWordShift1(uintptr_t hw) {
1829   return UnpackHalfwordToWord(hw) << 1;
1830 }
1831 
PackWordToHalfword(uintptr_t ww)1832 HEADER_INLINE Halfword PackWordToHalfword(uintptr_t ww) {
1833   // assumes only even bits of ww can be set
1834   ww = (ww | (ww >> 1)) & kMask3333;
1835   ww = (ww | (ww >> 2)) & kMask0F0F;
1836   ww = (ww | (ww >> 4)) & kMask00FF;
1837 #  ifdef __LP64__
1838   ww = (ww | (ww >> 8)) & kMask0000FFFF;
1839 #  endif
1840   return S_CAST(Halfword, ww | (ww >> kBitsPerWordD4));
1841 }
1842 
PackWordToHalfwordMask5555(uintptr_t ww)1843 HEADER_INLINE Halfword PackWordToHalfwordMask5555(uintptr_t ww) {
1844   return PackWordToHalfword(ww & kMask5555);
1845 }
1846 
PackWordToHalfwordMaskAAAA(uintptr_t ww)1847 HEADER_INLINE Halfword PackWordToHalfwordMaskAAAA(uintptr_t ww) {
1848   return PackWordToHalfword((ww >> 1) & kMask5555);
1849 }
1850 
Unpack0F0F(uintptr_t hw)1851 HEADER_INLINE uintptr_t Unpack0F0F(uintptr_t hw) {
1852 #ifdef __LP64__
1853   hw = (hw | (hw << 16)) & kMask0000FFFF;
1854 #endif
1855   hw = (hw | (hw << 8)) & kMask00FF;
1856   return ((hw | (hw << 4)) & kMask0F0F);
1857 }
1858 
Pack0F0F(uintptr_t ww)1859 HEADER_INLINE Halfword Pack0F0F(uintptr_t ww) {
1860   ww = (ww | (ww >> 4)) & kMask00FF;
1861 #  ifdef __LP64__
1862   ww = (ww | (ww >> 8)) & kMask0000FFFF;
1863 #  endif
1864   return S_CAST(Halfword, ww | (ww >> kBitsPerWordD4));
1865 }
1866 
Pack0F0FMask(uintptr_t ww)1867 HEADER_INLINE Halfword Pack0F0FMask(uintptr_t ww) {
1868   return Pack0F0F(ww & kMask0F0F);
1869 }
1870 
Unpack0303(uintptr_t qw)1871 HEADER_INLINE uintptr_t Unpack0303(uintptr_t qw) {
1872   // ................................................fedcba9876543210
1873 #ifdef __LP64__
1874   qw = (qw | (qw << 24)) & kMask000000FF;
1875   // ........................fedcba98........................76543210
1876 #endif
1877   qw = qw | (qw << 12);
1878   // ............fedcba98....fedcba98............76543210....76543210
1879 
1880   qw = qw | (qw << 6);
1881   // ......fedcbaXXdcbaXXdcbaXXdcba98......765432XX5432XX5432XX543210
1882 
1883   return (qw & kMask0303);
1884   // ......fe......dc......ba......98......76......54......32......10
1885 }
1886 
Pack0303(uintptr_t ww)1887 HEADER_INLINE Quarterword Pack0303(uintptr_t ww) {
1888   // ......fe......dc......ba......98......76......54......32......10
1889 
1890   ww = ww | (ww >> 6);
1891   // ......fe....fedc....dcba....ba98....9876....7654....5432....3210
1892 
1893   ww = ww | (ww >> 12);
1894   // ......fe....fedc..fedcbafedcba98dcba9876ba9876549876543276543210
1895 
1896 #ifdef __LP64__
1897   ww = ww & kMask000000FF;
1898   // ........................fedcba98........................76543210
1899 
1900   return S_CAST(Quarterword, ww | (ww >> 24));
1901 #else
1902   return S_CAST(Quarterword, ww);
1903 #endif
1904 }
1905 
Pack0303Mask(uintptr_t ww)1906 HEADER_INLINE uintptr_t Pack0303Mask(uintptr_t ww) {
1907   return Pack0303(ww & kMask0303);
1908 }
1909 
1910 #  ifdef __LP64__
UnpackVec8thUintTo4th(Vec8thUint hw)1911 HEADER_INLINE Vec4thUint UnpackVec8thUintTo4th(Vec8thUint hw) {
1912   hw = (hw | (hw << 8)) & 0x00ff00ffU;
1913   hw = (hw | (hw << 4)) & 0x0f0f0f0fU;
1914   hw = (hw | (hw << 2)) & 0x33333333U;
1915   return (hw | (hw << 1)) & 0x55555555U;
1916 }
1917 
PackVec4thUintTo8th(Vec4thUint ww)1918 HEADER_INLINE Vec8thUint PackVec4thUintTo8th(Vec4thUint ww) {
1919   ww = (ww | (ww >> 1)) & kMask3333;
1920   ww = (ww | (ww >> 2)) & kMask0F0F;
1921   ww = (ww | (ww >> 4)) & kMask00FF;
1922   return S_CAST(Vec8thUint, ww | (ww >> 8));
1923 }
1924 
PackVec8thUintTo16th(Vec8thUint ww)1925 HEADER_INLINE Vec16thUint PackVec8thUintTo16th(Vec8thUint ww) {
1926   ww = (ww | (ww >> 1)) & 0x3333;
1927   ww = (ww | (ww >> 2)) & 0x0f0f;
1928   return S_CAST(Vec16thUint, ww | (ww >> 4));
1929 }
1930 
1931 #    ifdef USE_SSE42
InverseMovemaskFF(Vec8thUint mask)1932 HEADER_INLINE VecUc InverseMovemaskFF(Vec8thUint mask) {
1933   __m128i vmask = _mm_set1_epi16(mask);
1934   const __m128i byte_gather = _mm_setr_epi32(0, 0, 0x01010101, 0x01010101);
1935   vmask = _mm_shuffle_epi8(vmask, byte_gather);
1936   const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfeLL);
1937   vmask = _mm_or_si128(vmask, bit_mask);
1938   return R_CAST(VecUc, _mm_cmpeq_epi8(vmask, _mm_set1_epi64x(-1)));
1939 }
1940 
InverseMovespreadmaskFF(Vec4thUint mask)1941 HEADER_INLINE VecUc InverseMovespreadmaskFF(Vec4thUint mask) {
1942   __m128i vmask = _mm_set1_epi32(mask);
1943   const __m128i byte_gather = _mm_setr_epi32(0, 0x01010101, 0x02020202, 0x03030303);
1944   vmask = _mm_shuffle_epi8(vmask, byte_gather);
1945   const __m128i bit_mask = _mm_set1_epi32(0xbfeffbfeU);
1946   vmask = _mm_or_si128(vmask, bit_mask);
1947   return R_CAST(VecUc, _mm_cmpeq_epi8(vmask, _mm_set1_epi64x(-1)));
1948 }
1949 #    endif
1950 
1951 #  endif
1952 #endif  // !USE_AVX2
1953 
1954 // alignment must be a power of 2
1955 // tried splitting out RoundDownPow2U32() and RoundUpPow2U32() functions, no
1956 // practical difference
RoundDownPow2(uintptr_t val,uintptr_t alignment)1957 HEADER_CINLINE uintptr_t RoundDownPow2(uintptr_t val, uintptr_t alignment) {
1958   return val & (~(alignment - 1));
1959 }
1960 
RoundDownPow2U64(uint64_t val,uint64_t alignment)1961 HEADER_CINLINE uint64_t RoundDownPow2U64(uint64_t val, uint64_t alignment) {
1962   return val & (~(alignment - 1));
1963 }
1964 
RoundUpPow2(uintptr_t val,uintptr_t alignment)1965 HEADER_CINLINE uintptr_t RoundUpPow2(uintptr_t val, uintptr_t alignment) {
1966   return (val + alignment - 1) & (~(alignment - 1));
1967 }
1968 
1969 
1970 // This is best when the divisor is constant (so (divisor - 1) can be
1971 // collapsed), and handles val == 0 properly.  If the divisor isn't constant
1972 // and val is guaranteed to be nonzero, go with explicit
1973 // "1 + (val - 1) / divisor".
1974 //
1975 // Note that this fails if (val + divisor - 1) overflows the widest integer
1976 // type on the left.
1977 //
1978 // Since forced-uint32_t RoundDownPow2 was pointless, it stands to reason that
1979 // the same applies to DivUp.  With that said, we may as well make divisor a
1980 // uint32_t just in case this ever gets used on a not-known-at-compile-time
1981 // divisor, since 64/64 can be slower than 64/32.
DivUp(uintptr_t val,uint32_t divisor)1982 HEADER_CINLINE uintptr_t DivUp(uintptr_t val, uint32_t divisor) {
1983   return (val + divisor - 1) / divisor;
1984 }
1985 
DivUpU64(uint64_t val,uint32_t divisor)1986 HEADER_CINLINE uint64_t DivUpU64(uint64_t val, uint32_t divisor) {
1987   return (val + divisor - 1) / divisor;
1988 }
1989 
1990 // "Nz" means nonzero in two ways:
1991 // * result is in [1, modulus], not [0, modulus - 1]
1992 // * val should not be zero (though this expression still works if val is zero
1993 //   and modulus is a hardcoded power of 2)
ModNz(uintptr_t val,uint32_t modulus)1994 HEADER_INLINE uint32_t ModNz(uintptr_t val, uint32_t modulus) {
1995   return (1 + ((val - 1) % modulus));
1996 }
1997 
1998 // No need for ModNzU64 in practice, since high bits don't affect result when
1999 // modulus is a power of 2.
2000 
2001 // Equivalent to (static_cast<int32_t>(uii) < 0).  Most frequently used on
2002 // possibly-error chromosome indexes.
IsI32Neg(uint32_t uii)2003 HEADER_INLINE uint32_t IsI32Neg(uint32_t uii) {
2004   return uii >> 31;
2005 }
2006 
abs_i32(int32_t ii)2007 HEADER_INLINE uint32_t abs_i32(int32_t ii) {
2008   // Arithmetic right shift.  0xffffffffU when ii is negative, 0 otherwise.
2009   const uint32_t neg_sign_bit = S_CAST(uint32_t, ii >> 31);
2010 
2011   return (S_CAST(uint32_t, ii) ^ neg_sign_bit) - neg_sign_bit;
2012 }
2013 
2014 extern uintptr_t g_failed_alloc_attempt_size;
2015 // with NDEBUG undefined, may want to define a bunch of macros so that line
2016 // number is printed as well; see e.g.
2017 //   https://stackoverflow.com/questions/15884793/how-to-get-the-name-or-file-and-line-of-caller-method
2018 
2019 #if (__GNUC__ == 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
2020 // putting this in the header file caused a bunch of gcc 4.4 strict-aliasing
2021 // warnings, while not doing so seems to inhibit some malloc-related compiler
2022 // optimizations, bleah
2023 // compromise: header-inline iff gcc version >= 4.7 (might not be the right
2024 // cutoff?)
2025 BoolErr pgl_malloc(uintptr_t size, void* pp);
2026 #else
2027 // Unfortunately, defining the second parameter to be of type void** doesn't do
2028 // the right thing.
pgl_malloc(uintptr_t size,void * pp)2029 HEADER_INLINE BoolErr pgl_malloc(uintptr_t size, void* pp) {
2030   *S_CAST(unsigned char**, pp) = S_CAST(unsigned char*, malloc(size));
2031   if (likely(*S_CAST(unsigned char**, pp))) {
2032     return 0;
2033   }
2034   g_failed_alloc_attempt_size = size;
2035   return 1;
2036 }
2037 #endif
2038 
2039 // This must be used for all fwrite() calls where len could be >= 2^31, since
2040 // OS X raw fwrite() doesn't work in that case.
2041 static_assert(sizeof(size_t) == sizeof(intptr_t), "plink2_base assumes size_t and intptr_t are synonymous.");
2042 BoolErr fwrite_checked(const void* buf, uintptr_t len, FILE* outfile);
2043 
2044 // Only use this if loading < len bytes is not an error.
2045 // IntErr fread_checked2(void* buf, uintptr_t len, FILE* infile, uintptr_t* bytes_read_ptr);
2046 
2047 BoolErr fread_checked(void* buf, uintptr_t len, FILE* infile);
2048 
fclose_null(FILE ** fptr_ptr)2049 HEADER_INLINE BoolErr fclose_null(FILE** fptr_ptr) {
2050   int32_t ii = ferror_unlocked(*fptr_ptr);
2051   int32_t jj = fclose(*fptr_ptr);
2052   *fptr_ptr = nullptr;
2053   return ii || jj;
2054 }
2055 
2056 
2057 #ifdef __LP64__
2058 // Reads an integer in [1, cap].
2059 // * Errors out unless first character is a digit, or is '+' followed by a
2060 //   digit.  Initial whitespace is not permitted.
2061 // * Like atoi(), this considereds the number to be terminated by *any*
2062 //   nondigit character.  E.g. "1000genomes" is treated as a valid instance of
2063 //   1000 rather than a nonnumeric token, and "98.6" is treated as 98.  (See
2064 //   ScanmovPosintCapped(), ScanmovUintCapped(), etc. in plink2_string if
2065 //   you want strtol-like semantics, where the pointer is moved.)
2066 // * Errors out on overflow.  This may be the biggest advantage over atoi().
2067 BoolErr ScanPosintCapped(const char* str_iter, uint64_t cap, uint32_t* valp);
2068 
2069 // [0, cap]
2070 BoolErr ScanUintCapped(const char* str_iter, uint64_t cap, uint32_t* valp);
2071 
2072 // [-bound, bound]
2073 BoolErr ScanIntAbsBounded(const char* str_iter, uint64_t bound, int32_t* valp);
2074 #else  // not __LP64__
2075 // Need to be more careful in 32-bit case due to overflow.
2076 // A funny-looking div_10/mod_10 interface is used since the cap will usually
2077 // be a constant, and we want the integer division/modulus to occur at compile
2078 // time.
2079 BoolErr ScanPosintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp);
2080 
2081 BoolErr ScanUintCapped32(const char* str_iter, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp);
2082 
2083 BoolErr ScanIntAbsBounded32(const char* str_iter, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp);
2084 
ScanPosintCapped(const char * str,uint32_t cap,uint32_t * valp)2085 HEADER_INLINE BoolErr ScanPosintCapped(const char* str, uint32_t cap, uint32_t* valp) {
2086   return ScanPosintCapped32(str, cap / 10, cap % 10, valp);
2087 }
2088 
ScanUintCapped(const char * str,uint32_t cap,uint32_t * valp)2089 HEADER_INLINE BoolErr ScanUintCapped(const char* str, uint32_t cap, uint32_t* valp) {
2090   return ScanUintCapped32(str, cap / 10, cap % 10, valp);
2091 }
2092 
ScanIntAbsBounded(const char * str,uint32_t bound,int32_t * valp)2093 HEADER_INLINE BoolErr ScanIntAbsBounded(const char* str, uint32_t bound, int32_t* valp) {
2094   return ScanIntAbsBounded32(str, bound / 10, bound % 10, valp);
2095 }
2096 #endif
2097 
2098 
2099 // intentionally rejects -2^31 for now
2100 // (that's a reason why this doesn't have the shorter name 'ScanI32')
ScanInt32(const char * str,int32_t * valp)2101 HEADER_INLINE BoolErr ScanInt32(const char* str, int32_t* valp) {
2102   return ScanIntAbsBounded(str, 0x7fffffff, valp);
2103 }
2104 
2105 // default cap = 0x7ffffffe
ScanPosintDefcap(const char * str,uint32_t * valp)2106 HEADER_INLINE BoolErr ScanPosintDefcap(const char* str, uint32_t* valp) {
2107   return ScanPosintCapped(str, 0x7ffffffe, valp);
2108 }
2109 
ScanUintDefcap(const char * str,uint32_t * valp)2110 HEADER_INLINE BoolErr ScanUintDefcap(const char* str, uint32_t* valp) {
2111   return ScanUintCapped(str, 0x7ffffffe, valp);
2112 }
2113 
ScanIntAbsDefcap(const char * str,int32_t * valp)2114 HEADER_INLINE BoolErr ScanIntAbsDefcap(const char* str, int32_t* valp) {
2115   return ScanIntAbsBounded(str, 0x7ffffffe, valp);
2116 }
2117 
ScanUintIcap(const char * str,uint32_t * valp)2118 HEADER_INLINE BoolErr ScanUintIcap(const char* str, uint32_t* valp) {
2119   return ScanUintCapped(str, 0x7fffffff, valp);
2120 }
2121 
2122 
2123 // memcpya() tends to be used to copy known-length text strings, while
2124 // memseta() has more mixed usage but char* type is also at least as common as
2125 // unsigned char*; append comes up less when working with raw byte arrays.  So
2126 // give the shortest-name forms char* return types.
memseta(void * target,unsigned char val,uintptr_t ct)2127 HEADER_INLINE char* memseta(void* target, unsigned char val, uintptr_t ct) {
2128   memset(target, val, ct);
2129   return &(S_CAST(char*, target)[ct]);
2130 }
2131 
memsetua(void * target,unsigned char val,uintptr_t ct)2132 HEADER_INLINE unsigned char* memsetua(void* target, unsigned char val, uintptr_t ct) {
2133   memset(target, val, ct);
2134   return &(S_CAST(unsigned char*, target)[ct]);
2135 }
2136 
BitCtToVecCt(uintptr_t val)2137 HEADER_CINLINE uintptr_t BitCtToVecCt(uintptr_t val) {
2138   return DivUp(val, kBitsPerVec);
2139 }
2140 
BitCtToWordCt(uintptr_t val)2141 HEADER_CINLINE uintptr_t BitCtToWordCt(uintptr_t val) {
2142   return DivUp(val, kBitsPerWord);
2143 }
2144 
BitCtToAlignedWordCt(uintptr_t val)2145 HEADER_CINLINE uintptr_t BitCtToAlignedWordCt(uintptr_t val) {
2146   return kWordsPerVec * BitCtToVecCt(val);
2147 }
2148 
BitCtToCachelineCt(uintptr_t val)2149 HEADER_CINLINE uintptr_t BitCtToCachelineCt(uintptr_t val) {
2150   return DivUp(val, kBitsPerCacheline);
2151 }
2152 
Int32CtToVecCt(uintptr_t val)2153 HEADER_CINLINE uintptr_t Int32CtToVecCt(uintptr_t val) {
2154   return DivUp(val, kInt32PerVec);
2155 }
2156 
Int32CtToCachelineCt(uintptr_t val)2157 HEADER_CINLINE uintptr_t Int32CtToCachelineCt(uintptr_t val) {
2158   return DivUp(val, kInt32PerCacheline);
2159 }
2160 
WordCtToVecCt(uintptr_t val)2161 HEADER_CINLINE uintptr_t WordCtToVecCt(uintptr_t val) {
2162   return DivUp(val, kWordsPerVec);
2163 }
2164 
WordCtToCachelineCtU64(uint64_t val)2165 HEADER_CINLINE uint64_t WordCtToCachelineCtU64(uint64_t val) {
2166   return DivUpU64(val, kWordsPerCacheline);
2167 }
2168 
2169 #ifdef __LP64__
Int64CtToVecCt(uintptr_t val)2170 HEADER_CINLINE uintptr_t Int64CtToVecCt(uintptr_t val) {
2171   return DivUp(val, kBytesPerVec / 8);
2172 }
2173 #else
Int64CtToVecCt(uintptr_t val)2174 HEADER_CINLINE uintptr_t Int64CtToVecCt(uintptr_t val) {
2175   return val * 2;
2176 }
2177 #endif
2178 
Int64CtToCachelineCt(uintptr_t val)2179 HEADER_CINLINE uintptr_t Int64CtToCachelineCt(uintptr_t val) {
2180   return DivUp(val, kInt64PerCacheline);
2181 }
2182 
DblCtToVecCt(uintptr_t val)2183 HEADER_CINLINE uintptr_t DblCtToVecCt(uintptr_t val) {
2184   return Int64CtToVecCt(val);
2185 }
2186 
VecCtToCachelineCt(uintptr_t val)2187 HEADER_CINLINE uintptr_t VecCtToCachelineCt(uintptr_t val) {
2188   return DivUp(val, kVecsPerCacheline);
2189 }
2190 
VecCtToCachelineCtU64(uint64_t val)2191 HEADER_CINLINE uint64_t VecCtToCachelineCtU64(uint64_t val) {
2192   return DivUpU64(val, kVecsPerCacheline);
2193 }
2194 
2195 // C++11 standard guarantees std::min and std::max return leftmost minimum in
2196 // case of equality; best to adhere to that
2197 // We don't actually use std::min/max since casting one argument when comparing
2198 // e.g. a uint32_t with a uintptr_t is pointlessly verbose.  Compiler will
2199 // still warn against comparison of signed with unsigned.
2200 #define MAXV(aa, bb) (((bb) > (aa))? (bb) : (aa))
2201 #define MINV(aa, bb) (((bb) < (aa))? (bb) : (aa))
2202 
2203 
2204 // don't use PglErr here since there's only one failure mode, it's
2205 // obvious what it is, and stacking multiple aligned_mallocs in a single
2206 // if-statement is useful.
2207 BoolErr aligned_malloc(uintptr_t size, uintptr_t alignment, void* aligned_pp);
2208 
2209 #ifdef USE_SSE42
NypsumWord(uintptr_t val)2210 HEADER_CINLINE uint32_t NypsumWord(uintptr_t val) {
2211   return __builtin_popcountll(val) + __builtin_popcountll(val & kMaskAAAA);
2212 }
2213 #else
NypsumWord(uintptr_t val)2214 HEADER_CINLINE2 uint32_t NypsumWord(uintptr_t val) {
2215   val = (val & kMask3333) + ((val >> 2) & kMask3333);
2216   return (((val + (val >> 4)) & kMask0F0F) * kMask0101) >> (kBitsPerWord - 8);
2217 }
2218 #endif
2219 
2220 // the simple version, good enough for all non-time-critical stuff
2221 // (without SSE4.2, PopcountWords() tends to be >3x as fast on arrays.  with
2222 // SSE4.2 but no AVX2, there's no noticeable difference.  with AVX2,
2223 // PopcountWords() gains another factor of 1.5-2x.)
2224 #ifdef USE_SSE42
PopcountWord(uintptr_t val)2225 HEADER_CINLINE uint32_t PopcountWord(uintptr_t val) {
2226   return __builtin_popcountll(val);
2227 }
2228 #else
PopcountWord(uintptr_t val)2229 HEADER_CINLINE2 uint32_t PopcountWord(uintptr_t val) {
2230   // Sadly, this was still faster than the LLVM implementation of the intrinsic
2231   // as of 2016.
2232   return NypsumWord(val - ((val >> 1) & kMask5555));
2233 }
2234 #endif
2235 
2236 #ifdef USE_SSE42
Popcount2Words(uintptr_t val0,uintptr_t val1)2237 HEADER_INLINE uint32_t Popcount2Words(uintptr_t val0, uintptr_t val1) {
2238   return __builtin_popcountll(val0) + __builtin_popcountll(val1);
2239 }
2240 #else
Popcount2Words(uintptr_t val0,uintptr_t val1)2241 HEADER_INLINE uint32_t Popcount2Words(uintptr_t val0, uintptr_t val1) {
2242   val0 -= (val0 >> 1) & kMask5555;
2243   val1 -= (val1 >> 1) & kMask5555;
2244   const uintptr_t four_bit = (val0 & kMask3333) + ((val0 >> 2) & kMask3333) + (val1 & kMask3333) + ((val1 >> 2) & kMask3333);
2245   // up to 16 values in 0..12; sum fits in 8 bits
2246   return (((four_bit & kMask0F0F) + ((four_bit >> 4) & kMask0F0F)) * kMask0101) >> (kBitsPerWord - 8);
2247 }
2248 #endif
2249 
2250 #ifndef __LP64__
Popcount4Words(uintptr_t val0,uintptr_t val1,uintptr_t val2,uintptr_t val3)2251 HEADER_INLINE uint32_t Popcount4Words(uintptr_t val0, uintptr_t val1, uintptr_t val2, uintptr_t val3) {
2252   val0 -= (val0 >> 1) & kMask5555;
2253   val1 -= (val1 >> 1) & kMask5555;
2254   val2 -= (val2 >> 1) & kMask5555;
2255   val3 -= (val3 >> 1) & kMask5555;
2256   const uintptr_t four_bit_0 = (val0 & kMask3333) + ((val0 >> 2) & kMask3333) + (val1 & kMask3333) + ((val1 >> 2) & kMask3333);
2257   const uintptr_t four_bit_1 = (val2 & kMask3333) + ((val2 >> 2) & kMask3333) + (val3 & kMask3333) + ((val3 >> 2) & kMask3333);
2258   return (((four_bit_0 & kMask0F0F) + ((four_bit_0 >> 4) & kMask0F0F) + (four_bit_1 & kMask0F0F) + ((four_bit_1 >> 4) & kMask0F0F)) * kMask0101) >> (kBitsPerWord - 8);
2259 }
2260 #endif
2261 
2262 #ifdef __LP64__
2263 #  ifdef USE_SSE42
PopcountVec8thUint(uint32_t val)2264 HEADER_INLINE uint32_t PopcountVec8thUint(uint32_t val) {
2265   return __builtin_popcount(val);
2266 }
2267 #  else
PopcountVec8thUint(uint32_t val)2268 HEADER_INLINE uint32_t PopcountVec8thUint(uint32_t val) {
2269   // May as well exploit the fact that only the low 15 bits may be set.
2270   val = val - ((val >> 1) & 0x5555);
2271   val = (val & 0x3333) + ((val >> 2) & 0x3333);
2272   val = (val + (val >> 4)) & 0x0f0f;
2273   return (val + (val >> 8)) & 0xff;
2274 }
2275 #  endif
2276 #endif
2277 
VecIsAligned(const void * ptr)2278 HEADER_INLINE uint32_t VecIsAligned(const void* ptr) {
2279   return !(R_CAST(uintptr_t, ptr) % kBytesPerVec);
2280 }
2281 
VecAlignUp(void * pp)2282 HEADER_INLINE void VecAlignUp(void* pp) {
2283   uintptr_t addr = *S_CAST(uintptr_t*, pp);
2284 #if (__GNUC__ == 4) && (__GNUC_MINOR__ < 7) && !defined(__APPLE__)
2285   // bleah, need to write this way to avoid gcc 4.4 strict-aliasing warning
2286   addr = RoundUpPow2(addr, kBytesPerVec);
2287   memcpy(pp, &addr, sizeof(intptr_t));
2288 #else
2289   *S_CAST(uintptr_t*, pp) = RoundUpPow2(addr, kBytesPerVec);
2290 #endif
2291 }
2292 
2293 #ifdef __LP64__
VecAlignUp64(void * pp)2294 HEADER_INLINE void VecAlignUp64(void* pp) {
2295   VecAlignUp(pp);
2296 }
2297 #else
VecAlignUp64(__maybe_unused void * pp)2298 HEADER_INLINE void VecAlignUp64(__maybe_unused void* pp) {
2299 }
2300 #endif
2301 
2302 // Turns out memcpy(&cur_word, bytearr, ct) can't be trusted to be fast when ct
2303 // isn't known at compile time.
2304 //
2305 // ct must be less than sizeof(intptr_t).  ct == 0 handled correctly, albeit
2306 // inefficiently.
ProperSubwordLoad(const void * bytearr,uint32_t ct)2307 HEADER_INLINE uintptr_t ProperSubwordLoad(const void* bytearr, uint32_t ct) {
2308   const unsigned char* bytearr_uc = S_CAST(const unsigned char*, bytearr);
2309 #ifdef __LP64__
2310   if (ct >= 4) {
2311     const uint32_t remainder = ct - 4;
2312     bytearr_uc = &(bytearr_uc[remainder]);
2313     uintptr_t cur_word = *R_CAST(const uint32_t*, bytearr_uc);
2314     if (remainder) {
2315       cur_word <<= remainder * CHAR_BIT;
2316       cur_word |= *S_CAST(const uint32_t*, bytearr);
2317     }
2318     return cur_word;
2319   }
2320 #endif
2321   if (ct >= 2) {
2322     const uint32_t remainder = ct & 1;
2323     uintptr_t cur_word = *R_CAST(const uint16_t*, &(bytearr_uc[remainder]));
2324     if (remainder) {
2325       cur_word <<= 8;
2326       cur_word |= bytearr_uc[0];
2327     }
2328     return cur_word;
2329   }
2330   return ct? bytearr_uc[0] : 0;
2331 }
2332 
SubwordLoad(const void * bytearr,uint32_t ct)2333 HEADER_INLINE uintptr_t SubwordLoad(const void* bytearr, uint32_t ct) {
2334   if (ct == S_CAST(uint32_t, kBytesPerWord)) {
2335     return *S_CAST(const uintptr_t*, bytearr);
2336   }
2337   return ProperSubwordLoad(bytearr, ct);
2338 }
2339 
2340 // ct must be in 1..4.
SubU32Load(const void * bytearr,uint32_t ct)2341 HEADER_INLINE uint32_t SubU32Load(const void* bytearr, uint32_t ct) {
2342   if (ct & 1) {
2343     const unsigned char* bytearr_iter = S_CAST(const unsigned char*, bytearr);
2344     uint32_t cur_uint = *bytearr_iter;
2345     if (ct == 3) {
2346       ++bytearr_iter;
2347       cur_uint |= S_CAST(uint32_t, *R_CAST(const uint16_t*, bytearr_iter)) << 8;
2348     }
2349     return cur_uint;
2350   }
2351   if (ct == 2) {
2352     return *S_CAST(const uint16_t*, bytearr);
2353   }
2354   return *S_CAST(const uint32_t*, bytearr);
2355 }
2356 
2357 // tried making this non-inline, loop took more than 50% longer
ProperSubwordStore(uintptr_t cur_word,uint32_t byte_ct,void * target)2358 HEADER_INLINE void ProperSubwordStore(uintptr_t cur_word, uint32_t byte_ct, void* target) {
2359   unsigned char* target_iter = S_CAST(unsigned char*, target);
2360 #ifdef __LP64__
2361   if (byte_ct >= 4) {
2362     *R_CAST(uint32_t*, target_iter) = cur_word;
2363     if (byte_ct == 4) {
2364       return;
2365     }
2366     const uint32_t remainder = byte_ct - 4;
2367     target_iter = &(target_iter[remainder]);
2368     cur_word >>= remainder * CHAR_BIT;
2369     *R_CAST(uint32_t*, target_iter) = cur_word;
2370     return;
2371   }
2372 #endif
2373   if (byte_ct & 1) {
2374     *target_iter++ = cur_word;
2375     cur_word >>= 8;
2376   }
2377   if (byte_ct & 2) {
2378     *R_CAST(uint16_t*, target_iter) = cur_word;
2379   }
2380 }
2381 
ProperSubwordStoreMov(uintptr_t cur_word,uint32_t byte_ct,unsigned char ** targetp)2382 HEADER_INLINE void ProperSubwordStoreMov(uintptr_t cur_word, uint32_t byte_ct, unsigned char** targetp) {
2383   ProperSubwordStore(cur_word, byte_ct, *targetp);
2384   *targetp += byte_ct;
2385 }
2386 
SubwordStore(uintptr_t cur_word,uint32_t byte_ct,void * target)2387 HEADER_INLINE void SubwordStore(uintptr_t cur_word, uint32_t byte_ct, void* target) {
2388   if (byte_ct == kBytesPerWord) {
2389     *S_CAST(uintptr_t*, target) = cur_word;
2390     return;
2391   }
2392   ProperSubwordStore(cur_word, byte_ct, target);
2393 }
2394 
SubwordStoreMov(uintptr_t cur_word,uint32_t byte_ct,unsigned char ** targetp)2395 HEADER_INLINE void SubwordStoreMov(uintptr_t cur_word, uint32_t byte_ct, unsigned char** targetp) {
2396   SubwordStore(cur_word, byte_ct, *targetp);
2397   *targetp += byte_ct;
2398 }
2399 
2400 // byte_ct must be in 1..4.
SubU32Store(uint32_t cur_uint,uint32_t byte_ct,void * target)2401 HEADER_INLINE void SubU32Store(uint32_t cur_uint, uint32_t byte_ct, void* target) {
2402   if (byte_ct & 1) {
2403     unsigned char* target_iter = S_CAST(unsigned char*, target);
2404     *target_iter = cur_uint;
2405     if (byte_ct == 3) {
2406       ++target_iter;
2407       *R_CAST(uint16_t*, target_iter) = cur_uint >> 8;
2408     }
2409     return;
2410   }
2411   if (byte_ct == 2) {
2412     *S_CAST(uint16_t*, target) = cur_uint;
2413     return;
2414   }
2415   *S_CAST(uint32_t*, target) = cur_uint;
2416   return;
2417 }
2418 
SubU32StoreMov(uint32_t cur_uint,uint32_t byte_ct,unsigned char ** targetp)2419 HEADER_INLINE void SubU32StoreMov(uint32_t cur_uint, uint32_t byte_ct, unsigned char** targetp) {
2420   SubU32Store(cur_uint, byte_ct, *targetp);
2421   *targetp += byte_ct;
2422 }
2423 
2424 #ifdef __LP64__
SubU64StoreMov(uint64_t cur_u64,uint32_t byte_ct,unsigned char ** targetp)2425 HEADER_INLINE void SubU64StoreMov(uint64_t cur_u64, uint32_t byte_ct, unsigned char** targetp) {
2426   return SubwordStoreMov(cur_u64, byte_ct, targetp);
2427 }
2428 #else
SubU64StoreMov(uint64_t cur_u64,uint32_t byte_ct,unsigned char ** targetp)2429 HEADER_INLINE void SubU64StoreMov(uint64_t cur_u64, uint32_t byte_ct, unsigned char** targetp) {
2430   if (byte_ct > 4) {
2431     *R_CAST(uint32_t*, *targetp) = cur_u64;
2432     *targetp += 4;
2433     byte_ct -= 4;
2434     cur_u64 >>= 32;
2435   }
2436   return SubU32StoreMov(cur_u64, byte_ct, targetp);
2437 }
2438 #endif
2439 
2440 
vecaligned_malloc(uintptr_t size,void * aligned_pp)2441 HEADER_INLINE BoolErr vecaligned_malloc(uintptr_t size, void* aligned_pp) {
2442 #ifdef USE_AVX2
2443   return aligned_malloc(size, kBytesPerVec, aligned_pp);
2444 #else
2445 #  if defined(__APPLE__) || !defined(__LP64__)
2446   const BoolErr ret_boolerr = pgl_malloc(size, aligned_pp);
2447   assert(VecIsAligned(*S_CAST(uintptr_t**, aligned_pp)));
2448   return ret_boolerr;
2449 #  else
2450   return aligned_malloc(size, kBytesPerVec, aligned_pp);
2451 #  endif
2452 #endif
2453 }
2454 
cachealigned_malloc(uintptr_t size,void * aligned_pp)2455 HEADER_INLINE BoolErr cachealigned_malloc(uintptr_t size, void* aligned_pp) {
2456   return aligned_malloc(size, kCacheline, aligned_pp);
2457 }
2458 
aligned_free(void * aligned_ptr)2459 HEADER_INLINE void aligned_free(void* aligned_ptr) {
2460   free(R_CAST(void*, S_CAST(uintptr_t*, aligned_ptr)[-1]));
2461 }
2462 
aligned_free_cond(void * aligned_ptr)2463 HEADER_INLINE void aligned_free_cond(void* aligned_ptr) {
2464   if (aligned_ptr) {
2465     free(R_CAST(void*, S_CAST(uintptr_t*, aligned_ptr)[-1]));
2466   }
2467 }
2468 
2469 // C spec is slightly broken here
free_const(const void * memptr)2470 HEADER_INLINE void free_const(const void* memptr) {
2471   free(K_CAST(void*, memptr));
2472 }
2473 
free_cond(const void * memptr)2474 HEADER_INLINE void free_cond(const void* memptr) {
2475   if (memptr) {
2476     free_const(memptr);
2477   }
2478 }
2479 
2480 #ifdef USE_AVX2
vecaligned_free(void * aligned_ptr)2481 HEADER_INLINE void vecaligned_free(void* aligned_ptr) {
2482   aligned_free(aligned_ptr);
2483 }
2484 
vecaligned_free_cond(void * aligned_ptr)2485 HEADER_INLINE void vecaligned_free_cond(void* aligned_ptr) {
2486   aligned_free_cond(aligned_ptr);
2487 }
2488 #else
2489 #  if defined(__APPLE__) || !defined(__LP64__)
vecaligned_free(void * aligned_ptr)2490 HEADER_INLINE void vecaligned_free(void* aligned_ptr) {
2491   free(aligned_ptr);
2492 }
2493 
vecaligned_free_cond(void * aligned_ptr)2494 HEADER_INLINE void vecaligned_free_cond(void* aligned_ptr) {
2495   free_cond(aligned_ptr);
2496 }
2497 #  else
vecaligned_free(void * aligned_ptr)2498 HEADER_INLINE void vecaligned_free(void* aligned_ptr) {
2499   aligned_free(aligned_ptr);
2500 }
2501 
vecaligned_free_cond(void * aligned_ptr)2502 HEADER_INLINE void vecaligned_free_cond(void* aligned_ptr) {
2503   aligned_free_cond(aligned_ptr);
2504 }
2505 #  endif
2506 #endif
2507 
2508 
2509 #ifdef __LP64__
2510 int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct);
2511 #else
memequal(const void * m1,const void * m2,uintptr_t byte_ct)2512 HEADER_INLINE int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) {
2513   return !memcmp(m1, m2, byte_ct);
2514 }
2515 #endif
2516 
memcpya(void * __restrict target,const void * __restrict source,uintptr_t ct)2517 HEADER_INLINE char* memcpya(void* __restrict target, const void* __restrict source, uintptr_t ct) {
2518   memcpy(target, source, ct);
2519   return &(S_CAST(char*, target)[ct]);
2520 }
2521 
memcpyua(void * __restrict target,const void * __restrict source,uintptr_t ct)2522 HEADER_INLINE unsigned char* memcpyua(void* __restrict target, const void* __restrict source, uintptr_t ct) {
2523   memcpy(target, source, ct);
2524   return &(S_CAST(unsigned char*, target)[ct]);
2525 }
2526 
AppendU16(uint32_t usii,unsigned char ** targetp)2527 HEADER_INLINE void AppendU16(uint32_t usii, unsigned char** targetp) {
2528   memcpy(*targetp, &usii, sizeof(int16_t));
2529   *targetp += sizeof(int16_t);
2530 }
2531 
AppendU32(uint32_t uii,unsigned char ** targetp)2532 HEADER_INLINE void AppendU32(uint32_t uii, unsigned char** targetp) {
2533   memcpy(*targetp, &uii, sizeof(int32_t));
2534   *targetp += sizeof(int32_t);
2535 }
2536 
2537 // Tried beating memcpy for usually-small strings not known to have length <=
2538 // 8, gave up.
2539 
2540 #if defined(__LP64__) && defined(__cplusplus)
2541 // See https://stackoverflow.com/questions/9510514/integer-range-based-template-specialisation .
2542 
2543 template <bool> struct TRange;
2544 
2545 // This makes MemequalKImpl<byte_ct> expand to
2546 // MemequalKImpl<byte_ct, TRange<true> >.
2547 // If a later single-parameter template defines the same thing, that takes
2548 // precedence.
2549 template <uint32_t N, typename = TRange<true> > struct MemequalKImpl {
MemequalKMemequalKImpl2550   static int32_t MemequalK(const void* m1, const void* m2) {
2551     return memequal(m1, m2, N);
2552   }
2553 };
2554 
2555 template <> struct MemequalKImpl<1> {
2556   static int32_t MemequalK(const void* m1, const void* m2) {
2557     const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
2558     const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
2559     return (m1_uc[0] == m2_uc[0]);
2560   }
2561 };
2562 
2563 template <> struct MemequalKImpl<2> {
2564   static int32_t MemequalK(const void* m1, const void* m2) {
2565     return ((*R_CAST(const uint16_t*, m1)) == (*R_CAST(const uint16_t*, m2)));
2566   }
2567 };
2568 
2569 template <> struct MemequalKImpl<3> {
2570   static int32_t MemequalK(const void* m1, const void* m2) {
2571     const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
2572     const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
2573     return
2574       ((*R_CAST(const uint16_t*, m1)) == (*R_CAST(const uint16_t*, m2))) &&
2575       (m1_uc[2] == m2_uc[2]);
2576   }
2577 };
2578 
2579 template <> struct MemequalKImpl<4> {
2580   static int32_t MemequalK(const void* m1, const void* m2) {
2581     return ((*R_CAST(const uint32_t*, m1)) == (*R_CAST(const uint32_t*, m2)));
2582   }
2583 };
2584 
2585 template <uint32_t N> struct MemequalKImpl<N, TRange<(5 <= N) && (N <= 7)> > {
2586   static int32_t MemequalK(const void* m1, const void* m2) {
2587     const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
2588     const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
2589     return
2590       ((*R_CAST(const uint32_t*, m1)) == (*R_CAST(const uint32_t*, m2))) &&
2591       ((*R_CAST(const uint32_t*, &(m1_uc[N - 4]))) == (*R_CAST(const uint32_t*, &(m2_uc[N - 4]))));
2592   }
2593 };
2594 
2595 template <> struct MemequalKImpl<8> {
2596   static int32_t MemequalK(const void* m1, const void* m2) {
2597     return ((*R_CAST(const uint64_t*, m1)) == (*R_CAST(const uint64_t*, m2)));
2598   }
2599 };
2600 
2601 template <uint32_t N> struct MemequalKImpl<N, TRange<(9 <= N) && (N <= 15)> > {
2602   static int32_t MemequalK(const void* m1, const void* m2) {
2603     const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
2604     const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
2605     return
2606       ((*R_CAST(const uint64_t*, m1)) == (*R_CAST(const uint64_t*, m2))) &&
2607       ((*R_CAST(const uint64_t*, &(m1_uc[N - 8]))) == (*R_CAST(const uint64_t*, &(m2_uc[N - 8]))));
2608   }
2609 };
2610 
2611 template <> struct MemequalKImpl<16> {
2612   static int32_t MemequalK(const void* m1, const void* m2) {
2613     const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1));
2614     const __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2));
2615     return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535);
2616   }
2617 };
2618 
2619 template <uint32_t N> struct MemequalKImpl<N, TRange<(17 <= N) && (N <= 24)> > {
2620   static int32_t MemequalK(const void* m1, const void* m2) {
2621     const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
2622     const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
2623     const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1));
2624     const __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2));
2625     return
2626       (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535) &&
2627       ((*R_CAST(const uint64_t*, &(m1_uc[N - 8]))) == (*R_CAST(const uint64_t*, &(m2_uc[N - 8]))));
2628   }
2629 };
2630 
2631 template <uint32_t N> struct MemequalKImpl<N, TRange<(25 <= N) && (N <= 31)> > {
2632   static int32_t MemequalK(const void* m1, const void* m2) {
2633     __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1));
2634     __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2));
2635     if (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) != 65535) {
2636       return 0;
2637     }
2638     const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
2639     const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
2640     v1 = _mm_loadu_si128(R_CAST(const __m128i*, &(m1_uc[N - 16])));
2641     v2 = _mm_loadu_si128(R_CAST(const __m128i*, &(m2_uc[N - 16])));
2642     return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535);
2643   }
2644 };
2645 
2646 #  define memequal_k(m1, m2, byte_ct) plink2::MemequalKImpl<byte_ct>::MemequalK(m1, m2)
2647 
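// Usage sketch: token_start is a hypothetical const char* into a larger
// buffer with at least 5 readable bytes, and byte_ct must be a compile-time
// constant so that the right specialization can be selected.
//   if (memequal_k(token_start, "chr21", 5)) {
//     // handle chromosome 21
//   }
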
2648 template <uint32_t N, typename = TRange<true> > struct MemcpyKImpl {
2649   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2650     memcpy(dst, src, N);
2651   }
2652 };
2653 
2654 // Patch a bunch of cases where some commonly-used gcc and clang versions
2655 // generate suboptimal code.  (Since this code is shamelessly x86-specific, we
2656 // don't worry about the formal undefinedness of unaligned pointer dereferences
2657 // here.)
2658 // (todo: check if/when this has been fixed, and remove this bloat once all
2659 // production build machines have sufficiently new compilers.)
2660 template <> struct MemcpyKImpl<2> {
2661   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2662     *S_CAST(uint16_t*, dst) = *S_CAST(const uint16_t*, src);
2663   }
2664 };
2665 
2666 template <> struct MemcpyKImpl<3> {
2667   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2668     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2669     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2670     *S_CAST(uint16_t*, dst) = *S_CAST(const uint16_t*, src);
2671     dst_uc[2] = src_uc[2];
2672   }
2673 };
2674 
2675 template <> struct MemcpyKImpl<5> {
2676   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2677     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2678     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2679     *S_CAST(uint32_t*, dst) = *S_CAST(const uint32_t*, src);
2680     dst_uc[4] = src_uc[4];
2681   }
2682 };
2683 
2684 template <> struct MemcpyKImpl<6> {
2685   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2686     uint16_t* dst_u16 = S_CAST(uint16_t*, dst);
2687     const uint16_t* src_u16 = S_CAST(const uint16_t*, src);
2688     *S_CAST(uint32_t*, dst) = *S_CAST(const uint32_t*, src);
2689     dst_u16[2] = src_u16[2];
2690   }
2691 };
2692 
2693 template <> struct MemcpyKImpl<7> {
2694   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2695     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2696     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2697     *S_CAST(uint32_t*, dst) = *S_CAST(const uint32_t*, src);
2698     *R_CAST(uint32_t*, &(dst_uc[3])) = *R_CAST(const uint32_t*, &(src_uc[3]));
2699   }
2700 };
2701 
2702 template <> struct MemcpyKImpl<9> {
2703   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2704     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2705     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2706     *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
2707     dst_uc[8] = src_uc[8];
2708   }
2709 };
2710 
2711 template <> struct MemcpyKImpl<10> {
2712   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2713     uint16_t* dst_u16 = S_CAST(uint16_t*, dst);
2714     const uint16_t* src_u16 = S_CAST(const uint16_t*, src);
2715     *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
2716     dst_u16[4] = src_u16[4];
2717   }
2718 };
2719 
2720 template <uint32_t N> struct MemcpyKImpl<N, TRange<(11 <= N) && (N <= 12)> > {
2721   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2722     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2723     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2724     *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
2725     *R_CAST(uint32_t*, &(dst_uc[N - 4])) = *R_CAST(const uint32_t*, &(src_uc[N - 4]));
2726   }
2727 };
2728 
2729 template <uint32_t N> struct MemcpyKImpl<N, TRange<(13 <= N) && (N <= 15)> > {
2730   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2731     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2732     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2733     *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
2734     *R_CAST(uint64_t*, &(dst_uc[N - 8])) = *R_CAST(const uint64_t*, &(src_uc[N - 8]));
2735   }
2736 };
2737 
2738 template <> struct MemcpyKImpl<17> {
2739   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2740     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2741     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2742     const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
2743     _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
2744     dst_uc[16] = src_uc[16];
2745   }
2746 };
2747 
2748 template <> struct MemcpyKImpl<18> {
2749   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2750     uint16_t* dst_u16 = S_CAST(uint16_t*, dst);
2751     const uint16_t* src_u16 = S_CAST(const uint16_t*, src);
2752     const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
2753     _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
2754     dst_u16[8] = src_u16[8];
2755   }
2756 };
2757 
2758 template <uint32_t N> struct MemcpyKImpl<N, TRange<(19 <= N) && (N <= 20)> > {
2759   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2760     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2761     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2762     const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
2763     _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
2764     *R_CAST(uint32_t*, &(dst_uc[N - 4])) = *R_CAST(const uint32_t*, &(src_uc[N - 4]));
2765   }
2766 };
2767 
2768 template <uint32_t N> struct MemcpyKImpl<N, TRange<(21 <= N) && (N <= 24)> > {
2769   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2770     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2771     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2772     const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
2773     _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
2774     *R_CAST(uint64_t*, &(dst_uc[N - 8])) = *R_CAST(const uint64_t*, &(src_uc[N - 8]));
2775   }
2776 };
2777 
2778 template <uint32_t N> struct MemcpyKImpl<N, TRange<(25 <= N) && (N <= 31)> > {
2779   static void MemcpyK(void* __restrict dst, const void* __restrict src) {
2780     unsigned char* dst_uc = S_CAST(unsigned char*, dst);
2781     const unsigned char* src_uc = S_CAST(const unsigned char*, src);
2782     const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, src));
2783     const __m128i v2 = _mm_loadu_si128(R_CAST(const __m128i*, &(src_uc[N - 16])));
2784     _mm_storeu_si128(S_CAST(__m128i*, dst), v1);
2785     _mm_storeu_si128(R_CAST(__m128i*, &(dst_uc[N - 16])), v2);
2786   }
2787 };
2788 
2789 // Note that there's no difference between memcpy() and memcpy_k() for common
2790 // 'well-behaved' sizes like 1, 4, 8, and 16.  It's the awkward sizes in
2791 // between, which often arise with constant strings, that this template
2792 // targets.
2793 #  define memcpy_k(dst, src, ct) plink2::MemcpyKImpl<ct>::MemcpyK(dst, src)
2794 
2795 template <uint32_t N> char* MemcpyaK(void* __restrict dst, const void* __restrict src) {
2796   MemcpyKImpl<N>::MemcpyK(dst, src);
2797   char* dst_c = S_CAST(char*, dst);
2798   return &(dst_c[N]);
2799 }
2800 
2801 #  define memcpya_k(dst, src, ct) plink2::MemcpyaK<ct>(dst, src)
2802 #  define memcpyua_k(dst, src, ct) R_CAST(unsigned char*, plink2::MemcpyaK<ct>(dst, src))
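
// Usage sketch: buf is a hypothetical output buffer; ct must be a
// compile-time constant, and the string literal's terminating null is
// deliberately not copied.
//   char buf[32];
//   char* write_iter = buf;
//   write_iter = memcpya_k(write_iter, "#CHROM\tPOS\t", 11);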
2803 
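// The _o variants below (presumably 'o' for overcopy-ok; that reading is
// inferred from the specializations rather than stated elsewhere) are allowed
// to copy slightly past the nominal end: the 3-, 7-, and 15-byte cases simply
// perform 4-, 8-, and 16-byte copies, while every other size falls back to
// the exact MemcpyKImpl.  Callers must therefore guarantee that one byte past
// both src + ct and dst + ct is safe to read/write.
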
2804 template <uint32_t N> struct MemcpyoKImpl {
2805   static void MemcpyoK(void* __restrict dst, const void* __restrict src) {
2806     MemcpyKImpl<N>::MemcpyK(dst, src);
2807   }
2808 };
2809 
2810 template <> struct MemcpyoKImpl<3> {
2811   static void MemcpyoK(void* __restrict dst, const void* __restrict src) {
2812     *S_CAST(uint32_t*, dst) = *S_CAST(const uint32_t*, src);
2813   }
2814 };
2815 
2816 template <> struct MemcpyoKImpl<7> {
2817   static void MemcpyoK(void* __restrict dst, const void* __restrict src) {
2818     *S_CAST(uint64_t*, dst) = *S_CAST(const uint64_t*, src);
2819   }
2820 };
2821 
2822 template <> struct MemcpyoKImpl<15> {
2823   static void MemcpyoK(void* __restrict dst, const void* __restrict src) {
2824     const __m128i vv = _mm_loadu_si128(S_CAST(const __m128i*, src));
2825     _mm_storeu_si128(S_CAST(__m128i*, dst), vv);
2826   }
2827 };
2828 
2829 // Interestingly, an __m256i copy does not seem to be any better in the 31-byte case.
2830 
2831 #  define memcpyo_k(dst, src, ct) plink2::MemcpyoKImpl<ct>::MemcpyoK(dst, src)
2832 
2833 template <uint32_t N> char* MemcpyaoK(void* __restrict dst, const void* __restrict src) {
2834   MemcpyoKImpl<N>::MemcpyoK(dst, src);
2835   char* dst_c = S_CAST(char*, dst);
2836   return &(dst_c[N]);
2837 }
2838 
2839 #  define memcpyao_k(dst, src, ct) plink2::MemcpyaoK<ct>(dst, src)
2840 #  define memcpyuao_k(dst, src, ct) R_CAST(unsigned char*, plink2::MemcpyaoK<ct>(dst, src))
2841 
2842 #else  // !(defined(__LP64__) && defined(__cplusplus))
2843 
2844 HEADER_INLINE int32_t memequal_k(const void* m1, const void* m2, uintptr_t ct) {
2845   return !memcmp(m1, m2, ct);
2846 }
2847 
2848 HEADER_INLINE void memcpy_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
2849   memcpy(dst, src, ct);
2850 }
2851 
2852 HEADER_INLINE char* memcpya_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
2853   return memcpya(dst, src, ct);
2854 }
2855 
2856 HEADER_INLINE unsigned char* memcpyua_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
2857   return memcpyua(dst, src, ct);
2858 }
2859 
2860 HEADER_INLINE void memcpyo_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
2861   memcpy(dst, src, ct);
2862 }
2863 
2864 HEADER_INLINE char* memcpyao_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
2865   return memcpya(dst, src, ct);
2866 }
2867 
2868 HEADER_INLINE unsigned char* memcpyuao_k(void* __restrict dst, const void* __restrict src, uintptr_t ct) {
2869   return memcpyua(dst, src, ct);
2870 }
2871 
2872 #endif
2873 
2874 #ifdef __LP64__
2875 // This is also better than the June 2018 OS X/LLVM stock implementation,
2876 // especially for small values of ct.
2877 // (gcc 7.1 and clang 6.0.0 should have better stock implementations;
2878 // re-benchmark this once the Linux build machine is upgraded to Ubuntu 18.04.)
2879 int32_t Memcmp(const void* m1, const void* m2, uintptr_t ct);
2880 #else
2881 HEADER_INLINE int32_t Memcmp(const void* m1, const void* m2, uintptr_t ct) {
2882   return memcmp(m1, m2, ct);
2883 }
2884 #endif
2885 
2886 
2887 // now compiling with gcc >= 4.4 (or clang equivalent) on all platforms, so
2888 // safe to use memset everywhere
2889 HEADER_INLINE void ZeroU32Arr(uintptr_t entry_ct, uint32_t* u32arr) {
2890   memset(u32arr, 0, entry_ct * sizeof(int32_t));
2891 }
2892 
2893 HEADER_INLINE void ZeroWArr(uintptr_t entry_ct, uintptr_t* warr) {
2894   memset(warr, 0, entry_ct * sizeof(intptr_t));
2895 }
2896 
2897 HEADER_INLINE void ZeroU64Arr(uintptr_t entry_ct, uint64_t* u64arr) {
2898   memset(u64arr, 0, entry_ct * sizeof(int64_t));
2899 }
2900 
2901 HEADER_INLINE void ZeroPtrArr(uintptr_t entry_ct, void* pp) {
2902   memset(pp, 0, entry_ct * sizeof(intptr_t));
2903 }
2904 
2905 HEADER_INLINE void ZeroHwArr(uintptr_t entry_ct, Halfword* hwarr) {
2906   memset(hwarr, 0, entry_ct * sizeof(Halfword));
2907 }
2908 
2909 HEADER_INLINE void SetAllWArr(uintptr_t entry_ct, uintptr_t* warr) {
2910   // todo: test this against vecset()
2911   for (uintptr_t idx = 0; idx != entry_ct; ++idx) {
2912     warr[idx] = ~k0LU;
2913   }
2914 }
2915 
2916 
2917 // Tried _bzhi_u64() in the AVX2 case; it was actually worse on my Mac (more
2918 // opaque to the compiler?).
2919 // todo: check gcc behavior since it may be different: see
2920 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82298 .
2921 //
2922 // This is undefined if idx == kBitsPerWord.
2923 HEADER_INLINE uintptr_t bzhi(uintptr_t ww, uint32_t idx) {
2924   return ww & ((k1LU << idx) - k1LU);
2925 }
2926 
2927 // This is undefined if idx == 0.
2928 HEADER_INLINE uintptr_t bzhi_max(uintptr_t ww, uint32_t idx) {
2929   return ww & ((~k0LU) >> (kBitsPerWord - idx));
2930 }
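
// Worked example: bzhi(0xff, 3) == bzhi_max(0xff, 3) == 0x7 (low 3 bits
// kept).  The two differ only at the endpoints: bzhi() accepts idx == 0 but
// not idx == kBitsPerWord, while bzhi_max() accepts idx == kBitsPerWord but
// not idx == 0.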
2931 
2932 // Don't bother defining blsr(); the compiler should automatically use the
2933 // instruction under -mbmi, and regular code is more readable.  (Again, should
2934 // verify this is true for gcc.)
2935 
2936 HEADER_INLINE uint32_t BytesToRepresentNzU32(uint32_t uii) {
2937   return 1 + (bsru32(uii) / CHAR_BIT);
2938 }
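
// e.g. BytesToRepresentNzU32(0xff) == 1, BytesToRepresentNzU32(0x100) == 2,
// and BytesToRepresentNzU32(0x12345678) == 4.  (uii must be nonzero, as the
// 'Nz' in the name indicates.)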
2939 
2940 // analogous to memset()
2941 // This can be slightly slower if e.g. the system supports AVX2 but a non-AVX2
2942 // plink2 build is in use; fine to pay that price given the small-array
2943 // advantage for now.  Should revisit this after the next build-machine Ubuntu upgrade, though.
2944 HEADER_INLINE void vecset(void* target_vec, uintptr_t ww, uintptr_t vec_ct) {
2945   VecW* target_vec_iter = S_CAST(VecW*, target_vec);
2946 #ifdef __LP64__
2947   const VecW payload = VCONST_W(ww);
2948   for (uintptr_t vec_idx = 0; vec_idx != vec_ct; ++vec_idx) {
2949     *target_vec_iter++ = payload;
2950   }
2951 #else
2952   for (uintptr_t vec_idx = 0; vec_idx != vec_ct; ++vec_idx) {
2953     *target_vec_iter++ = ww;
2954   }
2955 #endif
2956 }
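
// Usage sketch (bitvec is a hypothetical vector-aligned buffer): to fill
// vec_ct vectors' worth of memory with all-ones words,
//   vecset(bitvec, ~k0LU, vec_ct);
// Note that vec_ct counts VecW-sized vectors, not words or bytes.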
2957 
2958 // todo: make sure these are efficient for small ct
2959 HEADER_INLINE void u16set(void* dst, uint16_t usii, uintptr_t ct) {
2960   uint16_t* dst_u16 = S_CAST(uint16_t*, dst);
2961   for (uintptr_t ulii = 0; ulii != ct; ++ulii) {
2962     dst_u16[ulii] = usii;
2963   }
2964 }
2965 
2966 HEADER_INLINE char* u16setsa(char* dst, uint16_t usii, uintptr_t ct) {
2967   u16set(dst, usii, ct);
2968   return &(dst[ct * 2]);
2969 }
2970 
2971 HEADER_INLINE uintptr_t ClearBottomSetBits(uint32_t ct, uintptr_t ulii) {
2972 #ifdef USE_AVX2
2973   return _pdep_u64((~k0LU) << ct, ulii);
2974 #else
2975   for (uint32_t uii = 0; uii != ct; ++uii) {
2976     ulii &= ulii - 1;
2977   }
2978   return ulii;
2979 #endif
2980 }
2981 
2982 HEADER_INLINE uint32_t WordBitIdxToUidx(uintptr_t ulii, uint32_t bit_idx) {
2983   return ctzw(ClearBottomSetBits(bit_idx, ulii));
2984 }
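
// Worked example: ClearBottomSetBits(2, 0xb4) == 0xa0 (the two lowest set
// bits, at positions 2 and 4, are cleared), and correspondingly
// WordBitIdxToUidx(0xb4, 2) == 5, the position of the set bit with zero-based
// rank 2.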
2985 
2986 CONSTI32(kNybblesPerWord, 2 * kBytesPerWord);
2987 CONSTI32(kNybblesPerCacheline, 2 * kCacheline);
2988 
2989 HEADER_CINLINE uintptr_t NybbleCtToByteCt(uintptr_t val) {
2990   return DivUp(val, 2);
2991 }
2992 
2993 HEADER_CINLINE uintptr_t NybbleCtToWordCt(uintptr_t val) {
2994   return DivUp(val, kNybblesPerWord);
2995 }
2996 
2997 HEADER_INLINE uintptr_t GetNybbleArrEntry(const uintptr_t* nybblearr, uint32_t idx) {
2998   return (nybblearr[idx / kBitsPerWordD4] >> (4 * (idx % kBitsPerWordD4))) & 15;
2999 }
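
// e.g. in a 64-bit build (kBitsPerWordD4 == 16), GetNybbleArrEntry(nybblearr,
// 17) returns (nybblearr[1] >> 4) & 15.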
3000 
3001 // Returns zero when ww has no zero bytes; otherwise, the lowest set bit of
3002 // the result is at position 8x + 7, where x is the index of the first zero byte.
3003 HEADER_INLINE uintptr_t DetectFirstZeroByte(uintptr_t ww) {
3004   return (ww - kMask0101) & (~ww) & (kMask0101 * 0x80);
3005 }
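
// Worked example (64-bit build): DetectFirstZeroByte(0x0807060504000201) ==
// 0x0000000000800000; byte 2 is the first (and here, only) zero byte, so the
// lowest set bit of the result is at position 2 * 8 + 7 == 23.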
3006 
3007 // From TAOCP 4a, 7.1.3, (91).
3008 // Bit 8x + 7 is set iff byte x is zero.  All other bits are always
3009 // zero.
3010 HEADER_INLINE uintptr_t DetectAllZeroBytes(uintptr_t ww) {
3011   return (kMask0101 * 0x80) & (~(ww | ((ww | (kMask0101 * 0x80)) - kMask0101)));
3012 }
3013 
3014 HEADER_INLINE uintptr_t DetectAllZeroNybbles(uintptr_t ww) {
3015   return (kMask1111 * 8) & (~(ww | ((ww | (kMask1111 * 8)) - kMask1111)));
3016 }
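
// Worked example (64-bit build): DetectAllZeroBytes(0x00000000000000ff) ==
// 0x8080808080808000, since bytes 1..7 are zero; DetectAllZeroNybbles() is
// the 4-bit analogue, setting bit 4x + 3 for each zero nybble x.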
3017 
3018 // Flagset conventions:
3019 // * Each 32-bit and 64-bit flagset has its own type, which is guaranteed to be
3020 //   the appropriate width.  (Todo: verify that bit 31 works properly in 32-bit
3021 //   case.)
3022 // * Constant flag names start with "kf[CamelCase description]", followed by a
3023 //   description that shouldn't suck too badly.  The zero flagset is always
3024 //   named kf[CamelCase description]0.
3025 // * The type name is always of the form [snake_case description]_flags_t.
3026 // * To gain the desired level of type-checking under C++11 without pointless
3027 //   verbosity, &, |, ^, ~, &=, |=, and ^= operations are defined; [my_flags_t
3028 //   variable] |= [another my_flags_t variable] & [a my_flags_t constant] works
3029 //   without an explicit cast.  (Defining "struct my_flags_t" separately from
3030 //   the enum global-scope-constants container is necessary to make |= work
3031 //   without a cast.  inline is needed due to duplicate operator definitions
3032 //   across multiple files.)
3033 // * To slightly reduce the chance of breakage under C99/C++03, the enum is
3034 //   nameless; the flagset type is just a uint32_t/uint64_t alias.  This is
3035 //   because the C99 and C++03 specs do not provide enough control over the
3036 //   enum base type to make it safe for the enum to serve as the flagset type.
3037 // * Implicit conversion to int is not prevented for now, since I'm trying to
3038 //   keep PglErr-style code duplication to a minimum.
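//
// Illustrative definition sketch, following the conventions above
// (hypothetical flag and type names):
//   FLAGSET_DEF_START()
//     kfExample0,
//     kfExampleFirst = (1 << 0),
//     kfExampleSecond = (1 << 1)
//   FLAGSET_DEF_END(example_flags_t);
//
//   example_flags_t flags = kfExample0;
//   flags |= kfExampleFirst;
//   if (flags & kfExampleFirst) {
//     // ...
//   }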
3039 #if __cplusplus >= 201103L
3040 
3041   // could avoid the typedef here, but that leads to a bit more verbosity.
3042 #  define FLAGSET_DEF_START() typedef enum : uint32_t {
3043 #  define FLAGSET_DEF_END(tname) } tname ## _PLINK2_BASE_DO_NOT_USE__ ; \
3044   \
3045 inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator|(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
3046   return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint32_t>(aa) | static_cast<uint32_t>(bb)); \
3047 } \
3048   \
3049 inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator&(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
3050   return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint32_t>(aa) & static_cast<uint32_t>(bb)); \
3051 } \
3052   \
3053 inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator^(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
3054   return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint32_t>(aa) ^ static_cast<uint32_t>(bb)); \
3055 } \
3056   \
3057 inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator~(tname ## _PLINK2_BASE_DO_NOT_USE__ aa) { \
3058   return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(~static_cast<uint32_t>(aa)); \
3059 } \
3060   \
3061 struct tname { \
3062   tname() {} \
3063   \
3064   tname(const tname& source) : value_(source.value_) {} \
3065   \
3066   tname(const tname ## _PLINK2_BASE_DO_NOT_USE__ source) : value_(static_cast<uint32_t>(source)) {} \
3067   \
3068   explicit tname(uint32_t source) : value_(source) {} \
3069   \
3070   operator tname ## _PLINK2_BASE_DO_NOT_USE__() const { \
3071     return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(value_); \
3072   } \
3073   \
3074   tname& operator|=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
3075     value_ |= rhs; \
3076     return *this; \
3077   } \
3078   \
3079   tname& operator&=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
3080     value_ &= rhs; \
3081     return *this; \
3082   } \
3083   \
3084   tname& operator^=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
3085     value_ ^= rhs; \
3086     return *this; \
3087   } \
3088   \
3089   tname& operator=(const tname& rhs) = default; \
3090   \
3091 private: \
3092   uint32_t value_; \
3093 }
3094 
3095 #  define FLAGSET64_DEF_START() typedef enum : uint64_t {
3096 #  define FLAGSET64_DEF_END(tname) } tname ## _PLINK2_BASE_DO_NOT_USE__ ; \
3097   \
3098 inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator|(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
3099   return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint64_t>(aa) | static_cast<uint64_t>(bb)); \
3100 } \
3101   \
3102 inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator&(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
3103   return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint64_t>(aa) & static_cast<uint64_t>(bb)); \
3104 } \
3105   \
3106 inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator^(tname ## _PLINK2_BASE_DO_NOT_USE__ aa, tname ## _PLINK2_BASE_DO_NOT_USE__ bb) { \
3107   return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(static_cast<uint64_t>(aa) ^ static_cast<uint64_t>(bb)); \
3108 } \
3109   \
3110 inline tname ## _PLINK2_BASE_DO_NOT_USE__ operator~(tname ## _PLINK2_BASE_DO_NOT_USE__ aa) { \
3111   return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(~static_cast<uint64_t>(aa)); \
3112 } \
3113   \
3114 struct tname { \
3115   tname() {} \
3116   \
3117   tname(const tname& source) : value_(source.value_) {} \
3118   \
3119   tname(const tname ## _PLINK2_BASE_DO_NOT_USE__ source) : value_(static_cast<uint64_t>(source)) {} \
3120   \
3121   explicit tname(uint64_t source) : value_(source) {} \
3122   \
3123   operator tname ## _PLINK2_BASE_DO_NOT_USE__() const { \
3124     return static_cast<tname ## _PLINK2_BASE_DO_NOT_USE__>(value_); \
3125   } \
3126   \
3127   tname& operator|=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
3128     value_ |= rhs; \
3129     return *this; \
3130   } \
3131   \
3132   tname& operator&=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
3133     value_ &= rhs; \
3134     return *this; \
3135   } \
3136   \
3137   tname& operator^=(const tname ## _PLINK2_BASE_DO_NOT_USE__ rhs) { \
3138     value_ ^= rhs; \
3139     return *this; \
3140   } \
3141   \
3142   tname& operator=(const tname& rhs) = default; \
3143   \
3144 private: \
3145   uint64_t value_; \
3146 }
3147 
3148 #  define ENUM_U31_DEF_START() typedef enum : uint32_t {
3149 #  define ENUM_U31_DEF_END(tname) } tname
3150 
3151 #else  // !__cplusplus >= 201103L
3152 
3153 #  define FLAGSET_DEF_START() enum {
3154 #  define FLAGSET_DEF_END(tname) } ; \
3155 typedef uint32_t tname
3156 
3157   // Don't use a nameless enum here, since we want to be able to static_assert
3158   // the enum size.
3159   // It's best to artificially add an element to the end for now to force the
3160   // width to 64 bits; otherwise gcc actually shrinks it even when the
3161   // constants are defined with LLU.
3162 #  define FLAGSET64_DEF_START() typedef enum {
3163 #  define FLAGSET64_DEF_END(tname) , \
3164   tname ## PLINK2_BASE_DO_NOT_USE__ALL_64_SET__ = ~(0LLU) } tname ## _PLINK2_BASE_DO_NOT_USE__ ; \
3165 static_assert(sizeof(tname ## _PLINK2_BASE_DO_NOT_USE__) == 8, "64-bit flagset constants are not actually uint64_ts."); \
3166 typedef uint64_t tname
3167 
3168 #  define ENUM_U31_DEF_START() typedef enum {
3169 #  define ENUM_U31_DEF_END(tname) } tname ## _PLINK2_BASE_DO_NOT_USE__ ; \
3170 typedef uint32_t tname
3171 
3172 #endif
3173 
3174 // This supports private struct members in code that still compiles as C.
3175 //
3176 // Internal code should access these members with GET_PRIVATE(), and define a
3177 // pair of public C++-only GET_PRIVATE_...() member functions (one const and
3178 // one non-const) that each return a reference to the member; see plink2_thread
3179 // for examples.  In addition, .cc API code should define a small number of
3180 // standard GET_PRIVATE() accessors at the top of the file, and practically all
3181 // private-member access should occur through those file-scope accessors; this
3182 // keeps the surface area under control.
3183 //
3184 // (Tried to define a DECLARE_PRIVATE(typ, member) macro as well, but didn't
3185 // get that to work.  This is already pretty painless if it's restricted to key
3186 // APIs, though.)
3187 //
3188 // probable todo: see if the intended effect can be achieved in a simpler
3189 // manner with well-chosen explicit and implicit type conversions.
3190 #ifdef __cplusplus
3191 #  define GET_PRIVATE(par, member) (par).GET_PRIVATE_ ## member()
3192 #else
3193 #  define GET_PRIVATE(par, member) (par).member
3194 #endif
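
// Hypothetical illustration (invented struct/member names; see plink2_thread
// for real usage):
//   struct ExampleBox {
//   #ifdef __cplusplus
//     uint32_t& GET_PRIVATE_payload() { return payload; }
//     const uint32_t& GET_PRIVATE_payload() const { return payload; }
//    private:
//   #endif
//     uint32_t payload;
//   };
//   ...
//   struct ExampleBox box;
//   GET_PRIVATE(box, payload) = 42;  // compiles as C and as C++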
3195 
3196 #ifdef __cplusplus
3197 }  // namespace plink2
3198 #endif
3199 
3200 #endif  // __PLINK2_BASE_H__
3201