#ifndef __PLINK2_CMDLINE_H__
#define __PLINK2_CMDLINE_H__

// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation, either version 3 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.


// Cross-platform logging, command-line parsing, workspace
// initialization/allocation, basic multithreading code, and a few numeric
// constants.

#include "include/plink2_bits.h"
#include "include/plink2_string.h"
#include "include/plink2_thread.h"

#include <errno.h>
#include <stdarg.h>

#ifndef _WIN32
#  include <sys/stat.h>
#endif

#ifdef DYNAMIC_MKL
#  define USE_MKL
#endif

#ifdef USE_MKL
#  ifdef __APPLE__
#    error "plink2 cannot currently use MKL on OS X."
#  endif
#  ifdef LAPACK_ILP64
#    define MKL_ILP64
#  endif
#  ifdef DYNAMIC_MKL
#    include <mkl_service.h>
#  else
// If this isn't initially found, use the compiler's -I option to specify the
// appropriate include-file directory.  A common location is
// /opt/intel/mkl/include .
// If MKL isn't installed on your system at all and you want/need to change
// that, see the instructions at
//   https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo
#    include "mkl_service.h"
#  endif
#  define USE_MTBLAS
// This technically doesn't have to be a macro, but it's surrounded by other
// things which do have to be macros, so changing it to a namespaced function
// arguably *decreases* overall readability...
#  define BLAS_SET_NUM_THREADS mkl_set_num_threads
#else
#  ifdef USE_OPENBLAS
#    ifdef __cplusplus
extern "C" {
#    endif
      void openblas_set_num_threads(int num_threads);
#    ifdef __cplusplus
}  // extern "C"
#    endif
#    define USE_MTBLAS
#    define BLAS_SET_NUM_THREADS openblas_set_num_threads
#  else
#    define BLAS_SET_NUM_THREADS(num)
#  endif
#endif
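
// Usage sketch (hypothetical caller): clamp BLAS-internal threading while
// plink2's own worker threads are running, then restore it.
//   BLAS_SET_NUM_THREADS(1);
//   // ... launch compute threads that call BLAS/LAPACK routines ...
//   BLAS_SET_NUM_THREADS(max_thread_ct);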

#ifdef _WIN32
#  define NULL_STREAM_NAME "nul"
#else
#  define NULL_STREAM_NAME "/dev/null"
#endif

#ifdef __cplusplus
namespace plink2 {
#endif

CONSTI32(kLogbufSize, 2 * kMaxMediumLine);

// must be at least 2 * kMaxMediumLine + 2 to support the generic token
// loader.
CONSTI32(kTextbufSize, 2 * kMaxMediumLine + 256);

// when g_textbuf is used as a generic I/O buffer, this is a convenient
// power-of-2 size (must be <= kTextbufSize).
CONSTI32(kTextbufMainSize, 2 * kMaxMediumLine);

// "slen" is now used to indicate string lengths excluding terminating nulls,
// while "blen" includes the terminator.
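// For example, "rs123" has slen 5 and blen 6.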

// Maximum length of chromosome, variant, FID, IID, cluster, and set IDs (not
// including terminating null).  This value supports up to 8 IDs per line
// (maximum so far is 5, for e.g. --hom).
// Assumed by plink2_pvar to be a multiple of 16.
CONSTI32(kMaxIdSlen, 16000);
CONSTI32(kMaxIdBlen, kMaxIdSlen + 1);
// Don't see a better option than #define for this.
#define MAX_ID_SLEN_STR "16000"

// allow extensions like .model.trend.fisher.set.score.adjusted
CONSTI32(kMaxOutfnameExtBlen, 39);

#ifdef __LP64__
HEADER_CINLINE uint64_t RoundUpPow2U64(uint64_t val, uint64_t alignment) {
  return RoundUpPow2(val, alignment);
}
#else
HEADER_CINLINE uint64_t RoundUpPow2U64(uint64_t val, uint64_t alignment) {
  return (val + alignment - 1) & (~(alignment - 1));
}
#endif
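// e.g. RoundUpPow2U64(67, 64) == 128; alignment must be a power of 2.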

extern const char kErrprintfFopen[];
extern const char kErrprintfFread[];
extern const char kErrprintfRewind[];
// extern const char g_cmdline_format_str[];

// All global variables not initialized at compile time start with g_ (even if
// they're initialized very early and never changed afterwards, like
// g_one_char_strs).
extern char g_textbuf[];

extern const char* g_one_char_strs;

// The '.' missing genotype value is now taken for granted; this is an
// *additional* input missing-genotype code (default '0').
// (Yes, this might not belong in plink2_cmdline.)
extern const char* g_input_missing_geno_ptr;

extern const char* g_output_missing_geno_ptr;  // now defaults to '.'

extern FILE* g_logfile;

// Mostly-safe log buffer (length kLogbufSize, currently 256k).  Good practice
// to use snprintf when writing an entire line to it in a single statement.
// Warning: Do NOT put allele codes or arbitrary-length lists in here.
extern char g_logbuf[];

extern uint32_t g_debug_on;
extern uint32_t g_log_failed;

// for --warning-errcode
extern uint32_t g_stderr_written_to;


// Warning: Do NOT include allele codes (unless they're guaranteed to be SNPs)
// in log strings; they can overflow the buffer.
void logputs_silent(const char* str);

void logputs(const char* str);

void logerrputs(const char* str);

void logputsb();

void logerrputsb();

HEADER_INLINE void logprintf(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vsnprintf(g_logbuf, kLogbufSize, fmt, args);
  va_end(args);
  logputsb();
}

HEADER_INLINE void logerrprintf(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vsnprintf(g_logbuf, kLogbufSize, fmt, args);
  va_end(args);
  logerrputsb();
}

// input for WordWrapB/logprintfww should have no intermediate '\n's.  If
// suffix_len is 0, there should be a terminating \n.
HEADER_INLINE void WordWrapB(uint32_t suffix_len) {
  WordWrap(suffix_len, g_logbuf);
}

HEADER_INLINE void logpreprintfww(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vsnprintf(g_logbuf, kLogbufSize, fmt, args);
  va_end(args);
  WordWrapB(0);
}

HEADER_INLINE void logprintfww(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vsnprintf(g_logbuf, kLogbufSize, fmt, args);
  va_end(args);
  WordWrapB(0);
  logputsb();
}

HEADER_INLINE void logerrprintfww(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vsnprintf(g_logbuf, kLogbufSize, fmt, args);
  va_end(args);
  WordWrapB(0);
  logerrputsb();
}

// 5 = length of "done." suffix, which is commonly used
HEADER_INLINE void logprintfww5(const char* fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vsnprintf(g_logbuf, kLogbufSize, fmt, args);
  va_end(args);
  WordWrapB(5);
  logputsb();
}
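
// Usage sketch (identifiers hypothetical): logprintfww5 leaves room to append
// "done." after the wrapped line.
//   logprintfww5("Writing %s ... ", outname);
//   fflush(stdout);
//   // ... do the work ...
//   logputs("done.\n");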

HEADER_INLINE void DebugPrintf(const char* fmt, ...) {
  if (g_debug_on) {
    va_list args;
    va_start(args, fmt);
    vsnprintf(g_logbuf, kLogbufSize, fmt, args);
    va_end(args);
    logputsb();
  }
}

HEADER_INLINE void DPrintf(const char* fmt, ...) {
  if (g_debug_on) {
    va_list args;
    va_start(args, fmt);
    vsnprintf(g_logbuf, kLogbufSize, fmt, args);
    va_end(args);
    logputsb();
  }
}

// Returns kPglRetOpenFail if the file doesn't exist, or kPglRetRewindFail if
// it's a process substitution or named pipe.  Does not print an error
// message.
PglErr ForceNonFifo(const char* fname);

BoolErr fopen_checked(const char* fname, const char* mode, FILE** target_ptr);

HEADER_INLINE IntErr putc_checked(int32_t ii, FILE* outfile) {
  putc_unlocked(ii, outfile);
  return ferror_unlocked(outfile);
}

HEADER_INLINE IntErr fputs_checked(const char* str, FILE* outfile) {
  fputs(str, outfile);
  return ferror_unlocked(outfile);
}

BoolErr fwrite_flush2(char* buf_flush, FILE* outfile, char** write_iter_ptr);

HEADER_INLINE BoolErr fwrite_uflush2(unsigned char* buf_flush, FILE* outfile, unsigned char** write_iter_ptr) {
  return fwrite_flush2(R_CAST(char*, buf_flush), outfile, R_CAST(char**, write_iter_ptr));
}

HEADER_INLINE BoolErr fwrite_ck(char* buf_flush, FILE* outfile, char** write_iter_ptr) {
  if ((*write_iter_ptr) < buf_flush) {
    return 0;
  }
  return fwrite_flush2(buf_flush, outfile, write_iter_ptr);
}
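
// Typical buffered-write pattern (sketch; identifiers hypothetical, and
// u32toa_x is assumed from plink2_string):
//   char* write_iter = g_textbuf;
//   char* textbuf_flush = &(g_textbuf[kTextbufMainSize]);
//   for (uint32_t uii = 0; uii != val_ct; ++uii) {
//     write_iter = u32toa_x(vals[uii], '\n', write_iter);
//     if (unlikely(fwrite_ck(textbuf_flush, outfile, &write_iter))) {
//       goto WriteVals_ret_WRITE_FAIL;
//     }
//   }
//   if (unlikely(fclose_flush_null(textbuf_flush, write_iter, &outfile))) {
//     goto WriteVals_ret_WRITE_FAIL;
//   }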

// fclose_null defined in plink2_base.h

BoolErr fclose_flush_null(char* buf_flush, char* write_iter, FILE** outfile_ptr);

HEADER_INLINE BoolErr fclose_uflush_null(unsigned char* buf_flush, unsigned char* write_iter, FILE** outfile_ptr) {
  return fclose_flush_null(R_CAST(char*, buf_flush), R_CAST(char*, write_iter), outfile_ptr);
}

// This should only be used on error-early-exit paths, where the file may or
// may not still be open; otherwise we care about fclose's error code.
HEADER_INLINE void fclose_cond(FILE* fptr) {
  if (fptr) {
    fclose(fptr);
  }
}

HEADER_INLINE uint32_t ClipU32(uint32_t val, uint32_t lbound, uint32_t ubound) {
  if (val >= ubound) {
    return ubound;
  }
  return MAXV(val, lbound);
}

int32_t double_cmp(const void* aa, const void* bb);

int32_t double_cmp_decr(const void* aa, const void* bb);

int32_t u64cmp(const void* aa, const void* bb);

#ifndef __cplusplus
int32_t u64cmp_decr(const void* aa, const void* bb);
#endif

HEADER_INLINE uint32_t U32ArrMax(const uint32_t* unsorted_arr, uintptr_t len) {
  const uint32_t* unsorted_arr_end = &(unsorted_arr[len]);
#ifndef __cplusplus
  const uint32_t* unsorted_arr_iter = unsorted_arr;
  uint32_t uimax = *unsorted_arr_iter++;
  while (unsorted_arr_iter < unsorted_arr_end) {
    const uint32_t cur_val = *unsorted_arr_iter++;
    if (cur_val > uimax) {
      uimax = cur_val;
    }
  }
  return uimax;
#else
  return *std::max_element(unsorted_arr, unsorted_arr_end);
#endif
}

HEADER_INLINE float FArrMax(const float* unsorted_arr, uintptr_t len) {
  const float* unsorted_arr_end = &(unsorted_arr[len]);
#if defined(__APPLE__) || !defined(__cplusplus)
  // std::max_element doesn't seem to be performant for floats/doubles on OS X
  const float* unsorted_arr_iter = unsorted_arr;
  float fmax = *unsorted_arr_iter++;
  while (unsorted_arr_iter < unsorted_arr_end) {
    const float cur_val = *unsorted_arr_iter++;
    if (cur_val > fmax) {
      fmax = cur_val;
    }
  }
  return fmax;
#else
  return *std::max_element(unsorted_arr, unsorted_arr_end);
#endif
}

HEADER_INLINE double DArrMax(const double* unsorted_arr, uintptr_t len) {
  const double* unsorted_arr_end = &(unsorted_arr[len]);
#if defined(__APPLE__) || !defined(__cplusplus)
  const double* unsorted_arr_iter = unsorted_arr;
  double dmax = *unsorted_arr_iter++;
  while (unsorted_arr_iter < unsorted_arr_end) {
    const double cur_val = *unsorted_arr_iter++;
    if (cur_val > dmax) {
      dmax = cur_val;
    }
  }
  return dmax;
#else
  return *std::max_element(unsorted_arr, unsorted_arr_end);
#endif
}

float DestructiveMedianF(uintptr_t len, float* unsorted_arr);

double DestructiveMedianD(uintptr_t len, double* unsorted_arr);


// This makes a temporary g_bigstack allocation.
// Must be safe to read up to (kBytesPerWord - 1) bytes past end of strbox.
// Results may technically vary between runs when duplicate elements are
// present; it's assumed that this doesn't matter because all duplicates will
// be handled in the same manner.
BoolErr SortStrboxIndexed(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map);


// This makes a temporary g_bigstack allocation.
// BoolErr strptr_arr_indexed_sort(const char* const* unsorted_strptrs, uint32_t str_ct, uint32_t use_nsort, uint32_t* id_map);

// Offset of std::lower_bound.
// Requires 0 < arr_length < 2^31.
uint32_t CountSortedSmallerU32(const uint32_t* sorted_u32_arr, uint32_t arr_length, uint32_t needle);

// Same as above, but optimized for case where result is probably close to the
// front, and tolerates end_idx (== arr_length) == 0.
uint32_t ExpsearchU32(const uint32_t* sorted_u32_arr, uint32_t end_idx, uint32_t needle);

uintptr_t CountSortedSmallerU64(const uint64_t* sorted_u64_arr, uintptr_t arr_length, uint64_t needle);

uintptr_t CountSortedSmallerD(const double* sorted_dbl_arr, uintptr_t arr_length, double needle);


uintptr_t CountSortedLeqU64(const uint64_t* sorted_u64_arr, uintptr_t arr_length, uint64_t needle);


HEADER_INLINE uint32_t IsCmdlineFlag(const char* param) {
  unsigned char ucc = param[1];
  return ((*param == '-') && ((ucc > '9') || ((ucc < '0') && (ucc != '.') && (ucc != '\0'))));
}

HEADER_INLINE CXXCONST_CP IsCmdlineFlagStart(const char* param) {
  unsigned char ucc = param[1];
  if ((*param == '-') && ((ucc > '9') || ((ucc < '0') && (ucc != '.') && (ucc != '\0')))) {
    return S_CAST(CXXCONST_CP, &(param[1 + (ucc == '-')]));
  }
  return nullptr;
}
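
// e.g. IsCmdlineFlagStart("--bfile") and IsCmdlineFlagStart("-bfile") both
// return a pointer to "bfile", while "-9" and "-.5" are parsed as negative
// numbers rather than flags, yielding nullptr.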

#ifdef __cplusplus
HEADER_INLINE char* IsCmdlineFlagStart(char* param) {
  return const_cast<char*>(IsCmdlineFlagStart(const_cast<const char*>(param)));
}
#endif

uint32_t GetParamCt(const char* const* argvk, uint32_t argc, uint32_t flag_idx);

BoolErr EnforceParamCtRange(const char* flag_name, uint32_t param_ct, uint32_t min_ct, uint32_t max_ct);

// PglErr SortCmdlineFlags(uint32_t max_flag_blen, uint32_t flag_ct, char* flag_buf, uint32_t* flag_map);

BoolErr CleanupLogfile(uint32_t print_end_time);

CONSTI32(kNonBigstackMin, 67108864);

CONSTI32(kBigstackMinMib, 640);
CONSTI32(kBigstackDefaultMib, 2048);

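// The next three constants are exact binary values: kSmallishEpsilon == 2^-35,
// kRecip2m32 == 2^-32, k2m64 == 2^64.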
static const double kSmallishEpsilon = 0.00000000002910383045673370361328125;
static const double kRecip2m32 = 0.00000000023283064365386962890625;
static const double k2m64 = 18446744073709551616.0;

static const double kLnPvalError = 9.0;

static const double kDblNormalMin = 2.2250738585072013e-308;

// probably time to flip arena_alloc and bigstack_alloc definitions...

// manually managed, very large double-ended stack
extern unsigned char* g_bigstack_base;
extern unsigned char* g_bigstack_end;

uintptr_t DetectMib();

uintptr_t GetDefaultAllocMib();

// caller is responsible for freeing bigstack_ua
PglErr InitBigstack(uintptr_t malloc_size_mib, uintptr_t* malloc_mib_final_ptr, unsigned char** bigstack_ua_ptr);


HEADER_INLINE uintptr_t bigstack_left() {
  return g_bigstack_end - g_bigstack_base;
}

HEADER_INLINE void* bigstack_alloc_raw(uintptr_t size) {
  // Assumes caller has already forced size to a multiple of
  // kCacheline, and verified that enough space is available.
  assert(!(size % kCacheline));
  unsigned char* alloc_ptr = g_bigstack_base;
  g_bigstack_base += size;
  return alloc_ptr;
}

HEADER_INLINE void* bigstack_alloc_raw_rd(uintptr_t size) {
  // Same as bigstack_alloc_raw(), except for rounding up size.
  unsigned char* alloc_ptr = g_bigstack_base;
  g_bigstack_base += RoundUpPow2(size, kCacheline);
  return alloc_ptr;
}

// Basic 64-byte-aligned allocation at bottom of stack.
// Note that --make-pgen switches gracefully to a less memory-hungry algorithm
// when it encounters an allocation failure with its default algorithm.  Since
// it can only happen once, unlikely() is still justified, but keep an eye on
// this.
HEADER_INLINE void* bigstack_alloc(uintptr_t size) {
  size = RoundUpPow2(size, kCacheline);
  if (unlikely(bigstack_left() < size)) {
    g_failed_alloc_attempt_size = size;
    return nullptr;
  }
  return bigstack_alloc_raw(size);
}


// Typesafe, return-0-iff-success interfaces.  (See also bigstack_calloc_...
// further below.)
// This interface is deliberately not provided for bigstack_alloc_raw() and
// bigstack_alloc_raw_rd() (and I wouldn't blame a future maintainer for
// entirely eliminating those functions).
HEADER_INLINE BoolErr bigstack_alloc_c(uintptr_t ct, char** c_arr_ptr) {
  *c_arr_ptr = S_CAST(char*, bigstack_alloc(ct));
  return !(*c_arr_ptr);
}
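
// Usage sketch (identifiers hypothetical):
//   double* dosage_sums;
//   if (unlikely(bigstack_alloc_d(sample_ct, &dosage_sums))) {
//     goto CalcStats_ret_NOMEM;
//   }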

HEADER_INLINE BoolErr bigstack_alloc_d(uintptr_t ct, double** d_arr_ptr) {
  *d_arr_ptr = S_CAST(double*, bigstack_alloc(ct * sizeof(double)));
  return !(*d_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_f(uintptr_t ct, float** f_arr_ptr) {
  *f_arr_ptr = S_CAST(float*, bigstack_alloc(ct * sizeof(float)));
  return !(*f_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_i16(uintptr_t ct, int16_t** i16_arr_ptr) {
  *i16_arr_ptr = S_CAST(int16_t*, bigstack_alloc(ct * sizeof(int16_t)));
  return !(*i16_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_i32(uintptr_t ct, int32_t** i32_arr_ptr) {
  *i32_arr_ptr = S_CAST(int32_t*, bigstack_alloc(ct * sizeof(int32_t)));
  return !(*i32_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
  *uc_arr_ptr = S_CAST(unsigned char*, bigstack_alloc(ct));
  return !(*uc_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_u16(uintptr_t ct, uint16_t** u16_arr_ptr) {
  *u16_arr_ptr = S_CAST(uint16_t*, bigstack_alloc(ct * sizeof(int16_t)));
  return !(*u16_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr) {
  *u32_arr_ptr = S_CAST(uint32_t*, bigstack_alloc(ct * sizeof(int32_t)));
  return !(*u32_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_w(uintptr_t ct, uintptr_t** w_arr_ptr) {
  *w_arr_ptr = S_CAST(uintptr_t*, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*w_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_i64(uintptr_t ct, int64_t** i64_arr_ptr) {
  *i64_arr_ptr = S_CAST(int64_t*, bigstack_alloc(ct * sizeof(int64_t)));
  return !(*i64_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr) {
  *u64_arr_ptr = S_CAST(uint64_t*, bigstack_alloc(ct * sizeof(int64_t)));
  return !(*u64_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_v(uintptr_t ct, VecW** v_arr_ptr) {
  *v_arr_ptr = S_CAST(VecW*, bigstack_alloc(ct * sizeof(VecW)));
  return !(*v_arr_ptr);
}

// some versions of gcc give aliasing warnings if we use bigstack_alloc_w()
// for everything
// if sizeof(intptr_t) != sizeof(uintptr_t*), we're doomed anyway, so I won't
// bother with that static assert...
HEADER_INLINE BoolErr bigstack_alloc_wp(uintptr_t ct, uintptr_t*** ulp_arr_ptr) {
  *ulp_arr_ptr = S_CAST(uintptr_t**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*ulp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
  *cp_arr_ptr = S_CAST(char**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*cp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr) {
  *kcp_arr_ptr = S_CAST(const char**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*kcp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_i16p(uintptr_t ct, int16_t*** i16p_arr_ptr) {
  *i16p_arr_ptr = S_CAST(int16_t**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*i16p_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_i32p(uintptr_t ct, int32_t*** i32p_arr_ptr) {
  *i32p_arr_ptr = S_CAST(int32_t**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*i32p_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_ucp(uintptr_t ct, unsigned char*** ucp_arr_ptr) {
  *ucp_arr_ptr = S_CAST(unsigned char**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*ucp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_u16p(uintptr_t ct, uint16_t*** u16p_arr_ptr) {
  *u16p_arr_ptr = S_CAST(uint16_t**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*u16p_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_u32p(uintptr_t ct, uint32_t*** u32p_arr_ptr) {
  *u32p_arr_ptr = S_CAST(uint32_t**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*u32p_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_u64p(uintptr_t ct, uint64_t*** u64p_arr_ptr) {
  *u64p_arr_ptr = S_CAST(uint64_t**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*u64p_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_dp(uintptr_t ct, double*** dp_arr_ptr) {
  *dp_arr_ptr = S_CAST(double**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*dp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_vp(uintptr_t ct, VecW*** vp_arr_ptr) {
  *vp_arr_ptr = S_CAST(VecW**, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*vp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_wpp(uintptr_t ct, uintptr_t**** wpp_arr_ptr) {
  *wpp_arr_ptr = S_CAST(uintptr_t***, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*wpp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_u32pp(uintptr_t ct, uint32_t**** u32pp_arr_ptr) {
  *u32pp_arr_ptr = S_CAST(uint32_t***, bigstack_alloc(ct * sizeof(intptr_t)));
  return !(*u32pp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_alloc_vpp(uintptr_t ct, VecW**** vpp_arr_ptr) {
  *vpp_arr_ptr = S_CAST(VecW***, bigstack_alloc(ct * sizeof(VecW)));
  return !(*vpp_arr_ptr);
}

BoolErr bigstack_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr);

BoolErr bigstack_calloc_d(uintptr_t ct, double** d_arr_ptr);

BoolErr bigstack_calloc_f(uintptr_t ct, float** f_arr_ptr);

BoolErr bigstack_calloc_u16(uintptr_t ct, uint16_t** u16_arr_ptr);

BoolErr bigstack_calloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr);

BoolErr bigstack_calloc_w(uintptr_t ct, uintptr_t** w_arr_ptr);

BoolErr bigstack_calloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr);

BoolErr bigstack_calloc_cp(uintptr_t ct, char*** cp_arr_ptr);

BoolErr bigstack_calloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr);

HEADER_INLINE BoolErr bigstack_calloc_c(uintptr_t ct, char** c_arr_ptr) {
  return bigstack_calloc_uc(ct, R_CAST(unsigned char**, c_arr_ptr));
}

HEADER_INLINE BoolErr bigstack_calloc_i16(uintptr_t ct, int16_t** i16_arr_ptr) {
  return bigstack_calloc_u16(ct, R_CAST(uint16_t**, i16_arr_ptr));
}

HEADER_INLINE BoolErr bigstack_calloc_i32(uintptr_t ct, int32_t** i32_arr_ptr) {
  return bigstack_calloc_u32(ct, R_CAST(uint32_t**, i32_arr_ptr));
}

HEADER_INLINE BoolErr bigstack_calloc_i64(uintptr_t ct, int64_t** i64_arr_ptr) {
  return bigstack_calloc_u64(ct, R_CAST(uint64_t**, i64_arr_ptr));
}

#if __cplusplus >= 201103L

template <class T> BoolErr BigstackAllocX(uintptr_t ct, T** x_arr_ptr) {
  *x_arr_ptr = S_CAST(T*, bigstack_alloc(ct * sizeof(T)));
  return !(*x_arr_ptr);
}

// todo: define all BigstackAlloc functions in terms of ArenaAlloc; then these
// can be namespaced, and we only need ARENA_ALLOC_X and ARENA_ALLOC_STD_ARRAY
// macros
#  define BIGSTACK_ALLOC_X(tt, ct, pp) plink2::BigstackAllocX<tt>((ct), (pp))

#  define BIGSTACK_ALLOC_STD_ARRAY(tt, arr_size, len, pp) plink2::BigstackAllocX<std::array<tt, arr_size>>((len), (pp))

#else

#  define BIGSTACK_ALLOC_X(tt, ct, pp) (!((*(pp)) = S_CAST(tt*, bigstack_alloc((ct) * sizeof(tt)))))

#  define BIGSTACK_ALLOC_STD_ARRAY(tt, arr_size, len, pp) (!((*(pp)) = S_CAST(STD_ARRAY_PTR_TYPE(tt, arr_size), bigstack_alloc((len) * (arr_size * sizeof(tt))))))

#endif
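
// Usage sketch (MyStruct and entry_ct hypothetical):
//   MyStruct* entries;
//   if (unlikely(BIGSTACK_ALLOC_X(MyStruct, entry_ct, &entries))) {
//     goto LoadEntries_ret_NOMEM;
//   }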

HEADER_INLINE void BigstackReset(void* new_base) {
  g_bigstack_base = S_CAST(unsigned char*, new_base);
}

HEADER_INLINE void BigstackEndReset(void* new_end) {
  g_bigstack_end = S_CAST(unsigned char*, new_end);
}

HEADER_INLINE void BigstackDoubleReset(void* new_base, void* new_end) {
  BigstackReset(new_base);
  BigstackEndReset(new_end);
}
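
// Typical mark/reset pattern (sketch):
//   unsigned char* bigstack_mark = g_bigstack_base;
//   unsigned char* bigstack_end_mark = g_bigstack_end;
//   // ... temporary base- and end-allocations ...
//   BigstackDoubleReset(bigstack_mark, bigstack_end_mark);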

// assumes we've already been writing to wptr and have previously performed
// bounds-checking.
HEADER_INLINE void BigstackFinalizeW(__maybe_unused const uintptr_t* wptr, uintptr_t ct) {
  assert(wptr == R_CAST(const uintptr_t*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct * sizeof(intptr_t), kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}

HEADER_INLINE void BigstackFinalizeU32(__maybe_unused const uint32_t* u32ptr, uintptr_t ct) {
  assert(u32ptr == R_CAST(const uint32_t*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct * sizeof(int32_t), kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}

HEADER_INLINE void BigstackFinalizeU64(__maybe_unused const uint64_t* u64ptr, uintptr_t ct) {
  assert(u64ptr == R_CAST(const uint64_t*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct * sizeof(int64_t), kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}

HEADER_INLINE void BigstackFinalizeC(__maybe_unused const char* cptr, uintptr_t ct) {
  assert(cptr == R_CAST(const char*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct, kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}

HEADER_INLINE void BigstackFinalizeCp(__maybe_unused const char* const* cpptr, uintptr_t ct) {
  assert(cpptr == R_CAST(const char* const*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct * sizeof(intptr_t), kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}


HEADER_INLINE void BigstackBaseSet(const void* unaligned_base) {
  g_bigstack_base = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, unaligned_base), kCacheline));
}

HEADER_INLINE void BigstackShrinkTop(const void* rebase, uintptr_t new_size) {
  // could assert that this doesn't go in the wrong direction?
  g_bigstack_base = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, rebase) + new_size, kCacheline));
}

// ensure vector-alignment
CONSTI32(kEndAllocAlign, MAXV(kBytesPerVec, 16));

HEADER_INLINE void BigstackEndSet(const void* unaligned_end) {
  g_bigstack_end = R_CAST(unsigned char*, RoundDownPow2(R_CAST(uintptr_t, unaligned_end), kEndAllocAlign));
}

// assumes size is divisible by kEndAllocAlign
// assumes enough space is available
HEADER_INLINE void* bigstack_end_alloc_raw(uintptr_t size) {
  assert(!(size % kEndAllocAlign));
  g_bigstack_end -= size;
  return g_bigstack_end;
}

HEADER_INLINE void* bigstack_end_alloc_raw_rd(uintptr_t size) {
  g_bigstack_end -= RoundUpPow2(size, kEndAllocAlign);
  return g_bigstack_end;
}

HEADER_INLINE void* bigstack_end_alloc_presized(uintptr_t size) {
  uintptr_t cur_bigstack_left = bigstack_left();
  if (size > cur_bigstack_left) {
    g_failed_alloc_attempt_size = size;
    return nullptr;
  }
  return bigstack_end_alloc_raw(size);
}

HEADER_INLINE void* bigstack_end_alloc(uintptr_t size) {
  size = RoundUpPow2(size, kEndAllocAlign);
  return bigstack_end_alloc_presized(size);
}

HEADER_INLINE void* bigstack_end_aligned_alloc(uintptr_t size) {
  return bigstack_end_alloc(size);
}

HEADER_INLINE BoolErr bigstack_end_alloc_c(uintptr_t ct, char** c_arr_ptr) {
  *c_arr_ptr = S_CAST(char*, bigstack_end_alloc(ct));
  return !(*c_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_d(uintptr_t ct, double** d_arr_ptr) {
  *d_arr_ptr = S_CAST(double*, bigstack_end_alloc(ct * sizeof(double)));
  return !(*d_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_f(uintptr_t ct, float** f_arr_ptr) {
  *f_arr_ptr = S_CAST(float*, bigstack_end_alloc(ct * sizeof(float)));
  return !(*f_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_i32(uintptr_t ct, int32_t** i32_arr_ptr) {
  *i32_arr_ptr = S_CAST(int32_t*, bigstack_end_alloc(ct * sizeof(int32_t)));
  return !(*i32_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
  *uc_arr_ptr = S_CAST(unsigned char*, bigstack_end_alloc(ct));
  return !(*uc_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr) {
  *u32_arr_ptr = S_CAST(uint32_t*, bigstack_end_alloc(ct * sizeof(int32_t)));
  return !(*u32_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_w(uintptr_t ct, uintptr_t** w_arr_ptr) {
  *w_arr_ptr = S_CAST(uintptr_t*, bigstack_end_alloc(ct * sizeof(intptr_t)));
  return !(*w_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_i64(uintptr_t ct, int64_t** i64_arr_ptr) {
  *i64_arr_ptr = S_CAST(int64_t*, bigstack_end_alloc(ct * sizeof(int64_t)));
  return !(*i64_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr) {
  *u64_arr_ptr = S_CAST(uint64_t*, bigstack_end_alloc(ct * sizeof(int64_t)));
  return !(*u64_arr_ptr);
}

typedef struct LlStrStruct {
  NONCOPYABLE(LlStrStruct);
  struct LlStrStruct* next;
  char str[];
} LlStr;

HEADER_INLINE BoolErr bigstack_end_alloc_llstr(uintptr_t str_bytes, LlStr** llstr_arr_ptr) {
  *llstr_arr_ptr = S_CAST(LlStr*, bigstack_end_alloc(str_bytes + sizeof(LlStr)));
  return !(*llstr_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
  *cp_arr_ptr = S_CAST(char**, bigstack_end_alloc(ct * sizeof(intptr_t)));
  return !(*cp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr) {
  *kcp_arr_ptr = S_CAST(const char**, bigstack_end_alloc(ct * sizeof(intptr_t)));
  return !(*kcp_arr_ptr);
}

HEADER_INLINE BoolErr bigstack_end_alloc_ucp(uintptr_t ct, unsigned char*** ucp_arr_ptr) {
  *ucp_arr_ptr = S_CAST(unsigned char**, bigstack_end_alloc(ct * sizeof(intptr_t)));
  return !(*ucp_arr_ptr);
}

BoolErr bigstack_end_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr);

BoolErr bigstack_end_calloc_d(uintptr_t ct, double** d_arr_ptr);

BoolErr bigstack_end_calloc_f(uintptr_t ct, float** f_arr_ptr);

BoolErr bigstack_end_calloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr);

BoolErr bigstack_end_calloc_w(uintptr_t ct, uintptr_t** w_arr_ptr);

BoolErr bigstack_end_calloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr);

BoolErr bigstack_end_calloc_cp(uintptr_t ct, char*** cp_arr_ptr);

HEADER_INLINE BoolErr bigstack_end_calloc_c(uintptr_t ct, char** c_arr_ptr) {
  return bigstack_end_calloc_uc(ct, R_CAST(unsigned char**, c_arr_ptr));
}

HEADER_INLINE BoolErr bigstack_end_calloc_i32(uintptr_t ct, int32_t** i32_arr_ptr) {
  return bigstack_end_calloc_u32(ct, R_CAST(uint32_t**, i32_arr_ptr));
}

HEADER_INLINE BoolErr bigstack_end_calloc_i64(uintptr_t ct, int64_t** i64_arr_ptr) {
  return bigstack_end_calloc_u64(ct, R_CAST(uint64_t**, i64_arr_ptr));
}

HEADER_INLINE BoolErr bigstack_end_calloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr) {
  return bigstack_end_calloc_cp(ct, K_CAST(char***, kcp_arr_ptr));
}

// and here's the interface for a non-global arena (necessary for some
// multithreaded code).
HEADER_INLINE void* arena_alloc_raw(uintptr_t size, unsigned char** arena_bottom_ptr) {
  assert(!(size % kCacheline));
  unsigned char* alloc_ptr = *arena_bottom_ptr;
  *arena_bottom_ptr = &(alloc_ptr[size]);
  return alloc_ptr;
}

HEADER_INLINE void* arena_alloc_raw_rd(uintptr_t size, unsigned char** arena_bottom_ptr) {
  unsigned char* alloc_ptr = *arena_bottom_ptr;
  *arena_bottom_ptr = &(alloc_ptr[RoundUpPow2(size, kCacheline)]);
  return alloc_ptr;
}

HEADER_INLINE void* arena_alloc(unsigned char* arena_top, uintptr_t size, unsigned char** arena_bottom_ptr) {
  size = RoundUpPow2(size, kCacheline);
  if (S_CAST(uintptr_t, arena_top - (*arena_bottom_ptr)) < size) {
    g_failed_alloc_attempt_size = size;
    return nullptr;
  }
  return arena_alloc_raw(size, arena_bottom_ptr);
}
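
// Usage sketch (identifiers hypothetical): carve a worker's scratch space out
// of a shared region, using the typed wrappers below.
//   unsigned char* arena_bottom = g_bigstack_base;
//   unsigned char* arena_top = g_bigstack_end;
//   uint32_t* thread_cts;
//   if (unlikely(arena_alloc_u32(arena_top, variant_ct, &arena_bottom, &thread_cts))) {
//     goto CountVariants_ret_NOMEM;
//   }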

HEADER_INLINE BoolErr arena_alloc_c(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, char** c_arr_ptr) {
  *c_arr_ptr = S_CAST(char*, arena_alloc(arena_top, ct, arena_bottom_ptr));
  return !(*c_arr_ptr);
}

HEADER_INLINE BoolErr arena_alloc_d(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, double** d_arr_ptr) {
  *d_arr_ptr = S_CAST(double*, arena_alloc(arena_top, ct * sizeof(double), arena_bottom_ptr));
  return !(*d_arr_ptr);
}

HEADER_INLINE BoolErr arena_alloc_f(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, float** f_arr_ptr) {
  *f_arr_ptr = S_CAST(float*, arena_alloc(arena_top, ct * sizeof(float), arena_bottom_ptr));
  return !(*f_arr_ptr);
}

HEADER_INLINE BoolErr arena_alloc_i32(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, int32_t** i32_arr_ptr) {
  *i32_arr_ptr = S_CAST(int32_t*, arena_alloc(arena_top, ct * sizeof(int32_t), arena_bottom_ptr));
  return !(*i32_arr_ptr);
}

HEADER_INLINE BoolErr arena_alloc_uc(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, unsigned char** uc_arr_ptr) {
  *uc_arr_ptr = S_CAST(unsigned char*, arena_alloc(arena_top, ct, arena_bottom_ptr));
  return !(*uc_arr_ptr);
}

HEADER_INLINE BoolErr arena_alloc_u32(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, uint32_t** u32_arr_ptr) {
  *u32_arr_ptr = S_CAST(uint32_t*, arena_alloc(arena_top, ct * sizeof(int32_t), arena_bottom_ptr));
  return !(*u32_arr_ptr);
}

HEADER_INLINE BoolErr arena_alloc_w(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, uintptr_t** w_arr_ptr) {
  *w_arr_ptr = S_CAST(uintptr_t*, arena_alloc(arena_top, ct * sizeof(intptr_t), arena_bottom_ptr));
  return !(*w_arr_ptr);
}

HEADER_INLINE BoolErr arena_alloc_i64(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, int64_t** i64_arr_ptr) {
  *i64_arr_ptr = S_CAST(int64_t*, arena_alloc(arena_top, ct * sizeof(int64_t), arena_bottom_ptr));
  return !(*i64_arr_ptr);
}

HEADER_INLINE BoolErr arena_alloc_u64(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, uint64_t** u64_arr_ptr) {
  *u64_arr_ptr = S_CAST(uint64_t*, arena_alloc(arena_top, ct * sizeof(int64_t), arena_bottom_ptr));
  return !(*u64_arr_ptr);
}

HEADER_INLINE BoolErr arena_alloc_cp(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, char*** cp_arr_ptr) {
  *cp_arr_ptr = S_CAST(char**, arena_alloc(arena_top, ct * sizeof(intptr_t), arena_bottom_ptr));
  return !(*cp_arr_ptr);
}

HEADER_INLINE void ArenaBaseSet(const void* unaligned_base, unsigned char** arena_bottom_ptr) {
  *arena_bottom_ptr = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, unaligned_base), kCacheline));
}

HEADER_INLINE void* arena_end_alloc_raw(uintptr_t size, unsigned char** arena_top_ptr) {
  assert(!(size % kEndAllocAlign));
  unsigned char* alloc_ptr = *arena_top_ptr;
  alloc_ptr -= size;
  *arena_top_ptr = alloc_ptr;
  return alloc_ptr;
}

HEADER_INLINE void* arena_end_alloc(unsigned char* arena_bottom, uintptr_t size, unsigned char** arena_top_ptr) {
  size = RoundUpPow2(size, kEndAllocAlign);
  if (unlikely(S_CAST(uintptr_t, (*arena_top_ptr) - arena_bottom) < size)) {
    g_failed_alloc_attempt_size = size;
    return nullptr;
  }
  return arena_end_alloc_raw(size, arena_top_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_c(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, char** c_arr_ptr) {
  *c_arr_ptr = S_CAST(char*, arena_end_alloc(arena_bottom, ct, arena_top_ptr));
  return !(*c_arr_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_d(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, double** d_arr_ptr) {
  *d_arr_ptr = S_CAST(double*, arena_end_alloc(arena_bottom, ct * sizeof(double), arena_top_ptr));
  return !(*d_arr_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_f(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, float** f_arr_ptr) {
  *f_arr_ptr = S_CAST(float*, arena_end_alloc(arena_bottom, ct * sizeof(float), arena_top_ptr));
  return !(*f_arr_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_i32(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, int32_t** i32_arr_ptr) {
  *i32_arr_ptr = S_CAST(int32_t*, arena_end_alloc(arena_bottom, ct * sizeof(int32_t), arena_top_ptr));
  return !(*i32_arr_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_uc(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, unsigned char** uc_arr_ptr) {
  *uc_arr_ptr = S_CAST(unsigned char*, arena_end_alloc(arena_bottom, ct, arena_top_ptr));
  return !(*uc_arr_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_u32(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, uint32_t** u32_arr_ptr) {
  *u32_arr_ptr = S_CAST(uint32_t*, arena_end_alloc(arena_bottom, ct * sizeof(int32_t), arena_top_ptr));
  return !(*u32_arr_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_w(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, uintptr_t** w_arr_ptr) {
  *w_arr_ptr = S_CAST(uintptr_t*, arena_end_alloc(arena_bottom, ct * sizeof(intptr_t), arena_top_ptr));
  return !(*w_arr_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_i64(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, int64_t** i64_arr_ptr) {
  *i64_arr_ptr = S_CAST(int64_t*, arena_end_alloc(arena_bottom, ct * sizeof(int64_t), arena_top_ptr));
  return !(*i64_arr_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_u64(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, uint64_t** u64_arr_ptr) {
  *u64_arr_ptr = S_CAST(uint64_t*, arena_end_alloc(arena_bottom, ct * sizeof(int64_t), arena_top_ptr));
  return !(*u64_arr_ptr);
}

HEADER_INLINE BoolErr arena_end_alloc_cp(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, char*** cp_arr_ptr) {
  *cp_arr_ptr = S_CAST(char**, arena_end_alloc(arena_bottom, ct * sizeof(intptr_t), arena_top_ptr));
  return !(*cp_arr_ptr);
}


BoolErr PushLlStr(const char* str, LlStr** ll_stack_ptr);
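
// Usage sketch (cur_id hypothetical; entries come back in reverse push
// order):
//   LlStr* ll_stack = nullptr;
//   if (unlikely(PushLlStr(cur_id, &ll_stack))) {
//     goto CollectIds_ret_NOMEM;
//   }
//   for (const LlStr* iter = ll_stack; iter; iter = iter->next) {
//     // ... process iter->str ...
//   }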

// Does not require null-termination.
// BoolErr push_llstr_counted(const char* str, uint32_t slen, LlStr** ll_stack_ptr);

typedef struct L32StrStruct {
  NONCOPYABLE(L32StrStruct);
  uint32_t len;
  char str[];
} L32Str;


// assumes multistr is nonempty
BoolErr CountAndMeasureMultistrReverseAlloc(const char* multistr, uintptr_t max_str_ct, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr, const char*** strptr_arrp);

BoolErr MultistrToStrboxDedupArenaAlloc(unsigned char* arena_top, const char* multistr, unsigned char** arena_bottom_ptr, char** sorted_strbox_ptr, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr);

HEADER_INLINE BoolErr MultistrToStrboxDedupAlloc(const char* multistr, char** sorted_strbox_ptr, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr) {
  return MultistrToStrboxDedupArenaAlloc(g_bigstack_end, multistr, &g_bigstack_base, sorted_strbox_ptr, str_ct_ptr, max_blen_ptr);
}


void DivisionMagicNums(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp);

// ZeroU32Arr, ZeroWArr, ZeroU64Arr, SetAllWArr currently defined in
// plink2_base.h

HEADER_INLINE void ZeroVecArr(uintptr_t entry_ct, VecW* vvec) {
  memset(vvec, 0, entry_ct * kBytesPerVec);
}

HEADER_INLINE void SetAllU64Arr(uintptr_t entry_ct, uint64_t* u64arr) {
  // bugfix (1 Feb 2018): forgot to multiply by 2 in 32-bit case
  SetAllWArr(entry_ct * (sizeof(int64_t) / kBytesPerWord), R_CAST(uintptr_t*, u64arr));
}

HEADER_INLINE void ZeroI32Arr(uintptr_t entry_ct, int32_t* i32arr) {
  memset(i32arr, 0, entry_ct * sizeof(int32_t));
}

HEADER_INLINE void SetAllI32Arr(uintptr_t entry_ct, int32_t* i32arr) {
  for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
    *i32arr++ = -1;
  }
}

HEADER_INLINE void SetAllU32Arr(uintptr_t entry_ct, uint32_t* u32arr) {
  for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
    *u32arr++ = ~0U;
  }
}

HEADER_INLINE void ZeroFArr(uintptr_t entry_ct, float* farr) {
  for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
    *farr++ = 0.0;
  }
}

HEADER_INLINE void ZeroDArr(uintptr_t entry_ct, double* darr) {
  for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
    *darr++ = 0.0;
  }
}


HEADER_INLINE void ZeromovFArr(uintptr_t entry_ct, float** farr_ptr) {
  float* farr = *farr_ptr;
  for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
    *farr++ = 0.0;
  }
  *farr_ptr = farr;
}


// SetAllBits, IsSet, SetBit, ClearBit, AdvTo1Bit, AdvTo0Bit, AdvBoundedTo1Bit,
// FindLast1BitBefore, AllWordsAreZero defined in plink2_base.h

// Useful when we don't want to think about the signedness of a 32-bit int.
HEADER_INLINE void SetBitI(int32_t loc, uintptr_t* bitarr) {
  SetBit(S_CAST(uint32_t, loc), bitarr);
}

HEADER_INLINE void FlipBit(uintptr_t loc, uintptr_t* bitarr) {
  bitarr[loc / kBitsPerWord] ^= k1LU << (loc % kBitsPerWord);
}

// "Nz" added to names to make it obvious these require positive len
void FillBitsNz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr);
void ClearBitsNz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr);

// floor permitted to be -1, though not smaller than that.
int32_t FindLast1BitBeforeBounded(const uintptr_t* bitarr, uint32_t loc, int32_t floor);

// This can be made a tiny bit faster than memequal() in isolation, but the
// difference is almost certainly too small to justify additional i-cache
// pressure.
HEADER_INLINE uint32_t wordsequal(const uintptr_t* word_arr1, const uintptr_t* word_arr2, uintptr_t word_ct) {
  return memequal(word_arr1, word_arr2, word_ct * kBytesPerWord);
}


// BitvecAnd(), BitvecInvmask(), BitvecOr(), BitvecInvert() in plink2_base.h

void BitvecAndCopy(const uintptr_t* __restrict source1_bitvec, const uintptr_t* __restrict source2_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec);

void BitvecInvmaskCopy(const uintptr_t* __restrict source_bitvec, const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec);

void BitvecInvertCopy(const uintptr_t* __restrict source_bitvec, uintptr_t word_ct, uintptr_t* __restrict target_bitvec);

// 'AlignedBitarr' instead of 'Bitvec' since these take bit_ct instead of
// word_ct as the size argument, and they zero out the trailing bits.
HEADER_INLINE void AlignedBitarrInvert(uintptr_t bit_ct, uintptr_t* main_bitvec) {
  const uintptr_t fullword_ct = bit_ct / kBitsPerWord;
  BitvecInvert(fullword_ct, main_bitvec);
  const uint32_t trail_ct = bit_ct % kBitsPerWord;
  if (trail_ct) {
    main_bitvec[fullword_ct] = bzhi(~main_bitvec[fullword_ct], trail_ct);
  }
}

HEADER_INLINE void AlignedBitarrInvertCopy(const uintptr_t* __restrict source_bitvec, uintptr_t bit_ct, uintptr_t* __restrict target_bitvec) {
  const uintptr_t fullword_ct = bit_ct / kBitsPerWord;
  BitvecInvertCopy(source_bitvec, fullword_ct, target_bitvec);
  const uint32_t trail_ct = bit_ct % kBitsPerWord;
  if (trail_ct) {
    target_bitvec[fullword_ct] = bzhi(~source_bitvec[fullword_ct], trail_ct);
  }
}
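
// e.g. with 64-bit words and bit_ct == 70, word 0 is fully inverted, the low
// 6 bits of word 1 are inverted, and bits 70..127 are zeroed.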

// void BitvecXor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);

void BitvecInvertAndMask(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);

// void BitvecOrNot(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);

// Address C-only incompatible-pointer-types-discards-qualifiers warning.
#ifdef __cplusplus
#  define TO_CONSTU32PCONSTP(u32_pp) (u32_pp)
#else
#  define TO_CONSTU32PCONSTP(u32_pp) ((const uint32_t* const*)(u32_pp))
#endif

HEADER_INLINE void U32VecAdd(const VecU32* arg_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    main_u32vec[vidx] += arg_u32vec[vidx];
  }
}

HEADER_INLINE void U32CastVecAdd(const uint32_t* arg_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
  U32VecAdd(R_CAST(const VecU32*, arg_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
}

HEADER_INLINE void I32CastVecAdd(const int32_t* arg_i32arr, uintptr_t vec_ct, int32_t* main_i32arr) {
  U32VecAdd(R_CAST(const VecU32*, arg_i32arr), vec_ct, R_CAST(VecU32*, main_i32arr));
}
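
// Caller sketch (assumes DivUp and kInt32PerVec from plink2_base; identifiers
// hypothetical): operand arrays must be vector-aligned and padded to a whole
// number of vectors.
//   const uintptr_t vec_ct = DivUp(entry_ct, kInt32PerVec);
//   U32CastVecAdd(src_cts, vec_ct, dst_cts);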

HEADER_INLINE void U32VecSub(const VecU32* arg_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    main_u32vec[vidx] -= arg_u32vec[vidx];
  }
}

HEADER_INLINE void U32CastVecSub(const uint32_t* arg_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
  U32VecSub(R_CAST(const VecU32*, arg_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
}

HEADER_INLINE void U32VecMaskedAdd(const VecU32* mask_u32vec, const VecU32* arg_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    main_u32vec[vidx] += mask_u32vec[vidx] & arg_u32vec[vidx];
  }
}

HEADER_INLINE void U32CastVecMaskedAdd(const uint32_t* mask_u32arr, const uint32_t* arg_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
  U32VecMaskedAdd(R_CAST(const VecU32*, mask_u32arr), R_CAST(const VecU32*, arg_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
}

HEADER_INLINE void U32VecInvmaskedAdd(const VecU32* invmask_u32vec, const VecU32* arg_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    main_u32vec[vidx] += vecu32_and_notfirst(invmask_u32vec[vidx], arg_u32vec[vidx]);
  }
}

HEADER_INLINE void U32CastVecInvmaskedAdd(const uint32_t* invmask_u32arr, const uint32_t* arg_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
  U32VecInvmaskedAdd(R_CAST(const VecU32*, invmask_u32arr), R_CAST(const VecU32*, arg_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
}


HEADER_INLINE void U32VecAdd2(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    main_u32vec[vidx] += arg1_u32vec[vidx] + arg2_u32vec[vidx];
  }
}

HEADER_INLINE void U32CastVecAdd2(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
  U32VecAdd2(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
}

HEADER_INLINE void U32VecAssignAdd2(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    main_u32vec[vidx] = arg1_u32vec[vidx] + arg2_u32vec[vidx];
  }
}

HEADER_INLINE void U32CastVecAssignAdd2(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
  U32VecAssignAdd2(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
}

HEADER_INLINE void U32VecMaskedAdd2(const VecU32* mask_u32vec, const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    main_u32vec[vidx] += mask_u32vec[vidx] & (arg1_u32vec[vidx] + arg2_u32vec[vidx]);
  }
}

HEADER_INLINE void U32CastVecMaskedAdd2(const uint32_t* mask_u32arr, const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
  U32VecMaskedAdd2(R_CAST(const VecU32*, mask_u32arr), R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
}

HEADER_INLINE void U32VecAddSub(const VecU32* add_u32vec, const VecU32* sub_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
  for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
    main_u32vec[vidx] += add_u32vec[vidx] - sub_u32vec[vidx];
  }
}

HEADER_INLINE void U32CastVecAddSub(const uint32_t* add_u32arr, const uint32_t* sub_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1233   U32VecAddSub(R_CAST(const VecU32*, add_u32arr), R_CAST(const VecU32*, sub_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1234 }
1235 
1236 
U32VecAdd3(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,const VecU32 * arg3_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1237 HEADER_INLINE void U32VecAdd3(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, const VecU32* arg3_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1238   for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1239     main_u32vec[vidx] += arg1_u32vec[vidx] + arg2_u32vec[vidx] + arg3_u32vec[vidx];
1240   }
1241 }
1242 
U32CastVecAdd3(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,const uint32_t * arg3_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1243 HEADER_INLINE void U32CastVecAdd3(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, const uint32_t* arg3_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1244   U32VecAdd3(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), R_CAST(const VecU32*, arg3_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1245 }
1246 
U32VecAssignAdd3(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,const VecU32 * arg3_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1247 HEADER_INLINE void U32VecAssignAdd3(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, const VecU32* arg3_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1248   for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1249     main_u32vec[vidx] = arg1_u32vec[vidx] + arg2_u32vec[vidx] + arg3_u32vec[vidx];
1250   }
1251 }
1252 
U32CastVecAssignAdd3(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,const uint32_t * arg3_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1253 HEADER_INLINE void U32CastVecAssignAdd3(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, const uint32_t* arg3_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1254   U32VecAssignAdd3(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), R_CAST(const VecU32*, arg3_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1255 }
1256 
U32VecSub3(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,const VecU32 * arg3_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1257 HEADER_INLINE void U32VecSub3(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, const VecU32* arg3_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1258   for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1259     main_u32vec[vidx] -= arg1_u32vec[vidx] + arg2_u32vec[vidx] + arg3_u32vec[vidx];
1260   }
1261 }
1262 
U32CastVecSub3(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,const uint32_t * arg3_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1263 HEADER_INLINE void U32CastVecSub3(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, const uint32_t* arg3_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1264   U32VecSub3(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), R_CAST(const VecU32*, arg3_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1265 }
1266 
U32VecAssignAdd5(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,const VecU32 * arg3_u32vec,const VecU32 * arg4_u32vec,const VecU32 * arg5_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1267 HEADER_INLINE void U32VecAssignAdd5(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, const VecU32* arg3_u32vec, const VecU32* arg4_u32vec, const VecU32* arg5_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1268   for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1269     main_u32vec[vidx] = arg1_u32vec[vidx] + arg2_u32vec[vidx] + arg3_u32vec[vidx] + arg4_u32vec[vidx] + arg5_u32vec[vidx];
1270   }
1271 }
1272 
U32CastVecAssignAdd5(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,const uint32_t * arg3_u32arr,const uint32_t * arg4_u32arr,const uint32_t * arg5_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1273 HEADER_INLINE void U32CastVecAssignAdd5(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, const uint32_t* arg3_u32arr, const uint32_t* arg4_u32arr, const uint32_t* arg5_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1274   U32VecAssignAdd5(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), R_CAST(const VecU32*, arg3_u32arr), R_CAST(const VecU32*, arg4_u32arr), R_CAST(const VecU32*, arg5_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1275 }
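// The *Cast* wrappers above just reinterpret uint32_t/int32_t arrays as
// VecU32 arrays, so callers must supply vector-aligned buffers sized in whole
// vectors.  Minimal usage sketch (hypothetical buffer names; assumes
// kInt32PerVec from plink2_base is the per-vector uint32 count):
//   const uintptr_t vec_ct = DivUp(entry_ct, kInt32PerVec);
//   // main_cts/add_cts vector-aligned, vec_ct * kInt32PerVec entries each
//   U32CastVecAdd(add_cts, vec_ct, main_cts);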


// basic linear scan
// returns -1 on failure to find, -2 if duplicate
int32_t GetVariantUidxWithoutHtable(const char* idstr, const char* const* variant_ids, const uintptr_t* variant_include, uint32_t variant_ct);

// copy_subset() doesn't exist since a loop of the form
//   uintptr_t uidx_base = 0;
//   uintptr_t cur_bits = subset_mask[0];
//   for (uint32_t idx = 0; idx != subset_size; ++idx) {
//     const uintptr_t uidx = BitIter1(subset_mask, &uidx_base, &cur_bits);
//     *target_iter++ = source_arr[uidx];
//   }
// seems to compile better?
//
// similarly, copy_when_nonmissing() retired in favor of genoarr_to_nonmissing
// followed by a for loop

// Benchmark results justify replacing this with XXH3 when the latter is ready.
// Not considering it ready yet since the development XXH_INLINE_ALL setup
// isn't very friendly.
//
// (Can also try using _mm_aesenc_si128() in a similar manner to the Golang
// runtime when SSE4.2+ is available, and otherwise keeping Murmurhash.  See
// aeshashbody() in https://golang.org/src/runtime/asm_amd64.s .)
//
// Eventually want this to be a constexpr?  Seems painful to make that work,
// though.
uint32_t Hash32(const void* key, uint32_t len);

// see http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
// Note that this is a bit more vulnerable to adversarial input: modulo
// reduction requires lots of hash collisions (or near-collisions) or known
// hash table size, while this can be attacked without knowledge of the table
// size.  But it doesn't really matter; either way, you'd need to add something
// like a randomly generated salt if you care about defending.
HEADER_INLINE uint32_t Hashceil(const char* idstr, uint32_t idlen, uint32_t htable_size) {
  return (S_CAST(uint64_t, Hash32(idstr, idlen)) * htable_size) >> 32;
}
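// Worked example of the multiply-shift: with htable_size == 1000, a 32-bit
// hash of 0x80000000 maps to (0x80000000 * 1000) >> 32 == 500; hashes in
// [0, 2^32) are spread uniformly across [0, htable_size) without a modulo.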

// uintptr_t geqprime(uintptr_t floor);

// assumes ceil is odd and greater than 4.  Returns the first prime <= ceil.
// uintptr_t leqprime(uintptr_t ceil);

HEADER_INLINE uint32_t GetHtableMinSize(uintptr_t item_ct) {
  // Don't need primes any more.
  return item_ct * 2;
}

// load factor ~22% seems to yield the best speed/space tradeoff on my test
// machines
HEADER_INLINE uint32_t GetHtableFastSize(uint32_t item_ct) {
  if (item_ct < 954437176) {
    return (item_ct * 9LLU) / 2;
  }
  return 4294967290U;
}
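// Arithmetic check on GetHtableFastSize(): a 4.5x table corresponds to a
// 1/4.5 ~= 22% load factor; the 9LLU multiply keeps the intermediate product
// in 64 bits, and since 954437177 * 4.5 already exceeds 2^32 - 1, item counts
// at or above 954437176 are conservatively clamped to 4294967290.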

BoolErr HtableGoodSizeAlloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t** htable_ptr, uint32_t* htable_size_ptr);

// useful for duplicate detection: returns 0 on no duplicates, a positive index
// of a duplicate pair if they're present
uint32_t PopulateStrboxHtable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);

// returned index in duplicate-pair case is unfiltered
// uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);

// cur_id DOES need to be null-terminated
uint32_t IdHtableFind(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size);

// item_ids overread must be ok.  In exchange, cur_id doesn't need to be
// null-terminated any more.
uint32_t IdHtableFindNnt(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size);

// assumes cur_id_slen < max_str_blen.
// requires cur_id to be null-terminated.
uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size);

// last variant_ids entry must be at least kMaxIdBlen bytes before end of
// bigstack
uint32_t VariantIdDupflagHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen);

uint32_t VariantIdDupHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, const uint32_t* htable_dup_base, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen, uint32_t* llidx_ptr);


// This still performs a temporary bigstack allocation; 'noalloc' here just
// means that sorted_strbox and id_map must be allocated in advance.  (Overread
// must be safe.)
PglErr CopySortStrboxSubsetNoalloc(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char* __restrict sorted_strbox, uint32_t* __restrict id_map);

PglErr CopySortStrboxSubset(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char** sorted_strbox_ptr, uint32_t** id_map_ptr);


typedef struct RangeListStruct {
  NONCOPYABLE(RangeListStruct);
  char* names;
  unsigned char* starts_range;
  uint32_t name_ct;
  uint32_t name_max_blen;
} RangeList;

void InitRangeList(RangeList* range_list_ptr);

void CleanupRangeList(RangeList* range_list_ptr);

// bitarr assumed to be initialized (but not necessarily zero-initialized)
BoolErr NumericRangeListToBitarr(const RangeList* range_list_ptr, uint32_t bitarr_size, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr);

PglErr StringRangeListToBitarr(const char* header_line, const RangeList* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t* bitarr, int32_t* __restrict seen_idxs);

PglErr StringRangeListToBitarrAlloc(const char* header_line, const RangeList* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t** bitarr_ptr);


#ifdef __LP64__
HEADER_INLINE uintptr_t PopcountWordsNzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
  uintptr_t prefix_ct = 0;
#  ifdef USE_AVX2
  while (start_idx & 3) {
    if (end_idx == start_idx) {
      return prefix_ct;
    }
    prefix_ct += PopcountWord(bitvec[start_idx++]);
  }
#  else
  if (start_idx & 1) {
    if (end_idx == start_idx) {
      return 0;
    }
    prefix_ct = PopcountWord(bitvec[start_idx++]);
  }
#  endif  // USE_AVX2
  return prefix_ct + PopcountWords(&(bitvec[start_idx]), end_idx - start_idx);
}
#else
HEADER_INLINE uintptr_t PopcountWordsNzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
  return PopcountWords(&(bitvec[start_idx]), end_idx - start_idx);
}
#endif
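// (The word-at-a-time peeling above advances start_idx to a vector boundary
// -- 4 words per AVX2 vector, 2 per SSE2 vector -- presumably because
// PopcountWords() expects vector-aligned input.)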

// start_idx == end_idx ok
uintptr_t PopcountBitRange(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx);

HEADER_INLINE uint32_t IntersectionIsEmpty(const uintptr_t* bitvec1, const uintptr_t* bitvec2, uintptr_t word_ct) {
#ifdef USE_SSE42
  const uintptr_t fullvec_ct = word_ct / kWordsPerVec;
#  ifdef USE_AVX2
  const __m256i* bitvvec1 = R_CAST(const __m256i*, bitvec1);
  const __m256i* bitvvec2 = R_CAST(const __m256i*, bitvec2);
  for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) {
    if (!_mm256_testz_si256(bitvvec1[vidx], bitvvec2[vidx])) {
      return 0;
    }
  }
#  else
  const __m128i* bitvvec1 = R_CAST(const __m128i*, bitvec1);
  const __m128i* bitvvec2 = R_CAST(const __m128i*, bitvec2);
  for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) {
    if (!_mm_testz_si128(bitvvec1[vidx], bitvvec2[vidx])) {
      return 0;
    }
  }
#  endif
  word_ct = word_ct & (kWordsPerVec - 1);
  bitvec1 = R_CAST(const uintptr_t*, &(bitvvec1[fullvec_ct]));
  bitvec2 = R_CAST(const uintptr_t*, &(bitvvec2[fullvec_ct]));
#endif
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    if (bitvec1[widx] & bitvec2[widx]) {
      return 0;
    }
  }
  return 1;
}
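// (_mm256_testz_si256()/_mm_testz_si128() return 1 exactly when the AND of
// their arguments is all-zero, so the vector loops above bail out on the
// first vector with a nonempty intersection.)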

// could use testz here, but it's more awkward
HEADER_INLINE uint32_t UnionIsFull(const uintptr_t* bitarr1, const uintptr_t* bitarr2, uintptr_t bit_ct) {
  const uintptr_t fullword_ct = bit_ct / kBitsPerWord;
  for (uintptr_t widx = 0; widx != fullword_ct; ++widx) {
    if ((bitarr1[widx] | bitarr2[widx]) != ~k0LU) {
      return 0;
    }
  }
  const uint32_t trailing_bit_ct = bit_ct % kBitsPerWord;
  if (trailing_bit_ct) {
    if ((bitarr1[fullword_ct] | bitarr2[fullword_ct]) != ((k1LU << trailing_bit_ct) - k1LU)) {
      return 0;
    }
  }
  return 1;
}

// PopcountWordsIntersect moved to plink2_base

void PopcountWordsIntersect3val(const uintptr_t* __restrict bitvec1, const uintptr_t* __restrict bitvec2, uint32_t word_ct, uint32_t* __restrict popcount1_ptr, uint32_t* __restrict popcount2_ptr, uint32_t* __restrict popcount_intersect_ptr);

// uintptr_t count_11_longs(const uintptr_t* genovec, uintptr_t word_ct);
// (see CountNyp in pgenlib_misc for that functionality)

// Must be safe to read from bitarr[start_idx / kBitsPerWord].
uint32_t AllBitsAreZero(const uintptr_t* bitarr, uintptr_t start_idx, uintptr_t end_idx);

// does not assume relevant bits of target_bitarr are zero
HEADER_INLINE void CopyBits(uintptr_t src_word, uint64_t target_start_bitidx, uint32_t len, uintptr_t* target_bitarr) {
  uintptr_t* update_ptr = &(target_bitarr[target_start_bitidx / kBitsPerWord]);
  uintptr_t target_word = *update_ptr;
  const uint32_t bit_idx_start = target_start_bitidx % kBitsPerWord;
  const uintptr_t shifted_new_bits = src_word << bit_idx_start;
  const uint32_t bit_idx_end = bit_idx_start + len;
  if (bit_idx_end <= kBitsPerWord) {
    const uintptr_t invmask = bzhi_max((~k0LU) << bit_idx_start, bit_idx_end);
    *update_ptr = (target_word & (~invmask)) | shifted_new_bits;
    return;
  }
  *update_ptr++ = bzhi(target_word, bit_idx_start) | shifted_new_bits;
  target_word = *update_ptr;
  const uint32_t remainder = bit_idx_end - kBitsPerWord;
  *update_ptr = (target_word & ((~k0LU) << remainder)) | (src_word >> (len - remainder));
}
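// Worked example (kBitsPerWord == 64): target_start_bitidx == 60, len == 8
// gives bit_idx_start == 60 and bit_idx_end == 68 > 64, so the copy straddles
// two words.  bzhi() preserves the low 60 bits of the first word while the
// low 4 bits of src_word land in bits 60..63; then remainder == 4, so
// src_word >> (8 - 4) deposits bits 4..7 of src_word into bits 0..3 of the
// second word, whose bits 4..63 are preserved.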

// assumes len is positive, and relevant bits of target_bitarr are zero
void CopyBitarrRange(const uintptr_t* __restrict src_bitarr, uintptr_t src_start_bitidx, uintptr_t target_start_bitidx, uintptr_t len, uintptr_t* __restrict target_bitarr);


// vertical popcount support
// VcountScramble1() and friends are here since they apply to generic
// arrays; scramble_2_4_8_32() is more plink-specific

#ifdef __LP64__
#  ifdef USE_AVX2
HEADER_INLINE uint32_t VcountScramble1(uint32_t orig_idx) {
  // 1->4: 0 4 8 12 16 20 24 28 32 ... 252 1 5 9 ...
  // 4->8: 0 8 16 24 32 ... 248 4 12 20 ... 1 9 17 ...
  // 8->32: 0 32 ... 224 8 40 ... 232 ... 248 4 36 ... 252 1 33 ...
  return (orig_idx & (~255)) + ((orig_idx & 3) * 64) + ((orig_idx & 4) * 8) + (orig_idx & 24) + ((orig_idx & 224) / 32);
}
#  else
HEADER_INLINE uint32_t VcountScramble1(uint32_t orig_idx) {
  // 1->4: 0 4 8 12 16 20 24 28 32 ... 124 1 5 9 ...
  // 4->8: 0 8 16 24 32 ... 120 4 12 20 ... 1 9 17 ...
  // 8->32: 0 32 64 96 8 40 72 104 16 48 80 112 24 56 88 120 4 36 68 ... 1 33 ...
  return (orig_idx & (~127)) + ((orig_idx & 3) * 32) + ((orig_idx & 4) * 4) + ((orig_idx & 24) / 2) + ((orig_idx & 96) / 32);
}
#  endif
#else
// 1->4: 0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 ...
// 4->8: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18 ...
// 8->32: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18 ...
HEADER_INLINE uint32_t VcountScramble1(uint32_t orig_idx) {
  return (orig_idx & (~31)) + ((orig_idx & 3) * 8) + (orig_idx & 4) + ((orig_idx & 24) / 8);
}
#endif

// change acc1 type to const void* if a VecW* use case arises
HEADER_INLINE void VcountIncr1To4(const uintptr_t* acc1, uint32_t acc1_vec_ct, VecW* acc4_iter) {
  const VecW m1x4 = VCONST_W(kMask1111);
  const VecW* acc1_iter = R_CAST(const VecW*, acc1);
  for (uint32_t vidx = 0; vidx != acc1_vec_ct; ++vidx) {
    VecW loader = *acc1_iter++;
    *acc4_iter = (*acc4_iter) + (loader & m1x4);
    ++acc4_iter;
    loader = vecw_srli(loader, 1);
    *acc4_iter = (*acc4_iter) + (loader & m1x4);
    ++acc4_iter;
    loader = vecw_srli(loader, 1);
    *acc4_iter = (*acc4_iter) + (loader & m1x4);
    ++acc4_iter;
    loader = vecw_srli(loader, 1);
    *acc4_iter = (*acc4_iter) + (loader & m1x4);
    ++acc4_iter;
  }
}

HEADER_INLINE void Vcount0Incr1To4(uint32_t acc1_vec_ct, uintptr_t* acc1, VecW* acc4_iter) {
  const VecW m1x4 = VCONST_W(kMask1111);
  VecW* acc1_iter = R_CAST(VecW*, acc1);
  for (uint32_t vidx = 0; vidx != acc1_vec_ct; ++vidx) {
    VecW loader = *acc1_iter;
    *acc1_iter++ = vecw_setzero();
    *acc4_iter = (*acc4_iter) + (loader & m1x4);
    ++acc4_iter;
    loader = vecw_srli(loader, 1);
    *acc4_iter = (*acc4_iter) + (loader & m1x4);
    ++acc4_iter;
    loader = vecw_srli(loader, 1);
    *acc4_iter = (*acc4_iter) + (loader & m1x4);
    ++acc4_iter;
    loader = vecw_srli(loader, 1);
    *acc4_iter = (*acc4_iter) + (loader & m1x4);
    ++acc4_iter;
  }
}

// er, should this just be the same function as unroll_incr_2_4 with extra
// parameters?...
HEADER_INLINE void VcountIncr4To8(const VecW* acc4_iter, uint32_t acc4_vec_ct, VecW* acc8_iter) {
  const VecW m4 = VCONST_W(kMask0F0F);
  for (uint32_t vidx = 0; vidx != acc4_vec_ct; ++vidx) {
    VecW loader = *acc4_iter++;
    *acc8_iter = (*acc8_iter) + (loader & m4);
    ++acc8_iter;
    loader = vecw_srli(loader, 4);
    *acc8_iter = (*acc8_iter) + (loader & m4);
    ++acc8_iter;
  }
}

HEADER_INLINE void Vcount0Incr4To8(uint32_t acc4_vec_ct, VecW* acc4_iter, VecW* acc8_iter) {
  const VecW m4 = VCONST_W(kMask0F0F);
  for (uint32_t vidx = 0; vidx != acc4_vec_ct; ++vidx) {
    VecW loader = *acc4_iter;
    *acc4_iter++ = vecw_setzero();
    *acc8_iter = (*acc8_iter) + (loader & m4);
    ++acc8_iter;
    loader = vecw_srli(loader, 4);
    *acc8_iter = (*acc8_iter) + (loader & m4);
    ++acc8_iter;
  }
}

HEADER_INLINE void VcountIncr8To32(const VecW* acc8_iter, uint32_t acc8_vec_ct, VecW* acc32_iter) {
  const VecW m8x32 = VCONST_W(kMask000000FF);
  for (uint32_t vidx = 0; vidx != acc8_vec_ct; ++vidx) {
    VecW loader = *acc8_iter++;
    *acc32_iter = (*acc32_iter) + (loader & m8x32);
    ++acc32_iter;
    loader = vecw_srli(loader, 8);
    *acc32_iter = (*acc32_iter) + (loader & m8x32);
    ++acc32_iter;
    loader = vecw_srli(loader, 8);
    *acc32_iter = (*acc32_iter) + (loader & m8x32);
    ++acc32_iter;
    loader = vecw_srli(loader, 8);
    *acc32_iter = (*acc32_iter) + (loader & m8x32);
    ++acc32_iter;
  }
}

HEADER_INLINE void Vcount0Incr8To32(uint32_t acc8_vec_ct, VecW* acc8_iter, VecW* acc32_iter) {
  const VecW m8x32 = VCONST_W(kMask000000FF);
  for (uint32_t vidx = 0; vidx != acc8_vec_ct; ++vidx) {
    VecW loader = *acc8_iter;
    *acc8_iter++ = vecw_setzero();
    *acc32_iter = (*acc32_iter) + (loader & m8x32);
    ++acc32_iter;
    loader = vecw_srli(loader, 8);
    *acc32_iter = (*acc32_iter) + (loader & m8x32);
    ++acc32_iter;
    loader = vecw_srli(loader, 8);
    *acc32_iter = (*acc32_iter) + (loader & m8x32);
    ++acc32_iter;
    loader = vecw_srli(loader, 8);
    *acc32_iter = (*acc32_iter) + (loader & m8x32);
    ++acc32_iter;
  }
}
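
// Typical accumulation cascade (a sketch, not a fixed protocol): run
// VcountIncr1To4() on at most 15 one-bit batches, since the 4-bit lanes
// saturate at 15; flush with Vcount0Incr4To8(), which may safely run 17 times
// between 8->32 flushes (17 * 15 == 255 just fits in 8 bits); finally,
// Vcount0Incr8To32() spills into the 32-bit accumulators, with
// VcountScramble1() giving each original bit index's final position.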


// forward_ct must be positive.  Stays put if forward_ct == 1 and current bit
// is set.
// In usual 64-bit case, also assumes bitvec is vector aligned.
uintptr_t FindNth1BitFrom(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct);

HEADER_INLINE uint32_t IdxToUidxBasic(const uintptr_t* bitvec, uint32_t idx) {
  return FindNth1BitFrom(bitvec, 0, idx + 1);
}
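// This converts a post-filter item index into the corresponding raw
// (unfiltered) index.  E.g. if bitvec[0] == 0xa (bits 1 and 3 set),
// IdxToUidxBasic(bitvec, 0) == 1 and IdxToUidxBasic(bitvec, 1) == 3.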

// variant_ct must be positive, but can be smaller than thread_ct
void ComputeUidxStartPartition(const uintptr_t* variant_include, uint64_t variant_ct, uint32_t thread_ct, uint32_t first_uidx, uint32_t* variant_uidx_starts);

// Set multiplier to 0 to only count extra alleles, 1 to also count alt1 for
// those variants (useful for HWE), 2 to count both ref and alt1 for
// multiallelic variants.
// allele_idx_offsets == nullptr ok.
uintptr_t CountExtraAlleles(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t multiallelic_variant_ct_multiplier);

uint32_t MaxAlleleCtSubset(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct);

void ComputePartitionAligned(const uintptr_t* variant_include, uint32_t orig_thread_ct, uint32_t first_variant_uidx, uint32_t cur_variant_idx, uint32_t cur_variant_ct, uint32_t alignment, uint32_t* variant_uidx_starts, uint32_t* vidx_starts);

BoolErr ParseNextRange(const char* const* argvk, uint32_t param_ct, char range_delim, uint32_t* cur_param_idx_ptr, const char** cur_arg_pptr, const char** range_start_ptr, uint32_t* rs_len_ptr, const char** range_end_ptr, uint32_t* re_len_ptr);

PglErr ParseNameRanges(const char* const* argvk, const char* errstr_append, uint32_t param_ct, uint32_t require_posint, char range_delim, RangeList* range_list_ptr);


// Analytically finds all real roots of x^3 + ax^2 + bx + c, saving them in
// solutions[] (sorted from smallest to largest), and returning the count.
// Multiple roots are only returned/counted once.
uint32_t CubicRealRoots(double coef_a, double coef_b, double coef_c, STD_ARRAY_REF(double, 3) solutions);
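// Usage sketch: x^3 - 6x^2 + 11x - 6 == (x - 1)(x - 2)(x - 3), so
//   STD_ARRAY_DECL(double, 3, roots);
//   const uint32_t root_ct = CubicRealRoots(-6.0, 11.0, -6.0, roots);
// should yield root_ct == 3 and roots == {1.0, 2.0, 3.0} up to floating-point
// error.  (STD_ARRAY_DECL is assumed to be the plink2_base array-declaration
// helper matching STD_ARRAY_REF above.)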


// store_all_dups currently must be true for dup_ct to be set, but this is easy
// to change
PglErr PopulateIdHtableMt(unsigned char* arena_top, const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t store_all_dups, uint32_t id_htable_size, uint32_t thread_ct, unsigned char** arena_bottom_ptr, uint32_t* id_htable, uint32_t* dup_ct_ptr);

PglErr CheckIdUniqueness(unsigned char* arena_bottom, unsigned char* arena_top, const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t max_thread_ct, uint32_t* dup_found_ptr);

// Pass in htable_dup_base_ptr == nullptr if just flagging duplicate IDs rather
// than tracking all their positions in item_ids.
// Otherwise, htable_dup_base entries are guaranteed to be filled in increasing
// order (briefly made that nondeterministic on 11 Oct 2019 and broke --rm-dup,
// not doing that again).
PglErr AllocAndPopulateIdHtableMt(const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uintptr_t fast_size_min_extra_bytes, uint32_t max_thread_ct, uint32_t** id_htable_ptr, uint32_t** htable_dup_base_ptr, uint32_t* id_htable_size_ptr, uint32_t* dup_ct_ptr);


typedef struct HelpCtrlStruct {
  NONCOPYABLE(HelpCtrlStruct);
  uint32_t iters_left;
  uint32_t param_ct;
  const char* const* argv;
  uintptr_t unmatched_ct;
  uintptr_t* all_match_arr;
  uintptr_t* prefix_match_arr;
  uintptr_t* perfect_match_arr;
  uint32_t* param_slens;
  uint32_t preprint_newline;
} HelpCtrl;

void HelpPrint(const char* cur_params, HelpCtrl* help_ctrl_ptr, uint32_t postprint_newline, const char* payload);


extern const char kErrstrNomem[];
extern const char kErrstrWrite[];
extern const char kErrstrThreadCreate[];

// assumes logfile is open
void DispExitMsg(PglErr reterr);

BoolErr CheckExtraParam(const char* const* argvk, const char* permitted_modif, uint32_t* other_idx_ptr);

char ExtractCharParam(const char* ss);

PglErr CmdlineAllocString(const char* source, const char* flag_name, uint32_t max_slen, char** sbuf_ptr);

PglErr AllocFname(const char* source, const char* flagname_p, uint32_t extra_size, char** fnbuf_ptr);

PglErr AllocAndFlatten(const char* const* sources, uint32_t param_ct, uint32_t max_blen, char** flattened_buf_ptr);


typedef struct Plink2CmdlineMetaStruct {
  NONCOPYABLE(Plink2CmdlineMetaStruct);
  // need to be able to assign this to argv, so don't make it const char**
  char** subst_argv;

  char* script_buf;
  char* rerun_buf;
  char* flag_buf;
  uint32_t* flag_map;
} Plink2CmdlineMeta;

void PreinitPlink2CmdlineMeta(Plink2CmdlineMeta* pcmp);

// Handles --script, --rerun, --help, --version, and --silent.
// subst_argv, script_buf, and rerun_buf must be initialized to nullptr.
PglErr CmdlineParsePhase1(const char* ver_str, const char* ver_str2, const char* prog_name_str, const char* notestr_null_calc2, const char* cmdline_format_str, const char* errstr_append, uint32_t max_flag_blen, PglErr(* disp_help_fn)(const char* const*, uint32_t), int* argc_ptr, char*** argv_ptr, Plink2CmdlineMeta* pcmp, uint32_t* first_arg_idx_ptr, uint32_t* flag_ct_ptr);

// Assumes CmdlineParsePhase1() has completed, flag names have been copied to
// flag_buf/flag_map, aliases handled, and PROG_NAME_STR has been copied to
// outname (no null-terminator needed).  outname_end must be initialized to
// nullptr.
// This sorts the flag names so they're processed in a predictable order,
// handles --d and --out if present, initializes the log, and determines the
// number of processors the OS wants us to think the machine has.
PglErr CmdlineParsePhase2(const char* ver_str, const char* errstr_append, const char* const* argvk, uint32_t prog_name_str_slen, uint32_t max_flag_blen, int32_t argc, uint32_t flag_ct, Plink2CmdlineMeta* pcmp, char* outname, char** outname_end_ptr, char* range_delim_ptr, int32_t* known_procs_ptr, uint32_t* max_thread_ct_ptr);

HEADER_INLINE void InvalidArg(const char* cur_arg) {
  logpreprintfww("Error: Unrecognized flag ('%s').\n", cur_arg);
}

PglErr CmdlineParsePhase3(uintptr_t max_default_mib, uintptr_t malloc_size_mib, uint32_t memory_require, Plink2CmdlineMeta* pcmp, unsigned char** bigstack_ua_ptr);

void CleanupPlink2CmdlineMeta(Plink2CmdlineMeta* pcmp);

// order is such that (kCmpOperatorEq - x) is the inverse of x
ENUM_U31_DEF_START()
  kCmpOperatorNoteq,
  kCmpOperatorLe,
  kCmpOperatorLeq,

  kCmpOperatorGe,
  kCmpOperatorGeq,
  kCmpOperatorEq
ENUM_U31_DEF_END(CmpBinaryOp);
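// Spot check of the inverse property: kCmpOperatorEq - kCmpOperatorNoteq ==
// 5 - 0 == kCmpOperatorEq, kCmpOperatorEq - kCmpOperatorLe == 5 - 1 ==
// kCmpOperatorGeq, and kCmpOperatorEq - kCmpOperatorLeq == 5 - 2 ==
// kCmpOperatorGe; i.e. each operator's logical negation sits at the mirrored
// position.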

typedef struct CmpExprStruct {
  // arguably too small for noncopyable to be a great idea, but the next
  // iteration of this struct probably won't have that issue.
  NONCOPYABLE(CmpExprStruct);
  // Restrict to <key> <operator> <value> for now; key=INFO/pheno/covar.
  //
  // Almost certainly want to support conjunctions/disjunctions later; but
  // there are complications regarding the command-line interface:
  // * Phenotype/covariate names and VCFv4.2 INFO keys can contain parentheses
  //   and mathematical-operator characters.  And we actually need to permit at
  //   least the latter, since autogenerated categorical-covariate names
  //   contain '=' for a good reason.  Some sort of escaping is probably in
  //   order, but...
  // * Escaping is not a big deal in a shell script, but most
  //   conjunctions/disjunctions can already be emulated easily enough in a
  //   shell script by calling plink2 multiple times.  While there's some
  //   execution-time improvement, the main value-add from directly supporting
  //   logical operations in --keep-if/--extract-if-info is reduced
  //   interactive-use cognitive load.
  // * We can't treat multiple command-line arguments differently from the
  //   corresponding space-containing single command-line argument; otherwise
  //   --rerun breaks.  So "multiple-argument-form supports keys with special
  //   characters but not conjunctions/disjunctions; single-argument-form is
  //   the other way around" is not an option.
  // The least-bad solution I've thought of is to add --keep-if-file/etc. flags
  // which read the expression from a short text file.  In that case, we'd at
  // least be able to define normal quoting and escaping rules without worrying
  // about confusing interactions with bash.  Deferring implementation for now
  // in hopes of coming up with a better idea, but this should go in before
  // beta testing begins.

  // Currently stores null-terminated key, followed by null-terminated value
  // string.  Storage format needs to be synced with ValidateAndAllocCmpExpr().
  char* pheno_name;
  CmpBinaryOp binary_op;
} CmpExpr;

void InitCmpExpr(CmpExpr* cmp_expr_ptr);

void CleanupCmpExpr(CmpExpr* cmp_expr_ptr);

PglErr ValidateAndAllocCmpExpr(const char* const* sources, const char* flag_name, uint32_t param_ct, CmpExpr* cmp_expr_ptr);

PglErr SearchHeaderLine(const char* header_line_iter, const char* const* search_multistrs, const char* flag_nodash, uint32_t search_col_ct, uint32_t* found_col_ct_ptr, uint32_t* found_type_bitset_ptr, uint32_t* col_skips, uint32_t* col_types);

// col_descriptor is usually a pointer to argv[...][5] (first five characters
// are "cols=").  supported_ids is a multistr.
PglErr ParseColDescriptor(const char* col_descriptor_iter, const char* supported_ids, const char* cur_flag_name, uint32_t first_col_shifted, uint32_t default_cols_mask, uint32_t prohibit_empty, void* result_ptr);


// this is technically application-dependent, but let's keep this simple for
// now
// todo: recalibrate these numbers before each beta release
#ifndef __LP64__
  // 2047 seems to consistently fail on both OS X and Windows
#  ifdef _WIN32
CONSTI32(kMalloc32bitMibMax, 1728);
#  else
#    ifdef __APPLE__
CONSTI32(kMalloc32bitMibMax, 1888);
#    else
CONSTI32(kMalloc32bitMibMax, 2047);
#    endif
#  endif
#endif



#ifdef __cplusplus
}  // namespace plink2
#endif

#endif  // __PLINK2_CMDLINE_H__