1 #ifndef __PLINK2_CMDLINE_H__
2 #define __PLINK2_CMDLINE_H__
3
4 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
5 // Christopher Chang.
6 //
7 // This program is free software: you can redistribute it and/or modify it
8 // under the terms of the GNU Lesser General Public License as published by the
9 // Free Software Foundation, either version 3 of the License, or (at your
10 // option) any later version.
11 //
12 // This library is distributed in the hope that it will be useful, but WITHOUT
13 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
15 // for more details.
16 //
17 // You should have received a copy of the GNU Lesser General Public License
18 // along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20
21 // Cross-platform logging, command-line parsing, workspace
22 // initialization/allocation, basic multithreading code, and a few numeric
23 // constants.
24
25 #include "include/plink2_bits.h"
26 #include "include/plink2_string.h"
27 #include "include/plink2_thread.h"
28
29 #include <errno.h>
30 #include <stdarg.h>
31
32 #ifndef _WIN32
33 # include <sys/stat.h>
34 #endif
35
36 #ifdef DYNAMIC_MKL
37 # define USE_MKL
38 #endif
39
40 #ifdef USE_MKL
41 # ifdef __APPLE__
42 # error "plink2 cannot currently use MKL on OS X."
43 # endif
44 # ifdef LAPACK_ILP64
45 # define MKL_ILP64
46 # endif
47 # ifdef DYNAMIC_MKL
48 # include <mkl_service.h>
49 # else
50 // If this isn't initially found, use the compiler's -I option to specify the
51 // appropriate include-file directory. A common location is
52 // /opt/intel/mkl/include .
53 // If this isn't installed at all on your system but you want/need to change
54 // that, see the instructions at
55 // https://software.intel.com/en-us/articles/installing-intel-free-libs-and-python-apt-repo
56 # include "mkl_service.h"
57 # endif
58 # define USE_MTBLAS
59 // this technically doesn't have to be a macro, but it's surrounded by other
60 // things which do have to be macros, so changing this to a namespaced function
61 // arguably *decreases* overall readability...
62 # define BLAS_SET_NUM_THREADS mkl_set_num_threads
63 #else
64 # ifdef USE_OPENBLAS
65 # ifdef __cplusplus
66 extern "C" {
67 # endif
68 void openblas_set_num_threads(int num_threads);
69 # ifdef __cplusplus
70 } // extern "C"
71 # endif
72 # define USE_MTBLAS
73 # define BLAS_SET_NUM_THREADS openblas_set_num_threads
74 # else
75 # define BLAS_SET_NUM_THREADS(num)
76 # endif
77 #endif
78
79 #ifdef _WIN32
80 # define NULL_STREAM_NAME "nul"
81 #else
82 # define NULL_STREAM_NAME "/dev/null"
83 #endif
84
85 #ifdef __cplusplus
86 namespace plink2 {
87 #endif
88
89 CONSTI32(kLogbufSize, 2 * kMaxMediumLine);
90
91 // must be at least 2 * kMaxMediumLine + 2 to support generic token loader.
92 CONSTI32(kTextbufSize, 2 * kMaxMediumLine + 256);
93
94 // when g_textbuf is used as a generic I/O buffer, this is a convenient
95 // power-of-2 size (must be <= kTextbufSize).
96 CONSTI32(kTextbufMainSize, 2 * kMaxMediumLine);
97
98 // "slen" is now used to indicate string lengths excluding terminating nulls,
99 // while "blen" includes the terminator.
100
101 // Maximum length of chromosome, variant, FID, IID, cluster, and set IDs (not
102 // including terminating null). This value supports up to 8 IDs per line
103 // (maximum so far is 5, for e.g. --hom).
104 // Assumed by plink2_pvar to be a multiple of 16.
105 CONSTI32(kMaxIdSlen, 16000);
106 CONSTI32(kMaxIdBlen, kMaxIdSlen + 1);
107 // Don't see a better option than #define for this.
108 #define MAX_ID_SLEN_STR "16000"
109
110 // allow extensions like .model.trend.fisher.set.score.adjusted
111 CONSTI32(kMaxOutfnameExtBlen, 39);
112
113 #ifdef __LP64__
// 64-bit build: uintptr_t is 64 bits wide, so the generic word-size
// RoundUpPow2 handles uint64_t values directly.  alignment must be a power
// of 2.
HEADER_CINLINE uint64_t RoundUpPow2U64(uint64_t val, uint64_t alignment) {
  return RoundUpPow2(val, alignment);
}
117 #else
// 32-bit build: round up in explicit 64-bit arithmetic, since the generic
// RoundUpPow2 operates on (32-bit) words here.  alignment must be a power
// of 2.
HEADER_CINLINE uint64_t RoundUpPow2U64(uint64_t val, uint64_t alignment) {
  return (val + alignment - 1) & (~(alignment - 1));
}
121 #endif
122
123 extern const char kErrprintfFopen[];
124 extern const char kErrprintfFread[];
125 extern const char kErrprintfRewind[];
126 // extern const char g_cmdline_format_str[];
127
128 // All global variables not initialized at compile time start with g_ (even if
129 // they're initialized very early and never changed afterwards, like
130 // g_one_char_strs).
131 extern char g_textbuf[];
132
133 extern const char* g_one_char_strs;
134
135 // '.' missing genotype value is now taken for granted; this is in *addition*
136 // to it (default '0').
137 // (Yes, this might not belong in plink2_cmdline.)
138 extern const char* g_input_missing_geno_ptr;
139
140 extern const char* g_output_missing_geno_ptr; // now defaults to '.'
141
142 extern FILE* g_logfile;
143
144 // Mostly-safe log buffer (length kLogbufSize, currently 256k). Good practice
145 // to use snprintf when writing an entire line to it in a single statement.
146 // Warning: Do NOT put allele codes or arbitrary-length lists in here.
147 extern char g_logbuf[];
148
149 extern uint32_t g_debug_on;
150 extern uint32_t g_log_failed;
151
152 // for --warning-errcode
153 extern uint32_t g_stderr_written_to;
154
155
156 // Warning: Do NOT include allele codes (unless they're guaranteed to be SNPs)
157 // in log strings; they can overflow the buffer.
158 void logputs_silent(const char* str);
159
160 void logputs(const char* str);
161
162 void logerrputs(const char* str);
163
164 void logputsb();
165
166 void logerrputsb();
167
logprintf(const char * fmt,...)168 HEADER_INLINE void logprintf(const char* fmt, ...) {
169 va_list args;
170 va_start(args, fmt);
171 vsnprintf(g_logbuf, kLogbufSize, fmt, args);
172 logputsb();
173 }
174
logerrprintf(const char * fmt,...)175 HEADER_INLINE void logerrprintf(const char* fmt, ...) {
176 va_list args;
177 va_start(args, fmt);
178 vsnprintf(g_logbuf, kLogbufSize, fmt, args);
179 logerrputsb();
180 }
181
182 // input for WordWrapB/logprintfww should have no intermediate '\n's. If
183 // suffix_len is 0, there should be a terminating \n.
// Word-wraps g_logbuf in place, reserving suffix_len columns at the end of
// the last line (e.g. for a trailing "done." message).
HEADER_INLINE void WordWrapB(uint32_t suffix_len) {
  WordWrap(suffix_len, g_logbuf);
}
187
logpreprintfww(const char * fmt,...)188 HEADER_INLINE void logpreprintfww(const char* fmt, ...) {
189 va_list args;
190 va_start(args, fmt);
191 vsnprintf(g_logbuf, kLogbufSize, fmt, args);
192 WordWrapB(0);
193 }
194
logprintfww(const char * fmt,...)195 HEADER_INLINE void logprintfww(const char* fmt, ...) {
196 va_list args;
197 va_start(args, fmt);
198 vsnprintf(g_logbuf, kLogbufSize, fmt, args);
199 WordWrapB(0);
200 logputsb();
201 }
202
logerrprintfww(const char * fmt,...)203 HEADER_INLINE void logerrprintfww(const char* fmt, ...) {
204 va_list args;
205 va_start(args, fmt);
206 vsnprintf(g_logbuf, kLogbufSize, fmt, args);
207 WordWrapB(0);
208 logerrputsb();
209 }
210
211 // 5 = length of "done." suffix, which is commonly used
logprintfww5(const char * fmt,...)212 HEADER_INLINE void logprintfww5(const char* fmt, ...) {
213 va_list args;
214 va_start(args, fmt);
215 vsnprintf(g_logbuf, kLogbufSize, fmt, args);
216 WordWrapB(5);
217 logputsb();
218 }
219
DebugPrintf(const char * fmt,...)220 HEADER_INLINE void DebugPrintf(const char* fmt, ...) {
221 if (g_debug_on) {
222 va_list args;
223 va_start(args, fmt);
224 vsnprintf(g_logbuf, kLogbufSize, fmt, args);
225 logputsb();
226 }
227 }
228
DPrintf(const char * fmt,...)229 HEADER_INLINE void DPrintf(const char* fmt, ...) {
230 if (g_debug_on) {
231 va_list args;
232 va_start(args, fmt);
233 vsnprintf(g_logbuf, kLogbufSize, fmt, args);
234 logputsb();
235 }
236 }
237
238 // Returns kPglRetOpenFail if file doesn't exist, or kPglRetRewindFail if file
239 // is process-substitution/named-pipe. Does not print an error message.
240 PglErr ForceNonFifo(const char* fname);
241
242 BoolErr fopen_checked(const char* fname, const char* mode, FILE** target_ptr);
243
// Writes one character; returns nonzero iff the stream's error flag is set
// afterwards.  Note that this reports any earlier unreported error on the
// stream too, so deferred once-per-batch checks are safe.
HEADER_INLINE IntErr putc_checked(int32_t ii, FILE* outfile) {
  putc_unlocked(ii, outfile);
  return ferror_unlocked(outfile);
}
248
// Writes a string; returns nonzero iff the stream's error flag is set
// afterwards (same deferred-error semantics as putc_checked).
HEADER_INLINE IntErr fputs_checked(const char* str, FILE* outfile) {
  fputs(str, outfile);
  return ferror_unlocked(outfile);
}
253
254 BoolErr fwrite_flush2(char* buf_flush, FILE* outfile, char** write_iter_ptr);
255
// unsigned char variant of fwrite_flush2(); just reinterprets the pointers.
HEADER_INLINE BoolErr fwrite_uflush2(unsigned char* buf_flush, FILE* outfile, unsigned char** write_iter_ptr) {
  return fwrite_flush2(R_CAST(char*, buf_flush), outfile, R_CAST(char**, write_iter_ptr));
}
259
fwrite_ck(char * buf_flush,FILE * outfile,char ** write_iter_ptr)260 HEADER_INLINE BoolErr fwrite_ck(char* buf_flush, FILE* outfile, char** write_iter_ptr) {
261 if ((*write_iter_ptr) < buf_flush) {
262 return 0;
263 }
264 return fwrite_flush2(buf_flush, outfile, write_iter_ptr);
265 }
266
267 // fclose_null defined in plink2_base.h
268
269 BoolErr fclose_flush_null(char* buf_flush, char* write_iter, FILE** outfile_ptr);
270
// unsigned char variant of fclose_flush_null(); just reinterprets the
// pointers.
HEADER_INLINE BoolErr fclose_uflush_null(unsigned char* buf_flush, unsigned char* write_iter, FILE** outfile_ptr) {
  return fclose_flush_null(R_CAST(char*, buf_flush), R_CAST(char*, write_iter), outfile_ptr);
}
274
275 // This should only be used when the file can only be open on error-early-exit;
276 // otherwise we care about fclose's error code.
// Closes fptr iff it's non-null, discarding fclose's error code.  (The null
// check is required: unlike free(NULL), fclose(NULL) is undefined behavior.)
HEADER_INLINE void fclose_cond(FILE* fptr) {
  if (fptr) {
    fclose(fptr);
  }
}
282
ClipU32(uint32_t val,uint32_t lbound,uint32_t ubound)283 HEADER_INLINE uint32_t ClipU32(uint32_t val, uint32_t lbound, uint32_t ubound) {
284 if (val >= ubound) {
285 return ubound;
286 }
287 return MAXV(val, lbound);
288 }
289
290 int32_t double_cmp(const void* aa, const void* bb);
291
292 int32_t double_cmp_decr(const void* aa, const void* bb);
293
294 int32_t u64cmp(const void* aa, const void* bb);
295
296 #ifndef __cplusplus
297 int32_t u64cmp_decr(const void* aa, const void* bb);
298 #endif
299
// Returns the maximum element of unsorted_arr.  Requires len >= 1 (the first
// element is unconditionally read).
HEADER_INLINE uint32_t U32ArrMax(const uint32_t* unsorted_arr, uintptr_t len) {
  const uint32_t* unsorted_arr_end = &(unsorted_arr[len]);
#ifndef __cplusplus
  // Plain C path: manual scan.
  const uint32_t* unsorted_arr_iter = unsorted_arr;
  uint32_t uimax = *unsorted_arr_iter++;
  while (unsorted_arr_iter < unsorted_arr_end) {
    const uint32_t cur_val = *unsorted_arr_iter++;
    if (cur_val > uimax) {
      uimax = cur_val;
    }
  }
  return uimax;
#else
  return *std::max_element(unsorted_arr, unsorted_arr_end);
#endif
}
316
// Returns the maximum element of unsorted_arr.  Requires len >= 1.
HEADER_INLINE float FArrMax(const float* unsorted_arr, uintptr_t len) {
  const float* unsorted_arr_end = &(unsorted_arr[len]);
#if defined(__APPLE__) || !defined(__cplusplus)
  // std::max_element doesn't seem to be performant for floats/doubles on OS X
  const float* unsorted_arr_iter = unsorted_arr;
  float fmax = *unsorted_arr_iter++;
  while (unsorted_arr_iter < unsorted_arr_end) {
    const float cur_val = *unsorted_arr_iter++;
    if (cur_val > fmax) {
      fmax = cur_val;
    }
  }
  return fmax;
#else
  return *std::max_element(unsorted_arr, unsorted_arr_end);
#endif
}
334
// Returns the maximum element of unsorted_arr.  Requires len >= 1.
// (Manual scan on OS X / plain C for the same reason as FArrMax.)
HEADER_INLINE double DArrMax(const double* unsorted_arr, uintptr_t len) {
  const double* unsorted_arr_end = &(unsorted_arr[len]);
#if defined(__APPLE__) || !defined(__cplusplus)
  const double* unsorted_arr_iter = unsorted_arr;
  double dmax = *unsorted_arr_iter++;
  while (unsorted_arr_iter < unsorted_arr_end) {
    const double cur_val = *unsorted_arr_iter++;
    if (cur_val > dmax) {
      dmax = cur_val;
    }
  }
  return dmax;
#else
  return *std::max_element(unsorted_arr, unsorted_arr_end);
#endif
}
351
352 float DestructiveMedianF(uintptr_t len, float* unsorted_arr);
353
354 double DestructiveMedianD(uintptr_t len, double* unsorted_arr);
355
356
357 // This makes a temporary g_bigstack allocation.
358 // Must be safe to read up to (kBytesPerWord - 1) bytes past end of strbox.
359 // Results may technically vary between runs when duplicate elements are
360 // present; it's assumed that this doesn't matter because all duplicates will
361 // be handled in the same manner.
362 BoolErr SortStrboxIndexed(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map);
363
364
365 // This makes a temporary g_bigstack allocation.
366 // BoolErr strptr_arr_indexed_sort(const char* const* unsorted_strptrs, uint32_t str_ct, uint32_t use_nsort, uint32_t* id_map);
367
368 // Offset of std::lower_bound.
369 // Requires 0 < arr_length < 2^31.
370 uint32_t CountSortedSmallerU32(const uint32_t* sorted_u32_arr, uint32_t arr_length, uint32_t needle);
371
372 // Same as above, but optimized for case where result is probably close to the
373 // front, and tolerates end_idx (== arr_length) == 0.
374 uint32_t ExpsearchU32(const uint32_t* sorted_u32_arr, uint32_t end_idx, uint32_t needle);
375
376 uintptr_t CountSortedSmallerU64(const uint64_t* sorted_u64_arr, uintptr_t arr_length, uint64_t needle);
377
378 uintptr_t CountSortedSmallerD(const double* sorted_dbl_arr, uintptr_t arr_length, double needle);
379
380
381 uintptr_t CountSortedLeqU64(const uint64_t* sorted_u64_arr, uintptr_t arr_length, uint64_t needle);
382
383
IsCmdlineFlag(const char * param)384 HEADER_INLINE uint32_t IsCmdlineFlag(const char* param) {
385 unsigned char ucc = param[1];
386 return ((*param == '-') && ((ucc > '9') || ((ucc < '0') && (ucc != '.') && (ucc != '\0'))));
387 }
388
// If param is a flag (same test as IsCmdlineFlag), returns a pointer to the
// start of the flag name, skipping one or two leading dashes ("--foo" and
// "-foo" both map to "foo"); otherwise returns nullptr.
HEADER_INLINE CXXCONST_CP IsCmdlineFlagStart(const char* param) {
  unsigned char ucc = param[1];
  if ((*param == '-') && ((ucc > '9') || ((ucc < '0') && (ucc != '.') && (ucc != '\0')))) {
    // (ucc == '-') is 1 for a "--" prefix, 0 for a single dash.
    return S_CAST(CXXCONST_CP, &(param[1 + (ucc == '-')]));
  }
  return nullptr;
}
396
397 #ifdef __cplusplus
// Non-const C++ overload; delegates to the const version and restores the
// caller's mutability.
HEADER_INLINE char* IsCmdlineFlagStart(char* param) {
  return const_cast<char*>(IsCmdlineFlagStart(const_cast<const char*>(param)));
}
401 #endif
402
403 uint32_t GetParamCt(const char* const* argvk, uint32_t argc, uint32_t flag_idx);
404
405 BoolErr EnforceParamCtRange(const char* flag_name, uint32_t param_ct, uint32_t min_ct, uint32_t max_ct);
406
407 // PglErr SortCmdlineFlags(uint32_t max_flag_blen, uint32_t flag_ct, char* flag_buf, uint32_t* flag_map);
408
409 BoolErr CleanupLogfile(uint32_t print_end_time);
410
411 CONSTI32(kNonBigstackMin, 67108864);
412
413 CONSTI32(kBigstackMinMib, 640);
414 CONSTI32(kBigstackDefaultMib, 2048);
415
416 static const double kSmallishEpsilon = 0.00000000002910383045673370361328125;
417 static const double kRecip2m32 = 0.00000000023283064365386962890625;
418 static const double k2m64 = 18446744073709551616.0;
419
420 static const double kLnPvalError = 9.0;
421
422 static const double kDblNormalMin = 2.2250738585072013e-308;
423
424 // probably time to flip arena_alloc and bigstack_alloc definitions...
425
426 // manually managed, very large double-ended stack
427 extern unsigned char* g_bigstack_base;
428 extern unsigned char* g_bigstack_end;
429
430 uintptr_t DetectMib();
431
432 uintptr_t GetDefaultAllocMib();
433
434 // caller is responsible for freeing bigstack_ua
435 PglErr InitBigstack(uintptr_t malloc_size_mib, uintptr_t* malloc_mib_final_ptr, unsigned char** bigstack_ua_ptr);
436
437
// Number of unallocated bytes remaining between the two ends of the
// double-ended stack.
HEADER_INLINE uintptr_t bigstack_left() {
  return g_bigstack_end - g_bigstack_base;
}
441
// Unchecked bottom-of-stack allocation: just bumps g_bigstack_base.
HEADER_INLINE void* bigstack_alloc_raw(uintptr_t size) {
  // Assumes caller has already forced size to a multiple of
  // kCacheline, and verified that enough space is available.
  assert(!(size % kCacheline));
  unsigned char* alloc_ptr = g_bigstack_base;
  g_bigstack_base += size;
  return alloc_ptr;
}
450
bigstack_alloc_raw_rd(uintptr_t size)451 HEADER_INLINE void* bigstack_alloc_raw_rd(uintptr_t size) {
452 // Same as bigstack_alloc_raw(), except for rounding up size.
453 unsigned char* alloc_ptr = g_bigstack_base;
454 g_bigstack_base += RoundUpPow2(size, kCacheline);
455 return alloc_ptr;
456 }
457
458 // Basic 64-byte-aligned allocation at bottom of stack.
459 // Note that --make-pgen switches gracefully to a less memory-hungry algorithm
460 // when it encounters an allocation failure with its default algorithm. Since
461 // it can only happen once, unlikely() is still justified, but keep an eye on
462 // this.
bigstack_alloc(uintptr_t size)463 HEADER_INLINE void* bigstack_alloc(uintptr_t size) {
464 size = RoundUpPow2(size, kCacheline);
465 if (unlikely(bigstack_left() < size)) {
466 g_failed_alloc_attempt_size = size;
467 return nullptr;
468 }
469 return bigstack_alloc_raw(size);
470 }
471
472
473 // Typesafe, return-0-iff-success interfaces. (See also bigstack_calloc_...
474 // further below.)
475 // This interface deliberately not provided for bigstack_alloc_raw() and
476 // bigstack_alloc_raw_rd() (and I wouldn't blame a future maintainer for
477 // entirely eliminating those functions).
bigstack_alloc_c(uintptr_t ct,char ** c_arr_ptr)478 HEADER_INLINE BoolErr bigstack_alloc_c(uintptr_t ct, char** c_arr_ptr) {
479 *c_arr_ptr = S_CAST(char*, bigstack_alloc(ct));
480 return !(*c_arr_ptr);
481 }
482
bigstack_alloc_d(uintptr_t ct,double ** d_arr_ptr)483 HEADER_INLINE BoolErr bigstack_alloc_d(uintptr_t ct, double** d_arr_ptr) {
484 *d_arr_ptr = S_CAST(double*, bigstack_alloc(ct * sizeof(double)));
485 return !(*d_arr_ptr);
486 }
487
bigstack_alloc_f(uintptr_t ct,float ** f_arr_ptr)488 HEADER_INLINE BoolErr bigstack_alloc_f(uintptr_t ct, float** f_arr_ptr) {
489 *f_arr_ptr = S_CAST(float*, bigstack_alloc(ct * sizeof(float)));
490 return !(*f_arr_ptr);
491 }
492
bigstack_alloc_i16(uintptr_t ct,int16_t ** i16_arr_ptr)493 HEADER_INLINE BoolErr bigstack_alloc_i16(uintptr_t ct, int16_t** i16_arr_ptr) {
494 *i16_arr_ptr = S_CAST(int16_t*, bigstack_alloc(ct * sizeof(int16_t)));
495 return !(*i16_arr_ptr);
496 }
497
bigstack_alloc_i32(uintptr_t ct,int32_t ** i32_arr_ptr)498 HEADER_INLINE BoolErr bigstack_alloc_i32(uintptr_t ct, int32_t** i32_arr_ptr) {
499 *i32_arr_ptr = S_CAST(int32_t*, bigstack_alloc(ct * sizeof(int32_t)));
500 return !(*i32_arr_ptr);
501 }
502
bigstack_alloc_uc(uintptr_t ct,unsigned char ** uc_arr_ptr)503 HEADER_INLINE BoolErr bigstack_alloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
504 *uc_arr_ptr = S_CAST(unsigned char*, bigstack_alloc(ct));
505 return !(*uc_arr_ptr);
506 }
507
bigstack_alloc_u16(uintptr_t ct,uint16_t ** u16_arr_ptr)508 HEADER_INLINE BoolErr bigstack_alloc_u16(uintptr_t ct, uint16_t** u16_arr_ptr) {
509 *u16_arr_ptr = S_CAST(uint16_t*, bigstack_alloc(ct * sizeof(int16_t)));
510 return !(*u16_arr_ptr);
511 }
512
bigstack_alloc_u32(uintptr_t ct,uint32_t ** u32_arr_ptr)513 HEADER_INLINE BoolErr bigstack_alloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr) {
514 *u32_arr_ptr = S_CAST(uint32_t*, bigstack_alloc(ct * sizeof(int32_t)));
515 return !(*u32_arr_ptr);
516 }
517
bigstack_alloc_w(uintptr_t ct,uintptr_t ** w_arr_ptr)518 HEADER_INLINE BoolErr bigstack_alloc_w(uintptr_t ct, uintptr_t** w_arr_ptr) {
519 *w_arr_ptr = S_CAST(uintptr_t*, bigstack_alloc(ct * sizeof(intptr_t)));
520 return !(*w_arr_ptr);
521 }
522
bigstack_alloc_i64(uintptr_t ct,int64_t ** i64_arr_ptr)523 HEADER_INLINE BoolErr bigstack_alloc_i64(uintptr_t ct, int64_t** i64_arr_ptr) {
524 *i64_arr_ptr = S_CAST(int64_t*, bigstack_alloc(ct * sizeof(int64_t)));
525 return !(*i64_arr_ptr);
526 }
527
bigstack_alloc_u64(uintptr_t ct,uint64_t ** u64_arr_ptr)528 HEADER_INLINE BoolErr bigstack_alloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr) {
529 *u64_arr_ptr = S_CAST(uint64_t*, bigstack_alloc(ct * sizeof(int64_t)));
530 return !(*u64_arr_ptr);
531 }
532
bigstack_alloc_v(uintptr_t ct,VecW ** v_arr_ptr)533 HEADER_INLINE BoolErr bigstack_alloc_v(uintptr_t ct, VecW** v_arr_ptr) {
534 *v_arr_ptr = S_CAST(VecW*, bigstack_alloc(ct * sizeof(VecW)));
535 return !(*v_arr_ptr);
536 }
537
538 // some versions of gcc give aliasing warnings if we use bigstack_alloc_w()
539 // for everything
540 // if sizeof(intptr_t) != sizeof(uintptr_t*), we're doomed anyway, so I won't
541 // bother with that static assert...
bigstack_alloc_wp(uintptr_t ct,uintptr_t *** ulp_arr_ptr)542 HEADER_INLINE BoolErr bigstack_alloc_wp(uintptr_t ct, uintptr_t*** ulp_arr_ptr) {
543 *ulp_arr_ptr = S_CAST(uintptr_t**, bigstack_alloc(ct * sizeof(intptr_t)));
544 return !(*ulp_arr_ptr);
545 }
546
bigstack_alloc_cp(uintptr_t ct,char *** cp_arr_ptr)547 HEADER_INLINE BoolErr bigstack_alloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
548 *cp_arr_ptr = S_CAST(char**, bigstack_alloc(ct * sizeof(intptr_t)));
549 return !(*cp_arr_ptr);
550 }
551
bigstack_alloc_kcp(uintptr_t ct,const char *** kcp_arr_ptr)552 HEADER_INLINE BoolErr bigstack_alloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr) {
553 *kcp_arr_ptr = S_CAST(const char**, bigstack_alloc(ct * sizeof(intptr_t)));
554 return !(*kcp_arr_ptr);
555 }
556
bigstack_alloc_i16p(uintptr_t ct,int16_t *** i16p_arr_ptr)557 HEADER_INLINE BoolErr bigstack_alloc_i16p(uintptr_t ct, int16_t*** i16p_arr_ptr) {
558 *i16p_arr_ptr = S_CAST(int16_t**, bigstack_alloc(ct * sizeof(intptr_t)));
559 return !(*i16p_arr_ptr);
560 }
561
bigstack_alloc_i32p(uintptr_t ct,int32_t *** i32p_arr_ptr)562 HEADER_INLINE BoolErr bigstack_alloc_i32p(uintptr_t ct, int32_t*** i32p_arr_ptr) {
563 *i32p_arr_ptr = S_CAST(int32_t**, bigstack_alloc(ct * sizeof(intptr_t)));
564 return !(*i32p_arr_ptr);
565 }
566
bigstack_alloc_ucp(uintptr_t ct,unsigned char *** ucp_arr_ptr)567 HEADER_INLINE BoolErr bigstack_alloc_ucp(uintptr_t ct, unsigned char*** ucp_arr_ptr) {
568 *ucp_arr_ptr = S_CAST(unsigned char**, bigstack_alloc(ct * sizeof(intptr_t)));
569 return !(*ucp_arr_ptr);
570 }
571
bigstack_alloc_u16p(uintptr_t ct,uint16_t *** u16p_arr_ptr)572 HEADER_INLINE BoolErr bigstack_alloc_u16p(uintptr_t ct, uint16_t*** u16p_arr_ptr) {
573 *u16p_arr_ptr = S_CAST(uint16_t**, bigstack_alloc(ct * sizeof(intptr_t)));
574 return !(*u16p_arr_ptr);
575 }
576
bigstack_alloc_u32p(uintptr_t ct,uint32_t *** u32p_arr_ptr)577 HEADER_INLINE BoolErr bigstack_alloc_u32p(uintptr_t ct, uint32_t*** u32p_arr_ptr) {
578 *u32p_arr_ptr = S_CAST(uint32_t**, bigstack_alloc(ct * sizeof(intptr_t)));
579 return !(*u32p_arr_ptr);
580 }
581
bigstack_alloc_u64p(uintptr_t ct,uint64_t *** u64p_arr_ptr)582 HEADER_INLINE BoolErr bigstack_alloc_u64p(uintptr_t ct, uint64_t*** u64p_arr_ptr) {
583 *u64p_arr_ptr = S_CAST(uint64_t**, bigstack_alloc(ct * sizeof(intptr_t)));
584 return !(*u64p_arr_ptr);
585 }
586
bigstack_alloc_dp(uintptr_t ct,double *** dp_arr_ptr)587 HEADER_INLINE BoolErr bigstack_alloc_dp(uintptr_t ct, double*** dp_arr_ptr) {
588 *dp_arr_ptr = S_CAST(double**, bigstack_alloc(ct * sizeof(intptr_t)));
589 return !(*dp_arr_ptr);
590 }
591
bigstack_alloc_vp(uintptr_t ct,VecW *** vp_arr_ptr)592 HEADER_INLINE BoolErr bigstack_alloc_vp(uintptr_t ct, VecW*** vp_arr_ptr) {
593 *vp_arr_ptr = S_CAST(VecW**, bigstack_alloc(ct * sizeof(intptr_t)));
594 return !(*vp_arr_ptr);
595 }
596
bigstack_alloc_wpp(uintptr_t ct,uintptr_t **** wpp_arr_ptr)597 HEADER_INLINE BoolErr bigstack_alloc_wpp(uintptr_t ct, uintptr_t**** wpp_arr_ptr) {
598 *wpp_arr_ptr = S_CAST(uintptr_t***, bigstack_alloc(ct * sizeof(intptr_t)));
599 return !(*wpp_arr_ptr);
600 }
601
bigstack_alloc_u32pp(uintptr_t ct,uint32_t **** u32pp_arr_ptr)602 HEADER_INLINE BoolErr bigstack_alloc_u32pp(uintptr_t ct, uint32_t**** u32pp_arr_ptr) {
603 *u32pp_arr_ptr = S_CAST(uint32_t***, bigstack_alloc(ct * sizeof(intptr_t)));
604 return !(*u32pp_arr_ptr);
605 }
606
bigstack_alloc_vpp(uintptr_t ct,VecW **** vpp_arr_ptr)607 HEADER_INLINE BoolErr bigstack_alloc_vpp(uintptr_t ct, VecW**** vpp_arr_ptr) {
608 *vpp_arr_ptr = S_CAST(VecW***, bigstack_alloc(ct * sizeof(VecW)));
609 return !(*vpp_arr_ptr);
610 }
611
612 BoolErr bigstack_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr);
613
614 BoolErr bigstack_calloc_d(uintptr_t ct, double** d_arr_ptr);
615
616 BoolErr bigstack_calloc_f(uintptr_t ct, float** f_arr_ptr);
617
618 BoolErr bigstack_calloc_u16(uintptr_t ct, uint16_t** u16_arr_ptr);
619
620 BoolErr bigstack_calloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr);
621
622 BoolErr bigstack_calloc_w(uintptr_t ct, uintptr_t** w_arr_ptr);
623
624 BoolErr bigstack_calloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr);
625
626 BoolErr bigstack_calloc_cp(uintptr_t ct, char*** cp_arr_ptr);
627
628 BoolErr bigstack_calloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr);
629
bigstack_calloc_c(uintptr_t ct,char ** c_arr_ptr)630 HEADER_INLINE BoolErr bigstack_calloc_c(uintptr_t ct, char** c_arr_ptr) {
631 return bigstack_calloc_uc(ct, R_CAST(unsigned char**, c_arr_ptr));
632 }
633
bigstack_calloc_i16(uintptr_t ct,int16_t ** i16_arr_ptr)634 HEADER_INLINE BoolErr bigstack_calloc_i16(uintptr_t ct, int16_t** i16_arr_ptr) {
635 return bigstack_calloc_u16(ct, R_CAST(uint16_t**, i16_arr_ptr));
636 }
637
bigstack_calloc_i32(uintptr_t ct,int32_t ** i32_arr_ptr)638 HEADER_INLINE BoolErr bigstack_calloc_i32(uintptr_t ct, int32_t** i32_arr_ptr) {
639 return bigstack_calloc_u32(ct, R_CAST(uint32_t**, i32_arr_ptr));
640 }
641
bigstack_calloc_i64(uintptr_t ct,int64_t ** i64_arr_ptr)642 HEADER_INLINE BoolErr bigstack_calloc_i64(uintptr_t ct, int64_t** i64_arr_ptr) {
643 return bigstack_calloc_u64(ct, R_CAST(uint64_t**, i64_arr_ptr));
644 }
645
646 #if __cplusplus >= 201103L
647
// Typesafe arena allocation for an arbitrary element type T (C++11+ only);
// returns 1 iff OOM.  Usually invoked through the BIGSTACK_ALLOC_X macro.
template <class T> BoolErr BigstackAllocX(uintptr_t ct, T** x_arr_ptr) {
  *x_arr_ptr = S_CAST(T*, bigstack_alloc(ct * sizeof(T)));
  return !(*x_arr_ptr);
}
652
653 // todo: define all BigstackAlloc functions in terms of ArenaAlloc; then these
654 // can be namespaced, and we only need ARENA_ALLOC_X and ARENA_ALLOC_STD_ARRAY
655 // macros
656 # define BIGSTACK_ALLOC_X(tt, ct, pp) plink2::BigstackAllocX<tt>((ct), (pp))
657
658 # define BIGSTACK_ALLOC_STD_ARRAY(tt, arr_size, len, pp) plink2::BigstackAllocX<std::array<tt, arr_size>>((len), (pp))
659
660 #else
661
662 # define BIGSTACK_ALLOC_X(tt, ct, pp) (!((*(pp)) = S_CAST(tt*, bigstack_alloc((ct) * sizeof(tt)))))
663
664 # define BIGSTACK_ALLOC_STD_ARRAY(tt, arr_size, len, pp) (!((*(pp)) = S_CAST(STD_ARRAY_PTR_TYPE(tt, arr_size), bigstack_alloc((len) * (arr_size * sizeof(tt))))))
665
666 #endif
667
// Frees all bottom-of-stack allocations made since new_base was the base.
// new_base must be cacheline-aligned (typically a value previously returned
// by a bigstack allocation).
HEADER_INLINE void BigstackReset(void* new_base) {
  g_bigstack_base = S_CAST(unsigned char*, new_base);
}
671
// Frees all end-of-stack allocations made since new_end was the end.
HEADER_INLINE void BigstackEndReset(void* new_end) {
  g_bigstack_end = S_CAST(unsigned char*, new_end);
}
675
BigstackDoubleReset(void * new_base,void * new_end)676 HEADER_INLINE void BigstackDoubleReset(void* new_base, void* new_end) {
677 BigstackReset(new_base);
678 BigstackEndReset(new_end);
679 }
680
681 // assumes we've already been writing to wptr and have previously performed
682 // bounds-checking.
// Commits ct words that were already written at the current stack base
// (bumping g_bigstack_base past them).  Assumes the caller has already been
// writing to wptr == g_bigstack_base and previously performed
// bounds-checking.
HEADER_INLINE void BigstackFinalizeW(__maybe_unused const uintptr_t* wptr, uintptr_t ct) {
  assert(wptr == R_CAST(const uintptr_t*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct * sizeof(intptr_t), kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}
688
// uint32_t variant of BigstackFinalizeW (see comment there).
HEADER_INLINE void BigstackFinalizeU32(__maybe_unused const uint32_t* u32ptr, uintptr_t ct) {
  assert(u32ptr == R_CAST(const uint32_t*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct * sizeof(int32_t), kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}
694
// uint64_t variant of BigstackFinalizeW (see comment there).
HEADER_INLINE void BigstackFinalizeU64(__maybe_unused const uint64_t* u64ptr, uintptr_t ct) {
  assert(u64ptr == R_CAST(const uint64_t*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct * sizeof(int64_t), kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}
700
// char variant of BigstackFinalizeW (see comment there).
HEADER_INLINE void BigstackFinalizeC(__maybe_unused const char* cptr, uintptr_t ct) {
  assert(cptr == R_CAST(const char*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct, kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}
706
// char* (pointer array) variant of BigstackFinalizeW (see comment there).
HEADER_INLINE void BigstackFinalizeCp(__maybe_unused const char* const* cpptr, uintptr_t ct) {
  assert(cpptr == R_CAST(const char* const*, g_bigstack_base));
  g_bigstack_base += RoundUpPow2(ct * sizeof(intptr_t), kCacheline);
  assert(g_bigstack_base <= g_bigstack_end);
}
712
713
// Like BigstackReset(), but rounds the (possibly unaligned) address up to the
// next cacheline boundary first.
HEADER_INLINE void BigstackBaseSet(const void* unaligned_base) {
  g_bigstack_base = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, unaligned_base), kCacheline));
}
717
// Shrinks the most recent bottom-of-stack allocation (starting at rebase) to
// new_size bytes, returning the freed tail to the arena.
HEADER_INLINE void BigstackShrinkTop(const void* rebase, uintptr_t new_size) {
  // could assert that this doesn't go in the wrong direction?
  g_bigstack_base = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, rebase) + new_size, kCacheline));
}
722
723 // ensure vector-alignment
724 CONSTI32(kEndAllocAlign, MAXV(kBytesPerVec, 16));
725
// Like BigstackEndReset(), but rounds the (possibly unaligned) address down
// to kEndAllocAlign first, preserving end-allocation alignment guarantees.
HEADER_INLINE void BigstackEndSet(const void* unaligned_end) {
  g_bigstack_end = R_CAST(unsigned char*, RoundDownPow2(R_CAST(uintptr_t, unaligned_end), kEndAllocAlign));
}
729
730 // assumes size is divisible by kEndAllocAlign
731 // assumes enough space is available
// Unchecked top-of-stack allocation: just decrements g_bigstack_end.
// assumes size is divisible by kEndAllocAlign
// assumes enough space is available
HEADER_INLINE void* bigstack_end_alloc_raw(uintptr_t size) {
  assert(!(size % kEndAllocAlign));
  g_bigstack_end -= size;
  return g_bigstack_end;
}
737
// Like bigstack_end_alloc_raw(), but rounds size up to a kEndAllocAlign
// multiple itself; still assumes enough space is available.
HEADER_INLINE void* bigstack_end_alloc_raw_rd(uintptr_t size) {
  g_bigstack_end -= RoundUpPow2(size, kEndAllocAlign);
  return g_bigstack_end;
}
742
bigstack_end_alloc_presized(uintptr_t size)743 HEADER_INLINE void* bigstack_end_alloc_presized(uintptr_t size) {
744 uintptr_t cur_bigstack_left = bigstack_left();
745 if (size > cur_bigstack_left) {
746 g_failed_alloc_attempt_size = size;
747 return nullptr;
748 }
749 return bigstack_end_alloc_raw(size);
750 }
751
bigstack_end_alloc(uintptr_t size)752 HEADER_INLINE void* bigstack_end_alloc(uintptr_t size) {
753 size = RoundUpPow2(size, kEndAllocAlign);
754 return bigstack_end_alloc_presized(size);
755 }
756
// Alias for bigstack_end_alloc(): end allocations are already
// vector-aligned (kEndAllocAlign >= kBytesPerVec), so no extra work is
// needed.
HEADER_INLINE void* bigstack_end_aligned_alloc(uintptr_t size) {
  return bigstack_end_alloc(size);
}
760
bigstack_end_alloc_c(uintptr_t ct,char ** c_arr_ptr)761 HEADER_INLINE BoolErr bigstack_end_alloc_c(uintptr_t ct, char** c_arr_ptr) {
762 *c_arr_ptr = S_CAST(char*, bigstack_end_alloc(ct));
763 return !(*c_arr_ptr);
764 }
765
bigstack_end_alloc_d(uintptr_t ct,double ** d_arr_ptr)766 HEADER_INLINE BoolErr bigstack_end_alloc_d(uintptr_t ct, double** d_arr_ptr) {
767 *d_arr_ptr = S_CAST(double*, bigstack_end_alloc(ct * sizeof(double)));
768 return !(*d_arr_ptr);
769 }
770
bigstack_end_alloc_f(uintptr_t ct,float ** f_arr_ptr)771 HEADER_INLINE BoolErr bigstack_end_alloc_f(uintptr_t ct, float** f_arr_ptr) {
772 *f_arr_ptr = S_CAST(float*, bigstack_end_alloc(ct * sizeof(float)));
773 return !(*f_arr_ptr);
774 }
775
bigstack_end_alloc_i32(uintptr_t ct,int32_t ** i32_arr_ptr)776 HEADER_INLINE BoolErr bigstack_end_alloc_i32(uintptr_t ct, int32_t** i32_arr_ptr) {
777 *i32_arr_ptr = S_CAST(int32_t*, bigstack_end_alloc(ct * sizeof(int32_t)));
778 return !(*i32_arr_ptr);
779 }
780
bigstack_end_alloc_uc(uintptr_t ct,unsigned char ** uc_arr_ptr)781 HEADER_INLINE BoolErr bigstack_end_alloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
782 *uc_arr_ptr = S_CAST(unsigned char*, bigstack_end_alloc(ct));
783 return !(*uc_arr_ptr);
784 }
785
bigstack_end_alloc_u32(uintptr_t ct,uint32_t ** u32_arr_ptr)786 HEADER_INLINE BoolErr bigstack_end_alloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr) {
787 *u32_arr_ptr = S_CAST(uint32_t*, bigstack_end_alloc(ct * sizeof(int32_t)));
788 return !(*u32_arr_ptr);
789 }
790
bigstack_end_alloc_w(uintptr_t ct,uintptr_t ** w_arr_ptr)791 HEADER_INLINE BoolErr bigstack_end_alloc_w(uintptr_t ct, uintptr_t** w_arr_ptr) {
792 *w_arr_ptr = S_CAST(uintptr_t*, bigstack_end_alloc(ct * sizeof(intptr_t)));
793 return !(*w_arr_ptr);
794 }
795
bigstack_end_alloc_i64(uintptr_t ct,int64_t ** i64_arr_ptr)796 HEADER_INLINE BoolErr bigstack_end_alloc_i64(uintptr_t ct, int64_t** i64_arr_ptr) {
797 *i64_arr_ptr = S_CAST(int64_t*, bigstack_end_alloc(ct * sizeof(int64_t)));
798 return !(*i64_arr_ptr);
799 }
800
bigstack_end_alloc_u64(uintptr_t ct,uint64_t ** u64_arr_ptr)801 HEADER_INLINE BoolErr bigstack_end_alloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr) {
802 *u64_arr_ptr = S_CAST(uint64_t*, bigstack_end_alloc(ct * sizeof(int64_t)));
803 return !(*u64_arr_ptr);
804 }
805
// Singly-linked list node carrying an inline string (C99 flexible array
// member); allocate sizeof(LlStr) plus the string's byte length.
typedef struct LlStrStruct {
  NONCOPYABLE(LlStrStruct);
  struct LlStrStruct* next;
  char str[];
} LlStr;
811
bigstack_end_alloc_llstr(uintptr_t str_bytes,LlStr ** llstr_arr_ptr)812 HEADER_INLINE BoolErr bigstack_end_alloc_llstr(uintptr_t str_bytes, LlStr** llstr_arr_ptr) {
813 *llstr_arr_ptr = S_CAST(LlStr*, bigstack_end_alloc(str_bytes + sizeof(LlStr)));
814 return !(*llstr_arr_ptr);
815 }
816
bigstack_end_alloc_cp(uintptr_t ct,char *** cp_arr_ptr)817 HEADER_INLINE BoolErr bigstack_end_alloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
818 *cp_arr_ptr = S_CAST(char**, bigstack_end_alloc(ct * sizeof(intptr_t)));
819 return !(*cp_arr_ptr);
820 }
821
bigstack_end_alloc_kcp(uintptr_t ct,const char *** kcp_arr_ptr)822 HEADER_INLINE BoolErr bigstack_end_alloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr) {
823 *kcp_arr_ptr = S_CAST(const char**, bigstack_end_alloc(ct * sizeof(intptr_t)));
824 return !(*kcp_arr_ptr);
825 }
826
bigstack_end_alloc_ucp(uintptr_t ct,unsigned char *** ucp_arr_ptr)827 HEADER_INLINE BoolErr bigstack_end_alloc_ucp(uintptr_t ct, unsigned char*** ucp_arr_ptr) {
828 *ucp_arr_ptr = S_CAST(unsigned char**, bigstack_end_alloc(ct * sizeof(intptr_t)));
829 return !(*ucp_arr_ptr);
830 }
831
832 BoolErr bigstack_end_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr);
833
834 BoolErr bigstack_end_calloc_d(uintptr_t ct, double** d_arr_ptr);
835
836 BoolErr bigstack_end_calloc_f(uintptr_t ct, float** f_arr_ptr);
837
838 BoolErr bigstack_end_calloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr);
839
840 BoolErr bigstack_end_calloc_w(uintptr_t ct, uintptr_t** w_arr_ptr);
841
842 BoolErr bigstack_end_calloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr);
843
844 BoolErr bigstack_end_calloc_cp(uintptr_t ct, char*** cp_arr_ptr);
845
bigstack_end_calloc_c(uintptr_t ct,char ** c_arr_ptr)846 HEADER_INLINE BoolErr bigstack_end_calloc_c(uintptr_t ct, char** c_arr_ptr) {
847 return bigstack_end_calloc_uc(ct, R_CAST(unsigned char**, c_arr_ptr));
848 }
849
bigstack_end_calloc_i32(uintptr_t ct,int32_t ** i32_arr_ptr)850 HEADER_INLINE BoolErr bigstack_end_calloc_i32(uintptr_t ct, int32_t** i32_arr_ptr) {
851 return bigstack_end_calloc_u32(ct, R_CAST(uint32_t**, i32_arr_ptr));
852 }
853
bigstack_end_calloc_i64(uintptr_t ct,int64_t ** i64_arr_ptr)854 HEADER_INLINE BoolErr bigstack_end_calloc_i64(uintptr_t ct, int64_t** i64_arr_ptr) {
855 return bigstack_end_calloc_u64(ct, R_CAST(uint64_t**, i64_arr_ptr));
856 }
857
bigstack_end_calloc_kcp(uintptr_t ct,const char *** kcp_arr_ptr)858 HEADER_INLINE BoolErr bigstack_end_calloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr) {
859 return bigstack_end_calloc_cp(ct, K_CAST(char***, kcp_arr_ptr));
860 }
861
862 // and here's the interface for a non-global arena (necessary for some
863 // multithreaded code).
arena_alloc_raw(uintptr_t size,unsigned char ** arena_bottom_ptr)864 HEADER_INLINE void* arena_alloc_raw(uintptr_t size, unsigned char** arena_bottom_ptr) {
865 assert(!(size % kCacheline));
866 unsigned char* alloc_ptr = *arena_bottom_ptr;
867 *arena_bottom_ptr = &(alloc_ptr[size]);
868 return alloc_ptr;
869 }
870
arena_alloc_raw_rd(uintptr_t size,unsigned char ** arena_bottom_ptr)871 HEADER_INLINE void* arena_alloc_raw_rd(uintptr_t size, unsigned char** arena_bottom_ptr) {
872 unsigned char* alloc_ptr = *arena_bottom_ptr;
873 *arena_bottom_ptr = &(alloc_ptr[RoundUpPow2(size, kCacheline)]);
874 return alloc_ptr;
875 }
876
arena_alloc(unsigned char * arena_top,uintptr_t size,unsigned char ** arena_bottom_ptr)877 HEADER_INLINE void* arena_alloc(unsigned char* arena_top, uintptr_t size, unsigned char** arena_bottom_ptr) {
878 size = RoundUpPow2(size, kCacheline);
879 if (S_CAST(uintptr_t, arena_top - (*arena_bottom_ptr)) < size) {
880 g_failed_alloc_attempt_size = size;
881 return nullptr;
882 }
883 return arena_alloc_raw(size, arena_bottom_ptr);
884 }
885
arena_alloc_c(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,char ** c_arr_ptr)886 HEADER_INLINE BoolErr arena_alloc_c(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, char** c_arr_ptr) {
887 *c_arr_ptr = S_CAST(char*, arena_alloc(arena_top, ct, arena_bottom_ptr));
888 return !(*c_arr_ptr);
889 }
890
arena_alloc_d(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,double ** d_arr_ptr)891 HEADER_INLINE BoolErr arena_alloc_d(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, double** d_arr_ptr) {
892 *d_arr_ptr = S_CAST(double*, arena_alloc(arena_top, ct * sizeof(double), arena_bottom_ptr));
893 return !(*d_arr_ptr);
894 }
895
arena_alloc_f(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,float ** f_arr_ptr)896 HEADER_INLINE BoolErr arena_alloc_f(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, float** f_arr_ptr) {
897 *f_arr_ptr = S_CAST(float*, arena_alloc(arena_top, ct * sizeof(float), arena_bottom_ptr));
898 return !(*f_arr_ptr);
899 }
900
arena_alloc_i32(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,int32_t ** i32_arr_ptr)901 HEADER_INLINE BoolErr arena_alloc_i32(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, int32_t** i32_arr_ptr) {
902 *i32_arr_ptr = S_CAST(int32_t*, arena_alloc(arena_top, ct * sizeof(int32_t), arena_bottom_ptr));
903 return !(*i32_arr_ptr);
904 }
905
arena_alloc_uc(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,unsigned char ** uc_arr_ptr)906 HEADER_INLINE BoolErr arena_alloc_uc(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, unsigned char** uc_arr_ptr) {
907 *uc_arr_ptr = S_CAST(unsigned char*, arena_alloc(arena_top, ct, arena_bottom_ptr));
908 return !(*uc_arr_ptr);
909 }
910
arena_alloc_u32(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,uint32_t ** u32_arr_ptr)911 HEADER_INLINE BoolErr arena_alloc_u32(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, uint32_t** u32_arr_ptr) {
912 *u32_arr_ptr = S_CAST(uint32_t*, arena_alloc(arena_top, ct * sizeof(int32_t), arena_bottom_ptr));
913 return !(*u32_arr_ptr);
914 }
915
arena_alloc_w(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,uintptr_t ** w_arr_ptr)916 HEADER_INLINE BoolErr arena_alloc_w(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, uintptr_t** w_arr_ptr) {
917 *w_arr_ptr = S_CAST(uintptr_t*, arena_alloc(arena_top, ct * sizeof(intptr_t), arena_bottom_ptr));
918 return !(*w_arr_ptr);
919 }
920
arena_alloc_i64(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,int64_t ** i64_arr_ptr)921 HEADER_INLINE BoolErr arena_alloc_i64(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, int64_t** i64_arr_ptr) {
922 *i64_arr_ptr = S_CAST(int64_t*, arena_alloc(arena_top, ct * sizeof(int64_t), arena_bottom_ptr));
923 return !(*i64_arr_ptr);
924 }
925
arena_alloc_u64(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,uint64_t ** u64_arr_ptr)926 HEADER_INLINE BoolErr arena_alloc_u64(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, uint64_t** u64_arr_ptr) {
927 *u64_arr_ptr = S_CAST(uint64_t*, arena_alloc(arena_top, ct * sizeof(int64_t), arena_bottom_ptr));
928 return !(*u64_arr_ptr);
929 }
930
arena_alloc_cp(unsigned char * arena_top,uintptr_t ct,unsigned char ** arena_bottom_ptr,char *** cp_arr_ptr)931 HEADER_INLINE BoolErr arena_alloc_cp(unsigned char* arena_top, uintptr_t ct, unsigned char** arena_bottom_ptr, char*** cp_arr_ptr) {
932 *cp_arr_ptr = S_CAST(char**, arena_alloc(arena_top, ct * sizeof(intptr_t), arena_bottom_ptr));
933 return !(*cp_arr_ptr);
934 }
935
ArenaBaseSet(const void * unaligned_base,unsigned char ** arena_bottom_ptr)936 HEADER_INLINE void ArenaBaseSet(const void* unaligned_base, unsigned char** arena_bottom_ptr) {
937 *arena_bottom_ptr = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, unaligned_base), kCacheline));
938 }
939
arena_end_alloc_raw(uintptr_t size,unsigned char ** arena_top_ptr)940 HEADER_INLINE void* arena_end_alloc_raw(uintptr_t size, unsigned char** arena_top_ptr) {
941 assert(!(size % kEndAllocAlign));
942 unsigned char* alloc_ptr = *arena_top_ptr;
943 alloc_ptr -= size;
944 *arena_top_ptr = alloc_ptr;
945 return alloc_ptr;
946 }
947
arena_end_alloc(unsigned char * arena_bottom,uintptr_t size,unsigned char ** arena_top_ptr)948 HEADER_INLINE void* arena_end_alloc(unsigned char* arena_bottom, uintptr_t size, unsigned char** arena_top_ptr) {
949 size = RoundUpPow2(size, kEndAllocAlign);
950 if (unlikely(S_CAST(uintptr_t, (*arena_top_ptr) - arena_bottom) < size)) {
951 g_failed_alloc_attempt_size = size;
952 return nullptr;
953 }
954 return arena_end_alloc_raw(size, arena_top_ptr);
955 }
956
arena_end_alloc_c(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,char ** c_arr_ptr)957 HEADER_INLINE BoolErr arena_end_alloc_c(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, char** c_arr_ptr) {
958 *c_arr_ptr = S_CAST(char*, arena_end_alloc(arena_bottom, ct, arena_top_ptr));
959 return !(*c_arr_ptr);
960 }
961
arena_end_alloc_d(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,double ** d_arr_ptr)962 HEADER_INLINE BoolErr arena_end_alloc_d(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, double** d_arr_ptr) {
963 *d_arr_ptr = S_CAST(double*, arena_end_alloc(arena_bottom, ct * sizeof(double), arena_top_ptr));
964 return !(*d_arr_ptr);
965 }
966
arena_end_alloc_f(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,float ** f_arr_ptr)967 HEADER_INLINE BoolErr arena_end_alloc_f(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, float** f_arr_ptr) {
968 *f_arr_ptr = S_CAST(float*, arena_end_alloc(arena_bottom, ct * sizeof(float), arena_top_ptr));
969 return !(*f_arr_ptr);
970 }
971
arena_end_alloc_i32(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,int32_t ** i32_arr_ptr)972 HEADER_INLINE BoolErr arena_end_alloc_i32(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, int32_t** i32_arr_ptr) {
973 *i32_arr_ptr = S_CAST(int32_t*, arena_end_alloc(arena_bottom, ct * sizeof(int32_t), arena_top_ptr));
974 return !(*i32_arr_ptr);
975 }
976
arena_end_alloc_uc(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,unsigned char ** uc_arr_ptr)977 HEADER_INLINE BoolErr arena_end_alloc_uc(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, unsigned char** uc_arr_ptr) {
978 *uc_arr_ptr = S_CAST(unsigned char*, arena_end_alloc(arena_bottom, ct, arena_top_ptr));
979 return !(*uc_arr_ptr);
980 }
981
arena_end_alloc_u32(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,uint32_t ** u32_arr_ptr)982 HEADER_INLINE BoolErr arena_end_alloc_u32(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, uint32_t** u32_arr_ptr) {
983 *u32_arr_ptr = S_CAST(uint32_t*, arena_end_alloc(arena_bottom, ct * sizeof(int32_t), arena_top_ptr));
984 return !(*u32_arr_ptr);
985 }
986
arena_end_alloc_w(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,uintptr_t ** w_arr_ptr)987 HEADER_INLINE BoolErr arena_end_alloc_w(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, uintptr_t** w_arr_ptr) {
988 *w_arr_ptr = S_CAST(uintptr_t*, arena_end_alloc(arena_bottom, ct * sizeof(intptr_t), arena_top_ptr));
989 return !(*w_arr_ptr);
990 }
991
arena_end_alloc_i64(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,int64_t ** i64_arr_ptr)992 HEADER_INLINE BoolErr arena_end_alloc_i64(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, int64_t** i64_arr_ptr) {
993 *i64_arr_ptr = S_CAST(int64_t*, arena_end_alloc(arena_bottom, ct * sizeof(int64_t), arena_top_ptr));
994 return !(*i64_arr_ptr);
995 }
996
arena_end_alloc_u64(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,uint64_t ** u64_arr_ptr)997 HEADER_INLINE BoolErr arena_end_alloc_u64(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, uint64_t** u64_arr_ptr) {
998 *u64_arr_ptr = S_CAST(uint64_t*, arena_end_alloc(arena_bottom, ct * sizeof(int64_t), arena_top_ptr));
999 return !(*u64_arr_ptr);
1000 }
1001
arena_end_alloc_cp(unsigned char * arena_bottom,uintptr_t ct,unsigned char ** arena_top_ptr,char *** cp_arr_ptr)1002 HEADER_INLINE BoolErr arena_end_alloc_cp(unsigned char* arena_bottom, uintptr_t ct, unsigned char** arena_top_ptr, char*** cp_arr_ptr) {
1003 *cp_arr_ptr = S_CAST(char**, arena_end_alloc(arena_bottom, ct * sizeof(intptr_t), arena_top_ptr));
1004 return !(*cp_arr_ptr);
1005 }
1006
1007
1008 BoolErr PushLlStr(const char* str, LlStr** ll_stack_ptr);
1009
1010 // Does not require null-termination.
1011 // BoolErr push_llstr_counted(const char* str, uint32_t slen, LlStr** ll_stack_ptr);
1012
// Length-prefixed inline string (C99 flexible array member); len is the
// string's length, stored alongside the bytes to avoid repeated strlen calls.
typedef struct L32StrStruct {
  NONCOPYABLE(L32StrStruct);
  uint32_t len;
  char str[];
} L32Str;
1018
1019
1020 // assumes multistr is nonempty
1021 BoolErr CountAndMeasureMultistrReverseAlloc(const char* multistr, uintptr_t max_str_ct, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr, const char*** strptr_arrp);
1022
1023 BoolErr MultistrToStrboxDedupArenaAlloc(unsigned char* arena_top, const char* multistr, unsigned char** arena_bottom_ptr, char** sorted_strbox_ptr, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr);
1024
// Convenience wrapper: runs MultistrToStrboxDedupArenaAlloc() against the
// global bigstack arena (bottom grows, end is the limit).
HEADER_INLINE BoolErr MultistrToStrboxDedupAlloc(const char* multistr, char** sorted_strbox_ptr, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr) {
  return MultistrToStrboxDedupArenaAlloc(g_bigstack_end, multistr, &g_bigstack_base, sorted_strbox_ptr, str_ct_ptr, max_blen_ptr);
}
1028
1029
1030 void DivisionMagicNums(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp);
1031
1032 // ZeroU32Arr, ZeroWArr, ZeroU64Arr, SetAllWArr currently defined in
1033 // plink2_base.h
1034
// Zero-fills entry_ct vectors (entry_ct * kBytesPerVec bytes).
HEADER_INLINE void ZeroVecArr(uintptr_t entry_ct, VecW* vvec) {
  memset(vvec, 0, entry_ct * kBytesPerVec);
}
1038
SetAllU64Arr(uintptr_t entry_ct,uint64_t * u64arr)1039 HEADER_INLINE void SetAllU64Arr(uintptr_t entry_ct, uint64_t* u64arr) {
1040 // bugfix (1 Feb 2018): forgot to multiply by 2 in 32-bit case
1041 SetAllWArr(entry_ct * (sizeof(int64_t) / kBytesPerWord), R_CAST(uintptr_t*, u64arr));
1042 }
1043
// Zero-fills entry_ct int32_t values.
HEADER_INLINE void ZeroI32Arr(uintptr_t entry_ct, int32_t* i32arr) {
  memset(i32arr, 0, entry_ct * sizeof(int32_t));
}
1047
SetAllI32Arr(uintptr_t entry_ct,int32_t * i32arr)1048 HEADER_INLINE void SetAllI32Arr(uintptr_t entry_ct, int32_t* i32arr) {
1049 for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
1050 *i32arr++ = -1;
1051 }
1052 }
1053
SetAllU32Arr(uintptr_t entry_ct,uint32_t * u32arr)1054 HEADER_INLINE void SetAllU32Arr(uintptr_t entry_ct, uint32_t* u32arr) {
1055 for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
1056 *u32arr++ = ~0U;
1057 }
1058 }
1059
ZeroFArr(uintptr_t entry_ct,float * farr)1060 HEADER_INLINE void ZeroFArr(uintptr_t entry_ct, float* farr) {
1061 for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
1062 *farr++ = 0.0;
1063 }
1064 }
1065
ZeroDArr(uintptr_t entry_ct,double * darr)1066 HEADER_INLINE void ZeroDArr(uintptr_t entry_ct, double* darr) {
1067 for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
1068 *darr++ = 0.0;
1069 }
1070 }
1071
1072
ZeromovFArr(uintptr_t entry_ct,float ** farr_ptr)1073 HEADER_INLINE void ZeromovFArr(uintptr_t entry_ct, float** farr_ptr) {
1074 float* farr = *farr_ptr;
1075 for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
1076 *farr++ = 0.0;
1077 }
1078 *farr_ptr = farr;
1079 }
1080
1081
1082 // SetAllBits, IsSet, SetBit, ClearBit, AdvTo1Bit, AdvTo0Bit, AdvBoundedTo1Bit,
1083 // FindLast1BitBefore, AllWordsAreZero defined in plink2_base.h
1084
1085 // Useful when we don't want to think about the signedness of a 32-bit int.
SetBitI(int32_t loc,uintptr_t * bitarr)1086 HEADER_INLINE void SetBitI(int32_t loc, uintptr_t* bitarr) {
1087 SetBit(S_CAST(uint32_t, loc), bitarr);
1088 }
1089
FlipBit(uintptr_t loc,uintptr_t * bitarr)1090 HEADER_INLINE void FlipBit(uintptr_t loc, uintptr_t* bitarr) {
1091 bitarr[loc / kBitsPerWord] ^= k1LU << (loc % kBitsPerWord);
1092 }
1093
1094 // "Nz" added to names to make it obvious these require positive len
1095 void FillBitsNz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr);
1096 void ClearBitsNz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr);
1097
1098 // floor permitted to be -1, though not smaller than that.
1099 int32_t FindLast1BitBeforeBounded(const uintptr_t* bitarr, uint32_t loc, int32_t floor);
1100
1101 // This can be made a tiny bit faster than memequal() in isolation, but the
1102 // difference is almost certainly too small to justify additional i-cache
1103 // pressure.
// Returns nonzero iff the first word_ct words of the two arrays are equal.
HEADER_INLINE uint32_t wordsequal(const uintptr_t* word_arr1, const uintptr_t* word_arr2, uintptr_t word_ct) {
  return memequal(word_arr1, word_arr2, word_ct * kBytesPerWord);
}
1107
1108
1109 // BitvecAnd(), BitvecInvmask(), BitvecOr(), BitvecInvert() in plink2_base.h
1110
1111 void BitvecAndCopy(const uintptr_t* __restrict source1_bitvec, const uintptr_t* __restrict source2_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec);
1112
1113 void BitvecInvmaskCopy(const uintptr_t* __restrict source_bitvec, const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec);
1114
1115 void BitvecInvertCopy(const uintptr_t* __restrict source_bitvec, uintptr_t word_ct, uintptr_t* __restrict target_bitvec);
1116
1117 // These ensure the trailing bits are zeroed out.
1118 // 'AlignedBitarr' instead of Bitvec since this takes bit_ct instead of word_ct
1119 // as the size argument, and zeroes trailing bits.
AlignedBitarrInvert(uintptr_t bit_ct,uintptr_t * main_bitvec)1120 HEADER_INLINE void AlignedBitarrInvert(uintptr_t bit_ct, uintptr_t* main_bitvec) {
1121 const uintptr_t fullword_ct = bit_ct / kBitsPerWord;
1122 BitvecInvert(fullword_ct, main_bitvec);
1123 const uint32_t trail_ct = bit_ct % kBitsPerWord;
1124 if (trail_ct) {
1125 main_bitvec[fullword_ct] = bzhi(~main_bitvec[fullword_ct], trail_ct);
1126 }
1127 }
1128
AlignedBitarrInvertCopy(const uintptr_t * __restrict source_bitvec,uintptr_t bit_ct,uintptr_t * __restrict target_bitvec)1129 HEADER_INLINE void AlignedBitarrInvertCopy(const uintptr_t* __restrict source_bitvec, uintptr_t bit_ct, uintptr_t* __restrict target_bitvec) {
1130 const uintptr_t fullword_ct = bit_ct / kBitsPerWord;
1131 BitvecInvertCopy(source_bitvec, fullword_ct, target_bitvec);
1132 const uint32_t trail_ct = bit_ct % kBitsPerWord;
1133 if (trail_ct) {
1134 target_bitvec[fullword_ct] = bzhi(~source_bitvec[fullword_ct], trail_ct);
1135 }
1136 }
1137
1138 // void BitvecXor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);
1139
1140 void BitvecInvertAndMask(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);
1141
1142 // void BitvecOrNot(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);
1143
1144 // Address C-only incompatible-pointer-types-discards-qualifiers warning.
1145 #ifdef __cplusplus
1146 # define TO_CONSTU32PCONSTP(u32_pp) (u32_pp)
1147 #else
1148 # define TO_CONSTU32PCONSTP(u32_pp) ((const uint32_t* const*)(u32_pp))
1149 #endif
1150
U32VecAdd(const VecU32 * arg_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1151 HEADER_INLINE void U32VecAdd(const VecU32* arg_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1152 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1153 main_u32vec[vidx] += arg_u32vec[vidx];
1154 }
1155 }
1156
U32CastVecAdd(const uint32_t * arg_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1157 HEADER_INLINE void U32CastVecAdd(const uint32_t* arg_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1158 U32VecAdd(R_CAST(const VecU32*, arg_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1159 }
1160
I32CastVecAdd(const int32_t * arg_i32arr,uintptr_t vec_ct,int32_t * main_i32arr)1161 HEADER_INLINE void I32CastVecAdd(const int32_t* arg_i32arr, uintptr_t vec_ct, int32_t* main_i32arr) {
1162 U32VecAdd(R_CAST(const VecU32*, arg_i32arr), vec_ct, R_CAST(VecU32*, main_i32arr));
1163 }
1164
U32VecSub(const VecU32 * arg_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1165 HEADER_INLINE void U32VecSub(const VecU32* arg_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1166 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1167 main_u32vec[vidx] -= arg_u32vec[vidx];
1168 }
1169 }
1170
U32CastVecSub(const uint32_t * arg_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1171 HEADER_INLINE void U32CastVecSub(const uint32_t* arg_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1172 U32VecSub(R_CAST(const VecU32*, arg_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1173 }
1174
U32VecMaskedAdd(const VecU32 * mask_u32vec,const VecU32 * arg_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1175 HEADER_INLINE void U32VecMaskedAdd(const VecU32* mask_u32vec, const VecU32* arg_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1176 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1177 main_u32vec[vidx] += mask_u32vec[vidx] & arg_u32vec[vidx];
1178 }
1179 }
1180
U32CastVecMaskedAdd(const uint32_t * mask_u32arr,const uint32_t * arg_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1181 HEADER_INLINE void U32CastVecMaskedAdd(const uint32_t* mask_u32arr, const uint32_t* arg_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1182 U32VecMaskedAdd(R_CAST(const VecU32*, mask_u32arr), R_CAST(const VecU32*, arg_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1183 }
1184
U32VecInvmaskedAdd(const VecU32 * invmask_u32vec,const VecU32 * arg_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1185 HEADER_INLINE void U32VecInvmaskedAdd(const VecU32* invmask_u32vec, const VecU32* arg_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1186 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1187 main_u32vec[vidx] += vecu32_and_notfirst(invmask_u32vec[vidx], arg_u32vec[vidx]);
1188 }
1189 }
1190
U32CastVecInvmaskedAdd(const uint32_t * invmask_u32arr,const uint32_t * arg_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1191 HEADER_INLINE void U32CastVecInvmaskedAdd(const uint32_t* invmask_u32arr, const uint32_t* arg_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1192 U32VecInvmaskedAdd(R_CAST(const VecU32*, invmask_u32arr), R_CAST(const VecU32*, arg_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1193 }
1194
1195
U32VecAdd2(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1196 HEADER_INLINE void U32VecAdd2(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1197 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1198 main_u32vec[vidx] += arg1_u32vec[vidx] + arg2_u32vec[vidx];
1199 }
1200 }
1201
U32CastVecAdd2(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1202 HEADER_INLINE void U32CastVecAdd2(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1203 U32VecAdd2(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1204 }
1205
U32VecAssignAdd2(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1206 HEADER_INLINE void U32VecAssignAdd2(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1207 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1208 main_u32vec[vidx] = arg1_u32vec[vidx] + arg2_u32vec[vidx];
1209 }
1210 }
1211
U32CastVecAssignAdd2(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1212 HEADER_INLINE void U32CastVecAssignAdd2(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1213 U32VecAssignAdd2(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1214 }
1215
U32VecMaskedAdd2(const VecU32 * mask_u32vec,const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1216 HEADER_INLINE void U32VecMaskedAdd2(const VecU32* mask_u32vec, const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1217 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1218 main_u32vec[vidx] += mask_u32vec[vidx] & (arg1_u32vec[vidx] + arg2_u32vec[vidx]);
1219 }
1220 }
1221
U32CastVecMaskedAdd2(const uint32_t * mask_u32arr,const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1222 HEADER_INLINE void U32CastVecMaskedAdd2(const uint32_t* mask_u32arr, const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1223 U32VecMaskedAdd2(R_CAST(const VecU32*, mask_u32arr), R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1224 }
1225
U32VecAddSub(const VecU32 * add_u32vec,const VecU32 * sub_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1226 HEADER_INLINE void U32VecAddSub(const VecU32* add_u32vec, const VecU32* sub_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1227 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1228 main_u32vec[vidx] += add_u32vec[vidx] - sub_u32vec[vidx];
1229 }
1230 }
1231
U32CastVecAddSub(const uint32_t * add_u32arr,const uint32_t * sub_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1232 HEADER_INLINE void U32CastVecAddSub(const uint32_t* add_u32arr, const uint32_t* sub_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1233 U32VecAddSub(R_CAST(const VecU32*, add_u32arr), R_CAST(const VecU32*, sub_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1234 }
1235
1236
U32VecAdd3(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,const VecU32 * arg3_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1237 HEADER_INLINE void U32VecAdd3(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, const VecU32* arg3_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1238 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1239 main_u32vec[vidx] += arg1_u32vec[vidx] + arg2_u32vec[vidx] + arg3_u32vec[vidx];
1240 }
1241 }
1242
U32CastVecAdd3(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,const uint32_t * arg3_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1243 HEADER_INLINE void U32CastVecAdd3(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, const uint32_t* arg3_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1244 U32VecAdd3(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), R_CAST(const VecU32*, arg3_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1245 }
1246
U32VecAssignAdd3(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,const VecU32 * arg3_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1247 HEADER_INLINE void U32VecAssignAdd3(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, const VecU32* arg3_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1248 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1249 main_u32vec[vidx] = arg1_u32vec[vidx] + arg2_u32vec[vidx] + arg3_u32vec[vidx];
1250 }
1251 }
1252
U32CastVecAssignAdd3(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,const uint32_t * arg3_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1253 HEADER_INLINE void U32CastVecAssignAdd3(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, const uint32_t* arg3_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1254 U32VecAssignAdd3(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), R_CAST(const VecU32*, arg3_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1255 }
1256
U32VecSub3(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,const VecU32 * arg3_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1257 HEADER_INLINE void U32VecSub3(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, const VecU32* arg3_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1258 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1259 main_u32vec[vidx] -= arg1_u32vec[vidx] + arg2_u32vec[vidx] + arg3_u32vec[vidx];
1260 }
1261 }
1262
U32CastVecSub3(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,const uint32_t * arg3_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1263 HEADER_INLINE void U32CastVecSub3(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, const uint32_t* arg3_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1264 U32VecSub3(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), R_CAST(const VecU32*, arg3_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1265 }
1266
U32VecAssignAdd5(const VecU32 * arg1_u32vec,const VecU32 * arg2_u32vec,const VecU32 * arg3_u32vec,const VecU32 * arg4_u32vec,const VecU32 * arg5_u32vec,uintptr_t vec_ct,VecU32 * main_u32vec)1267 HEADER_INLINE void U32VecAssignAdd5(const VecU32* arg1_u32vec, const VecU32* arg2_u32vec, const VecU32* arg3_u32vec, const VecU32* arg4_u32vec, const VecU32* arg5_u32vec, uintptr_t vec_ct, VecU32* main_u32vec) {
1268 for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
1269 main_u32vec[vidx] = arg1_u32vec[vidx] + arg2_u32vec[vidx] + arg3_u32vec[vidx] + arg4_u32vec[vidx] + arg5_u32vec[vidx];
1270 }
1271 }
1272
U32CastVecAssignAdd5(const uint32_t * arg1_u32arr,const uint32_t * arg2_u32arr,const uint32_t * arg3_u32arr,const uint32_t * arg4_u32arr,const uint32_t * arg5_u32arr,uintptr_t vec_ct,uint32_t * main_u32arr)1273 HEADER_INLINE void U32CastVecAssignAdd5(const uint32_t* arg1_u32arr, const uint32_t* arg2_u32arr, const uint32_t* arg3_u32arr, const uint32_t* arg4_u32arr, const uint32_t* arg5_u32arr, uintptr_t vec_ct, uint32_t* main_u32arr) {
1274 U32VecAssignAdd5(R_CAST(const VecU32*, arg1_u32arr), R_CAST(const VecU32*, arg2_u32arr), R_CAST(const VecU32*, arg3_u32arr), R_CAST(const VecU32*, arg4_u32arr), R_CAST(const VecU32*, arg5_u32arr), vec_ct, R_CAST(VecU32*, main_u32arr));
1275 }
1276
1277
1278 // basic linear scan
1279 // returns -1 on failure to find, -2 if duplicate
1280 int32_t GetVariantUidxWithoutHtable(const char* idstr, const char* const* variant_ids, const uintptr_t* variant_include, uint32_t variant_ct);
1281
1282 // copy_subset() doesn't exist since a loop of the form
1283 // uintptr_t uidx_base = 0;
1284 // uintptr_t cur_bits = subset_mask[0];
1285 // for (uint32_t idx = 0; idx != subset_size; ++idx) {
1286 // const uintptr_t uidx = BitIter1(subset_mask, &uidx_base, &cur_bits);
1287 // *target_iter++ = source_arr[uidx];
1288 // }
1289 // seems to compile better?
1290 //
1291 // similarly, copy_when_nonmissing() retired in favor of genoarr_to_nonmissing
1292 // followed by a for loop
1293
1294 // Benchmark results justify replacing this with XXH3 when the latter is ready.
1295 // Not considering it ready yet since the development XXH_INLINE_ALL setup
1296 // isn't very friendly.
1297 //
1298 // (Can also try using _mm_aesenc_si128() in a similar manner to the Golang
1299 // runtime when SSE4.2+ is available, and otherwise keeping Murmurhash. See
1300 // aeshashbody() in https://golang.org/src/runtime/asm_amd64.s .)
1301 //
1302 // Eventually want this to be a constexpr? Seems painful to make that work,
1303 // though.
1304 uint32_t Hash32(const void* key, uint32_t len);
1305
1306 // see http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
1307 // Note that this is a bit more vulnerable to adversarial input: modulo
1308 // reduction requires lots of hash collisions (or near-collisions) or known
1309 // hash table size, while this can be attacked without knowledge of the table
1310 // size. But it doesn't really matter; either way, you'd need to add something
1311 // like a randomly generated salt if you care about defending.
Hashceil(const char * idstr,uint32_t idlen,uint32_t htable_size)1312 HEADER_INLINE uint32_t Hashceil(const char* idstr, uint32_t idlen, uint32_t htable_size) {
1313 return (S_CAST(uint64_t, Hash32(idstr, idlen)) * htable_size) >> 32;
1314 }
1315
1316 // uintptr_t geqprime(uintptr_t floor);
1317
1318 // assumes ceil is odd and greater than 4. Returns the first prime <= ceil.
1319 // uintptr_t leqprime(uintptr_t ceil);
1320
GetHtableMinSize(uintptr_t item_ct)1321 HEADER_INLINE uint32_t GetHtableMinSize(uintptr_t item_ct) {
1322 // Don't need primes any more.
1323 return item_ct * 2;
1324 }
1325
1326 // load factor ~22% seems to yield the best speed/space tradeoff on my test
1327 // machines
GetHtableFastSize(uint32_t item_ct)1328 HEADER_INLINE uint32_t GetHtableFastSize(uint32_t item_ct) {
1329 if (item_ct < 954437176) {
1330 return (item_ct * 9LLU) / 2;
1331 }
1332 return 4294967290U;
1333 }
1334
1335 BoolErr HtableGoodSizeAlloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t** htable_ptr, uint32_t* htable_size_ptr);
1336
1337 // useful for duplicate detection: returns 0 on no duplicates, a positive index
1338 // of a duplicate pair if they're present
1339 uint32_t PopulateStrboxHtable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
1340
1341 // returned index in duplicate-pair case is unfiltered
1342 // uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable);
1343
1344 // cur_id DOES need to be null-terminated
1345 uint32_t IdHtableFind(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size);
1346
1347 // item_ids overread must be ok. In exchange, cur_id doesn't need to be
1348 // null-terminated any more.
1349 uint32_t IdHtableFindNnt(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size);
1350
1351 // assumes cur_id_slen < max_str_blen.
1352 // requires cur_id to be null-terminated.
1353 uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size);
1354
1355 // last variant_ids entry must be at least kMaxIdBlen bytes before end of
1356 // bigstack
1357 uint32_t VariantIdDupflagHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen);
1358
1359 uint32_t VariantIdDupHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, const uint32_t* htable_dup_base, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen, uint32_t* llidx_ptr);
1360
1361
// This still performs a temporary bigstack allocation; 'noalloc' here just
1363 // means that sorted_strbox and id_map must be allocated in advance. (Overread
1364 // must be safe.)
1365 PglErr CopySortStrboxSubsetNoalloc(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char* __restrict sorted_strbox, uint32_t* __restrict id_map);
1366
1367 PglErr CopySortStrboxSubset(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char** sorted_strbox_ptr, uint32_t** id_map_ptr);
1368
1369
// Parsed name-range list for flags accepting arguments like "name1-name2"
// (filled by ParseNameRanges(), consumed by the *RangeListToBitarr
// functions; owned storage is released by CleanupRangeList()).
typedef struct RangeListStruct {
  NONCOPYABLE(RangeListStruct);
  char* names;  // name_ct names, each in a name_max_blen-byte slot
  // starts_range[i] nonzero iff names[i] opens a "first-last" range --
  // presumably paired with the following entry as the range end; confirm
  // against ParseNameRanges().
  unsigned char* starts_range;
  uint32_t name_ct;
  uint32_t name_max_blen;  // bytes per name slot (max strlen + 1)
} RangeList;
1377
1378 void InitRangeList(RangeList* range_list_ptr);
1379
1380 void CleanupRangeList(RangeList* range_list_ptr);
1381
1382 // bitarr assumed to be initialized (but not necessarily zero-initialized)
1383 BoolErr NumericRangeListToBitarr(const RangeList* range_list_ptr, uint32_t bitarr_size, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr);
1384
1385 PglErr StringRangeListToBitarr(const char* header_line, const RangeList* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t* bitarr, int32_t* __restrict seen_idxs);
1386
1387 PglErr StringRangeListToBitarrAlloc(const char* header_line, const RangeList* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t** bitarr_ptr);
1388
1389
// Returns the number of set bits in bitvec[start_idx..end_idx-1] (whole
// words).  "Nzbase" = start_idx may be nonzero; since PopcountWords()
// requires a vector-aligned pointer, the 64-bit version first peels off
// leading words until start_idx reaches a vector boundary.
#ifdef __LP64__
HEADER_INLINE uintptr_t PopcountWordsNzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
  uintptr_t prefix_ct = 0;
#  ifdef USE_AVX2
  // AVX2 vectors span 4 words: peel up to 3 leading words.
  while (start_idx & 3) {
    if (end_idx == start_idx) {
      return prefix_ct;
    }
    prefix_ct += PopcountWord(bitvec[start_idx++]);
  }
#  else
  // SSE vectors span 2 words: at most 1 leading word to peel.
  if (start_idx & 1) {
    if (end_idx == start_idx) {
      return 0;
    }
    prefix_ct = PopcountWord(bitvec[start_idx++]);
  }
#  endif  // USE_AVX2
  return prefix_ct + PopcountWords(&(bitvec[start_idx]), end_idx - start_idx);
}
#else
// 32-bit build: PopcountWords() has no vector-alignment concern here, so no
// peeling is needed.
HEADER_INLINE uintptr_t PopcountWordsNzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
  return PopcountWords(&(bitvec[start_idx]), end_idx - start_idx);
}
#endif
1415
1416 // start_idx == end_idx ok
1417 uintptr_t PopcountBitRange(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx);
1418
// Returns 1 iff (bitvec1 AND bitvec2) is all-zero across word_ct words.
// Vectorized fast path when SSE4.2+ is available; assumes the usual
// vector-alignment convention for bitvec arguments in that case.
HEADER_INLINE uint32_t IntersectionIsEmpty(const uintptr_t* bitvec1, const uintptr_t* bitvec2, uintptr_t word_ct) {
#ifdef USE_SSE42
  const uintptr_t fullvec_ct = word_ct / kWordsPerVec;
# ifdef USE_AVX2
  const __m256i* bitvvec1 = R_CAST(const __m256i*, bitvec1);
  const __m256i* bitvvec2 = R_CAST(const __m256i*, bitvec2);
  for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) {
    // testz returns 1 iff (a & b) is all-zero.
    if (!_mm256_testz_si256(bitvvec1[vidx], bitvvec2[vidx])) {
      return 0;
    }
  }
# else
  const __m128i* bitvvec1 = R_CAST(const __m128i*, bitvec1);
  const __m128i* bitvvec2 = R_CAST(const __m128i*, bitvec2);
  for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) {
    if (!_mm_testz_si128(bitvvec1[vidx], bitvvec2[vidx])) {
      return 0;
    }
  }
# endif
  // Scalar handling of the last (word_ct % kWordsPerVec) words.
  word_ct = word_ct & (kWordsPerVec - 1);
  bitvec1 = R_CAST(const uintptr_t*, &(bitvvec1[fullvec_ct]));
  bitvec2 = R_CAST(const uintptr_t*, &(bitvvec2[fullvec_ct]));
#endif
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    if (bitvec1[widx] & bitvec2[widx]) {
      return 0;
    }
  }
  return 1;
}
1450
1451 // could use testz here, but it's more awkward
UnionIsFull(const uintptr_t * bitarr1,const uintptr_t * bitarr2,uintptr_t bit_ct)1452 HEADER_INLINE uint32_t UnionIsFull(const uintptr_t* bitarr1, const uintptr_t* bitarr2, uintptr_t bit_ct) {
1453 const uintptr_t fullword_ct = bit_ct / kBitsPerWord;
1454 for (uintptr_t widx = 0; widx != fullword_ct; ++widx) {
1455 if ((bitarr1[widx] | bitarr2[widx]) != ~k0LU) {
1456 return 0;
1457 }
1458 }
1459 const uint32_t trailing_bit_ct = bit_ct % kBitsPerWord;
1460 if (trailing_bit_ct) {
1461 if ((bitarr1[fullword_ct] | bitarr2[fullword_ct]) != ((k1LU << trailing_bit_ct) - k1LU)) {
1462 return 0;
1463 }
1464 }
1465 return 1;
1466 }
1467
1468 // PopcountWordsIntersect moved to plink2_base
1469
1470 void PopcountWordsIntersect3val(const uintptr_t* __restrict bitvec1, const uintptr_t* __restrict bitvec2, uint32_t word_ct, uint32_t* __restrict popcount1_ptr, uint32_t* __restrict popcount2_ptr, uint32_t* __restrict popcount_intersect_ptr);
1471
1472 // uintptr_t count_11_longs(const uintptr_t* genovec, uintptr_t word_ct);
1473 // (see CountNyp in pgenlib_misc for that functionality)
1474
1475 // Must be safe to read from bitarr[start_idx / kBitsPerWord].
1476 uint32_t AllBitsAreZero(const uintptr_t* bitarr, uintptr_t start_idx, uintptr_t end_idx);
1477
1478 // does not assume relevant bits of target_bitarr are zero
// Copies the low `len` bits of src_word into target_bitarr starting at bit
// target_start_bitidx, preserving all surrounding bits; the destination may
// straddle two words.  len is at most kBitsPerWord (src_word is one word).
// NOTE(review): the single-word path ORs in all of shifted_new_bits, so this
// appears to assume bits of src_word at positions >= len are zero -- confirm.
HEADER_INLINE void CopyBits(uintptr_t src_word, uint64_t target_start_bitidx, uint32_t len, uintptr_t* target_bitarr) {
  uintptr_t* update_ptr = &(target_bitarr[target_start_bitidx / kBitsPerWord]);
  uintptr_t target_word = *update_ptr;
  const uint32_t bit_idx_start = target_start_bitidx % kBitsPerWord;
  const uintptr_t shifted_new_bits = src_word << bit_idx_start;
  const uint32_t bit_idx_end = bit_idx_start + len;
  if (bit_idx_end <= kBitsPerWord) {
    // Single-word case: invmask covers [bit_idx_start, bit_idx_end); clear
    // that range, then OR in the shifted source bits.
    const uintptr_t invmask = bzhi_max((~k0LU) << bit_idx_start, bit_idx_end);
    *update_ptr = (target_word & (~invmask)) | shifted_new_bits;
    return;
  }
  // Split case: the low bits of src_word fill the top of the first word...
  *update_ptr++ = bzhi(target_word, bit_idx_start) | shifted_new_bits;
  target_word = *update_ptr;
  // ...and the remaining (bit_idx_end - kBitsPerWord) bits land at the
  // bottom of the second word.
  const uint32_t remainder = bit_idx_end - kBitsPerWord;
  *update_ptr = (target_word & ((~k0LU) << remainder)) | (src_word >> (len - remainder));
}
1495
1496 // assumes len is positive, and relevant bits of target_bitarr are zero
1497 void CopyBitarrRange(const uintptr_t* __restrict src_bitarr, uintptr_t src_start_bitidx, uintptr_t target_start_bitidx, uintptr_t len, uintptr_t* __restrict target_bitarr);
1498
1499
1500 // vertical popcount support
1501 // VcountScramble1() and friends are here since they apply to generic
1502 // arrays; scramble_2_4_8_32() is more plink-specific
1503
// Maps a flat 1-bit-counter index to the final position of its count after
// the vertical-popcount accumulators are widened 1->4, 4->8, and 8->32 bits
// (see the VcountIncr* functions below); the composed per-stage permutations
// are spelled out in the comments.
#ifdef __LP64__
# ifdef USE_AVX2
HEADER_INLINE uint32_t VcountScramble1(uint32_t orig_idx) {
  // 1->4: 0 4 8 12 16 20 24 28 32 ... 252 1 5 9 ...
  // 4->8: 0 8 16 24 32 ... 248 4 12 20 ... 1 9 17 ...
  // 8->32: 0 32 ... 224 8 40 ... 232 ... 248 4 36 ... 252 1 33 ...
  return (orig_idx & (~255)) + ((orig_idx & 3) * 64) + ((orig_idx & 4) * 8) + (orig_idx & 24) + ((orig_idx & 224) / 32);
}
# else
HEADER_INLINE uint32_t VcountScramble1(uint32_t orig_idx) {
  // 1->4: 0 4 8 12 16 20 24 28 32 ... 124 1 5 9 ...
  // 4->8: 0 8 16 24 32 ... 120 4 12 20 ... 1 9 17 ...
  // 8->32: 0 32 64 96 8 40 72 104 16 48 80 112 24 56 88 120 4 36 68 ... 1 33 ...
  return (orig_idx & (~127)) + ((orig_idx & 3) * 32) + ((orig_idx & 4) * 4) + ((orig_idx & 24) / 2) + ((orig_idx & 96) / 32);
}
# endif
#else
// 32-bit build: permutation acts within 32-element groups.
// 1->4: 0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 ...
// 4->8: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18 ...
// 8->32: 0 8 16 24 4 12 20 28 1 9 17 25 5 13 21 29 2 10 18 ...
HEADER_INLINE uint32_t VcountScramble1(uint32_t orig_idx) {
  return (orig_idx & (~31)) + ((orig_idx & 3) * 8) + (orig_idx & 4) + ((orig_idx & 24) / 8);
}
#endif
1528
1529 // change acc1 type to const void* if a VecW* use case arises
VcountIncr1To4(const uintptr_t * acc1,uint32_t acc1_vec_ct,VecW * acc4_iter)1530 HEADER_INLINE void VcountIncr1To4(const uintptr_t* acc1, uint32_t acc1_vec_ct, VecW* acc4_iter) {
1531 const VecW m1x4 = VCONST_W(kMask1111);
1532 const VecW* acc1_iter = R_CAST(const VecW*, acc1);
1533 for (uint32_t vidx = 0; vidx != acc1_vec_ct; ++vidx) {
1534 VecW loader = *acc1_iter++;
1535 *acc4_iter = (*acc4_iter) + (loader & m1x4);
1536 ++acc4_iter;
1537 loader = vecw_srli(loader, 1);
1538 *acc4_iter = (*acc4_iter) + (loader & m1x4);
1539 ++acc4_iter;
1540 loader = vecw_srli(loader, 1);
1541 *acc4_iter = (*acc4_iter) + (loader & m1x4);
1542 ++acc4_iter;
1543 loader = vecw_srli(loader, 1);
1544 *acc4_iter = (*acc4_iter) + (loader & m1x4);
1545 ++acc4_iter;
1546 }
1547 }
1548
Vcount0Incr1To4(uint32_t acc1_vec_ct,uintptr_t * acc1,VecW * acc4_iter)1549 HEADER_INLINE void Vcount0Incr1To4(uint32_t acc1_vec_ct, uintptr_t* acc1, VecW* acc4_iter) {
1550 const VecW m1x4 = VCONST_W(kMask1111);
1551 VecW* acc1_iter = R_CAST(VecW*, acc1);
1552 for (uint32_t vidx = 0; vidx != acc1_vec_ct; ++vidx) {
1553 VecW loader = *acc1_iter;
1554 *acc1_iter++ = vecw_setzero();
1555 *acc4_iter = (*acc4_iter) + (loader & m1x4);
1556 ++acc4_iter;
1557 loader = vecw_srli(loader, 1);
1558 *acc4_iter = (*acc4_iter) + (loader & m1x4);
1559 ++acc4_iter;
1560 loader = vecw_srli(loader, 1);
1561 *acc4_iter = (*acc4_iter) + (loader & m1x4);
1562 ++acc4_iter;
1563 loader = vecw_srli(loader, 1);
1564 *acc4_iter = (*acc4_iter) + (loader & m1x4);
1565 ++acc4_iter;
1566 }
1567 }
1568
1569 // er, should this just be the same function as unroll_incr_2_4 with extra
1570 // parameters?...
VcountIncr4To8(const VecW * acc4_iter,uint32_t acc4_vec_ct,VecW * acc8_iter)1571 HEADER_INLINE void VcountIncr4To8(const VecW* acc4_iter, uint32_t acc4_vec_ct, VecW* acc8_iter) {
1572 const VecW m4 = VCONST_W(kMask0F0F);
1573 for (uint32_t vidx = 0; vidx != acc4_vec_ct; ++vidx) {
1574 VecW loader = *acc4_iter++;
1575 *acc8_iter = (*acc8_iter) + (loader & m4);
1576 ++acc8_iter;
1577 loader = vecw_srli(loader, 4);
1578 *acc8_iter = (*acc8_iter) + (loader & m4);
1579 ++acc8_iter;
1580 }
1581 }
1582
Vcount0Incr4To8(uint32_t acc4_vec_ct,VecW * acc4_iter,VecW * acc8_iter)1583 HEADER_INLINE void Vcount0Incr4To8(uint32_t acc4_vec_ct, VecW* acc4_iter, VecW* acc8_iter) {
1584 const VecW m4 = VCONST_W(kMask0F0F);
1585 for (uint32_t vidx = 0; vidx != acc4_vec_ct; ++vidx) {
1586 VecW loader = *acc4_iter;
1587 *acc4_iter++ = vecw_setzero();
1588 *acc8_iter = (*acc8_iter) + (loader & m4);
1589 ++acc8_iter;
1590 loader = vecw_srli(loader, 4);
1591 *acc8_iter = (*acc8_iter) + (loader & m4);
1592 ++acc8_iter;
1593 }
1594 }
1595
VcountIncr8To32(const VecW * acc8_iter,uint32_t acc8_vec_ct,VecW * acc32_iter)1596 HEADER_INLINE void VcountIncr8To32(const VecW* acc8_iter, uint32_t acc8_vec_ct, VecW* acc32_iter) {
1597 const VecW m8x32 = VCONST_W(kMask000000FF);
1598 for (uint32_t vidx = 0; vidx != acc8_vec_ct; ++vidx) {
1599 VecW loader = *acc8_iter++;
1600 *acc32_iter = (*acc32_iter) + (loader & m8x32);
1601 ++acc32_iter;
1602 loader = vecw_srli(loader, 8);
1603 *acc32_iter = (*acc32_iter) + (loader & m8x32);
1604 ++acc32_iter;
1605 loader = vecw_srli(loader, 8);
1606 *acc32_iter = (*acc32_iter) + (loader & m8x32);
1607 ++acc32_iter;
1608 loader = vecw_srli(loader, 8);
1609 *acc32_iter = (*acc32_iter) + (loader & m8x32);
1610 ++acc32_iter;
1611 }
1612 }
1613
Vcount0Incr8To32(uint32_t acc8_vec_ct,VecW * acc8_iter,VecW * acc32_iter)1614 HEADER_INLINE void Vcount0Incr8To32(uint32_t acc8_vec_ct, VecW* acc8_iter, VecW* acc32_iter) {
1615 const VecW m8x32 = VCONST_W(kMask000000FF);
1616 for (uint32_t vidx = 0; vidx != acc8_vec_ct; ++vidx) {
1617 VecW loader = *acc8_iter;
1618 *acc8_iter++ = vecw_setzero();
1619 *acc32_iter = (*acc32_iter) + (loader & m8x32);
1620 ++acc32_iter;
1621 loader = vecw_srli(loader, 8);
1622 *acc32_iter = (*acc32_iter) + (loader & m8x32);
1623 ++acc32_iter;
1624 loader = vecw_srli(loader, 8);
1625 *acc32_iter = (*acc32_iter) + (loader & m8x32);
1626 ++acc32_iter;
1627 loader = vecw_srli(loader, 8);
1628 *acc32_iter = (*acc32_iter) + (loader & m8x32);
1629 ++acc32_iter;
1630 }
1631 }
1632
1633
1634 // forward_ct must be positive. Stays put if forward_ct == 1 and current bit
1635 // is set.
1636 // In usual 64-bit case, also assumes bitvec is vector aligned.
1637 uintptr_t FindNth1BitFrom(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct);
1638
IdxToUidxBasic(const uintptr_t * bitvec,uint32_t idx)1639 HEADER_INLINE uint32_t IdxToUidxBasic(const uintptr_t* bitvec, uint32_t idx) {
1640 return FindNth1BitFrom(bitvec, 0, idx + 1);
1641 }
1642
1643 // variant_ct must be positive, but can be smaller than thread_ct
1644 void ComputeUidxStartPartition(const uintptr_t* variant_include, uint64_t variant_ct, uint32_t thread_ct, uint32_t first_uidx, uint32_t* variant_uidx_starts);
1645
1646 // Set multiplier to 0 to only count extra alleles, 1 to also count alt1 for
1647 // those variants (useful for HWE), 2 to count both ref and alt1 for
1648 // multiallelic variants.
1649 // allele_idx_offsets == nullptr ok.
1650 uintptr_t CountExtraAlleles(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t multiallelic_variant_ct_multiplier);
1651
1652 uint32_t MaxAlleleCtSubset(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct);
1653
1654 void ComputePartitionAligned(const uintptr_t* variant_include, uint32_t orig_thread_ct, uint32_t first_variant_uidx, uint32_t cur_variant_idx, uint32_t cur_variant_ct, uint32_t alignment, uint32_t* variant_uidx_starts, uint32_t* vidx_starts);
1655
1656 BoolErr ParseNextRange(const char* const* argvk, uint32_t param_ct, char range_delim, uint32_t* cur_param_idx_ptr, const char** cur_arg_pptr, const char** range_start_ptr, uint32_t* rs_len_ptr, const char** range_end_ptr, uint32_t* re_len_ptr);
1657
1658 PglErr ParseNameRanges(const char* const* argvk, const char* errstr_append, uint32_t param_ct, uint32_t require_posint, char range_delim, RangeList* range_list_ptr);
1659
1660
1661 // Analytically finds all real roots of x^3 + ax^2 + bx + c, saving them in
1662 // solutions[] (sorted from smallest to largest), and returning the count.
1663 // Multiple roots are only returned/counted once.
1664 uint32_t CubicRealRoots(double coef_a, double coef_b, double coef_c, STD_ARRAY_REF(double, 3) solutions);
1665
1666
1667 // store_all_dups currently must be true for dup_ct to be set, but this is easy
1668 // to change
1669 PglErr PopulateIdHtableMt(unsigned char* arena_top, const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t store_all_dups, uint32_t id_htable_size, uint32_t thread_ct, unsigned char** arena_bottom_ptr, uint32_t* id_htable, uint32_t* dup_ct_ptr);
1670
1671 PglErr CheckIdUniqueness(unsigned char* arena_bottom, unsigned char* arena_top, const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t max_thread_ct, uint32_t* dup_found_ptr);
1672
1673 // Pass in htable_dup_base_ptr == nullptr if just flagging duplicate IDs rather
1674 // than tracking all their positions in item_ids.
1675 // Otherwise, htable_dup_base entries are guaranteed to be filled in increasing
1676 // order (briefly made that nondeterministic on 11 Oct 2019 and broke --rm-dup,
1677 // not doing that again).
1678 PglErr AllocAndPopulateIdHtableMt(const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uintptr_t fast_size_min_extra_bytes, uint32_t max_thread_ct, uint32_t** id_htable_ptr, uint32_t** htable_dup_base_ptr, uint32_t* id_htable_size_ptr, uint32_t* dup_ct_ptr);
1679
1680
// Shared state for HelpPrint(): tracks which help-request arguments have
// been matched against flag names across the display passes.
typedef struct HelpCtrlStruct {
  NONCOPYABLE(HelpCtrlStruct);
  uint32_t iters_left;  // remaining matching/display passes
  uint32_t param_ct;  // number of help-request arguments
  const char* const* argv;  // the help-request arguments themselves
  uintptr_t unmatched_ct;  // arguments not yet matched by any flag
  // Per-argument match bitvectors; presumably any-match / prefix-match /
  // exact-match respectively -- confirm against HelpPrint()'s implementation.
  uintptr_t* all_match_arr;
  uintptr_t* prefix_match_arr;
  uintptr_t* perfect_match_arr;
  uint32_t* param_slens;  // cached strlen of each argument
  uint32_t preprint_newline;  // nonzero if a blank line is pending before output
} HelpCtrl;
1693
1694 void HelpPrint(const char* cur_params, HelpCtrl* help_ctrl_ptr, uint32_t postprint_newline, const char* payload);
1695
1696
1697 extern const char kErrstrNomem[];
1698 extern const char kErrstrWrite[];
1699 extern const char kErrstrThreadCreate[];
1700
1701 // assumes logfile is open
1702 void DispExitMsg(PglErr reterr);
1703
1704 BoolErr CheckExtraParam(const char* const* argvk, const char* permitted_modif, uint32_t* other_idx_ptr);
1705
1706 char ExtractCharParam(const char* ss);
1707
1708 PglErr CmdlineAllocString(const char* source, const char* flag_name, uint32_t max_slen, char** sbuf_ptr);
1709
1710 PglErr AllocFname(const char* source, const char* flagname_p, uint32_t extra_size, char** fnbuf_ptr);
1711
1712 PglErr AllocAndFlatten(const char* const* sources, uint32_t param_ct, uint32_t max_blen, char** flattened_buf_ptr);
1713
1714
// Ownership bundle for command-line preprocessing buffers; populated by
// CmdlineParsePhase1()/CmdlineParsePhase2() and released by
// CleanupPlink2CmdlineMeta().  All pointers start as nullptr.
typedef struct Plink2CmdlineMetaStruct {
  NONCOPYABLE(Plink2CmdlineMetaStruct);
  // need to be able to assign this to argv, so don't make it const char**
  char** subst_argv;  // replacement argv after --script/--rerun expansion
  char* script_buf;  // backing buffer for --script arguments, if present
  char* rerun_buf;  // backing buffer for --rerun arguments, if present
  char* flag_buf;  // flag-name storage, max_flag_blen bytes per entry
  // flag_map presumably maps sorted-flag index back to the original argv
  // position -- confirm against CmdlineParsePhase2().
  uint32_t* flag_map;
} Plink2CmdlineMeta;
1725
1726 void PreinitPlink2CmdlineMeta(Plink2CmdlineMeta* pcmp);
1727
1728 // Handles --script, --rerun, --help, --version, and --silent.
1729 // subst_argv, script_buf, and rerun_buf must be initialized to nullptr.
1730 PglErr CmdlineParsePhase1(const char* ver_str, const char* ver_str2, const char* prog_name_str, const char* notestr_null_calc2, const char* cmdline_format_str, const char* errstr_append, uint32_t max_flag_blen, PglErr(* disp_help_fn)(const char* const*, uint32_t), int* argc_ptr, char*** argv_ptr, Plink2CmdlineMeta* pcmp, uint32_t* first_arg_idx_ptr, uint32_t* flag_ct_ptr);
1731
1732 // Assumes CmdlineParsePhase1() has completed, flag names have been copied to
1733 // flag_buf/flag_map, aliases handled, and PROG_NAME_STR has been copied to
1734 // outname (no null-terminator needed). outname_end must be initialized to
1735 // nullptr.
1736 // This sorts the flag names so they're processed in a predictable order,
1737 // handles --d and --out if present, initializes the log, and determines the
1738 // number of processors the OS wants us to think the machine has.
1739 PglErr CmdlineParsePhase2(const char* ver_str, const char* errstr_append, const char* const* argvk, uint32_t prog_name_str_slen, uint32_t max_flag_blen, int32_t argc, uint32_t flag_ct, Plink2CmdlineMeta* pcmp, char* outname, char** outname_end_ptr, char* range_delim_ptr, int32_t* known_procs_ptr, uint32_t* max_thread_ct_ptr);
1740
// Buffers the standard unrecognized-flag error message for cur_arg via
// logpreprintfww() (with line-wrapping).
HEADER_INLINE void InvalidArg(const char* cur_arg) {
  logpreprintfww("Error: Unrecognized flag ('%s').\n", cur_arg);
}
1744
1745 PglErr CmdlineParsePhase3(uintptr_t max_default_mib, uintptr_t malloc_size_mib, uint32_t memory_require, Plink2CmdlineMeta* pcmp, unsigned char** bigstack_ua_ptr);
1746
1747 void CleanupPlink2CmdlineMeta(Plink2CmdlineMeta* pcmp);
1748
// order is such that (kCmpOperatorEq - x) is the inverse of x; e.g. the
// inverse of kCmpOperatorLe (1, "<") is kCmpOperatorGeq (4, ">=").
ENUM_U31_DEF_START()
  kCmpOperatorNoteq,
  kCmpOperatorLe,
  kCmpOperatorLeq,

  kCmpOperatorGe,
  kCmpOperatorGeq,
  kCmpOperatorEq
ENUM_U31_DEF_END(CmpBinaryOp);
1759
// Parsed "<key> <operator> <value>" filter expression (--keep-if and
// friends); see ValidateAndAllocCmpExpr().
typedef struct CmpExprStruct {
  // arguably too small for noncopyable to be a great idea, but the next
  // iteration of this struct probably won't have that issue.
  NONCOPYABLE(CmpExprStruct);
  // Restrict to <key> <operator> <value> for now; key=INFO/pheno/covar.
  //
  // Almost certainly want to support conjunctions/disjunctions later; but
  // there are complications regarding the command-line interface:
  // * Phenotype/covariate names and VCFv4.2 INFO keys can contain parenthesis
  //   and mathematical-operator characters. And we actually need to permit at
  //   least the latter, since autogenerated categorical-covariate names
  //   contain '=' for a good reason. Some sort of escaping is probably in
  //   order, but...
  // * Escaping is not a big deal in a shell script, but most
  //   conjunctions/disjunctions can already be emulated easily enough in a
  //   shell script by calling plink2 multiple times. While there's some
  //   execution-time improvement, the main value-add from directly supporting
  //   logical operations in --keep-if/--extract-if-info is reduced
  //   interactive-use cognitive load.
  // * We can't treat multiple command-line arguments differently from the
  //   corresponding space-containing single command-line argument; otherwise
  //   --rerun breaks. So "multiple-argument-form supports keys with special
  //   characters but not conjunctions/disjunctions; single-argument-form is
  //   the other way around" is not an option.
  // The least-bad solution I've thought of is to add --keep-if-file/etc. flags
  // which read the expression from a short text file. In that case, we'd at
  // least be able to define normal quoting and escaping rules without worrying
  // about confusing interactions with bash. Deferring implementation for now
  // in hopes of coming up with a better idea, but this should go in before
  // beta testing begins.

  // Currently stores null-terminated key, followed by null-terminated value
  // string. Storage format needs to be synced with ValidateAndAllocCmpExpr().
  char* pheno_name;
  CmpBinaryOp binary_op;  // comparison relating the key to the value
} CmpExpr;
1796
1797 void InitCmpExpr(CmpExpr* cmp_expr_ptr);
1798
1799 void CleanupCmpExpr(CmpExpr* cmp_expr_ptr);
1800
1801 PglErr ValidateAndAllocCmpExpr(const char* const* sources, const char* flag_name, uint32_t param_ct, CmpExpr* cmp_expr_ptr);
1802
1803 PglErr SearchHeaderLine(const char* header_line_iter, const char* const* search_multistrs, const char* flag_nodash, uint32_t search_col_ct, uint32_t* found_col_ct_ptr, uint32_t* found_type_bitset_ptr, uint32_t* col_skips, uint32_t* col_types);
1804
1805 // col_descriptor is usually a pointer to argv[...][5] (first five characters
1806 // are "cols="). supported_ids is a multistr.
1807 PglErr ParseColDescriptor(const char* col_descriptor_iter, const char* supported_ids, const char* cur_flag_name, uint32_t first_col_shifted, uint32_t default_cols_mask, uint32_t prohibit_empty, void* result_ptr);
1808
1809
1810 // this is technically application-dependent, but let's keep this simple for
1811 // now
1812 // todo: recalibrate these numbers before each beta release
1813 #ifndef __LP64__
1814 // 2047 seems to consistently fail on both OS X and Windows
1815 # ifdef _WIN32
1816 CONSTI32(kMalloc32bitMibMax, 1728);
1817 # else
1818 # ifdef __APPLE__
1819 CONSTI32(kMalloc32bitMibMax, 1888);
1820 # else
1821 CONSTI32(kMalloc32bitMibMax, 2047);
1822 # endif
1823 # endif
1824 #endif
1825
1826
1827
1828 #ifdef __cplusplus
1829 } // namespace plink2
1830 #endif
1831
1832 #endif // __PLINK2_CMDLINE_H__
1833