1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink2_cmdline.h"
19
20 #ifdef __APPLE__
21 // needed for sysctl() call
22 # include <sys/sysctl.h>
23 #endif
24
25 #include <sys/types.h> // open()
26 #include <sys/stat.h> // open()
27 #include <fcntl.h> // open()
28 #include <time.h> // time(), ctime()
29 #include <unistd.h> // getcwd(), gethostname(), sysconf(), fstat()
30
31 #ifdef __cplusplus
32 namespace plink2 {
33 #endif
34
// Shared printf templates for common I/O error messages.
const char kErrprintfFopen[] = "Error: Failed to open %s : %s.\n";
const char kErrprintfFread[] = "Error: %s read failure: %s.\n";
const char kErrprintfRewind[] = "Error: %s could not be scanned twice. (Process-substitution/named-pipe input is not permitted in this use case.)\n";

// General-purpose scratch buffer for short-lived string assembly.
char g_textbuf[kTextbufSize];

// now initialized by init_bigstack
// can't use std::array<char, 512>& since it's initially null
const char* g_one_char_strs = nullptr;
// If one-base indels become sufficiently common, might want to predefine
// g_two_char_strs[], and update allele string construction/destruction
// accordingly. (Though that should either be programmatically initialized, or
// only cover a subset of the space; 192k is a lot to increase the binary image
// size for a single simple table.)

const char* g_input_missing_geno_ptr = nullptr; // in addition to '.'
const char* g_output_missing_geno_ptr = nullptr; // now '.'

FILE* g_logfile = nullptr;

// Staging buffer for messages written via logputsb()/logerrputsb().
char g_logbuf[kLogbufSize];

// Logging state flags; all start clear.
uint32_t g_debug_on = 0;
uint32_t g_log_failed = 0;
uint32_t g_stderr_written_to = 0;
60
// Writes str to the logfile without echoing it to the console.
// Normal mode: a logfile write failure prints a warning to stderr and sets
// g_log_failed.  Debug mode: each successful write is flushed immediately
// (so a crash loses nothing), and once logging has failed, messages are
// redirected to stderr instead.
void logputs_silent(const char* str) {
  if (!g_debug_on) {
    fputs(str, g_logfile);
    if (unlikely(ferror_unlocked(g_logfile))) {
      // Terminate any partial stdout line before warning on stderr.
      putchar('\n');
      fflush(stdout);
      fprintf(stderr, "Warning: Logging failure on:\n%s\nFurther logging will not be attempted in this run.\n", str);
      g_log_failed = 1;
    }
  } else {
    if (g_log_failed) {
      // Log already known-broken; divert to stderr.
      fflush(stdout);
      fputs(str, stderr);
    } else {
      fputs(str, g_logfile);
      if (unlikely(ferror_unlocked(g_logfile))) {
        putchar('\n');
        fflush(stdout);
        fprintf(stderr, "Error: Debug logging failure. Dumping to stderr:\n%s", str);
        g_log_failed = 1;
        g_stderr_written_to = 1;
      } else {
        // Flush after every debug-mode write so the log survives a crash.
        fflush(g_logfile);
      }
    }
  }
}
88
logputs(const char * str)89 void logputs(const char* str) {
90 logputs_silent(str);
91 fputs(str, stdout);
92 }
93
logerrputs(const char * str)94 void logerrputs(const char* str) {
95 logputs_silent(str);
96 fflush(stdout);
97 fputs(str, stderr);
98 g_stderr_written_to = 1;
99 }
100
logputsb()101 void logputsb() {
102 logputs_silent(g_logbuf);
103 fputs(g_logbuf, stdout);
104 }
105
logerrputsb()106 void logerrputsb() {
107 logputs_silent(g_logbuf);
108 fflush(stdout);
109 fputs(g_logbuf, stderr);
110 g_stderr_written_to = 1;
111 }
112
ForceNonFifo(const char * fname)113 PglErr ForceNonFifo(const char* fname) {
114 int32_t file_handle = open(fname, O_RDONLY);
115 if (unlikely(file_handle < 0)) {
116 return kPglRetOpenFail;
117 }
118 struct stat statbuf;
119 if (unlikely(fstat(file_handle, &statbuf) < 0)) {
120 close(file_handle);
121 return kPglRetOpenFail;
122 }
123 if (unlikely(S_ISFIFO(statbuf.st_mode))) {
124 close(file_handle);
125 return kPglRetRewindFail;
126 }
127 close(file_handle);
128 return kPglRetSuccess;
129 }
130
// fopen() wrapper which, on failure, logs the filename and strerror(errno)
// and returns 1.  On success, stores the handle in *target_ptr and returns 0.
BoolErr fopen_checked(const char* fname, const char* mode, FILE** target_ptr) {
  /*
  if (!strcmp(mode, FOPEN_WB)) {
    // The fopen call may take a long time to return when overwriting in mode
    // "w" in some scenarios on some OSes (I've seen 15+ sec).  On the other
    // hand, sometimes it returns immediately and then writing takes less time.
    // Since it goes both ways, I won't second-guess the OSes for now;
    // presumably they're trying to minimize average amortized time.  But it
    // might be worth retesting in the future; perhaps some common plink
    // workflows violate normally-reasonable OS assumptions.
    if (access(fname, W_OK) != -1) {
      if (unlikely(unlink(fname))) {
        logputs("\n");
        logerrprintfww(kErrprintfFopen, fname, strerror(errno));
        return 1;
      }
    }
  }
  */
  *target_ptr = fopen(fname, mode);
  if (unlikely(!(*target_ptr))) {
    // Leading newline terminates any in-progress console status line.
    logputs("\n");
    logerrprintfww(kErrprintfFopen, fname, strerror(errno));
    return 1;
  }
  return 0;
}
158
fwrite_flush2(char * buf_flush,FILE * outfile,char ** write_iter_ptr)159 BoolErr fwrite_flush2(char* buf_flush, FILE* outfile, char** write_iter_ptr) {
160 char* buf = &(buf_flush[-S_CAST(int32_t, kMaxMediumLine)]);
161 char* buf_end = *write_iter_ptr;
162 *write_iter_ptr = buf;
163 return fwrite_checked(buf, buf_end - buf, outfile);
164 }
165
fclose_flush_null(char * buf_flush,char * write_iter,FILE ** outfile_ptr)166 BoolErr fclose_flush_null(char* buf_flush, char* write_iter, FILE** outfile_ptr) {
167 char* buf = &(buf_flush[-S_CAST(int32_t, kMaxMediumLine)]);
168 if (write_iter != buf) {
169 if (unlikely(fwrite_checked(buf, write_iter - buf, *outfile_ptr))) {
170 return 1;
171 }
172 }
173 return fclose_null(outfile_ptr);
174 }
175
176
float_cmp(const void * aa,const void * bb)177 int32_t float_cmp(const void* aa, const void* bb) {
178 const float fxx = *S_CAST(const float*, aa);
179 const float fyy = *S_CAST(const float*, bb);
180 if (fxx < fyy) {
181 return -1;
182 }
183 return (fxx > fyy);
184 }
185
double_cmp(const void * aa,const void * bb)186 int32_t double_cmp(const void* aa, const void* bb) {
187 const double dxx = *S_CAST(const double*, aa);
188 const double dyy = *S_CAST(const double*, bb);
189 if (dxx < dyy) {
190 return -1;
191 }
192 return (dxx > dyy);
193 }
194
double_cmp_decr(const void * aa,const void * bb)195 int32_t double_cmp_decr(const void* aa, const void* bb) {
196 const double dxx = *S_CAST(const double*, aa);
197 const double dyy = *S_CAST(const double*, bb);
198 if (dxx > dyy) {
199 return -1;
200 }
201 return (dxx < dyy);
202 }
203
u64cmp(const void * aa,const void * bb)204 int32_t u64cmp(const void* aa, const void* bb) {
205 const uint64_t ullaa = *S_CAST(const uint64_t*, aa);
206 const uint64_t ullbb = *S_CAST(const uint64_t*, bb);
207 if (ullaa < ullbb) {
208 return -1;
209 }
210 return (ullaa > ullbb);
211 }
212
#ifndef __cplusplus
// qsort comparator: uint64_t in descending order (pure-C build only).
int32_t u64cmp_decr(const void* aa, const void* bb) {
  const uint64_t u1 = *S_CAST(const uint64_t*, aa);
  const uint64_t u2 = *S_CAST(const uint64_t*, bb);
  return (u1 < u2) - (u1 > u2);
}
#endif
223
224 #ifdef __cplusplus
DestructiveMedianF(uintptr_t len,float * unsorted_arr)225 float DestructiveMedianF(uintptr_t len, float* unsorted_arr) {
226 if (!len) {
227 return 0.0;
228 }
229 const uintptr_t len_d2 = len / 2;
230 std::nth_element(unsorted_arr, &(unsorted_arr[len_d2]), &(unsorted_arr[len]));
231 const float median_upper = unsorted_arr[len_d2];
232 if (len % 2) {
233 return median_upper;
234 }
235 return (FArrMax(unsorted_arr, len_d2) + median_upper) * 0.5f;
236 }
237
DestructiveMedianD(uintptr_t len,double * unsorted_arr)238 double DestructiveMedianD(uintptr_t len, double* unsorted_arr) {
239 if (!len) {
240 return 0.0;
241 }
242 const uintptr_t len_d2 = len / 2;
243 std::nth_element(unsorted_arr, &(unsorted_arr[len_d2]), &(unsorted_arr[len]));
244 const double median_upper = unsorted_arr[len_d2];
245 if (len % 2) {
246 return median_upper;
247 }
248 return (DArrMax(unsorted_arr, len_d2) + median_upper) * 0.5;
249 }
250 #else
251 // these will probably be used in __cplusplus case too
// Median of a presorted float array; returns 0 if the array is empty.
float MedianF(const float* sorted_arr, uintptr_t len) {
  if (!len) {
    return 0.0;
  }
  const uintptr_t half = len / 2;
  if (len % 2) {
    return sorted_arr[half];
  }
  return (sorted_arr[half] + sorted_arr[half - 1]) * 0.5f;
}
261
// Median of a presorted double array; returns 0 if the array is empty.
double MedianD(const double* sorted_arr, uintptr_t len) {
  if (!len) {
    return 0.0;
  }
  const uintptr_t half = len / 2;
  if (len % 2) {
    return sorted_arr[half];
  }
  return (sorted_arr[half] + sorted_arr[half - 1]) * 0.5;
}
271
DestructiveMedianF(uintptr_t len,float * unsorted_arr)272 float DestructiveMedianF(uintptr_t len, float* unsorted_arr) {
273 // no, I'm not gonna bother reimplementing introselect just for folks who
274 // insist on compiling this as pure C instead of C++
275 qsort(unsorted_arr, len, sizeof(float), float_cmp);
276 return MedianF(unsorted_arr, len);
277 }
278
DestructiveMedianD(uintptr_t len,double * unsorted_arr)279 double DestructiveMedianD(uintptr_t len, double* unsorted_arr) {
280 qsort(unsorted_arr, len, sizeof(double), double_cmp);
281 return MedianD(unsorted_arr, len);
282 }
283 #endif
284
285
286 // Overread must be ok.
// Sorts the str_ct strings in strbox (fixed slot width max_str_blen bytes),
// using scratch space from the main workspace; the workspace watermark is
// restored before returning.  id_map is passed through to
// SortStrboxIndexed2 (presumably permuted alongside the strings -- see its
// definition).  use_nsort selects natural-sort order.  Returns 1 on
// out-of-memory.  Strings may be overread, so the buffer needs padding.
BoolErr SortStrboxIndexed(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map) {
  if (str_ct < 2) {
    // 0 or 1 strings are already sorted.
    return 0;
  }
  unsigned char* bigstack_mark = g_bigstack_base;
  const uintptr_t wkspace_entry_blen = GetStrboxsortWentryBlen(max_str_blen);
  unsigned char* sort_wkspace;
  if (unlikely(bigstack_alloc_uc(str_ct * wkspace_entry_blen, &sort_wkspace))) {
    return 1;
  }
  SortStrboxIndexed2(str_ct, max_str_blen, use_nsort, strbox, id_map, sort_wkspace);
  // Release the scratch allocation.
  BigstackReset(bigstack_mark);
  return 0;
}
301
302 /*
303 BoolErr StrptrArrIndexedSort(const char* const* unsorted_strptrs, uint32_t str_ct, uint32_t overread_ok, uint32_t use_nsort, uint32_t* id_map) {
304 if (str_ct < 2) {
305 if (str_ct) {
306 id_map[0] = 0;
307 }
308 return 0;
309 }
310 if (bigstack_left() < str_ct * sizeof(StrSortIndexedDeref)) {
311 return 1;
312 }
313 StrSortIndexedDeref* wkspace_alias = (StrSortIndexedDeref*)g_bigstack_base;
314 for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
315 wkspace_alias[str_idx].strptr = unsorted_strptrs[str_idx];
316 wkspace_alias[str_idx].orig_idx = str_idx;
317 }
318 StrptrArrSortMain(str_ct, overread_ok, use_nsort, wkspace_alias);
319 for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
320 id_map[str_idx] = wkspace_alias[str_idx].orig_idx;
321 }
322 BigstackReset(wkspace_alias);
323 return 0;
324 }
325 */
326
327
// Returns the number of elements in sorted_u32_arr which are strictly
// smaller than needle (equivalently, the lower_bound index).
uint32_t CountSortedSmallerU32(const uint32_t* sorted_u32_arr, uint32_t arr_length, uint32_t needle) {
  // (strangely, this seems to be equal to or better than std::lower_bound with
  // -O2 optimization, but can become much slower with -O3?)

  // assumes arr_length is nonzero, and sorted_u32_arr is in nondecreasing
  // order.  (useful for searching variant_bps[].)
  // needle guaranteed to be larger than sorted_u32_arr[min_idx - 1] if it
  // exists, but NOT necessarily sorted_u32_arr[min_idx].
  int32_t min_idx = 0;
  // similarly, needle guaranteed to be no greater than
  // sorted_u32_arr[max_idx + 1] if it exists, but not necessarily
  // sorted_u32_arr[max_idx].  Signed integer since it could become -1, and
  // min_idx in turn is signed so comparisons are safe.
  int32_t max_idx = arr_length - 1;
  while (min_idx < max_idx) {
    const uint32_t mid_idx = (S_CAST(uint32_t, min_idx) + S_CAST(uint32_t, max_idx)) / 2;
    if (needle > sorted_u32_arr[mid_idx]) {
      min_idx = mid_idx + 1;
    } else {
      max_idx = mid_idx - 1;
    }
  }
  // Loop can end with min_idx pointing at either side of the boundary; the
  // final comparison resolves it.
  return min_idx + (needle > sorted_u32_arr[S_CAST(uint32_t, min_idx)]);
}
352
// Exponential (galloping) search: returns the index of the first element of
// sorted_u32_arr[0..end_idx) which is >= needle, or end_idx if none.
// Fast when the answer is near the front of the array.
uint32_t ExpsearchU32(const uint32_t* sorted_u32_arr, uint32_t end_idx, uint32_t needle) {
  uint32_t lo = 0;
  uint32_t hi = end_idx;
  // Gallop forward with doubling stride until we overshoot (or hit a value
  // >= needle, which tightens the upper bound).
  uint32_t probe = 0;
  for (uint32_t stride = 1; probe < hi; stride *= 2) {
    if (sorted_u32_arr[probe] >= needle) {
      hi = probe;
      break;
    }
    lo = probe + 1;
    probe += stride;
  }
  // Ordinary binary search over the surviving [lo, hi) window.
  while (lo < hi) {
    // this breaks if arr_length > 2^31
    const uint32_t halfway = (lo + hi) / 2;
    if (sorted_u32_arr[halfway] < needle) {
      lo = halfway + 1;
    } else {
      hi = halfway;
    }
  }
  return lo;
}
378
CountSortedSmallerU64(const uint64_t * sorted_u64_arr,uintptr_t arr_length,uint64_t needle)379 uintptr_t CountSortedSmallerU64(const uint64_t* sorted_u64_arr, uintptr_t arr_length, uint64_t needle) {
380 intptr_t min_idx = 0;
381 intptr_t max_idx = arr_length - 1;
382 while (min_idx < max_idx) {
383 const uintptr_t mid_idx = (S_CAST(uintptr_t, min_idx) + S_CAST(uintptr_t, max_idx)) / 2;
384 if (needle > sorted_u64_arr[mid_idx]) {
385 min_idx = mid_idx + 1;
386 } else {
387 max_idx = mid_idx - 1;
388 }
389 }
390 return min_idx + (needle > sorted_u64_arr[S_CAST(uintptr_t, min_idx)]);
391 }
392
CountSortedSmallerD(const double * sorted_dbl_arr,uintptr_t arr_length,double needle)393 uintptr_t CountSortedSmallerD(const double* sorted_dbl_arr, uintptr_t arr_length, double needle) {
394 intptr_t min_idx = 0;
395 intptr_t max_idx = arr_length - 1;
396 while (min_idx < max_idx) {
397 const uintptr_t mid_idx = (S_CAST(uintptr_t, min_idx) + S_CAST(uintptr_t, max_idx)) / 2;
398 if (needle > sorted_dbl_arr[mid_idx]) {
399 min_idx = mid_idx + 1;
400 } else {
401 max_idx = mid_idx - 1;
402 }
403 }
404 return min_idx + (needle > sorted_dbl_arr[S_CAST(uintptr_t, min_idx)]);
405 }
406
CountSortedLeqU64(const uint64_t * sorted_u64_arr,uintptr_t arr_length,uint64_t needle)407 uintptr_t CountSortedLeqU64(const uint64_t* sorted_u64_arr, uintptr_t arr_length, uint64_t needle) {
408 intptr_t min_idx = 0;
409 intptr_t max_idx = arr_length - 1;
410 while (min_idx < max_idx) {
411 const uintptr_t mid_idx = (S_CAST(uintptr_t, min_idx) + S_CAST(uintptr_t, max_idx)) / 2;
412 if (needle >= sorted_u64_arr[mid_idx]) {
413 min_idx = mid_idx + 1;
414 } else {
415 max_idx = mid_idx - 1;
416 }
417 }
418 return min_idx + (needle >= sorted_u64_arr[S_CAST(uintptr_t, min_idx)]);
419 }
420
GetParamCt(const char * const * argvk,uint32_t argc,uint32_t flag_idx)421 uint32_t GetParamCt(const char* const* argvk, uint32_t argc, uint32_t flag_idx) {
422 // Counts the number of optional arguments given to the flag at position
423 // flag_idx, treating any nonnumeric argument beginning with "-" as optional.
424 ++flag_idx;
425 uint32_t cur_idx = flag_idx;
426 while ((cur_idx < argc) && (!IsCmdlineFlag(argvk[cur_idx]))) {
427 ++cur_idx;
428 }
429 return cur_idx - flag_idx;
430 }
431
EnforceParamCtRange(const char * flag_name,uint32_t param_ct,uint32_t min_ct,uint32_t max_ct)432 BoolErr EnforceParamCtRange(const char* flag_name, uint32_t param_ct, uint32_t min_ct, uint32_t max_ct) {
433 if (unlikely(param_ct > max_ct)) {
434 if (max_ct > min_ct) {
435 snprintf(g_logbuf, kLogbufSize, "Error: %s accepts at most %u argument%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
436 } else {
437 snprintf(g_logbuf, kLogbufSize, "Error: %s only accepts %u argument%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
438 }
439 return 1;
440 }
441 if (likely(param_ct >= min_ct)) {
442 return 0;
443 }
444 if (min_ct == 1) {
445 snprintf(g_logbuf, kLogbufSize, "Error: Missing %s argument.\n", flag_name);
446 } else {
447 snprintf(g_logbuf, kLogbufSize, "Error: %s requires %s%u arguments.\n", flag_name, (min_ct < max_ct)? "at least " : "", min_ct);
448 }
449 return 1;
450 }
451
SortCmdlineFlags(uint32_t max_flag_blen,uint32_t flag_ct,char * flag_buf,uint32_t * flag_map)452 PglErr SortCmdlineFlags(uint32_t max_flag_blen, uint32_t flag_ct, char* flag_buf, uint32_t* flag_map) {
453 // Assumes flag_ct is the number of flag (as opposed to value) arguments,
454 // flag_buf[] points to a rectangular char* array (width max_flag_blen) of
455 // flag names with leading dash(es) stripped, and flag_map[] maps flag_buf[]
456 // entries to argv[] entries.
457 // Lexicographically sorts flag_buf (updating flag_map in the process), and
458 // then checks for duplicates.
459 // Okay for flag_buf to contain entries with spaces (plink 1.9's alias
460 // resolution takes advantage of this).
461 assert(flag_ct); // this must be skipped if there are no flags at all
462 if (unlikely(SortStrboxIndexedMalloc(flag_ct, max_flag_blen, flag_buf, flag_map))) {
463 return kPglRetNomem;
464 }
465 uint32_t prev_flag_len = strlen_se(flag_buf);
466 char* prev_flag_ptr = flag_buf;
467 for (uint32_t cur_flag_idx = 1; cur_flag_idx != flag_ct; ++cur_flag_idx) {
468 char* cur_flag_ptr = &(prev_flag_ptr[max_flag_blen]);
469 const uint32_t cur_flag_len = strlen_se(cur_flag_ptr);
470 if (unlikely((prev_flag_len == cur_flag_len) && memequal(prev_flag_ptr, cur_flag_ptr, cur_flag_len))) {
471 cur_flag_ptr[cur_flag_len] = '\0'; // just in case of aliases
472 fflush(stdout);
473 fprintf(stderr, "Error: Duplicate --%s flag.\n", cur_flag_ptr);
474 // g_stderr_written_to = 1;
475 return kPglRetInvalidCmdline;
476 }
477 prev_flag_ptr = cur_flag_ptr;
478 prev_flag_len = cur_flag_len;
479 }
480 return kPglRetSuccess;
481 }
482
InitLogfile(uint32_t always_stderr,char * outname,char * outname_end)483 PglErr InitLogfile(uint32_t always_stderr, char* outname, char* outname_end) {
484 snprintf(outname_end, kMaxOutfnameExtBlen, ".log");
485 g_logfile = fopen(outname, "w");
486 if (unlikely(!g_logfile)) {
487 fflush(stdout);
488 fprintf(stderr, "Error: Failed to open %s for logging: %s.\n", outname, strerror(errno));
489 // g_stderr_written_to = 1;
490 return kPglRetOpenFail;
491 }
492 fprintf(always_stderr? stderr : stdout, "Logging to %s.\n", outname);
493 return kPglRetSuccess;
494 }
495
// Appends a final "End time: <ctime>" entry to the log (echoed to stdout
// when print_end_time is set), then closes the logfile and nulls g_logfile.
// Returns 1 only if the final log write/close fails; when logging already
// failed earlier in the run, the close is best-effort and silent.
BoolErr CleanupLogfile(uint32_t print_end_time) {
  char* write_iter = strcpya_k(g_logbuf, "End time: ");
  time_t rawtime;
  time(&rawtime);
  write_iter = Stpcpy(write_iter, ctime(&rawtime));  // has trailing \n
  if (print_end_time) {
    fputs(g_logbuf, stdout);
  }
  BoolErr ret_boolerr = 0;
  if (g_logfile) {
    if (!g_log_failed) {
      logputs_silent("\n");
      logputs_silent(g_logbuf);
      if (unlikely(fclose(g_logfile))) {
        fflush(stdout);
        fprintf(stderr, "Error: Failed to finish writing to log: %s.\n", strerror(errno));
        ret_boolerr = 1;
      }
    } else {
      // Log already known-broken; close quietly without reporting.
      fclose(g_logfile);
    }
    g_logfile = nullptr;
  }
  return ret_boolerr;
}
521
522 // manually managed, very large stack
523 unsigned char* g_bigstack_base = nullptr;
524 unsigned char* g_bigstack_end = nullptr;
525
// Returns total physical memory in MiB, or 0 if detection failed.
uintptr_t DetectMib() {
  int64_t llxx;
  // return zero if detection failed
  // see e.g. http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system .
#ifdef __APPLE__
  // macOS: sysctl(HW_MEMSIZE) reports physical memory in bytes.
  int32_t mib[2];
  mib[0] = CTL_HW;
  mib[1] = HW_MEMSIZE;
  llxx = 0;
  size_t sztmp = sizeof(int64_t);
  sysctl(&mib[0], 2, &llxx, &sztmp, nullptr, 0);
  llxx /= 1048576;
#else
#  ifdef _WIN32
  // Windows: GlobalMemoryStatusEx reports physical memory in bytes.
  MEMORYSTATUSEX memstatus;
  memstatus.dwLength = sizeof(memstatus);
  GlobalMemoryStatusEx(&memstatus);
  llxx = memstatus.ullTotalPhys / 1048576;
#  else
  // Other POSIX: page count times page size.
  llxx = S_CAST(uint64_t, sysconf(_SC_PHYS_PAGES)) * S_CAST(size_t, sysconf(_SC_PAGESIZE)) / 1048576;
#  endif
#endif
  return llxx;
}
550
GetDefaultAllocMib()551 uintptr_t GetDefaultAllocMib() {
552 const uintptr_t total_mib = DetectMib();
553 if (!total_mib) {
554 return kBigstackDefaultMib;
555 }
556 if (total_mib < (kBigstackMinMib * 2)) {
557 return kBigstackMinMib;
558 }
559 return (total_mib / 2);
560 }
561
// Allocates the main workspace ("bigstack"), initializes g_bigstack_base /
// g_bigstack_end, and builds the g_one_char_strs single-character string
// table in the reserved space at the top.  If the requested size cannot be
// allocated, geometrically backs off (x3/4) down to kBigstackMinMib before
// giving up.  Reports the final size via *malloc_mib_final_ptr and the raw
// (unaligned) allocation via *bigstack_ua_ptr so the caller can free it.
PglErr InitBigstack(uintptr_t malloc_size_mib, uintptr_t* malloc_mib_final_ptr, unsigned char** bigstack_ua_ptr) {
  // guarantee contiguous malloc space outside of main workspace
  unsigned char* bubble;

  // this is pointless with overcommit on, may want to conditionally compile
  // this out, and/or conditionally skip this
  if (unlikely(pgl_malloc(kNonBigstackMin, &bubble))) {
    return kPglRetNomem;
  }

  assert(malloc_size_mib >= kBigstackMinMib);
#ifndef __LP64__
  assert(malloc_size_mib <= 2047);
#endif
  // don't use pgl_malloc here since we don't automatically want to set
  // g_failed_alloc_attempt_size on failure
  unsigned char* bigstack_ua = S_CAST(unsigned char*, malloc(malloc_size_mib * 1048576 * sizeof(char)));
  // this is thwarted by overcommit, but still better than nothing...
  while (!bigstack_ua) {
    // Back off geometrically, but never request less than the minimum.
    malloc_size_mib = (malloc_size_mib * 3) / 4;
    if (malloc_size_mib < kBigstackMinMib) {
      malloc_size_mib = kBigstackMinMib;
    }
    bigstack_ua = S_CAST(unsigned char*, malloc(malloc_size_mib * 1048576 * sizeof(char)));
    if (unlikely((!bigstack_ua) && (malloc_size_mib == kBigstackMinMib))) {
      // switch to "goto cleanup" pattern if any more exit points are needed
      g_failed_alloc_attempt_size = kBigstackMinMib * 1048576;
      free(bubble);
      return kPglRetNomem;
    }
  }
  // force 64-byte align to make cache line sensitivity work
  unsigned char* bigstack_initial_base = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, bigstack_ua), kCacheline));
  g_bigstack_base = bigstack_initial_base;
  // last 576 bytes now reserved for g_one_char_strs + overread buffer
  // (256 two-byte entries = 512 bytes, plus 64 bytes of overread padding)
  g_bigstack_end = &(bigstack_initial_base[RoundDownPow2(malloc_size_mib * 1048576 - 576 - S_CAST(uintptr_t, bigstack_initial_base - bigstack_ua), kCacheline)]);
  free(bubble);
  // Fill the table: entry for character c lives at byte offset 2*c and is
  // the NUL-terminated 1-character string {c, '\0'}.
  uintptr_t* one_char_iter = R_CAST(uintptr_t*, g_bigstack_end);
#ifdef __LP64__
  // assumes little-endian
  // Each 64-bit word packs four consecutive 2-byte entries, e.g. the first
  // word 0x0003000200010000 encodes "\0", "\1", "\2", "\3".
  uintptr_t cur_word = 0x3000200010000LLU;
  for (uint32_t uii = 0; uii != 64; ++uii) {
    *one_char_iter++ = cur_word;
    cur_word += 0x4000400040004LLU;
  }
#else
  // 32-bit: each word packs two 2-byte entries.
  uintptr_t cur_word = 0x10000;
  for (uint32_t uii = 0; uii != 128; ++uii) {
    *one_char_iter++ = cur_word;
    cur_word += 0x20002;
  }
#endif
  g_one_char_strs = R_CAST(const char*, g_bigstack_end);

  // plink2 doesn't actually need these here, but short programs using
  // plink2_common benefit from this
  // offset 96 = 2 * 48 -> "0"; offset 92 = 2 * 46 -> "."
  g_input_missing_geno_ptr = &(g_one_char_strs[96]);
  g_output_missing_geno_ptr = &(g_one_char_strs[92]);

  *malloc_mib_final_ptr = malloc_size_mib;
  *bigstack_ua_ptr = bigstack_ua;
  return kPglRetSuccess;
}
625
626
bigstack_calloc_uc(uintptr_t ct,unsigned char ** uc_arr_ptr)627 BoolErr bigstack_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
628 *uc_arr_ptr = S_CAST(unsigned char*, bigstack_alloc(ct));
629 if (unlikely(!(*uc_arr_ptr))) {
630 return 1;
631 }
632 memset(*uc_arr_ptr, 0, ct);
633 return 0;
634 }
635
bigstack_calloc_d(uintptr_t ct,double ** d_arr_ptr)636 BoolErr bigstack_calloc_d(uintptr_t ct, double** d_arr_ptr) {
637 *d_arr_ptr = S_CAST(double*, bigstack_alloc(ct * sizeof(double)));
638 if (unlikely(!(*d_arr_ptr))) {
639 return 1;
640 }
641 ZeroDArr(ct, *d_arr_ptr);
642 return 0;
643 }
644
bigstack_calloc_f(uintptr_t ct,float ** f_arr_ptr)645 BoolErr bigstack_calloc_f(uintptr_t ct, float** f_arr_ptr) {
646 *f_arr_ptr = S_CAST(float*, bigstack_alloc(ct * sizeof(float)));
647 if (unlikely(!(*f_arr_ptr))) {
648 return 1;
649 }
650 ZeroFArr(ct, *f_arr_ptr);
651 return 0;
652 }
653
bigstack_calloc_u16(uintptr_t ct,uint16_t ** u16_arr_ptr)654 BoolErr bigstack_calloc_u16(uintptr_t ct, uint16_t** u16_arr_ptr) {
655 *u16_arr_ptr = S_CAST(uint16_t*, bigstack_alloc(ct * sizeof(int16_t)));
656 if (unlikely(!(*u16_arr_ptr))) {
657 return 1;
658 }
659 memset(*u16_arr_ptr, 0, ct * sizeof(int16_t));
660 return 0;
661 }
662
bigstack_calloc_u32(uintptr_t ct,uint32_t ** u32_arr_ptr)663 BoolErr bigstack_calloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr) {
664 *u32_arr_ptr = S_CAST(uint32_t*, bigstack_alloc(ct * sizeof(int32_t)));
665 if (unlikely(!(*u32_arr_ptr))) {
666 return 1;
667 }
668 ZeroU32Arr(ct, *u32_arr_ptr);
669 return 0;
670 }
671
bigstack_calloc_w(uintptr_t ct,uintptr_t ** w_arr_ptr)672 BoolErr bigstack_calloc_w(uintptr_t ct, uintptr_t** w_arr_ptr) {
673 *w_arr_ptr = S_CAST(uintptr_t*, bigstack_alloc(ct * sizeof(intptr_t)));
674 if (unlikely(!(*w_arr_ptr))) {
675 return 1;
676 }
677 ZeroWArr(ct, *w_arr_ptr);
678 return 0;
679 }
680
bigstack_calloc_u64(uintptr_t ct,uint64_t ** u64_arr_ptr)681 BoolErr bigstack_calloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr) {
682 *u64_arr_ptr = S_CAST(uint64_t*, bigstack_alloc(ct * sizeof(int64_t)));
683 if (unlikely(!(*u64_arr_ptr))) {
684 return 1;
685 }
686 ZeroU64Arr(ct, *u64_arr_ptr);
687 return 0;
688 }
689
bigstack_calloc_cp(uintptr_t ct,char *** cp_arr_ptr)690 BoolErr bigstack_calloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
691 *cp_arr_ptr = S_CAST(char**, bigstack_alloc(ct * sizeof(intptr_t)));
692 if (unlikely(!(*cp_arr_ptr))) {
693 return 1;
694 }
695 ZeroPtrArr(ct, *cp_arr_ptr);
696 return 0;
697 }
698
bigstack_calloc_kcp(uintptr_t ct,const char *** kcp_arr_ptr)699 BoolErr bigstack_calloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr) {
700 *kcp_arr_ptr = S_CAST(const char**, bigstack_alloc(ct * sizeof(intptr_t)));
701 if (unlikely(!(*kcp_arr_ptr))) {
702 return 1;
703 }
704 ZeroPtrArr(ct, *kcp_arr_ptr);
705 return 0;
706 }
707
bigstack_end_calloc_uc(uintptr_t ct,unsigned char ** uc_arr_ptr)708 BoolErr bigstack_end_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
709 *uc_arr_ptr = S_CAST(unsigned char*, bigstack_end_alloc(ct));
710 if (unlikely(!(*uc_arr_ptr))) {
711 return 1;
712 }
713 memset(*uc_arr_ptr, 0, ct);
714 return 0;
715 }
716
bigstack_end_calloc_d(uintptr_t ct,double ** d_arr_ptr)717 BoolErr bigstack_end_calloc_d(uintptr_t ct, double** d_arr_ptr) {
718 *d_arr_ptr = S_CAST(double*, bigstack_end_alloc(ct * sizeof(double)));
719 if (unlikely(!(*d_arr_ptr))) {
720 return 1;
721 }
722 ZeroDArr(ct, *d_arr_ptr);
723 return 0;
724 }
725
bigstack_end_calloc_f(uintptr_t ct,float ** f_arr_ptr)726 BoolErr bigstack_end_calloc_f(uintptr_t ct, float** f_arr_ptr) {
727 *f_arr_ptr = S_CAST(float*, bigstack_end_alloc(ct * sizeof(float)));
728 if (unlikely(!(*f_arr_ptr))) {
729 return 1;
730 }
731 ZeroFArr(ct, *f_arr_ptr);
732 return 0;
733 }
734
bigstack_end_calloc_u32(uintptr_t ct,uint32_t ** u32_arr_ptr)735 BoolErr bigstack_end_calloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr) {
736 *u32_arr_ptr = S_CAST(uint32_t*, bigstack_end_alloc(ct * sizeof(int32_t)));
737 if (unlikely(!(*u32_arr_ptr))) {
738 return 1;
739 }
740 ZeroU32Arr(ct, *u32_arr_ptr);
741 return 0;
742 }
743
bigstack_end_calloc_w(uintptr_t ct,uintptr_t ** w_arr_ptr)744 BoolErr bigstack_end_calloc_w(uintptr_t ct, uintptr_t** w_arr_ptr) {
745 *w_arr_ptr = S_CAST(uintptr_t*, bigstack_end_alloc(ct * sizeof(intptr_t)));
746 if (unlikely(!(*w_arr_ptr))) {
747 return 1;
748 }
749 ZeroWArr(ct, *w_arr_ptr);
750 return 0;
751 }
752
bigstack_end_calloc_u64(uintptr_t ct,uint64_t ** u64_arr_ptr)753 BoolErr bigstack_end_calloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr) {
754 *u64_arr_ptr = S_CAST(uint64_t*, bigstack_end_alloc(ct * sizeof(int64_t)));
755 if (unlikely(!(*u64_arr_ptr))) {
756 return 1;
757 }
758 ZeroU64Arr(ct, *u64_arr_ptr);
759 return 0;
760 }
761
bigstack_end_calloc_cp(uintptr_t ct,char *** cp_arr_ptr)762 BoolErr bigstack_end_calloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
763 *cp_arr_ptr = S_CAST(char**, bigstack_end_alloc(ct * sizeof(intptr_t)));
764 if (unlikely(!(*cp_arr_ptr))) {
765 return 1;
766 }
767 ZeroPtrArr(ct, *cp_arr_ptr);
768 return 0;
769 }
770
771
PushLlStr(const char * str,LlStr ** ll_stack_ptr)772 BoolErr PushLlStr(const char* str, LlStr** ll_stack_ptr) {
773 uintptr_t blen = strlen(str) + 1;
774 LlStr* new_llstr;
775 if (unlikely(pgl_malloc(sizeof(LlStr) + blen, &new_llstr))) {
776 return 1;
777 }
778 new_llstr->next = *ll_stack_ptr;
779 memcpy(new_llstr->str, str, blen);
780 *ll_stack_ptr = new_llstr;
781 return 0;
782 }
783
784 /*
785 BoolErr push_llstr_counted(const char* str, uint32_t slen, LlStr** ll_stack_ptr) {
786 LlStr* new_llstr;
787 if (pgl_malloc(sizeof(LlStr) + slen + 1, &new_llstr)) {
788 return 1;
789 }
790 new_llstr->next = *ll_stack_ptr;
791 memcpy(new_llstr->str, str, slen);
792 new_llstr->str[slen] = '\0';
793 *ll_stack_ptr = new_llstr;
794 return 0;
795 }
796 */
797
798
CountAndMeasureMultistrReverseAlloc(const char * multistr,uintptr_t max_str_ct,uint32_t * str_ct_ptr,uintptr_t * max_blen_ptr,const char *** strptr_arrp)799 BoolErr CountAndMeasureMultistrReverseAlloc(const char* multistr, uintptr_t max_str_ct, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr, const char*** strptr_arrp) {
800 // assumes multistr is nonempty
801 assert(multistr[0]);
802 uintptr_t ct = 0;
803 uintptr_t max_blen = *max_blen_ptr;
804 const char** strptr_arr_iter = *strptr_arrp;
805 do {
806 if (unlikely(++ct > max_str_ct)) {
807 return 1;
808 }
809 const uintptr_t blen = strlen(multistr) + 1;
810 if (blen > max_blen) {
811 max_blen = blen;
812 }
813 *(--strptr_arr_iter) = multistr;
814 multistr = &(multistr[blen]);
815 } while (*multistr);
816 *str_ct_ptr = ct;
817 *max_blen_ptr = max_blen;
818 *strptr_arrp = strptr_arr_iter;
819 return 0;
820 }
821
// Converts a '\0'-delimited multistr into a sorted, deduplicated strbox
// allocated from the bottom of the arena [*arena_bottom_ptr, arena_top).
// A temporary pointer array grows downward from arena_top while the final
// strbox is written at the old arena bottom; *arena_bottom_ptr is advanced
// past it.  Returns 1 if the arena is too small.  On success, stores the
// deduplicated string count in *str_ct_ptr and the slot width in
// *max_blen_ptr.
BoolErr MultistrToStrboxDedupArenaAlloc(unsigned char* arena_top, const char* multistr, unsigned char** arena_bottom_ptr, char** sorted_strbox_ptr, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr) {
  const char** strptr_arr = R_CAST(const char**, arena_top);
  uintptr_t max_str_blen = 0;
  uint32_t str_ct;
  if (unlikely(CountAndMeasureMultistrReverseAlloc(multistr, (arena_top - (*arena_bottom_ptr)) / sizeof(intptr_t), &str_ct, &max_str_blen, &strptr_arr))) {
    return 1;
  }
  const uintptr_t strbox_byte_ct = RoundUpPow2(str_ct * max_str_blen, kCacheline);
  // Verify the strbox fits below the downward-grown pointer array.
  if (unlikely((R_CAST(uintptr_t, strptr_arr) - R_CAST(uintptr_t, *arena_bottom_ptr)) < strbox_byte_ct)) {
    return 1;
  }
  // multistr can currently come from command line parser, which does not
  // guarantee safe overread
  StrptrArrSort(str_ct, strptr_arr);
  *sorted_strbox_ptr = R_CAST(char*, *arena_bottom_ptr);
  // str_ct may shrink here if duplicates were removed.
  str_ct = CopyAndDedupSortedStrptrsToStrbox(strptr_arr, str_ct, max_str_blen, *sorted_strbox_ptr);
  *arena_bottom_ptr += RoundUpPow2(str_ct * max_str_blen, kCacheline);
  *str_ct_ptr = str_ct;
  *max_blen_ptr = max_str_blen;
  return 0;
}
843
844
// Computes magic numbers for fast unsigned division by a runtime constant:
// n / divisor == ((n >> pre_shift) * mult + incr * mult) >> post_shift
// -style evaluation (exact usage defined by the caller).  See
// http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html .
void DivisionMagicNums(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp) {
  // Enables fast integer division by a constant not known until runtime.  See
  // http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html .
  // Assumes divisor is not zero, of course.
  // May want to populate a struct instead.
  assert(divisor);
  if (!(divisor & (divisor - 1))) {
    // power of 2: division is a plain right-shift.
    *multp = 1;
    *pre_shiftp = 0;
    *post_shiftp = ctzu32(divisor);
    *incrp = 0;
    return;
  }
  // Long-division loop: compute successive bits of 2^(32+exponent)/divisor,
  // looking for a "round up" magic number first, and remembering the best
  // "round down" (increment-based) candidate as a fallback.
  uint32_t quotient = 0x80000000U / divisor;
  uint32_t remainder = 0x80000000U - (quotient * divisor);
  const uint32_t ceil_log_2_d = 1 + bsru32(divisor);
  uint32_t down_multiplier = 0;
  uint32_t down_exponent = 0;
  uint32_t has_magic_down = 0;
  for (uint32_t exponent = 0; ; ++exponent) {
    if (remainder >= divisor - remainder) {
      quotient = quotient * 2 + 1;
      remainder = remainder * 2 - divisor;
    } else {
      quotient = quotient * 2;
      remainder = remainder * 2;
    }
    if ((exponent >= ceil_log_2_d) || (divisor - remainder) <= (1U << exponent)) {
      if (exponent < ceil_log_2_d) {
        // Round-up magic number works: no increment needed.
        *multp = quotient + 1;
        *pre_shiftp = 0;
        *post_shiftp = 32 + exponent;
        *incrp = 0;
        return;
      }
      break;
    }
    if ((!has_magic_down) && (remainder <= (1U << exponent))) {
      has_magic_down = 1;
      down_multiplier = quotient;
      down_exponent = exponent;
    }
  }
  if (divisor & 1) {
    // Odd divisor without a round-up magic: use the round-down magic with an
    // increment.
    *multp = down_multiplier;
    *pre_shiftp = 0;
    *post_shiftp = 32 + down_exponent;
    *incrp = 1;
    return;
  }
  // Even divisor: strip trailing zeros with a pre-shift, then recurse on the
  // odd part.
  *pre_shiftp = ctzu32(divisor);
  uint32_t dummy;
  DivisionMagicNums(divisor >> (*pre_shiftp), multp, &dummy, post_shiftp, incrp);
}
900
901
// Sets bits [start_idx, end_idx) in bitarr.  The "Nz" suffix marks the
// nonempty-range precondition: end_idx must exceed start_idx.
void FillBitsNz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr) {
  assert(end_idx > start_idx);
  uintptr_t maj_start = start_idx / kBitsPerWord;
  uintptr_t maj_end = end_idx / kBitsPerWord;
  uintptr_t minor;
  if (maj_start == maj_end) {
    // Range fits in one word; (end_idx % kBitsPerWord) can't be zero here
    // since end_idx > start_idx and both indexes land in the same word.
    bitarr[maj_start] |= (k1LU << (end_idx % kBitsPerWord)) - (k1LU << (start_idx % kBitsPerWord));
  } else {
    // High bits of the first word, all of the middle words, then the low
    // bits of the last word (if any).
    bitarr[maj_start] |= ~((k1LU << (start_idx % kBitsPerWord)) - k1LU);
    SetAllWArr(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
    minor = end_idx % kBitsPerWord;
    if (minor) {
      bitarr[maj_end] |= (k1LU << minor) - k1LU;
    }
  }
}
918
// Clears bits [start_idx, end_idx) in bitarr; mirror image of FillBitsNz.
// Precondition: end_idx > start_idx.
void ClearBitsNz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr) {
  assert(end_idx > start_idx);
  uintptr_t maj_start = start_idx / kBitsPerWord;
  uintptr_t maj_end = end_idx / kBitsPerWord;
  uintptr_t minor;
  if (maj_start == maj_end) {
    // Single-word case: clear bits between the two in-word offsets.
    bitarr[maj_start] &= ~((k1LU << (end_idx % kBitsPerWord)) - (k1LU << (start_idx % kBitsPerWord)));
  } else {
    // bzhi keeps only the low (start_idx % kBitsPerWord) bits of the first
    // word, i.e. clears everything from start_idx to the word boundary.
    bitarr[maj_start] = bzhi(bitarr[maj_start], start_idx % kBitsPerWord);
    ZeroWArr(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
    minor = end_idx % kBitsPerWord;
    if (minor) {
      bitarr[maj_end] &= ~((k1LU << minor) - k1LU);
    }
  }
}
935
// floor permitted to be -1, though not smaller than that.
// Returns the index of the highest set bit strictly below loc, or floor if
// no set bit above floor is found in that range.
int32_t FindLast1BitBeforeBounded(const uintptr_t* bitarr, uint32_t loc, int32_t floor) {
  const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]);
  uint32_t remainder = loc % kBitsPerWord;
  uintptr_t ulii;
  if (remainder) {
    // Check the bits below loc within loc's own word first.
    ulii = bzhi(*bitarr_ptr, remainder);
    if (ulii) {
      const uint32_t set_bit_loc = loc - remainder + bsrw(ulii);
      // The hit may still be at or below floor; clamp in that case.
      return MAXV(S_CAST(int32_t, set_bit_loc), floor);
    }
  }
  // Walk backward word by word; stop once we'd pass the word containing
  // floor + 1 (the lowest index we're allowed to report above floor).
  const uintptr_t* bitarr_last = &(bitarr[S_CAST(uint32_t, floor + 1) / kBitsPerWord]);
  do {
    if (bitarr_ptr <= bitarr_last) {
      return floor;
    }
    ulii = *(--bitarr_ptr);
  } while (!ulii);
  const uint32_t set_bit_loc = S_CAST(uintptr_t, bitarr_ptr - bitarr) * kBitsPerWord + bsrw(ulii);
  return MAXV(S_CAST(int32_t, set_bit_loc), floor);
}
958
// target_bitvec := source1_bitvec AND source2_bitvec, over word_ct words.
// Inputs must not alias target (enforced by __restrict on the sources).
void BitvecAndCopy(const uintptr_t* __restrict source1_bitvec, const uintptr_t* __restrict source2_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec) {
#ifdef __LP64__
  // Vectorized main loop over full kWordsPerVec-word vectors; assumes the
  // usual plink2 bitvector alignment.
  VecW* target_bitvvec = R_CAST(VecW*, target_bitvec);
  const VecW* source1_bitvvec = R_CAST(const VecW*, source1_bitvec);
  const VecW* source2_bitvvec = R_CAST(const VecW*, source2_bitvec);
  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
  for (uintptr_t ulii = 0; ulii != full_vec_ct; ++ulii) {
    target_bitvvec[ulii] = source1_bitvvec[ulii] & source2_bitvvec[ulii];
  }
#  ifdef USE_AVX2
  // With 4-word vectors, up to 3 trailing words remain; handle a possible
  // pair here, then a possible final word below.
  if (word_ct & 2) {
    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
    target_bitvec[base_idx] = source1_bitvec[base_idx] & source2_bitvec[base_idx];
    target_bitvec[base_idx + 1] = source1_bitvec[base_idx + 1] & source2_bitvec[base_idx + 1];
  }
#  endif
  if (word_ct & 1) {
    target_bitvec[word_ct - 1] = source1_bitvec[word_ct - 1] & source2_bitvec[word_ct - 1];
  }
#else
  // 32-bit build: plain scalar loop.
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    target_bitvec[widx] = source1_bitvec[widx] & source2_bitvec[widx];
  }
#endif
}
984
void BitvecInvmaskCopy(const uintptr_t* __restrict source_bitvec, const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec) {
  // target_bitvec := source_bitvec AND (~exclude_bitvec)
#ifdef __LP64__
  VecW* target_bitvvec = R_CAST(VecW*, target_bitvec);
  const VecW* source_bitvvec = R_CAST(const VecW*, source_bitvec);
  const VecW* exclude_bitvvec = R_CAST(const VecW*, exclude_bitvec);
  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
  for (uintptr_t ulii = 0; ulii != full_vec_ct; ++ulii) {
    // vecw_and_notfirst(a, b) computes (~a) & b, mirroring _mm_andnot.
    target_bitvvec[ulii] = vecw_and_notfirst(exclude_bitvvec[ulii], source_bitvvec[ulii]);
  }
#  ifdef USE_AVX2
  // Up to 3 trailing words with 4-word vectors: a possible pair here, plus a
  // possible final word below.
  if (word_ct & 2) {
    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
    target_bitvec[base_idx] = (~exclude_bitvec[base_idx]) & source_bitvec[base_idx];
    target_bitvec[base_idx + 1] = (~exclude_bitvec[base_idx + 1]) & source_bitvec[base_idx + 1];
  }
#  endif
  if (word_ct & 1) {
    target_bitvec[word_ct - 1] = source_bitvec[word_ct - 1] & (~exclude_bitvec[word_ct - 1]);
  }
#else
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    target_bitvec[widx] = source_bitvec[widx] & (~exclude_bitvec[widx]);
  }
#endif
}
1011
// target_bitvec := ~source_bitvec, over word_ct words.
void BitvecInvertCopy(const uintptr_t* __restrict source_bitvec, uintptr_t word_ct, uintptr_t* __restrict target_bitvec) {
#ifdef __LP64__
  const VecW* source_bitvvec_iter = R_CAST(const VecW*, source_bitvec);
  VecW* target_bitvvec_iter = R_CAST(VecW*, target_bitvec);
  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
  const VecW all1 = VCONST_W(~k0LU);
  // As of Apple clang 11, this manual unroll is no longer relevant. todo:
  // check Linux performance, and remove all of these unrolls if perf is good
  // enough without them.
  // Peel 1 and/or 2 vectors so the main loop can run 4 at a time.
  if (full_vec_ct & 1) {
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
  }
  if (full_vec_ct & 2) {
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
  }
  for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
  }
#  ifdef USE_AVX2
  // Trailing-word cleanup (up to 3 words with 4-word vectors).
  if (word_ct & 2) {
    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
    target_bitvec[base_idx] = ~source_bitvec[base_idx];
    target_bitvec[base_idx + 1] = ~source_bitvec[base_idx + 1];
  }
#  endif
  if (word_ct & 1) {
    target_bitvec[word_ct - 1] = ~source_bitvec[word_ct - 1];
  }
#else
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    target_bitvec[widx] = ~source_bitvec[widx];
  }
#endif
}
1050
1051 /*
1052 void BitvecXor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec) {
1053 // main_bitvec := main_bitvec XOR arg_bitvec
1054 #ifdef __LP64__
1055 VecW* main_bitvvec_iter = R_CAST(VecW*, main_bitvec);
1056 const VecW* arg_bitvvec_iter = R_CAST(const VecW*, arg_bitvec);
1057 const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
1058 if (full_vec_ct & 1) {
1059 *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1060 }
1061 if (full_vec_ct & 2) {
1062 *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1063 *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1064 }
1065 for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
1066 *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1067 *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1068 *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1069 *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1070 }
1071 # ifdef USE_AVX2
1072 if (word_ct & 2) {
1073 const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
1074 main_bitvec[base_idx] ^= arg_bitvec[base_idx];
1075 main_bitvec[base_idx + 1] ^= arg_bitvec[base_idx + 1];
1076 }
1077 # endif
1078 if (word_ct & 1) {
1079 main_bitvec[word_ct - 1] ^= arg_bitvec[word_ct - 1];
1080 }
1081 #else
1082 for (uintptr_t widx = 0; widx != word_ct; ++widx) {
1083 main_bitvec[widx] ^= arg_bitvec[widx];
1084 }
1085 #endif
1086 }
1087 */
1088
void BitvecInvertAndMask(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := (~main_bitvec) AND include_bitvec
  // this corresponds _mm_andnot() operand order
#ifdef __LP64__
  VecW* main_bitvvec_iter = R_CAST(VecW*, main_bitvec);
  const VecW* include_bitvvec_iter = R_CAST(const VecW*, include_bitvec);
  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
  // Manual 1/2/4 unroll, matching the other Bitvec* routines in this file.
  if (full_vec_ct & 1) {
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
  }
  if (full_vec_ct & 2) {
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
  }
  for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
  }
#  ifdef USE_AVX2
  // Trailing-word cleanup (up to 3 words with 4-word vectors).
  if (word_ct & 2) {
    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
    main_bitvec[base_idx] = (~main_bitvec[base_idx]) & include_bitvec[base_idx];
    main_bitvec[base_idx + 1] = (~main_bitvec[base_idx + 1]) & include_bitvec[base_idx + 1];
  }
#  endif
  if (word_ct & 1) {
    main_bitvec[word_ct - 1] = (~main_bitvec[word_ct - 1]) & include_bitvec[word_ct - 1];
  }
#else
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    main_bitvec[widx] = (~main_bitvec[widx]) & include_bitvec[widx];
  }
#endif
}
1132
1133 /*
1134 void BitvecOrNot(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec) {
1135 // main_bitvec := main_bitvec OR (~arg_bitvec)
1136 #ifdef __LP64__
1137 VecW* main_bitvvec_iter = (VecW*)main_bitvec;
1138 const VecW* arg_bitvvec_iter = (const VecW*)arg_bitvec;
1139 const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
1140 if (full_vec_ct & 1) {
1141 // todo: verify the compiler isn't dumb here
1142 *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1143 }
1144 if (full_vec_ct & 2) {
1145 *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1146 *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1147 }
1148 for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
1149 *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1150 *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1151 *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1152 *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1153 }
1154 # ifdef USE_AVX2
1155 if (word_ct & 2) {
1156 const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
1157 main_bitvec[base_idx] |= ~arg_bitvec[base_idx];
1158 main_bitvec[base_idx + 1] |= ~arg_bitvec[base_idx + 1]
1159 }
1160 # endif
1161 if (word_ct & 1) {
1162 main_bitvec[word_ct - 1] |= ~arg_bitvec[word_ct - 1];
1163 }
1164 #else
1165 for (uintptr_t widx = 0; widx != word_ct; ++widx) {
1166 main_bitvec[widx] |= ~arg_bitvec[widx];
1167 }
1168 #endif
1169 }
1170 */
1171
1172
// Linear scan for idstr among the variant_include-selected entries of
// variant_ids[].  Returns the raw (uidx) index of the unique match, -1 if no
// match, -2 if the ID appears more than once.  Used when no hash table has
// been built.
int32_t GetVariantUidxWithoutHtable(const char* idstr, const char* const* variant_ids, const uintptr_t* variant_include, uint32_t variant_ct) {
  const uint32_t id_blen = strlen(idstr) + 1;
  // widx starts at all-ones so the first pre-increment yields word 0.
  uintptr_t widx = ~k0LU;
  int32_t retval = -1;
  for (uint32_t variant_idx = 0; variant_idx != variant_ct; ) {
    // Advance to the next nonzero word of the inclusion bitvector; safe
    // because variant_idx < variant_ct guarantees more set bits exist.
    uintptr_t variant_include_bits;
    do {
      variant_include_bits = variant_include[++widx];
    } while (!variant_include_bits);
    const uintptr_t variant_uidx_base = widx * kBitsPerWord;
    const char* const* cur_variant_ids = &(variant_ids[variant_uidx_base]);
    variant_idx += PopcountWord(variant_include_bits);
    // Check every set bit in this word.
    do {
      const uint32_t uidx_lowbits = ctzw(variant_include_bits);
      if (memequal(idstr, cur_variant_ids[uidx_lowbits], id_blen)) {
        if (retval != -1) {
          // duplicate
          return -2;
        }
        retval = S_CAST(int32_t, uidx_lowbits + variant_uidx_base);
      }
      // Clear lowest set bit.
      variant_include_bits &= variant_include_bits - 1;
    } while (variant_include_bits);
  }
  return retval;
}
1199
1200
1201 // MurmurHash3, from
1202 // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
rotl32(uint32_t x,int8_t r)1203 CSINLINE uint32_t rotl32(uint32_t x, int8_t r) {
1204 return (x << r) | (x >> (32 - r));
1205 }
1206
// Fetch the i'th 32-bit block.  i may be negative: MurmurHash3's reference
// implementation indexes backward from a pointer just past the block region.
static inline uint32_t getblock32(const uint32_t* p, int i) {
  return *(p + i);
}
1210
1211 //-----------------------------------------------------------------------------
1212 // Finalization mix - force all bits of a hash block to avalanche
1213
fmix32(uint32_t h)1214 CSINLINE2 uint32_t fmix32(uint32_t h) {
1215 h ^= h >> 16;
1216 h *= 0x85ebca6b;
1217 h ^= h >> 13;
1218 h *= 0xc2b2ae35;
1219 h ^= h >> 16;
1220
1221 return h;
1222 }
1223
// MurmurHash3_x86_32 with the seed fixed at 0; see the reference
// implementation cited above.  Kept verbatim apart from the seed removal.
// Note that the block loop reads the input as uint32_t*; assumes the
// platform tolerates this access pattern (x86 does) -- TODO confirm for any
// new target with stricter alignment rules.
uint32_t Hash32(const void* key, uint32_t len) {
  const uint8_t* data = S_CAST(const uint8_t*, key);
  const int32_t nblocks = len / 4;

  uint32_t h1 = 0;
  // uint32_t h1 = seed;

  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  //----------
  // body

  // blocks points just past the last full 4-byte block; the loop indexes
  // backward from it with negative i, per the reference code.
  const uint32_t* blocks = R_CAST(const uint32_t*, data + nblocks*4);

  int32_t i;
  uint32_t k1;
  for(i = -nblocks; i; i++) {
    k1 = getblock32(blocks,i);

    k1 *= c1;
    k1 = rotl32(k1,15);
    k1 *= c2;

    h1 ^= k1;
    h1 = rotl32(h1,13);
    h1 = h1*5+0xe6546b64;
  }

  //----------
  // tail

  // Mix in the 0-3 leftover bytes.
  const uint8_t* tail = R_CAST(const uint8_t*, data + nblocks*4);

  k1 = 0;

  switch(len & 3) {
    case 3:
      k1 ^= tail[2] << 16;
      // fall through
    case 2:
      k1 ^= tail[1] << 8;
      // fall through
    case 1:
      k1 ^= tail[0];
      k1 *= c1;
      k1 = rotl32(k1,15);
      k1 *= c2;
      h1 ^= k1;
  }

  //----------
  // finalization

  h1 ^= len;

  return fmix32(h1);
}
1282
1283
1284 /*
1285 uint32_t is_composite6(uintptr_t num) {
1286 // assumes num is congruent to 1 or 5 mod 6.
1287 // can speed this up by ~50% by hardcoding avoidance of multiples of 5/7,
1288 // but this isn't currently a bottleneck so I'll keep this simple
1289 assert((num % 6 == 1) || (num % 6 == 5));
1290 uintptr_t divisor = 5;
1291 while (divisor * divisor <= num) {
1292 if (!(num % divisor)) {
1293 return 1;
1294 }
1295 divisor += 2;
1296 if (!(num % divisor)) {
1297 return 1;
1298 }
1299 divisor += 4;
1300 }
1301 return 0;
1302 }
1303
1304 uintptr_t geqprime(uintptr_t floor) {
1305 // assumes floor is odd and greater than 1. Returns 5 if floor = 3,
1306 // otherwise returns the first prime >= floor.
1307 assert((floor % 2) && (floor > 1));
1308 uintptr_t ulii = floor % 3;
1309 if (!ulii) {
1310 floor += 2;
1311 } else if (ulii == 1) {
1312 goto geqprime_1mod6;
1313 }
1314 while (is_composite6(floor)) {
1315 floor += 2;
1316 geqprime_1mod6:
1317 if (!is_composite6(floor)) {
1318 return floor;
1319 }
1320 floor += 4;
1321 }
1322 return floor;
1323 }
1324
1325 uintptr_t leqprime(uintptr_t ceil) {
1326 // assumes ceil is odd and greater than 4. Returns the first prime <= ceil.
1327 assert((ceil % 2) && (ceil > 4));
1328 uintptr_t ulii = ceil % 3;
1329 if (!ulii) {
1330 ceil -= 2;
1331 } else if (ulii == 2) {
1332 goto leqprime_5mod6;
1333 }
1334 while (is_composite6(ceil)) {
1335 ceil -= 2;
1336 leqprime_5mod6:
1337 if (!is_composite6(ceil)) {
1338 return ceil;
1339 }
1340 ceil -= 4;
1341 }
1342 return ceil;
1343 }
1344 */
1345
// Allocates a uint32 hash table suitable for item_ct items from the
// bigstack.  Prefers GetHtableFastSize(item_ct); if that exceeds bytes_avail,
// falls back to the largest table that fits, and fails (returns 1) when even
// that would be smaller than 2 * item_ct, i.e. load factor above 50%.
// bytes_avail is assumed to not exceed the actual remaining bigstack space,
// since bigstack_alloc_raw_rd() performs no bounds check -- TODO confirm all
// callers guarantee this.
BoolErr HtableGoodSizeAlloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t** htable_ptr, uint32_t* htable_size_ptr) {
  // Round bytes_avail down to a cacheline multiple.
  bytes_avail &= (~(kCacheline - k1LU));
  uint32_t htable_size = GetHtableFastSize(item_ct);
  if (htable_size > bytes_avail / sizeof(int32_t)) {
    if (unlikely(!bytes_avail)) {
      return 1;
    }
    htable_size = bytes_avail / sizeof(int32_t);
    // htable_size = leqprime((bytes_avail / sizeof(int32_t)) - 1);
    if (unlikely(htable_size < item_ct * 2)) {
      return 1;
    }
  }
  *htable_ptr = S_CAST(uint32_t*, bigstack_alloc_raw_rd(htable_size * sizeof(int32_t)));
  *htable_size_ptr = htable_size;
  return 0;
}
1363
// Fills str_htable (linear-probing, UINT32_MAX = empty) with the indexes of
// the str_ct strings in strbox.  Returns 0 on success; if a duplicate string
// is found, stops and returns the index of the second occurrence (which is
// necessarily positive, so 0 unambiguously means success).
uint32_t PopulateStrboxHtable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
  // may want subset_mask parameter later
  // Initialize all buckets to the UINT32_MAX empty-entry sentinel.
  SetAllU32Arr(str_htable_size, str_htable);
  const char* strbox_iter = strbox;
  for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
    const uint32_t slen = strlen(strbox_iter);
    // previously used quadratic probing, but turns out that that isn't
    // meaningfully better than linear probing.
    for (uint32_t hashval = Hashceil(strbox_iter, slen, str_htable_size); ; ) {
      const uint32_t cur_htable_entry = str_htable[hashval];
      if (cur_htable_entry == UINT32_MAX) {
        str_htable[hashval] = str_idx;
        break;
      }
      // Compare including the null terminator.
      if (memequal(strbox_iter, &(strbox[cur_htable_entry * max_str_blen]), slen + 1)) {
        // guaranteed to be positive
        return str_idx;
      }
      // Linear probe with wraparound.
      if (++hashval == str_htable_size) {
        hashval = 0;
      }
    }
    strbox_iter = &(strbox_iter[max_str_blen]);
  }
  return 0;
}
1390
1391 // could merge this with non-subset case, but this isn't much code
1392 /*
1393 uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
1394 // may want subset_mask parameter later
1395 SetAllU32Arr(str_htable_size, str_htable);
1396 uintptr_t str_uidx = 0;
1397 for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx, ++str_uidx) {
1398 MovWTo1Bit(subset_mask, &str_uidx);
1399 const char* cur_str = &(strbox[str_uidx * max_str_blen]);
1400 const uint32_t slen = strlen(cur_str);
1401 uint32_t hashval = Hashceil(cur_str, slen, str_htable_size);
1402 while (1) {
1403 const uint32_t cur_htable_entry = str_htable[hashval];
1404 if (cur_htable_entry == UINT32_MAX) {
1405 str_htable[hashval] = str_uidx;
1406 break;
1407 }
1408 if (memequal(cur_str, &(strbox[cur_htable_entry * max_str_blen]), slen + 1)) {
1409 // guaranteed to be positive
1410 return str_uidx;
1411 }
1412 if (++hashval == str_htable_size) {
1413 hashval = 0;
1414 }
1415 }
1416 }
1417 return 0;
1418 }
1419 */
1420
IdHtableFind(const char * cur_id,const char * const * item_ids,const uint32_t * id_htable,uint32_t cur_id_slen,uint32_t id_htable_size)1421 uint32_t IdHtableFind(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size) {
1422 // returns UINT32_MAX on failure
1423 for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
1424 const uint32_t cur_htable_idval = id_htable[hashval];
1425 if ((cur_htable_idval == UINT32_MAX) || (!strcmp(cur_id, item_ids[cur_htable_idval]))) {
1426 return cur_htable_idval;
1427 }
1428 if (++hashval == id_htable_size) {
1429 hashval = 0;
1430 }
1431 }
1432 }
1433
// Same as IdHtableFind(), but cur_id does not need to be null-terminated
// ("Nnt" presumably abbreviates that): equality is checked with memequal on
// cur_id_slen bytes plus a terminator test on the table side only.
uint32_t IdHtableFindNnt(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size) {
  // returns UINT32_MAX on failure
  for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
    const uint32_t cur_htable_idval = id_htable[hashval];
    if ((cur_htable_idval == UINT32_MAX) || (memequal(cur_id, item_ids[cur_htable_idval], cur_id_slen) && (!item_ids[cur_htable_idval][cur_id_slen]))) {
      return cur_htable_idval;
    }
    // Linear probe with wraparound.
    if (++hashval == id_htable_size) {
      hashval = 0;
    }
  }
}
1446
// assumes cur_id_slen < max_str_blen.
// requires cur_id to be null-terminated.
// Linear-probing lookup of cur_id in a strbox-backed hash table; buckets
// hold strbox row indexes, UINT32_MAX = empty.  Returns the row index, or
// UINT32_MAX on failure.
uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size) {
  // Include the null terminator in the comparison.
  const uint32_t cur_id_blen = cur_id_slen + 1;
  for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
    const uint32_t cur_htable_idval = id_htable[hashval];
    if ((cur_htable_idval == UINT32_MAX) || memequal(cur_id, &(strbox[cur_htable_idval * max_str_blen]), cur_id_blen)) {
      return cur_htable_idval;
    }
    // Linear probe with wraparound.
    if (++hashval == id_htable_size) {
      hashval = 0;
    }
  }
}
1461
uint32_t VariantIdDupflagHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen) {
  // assumes duplicate variant IDs are flagged, but full variant_uidx linked
  // lists are not stored
  // idbuf does not need to be null-terminated (note that this is currently
  // achieved in a way that forces variant_ids[] entries to not be too close
  // to the end of bigstack, otherwise memequal behavior is potentially
  // undefined)
  // returns UINT32_MAX on failure, value with bit 31 set on duplicate
  // Early exit: no stored ID is longer than max_id_slen.
  if (cur_id_slen > max_id_slen) {
    return UINT32_MAX;
  }
  for (uint32_t hashval = Hashceil(idbuf, cur_id_slen, id_htable_size); ; ) {
    const uint32_t cur_htable_idval = id_htable[hashval];
    // Mask off the duplicate flag (bit 31) before dereferencing; this also
    // covers the UINT32_MAX empty sentinel, which fails the string compare.
    if ((cur_htable_idval == UINT32_MAX) || (memequal(idbuf, variant_ids[cur_htable_idval & 0x7fffffff], cur_id_slen) && (!variant_ids[cur_htable_idval & 0x7fffffff][cur_id_slen]))) {
      return cur_htable_idval;
    }
    // Linear probe with wraparound.
    if (++hashval == id_htable_size) {
      hashval = 0;
    }
  }
}
1483
uint32_t VariantIdDupHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, const uint32_t* htable_dup_base, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen, uint32_t* llidx_ptr) {
  // Permits duplicate entries. Similar to plink 1.9
  // extract_exclude_process_token().
  // - Returns UINT32_MAX on failure (llidx currently unset in that case),
  //   otherwise returns the index of the first match (which will have the
  //   highest index, due to how the linked list is constructed)
  // - Sets llidx to UINT32_MAX if not a duplicate, otherwise it's the
  //   position in htable_dup_base[] of the next {variant_uidx, next_llidx}
  //   linked list entry.
  // - idbuf does not need to be null-terminated.
  // Early exit: no stored ID is longer than max_id_slen.
  if (cur_id_slen > max_id_slen) {
    return UINT32_MAX;
  }
  for (uint32_t hashval = Hashceil(idbuf, cur_id_slen, id_htable_size); ; ) {
    const uint32_t cur_htable_idval = id_htable[hashval];
    // Bit 31 flags a duplicated ID; the low 31 bits are then a linked-list
    // handle rather than a direct variant_uidx.
    const uint32_t cur_dup = cur_htable_idval >> 31;
    uint32_t cur_llidx;
    uint32_t variant_uidx;
    if (cur_dup) {
      // UINT32_MAX empty-entry code has high bit set, so only need to check
      // here
      if (cur_htable_idval == UINT32_MAX) {
        return UINT32_MAX;
      }
      // Low 31 bits * 2 = offset of the first {variant_uidx, next_llidx}
      // pair in htable_dup_base[].
      cur_llidx = cur_htable_idval << 1;
      variant_uidx = htable_dup_base[cur_llidx];
    } else {
      cur_llidx = UINT32_MAX;
      variant_uidx = cur_htable_idval;
    }
    const char* sptr = variant_ids[variant_uidx];
    if (memequal(idbuf, sptr, cur_id_slen) && (!sptr[cur_id_slen])) {
      *llidx_ptr = cur_llidx;
      return variant_uidx;
    }
    // Linear probe with wraparound.
    if (++hashval == id_htable_size) {
      hashval = 0;
    }
  }
}
1524
1525
PglErr CopySortStrboxSubsetNoalloc(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char* __restrict sorted_strbox, uint32_t* __restrict id_map) {
  // Stores a lexicographically sorted list of IDs in sorted_strbox and the raw
  // positions of the corresponding markers/samples in *id_map_ptr. Does not
  // include excluded markers/samples in the list.
  // Assumes sorted_strbox and id_map have been allocated; use the
  // CopySortStrboxSubset() wrapper if they haven't been.
  // Note that this DOES still perform a "stack" allocation; 'Noalloc' just
  // means that sorted_strbox/id_map must already be allocated.
  // Other parameters:
  // - collapse_idxs: id_map stores 0-based positions within the subset
  //   instead of raw (uidx) positions.
  // - use_nsort: natural sort ("id2" < "id10") instead of plain strcmp order.
  // - allow_dups: skip the post-sort duplicate-ID check.
  if (!str_ct) {
    return kPglRetSuccess;
  }
  unsigned char* bigstack_mark = g_bigstack_base;
  PglErr reterr = kPglRetSuccess;
  {
#ifdef __cplusplus
    // C++ fast path for short IDs: copy {string, index} into fixed-size
    // 32- or 64-byte records so the sort moves data instead of chasing
    // pointers.
    if (max_str_blen <= 60) {
      uintptr_t wkspace_entry_blen = (max_str_blen > 28)? sizeof(Strbuf60Ui) : sizeof(Strbuf28Ui);
      char* sort_wkspace;
      if (unlikely(bigstack_alloc_c(str_ct * wkspace_entry_blen, &sort_wkspace))) {
        goto CopySortStrboxSubsetNoalloc_ret_NOMEM;
      }
      // Each record = string bytes followed by a 4-byte original index.
      const uint32_t wkspace_entry_blen_m4 = wkspace_entry_blen - 4;
      char* sort_wkspace_iter = sort_wkspace;
      uintptr_t str_uidx_base = 0;
      uintptr_t subset_mask_bits = subset_mask[0];
      for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
        const uint32_t str_uidx = BitIter1(subset_mask, &str_uidx_base, &subset_mask_bits);
        strcpy(sort_wkspace_iter, &(orig_strbox[str_uidx * max_str_blen]));
        sort_wkspace_iter = &(sort_wkspace_iter[wkspace_entry_blen_m4]);
        if (collapse_idxs) {
          memcpy(sort_wkspace_iter, &str_idx, sizeof(int32_t));
        } else {
          memcpy(sort_wkspace_iter, &str_uidx, sizeof(int32_t));
        }
        sort_wkspace_iter = &(sort_wkspace_iter[sizeof(int32_t)]);
      }
      if (wkspace_entry_blen == 32) {
        SortStrbox32bFinish(str_ct, max_str_blen, use_nsort, R_CAST(Strbuf28Ui*, sort_wkspace), sorted_strbox, id_map);
      } else {
        SortStrbox64bFinish(str_ct, max_str_blen, use_nsort, R_CAST(Strbuf60Ui*, sort_wkspace), sorted_strbox, id_map);
      }
    } else {
#endif
      // General path: sort {pointer, index} records, then copy the strings
      // out in sorted order.
      StrSortIndexedDerefOverread* sort_wkspace;
      if (unlikely(BIGSTACK_ALLOC_X(StrSortIndexedDerefOverread, str_ct, &sort_wkspace))) {
        goto CopySortStrboxSubsetNoalloc_ret_NOMEM;
      }
      uintptr_t str_uidx_base = 0;
      uintptr_t cur_bits = subset_mask[0];
      for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
        uintptr_t str_uidx = BitIter1(subset_mask, &str_uidx_base, &cur_bits);
        sort_wkspace[str_idx].strptr = &(orig_strbox[str_uidx * max_str_blen]);
        if (collapse_idxs) {
          sort_wkspace[str_idx].orig_idx = str_idx;
        } else {
          sort_wkspace[str_idx].orig_idx = str_uidx;
        }
      }
      if (!use_nsort) {
        STD_SORT_PAR_UNSEQ(str_ct, strcmp_overread_deref, sort_wkspace);
      } else {
#ifdef __cplusplus
        // C++ natural sort uses operator< on the aliased record type.
        StrNsortIndexedDeref* wkspace_alias = R_CAST(StrNsortIndexedDeref*, sort_wkspace);
        STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias);
#else
        STD_SORT_PAR_UNSEQ(str_ct, strcmp_natural_deref, sort_wkspace);
#endif
      }
      for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
        strcpy(&(sorted_strbox[str_idx * max_str_blen]), sort_wkspace[str_idx].strptr);
        id_map[str_idx] = sort_wkspace[str_idx].orig_idx;
      }
#ifdef __cplusplus
    }
#endif
    if (!allow_dups) {
      // Adjacent-equal scan is sufficient since the box is now sorted.
      char* dup_id = ScanForDuplicateIds(sorted_strbox, str_ct, max_str_blen);
      if (unlikely(dup_id)) {
        TabsToSpaces(dup_id);
        logerrprintfww("Error: Duplicate ID '%s'.\n", dup_id);
        goto CopySortStrboxSubsetNoalloc_ret_MALFORMED_INPUT;
      }
    }
  }
  while (0) {
  CopySortStrboxSubsetNoalloc_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  CopySortStrboxSubsetNoalloc_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  }
  // Release the temporary sort workspace in all cases.
  BigstackReset(bigstack_mark);
  return reterr;
}
1621
CopySortStrboxSubset(const uintptr_t * __restrict subset_mask,const char * __restrict orig_strbox,uintptr_t str_ct,uintptr_t max_str_blen,uint32_t allow_dups,uint32_t collapse_idxs,uint32_t use_nsort,char ** sorted_strbox_ptr,uint32_t ** id_map_ptr)1622 PglErr CopySortStrboxSubset(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char** sorted_strbox_ptr, uint32_t** id_map_ptr) {
1623 // id_map on bottom because --indiv-sort frees *sorted_strbox_ptr
1624 if (unlikely(
1625 bigstack_alloc_u32(str_ct, id_map_ptr) ||
1626 bigstack_alloc_c(str_ct * max_str_blen, sorted_strbox_ptr))) {
1627 return kPglRetNomem;
1628 }
1629 return CopySortStrboxSubsetNoalloc(subset_mask, orig_strbox, str_ct, max_str_blen, allow_dups, collapse_idxs, use_nsort, *sorted_strbox_ptr, *id_map_ptr);
1630 }
1631
1632
InitRangeList(RangeList * range_list_ptr)1633 void InitRangeList(RangeList* range_list_ptr) {
1634 range_list_ptr->names = nullptr;
1635 range_list_ptr->starts_range = nullptr;
1636 range_list_ptr->name_ct = 0;
1637 range_list_ptr->name_max_blen = 0;
1638 }
1639
// Frees the RangeList's single backing allocation (if any); free_cond is a
// no-op on nullptr, so this is safe after InitRangeList() alone.
void CleanupRangeList(RangeList* range_list_ptr) {
  free_cond(range_list_ptr->names);
  // starts_range now uses same allocation
}
1644
NumericRangeListToBitarr(const RangeList * range_list_ptr,uint32_t bitarr_size,uint32_t offset,uint32_t ignore_overflow,uintptr_t * bitarr)1645 BoolErr NumericRangeListToBitarr(const RangeList* range_list_ptr, uint32_t bitarr_size, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr) {
1646 // bitarr assumed to be initialized (but not necessarily zero-initialized)
1647 const char* names = range_list_ptr->names;
1648 const unsigned char* starts_range = range_list_ptr->starts_range;
1649 const uint32_t name_ct = range_list_ptr->name_ct;
1650 const uint32_t name_max_blen = range_list_ptr->name_max_blen;
1651 const uint32_t idx_max = bitarr_size + offset;
1652 for (uint32_t name_idx = 0; name_idx != name_ct; ++name_idx) {
1653 uint32_t idx1;
1654 if (ScanUintCapped(&(names[name_idx * name_max_blen]), idx_max, &idx1)) {
1655 if (likely(ignore_overflow)) {
1656 continue;
1657 }
1658 return 1;
1659 }
1660 if (unlikely(idx1 < offset)) {
1661 return 1;
1662 }
1663 if (starts_range[name_idx]) {
1664 ++name_idx;
1665 uint32_t idx2;
1666 if (ScanUintCapped(&(names[name_idx * name_max_blen]), idx_max, &idx2)) {
1667 if (unlikely(!ignore_overflow)) {
1668 return 1;
1669 }
1670 idx2 = idx_max - 1;
1671 }
1672 FillBitsNz(idx1 - offset, (idx2 - offset) + 1, bitarr);
1673 } else {
1674 SetBit(idx1 - offset, bitarr);
1675 }
1676 }
1677 return 0;
1678 }
1679
// Marks, in bitarr, the header-line token positions selected by the
// command-line name/range list.  sorted_ids + id_map are the sorted copy of
// range_list_ptr->names (id_map translating sorted position back to original
// command-line position; may be nullptr if already in command-line order).
// seen_idxs must be initialized to -1 by the caller; on return, entry i holds
// the header-line token index at which command-line name i was found.
// Errors: a name appearing twice in the header (MalformedInput), a range
// whose endpoints appear out of order, or a name missing from the header
// (both InvalidCmdline).
PglErr StringRangeListToBitarr(const char* header_line, const RangeList* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t* bitarr, int32_t* __restrict seen_idxs) {
  // bitarr assumed to be zero-initialized
  // if fixed_len is zero, header_line is assumed to be a list of
  // space-delimited unequal-length names
  assert(token_ct);
  assert(!PopcountWords(bitarr, BitCtToWordCt(token_ct)));
  PglErr reterr = kPglRetSuccess;
  {
    const char* header_line_iter = header_line;
    const uintptr_t name_ct = range_list_ptr->name_ct;
    const uintptr_t max_id_blen = range_list_ptr->name_max_blen;
    // Walk every token of the header line, checking each against the sorted
    // command-line name list.
    for (uint32_t item_idx = 0; ; ) {
      const char* token_end = CommaOrTspaceTokenEnd(header_line_iter, comma_delim);
      const int32_t sorted_pos = bsearch_str(header_line_iter, sorted_ids, token_end - header_line_iter, max_id_blen, name_ct);
      if (sorted_pos != -1) {
        // cmdline_pos = position of this name in the original command line
        // (range pairing is defined in command-line order, not sorted order).
        uint32_t cmdline_pos = sorted_pos;
        if (id_map) {
          cmdline_pos = id_map[S_CAST(uint32_t, sorted_pos)];
        }
        if (unlikely(seen_idxs[cmdline_pos] != -1)) {
          logerrprintfww("Error: --%s: '%s' appears multiple times in header line of %s.\n", range_list_flag, &(sorted_ids[S_CAST(uint32_t, sorted_pos) * max_id_blen]), file_descrip);
          goto StringRangeListToBitarr_ret_MALFORMED_INPUT;
        }
        seen_idxs[cmdline_pos] = item_idx;
        if (cmdline_pos && range_list_ptr->starts_range[cmdline_pos - 1]) {
          // This name is the END of a range; its start (cmdline_pos - 1) must
          // already have been seen earlier in the header line.
          if (unlikely(seen_idxs[cmdline_pos - 1] == -1)) {
            uint32_t first_sorted_pos = cmdline_pos - 1;
            if (id_map) {
              // Reverse-lookup the sorted position of the range start, for
              // the error message.
              for (first_sorted_pos = 0; ; ++first_sorted_pos) {
                if (id_map[first_sorted_pos] == cmdline_pos - 1) {
                  break;
                }
              }
            }
            snprintf(g_logbuf, kLogbufSize, "Error: --%s: '%s' appears before '%s' in header line of %s.\n", range_list_flag, &(sorted_ids[S_CAST(uint32_t, sorted_pos) * max_id_blen]), &(sorted_ids[first_sorted_pos * max_id_blen]), file_descrip);
            goto StringRangeListToBitarr_ret_INVALID_CMDLINE_WW;
          }
          // Set every token bit from the range start through here.
          FillBitsNz(seen_idxs[cmdline_pos - 1], item_idx + 1, bitarr);
        } else if (!(range_list_ptr->starts_range[cmdline_pos])) {
          // Standalone name (range starts are set when their end is found).
          SetBit(item_idx, bitarr);
        }
      }
      if (++item_idx == token_ct) {
        break;
      }
      if (fixed_len) {
        header_line_iter = &(header_line_iter[fixed_len]);
      } else {
        header_line_iter = FirstNonTspace(&(token_end[1]));
      }
    }
    // Verify every command-line name was matched.
    for (uint32_t cmdline_pos = 0; cmdline_pos != name_ct; ++cmdline_pos) {
      if (unlikely(seen_idxs[cmdline_pos] == -1)) {
        uint32_t sorted_pos = cmdline_pos;
        if (id_map) {
          for (sorted_pos = 0; ; ++sorted_pos) {
            if (id_map[sorted_pos] == cmdline_pos) {
              break;
            }
          }
        }
        snprintf(g_logbuf, kLogbufSize, "Error: --%s: '%s' does not appear in header line of %s.\n", range_list_flag, &(sorted_ids[sorted_pos * max_id_blen]), file_descrip);
        goto StringRangeListToBitarr_ret_INVALID_CMDLINE_WW;
      }
    }
  }
  while (0) {
  StringRangeListToBitarr_ret_INVALID_CMDLINE_WW:
    WordWrapB(0);
    logerrputsb();
    reterr = kPglRetInvalidCmdline;
    break;
  StringRangeListToBitarr_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  }
  return reterr;
}
1758
// wrapper for StringRangeListToBitarr which allocates the bitfield and
// temporary buffers on the heap.  On success only *bitarr_ptr remains
// allocated; the temporaries are popped via BigstackReset.
PglErr StringRangeListToBitarrAlloc(const char* header_line, const RangeList* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t** bitarr_ptr) {
  uintptr_t token_ctl = BitCtToWordCt(token_ct);
  uintptr_t name_ct = range_list_ptr->name_ct;
  int32_t* seen_idxs;
  char* sorted_ids;
  uint32_t* id_map;
  if (unlikely(
          bigstack_calloc_w(token_ctl, bitarr_ptr) ||
          bigstack_alloc_i32(name_ct, &seen_idxs))) {
    return kPglRetNomem;
  }
  // kludge to use CopySortStrboxSubset(): the seen_idxs buffer doubles as an
  // all-ones subset mask (it has 32 bits per entry, and only name_ct bits are
  // needed here).
  SetAllBits(name_ct, R_CAST(uintptr_t*, seen_idxs));
  if (unlikely(CopySortStrboxSubset(R_CAST(uintptr_t*, seen_idxs), range_list_ptr->names, name_ct, range_list_ptr->name_max_blen, 0, 0, 0, &sorted_ids, &id_map))) {
    return kPglRetNomem;
  }
  // Re-initialize seen_idxs to the -1 "not yet seen" sentinels that
  // StringRangeListToBitarr expects.
  SetAllI32Arr(name_ct, seen_idxs);
  PglErr reterr = StringRangeListToBitarr(header_line, range_list_ptr, sorted_ids, id_map, range_list_flag, file_descrip, token_ct, fixed_len, comma_delim, *bitarr_ptr, seen_idxs);
  BigstackReset(seen_idxs);
  return reterr;
}
1782
1783
// Returns the number of set bits in the half-open bit range
// [start_idx, end_idx) of bitvec.
uintptr_t PopcountBitRange(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
  uintptr_t widx = start_idx / kBitsPerWord;
  const uintptr_t start_rem = start_idx % kBitsPerWord;
  const uintptr_t end_widx = end_idx / kBitsPerWord;
  const uintptr_t end_rem = end_idx % kBitsPerWord;
  if (widx == end_widx) {
    // Entire range within one word: mask off bits outside [start_rem, end_rem).
    return PopcountWord(bitvec[widx] & ((k1LU << end_rem) - (k1LU << start_rem)));
  }
  uintptr_t tot = 0;
  if (start_rem) {
    // Partial first word.
    tot = PopcountWord(bitvec[widx] >> start_rem);
    ++widx;
  }
  if (end_widx > widx) {
    // Full words in the middle.
    tot += PopcountWordsNzbase(bitvec, widx, end_widx);
  }
  if (end_rem) {
    // Partial last word: keep only the low end_rem bits.
    tot += PopcountWord(bzhi(bitvec[end_widx], end_rem));
  }
  return tot;
}
1804
1805 #ifdef USE_SSE42
PopcountWordsIntersect3val(const uintptr_t * __restrict bitvec1,const uintptr_t * __restrict bitvec2,uint32_t word_ct,uint32_t * __restrict popcount1_ptr,uint32_t * __restrict popcount2_ptr,uint32_t * __restrict popcount_intersect_ptr)1806 void PopcountWordsIntersect3val(const uintptr_t* __restrict bitvec1, const uintptr_t* __restrict bitvec2, uint32_t word_ct, uint32_t* __restrict popcount1_ptr, uint32_t* __restrict popcount2_ptr, uint32_t* __restrict popcount_intersect_ptr) {
1807 uint32_t ct1 = 0;
1808 uint32_t ct2 = 0;
1809 uint32_t ct3 = 0;
1810 for (uint32_t widx = 0; widx != word_ct; ++widx) {
1811 const uintptr_t word1 = bitvec1[widx];
1812 const uintptr_t word2 = bitvec2[widx];
1813 ct1 += PopcountWord(word1);
1814 ct2 += PopcountWord(word2);
1815 ct3 += PopcountWord(word1 & word2);
1816 }
1817 *popcount1_ptr = ct1;
1818 *popcount2_ptr = ct2;
1819 *popcount_intersect_ptr = ct3;
1820 }
1821 #else
// Bit-sliced vector popcount (no SSE4.2 POPCNT): simultaneously accumulates
// popcount(v1), popcount(v2), and popcount(v1 & v2) over vec_ct vectors,
// processing 3 vectors per inner-loop iteration with the classic
// mask-and-shift (0x5555/0x3333/0x0F0F) reduction.
static inline void PopcountVecsNoSse42Intersect3val(const VecW* __restrict vvec1_iter, const VecW* __restrict vvec2_iter, uint32_t vec_ct, uint32_t* __restrict popcount1_ptr, uint32_t* popcount2_ptr, uint32_t* __restrict popcount_intersect_ptr) {
  // ct must be a multiple of 3.
  assert(!(vec_ct % 3));
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  VecW acc1 = vecw_setzero();
  VecW acc2 = vecw_setzero();
  VecW acc3 = vecw_setzero();
  // Inner blocks are capped at 30 vectors (10 iterations), presumably so the
  // narrow lanes of inner_acc* cannot overflow before the vecw_bytesum
  // reduction -- confirm the exact bound before changing.
  uintptr_t cur_incr = 30;
  for (; ; vec_ct -= cur_incr) {
    if (vec_ct < 30) {
      if (!vec_ct) {
        // All vectors consumed: horizontally sum the wide accumulators.
        *popcount1_ptr = HsumW(acc1);
        *popcount2_ptr = HsumW(acc2);
        *popcount_intersect_ptr = HsumW(acc3);
        return;
      }
      cur_incr = vec_ct;
    }
    VecW inner_acc1 = vecw_setzero();
    VecW inner_acc2 = vecw_setzero();
    VecW inner_acc3 = vecw_setzero();
    const VecW* vvec1_stop = &(vvec1_iter[cur_incr]);
    do {
      // Load 3 vectors from each input; the third ("half") pair is split into
      // its even and odd bits and folded into the a/b partial counts.
      VecW count1a = *vvec1_iter++;
      VecW count2a = *vvec2_iter++;
      VecW count3a = count1a & count2a;
      VecW count1b = *vvec1_iter++;
      VecW count2b = *vvec2_iter++;
      VecW count3b = count1b & count2b;
      VecW half1a = *vvec1_iter++;
      VecW half2a = *vvec2_iter++;
      const VecW half1b = vecw_srli(half1a, 1) & m1;
      const VecW half2b = vecw_srli(half2a, 1) & m1;
      half1a = half1a & m1;
      half2a = half2a & m1;

      // Stage 1: 2-bit field sums (x - ((x >> 1) & 0x5555...)).
      count1a = count1a - (vecw_srli(count1a, 1) & m1);
      count2a = count2a - (vecw_srli(count2a, 1) & m1);
      count3a = count3a - (vecw_srli(count3a, 1) & m1);
      count1b = count1b - (vecw_srli(count1b, 1) & m1);
      count2b = count2b - (vecw_srli(count2b, 1) & m1);
      count3b = count3b - (vecw_srli(count3b, 1) & m1);
      // Fold the split third vector in; intersection uses half1a & half2a.
      count1a = count1a + half1a;
      count2a = count2a + half2a;
      count3a = count3a + (half1a & half2a);
      count1b = count1b + half1b;
      count2b = count2b + half2b;
      count3b = count3b + (half1b & half2b);

      // Stage 2: 4-bit field sums, merging a and b partials.
      count1a = (count1a & m2) + (vecw_srli(count1a, 2) & m2);
      count2a = (count2a & m2) + (vecw_srli(count2a, 2) & m2);
      count3a = (count3a & m2) + (vecw_srli(count3a, 2) & m2);
      count1a = count1a + (count1b & m2) + (vecw_srli(count1b, 2) & m2);
      count2a = count2a + (count2b & m2) + (vecw_srli(count2b, 2) & m2);
      count3a = count3a + (count3b & m2) + (vecw_srli(count3b, 2) & m2);
      // Stage 3: 8-bit field sums accumulated into the inner accumulators.
      inner_acc1 = inner_acc1 + (count1a & m4) + (vecw_srli(count1a, 4) & m4);
      inner_acc2 = inner_acc2 + (count2a & m4) + (vecw_srli(count2a, 4) & m4);
      inner_acc3 = inner_acc3 + (count3a & m4) + (vecw_srli(count3a, 4) & m4);
    } while (vvec1_iter < vvec1_stop);
    // too much register pressure to use prev_sad_result pattern?
    const VecW m0 = vecw_setzero();
    // Widen per-byte counts into the overflow-safe outer accumulators.
    acc1 = acc1 + vecw_bytesum(inner_acc1, m0);
    acc2 = acc2 + vecw_bytesum(inner_acc2, m0);
    acc3 = acc3 + vecw_bytesum(inner_acc3, m0);
  }
}
1890
PopcountWordsIntersect3val(const uintptr_t * __restrict bitvec1,const uintptr_t * __restrict bitvec2,uint32_t word_ct,uint32_t * __restrict popcount1_ptr,uint32_t * __restrict popcount2_ptr,uint32_t * __restrict popcount_intersect_ptr)1891 void PopcountWordsIntersect3val(const uintptr_t* __restrict bitvec1, const uintptr_t* __restrict bitvec2, uint32_t word_ct, uint32_t* __restrict popcount1_ptr, uint32_t* __restrict popcount2_ptr, uint32_t* __restrict popcount_intersect_ptr) {
1892 const uint32_t trivec_ct = word_ct / (3 * kWordsPerVec);
1893 uint32_t ct1;
1894 uint32_t ct2;
1895 uint32_t ct3;
1896 PopcountVecsNoSse42Intersect3val(R_CAST(const VecW*, bitvec1), R_CAST(const VecW*, bitvec2), trivec_ct * 3, &ct1, &ct2, &ct3);
1897 const uint32_t words_consumed = trivec_ct * (3 * kWordsPerVec);
1898 bitvec1 = &(bitvec1[words_consumed]);
1899 bitvec2 = &(bitvec2[words_consumed]);
1900 const uint32_t remainder = word_ct - words_consumed;
1901 for (uint32_t widx = 0; widx != remainder; ++widx) {
1902 const uintptr_t word1 = bitvec1[widx];
1903 const uintptr_t word2 = bitvec2[widx];
1904 ct1 += PopcountWord(word1);
1905 ct2 += PopcountWord(word2);
1906 ct3 += PopcountWord(word1 & word2);
1907 }
1908 *popcount1_ptr = ct1;
1909 *popcount2_ptr = ct2;
1910 *popcount_intersect_ptr = ct3;
1911 }
1912 #endif
1913
AllBitsAreZero(const uintptr_t * bitarr,uintptr_t start_idx,uintptr_t end_idx)1914 uint32_t AllBitsAreZero(const uintptr_t* bitarr, uintptr_t start_idx, uintptr_t end_idx) {
1915 uintptr_t start_idxl = start_idx / kBitsPerWord;
1916 const uintptr_t start_idxlr = start_idx & (kBitsPerWord - 1);
1917 const uintptr_t end_idxl = end_idx / kBitsPerWord;
1918 const uintptr_t end_idxlr = end_idx & (kBitsPerWord - 1);
1919 if (start_idxl == end_idxl) {
1920 return !(bitarr[start_idxl] & ((k1LU << end_idxlr) - (k1LU << start_idxlr)));
1921 }
1922 if (start_idxlr) {
1923 if (bitarr[start_idxl++] >> start_idxlr) {
1924 return 0;
1925 }
1926 }
1927 for (; start_idxl != end_idxl; ++start_idxl) {
1928 if (bitarr[start_idxl]) {
1929 return 0;
1930 }
1931 }
1932 if (!end_idxlr) {
1933 return 1;
1934 }
1935 return !bzhi(bitarr[end_idxl], end_idxlr);
1936 }
1937
// Copies len bits starting at src_start_bitidx of src_bitarr into
// target_bitarr starting at target_start_bitidx, handling arbitrary bit
// offsets on both sides.  Bits are OR'd in, so the destination range must
// already be zero.
void CopyBitarrRange(const uintptr_t* __restrict src_bitarr, uintptr_t src_start_bitidx, uintptr_t target_start_bitidx, uintptr_t len, uintptr_t* __restrict target_bitarr) {
  // assumes len is positive, and relevant bits of target_bitarr are zero
  const uintptr_t* src_bitarr_iter = &(src_bitarr[src_start_bitidx / kBitsPerWord]);
  uint32_t src_rshift = src_start_bitidx % kBitsPerWord;
  uintptr_t* target_bitarr_iter = &(target_bitarr[target_start_bitidx / kBitsPerWord]);
  uint32_t target_initial_lshift = target_start_bitidx % kBitsPerWord;
  uintptr_t cur_src_word;
  if (target_initial_lshift) {
    // Fill the partial first target word.
    const uint32_t initial_copy_bitct = kBitsPerWord - target_initial_lshift;
    if (len <= initial_copy_bitct) {
      // Whole copy fits inside this one target word; reuse the shared tail.
      goto CopyBitarrRange_last_partial_word;
    }
    cur_src_word = (*src_bitarr_iter) >> src_rshift;
    if (src_rshift >= target_initial_lshift) {
      // Not enough bits left in the current source word; pull the low bits of
      // the next one.  (src_rshift is nonzero here since
      // target_initial_lshift is, so the shift below is < kBitsPerWord.)
      ++src_bitarr_iter;
      cur_src_word |= (*src_bitarr_iter) << (kBitsPerWord - src_rshift);
    }
    *target_bitarr_iter++ |= cur_src_word << target_initial_lshift;
    src_rshift = (src_rshift + initial_copy_bitct) % kBitsPerWord;
    len -= initial_copy_bitct;
  }
  {
    // Target is now word-aligned; copy full words.
    const uintptr_t fullword_ct = len / kBitsPerWord;
    if (!src_rshift) {
      // Source is word-aligned too: straight memcpy.
      memcpy(target_bitarr_iter, src_bitarr_iter, fullword_ct * sizeof(intptr_t));
      target_bitarr_iter = &(target_bitarr_iter[fullword_ct]);
      src_bitarr_iter = &(src_bitarr_iter[fullword_ct]);
    } else {
      // Unaligned source: stitch each target word from two source words.
      const uint32_t src_lshift = kBitsPerWord - src_rshift;
      cur_src_word = *src_bitarr_iter;
      for (uintptr_t widx = 0; widx != fullword_ct; ++widx) {
        const uintptr_t next_src_word = *(++src_bitarr_iter);
        *target_bitarr_iter++ = (cur_src_word >> src_rshift) | (next_src_word << src_lshift);
        cur_src_word = next_src_word;
      }
    }
  }
  len %= kBitsPerWord;
  if (len) {
    // Partial last target word.
    target_initial_lshift = 0;
  CopyBitarrRange_last_partial_word:
    cur_src_word = (*src_bitarr_iter) >> src_rshift;
    if (len + src_rshift > kBitsPerWord) {
      // Remaining bits straddle a source word boundary.
      cur_src_word |= src_bitarr_iter[1] << (kBitsPerWord - src_rshift);
    }
    *target_bitarr_iter |= (cur_src_word & ((~k0LU) >> (kBitsPerWord - S_CAST(uint32_t, len)))) << target_initial_lshift;
  }
}
1986
1987 // advances forward_ct set bits; forward_ct must be positive. (stays put if
1988 // forward_ct == 1 and current bit is set. may want to tweak this interface,
1989 // easy to introduce off-by-one bugs...)
1990 // In usual 64-bit case, also assumes bitvec is 16-byte aligned and the end of
1991 // the trailing 16-byte block can be safely read from.
// Returns the bit index of the forward_ct-th set bit at or after cur_pos.
// Scans in tiers: partial first word, scalar words to vector alignment,
// vector-batched popcounts while far from the target, then word-by-word to
// the final word.  Caller guarantees enough set bits exist.
uintptr_t FindNth1BitFrom(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct) {
  assert(forward_ct);
  uintptr_t widx = cur_pos / kBitsPerWord;
  uintptr_t ulii = cur_pos % kBitsPerWord;
  const uintptr_t* bptr = &(bitvec[widx]);
  uintptr_t uljj;
  uintptr_t ulkk;
#ifdef __LP64__
  const VecW* vptr;
  assert(VecIsAligned(bitvec));
#endif
  if (ulii) {
    // Partial first word: only bits at/after cur_pos count.
    uljj = (*bptr) >> ulii;
    ulkk = PopcountWord(uljj);
    if (ulkk >= forward_ct) {
      // Target bit lies in uljj; shared finisher converts the remaining
      // within-word rank to a global bit index.
    JumpForwardSetUnsafe_finish:
      return widx * kBitsPerWord + ulii + WordBitIdxToUidx(uljj, forward_ct - 1);
    }
    forward_ct -= ulkk;
    ++widx;
    ++bptr;
  }
  ulii = 0;
#ifdef __LP64__
  // Scalar words until bptr reaches vector alignment.
  while (widx & (kWordsPerVec - k1LU)) {
    uljj = *bptr;
    ulkk = PopcountWord(uljj);
    if (ulkk >= forward_ct) {
      goto JumpForwardSetUnsafe_finish;
    }
    forward_ct -= ulkk;
    ++widx;
    ++bptr;
  }
  vptr = R_CAST(const VecW*, bptr);
#ifdef USE_AVX2
  // Batched vector popcounts; uljj is chosen so at most forward_ct - 1 set
  // bits can be consumed per batch, guaranteeing no overshoot.
  while (forward_ct > kBitsPerWord * (16 * kWordsPerVec)) {
    uljj = (forward_ct - 1) / (kBitsPerWord * kWordsPerVec);
    ulkk = PopcountVecsAvx2(vptr, uljj);
    vptr = &(vptr[uljj]);
    forward_ct -= ulkk;
  }
#else
  // Same idea, but batch size must be a multiple of 3 vectors for the
  // bit-sliced kernel.
  while (forward_ct > kBitsPerWord * (3 * kWordsPerVec)) {
    uljj = ((forward_ct - 1) / (kBitsPerWord * (3 * kWordsPerVec))) * 3;
    // yeah, yeah, this is suboptimal if we have SSE4.2
    ulkk = PopcountVecsNoAvx2(vptr, uljj);
    vptr = &(vptr[uljj]);
    forward_ct -= ulkk;
  }
#endif
  bptr = R_CAST(const uintptr_t*, vptr);
  // Word-by-word until within one word of the target.
  while (forward_ct > kBitsPerWord) {
    forward_ct -= PopcountWord(*bptr++);
  }
#else
  // 32-bit build: batch with word popcounts instead of vectors.
  while (forward_ct > kBitsPerWord) {
    uljj = (forward_ct - 1) / kBitsPerWord;
    ulkk = PopcountWords(bptr, uljj);
    bptr = &(bptr[uljj]);
    forward_ct -= ulkk;
  }
#endif
  // Final scan: locate the word containing the target bit.
  for (; ; ++bptr) {
    uljj = *bptr;
    ulkk = PopcountWord(uljj);
    if (ulkk >= forward_ct) {
      widx = bptr - bitvec;
      goto JumpForwardSetUnsafe_finish;
    }
    forward_ct -= ulkk;
  }
}
2065
ComputeUidxStartPartition(const uintptr_t * variant_include,uint64_t variant_ct,uint32_t thread_ct,uint32_t first_variant_uidx,uint32_t * variant_uidx_starts)2066 void ComputeUidxStartPartition(const uintptr_t* variant_include, uint64_t variant_ct, uint32_t thread_ct, uint32_t first_variant_uidx, uint32_t* variant_uidx_starts) {
2067 assert(variant_ct);
2068 uint32_t cur_variant_uidx_start = AdvTo1Bit(variant_include, first_variant_uidx);
2069 uint32_t cur_variant_idx_start = 0;
2070 variant_uidx_starts[0] = cur_variant_uidx_start;
2071 for (uint32_t tidx = 1; tidx != thread_ct; ++tidx) {
2072 const uint32_t new_variant_idx_start = (tidx * variant_ct) / thread_ct;
2073 if (new_variant_idx_start != cur_variant_idx_start) {
2074 cur_variant_uidx_start = FindNth1BitFrom(variant_include, cur_variant_uidx_start + 1, new_variant_idx_start - cur_variant_idx_start);
2075 cur_variant_idx_start = new_variant_idx_start;
2076 }
2077 variant_uidx_starts[tidx] = cur_variant_uidx_start;
2078 }
2079 }
2080
2081 // May want to have an multiallelic_set bitarray to accelerate this type of
2082 // operation? Probably only want to conditionally initialize it, and only
2083 // after variant filtering is complete, though.
// Sums, over included variants in [variant_uidx_start, variant_uidx_end),
// (allele_ct - 2) for each multiallelic variant -- or (allele_ct - 1) when
// multiallelic_variant_ct_multiplier is 1, which additionally counts each
// such variant once.  Returns 0 when allele_idx_offsets is nullptr (all
// variants biallelic).
uintptr_t CountExtraAlleles(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t multiallelic_variant_ct_multiplier) {
  if (!allele_idx_offsets) {
    return 0;
  }
  const uint32_t variant_ct = PopcountBitRange(variant_include, variant_uidx_start, variant_uidx_end);
  const uint32_t base_subtract = 2 - multiallelic_variant_ct_multiplier;
  uintptr_t extra_ct = 0;
  uintptr_t uidx_base;
  uintptr_t bits;
  BitIter1Start(variant_include, variant_uidx_start, &uidx_base, &bits);
  for (uint32_t vidx = 0; vidx != variant_ct; ++vidx) {
    const uintptr_t uidx = BitIter1(variant_include, &uidx_base, &bits);
    const uintptr_t allele_ct = allele_idx_offsets[uidx + 1] - allele_idx_offsets[uidx];
    if (allele_ct != 2) {
      extra_ct += allele_ct - base_subtract;
    }
  }
  return extra_ct;
}
2103
MaxAlleleCtSubset(const uintptr_t * variant_include,const uintptr_t * allele_idx_offsets,uint32_t raw_variant_ct,uint32_t variant_ct,uint32_t max_allele_ct)2104 uint32_t MaxAlleleCtSubset(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct) {
2105 if (!allele_idx_offsets) {
2106 return 2;
2107 }
2108 if (raw_variant_ct == variant_ct) {
2109 return max_allele_ct;
2110 }
2111 uintptr_t variant_uidx_base = 0;
2112 uintptr_t cur_bits = variant_include[0];
2113 uintptr_t subset_max_allele_ct = 2;
2114 for (uint32_t uii = 0; uii != variant_ct; ++uii) {
2115 const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
2116 const uint32_t allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offsets[variant_uidx];
2117 if (allele_ct > subset_max_allele_ct) {
2118 if (allele_ct == max_allele_ct) {
2119 return max_allele_ct;
2120 }
2121 subset_max_allele_ct = allele_ct;
2122 }
2123 }
2124 return subset_max_allele_ct;
2125 }
2126
void ComputePartitionAligned(const uintptr_t* variant_include, uint32_t orig_thread_ct, uint32_t first_variant_uidx, uint32_t cur_variant_idx, uint32_t cur_variant_ct, uint32_t alignment, uint32_t* variant_uidx_starts, uint32_t* vidx_starts) {
  // Minimize size of the largest chunk, under the condition that all
  // intermediate variant_idx values are divisible by alignment.
  //
  // There are three possibilities:
  // 1. The straightforward solution one gets from rounding cur_variant_idx
  //    down, and (cur_variant_idx + variant_ct) up, is optimal.  Call this
  //    chunk size C.
  // 2. C - leading_idx_ct is ideal.
  // 3. C - trailing_idx_ct is ideal.
  //
  // Outputs: variant_uidx_starts[tidx] = raw index of thread tidx's first
  // variant; vidx_starts[tidx] = its filtered index, with
  // vidx_starts[orig_thread_ct] = one-past-the-end sentinel.
  // alignment must be a power of 2 (see ctzu32/log2 use below).

  assert(cur_variant_ct);
  const uint32_t variant_idx_end = cur_variant_idx + cur_variant_ct;
  const uint32_t log2_align = ctzu32(alignment);
  // Distance from cur_variant_idx down to the previous aligned boundary, and
  // from variant_idx_end up to the next one.
  const uint32_t leading_idx_ct = cur_variant_idx % alignment;
  const uint32_t trailing_idx_ct = (-variant_idx_end) % alignment;
  // Total aligned blocks spanned after rounding both ends outward.
  const uint32_t block_ct = (cur_variant_ct + leading_idx_ct + trailing_idx_ct) >> log2_align;
  uint32_t cur_variant_uidx_start = AdvTo1Bit(variant_include, first_variant_uidx);
  variant_uidx_starts[0] = cur_variant_uidx_start;
  vidx_starts[0] = cur_variant_idx;
  // No point running more threads than there are blocks.
  const uint32_t thread_ct = MINV(orig_thread_ct, block_ct);
  if (thread_ct > 1) {
    const uint32_t std_blocks_per_thread = 1 + (block_ct - 1) / thread_ct;
    // Possibilities 2 and 3 are only live if
    //   block_ct == (std_blocks_per_thread - 1) * thread_ct + 1, or
    //   block_ct == (std_blocks_per_thread - 1) * thread_ct + 2,
    // In the first situation, the best solution is
    //   min(possibility 2, possibility 3).
    // In the second situation, the best solution is
    //   max(possibility 2, possibility 3).
    uint32_t central_variant_ct = std_blocks_per_thread * alignment;
    uint32_t first_block_variant_ct = central_variant_ct - leading_idx_ct;
    // remainder_m1 = (number of threads needing the full std block count) - 1
    const uint32_t remainder_m1 = block_ct - (std_blocks_per_thread - 1) * thread_ct - 1;
    if (remainder_m1 <= 1) {
      central_variant_ct -= alignment;
      if ((!remainder_m1) && (leading_idx_ct < trailing_idx_ct)) {
        first_block_variant_ct -= alignment;
      }
    }
    cur_variant_uidx_start = FindNth1BitFrom(variant_include, cur_variant_uidx_start + 1, first_block_variant_ct);
    cur_variant_idx += first_block_variant_ct;
    variant_uidx_starts[1] = cur_variant_uidx_start;
    vidx_starts[1] = cur_variant_idx;
    for (uint32_t tidx = 2; tidx != thread_ct; ++tidx) {
      cur_variant_uidx_start = FindNth1BitFrom(variant_include, cur_variant_uidx_start + 1, central_variant_ct);
      cur_variant_idx += central_variant_ct;
      // bugfix (14 Nov 2017): this decrement was in the wrong place
      if (tidx == remainder_m1) {
        // Threads past the remainder point get one fewer block.
        central_variant_ct -= alignment;
      }
      variant_uidx_starts[tidx] = cur_variant_uidx_start;
      vidx_starts[tidx] = cur_variant_idx;
    }
  }
  if (thread_ct < orig_thread_ct) {
    // More threads than blocks: surplus threads get empty [end, end) ranges.
    uint32_t last_vidx_ct = variant_idx_end - vidx_starts[thread_ct - 1];
    cur_variant_uidx_start = FindNth1BitFrom(variant_include, cur_variant_uidx_start + 1, last_vidx_ct);
    for (uint32_t tidx = thread_ct; tidx != orig_thread_ct; ++tidx) {
      variant_uidx_starts[tidx] = cur_variant_uidx_start;
    }
    for (uint32_t tidx = thread_ct; tidx != orig_thread_ct; ++tidx) {
      vidx_starts[tidx] = variant_idx_end;
    }
  }
  vidx_starts[orig_thread_ct] = variant_idx_end;
}
2193
BoolErr ParseNextRange(const char* const* argvk, uint32_t param_ct, char range_delim, uint32_t* cur_param_idx_ptr, const char** cur_arg_pptr, const char** range_start_ptr, uint32_t* rs_len_ptr, const char** range_end_ptr, uint32_t* re_len_ptr) {
  // Starts reading from argv[cur_param_idx][cur_pos]. If a valid range is
  // next, range_start + rs_len + range_end + re_len are updated. If only a
  // single item is next, range_end is set to nullptr and range_start + rs_len
  // are updated. If there are no items left, range_start is set to nullptr.
  // If the input is not well-formed, 1 (BoolErr) is returned instead of 0.
  uint32_t cur_param_idx = *cur_param_idx_ptr;
  if (cur_param_idx > param_ct) {
    // NOTE(review): this branch sets *cur_arg_pptr, not *range_start_ptr, so
    // a caller that only tests range_start would see a stale value; confirm
    // this entry state is unreachable in practice (callers stop once
    // range_start is nulled inside the loop below).
    *cur_arg_pptr = nullptr;
    return 0;
  }
  const char* cur_arg_ptr = *cur_arg_pptr;
  // Skip commas and exhausted arguments to find the next item's first char.
  while (1) {
    char cc = *cur_arg_ptr;
    if (!cc) {
      // Current argument exhausted; move to the next one.
      *cur_param_idx_ptr = ++cur_param_idx;
      if (cur_param_idx > param_ct) {
        *range_start_ptr = nullptr;
        return 0;
      }
      cur_arg_ptr = argvk[cur_param_idx];
      cc = *cur_arg_ptr;
    }
    if (unlikely(cc == range_delim)) {
      // An item can't begin with the range delimiter.
      return 1;
    }
    if (cc != ',') {
      break;
    }
    ++cur_arg_ptr;
  }
  *range_start_ptr = cur_arg_ptr;
  char cc;
  // Scan the first name, ending at NUL, ',', or range_delim.
  do {
    cc = *(++cur_arg_ptr);
    if ((!cc) || (cc == ',')) {
      // Single item, no range.
      *rs_len_ptr = cur_arg_ptr - (*range_start_ptr);
      *cur_arg_pptr = cur_arg_ptr;
      *range_end_ptr = nullptr;
      return 0;
    }
  } while (cc != range_delim);
  *rs_len_ptr = cur_arg_ptr - (*range_start_ptr);
  cc = *(++cur_arg_ptr);
  // Range form: the start must fit in an ID, and the end must be nonempty and
  // not immediately terminated/delimited.
  if (unlikely(((*rs_len_ptr) > kMaxIdSlen) || (!cc) || (cc == ',') || (cc == range_delim))) {
    return 1;
  }
  *range_end_ptr = cur_arg_ptr;
  // Scan the second name; a second range delimiter is malformed.
  do {
    cc = *(++cur_arg_ptr);
    if (unlikely(cc == range_delim)) {
      return 1;
    }
  } while (cc && (cc != ','));
  *re_len_ptr = cur_arg_ptr - (*range_end_ptr);
  if (unlikely((*re_len_ptr) > kMaxIdSlen)) {
    return 1;
  }
  *cur_arg_pptr = cur_arg_ptr;
  return 0;
}
2255
ParseNameRanges(const char * const * argvk,const char * errstr_append,uint32_t param_ct,uint32_t require_posint,char range_delim,RangeList * range_list_ptr)2256 PglErr ParseNameRanges(const char* const* argvk, const char* errstr_append, uint32_t param_ct, uint32_t require_posint, char range_delim, RangeList* range_list_ptr) {
2257 uint32_t name_ct = 0;
2258 uint32_t cur_param_idx = 1;
2259 uint32_t name_max_blen = 0;
2260 const char* cur_arg_ptr;
2261 const char* range_start;
2262 uint32_t rs_len;
2263 const char* range_end;
2264 uint32_t re_len;
2265 unsigned char* cur_name_starts_range;
2266 uint32_t last_val;
2267 uint32_t cur_val;
2268 // two passes. first pass: count arguments, determine name_max_blen;
2269 // then allocate memory; then fill it.
2270 if (param_ct) {
2271 cur_arg_ptr = argvk[1];
2272 while (1) {
2273 if (unlikely(ParseNextRange(argvk, param_ct, range_delim, &cur_param_idx, &cur_arg_ptr, &range_start, &rs_len, &range_end, &re_len))) {
2274 logerrprintfww("Error: Invalid %s argument '%s'.\n", argvk[0], argvk[cur_param_idx]);
2275 logerrputs(errstr_append);
2276 return kPglRetInvalidCmdline;
2277 }
2278 if (!range_start) {
2279 break;
2280 }
2281 ++name_ct;
2282 if (rs_len > name_max_blen) {
2283 name_max_blen = rs_len; // does NOT include trailing null yet
2284 }
2285 if (range_end) {
2286 ++name_ct;
2287 if (re_len > name_max_blen) {
2288 name_max_blen = re_len;
2289 }
2290 }
2291 }
2292 }
2293 if (unlikely(!name_ct)) {
2294 logerrprintf("Error: %s requires at least one value.\n%s", argvk[0], errstr_append);
2295 return kPglRetInvalidCmdline;
2296 }
2297 range_list_ptr->name_max_blen = ++name_max_blen;
2298 range_list_ptr->name_ct = name_ct;
2299 if (unlikely(pgl_malloc(name_ct * (S_CAST(uintptr_t, name_max_blen) + 1), &range_list_ptr->names))) {
2300 return kPglRetNomem;
2301 }
2302 range_list_ptr->starts_range = R_CAST(unsigned char*, &(range_list_ptr->names[name_ct * S_CAST(uintptr_t, name_max_blen)]));
2303 char* cur_name_str = range_list_ptr->names;
2304 cur_name_starts_range = range_list_ptr->starts_range;
2305 cur_param_idx = 1;
2306 cur_arg_ptr = argvk[1];
2307 while (1) {
2308 // second pass; this can't fail since we already validated
2309 ParseNextRange(argvk, param_ct, range_delim, &cur_param_idx, &cur_arg_ptr, &range_start, &rs_len, &range_end, &re_len);
2310 if (!range_start) {
2311 if (require_posint) {
2312 last_val = 0;
2313 for (cur_param_idx = 0; cur_param_idx != name_ct; ++cur_param_idx) {
2314 cur_name_str = &(range_list_ptr->names[cur_param_idx * S_CAST(uintptr_t, name_max_blen)]);
2315 const char* dup_check = cur_name_str; // actually a numeric check
2316 do {
2317 if (unlikely(IsNotDigit(*dup_check))) {
2318 logerrprintfww("Error: Invalid %s argument '%s'.\n", argvk[0], cur_name_str);
2319 return kPglRetInvalidCmdline;
2320 }
2321 } while (*(++dup_check));
2322 if (unlikely(ScanPosintDefcapx(cur_name_str, &cur_val))) {
2323 logerrprintfww("Error: Invalid %s argument '%s'.\n", argvk[0], cur_name_str);
2324 return kPglRetInvalidCmdline;
2325 }
2326 if (range_list_ptr->starts_range[cur_param_idx]) {
2327 last_val = cur_val;
2328 } else {
2329 if (unlikely(cur_val <= last_val)) {
2330 logerrprintfww("Error: Invalid %s range '%s-%s'.\n", argvk[0], &(range_list_ptr->names[(cur_param_idx - 1) * name_max_blen]), cur_name_str);
2331 return kPglRetInvalidCmdline;
2332 }
2333 last_val = 0;
2334 }
2335 }
2336 }
2337 return kPglRetSuccess;
2338 }
2339 memcpyx(cur_name_str, range_start, rs_len, '\0');
2340 const char* dup_check = range_list_ptr->names;
2341 while (dup_check < cur_name_str) {
2342 if (unlikely(memequal(dup_check, cur_name_str, rs_len + 1))) {
2343 logerrprintfww("Error: Duplicate %s argument '%s'.\n", argvk[0], cur_name_str);
2344 return kPglRetInvalidCmdline;
2345 }
2346 dup_check = &(dup_check[name_max_blen]);
2347 }
2348 cur_name_str = &(cur_name_str[name_max_blen]);
2349 if (range_end) {
2350 *cur_name_starts_range++ = 1;
2351 memcpyx(cur_name_str, range_end, re_len, '\0');
2352 dup_check = range_list_ptr->names;
2353 while (dup_check < cur_name_str) {
2354 if (unlikely(memequal(dup_check, cur_name_str, rs_len + 1))) {
2355 logerrprintfww("Error: Duplicate %s argument '%s'.\n", argvk[0], cur_name_str);
2356 return kPglRetInvalidCmdline;
2357 }
2358 dup_check = &(dup_check[name_max_blen]);
2359 }
2360 cur_name_str = &(cur_name_str[name_max_blen]);
2361 *cur_name_starts_range++ = 0;
2362 } else {
2363 *cur_name_starts_range++ = 0;
2364 }
2365 }
2366 }
2367
2368
// Finds the real roots of x^3 + coef_a * x^2 + coef_b * x + coef_c = 0 via
// the standard trigonometric/Cardano closed-form method (cf. Numerical
// Recipes sec. 5.6), writes them to solutions[] in increasing order, and
// returns the number of distinct real roots (1..3).  Roots closer than
// kEpsilon are merged and counted once.
uint32_t CubicRealRoots(double coef_a, double coef_b, double coef_c, STD_ARRAY_REF(double, 3) solutions) {
  // Additional research into numerical stability may be in order here...
  double a2 = coef_a * coef_a;
  double qq = (a2 - 3 * coef_b) * (1.0 / 9.0);
  double rr = (2 * a2 * coef_a - 9 * coef_a * coef_b + 27 * coef_c) * (1.0 / 54.0);
  double r2 = rr * rr;
  double q3 = qq * qq * qq;
  double adiv3 = coef_a * (1.0 / 3.0);
  double sq;
  double dxx;
  if (r2 < q3) {
    // three real roots
    sq = sqrt(qq);
    dxx = acos(rr / (qq * sq)) * (1.0 / 3.0);
    sq *= -2;
    solutions[0] = sq * cos(dxx) - adiv3;
    solutions[1] = sq * cos(dxx + (2.0 * kPi / 3.0)) - adiv3;
    solutions[2] = sq * cos(dxx - (2.0 * kPi / 3.0)) - adiv3;
    // now sort and check for within-epsilon equality
    // (hand-rolled 3-element sort)
    if (solutions[0] > solutions[1]) {
      dxx = solutions[0];
      solutions[0] = solutions[1];
      if (dxx > solutions[2]) {
        solutions[1] = solutions[2];
        solutions[2] = dxx;
      } else {
        solutions[1] = dxx;
      }
      if (solutions[0] > solutions[1]) {
        dxx = solutions[0];
        solutions[0] = solutions[1];
        solutions[1] = dxx;
      }
    } else if (solutions[1] > solutions[2]) {
      dxx = solutions[1];
      solutions[1] = solutions[2];
      solutions[2] = dxx;
    }
    if (solutions[1] - solutions[0] < kEpsilon) {
      // bottom pair merged; the top root may collapse into them too, in
      // which case only one distinct root remains
      solutions[1] = solutions[2];
      return 2 - (solutions[1] - solutions[0] < kEpsilon);
    }
    return 3 - (solutions[2] - solutions[1] < kEpsilon);
  }
  // r2 >= q3: one real root, or a double root
  dxx = -pow(fabs(rr) + sqrt(r2 - q3), 1.0 / 3.0);
  if (dxx == 0.0) {
    solutions[0] = -adiv3;
    return 1;
  }
  if (rr < 0.0) {
    dxx = -dxx;
  }
  sq = qq / dxx;
  solutions[0] = dxx + sq - adiv3;
  // use of regular epsilon here has actually burned us
  if (fabs(dxx - sq) >= (kEpsilon * 8)) {
    return 1;
  }
  // double-root case; order the two distinct values ascending
  if (dxx >= 0.0) {
    solutions[1] = solutions[0];
    solutions[0] = -dxx - adiv3;
  } else {
    solutions[1] = -dxx - adiv3;
  }
  return 2;
}
2435
2436
CONSTI32(kMaxDupflagThreads, 16);

// Shared state for the MakeDupflagHtable workers (including the parent
// thread, which acts as the last worker).
typedef struct DupflagHtableMakerStruct {
  // Read-only during the fill phase:
  const uintptr_t* subset_mask;  // bitmask selecting the active items
  const char* const* item_ids;   // indexed by raw (pre-subset) item index
  uint32_t item_ct;              // number of selected items
  uint32_t id_htable_size;
  // item_uidx_starts[tidx] = raw index of thread tidx's first item.
  uint32_t item_uidx_starts[kMaxDupflagThreads];

  // Open-addressing table; written concurrently via compare-and-swap.
  uint32_t* id_htable;
} DupflagHtableMaker;
2448
// Table-fill work shared by all threads (the parent calls this directly with
// tidx == thread_ct - 1): inserts this thread's [item_idx_start,
// item_idx_end) slice of selected items into the open-addressing table,
// setting the high bit of a stored index when its ID occurs more than once.
void DupflagHtableMakerMain(uint32_t tidx, uint32_t thread_ct, DupflagHtableMaker* ctx) {
  const uint32_t id_htable_size = ctx->id_htable_size;
  const uintptr_t* subset_mask = ctx->subset_mask;
  const char* const* item_ids = ctx->item_ids;
  const uint32_t item_ct = ctx->item_ct;
  const uint32_t item_uidx_start = ctx->item_uidx_starts[tidx];
  const uint32_t item_idx_start = (item_ct * S_CAST(uint64_t, tidx)) / thread_ct;
  const uint32_t item_idx_end = (item_ct * (S_CAST(uint64_t, tidx) + 1)) / thread_ct;
  uint32_t* id_htable = ctx->id_htable;

  uintptr_t cur_bits;
  uintptr_t item_uidx_base;
  BitIter1Start(subset_mask, item_uidx_start, &item_uidx_base, &cur_bits);
  for (uint32_t item_idx = item_idx_start; item_idx != item_idx_end; ++item_idx) {
    const uintptr_t item_uidx = BitIter1(subset_mask, &item_uidx_base, &cur_bits);
    const char* sptr = item_ids[item_uidx];
    const uint32_t slen = strlen(sptr);
    // linear-probing insert
    for (uint32_t hashval = Hashceil(sptr, slen, id_htable_size); ; ) {
      uint32_t old_htable_entry = id_htable[hashval];
      if (old_htable_entry == UINT32_MAX) {
        // Empty slot: try to claim it.  On CAS failure, old_htable_entry is
        // refreshed with the winning thread's value and we fall through to
        // the duplicate check below.
        if (ATOMIC_COMPARE_EXCHANGE_N_U32(&(id_htable[hashval]), &old_htable_entry, item_uidx, 0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
          break;
        }
      }
      if (strequal_overread(sptr, item_ids[old_htable_entry & 0x7fffffff])) {
        // same ID already present: flag it as duplicated
        if (!(old_htable_entry & 0x80000000U)) {
          // ok if multiple threads do this
          id_htable[hashval] = old_htable_entry | 0x80000000U;
        }
        break;
      }
      if (++hashval == id_htable_size) {
        hashval = 0;
      }
    }
  }
}
2486
THREAD_FUNC_DECL DupflagHtableMakerThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uint32_t tidx = arg->tidx;
  DupflagHtableMaker* ctx = S_CAST(DupflagHtableMaker*, arg->sharedp->context);

  // 1. Initialize this thread's cacheline-aligned slice of id_htable with
  //    1-bits, in parallel.  (thread_ct is GetThreadCt() + 1 because the
  //    parent thread fills the final slice itself; see MakeDupflagHtable.)
  const uint32_t id_htable_size = ctx->id_htable_size;
  const uint32_t thread_ct = GetThreadCt(arg->sharedp) + 1;
  uint32_t* id_htable = ctx->id_htable;
  const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, tidx)) / thread_ct, kInt32PerCacheline);
  const uint32_t fill_end = RoundDownPow2((id_htable_size * (S_CAST(uint64_t, tidx) + 1)) / thread_ct, kInt32PerCacheline);
  SetAllU32Arr(fill_end - fill_start, &(id_htable[fill_start]));

  // 2. sync.Once
  if (THREAD_BLOCK_FINISH(arg)) {
    THREAD_RETURN;
  }

  // 3. Fill hash table in parallel, and then return.
  DupflagHtableMakerMain(tidx, thread_ct, ctx);
  THREAD_RETURN;
}
2509
// Builds an open-addressing ID -> raw-index hash table (all id_htable_size
// entries initialized here) for the subset_mask-selected item_ids; the high
// bit of an entry is set iff that ID occurs more than once.  Parallelizes
// across min(item_ct / 65536, max_thread_ct, kMaxDupflagThreads) threads,
// with the calling thread doubling as the last worker.
PglErr MakeDupflagHtable(const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t id_htable_size, uint32_t max_thread_ct, uint32_t* id_htable) {
  PglErr reterr = kPglRetSuccess;
  ThreadGroup tg;
  PreinitThreads(&tg);
  DupflagHtableMaker ctx;
  {
    uint32_t thread_ct = item_ct / 65536;
    if (!thread_ct) {
      thread_ct = 1;
    } else {
      if (thread_ct > max_thread_ct) {
        thread_ct = max_thread_ct;
      }
      // bugfix (26 Nov 2019): forgot to force this
      if (thread_ct > kMaxDupflagThreads) {
        thread_ct = kMaxDupflagThreads;
      }
    }
    if (unlikely(SetThreadCt0(thread_ct - 1, &tg))) {
      goto MakeDupflagHtable_ret_NOMEM;
    }

    ctx.subset_mask = subset_mask;
    ctx.item_ids = item_ids;
    ctx.item_ct = item_ct;
    ctx.id_htable_size = id_htable_size;
    ctx.id_htable = id_htable;

    // Precompute each worker's starting raw index so it can use cheap
    // forward bit-iteration from there.
    uint32_t item_uidx = AdvTo1Bit(subset_mask, 0);
    uint32_t item_idx = 0;
    ctx.item_uidx_starts[0] = item_uidx;
    for (uintptr_t tidx = 1; tidx != thread_ct; ++tidx) {
      const uint32_t item_idx_new = (item_ct * S_CAST(uint64_t, tidx)) / thread_ct;
      item_uidx = FindNth1BitFrom(subset_mask, item_uidx + 1, item_idx_new - item_idx);
      ctx.item_uidx_starts[tidx] = item_uidx;
      item_idx = item_idx_new;
    }

    if (thread_ct > 1) {
      SetThreadFuncAndData(DupflagHtableMakerThread, &ctx, &tg);
      if (unlikely(SpawnThreads(&tg))) {
        goto MakeDupflagHtable_ret_THREAD_CREATE_FAIL;
      }
    }
    // Parent initializes the last slice of the table (workers cover the
    // rest), then re-releases the workers for the fill phase...
    const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, thread_ct - 1)) / thread_ct, kInt32PerCacheline);
    SetAllU32Arr(id_htable_size - fill_start, &(id_htable[fill_start]));
    if (thread_ct > 1) {
      JoinThreads(&tg);
      DeclareLastThreadBlock(&tg);
      SpawnThreads(&tg);
    }
    // ...and participates as worker #(thread_ct - 1).
    DupflagHtableMakerMain(thread_ct - 1, thread_ct, &ctx);
    JoinThreads0(&tg);
  }
  while (0) {
  MakeDupflagHtable_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  MakeDupflagHtable_ret_THREAD_CREATE_FAIL:
    reterr = kPglRetThreadCreateFail;
    break;
  }
  CleanupThreads(&tg);
  return reterr;
}
2575
CONSTI32(kMaxDupstoreThreads, 15);  // probably reduce this after switching to XXH3
// Number of items each worker hashes per synchronization pass.
CONSTI32(kDupstoreBlockSize, 65536);
// Per-worker share of the two double-buffered hash arrays.
CONSTI32(kDupstoreThreadWkspace, kDupstoreBlockSize * 2 * sizeof(int32_t));
2579
// Shared state for the PopulateIdHtableMt store_all_dups pipeline: worker
// threads only precompute hash values (double-buffered), while the parent
// thread performs all hash-table insertions and duplicate-list maintenance.
typedef struct DupstoreHtableMakerStruct {
  const uintptr_t* subset_mask;
  const char* const* item_ids;
  uint32_t item_ct;
  uint32_t id_htable_size;
  uint32_t* id_htable;

  // item_uidx_start[parity] = raw index of the first item of the block
  // currently being hashed (advanced by the last worker each pass).
  uint32_t item_uidx_start[2];
  // hashes[parity][idx] = Hashceil() value for the idx-th item of the block;
  // each buffer holds thread_ct * kDupstoreBlockSize entries.
  uint32_t* hashes[2];
} DupstoreHtableMaker;
2590
// Worker for PopulateIdHtableMt's store_all_dups path: each pass, hashes its
// kDupstoreBlockSize-item share of the current block into ctx->hashes[parity],
// double-buffered against the parent thread's table-insertion pass.
THREAD_FUNC_DECL DupstoreHtableMakerThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uint32_t tidx = arg->tidx;
  DupstoreHtableMaker* ctx = S_CAST(DupstoreHtableMaker*, arg->sharedp->context);

  // 1. Initialize id_htable with 1-bits in parallel.
  const uint32_t id_htable_size = ctx->id_htable_size;
  const uint32_t thread_ct = GetThreadCt(arg->sharedp);
  uint32_t* id_htable = ctx->id_htable;
  // Add 1 to thread_ct in denominator, since unlike the DupflagHtableMaker
  // case, the parent thread has separate logic to help out here.
  const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, tidx)) / (thread_ct + 1), kInt32PerCacheline);
  uint32_t fill_end = RoundDownPow2((id_htable_size * (S_CAST(uint64_t, tidx) + 1)) / (thread_ct + 1), kInt32PerCacheline);
  SetAllU32Arr(fill_end - fill_start, &(id_htable[fill_start]));

  const uintptr_t* subset_mask = ctx->subset_mask;
  const char* const* item_ids = ctx->item_ids;
  uint32_t items_left = ctx->item_ct;
  // This thread always handles offsets [tidx * blocksize, (tidx+1) *
  // blocksize) within each block.
  const uint32_t idx_start_offset = tidx * kDupstoreBlockSize;
  uint32_t idx_stop_offset = idx_start_offset + kDupstoreBlockSize;
  const uint32_t is_last_thread = (tidx + 1 == thread_ct);
  uint32_t parity = 0;
  while (!THREAD_BLOCK_FINISH(arg)) {
    // 2. Compute up to kDupstoreBlockSize hashes. (Parent thread is
    // responsible for using them to update the hash table.)
    if (items_left < idx_stop_offset) {
      if (items_left <= idx_start_offset) {
        // Must be last iteration. Don't need to update parity.
        // (Could load-balance this iteration differently, but it shouldn't
        // really matter.)
        continue;
      }
      idx_stop_offset = items_left;
    }
    uint32_t* hashes = ctx->hashes[parity];
    // bugfix (24 Jan 2020): IsSet(subset_mask, item_uidx) is not always true.
    uintptr_t item_uidx = FindNth1BitFrom(subset_mask, ctx->item_uidx_start[parity], idx_start_offset + 1);

    uintptr_t cur_bits;
    uintptr_t item_uidx_base;
    BitIter1Start(subset_mask, item_uidx, &item_uidx_base, &cur_bits);
    for (uint32_t item_idx = idx_start_offset; item_idx != idx_stop_offset; ++item_idx) {
      item_uidx = BitIter1(subset_mask, &item_uidx_base, &cur_bits);
      const char* sptr = item_ids[item_uidx];
      const uint32_t slen = strlen(sptr);
      hashes[item_idx] = Hashceil(sptr, slen, id_htable_size);
    }
    items_left -= thread_ct * kDupstoreBlockSize;  // final-iteration underflow ok
    parity = 1 - parity;
    if (is_last_thread) {
      // Publish the next block's starting raw index for all workers.
      ctx->item_uidx_start[parity] = item_uidx + 1;
    }
  }
  THREAD_RETURN;
}
2646
PopulateIdHtableMtDupstoreThreadCt(uint32_t max_thread_ct,uint32_t item_ct)2647 uint32_t PopulateIdHtableMtDupstoreThreadCt(uint32_t max_thread_ct, uint32_t item_ct) {
2648 uint32_t thread_ct = item_ct / (2 * kDupstoreBlockSize);
2649 if (thread_ct >= max_thread_ct) {
2650 // parent thread is sufficiently busy
2651 thread_ct = max_thread_ct - 1;
2652 }
2653 if (!thread_ct) {
2654 return 1;
2655 }
2656 return MINV(thread_ct, kMaxDupstoreThreads);
2657 }
2658
// dup_ct assumed to be initialized to 0 when dup_ct_ptr != nullptr.
//
// This currently has totally separate code paths for the store_all_dups and
// !store_all_dups cases. However, the table formats are nearly identical, so
// the code may re-converge, and it's reasonable to have just this API entry
// point.
//
// It will probably be moved out of plink2_cmdline soon.
PglErr PopulateIdHtableMt(unsigned char* arena_top, const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t store_all_dups, uint32_t id_htable_size, uint32_t thread_ct, unsigned char** arena_bottom_ptr, uint32_t* id_htable, uint32_t* dup_ct_ptr) {
  // Change from plink 1.9: if store_all_dups is false, we don't error out on
  // the first encountered duplicate ID; instead, we just flag it in the hash
  // table. So if '.' is the only duplicate ID, and it never appears in a
  // variant ID list, plink2 never complains.
  //
  // When store_all_dups is true, additional linked lists are allocated past
  // the end of id_htable to track all raw indexes of duplicate names.
  if (!item_ct) {
    return kPglRetSuccess;
  }
  if (!store_all_dups) {
    return MakeDupflagHtable(subset_mask, item_ids, item_ct, id_htable_size, thread_ct, id_htable);
  }
  PglErr reterr = kPglRetSuccess;
  unsigned char* arena_bottom_mark = *arena_bottom_ptr;
  ThreadGroup tg;
  PreinitThreads(&tg);
  DupstoreHtableMaker ctx;
  {
    thread_ct = PopulateIdHtableMtDupstoreThreadCt(thread_ct, item_ct);
    // Two hash buffers (double-buffered), each big enough for every worker's
    // kDupstoreBlockSize-item share of one pass.
    uint32_t item_idx_stop = thread_ct * kDupstoreBlockSize;
    if (arena_end_alloc_u32(arena_bottom_mark, item_idx_stop, &arena_top, &ctx.hashes[0]) ||
        arena_end_alloc_u32(arena_bottom_mark, item_idx_stop, &arena_top, &ctx.hashes[1]) ||
        SetThreadCt(thread_ct, &tg)) {
      goto PopulateIdHtableMt_ret_NOMEM;
    }
    uintptr_t cur_arena_left = arena_top - arena_bottom_mark;
    // We only want to check for out-of-memory once per sync-iteration. The
    // maximum number of 2x-uint32 entries that might be added per iteration is
    // thread_ct * kDupstoreBlockSize * 2 (*2 is because, when we find the
    // first duplicate of a string, we need to add two new entries instead of
    // just one).
    // htable_dup_base grows from the bottom of the arena, and during the main
    // loop, extra_alloc is the next htable_dup_base[] array index that'll be
    // written to (i.e. it's always an even number). So if extra_alloc plus
    // twice the aforementioned limit isn't larger than cur_arena_left /
    // sizeof(int32_t), we can't run out of arena space.
    uint32_t extra_alloc_stop;
#ifdef __LP64__
    if (cur_arena_left >= 0x400000000LLU + item_idx_stop * 4 * sizeof(int32_t)) {
      // this can never be hit
      extra_alloc_stop = 0xfffffffe;
    } else {
#endif
      if (unlikely(cur_arena_left < item_idx_stop * 4 * sizeof(int32_t))) {
        goto PopulateIdHtableMt_ret_NOMEM;
      }
      extra_alloc_stop = (cur_arena_left / sizeof(int32_t)) - item_idx_stop * 4;
#ifdef __LP64__
    }
#endif

    ctx.subset_mask = subset_mask;
    ctx.item_ids = item_ids;
    ctx.item_ct = item_ct;
    ctx.id_htable_size = id_htable_size;
    ctx.id_htable = id_htable;
    SetThreadFuncAndData(DupstoreHtableMakerThread, &ctx, &tg);
    if (unlikely(SpawnThreads(&tg))) {
      goto PopulateIdHtableMt_ret_THREAD_CREATE_FAIL;
    }

    // Workers initialize the first thread_ct/(thread_ct+1) of the table (see
    // DupstoreHtableMakerThread); parent covers the remainder here.
    const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, thread_ct)) / (thread_ct + 1), kInt32PerCacheline);
    SetAllU32Arr(id_htable_size - fill_start, &(id_htable[fill_start]));

    const uint32_t item_uidx_start = AdvTo1Bit(subset_mask, 0);
    JoinThreads(&tg);
    ctx.item_uidx_start[0] = item_uidx_start;
    if (item_idx_stop >= item_ct) {
      DeclareLastThreadBlock(&tg);
      item_idx_stop = item_ct;
    }
    SpawnThreads(&tg);
    uint32_t items_left = item_ct;
    uint32_t* htable_dup_base = R_CAST(uint32_t*, arena_bottom_mark);
    uint32_t extra_alloc = 0;
    uint32_t prev_llidx = 0;
    uintptr_t cur_bits;
    uintptr_t item_uidx_base;
    BitIter1Start(subset_mask, item_uidx_start, &item_uidx_base, &cur_bits);
    uint32_t parity = 0;
    do {
      // Pipeline: wait for the workers' finished hash block, immediately set
      // them to work on the next block (unless done), then insert the
      // finished block's items into the table ourselves.
      JoinThreads(&tg);
      if (extra_alloc > extra_alloc_stop) {
        goto PopulateIdHtableMt_ret_NOMEM;
      }
      if (item_idx_stop < items_left) {
        if (item_idx_stop * 2 >= items_left) {
          DeclareLastThreadBlock(&tg);
        }
        SpawnThreads(&tg);
      } else {
        item_idx_stop = items_left;
      }
      uint32_t* hashes = ctx.hashes[parity];
      for (uint32_t item_idx = 0; item_idx != item_idx_stop; ++item_idx) {
        const uintptr_t item_uidx = BitIter1(subset_mask, &item_uidx_base, &cur_bits);
        const uint32_t hashval_base = hashes[item_idx];
        uint32_t old_htable_entry = id_htable[hashval_base];
        if (old_htable_entry == UINT32_MAX) {
          id_htable[hashval_base] = item_uidx;
          continue;
        }
        const char* sptr = item_ids[item_uidx];
        // Linear probing.  A table entry with the high bit set refers to a
        // htable_dup_base[] linked list instead of a single item; note that
        // (entry * 2) in uint32 arithmetic discards the flag bit and yields
        // the list head's array index.
        for (uint32_t hashval = hashval_base; ; ) {
          const uint32_t was_dup = old_htable_entry >> 31;
          uint32_t prev_uidx;
          if (was_dup) {
            prev_llidx = old_htable_entry * 2;
            prev_uidx = htable_dup_base[prev_llidx];
          } else {
            prev_uidx = old_htable_entry;
          }
          if (strequal_overread(sptr, item_ids[prev_uidx])) {
            // point to linked list entry instead
            if (!was_dup) {
              // first duplicate of this ID: start a new list containing the
              // original entry
              htable_dup_base[extra_alloc] = old_htable_entry;
              htable_dup_base[extra_alloc + 1] = UINT32_MAX;  // list terminator
              prev_llidx = extra_alloc;
              extra_alloc += 2;
            }
            htable_dup_base[extra_alloc] = item_uidx;
            htable_dup_base[extra_alloc + 1] = prev_llidx;
            id_htable[hashval] = 0x80000000U | (extra_alloc >> 1);
            extra_alloc += 2;
            break;  // bugfix
          }
          if (++hashval == id_htable_size) {
            hashval = 0;
          }
          // duplicate this code since we want to avoid the item_ids[item_uidx]
          // read when possible
          old_htable_entry = id_htable[hashval];
          if (old_htable_entry == UINT32_MAX) {
            id_htable[hashval] = item_uidx;
            break;
          }
        }
      }
      items_left -= item_idx_stop;
      parity = 1 - parity;
    } while (items_left);
    // bugfix (15 Oct 2019): no threads to join here!
    if (extra_alloc) {
      // bugfix: forgot to align this
      *arena_bottom_ptr += RoundUpPow2(extra_alloc * sizeof(int32_t), kCacheline);
      if (dup_ct_ptr) {
        *dup_ct_ptr = extra_alloc / 2;
      }
    }
  }
  while (0) {
  PopulateIdHtableMt_ret_NOMEM:
    *arena_bottom_ptr = arena_bottom_mark;
    reterr = kPglRetNomem;
    break;
  PopulateIdHtableMt_ret_THREAD_CREATE_FAIL:
    // not currently possible for this to happen after *arena_bottom_ptr moved
    reterr = kPglRetThreadCreateFail;
    break;
  }
  CleanupThreads(&tg);
  return reterr;
}
2832
// Similar to DupflagHtableMaker, but we don't need to flag duplicates, we just
// "error out" when we find one.
typedef struct IdUniquenessCheckerStruct {
  // Read-only during the fill phase:
  const uintptr_t* subset_mask;
  const char* const* item_ids;
  uint32_t item_ct;
  uint32_t id_htable_size;
  // item_uidx_starts[tidx] = raw index of thread tidx's first item.
  uint32_t item_uidx_starts[kMaxDupflagThreads];

  // Open-addressing table; written concurrently via compare-and-swap.
  uint32_t* id_htable;

  // Set by any thread (without synchronization) on finding a duplicate;
  // only ever transitions 0 -> 1.
  uint32_t dup_found;
} IdUniquenessChecker;
2846
// Inserts this thread's slice of items into the shared open-addressing
// table, setting ctx->dup_found and returning as soon as a duplicate ID is
// seen.  Also called directly by the parent thread with
// tidx == thread_ct - 1.
void IdUniquenessCheckerMain(uint32_t tidx, uint32_t thread_ct, IdUniquenessChecker* ctx) {
  const uint32_t id_htable_size = ctx->id_htable_size;
  const uintptr_t* subset_mask = ctx->subset_mask;
  const char* const* item_ids = ctx->item_ids;
  const uint32_t item_ct = ctx->item_ct;
  const uint32_t item_uidx_start = ctx->item_uidx_starts[tidx];
  const uint32_t item_idx_start = (item_ct * S_CAST(uint64_t, tidx)) / thread_ct;
  const uint32_t item_idx_end = (item_ct * (S_CAST(uint64_t, tidx) + 1)) / thread_ct;
  uint32_t* id_htable = ctx->id_htable;

  uintptr_t cur_bits;
  uintptr_t item_uidx_base;
  BitIter1Start(subset_mask, item_uidx_start, &item_uidx_base, &cur_bits);
  // Work in 65536-item batches so another thread's dup_found flag is noticed
  // reasonably promptly without polling it on every item.
  for (uint32_t item_idx = item_idx_start; item_idx != item_idx_end; ) {
    const uint32_t item_idx_cur_end = MINV(item_idx_end, item_idx + 65536);
    for (; item_idx != item_idx_cur_end; ++item_idx) {
      const uintptr_t item_uidx = BitIter1(subset_mask, &item_uidx_base, &cur_bits);
      const char* sptr = item_ids[item_uidx];
      const uint32_t slen = strlen(sptr);
      // linear-probing insert
      for (uint32_t hashval = Hashceil(sptr, slen, id_htable_size); ; ) {
        uint32_t old_htable_entry = id_htable[hashval];
        if (old_htable_entry == UINT32_MAX) {
          // Empty slot: try to claim it.  On CAS failure, old_htable_entry
          // is refreshed with the winning thread's value and we fall
          // through to the comparison below.
          if (ATOMIC_COMPARE_EXCHANGE_N_U32(&(id_htable[hashval]), &old_htable_entry, item_uidx, 0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
            break;
          }
        }
        if (strequal_overread(sptr, item_ids[old_htable_entry & 0x7fffffff])) {
          // no synchronization needed since this variable can't change in any
          // other way
          ctx->dup_found = 1;
          return;
        }
        if (++hashval == id_htable_size) {
          hashval = 0;
        }
      }
    }
    if (ctx->dup_found) {
      return;
    }
  }
}
2889
THREAD_FUNC_DECL IdUniquenessCheckerThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uint32_t tidx = arg->tidx;
  IdUniquenessChecker* ctx = S_CAST(IdUniquenessChecker*, arg->sharedp->context);

  // 1. Initialize this thread's cacheline-aligned slice of id_htable with
  //    1-bits, in parallel.  (thread_ct is GetThreadCt() + 1 because the
  //    parent thread fills the final slice itself; see CheckIdUniqueness.)
  const uint32_t id_htable_size = ctx->id_htable_size;
  const uint32_t thread_ct = GetThreadCt(arg->sharedp) + 1;
  uint32_t* id_htable = ctx->id_htable;
  const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, tidx)) / thread_ct, kInt32PerCacheline);
  const uint32_t fill_end = RoundDownPow2((id_htable_size * (S_CAST(uint64_t, tidx) + 1)) / thread_ct, kInt32PerCacheline);
  SetAllU32Arr(fill_end - fill_start, &(id_htable[fill_start]));

  // 2. sync.Once
  if (THREAD_BLOCK_FINISH(arg)) {
    THREAD_RETURN;
  }

  // 3. Fill hash table in parallel, and then return.
  IdUniquenessCheckerMain(tidx, thread_ct, ctx);
  THREAD_RETURN;
}
2912
// Sets *dup_found_ptr to nonzero iff any subset_mask-selected item ID occurs
// more than once.  Builds a temporary hash table in
// [arena_bottom, arena_top); no allocation outlives the call.  The calling
// thread doubles as the last worker.
PglErr CheckIdUniqueness(unsigned char* arena_bottom, unsigned char* arena_top, const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t max_thread_ct, uint32_t* dup_found_ptr) {
  PglErr reterr = kPglRetSuccess;
  ThreadGroup tg;
  PreinitThreads(&tg);
  IdUniquenessChecker ctx;
  {
    uint32_t thread_ct = item_ct / 65536;
    if (!thread_ct) {
      thread_ct = 1;
    } else {
      if (thread_ct > max_thread_ct) {
        thread_ct = max_thread_ct;
      }
      if (thread_ct > kMaxDupflagThreads) {
        thread_ct = kMaxDupflagThreads;
      }
    }
    if (unlikely(SetThreadCt0(thread_ct - 1, &tg))) {
      goto CheckIdUniqueness_ret_NOMEM;
    }

    ctx.subset_mask = subset_mask;
    ctx.item_ids = item_ids;
    ctx.item_ct = item_ct;
    uint32_t id_htable_size = GetHtableFastSize(item_ct);
    {
      // Shrink the table if the arena can't hold the "fast" size, as long as
      // we stay at or above the minimum workable size.
      uintptr_t wkspace_left = S_CAST(uintptr_t, arena_top - arena_bottom);
      if (wkspace_left < id_htable_size * sizeof(int32_t)) {
        id_htable_size = wkspace_left / sizeof(int32_t);
        const uint32_t min_htable_size = GetHtableMinSize(item_ct);
        if (id_htable_size < min_htable_size) {
          goto CheckIdUniqueness_ret_NOMEM;
        }
      }
    }
    ctx.id_htable_size = id_htable_size;
    uint32_t* id_htable = R_CAST(uint32_t*, arena_bottom);
    ctx.id_htable = id_htable;
    ctx.dup_found = 0;

    // Precompute each worker's starting raw index so it can use cheap
    // forward bit-iteration from there.
    uint32_t item_uidx = AdvTo1Bit(subset_mask, 0);
    uint32_t item_idx = 0;
    ctx.item_uidx_starts[0] = item_uidx;
    for (uintptr_t tidx = 1; tidx != thread_ct; ++tidx) {
      const uint32_t item_idx_new = (item_ct * S_CAST(uint64_t, tidx)) / thread_ct;
      item_uidx = FindNth1BitFrom(subset_mask, item_uidx + 1, item_idx_new - item_idx);
      ctx.item_uidx_starts[tidx] = item_uidx;
      item_idx = item_idx_new;
    }

    if (thread_ct > 1) {
      SetThreadFuncAndData(IdUniquenessCheckerThread, &ctx, &tg);
      if (unlikely(SpawnThreads(&tg))) {
        goto CheckIdUniqueness_ret_THREAD_CREATE_FAIL;
      }
    }
    // Parent initializes the last slice of the table, then participates in
    // the fill phase as worker #(thread_ct - 1).
    const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, thread_ct - 1)) / thread_ct, kInt32PerCacheline);
    SetAllU32Arr(id_htable_size - fill_start, &(id_htable[fill_start]));
    if (thread_ct > 1) {
      JoinThreads(&tg);
      DeclareLastThreadBlock(&tg);
      SpawnThreads(&tg);
    }
    IdUniquenessCheckerMain(thread_ct - 1, thread_ct, &ctx);
    JoinThreads0(&tg);
    *dup_found_ptr = ctx.dup_found;
  }
  while (0) {
  CheckIdUniqueness_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  CheckIdUniqueness_ret_THREAD_CREATE_FAIL:
    reterr = kPglRetThreadCreateFail;
    break;
  }
  CleanupThreads(&tg);
  return reterr;
}
2991
// Sizes and allocates the ID hash table on the bigstack (shrinking from the
// "fast" size when memory is tight), then delegates the fill to
// PopulateIdHtableMt.  When htable_dup_base_ptr != nullptr, all duplicates
// are stored (in linked lists past the table's end) rather than just
// flagged.
PglErr AllocAndPopulateIdHtableMt(const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uintptr_t fast_size_min_extra_bytes, uint32_t max_thread_ct, uint32_t** id_htable_ptr, uint32_t** htable_dup_base_ptr, uint32_t* id_htable_size_ptr, uint32_t* dup_ct_ptr) {
  uint32_t id_htable_size = GetHtableFastSize(item_ct);
  // if store_all_dups, up to 8 bytes per variant in extra_alloc for duplicate
  // tracking, as well as a small amount of per-thread temporary workspace
  const uint32_t store_all_dups = (htable_dup_base_ptr != nullptr);
  uintptr_t nonhtable_alloc = 0;
  if (store_all_dups) {
    const uint32_t actual_thread_ct = PopulateIdHtableMtDupstoreThreadCt(max_thread_ct, item_ct);
    // "2 *" in front of second term is temporary
    nonhtable_alloc = actual_thread_ct * kDupstoreThreadWkspace + 2 * RoundUpPow2(item_ct * 2 * sizeof(int32_t), kCacheline);
  }
  uintptr_t max_bytes = RoundDownPow2(bigstack_left(), kCacheline);
  // force max_bytes >= 5 so leqprime() doesn't fail
  // (not actually relevant any more, but whatever)
  if (nonhtable_alloc + (item_ct + 6) * sizeof(int32_t) > max_bytes) {
    return kPglRetNomem;
  }
  max_bytes -= nonhtable_alloc;
  if (id_htable_size * sizeof(int32_t) + fast_size_min_extra_bytes > max_bytes) {
    // Fast size doesn't fit; fall back to the largest table the remaining
    // bigstack can hold, but no smaller than the minimum workable size.
    id_htable_size = max_bytes / sizeof(int32_t);
    // id_htable_size = leqprime((max_bytes / sizeof(int32_t)) - 1);
    const uint32_t min_htable_size = GetHtableMinSize(item_ct);
    if (id_htable_size < min_htable_size) {
      id_htable_size = min_htable_size;
    }
  }
  *id_htable_ptr = S_CAST(uint32_t*, bigstack_alloc_raw_rd(id_htable_size * sizeof(int32_t)));
  if (store_all_dups) {
    // duplicate-tracking area starts at the next cacheline boundary past the
    // table proper
    *htable_dup_base_ptr = &((*id_htable_ptr)[RoundUpPow2(id_htable_size, kInt32PerCacheline)]);
  }
  *id_htable_size_ptr = id_htable_size;
  return PopulateIdHtableMt(g_bigstack_end, subset_mask, item_ids, item_ct, store_all_dups, id_htable_size, max_thread_ct, &g_bigstack_base, *id_htable_ptr, dup_ct_ptr);
}
3025
3026
Edit1Match(const char * s1,const char * s2,uint32_t len1,uint32_t len2)3027 uint32_t Edit1Match(const char* s1, const char* s2, uint32_t len1, uint32_t len2) {
3028 // Permit one difference of the following forms (Damerau-Levenshtein distance
3029 // 1):
3030 // - inserted/deleted character
3031 // - replaced character
3032 // - adjacent pair of swapped characters
3033 // Returns 1 on approximate match, 0 on nonmatch.
3034 // With only one difference allowed, it is unnecessary to use a
3035 // fully-powered quadratic-time DP algorithm; instead, given the lengths of
3036 // the two strings, we select the appropriate linear-time loop.
3037 if (abs_i32(len1 - len2) > 1) {
3038 return 0;
3039 }
3040 uintptr_t mismatch_pos = FirstUnequal(s1, s2, len1);
3041 if (mismatch_pos == len1) {
3042 // bugfix (5 Nov 2019): this was backwards...
3043 return 1;
3044 }
3045 if (len1 != len2) {
3046 // At least one mismatch guaranteed.
3047 if (len1 < len2) {
3048 return memequal(&(s1[mismatch_pos]), &(s2[mismatch_pos + 1]), len1 - mismatch_pos);
3049 }
3050 return memequal(&(s1[mismatch_pos + 1]), &(s2[mismatch_pos]), len2 - mismatch_pos);
3051 }
3052 // Strings are equal length, and we have at least one mismatch.
3053 // Two ways to still have an approximate match:
3054 // 1. Remainder of strings are equal.
3055 // 2. Next character also differs, but due to transposition. Remainder of
3056 // strings are equal.
3057 if (s1[mismatch_pos + 1] != s2[mismatch_pos + 1]) {
3058 if ((s1[mismatch_pos] != s2[mismatch_pos + 1]) || (s1[mismatch_pos + 1] != s2[mismatch_pos])) {
3059 return 0;
3060 }
3061 ++mismatch_pos;
3062 }
3063 const uintptr_t post_mismatch_pos = mismatch_pos + 1;
3064 return memequal(&(s1[post_mismatch_pos]), &(s2[post_mismatch_pos]), len1 - post_mismatch_pos);
3065 }
3066
// Maximum number of NUL-separated index terms in a HelpPrint() cur_params
// string.
CONSTI32(kMaxEqualHelpParams, 64);
3068
// Prints one section of help text if it is selected by the user's "--help"
// arguments (or unconditionally, when --help had no arguments).
// - cur_params: null-delimited, empty-string-terminated list of index terms
//   for this section.
// - postprint_newline: whether a blank line should follow this section if it
//   is printed (stored for the next HelpPrint call to act on).
// - payload: raw help text; content lines carry a two-column indent and end
//   in '\n', and the text may end with an extra blank line.
// When --help has arguments, the caller runs the full sequence of HelpPrint
// calls three times, with help_ctrl_ptr->iters_left = 2, 1, 0:
//   iters_left == 2: record exact matches between user arguments and index
//     terms (nothing is printed);
//   iters_left == 1: record prefix matches for still-unmatched arguments;
//   iters_left == 0: print the sections selected above, with a
//     Damerau-Levenshtein distance-1 fallback for arguments that matched
//     nothing anywhere.
void HelpPrint(const char* cur_params, HelpCtrl* help_ctrl_ptr, uint32_t postprint_newline, const char* payload) {
  if (help_ctrl_ptr->param_ct) {
    const char* params_iter = cur_params;
    uint32_t cur_param_ct = 0;
    const char* cur_param_starts[kMaxEqualHelpParams];
    uint32_t cur_param_slens[kMaxEqualHelpParams];
    // Tokenize the null-delimited index-term list into start/length arrays.
    // (Caller must not supply more than kMaxEqualHelpParams terms.)
    while (*params_iter) {
      cur_param_starts[cur_param_ct] = params_iter;
      const uint32_t cur_slen = strlen(params_iter);
      cur_param_slens[cur_param_ct++] = cur_slen;
      params_iter = &(params_iter[cur_slen + 1]);
    }
    if (help_ctrl_ptr->iters_left) {
      // Match-recording passes; no output.
      const uint32_t orig_unmatched_ct = help_ctrl_ptr->unmatched_ct;
      if (orig_unmatched_ct) {
        // arg_uidx walks only the not-yet-matched user arguments, via the
        // zero bits of all_match_arr.
        uint32_t arg_uidx = 0;
        if (help_ctrl_ptr->iters_left == 2) {
          // First pass: only exact matches.
          for (uint32_t arg_idx = 0; arg_idx != orig_unmatched_ct; ++arg_idx, ++arg_uidx) {
            arg_uidx = AdvTo0Bit(help_ctrl_ptr->all_match_arr, arg_uidx);
            const char* cur_arg = help_ctrl_ptr->argv[arg_uidx];
            const uint32_t cur_arg_slen = help_ctrl_ptr->param_slens[arg_uidx];
            for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
              if ((cur_arg_slen == cur_param_slens[cur_param_idx]) && memequal(cur_arg, cur_param_starts[cur_param_idx], cur_arg_slen)) {
                // An exact match is also recorded as a prefix match and an
                // overall match.
                SetBit(arg_uidx, help_ctrl_ptr->perfect_match_arr);
                SetBit(arg_uidx, help_ctrl_ptr->prefix_match_arr);
                SetBit(arg_uidx, help_ctrl_ptr->all_match_arr);
                help_ctrl_ptr->unmatched_ct -= 1;
                break;
              }
            }
          }
        } else {
          // Second pass: Prefix matches.
          for (uint32_t arg_idx = 0; arg_idx != orig_unmatched_ct; ++arg_idx, ++arg_uidx) {
            arg_uidx = AdvTo0Bit(help_ctrl_ptr->all_match_arr, arg_uidx);
            const char* cur_arg = help_ctrl_ptr->argv[arg_uidx];
            const uint32_t cur_arg_slen = help_ctrl_ptr->param_slens[arg_uidx];
            for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
              // Strict prefix: index term must be longer than the argument.
              if (cur_param_slens[cur_param_idx] > cur_arg_slen) {
                if (memequal(cur_arg, cur_param_starts[cur_param_idx], cur_arg_slen)) {
                  SetBit(arg_uidx, help_ctrl_ptr->prefix_match_arr);
                  SetBit(arg_uidx, help_ctrl_ptr->all_match_arr);
                  help_ctrl_ptr->unmatched_ct -= 1;
                  break;
                }
              }
            }
          }
        }
      }
    } else {
      // Final pass: decide whether this section should be printed.
      uint32_t print_this = 0;
      for (uint32_t arg_uidx = 0; arg_uidx != help_ctrl_ptr->param_ct; ++arg_uidx) {
        if (IsSet(help_ctrl_ptr->prefix_match_arr, arg_uidx)) {
          // Current user-provided help argument has at least one prefix
          // match...
          if (!print_this) {
            const char* cur_arg = help_ctrl_ptr->argv[arg_uidx];
            const uint32_t cur_arg_slen = help_ctrl_ptr->param_slens[arg_uidx];
            if (IsSet(help_ctrl_ptr->perfect_match_arr, arg_uidx)) {
              // ...and at least one exact match. So we only care about an
              // exact match.
              for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
                if ((cur_arg_slen == cur_param_slens[cur_param_idx]) && memequal(cur_arg, cur_param_starts[cur_param_idx], cur_arg_slen)) {
                  print_this = 1;
                  break;
                }
              }
            } else {
              // No exact match anywhere, so any prefix match selects the
              // section.
              for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
                if (cur_param_slens[cur_param_idx] > cur_arg_slen) {
                  if (memequal(cur_arg, cur_param_starts[cur_param_idx], cur_arg_slen)) {
                    print_this = 1;
                    break;
                  }
                }
              }
            }
          }
        } else {
          // Current user-provided help argument does not have an exact or
          // prefix match. Print current help text if it's within
          // Damerau-Levenshtein distance 1 of any index term.
          for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
            if (Edit1Match(cur_param_starts[cur_param_idx], help_ctrl_ptr->argv[arg_uidx], cur_param_slens[cur_param_idx], help_ctrl_ptr->param_slens[arg_uidx])) {
              print_this = 1;
              // Fuzzy matches still count toward "every argument matched
              // something", but only once per argument.
              if (!IsSet(help_ctrl_ptr->all_match_arr, arg_uidx)) {
                SetBit(arg_uidx, help_ctrl_ptr->all_match_arr);
                help_ctrl_ptr->unmatched_ct -= 1;
              }
              break;
            }
          }
        }
      }
      if (print_this) {
        const uint32_t payload_slen = strlen(payload);
        const char* payload_end;
        // If the payload ends with a blank line, stop before it; blank-line
        // separation between sections is handled by the
        // preprint/postprint_newline bookkeeping below instead.
        if (payload[payload_slen - 2] == '\n') {
          payload_end = &(payload[payload_slen - 1]);
        } else {
          payload_end = &(payload[payload_slen]);
        }
        if (help_ctrl_ptr->preprint_newline) {
          putc_unlocked('\n', stdout);
        }
        help_ctrl_ptr->preprint_newline = postprint_newline;
        const char* payload_iter = payload;
        do {
          const char* line_end = AdvPastDelim(payload_iter, '\n');
          uint32_t line_slen = S_CAST(uint32_t, line_end - payload_iter);
          // Drop the two-column indent on content lines (lines too short to
          // carry it, i.e. blank lines, are printed as-is).
          if (line_slen > 2) {
            payload_iter = &(payload_iter[2]);
            line_slen -= 2;
          }
          // Copy into g_textbuf so we can null-terminate for fputs.
          memcpyx(g_textbuf, payload_iter, line_slen, '\0');
          fputs(g_textbuf, stdout);
          payload_iter = line_end;
        } while (payload_iter < payload_end);
      }
    }
  } else {
    // Argument-less --help: print everything.
    fputs(payload, stdout);
  }
}
3195
3196
PreinitPlink2CmdlineMeta(Plink2CmdlineMeta * pcmp)3197 void PreinitPlink2CmdlineMeta(Plink2CmdlineMeta* pcmp) {
3198 pcmp->subst_argv = nullptr;
3199 pcmp->script_buf = nullptr;
3200 pcmp->rerun_buf = nullptr;
3201 pcmp->flag_buf = nullptr;
3202 pcmp->flag_map = nullptr;
3203 }
3204
// Shared error-message strings (kErrstrNomem and kErrstrWrite are also used
// later in this file).
const char kErrstrNomem[] = "Error: Out of memory. The --memory flag may be helpful.\n";
// printf-style template; the argument is strerror(errno).
const char kErrstrWrite[] = "Error: File write failure: %s.\n";
const char kErrstrThreadCreate[] = "Error: Failed to create thread.\n";
const char kErrstrVarRecordTooLarge[] = "Error: Variant record size exceeds ~4 GiB limit.\n";
3209
3210 // assumes logfile is open
DispExitMsg(PglErr reterr)3211 void DispExitMsg(PglErr reterr) {
3212 if (reterr) {
3213 if (reterr == kPglRetNomem) {
3214 logputs("\n");
3215 logerrputs(kErrstrNomem);
3216 if (g_failed_alloc_attempt_size) {
3217 logerrprintf("Failed allocation size: %" PRIuPTR "\n", g_failed_alloc_attempt_size);
3218 }
3219 // kPglRetReadFail no longer gets a message here, for the same reason
3220 // kPglRetOpenFail doesn't: it's important to know which file we failed
3221 // to read.
3222 } else if (reterr == kPglRetWriteFail) {
3223 logputs("\n");
3224 if (errno) {
3225 logerrprintfww(kErrstrWrite, strerror(errno));
3226 } else {
3227 // Defensive.
3228 logerrputs("Error: File write failure: Untracked cause.\n");
3229 }
3230 } else if (reterr == kPglRetThreadCreateFail) {
3231 logputs("\n");
3232 logerrputs(kErrstrThreadCreate);
3233 } else if (reterr == kPglRetVarRecordTooLarge) {
3234 logputs("\n");
3235 logerrputs(kErrstrVarRecordTooLarge);
3236 } else if (reterr == kPglRetLongLine) {
3237 logputs("\n");
3238 logerrprintf("Error: Unhandled internal line-too-long message.\n");
3239 } else if (reterr == kPglRetEof) {
3240 logputs("\n");
3241 logerrprintf("Error: Unhandled internal EOF message.\n");
3242 }
3243 }
3244 }
3245
3246 // useful when there's e.g. a filename and an optional modifier, and we want to
3247 // permit either parmeter ordering
CheckExtraParam(const char * const * argvk,const char * permitted_modif,uint32_t * other_idx_ptr)3248 BoolErr CheckExtraParam(const char* const* argvk, const char* permitted_modif, uint32_t* other_idx_ptr) {
3249 const uint32_t idx_base = *other_idx_ptr;
3250 if (!strcmp(argvk[idx_base], permitted_modif)) {
3251 *other_idx_ptr = idx_base + 1;
3252 } else if (strcmp(argvk[idx_base + 1], permitted_modif)) {
3253 logerrprintf("Error: Invalid %s argument sequence.\n", argvk[0]);
3254 return 1;
3255 }
3256 return 0;
3257 }
3258
// Maps c, 'c', and "c" to c, and anything else to the null char.  This is
// intended to support e.g. always using '#' to designate a # argument
// without worrying about differences between shells.
char ExtractCharParam(const char* ss) {
  const char first_char = ss[0];
  const int is_quoted = (first_char == '\'') || (first_char == '"');
  if (is_quoted && ss[1] && (ss[2] == first_char) && (!ss[3])) {
    // Exactly three characters, with matching quote characters on each end.
    return ss[1];
  }
  // Unquoted case: accept any single-character string.
  return (first_char && (!ss[1]))? first_char : '\0';
}
3272
CmdlineAllocString(const char * source,const char * flag_name,uint32_t max_slen,char ** sbuf_ptr)3273 PglErr CmdlineAllocString(const char* source, const char* flag_name, uint32_t max_slen, char** sbuf_ptr) {
3274 const uint32_t slen = strlen(source);
3275 if (slen > max_slen) {
3276 logerrprintf("Error: %s argument too long.\n", flag_name);
3277 return kPglRetInvalidCmdline;
3278 }
3279 const uint32_t blen = slen + 1;
3280 if (pgl_malloc(blen, sbuf_ptr)) {
3281 return kPglRetNomem;
3282 }
3283 memcpy(*sbuf_ptr, source, blen);
3284 return kPglRetSuccess;
3285 }
3286
AllocFname(const char * source,const char * flagname_p,uint32_t extra_size,char ** fnbuf_ptr)3287 PglErr AllocFname(const char* source, const char* flagname_p, uint32_t extra_size, char** fnbuf_ptr) {
3288 const uint32_t blen = strlen(source) + 1;
3289 if (blen > (kPglFnamesize - extra_size)) {
3290 logerrprintf("Error: --%s filename too long.\n", flagname_p);
3291 return kPglRetOpenFail;
3292 }
3293 if (pgl_malloc(blen + extra_size, fnbuf_ptr)) {
3294 return kPglRetNomem;
3295 }
3296 memcpy(*fnbuf_ptr, source, blen);
3297 return kPglRetSuccess;
3298 }
3299
AllocAndFlatten(const char * const * sources,uint32_t param_ct,uint32_t max_blen,char ** flattened_buf_ptr)3300 PglErr AllocAndFlatten(const char* const* sources, uint32_t param_ct, uint32_t max_blen, char** flattened_buf_ptr) {
3301 uintptr_t tot_blen = 1;
3302 for (uint32_t param_idx = 0; param_idx != param_ct; ++param_idx) {
3303 const uint32_t cur_blen = 1 + strlen(sources[param_idx]);
3304 if (cur_blen > max_blen) {
3305 return kPglRetInvalidCmdline;
3306 }
3307 tot_blen += cur_blen;
3308 }
3309 char* buf_iter;
3310 if (pgl_malloc(tot_blen, &buf_iter)) {
3311 return kPglRetNomem;
3312 }
3313 *flattened_buf_ptr = buf_iter;
3314 for (uint32_t param_idx = 0; param_idx != param_ct; ++param_idx) {
3315 buf_iter = strcpyax(buf_iter, sources[param_idx], '\0');
3316 }
3317 *buf_iter = '\0';
3318 return kPglRetSuccess;
3319 }
3320
// Implements --rerun: reads the "Options in effect:" section of a previous
// .log file and splices those flags into the current command line.  Flags
// also present on the current command line override the identically-named
// flags from the log.  On success, *argc_ptr / *first_arg_idx_ptr /
// *argv_ptr are updated to describe the merged argument list; the spliced
// argument strings live in *rerun_buf_ptr, and the new argv array replaces
// *subst_argv_ptr (freeing any previous one).
PglErr Rerun(const char* ver_str, const char* ver_str2, const char* prog_name_str, uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t* argc_ptr, uint32_t* first_arg_idx_ptr, char*** argv_ptr, char*** subst_argv_ptr, char** rerun_buf_ptr) {
  // caller is responsible for freeing rerun_buf

  // ok, requiring zlib/zstd here is totally not worth it
  FILE* rerunfile = nullptr;

  char** subst_argv2 = nullptr;
  char** argv = *argv_ptr;
  // When no filename argument was given, the default "<prog_name>.log" name
  // is assembled in g_textbuf below.
  char* rerun_fname = rerun_parameter_present? argv[rerun_argv_pos + 1] : g_textbuf;
  uintptr_t line_idx = 1;
  PglErr reterr = kPglRetSuccess;
  {
    if (!rerun_parameter_present) {
      char* write_iter = strcpya(rerun_fname, prog_name_str);
      snprintf(write_iter, kMaxOutfnameExtBlen, ".log");
    }
    rerunfile = fopen(rerun_fname, FOPEN_RB);
    if (unlikely(!rerunfile)) {
      goto Rerun_ret_OPEN_FAIL;
    }
    char* textbuf = g_textbuf;
    // Sentinel trick: if fgets() fills the whole buffer, the last byte is
    // overwritten with '\0', revealing a pathologically long line.
    textbuf[kMaxMediumLine - 1] = ' ';
    if (unlikely(!fgets(textbuf, kMaxMediumLine, rerunfile))) {
      fputs(ver_str, stdout);
      fputs(ver_str2, stdout);
      fputs("Error: Empty log file for --rerun.\n", stderr);
      goto Rerun_ret_MALFORMED_INPUT;
    }
    if (unlikely(!textbuf[kMaxMediumLine - 1])) {
      goto Rerun_ret_LONG_LINE;
    }
    if (unlikely(!fgets(textbuf, kMaxMediumLine, rerunfile))) {
      fputs(ver_str, stdout);
      fputs(ver_str2, stdout);
      fputs("Error: Only one line in --rerun log file.\n", stderr);
      goto Rerun_ret_MALFORMED_INPUT;
    }
    ++line_idx;
    if (unlikely(!textbuf[kMaxMediumLine - 1])) {
      goto Rerun_ret_LONG_LINE;
    }
    // don't bother supporting "xx arguments: --aa bb --cc --dd" format
    // Scan forward to the exact "Options in effect:" header line.
    while ((!StrStartsWithUnsafe(textbuf, "Options in effect:")) || (textbuf[strlen("Options in effect:")] >= ' ')) {
      ++line_idx;
      if (unlikely(!fgets(textbuf, kMaxMediumLine, rerunfile))) {
        fputs(ver_str, stdout);
        fputs(ver_str2, stdout);
        fputs("Error: Invalid log file for --rerun.\n", stderr);
        goto Rerun_ret_MALFORMED_INPUT;
      }
    }
    char* all_args_write_iter = textbuf;
    char* textbuf_limit = &(textbuf[kMaxMediumLine]);
    uint32_t loaded_arg_ct = 0;
    // We load each of the option lines in sequence into textbuf, always
    // overwriting the previous line's newline. (Note that textbuf[] has
    // size > 2 * kMaxMediumLine; this lets us avoid additional
    // dynamic memory allocation as long as we impose the constraint that all
    // lines combined add up to less than kMaxMediumLine.)
    while (1) {
      all_args_write_iter[kMaxMediumLine - 1] = ' ';
      if (!fgets(all_args_write_iter, kMaxMediumLine, rerunfile)) {
        break;
      }
      ++line_idx;
      if (unlikely(!all_args_write_iter[kMaxMediumLine - 1])) {
        goto Rerun_ret_LONG_LINE;
      }
      char* arg_iter = FirstNonTspace(all_args_write_iter);
      if (IsEolnKns(*arg_iter)) {
        // Blank line terminates the options block.
        *all_args_write_iter = '\0';
        break;
      }
      char* token_end;
      // Count the whitespace-delimited tokens on this line.
      do {
        token_end = CurTokenEnd(arg_iter);
        ++loaded_arg_ct;
        arg_iter = FirstNonTspace(token_end);
      } while (!IsEolnKns(*arg_iter));
      all_args_write_iter = token_end;
      if (unlikely(all_args_write_iter >= textbuf_limit)) {
        fputs(ver_str, stdout);
        fputs(ver_str2, stdout);
        fputs("Error: --rerun argument sequence too long.\n", stderr);
        goto Rerun_ret_MALFORMED_INPUT;
      }
    }
    fclose_null(&rerunfile);
    const uint32_t line_byte_ct = 1 + S_CAST(uintptr_t, all_args_write_iter - textbuf);
    char* rerun_buf;
    if (unlikely(pgl_malloc(line_byte_ct, &rerun_buf))) {
      goto Rerun_ret_NOMEM;
    }
    *rerun_buf_ptr = rerun_buf;
    // Keep the concatenated option text in rerun_buf; textbuf can then be
    // reused below.
    memcpy(rerun_buf, textbuf, line_byte_ct);
    const uint32_t argc = *argc_ptr;
    const uint32_t first_arg_idx = *first_arg_idx_ptr;
    char* rerun_first_token = FirstNonTspace(rerun_buf);
    const char* arg_iter = rerun_first_token;
    // now use textbuf as a lame bitfield
    // (textbuf[i] == 1 means "keep loaded arg i"; overridden args get 0.)
    memset(textbuf, 1, loaded_arg_ct);
    uint32_t loaded_arg_idx = 0;
    uint32_t duplicate_arg_ct = 0;
    do {
      if (unlikely(NoMoreTokensKns(arg_iter))) {
        fputs(ver_str, stdout);
        fputs(ver_str2, stdout);
        fputs("Error: Line 2 of --rerun log file has fewer tokens than expected.\n", stderr);
        goto Rerun_ret_MALFORMED_INPUT;
      }
      const char* flagname_p = IsCmdlineFlagStart(arg_iter);
      if (flagname_p) {
        const uint32_t slen = strlen_se(flagname_p);
        // Check whether the same flag also appears on the current command
        // line.
        uint32_t cmdline_arg_idx = first_arg_idx;
        for (; cmdline_arg_idx != argc; ++cmdline_arg_idx) {
          const char* later_flagname_p = IsCmdlineFlagStart(argv[cmdline_arg_idx]);
          if (later_flagname_p) {
            const uint32_t slen2 = strlen(later_flagname_p);
            if ((slen == slen2) && memequal(flagname_p, later_flagname_p, slen)) {
              cmdline_arg_idx = UINT32_MAX;
              break;
            }
          }
        }
        if (cmdline_arg_idx == UINT32_MAX) {
          // matching flag, override --rerun
          // Discard this loaded flag and all of its (non-flag) arguments.
          do {
            ++duplicate_arg_ct;
            textbuf[loaded_arg_idx++] = 0;
            if (loaded_arg_idx == loaded_arg_ct) {
              break;
            }
            arg_iter = NextToken(arg_iter);
          } while (!IsCmdlineFlag(arg_iter));
        } else {
          ++loaded_arg_idx;
          arg_iter = NextToken(arg_iter);
        }
      } else {
        ++loaded_arg_idx;
        arg_iter = NextToken(arg_iter);
      }
    } while (loaded_arg_idx < loaded_arg_ct);
    // Final argv size = (current args minus the --rerun flag and its
    // optional filename) + (loaded args minus the overridden duplicates).
    if (unlikely(pgl_malloc((argc + loaded_arg_ct - duplicate_arg_ct - rerun_parameter_present - 1 - first_arg_idx) * sizeof(intptr_t), &subst_argv2))) {
      goto Rerun_ret_NOMEM;
    }
    // Current args preceding --rerun come first...
    uint32_t new_arg_idx = rerun_argv_pos - first_arg_idx;
    memcpy(subst_argv2, &(argv[first_arg_idx]), new_arg_idx * sizeof(intptr_t));
    // ...then the kept loaded args (null-terminated in place)...
    char* arg_nullterminate_iter = rerun_first_token;
    for (loaded_arg_idx = 0; loaded_arg_idx != loaded_arg_ct; ++loaded_arg_idx) {
      arg_nullterminate_iter = FirstNonTspace(arg_nullterminate_iter);
      char* token_end = CurTokenEnd(arg_nullterminate_iter);
      if (textbuf[loaded_arg_idx]) {
        subst_argv2[new_arg_idx++] = arg_nullterminate_iter;
        *token_end = '\0';
      }
      arg_nullterminate_iter = &(token_end[1]);
    }
    // ...then the current args following --rerun (and its filename, if any).
    const uint32_t final_copy_start_idx = rerun_argv_pos + rerun_parameter_present + 1;
    memcpy(&(subst_argv2[new_arg_idx]), &(argv[final_copy_start_idx]), (argc - final_copy_start_idx) * sizeof(intptr_t));
    *first_arg_idx_ptr = 0;
    *argc_ptr = new_arg_idx + argc - final_copy_start_idx;
    if (*subst_argv_ptr) {
      free(*subst_argv_ptr);
    }
    *subst_argv_ptr = subst_argv2;
    *argv_ptr = subst_argv2;
    subst_argv2 = nullptr;
  }
  while (0) {
  Rerun_ret_NOMEM:
    fputs(ver_str, stdout);
    fputs(ver_str2, stdout);
    reterr = kPglRetNomem;
    break;
  Rerun_ret_OPEN_FAIL:
    fputs(ver_str, stdout);
    fputs(ver_str2, stdout);
    fprintf(stderr, kErrprintfFopen, rerun_fname, strerror(errno));
    reterr = kPglRetOpenFail;
    break;
  Rerun_ret_LONG_LINE:
    fputs(ver_str, stdout);
    fputs(ver_str2, stdout);
    fprintf(stderr, "Error: Line %" PRIuPTR " of --rerun log file is pathologically long.\n", line_idx);
  Rerun_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  }
  free_cond(subst_argv2);
  fclose_cond(rerunfile);
  return reterr;
}
3514
3515
3516 // Handles --script, --rerun, --help, --version, and --silent.
3517 // subst_argv, script_buf, and rerun_buf must be initialized to nullptr.
CmdlineParsePhase1(const char * ver_str,const char * ver_str2,const char * prog_name_str,const char * notestr_null_calc2,const char * cmdline_format_str,const char * errstr_append,uint32_t max_flag_blen,PglErr (* disp_help_fn)(const char * const *,uint32_t),int * argc_ptr,char *** argv_ptr,Plink2CmdlineMeta * pcmp,uint32_t * first_arg_idx_ptr,uint32_t * flag_ct_ptr)3518 PglErr CmdlineParsePhase1(const char* ver_str, const char* ver_str2, const char* prog_name_str, const char* notestr_null_calc2, const char* cmdline_format_str, const char* errstr_append, uint32_t max_flag_blen, PglErr(* disp_help_fn)(const char* const*, uint32_t), int* argc_ptr, char*** argv_ptr, Plink2CmdlineMeta* pcmp, uint32_t* first_arg_idx_ptr, uint32_t* flag_ct_ptr) {
3519 FILE* scriptfile = nullptr;
3520 PglErr reterr = kPglRetSuccess;
3521 {
3522 int argc = *argc_ptr;
3523 const char* const* argvk = TO_CONSTCPCONSTP(*argv_ptr);
3524 char** subst_argv = nullptr;
3525 uint32_t first_arg_idx = 1;
3526 for (uint32_t arg_idx = 1; arg_idx != S_CAST(uint32_t, argc); ++arg_idx) {
3527 if ((!strcmp("-script", argvk[arg_idx])) || (!strcmp("--script", argvk[arg_idx]))) {
3528 const uint32_t param_ct = GetParamCt(argvk, argc, arg_idx);
3529 if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
3530 fputs(ver_str, stdout);
3531 fputs(ver_str2, stdout);
3532 fputs(g_logbuf, stderr);
3533 fputs(errstr_append, stderr);
3534 goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3535 }
3536 for (uint32_t arg_idx2 = arg_idx + 2; arg_idx2 != S_CAST(uint32_t, argc); ++arg_idx2) {
3537 if (unlikely((!strcmp("-script", argvk[arg_idx2])) || (!strcmp("--script", argvk[arg_idx2])))) {
3538 fputs(ver_str, stdout);
3539 fputs(ver_str2, stdout);
3540 fputs("Error: Multiple --script flags. Merge the files into one.\n", stderr);
3541 fputs(errstr_append, stderr);
3542 goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3543 }
3544 }
3545 // logging not yet active, so don't use fopen_checked()
3546 scriptfile = fopen(argvk[arg_idx + 1], FOPEN_RB);
3547 if (unlikely(!scriptfile)) {
3548 fputs(ver_str, stdout);
3549 fputs(ver_str2, stdout);
3550 fprintf(stderr, kErrprintfFopen, argvk[arg_idx + 1], strerror(errno));
3551 goto CmdlineParsePhase1_ret_OPEN_FAIL;
3552 }
3553 if (unlikely(fseeko(scriptfile, 0, SEEK_END))) {
3554 fputs(ver_str, stdout);
3555 fputs(ver_str2, stdout);
3556 fprintf(stderr, kErrprintfFread, argvk[arg_idx + 1], strerror(errno));
3557 goto CmdlineParsePhase1_ret_READ_FAIL;
3558 }
3559 int64_t fsize = ftello(scriptfile);
3560 if (unlikely(fsize < 0)) {
3561 fputs(ver_str, stdout);
3562 fputs(ver_str2, stdout);
3563 fprintf(stderr, kErrprintfFread, argvk[arg_idx + 1], strerror(errno));
3564 goto CmdlineParsePhase1_ret_READ_FAIL;
3565 }
3566 if (unlikely(fsize > 0x7ffffffe)) {
3567 // could actually happen if user enters parameters in the wrong
3568 // order, so may as well catch it and print a somewhat informative
3569 // error message
3570 fputs(ver_str, stdout);
3571 fputs(ver_str2, stdout);
3572 fputs("Error: --script file too large.", stderr);
3573 goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3574 }
3575 rewind(scriptfile);
3576 const uint32_t fsize_ui = fsize;
3577 if (unlikely(pgl_malloc(fsize_ui + 1, &pcmp->script_buf))) {
3578 goto CmdlineParsePhase1_ret_NOMEM;
3579 }
3580 char* script_buf = pcmp->script_buf;
3581 if (unlikely(!fread_unlocked(script_buf, fsize_ui, 1, scriptfile))) {
3582 fputs(ver_str, stdout);
3583 fputs(ver_str2, stdout);
3584 fprintf(stderr, kErrprintfFread, argvk[arg_idx + 1], strerror(errno));
3585 goto CmdlineParsePhase1_ret_READ_FAIL;
3586 }
3587 script_buf[fsize_ui] = '\0';
3588 fclose_null(&scriptfile);
3589 uint32_t num_script_params = 0;
3590 char* script_buf_iter = script_buf;
3591 uint32_t char_code;
3592 do {
3593 uint32_t char_code_m1;
3594 do {
3595 char_code_m1 = ctou32(*script_buf_iter++) - 1;
3596 } while (char_code_m1 < 32);
3597 if (char_code_m1 == UINT32_MAX) {
3598 break;
3599 }
3600 ++num_script_params;
3601 do {
3602 char_code = ctou32(*script_buf_iter++);
3603 } while (char_code > 32);
3604 } while (char_code);
3605 if (unlikely(script_buf_iter != (&(script_buf[fsize_ui + 1])))) {
3606 fputs(ver_str, stdout);
3607 fputs(ver_str2, stdout);
3608 fputs("Error: Null byte in --script file.\n", stderr);
3609 goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3610 }
3611 // probable todo: detect duplicate flags in the same manner as --rerun
3612 const uint32_t new_param_ct = num_script_params + argc - 3;
3613 if (unlikely(pgl_malloc(new_param_ct * sizeof(intptr_t), &subst_argv))) {
3614 goto CmdlineParsePhase1_ret_NOMEM;
3615 }
3616 pcmp->subst_argv = subst_argv;
3617 memcpy(subst_argv, &(argvk[1]), arg_idx * sizeof(intptr_t));
3618 const uint32_t load_param_idx_end = arg_idx + num_script_params;
3619 script_buf_iter = &(script_buf[-1]);
3620 for (uint32_t param_idx = arg_idx; param_idx != load_param_idx_end; ++param_idx) {
3621 while (ctou32(*(++script_buf_iter)) <= 32);
3622 subst_argv[param_idx] = script_buf_iter;
3623 while (ctou32(*(++script_buf_iter)) > 32);
3624 // could enforce some sort of length limit here
3625 *script_buf_iter = '\0';
3626 }
3627 memcpy(&(subst_argv[load_param_idx_end]), &(argvk[arg_idx + 2]), (argc - arg_idx - 2) * sizeof(intptr_t));
3628 argc = new_param_ct;
3629 *argc_ptr = argc;
3630 first_arg_idx = 0;
3631 argvk = TO_CONSTCPCONSTP(subst_argv);
3632 *argv_ptr = subst_argv;
3633 break;
3634 }
3635 }
3636 for (uint32_t arg_idx = first_arg_idx; arg_idx != S_CAST(uint32_t, argc); ++arg_idx) {
3637 if ((!strcmp("-rerun", argvk[arg_idx])) || (!strcmp("--rerun", argvk[arg_idx]))) {
3638 const uint32_t param_ct = GetParamCt(argvk, argc, arg_idx);
3639 if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 0, 1))) {
3640 fputs(ver_str, stdout);
3641 fputs(ver_str2, stdout);
3642 fputs(g_logbuf, stderr);
3643 fputs(errstr_append, stderr);
3644 goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3645 }
3646 for (uint32_t arg_idx2 = arg_idx + param_ct + 1; arg_idx2 != S_CAST(uint32_t, argc); ++arg_idx2) {
3647 if (unlikely((!strcmp("-rerun", argvk[arg_idx2])) || (!strcmp("--rerun", argvk[arg_idx2])))) {
3648 fputs(ver_str, stdout);
3649 fputs(ver_str2, stdout);
3650 fputs("Error: Duplicate --rerun flag.\n", stderr);
3651 goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3652 }
3653 }
3654 reterr = Rerun(ver_str, ver_str2, prog_name_str, arg_idx, param_ct, &argc, &first_arg_idx, argv_ptr, &pcmp->subst_argv, &pcmp->rerun_buf);
3655 if (unlikely(reterr)) {
3656 goto CmdlineParsePhase1_ret_1;
3657 }
3658 *argc_ptr = argc;
3659 subst_argv = pcmp->subst_argv;
3660 argvk = TO_CONSTCPCONSTP(*argv_ptr);
3661 break;
3662 }
3663 }
3664 if (unlikely((first_arg_idx < S_CAST(uint32_t, argc)) && (!IsCmdlineFlag(argvk[first_arg_idx])))) {
3665 fputs("Error: First argument must be a flag.\n", stderr);
3666 fputs(errstr_append, stderr);
3667 goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3668 }
3669 uint32_t flag_ct = 0;
3670 uint32_t version_present = 0;
3671 uint32_t silent_present = 0;
3672 for (uint32_t arg_idx = first_arg_idx; arg_idx != S_CAST(uint32_t, argc); ++arg_idx) {
3673 const char* flagname_p = IsCmdlineFlagStart(argvk[arg_idx]);
3674 if (flagname_p) {
3675 const uint32_t flagname_p_slen = strlen(flagname_p);
3676 if (strequal_k(flagname_p, "help", flagname_p_slen)) {
3677 fputs(ver_str, stdout);
3678 fputs(ver_str2, stdout);
3679 if ((!first_arg_idx) || (arg_idx != 1) || subst_argv) {
3680 fputs("--help present, ignoring other flags.\n", stdout);
3681 }
3682 if ((arg_idx == S_CAST(uint32_t, argc) - 1) && flag_ct) {
3683 // make "plink [valid flags/arguments] --help" work, and skip the
3684 // arguments
3685 const char** help_argv;
3686 if (unlikely(pgl_malloc(flag_ct * sizeof(intptr_t), &help_argv))) {
3687 goto CmdlineParsePhase1_ret_NOMEM2;
3688 }
3689 uint32_t arg_idx2 = 0;
3690 for (uint32_t flag_idx = 0; flag_idx != flag_ct; ++flag_idx) {
3691 while (!IsCmdlineFlagStart(argvk[++arg_idx2]));
3692 help_argv[flag_idx] = argvk[arg_idx2];
3693 }
3694 reterr = disp_help_fn(help_argv, flag_ct);
3695 free(help_argv);
3696 } else {
3697 reterr = disp_help_fn(&(argvk[arg_idx + 1]), argc - arg_idx - 1);
3698 }
3699 if (!reterr) {
3700 reterr = kPglRetHelp;
3701 }
3702 goto CmdlineParsePhase1_ret_1;
3703 }
3704 if (strequal_k(flagname_p, "h", flagname_p_slen) ||
3705 strequal_k(flagname_p, "?", flagname_p_slen)) {
3706 // these just act like the no-argument case
3707 fputs(ver_str, stdout);
3708 fputs(ver_str2, stdout);
3709 if ((!first_arg_idx) || (arg_idx != 1) || subst_argv) {
3710 printf("-%c present, ignoring other flags.\n", *flagname_p);
3711 }
3712 fputs(cmdline_format_str, stdout);
3713 fputs(notestr_null_calc2, stdout);
3714 reterr = kPglRetHelp;
3715 goto CmdlineParsePhase1_ret_1;
3716 }
3717 if (strequal_k(flagname_p, "version", flagname_p_slen)) {
3718 version_present = 1;
3719 } else if (strequal_k(flagname_p, "silent", flagname_p_slen)) {
3720 silent_present = 1;
3721 } else if (unlikely(flagname_p_slen >= max_flag_blen)) {
3722 fputs(ver_str, stdout);
3723 fputs(ver_str2, stdout);
3724 // shouldn't be possible for this to overflow the buffer...
3725 snprintf(g_logbuf, kLogbufSize, "Error: Unrecognized flag ('%s').\n", argvk[arg_idx]);
3726 WordWrapB(0);
3727 fputs(g_logbuf, stderr);
3728 fputs(errstr_append, stderr);
3729 goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3730 }
3731 ++flag_ct;
3732 }
3733 }
3734 if (version_present) {
3735 fputs(ver_str, stdout);
3736 putc_unlocked('\n', stdout);
3737 reterr = kPglRetHelp;
3738 goto CmdlineParsePhase1_ret_1;
3739 }
3740 if (silent_present) {
3741 if (unlikely(!freopen(NULL_STREAM_NAME, "w", stdout))) {
3742 fputs("Warning: --silent failed.", stderr);
3743 g_stderr_written_to = 1;
3744 }
3745 }
3746 fputs(ver_str, stdout);
3747 fputs(ver_str2, stdout);
3748 *first_arg_idx_ptr = first_arg_idx;
3749 *flag_ct_ptr = flag_ct;
3750 }
3751 while (0) {
3752 CmdlineParsePhase1_ret_NOMEM:
3753 fputs(ver_str, stdout);
3754 fputs(ver_str2, stdout);
3755 CmdlineParsePhase1_ret_NOMEM2:
3756 fputs(kErrstrNomem, stderr);
3757 if (g_failed_alloc_attempt_size) {
3758 fprintf(stderr, "Failed allocation size: %" PRIuPTR "\n", g_failed_alloc_attempt_size);
3759 }
3760 reterr = kPglRetNomem;
3761 break;
3762 CmdlineParsePhase1_ret_OPEN_FAIL:
3763 reterr = kPglRetOpenFail;
3764 break;
3765 CmdlineParsePhase1_ret_READ_FAIL:
3766 reterr = kPglRetReadFail;
3767 break;
3768 CmdlineParsePhase1_ret_INVALID_CMDLINE:
3769 reterr = kPglRetInvalidCmdline;
3770 break;
3771 }
3772 CmdlineParsePhase1_ret_1:
3773 fclose_cond(scriptfile);
3774 return reterr;
3775 }
3776
// Assumes CmdlineParsePhase1() has completed, flag names have been copied to
// flag_buf/flag_map, aliases handled, and PROG_NAME_STR has been copied to
// outname (no null-terminator needed). outname_end must be initialized to
// nullptr.
// This sorts the flag names so they're processed in a predictable order,
// handles --out if present, initializes the log, and determines the number of
// processors the OS wants us to think the machine has.
// Also handles --d (range-delimiter override) here, and echoes the full
// "Options in effect:" list into the log (which is what --rerun later
// parses).
PglErr CmdlineParsePhase2(const char* ver_str, const char* errstr_append, const char* const* argvk, uint32_t prog_name_str_slen, uint32_t max_flag_blen, int32_t argc, uint32_t flag_ct, Plink2CmdlineMeta* pcmp, char* outname, char** outname_end_ptr, char* range_delim_ptr, int32_t* known_procs_ptr, uint32_t* max_thread_ct_ptr) {
  PglErr reterr = kPglRetSuccess;
  {
    char* flag_buf = pcmp->flag_buf;
    uint32_t* flag_map = pcmp->flag_map;
    reterr = SortCmdlineFlags(max_flag_blen, flag_ct, flag_buf, flag_map);
    if (unlikely(reterr)) {
      if (reterr == kPglRetNomem) {
        // Logfile isn't open yet, so report directly to stderr.
        goto CmdlineParsePhase2_ret_NOMEM_NOLOG;
      }
      goto CmdlineParsePhase2_ret_1;
    }

    *range_delim_ptr = '-';
    // Flags are now sorted; scan for "d" and "out" (everything sorting
    // after "out" can be skipped).
    for (uint32_t cur_flag_idx = 0; cur_flag_idx != flag_ct; ++cur_flag_idx) {
      char* cur_flag = &(flag_buf[cur_flag_idx * max_flag_blen]);
      if (strequal_k_unsafe(cur_flag, "d")) {
        // Must be here, instead of in the main parse loop, for --covar-name +
        // --d to work.
        const uint32_t arg_idx = flag_map[cur_flag_idx];
        const uint32_t param_ct = GetParamCt(argvk, argc, arg_idx);
        if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
          fputs(g_logbuf, stderr);
          fputs(errstr_append, stderr);
          goto CmdlineParsePhase2_ret_INVALID_CMDLINE;
        }
        const char cc = ExtractCharParam(argvk[arg_idx + 1]);
        if (unlikely(!cc)) {
          fputs("Error: --d argument too long (must be a single character).\n", stderr);
          goto CmdlineParsePhase2_ret_INVALID_CMDLINE;
        }
        if ((cc == '-') || (cc == ',')) {
          fputs("Error: --d argument cannot be '-' or ','.\n", stderr);
          goto CmdlineParsePhase2_ret_INVALID_CMDLINE;
        }
        *range_delim_ptr = cc;
        // bugfix (31 Jul 2019): can't set *cur_flag = '\0' to mark the flag as
        // already-processed, since we haven't written the start of the .log
        // file yet.
        continue;
      }
      // Includes the null terminator in the comparison, so this only matches
      // the exact flag name "out".
      const int32_t memcmp_out_result = Memcmp("out", cur_flag, 4);
      if (!memcmp_out_result) {
        const uint32_t arg_idx = flag_map[cur_flag_idx];
        const uint32_t param_ct = GetParamCt(argvk, argc, arg_idx);
        if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
          fputs(g_logbuf, stderr);
          fputs(errstr_append, stderr);
          goto CmdlineParsePhase2_ret_INVALID_CMDLINE;
        }
        // Leave room for appending output-filename extensions.
        if (unlikely(strlen(argvk[arg_idx + 1]) > (kPglFnamesize - kMaxOutfnameExtBlen))) {
          fflush(stdout);
          fputs("Error: --out argument too long.\n", stderr);
          goto CmdlineParsePhase2_ret_OPEN_FAIL;
        }
        const uint32_t slen = strlen(argvk[arg_idx + 1]);
        memcpy(outname, argvk[arg_idx + 1], slen + 1);
        *outname_end_ptr = &(outname[slen]);
      }
      if (memcmp_out_result <= 0) {
        // Sorted order: no later flag can be "d" or "out".
        break;
      }
    }
    // If --out was absent, outname still holds PROG_NAME_STR.
    if (unlikely(InitLogfile(0, outname, (*outname_end_ptr)? (*outname_end_ptr) : &(outname[prog_name_str_slen])))) {
      goto CmdlineParsePhase2_ret_OPEN_FAIL;
    }
    logputs_silent(ver_str);
    logputs_silent("\n");
    logputs("Options in effect:\n");
    for (uint32_t cur_flag_idx = 0; cur_flag_idx != flag_ct; ++cur_flag_idx) {
      logputs("  --");
      logputs(&(flag_buf[cur_flag_idx * max_flag_blen]));
      uint32_t arg_idx = flag_map[cur_flag_idx] + 1;
      // Echo the flag's arguments (everything up to the next flag).
      while ((arg_idx < S_CAST(uint32_t, argc)) && (!IsCmdlineFlag(argvk[arg_idx]))) {
        logputs(" ");
        // Thought about special-casing argvk[arg_idx] == " ", so that
        // "--id-delim ' '" actually works with --rerun, but that adds too much
        // complexity to the rereader to be worth it. Instead we just document
        // the incompatibility.
        logputs(argvk[arg_idx++]);
      }
      logputs("\n");
    }
    logputs("\n");

#ifdef _WIN32
    DWORD windows_dw = kTextbufSize;
    if (GetComputerName(g_textbuf, &windows_dw))
#else
    if (gethostname(g_textbuf, kTextbufSize) != -1)
#endif
    {
      // Hostname is logged to the file only, not to the console.
      logputs_silent("Hostname: ");
      logputs_silent(g_textbuf);
    }
    logputs_silent("\nWorking directory: ");
    if (unlikely(!getcwd(g_textbuf, kPglFnamesize))) {
      logputs_silent("\n");
      logerrprintfww("Error: Failed to get current working directory: %s.\n", strerror(errno));
      // debatable what error code applies here
      goto CmdlineParsePhase2_ret_OPEN_FAIL;
    }
    logputs_silent(g_textbuf);
    logputs_silent("\n");
    logputs("Start time: ");
    time_t rawtime;
    time(&rawtime);
    logputs(ctime(&rawtime));
    // ctime string always has a newline at the end
    logputs_silent("\n");

    *max_thread_ct_ptr = NumCpu(known_procs_ptr);
    // don't subtract 1 any more since, when max_thread_ct > 2, one of the
    // (virtual) cores will be dedicated to I/O and have lots of downtime.
    //
    // may make kMaxThreads a parameter later.
    if (*max_thread_ct_ptr > kMaxThreads) {
      *max_thread_ct_ptr = kMaxThreads;
    }
  }
  while (0) {
  CmdlineParsePhase2_ret_NOMEM_NOLOG:
    fputs(kErrstrNomem, stderr);
    if (g_failed_alloc_attempt_size) {
      fprintf(stderr, "Failed allocation size: %" PRIuPTR "\n", g_failed_alloc_attempt_size);
    }
    reterr = kPglRetNomem;
    break;
  CmdlineParsePhase2_ret_OPEN_FAIL:
    reterr = kPglRetOpenFail;
    break;
  CmdlineParsePhase2_ret_INVALID_CMDLINE:
    reterr = kPglRetInvalidCmdline;
    break;
  }
 CmdlineParsePhase2_ret_1:
  return reterr;
}
3922
CmdlineParsePhase3(uintptr_t max_default_mib,uintptr_t malloc_size_mib,uint32_t memory_require,Plink2CmdlineMeta * pcmp,unsigned char ** bigstack_ua_ptr)3923 PglErr CmdlineParsePhase3(uintptr_t max_default_mib, uintptr_t malloc_size_mib, uint32_t memory_require, Plink2CmdlineMeta* pcmp, unsigned char** bigstack_ua_ptr) {
3924 PglErr reterr = kPglRetSuccess;
3925 {
3926 if (pcmp->subst_argv) {
3927 free(pcmp->subst_argv);
3928 pcmp->subst_argv = nullptr;
3929 }
3930 if (pcmp->script_buf) {
3931 free(pcmp->script_buf);
3932 pcmp->script_buf = nullptr;
3933 }
3934 if (pcmp->rerun_buf) {
3935 free(pcmp->rerun_buf);
3936 pcmp->rerun_buf = nullptr;
3937 }
3938 if (pcmp->flag_buf) {
3939 free(pcmp->flag_buf);
3940 pcmp->flag_buf = nullptr;
3941 }
3942 if (pcmp->flag_map) {
3943 free(pcmp->flag_map);
3944 pcmp->flag_map = nullptr;
3945 }
3946
3947 uint64_t total_mib = DetectMib();
3948 if (!malloc_size_mib) {
3949 if (!total_mib) {
3950 malloc_size_mib = max_default_mib? max_default_mib : kBigstackDefaultMib;
3951 } else if (total_mib < (kBigstackMinMib * 2)) {
3952 malloc_size_mib = kBigstackMinMib;
3953 } else {
3954 malloc_size_mib = total_mib / 2;
3955 if (max_default_mib && (malloc_size_mib > max_default_mib)) {
3956 malloc_size_mib = max_default_mib;
3957 }
3958 }
3959 }
3960 assert(malloc_size_mib >= kBigstackMinMib);
3961 #ifndef __LP64__
3962 if (malloc_size_mib > kMalloc32bitMibMax) {
3963 malloc_size_mib = kMalloc32bitMibMax;
3964 }
3965 #endif
3966 if (total_mib) {
3967 snprintf(g_logbuf, kLogbufSize, "%" PRIu64 " MiB RAM detected; reserving %" PRIuPTR " MiB for main workspace.\n", total_mib, malloc_size_mib);
3968 } else {
3969 snprintf(g_logbuf, kLogbufSize, "Failed to determine total system memory. Attempting to reserve %" PRIuPTR " MiB.\n", malloc_size_mib);
3970 }
3971 logputsb();
3972 uintptr_t malloc_mib_final;
3973 if (unlikely(InitBigstack(malloc_size_mib, &malloc_mib_final, bigstack_ua_ptr))) {
3974 goto CmdlineParsePhase3_ret_NOMEM;
3975 }
3976 if (malloc_size_mib != malloc_mib_final) {
3977 if (unlikely(memory_require)) {
3978 goto CmdlineParsePhase3_ret_NOMEM;
3979 }
3980 logprintf("Allocated %" PRIuPTR " MiB successfully, after larger attempt(s) failed.\n", malloc_mib_final);
3981 }
3982 }
3983 while (0) {
3984 CmdlineParsePhase3_ret_NOMEM:
3985 reterr = kPglRetNomem;
3986 break;
3987 }
3988 return reterr;
3989 }
3990
CleanupPlink2CmdlineMeta(Plink2CmdlineMeta * pcmp)3991 void CleanupPlink2CmdlineMeta(Plink2CmdlineMeta* pcmp) {
3992 free_cond(pcmp->subst_argv);
3993 free_cond(pcmp->script_buf);
3994 free_cond(pcmp->rerun_buf);
3995 free_cond(pcmp->flag_buf);
3996 free_cond(pcmp->flag_map);
3997 }
3998
3999
// Initializes the pointer member of a CmpExpr so that CleanupCmpExpr() can
// safely be called on it later.  (binary_op is assigned separately, by
// ValidateAndAllocCmpExpr().)
void InitCmpExpr(CmpExpr* cmp_expr_ptr) {
  cmp_expr_ptr->pheno_name = nullptr;
}
4003
// Frees the name/value buffer allocated by ValidateAndAllocCmpExpr(), if any.
// free_cond() tolerates nullptr, so this is safe after InitCmpExpr() alone.
void CleanupCmpExpr(CmpExpr* cmp_expr_ptr) {
  free_cond(cmp_expr_ptr->pheno_name);
}
4007
4008 // may want CXXCONST_CP treatment later
ParseNextBinaryOp(const char * expr_str,uint32_t expr_slen,const char ** op_start_ptr,CmpBinaryOp * binary_op_ptr)4009 const char* ParseNextBinaryOp(const char* expr_str, uint32_t expr_slen, const char** op_start_ptr, CmpBinaryOp* binary_op_ptr) {
4010 // !=, <>: kCmpOperatorNoteq
4011 // <: kCmpOperatorLe
4012 // <=: kCmpOperatorLeq
4013 // =, ==: kCmpOperatorEq
4014 // >=: kCmpOperatorGeq
4015 // >: kCmpOperatorGe
4016 const char* next_eq = S_CAST(const char*, memchr(expr_str, '=', expr_slen));
4017 const char* next_lt = S_CAST(const char*, memchr(expr_str, '<', expr_slen));
4018 const char* next_gt = S_CAST(const char*, memchr(expr_str, '>', expr_slen));
4019 if (!next_eq) {
4020 if (!next_lt) {
4021 // may want to remove unlikely() later
4022 if (unlikely(!next_gt)) {
4023 return nullptr;
4024 }
4025 *op_start_ptr = next_gt;
4026 *binary_op_ptr = kCmpOperatorGe;
4027 return &(next_gt[1]);
4028 }
4029 if (next_gt == (&(next_lt[1]))) {
4030 *op_start_ptr = next_lt;
4031 *binary_op_ptr = kCmpOperatorNoteq;
4032 return &(next_lt[2]);
4033 }
4034 if ((!next_gt) || (next_gt > next_lt)) {
4035 *op_start_ptr = next_lt;
4036 *binary_op_ptr = kCmpOperatorLe;
4037 return &(next_lt[1]);
4038 }
4039 *op_start_ptr = next_gt;
4040 *binary_op_ptr = kCmpOperatorGe;
4041 return &(next_gt[1]);
4042 }
4043 if ((!next_lt) || (next_lt > next_eq)) {
4044 if ((!next_gt) || (next_gt > next_eq)) {
4045 if ((next_eq != expr_str) && (next_eq[-1] == '!')) {
4046 *op_start_ptr = &(next_eq[-1]);
4047 *binary_op_ptr = kCmpOperatorNoteq;
4048 return &(next_eq[1]);
4049 }
4050 *op_start_ptr = next_eq;
4051 *binary_op_ptr = kCmpOperatorEq;
4052 return (next_eq[1] == '=')? (&(next_eq[2])) : (&(next_eq[1]));
4053 }
4054 *op_start_ptr = next_gt;
4055 if (next_eq == (&(next_gt[1]))) {
4056 *binary_op_ptr = kCmpOperatorGeq;
4057 return &(next_gt[2]);
4058 }
4059 *binary_op_ptr = kCmpOperatorGe;
4060 return &(next_gt[1]);
4061 }
4062 if (next_gt == (&(next_lt[1]))) {
4063 *op_start_ptr = next_lt;
4064 *binary_op_ptr = kCmpOperatorNoteq;
4065 return &(next_lt[2]);
4066 }
4067 if ((!next_gt) || (next_gt > next_lt)) {
4068 *op_start_ptr = next_lt;
4069 if (next_eq == (&(next_lt[1]))) {
4070 *binary_op_ptr = kCmpOperatorLeq;
4071 return &(next_lt[2]);
4072 }
4073 *binary_op_ptr = kCmpOperatorLe;
4074 return &(next_lt[1]);
4075 }
4076 *op_start_ptr = next_gt;
4077 if (next_eq == (&(next_gt[1]))) {
4078 *binary_op_ptr = kCmpOperatorGeq;
4079 return &(next_gt[2]);
4080 }
4081 *binary_op_ptr = kCmpOperatorGe;
4082 return &(next_gt[1]);
4083 }
4084
// Validates a comparison expression supplied on the command line and stores
// it in *cmp_expr_ptr.
// * sources[]: the flag's parameter tokens, either {name, operator, value}
//   (param_ct == 3) or a single token holding the whole expression
//   (param_ct == 1).
// * flag_name: only used in error messages.
// On success, cmp_expr_ptr->binary_op holds the operator, and
// cmp_expr_ptr->pheno_name owns one heap buffer containing the name and the
// value as two consecutive null-terminated strings; release it with
// CleanupCmpExpr().
PglErr ValidateAndAllocCmpExpr(const char* const* sources, const char* flag_name, uint32_t param_ct, CmpExpr* cmp_expr_ptr) {
  // Currently four use cases:
  //   [pheno/covar name] [operator] [pheno val]: regular comparison
  //   [pheno/covar name]: existence check
  //   [INFO key] [operator] [val]: regular comparison
  //   [INFO key]: existence check
  // Some key/value validation is deferred to LoadPvar()/KeepRemoveIf(),
  // since the requirements are different (e.g. no semicolons in anything
  // INFO-related, categorical phenotypes can be assumed to not start with a
  // valid number).
  // May support or/and, parentheses later, but need to be careful to not slow
  // down LoadPvar() too much in the no-INFO-filter case.
  PglErr reterr = kPglRetSuccess;
  {
    if (unlikely((param_ct != 1) && (param_ct != 3))) {
      goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
    }
    const char* pheno_name_start = sources[0];
    const char* pheno_val_start;
    uint32_t pheno_name_slen;
    uint32_t pheno_val_slen;
    if (param_ct == 3) {
      pheno_name_slen = strlen(pheno_name_start);
      const char* op_str = sources[1];
      uint32_t op_slen = strlen(op_str);
      // ok to have single/double quotes around operator
      if (op_slen > 2) {
        const char cc = op_str[0];
        if (((cc == '\'') || (cc == '"')) && (op_str[op_slen - 1] == cc)) {
          ++op_str;
          op_slen -= 2;
        }
      }
      const char* op_start;
      // The middle token must consist of exactly one binary operator: the
      // operator must start at the first character, and nothing may follow
      // it.
      const char* op_end = ParseNextBinaryOp(op_str, op_slen, &op_start, &cmp_expr_ptr->binary_op);
      if (unlikely((!op_end) || (*op_end) || (op_start != op_str))) {
        goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
      }
      pheno_val_start = sources[2];
      pheno_val_slen = strlen(pheno_val_start);
    } else {
      // permit param_ct == 1 as long as tokens are unambiguous
      uint32_t expr_slen = strlen(pheno_name_start);
      const char* op_start;
      pheno_val_start = ParseNextBinaryOp(pheno_name_start, expr_slen, &op_start, &cmp_expr_ptr->binary_op);
      // Reject: no operator at all, nothing after the operator, or the
      // expression starting with the operator (empty name).
      if (unlikely((!pheno_val_start) || (!(*pheno_val_start)) || (op_start == pheno_name_start))) {
        goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
      }
      pheno_name_slen = op_start - pheno_name_start;
      // quasi-bugfix (13 Dec 2017): allow single argument to contain internal
      // spaces;
      //   --keep-if "PHENO1 > 1"
      // is more intuitive and consistent with usage of other command-line
      // tools than
      //   --keep-if PHENO1 '>' 1
      //
      // To prevent --rerun from breaking, if there's a space after the
      // operator, there must be a space before the operator as well, etc.
      if (*pheno_val_start == ' ') {
        if (unlikely(pheno_name_start[pheno_name_slen - 1] != ' ')) {
          goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
        }
        // Skip spaces after the operator, then trim the trailing spaces off
        // the name (erroring out if the name was all spaces).
        do {
          ++pheno_val_start;
        } while (*pheno_val_start == ' ');
        do {
          --pheno_name_slen;
          if (unlikely(!pheno_name_slen)) {
            goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
          }
        } while (pheno_name_start[pheno_name_slen - 1] == ' ');
      }
      pheno_val_slen = expr_slen - S_CAST(uintptr_t, pheno_val_start - pheno_name_start);
    }
    // No internal spaces allowed in either the name or the value.
    if (unlikely(memchr(pheno_name_start, ' ', pheno_name_slen) || memchr(pheno_val_start, ' ', pheno_val_slen))) {
      goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
    }
    if (unlikely((pheno_name_slen > kMaxIdSlen) || (pheno_val_slen > kMaxIdSlen))) {
      logerrprintf("Error: ID too long in %s expression.\n", flag_name);
      goto ValidateAndAllocCmpExpr_ret_INVALID_CMDLINE;
    }
    // Ordered comparisons (<, <=, >=, >) require a finite numeric value;
    // (in)equality comparisons may be against e.g. a categorical value.
    if ((cmp_expr_ptr->binary_op != kCmpOperatorNoteq) && (cmp_expr_ptr->binary_op != kCmpOperatorEq)) {
      double dxx;
      if (unlikely(!ScantokDouble(pheno_val_start, &dxx))) {
        logerrprintfww("Error: Invalid %s value '%s' (finite number expected).\n", flag_name, pheno_val_start);
        goto ValidateAndAllocCmpExpr_ret_INVALID_CMDLINE;
      }
    }
    // Store name and value back-to-back in a single allocation:
    // "<name>\0<value>\0".
    char* new_pheno_name_buf;
    if (unlikely(pgl_malloc(2 + pheno_name_slen + pheno_val_slen, &new_pheno_name_buf))) {
      goto ValidateAndAllocCmpExpr_ret_NOMEM;
    }
    memcpyx(new_pheno_name_buf, pheno_name_start, pheno_name_slen, '\0');
    // pheno_val_start guaranteed to be null-terminated for now
    memcpy(&(new_pheno_name_buf[pheno_name_slen + 1]), pheno_val_start, pheno_val_slen + 1);
    cmp_expr_ptr->pheno_name = new_pheno_name_buf;
  }
  while (0) {
  ValidateAndAllocCmpExpr_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC:
    logerrprintf("Error: Invalid %s expression.\n", flag_name);
    // fall through
  ValidateAndAllocCmpExpr_ret_INVALID_CMDLINE:
    reterr = kPglRetInvalidCmdline;
    break;
  }
  return reterr;
}
4194
4195 // See e.g. meta_analysis_open_and_read_header() in plink 1.9.
4196 // Assumes at least one search term.
SearchHeaderLine(const char * header_line_iter,const char * const * search_multistrs,const char * flagname_p,uint32_t search_col_ct,uint32_t * found_col_ct_ptr,uint32_t * found_type_bitset_ptr,uint32_t * col_skips,uint32_t * col_types)4197 PglErr SearchHeaderLine(const char* header_line_iter, const char* const* search_multistrs, const char* flagname_p, uint32_t search_col_ct, uint32_t* found_col_ct_ptr, uint32_t* found_type_bitset_ptr, uint32_t* col_skips, uint32_t* col_types) {
4198 assert(search_col_ct <= 32);
4199 unsigned char* bigstack_mark = g_bigstack_base;
4200 PglErr reterr = kPglRetSuccess;
4201 {
4202 uint32_t search_term_ct = 0;
4203 uintptr_t max_blen = 0;
4204 for (uint32_t search_col_idx = 0; search_col_idx != search_col_ct; ++search_col_idx) {
4205 const uint32_t cur_search_term_ct = CountAndMeasureMultistr(search_multistrs[search_col_idx], &max_blen);
4206 assert(cur_search_term_ct <= (1 << 26));
4207 search_term_ct += cur_search_term_ct;
4208 }
4209 char* merged_strbox;
4210 uint32_t* id_map;
4211 uint32_t* priority_vals;
4212 uint64_t* cols_and_types;
4213 if (unlikely(
4214 bigstack_alloc_c(search_term_ct * max_blen, &merged_strbox) ||
4215 bigstack_alloc_u32(search_term_ct, &id_map) ||
4216 bigstack_alloc_u32(search_col_ct, &priority_vals) ||
4217 bigstack_alloc_u64(search_col_ct, &cols_and_types))) {
4218 goto SearchHeaderLine_ret_NOMEM;
4219 }
4220 uint32_t search_term_idx = 0;
4221 for (uint32_t search_col_idx = 0; search_col_idx != search_col_ct; ++search_col_idx) {
4222 const char* multistr_iter = search_multistrs[search_col_idx];
4223 uint32_t priority_idx = 0; // 0 = highest priority
4224 while (*multistr_iter) {
4225 const uint32_t cur_blen = strlen(multistr_iter) + 1;
4226 memcpy(&(merged_strbox[search_term_idx * max_blen]), multistr_iter, cur_blen);
4227 id_map[search_term_idx] = search_col_idx + (priority_idx * 32);
4228 ++search_term_idx;
4229 ++priority_idx;
4230 multistr_iter = &(multistr_iter[cur_blen]);
4231 }
4232 }
4233 SortStrboxIndexed(search_term_ct, max_blen, 0, merged_strbox, id_map);
4234 assert(search_term_ct);
4235 const char* duplicate_search_term = ScanForDuplicateIds(merged_strbox, search_term_ct, max_blen);
4236 if (unlikely(duplicate_search_term)) {
4237 logerrprintfww("Error: Duplicate term '%s' in --%s column search order.\n", duplicate_search_term, flagname_p);
4238 goto SearchHeaderLine_ret_INVALID_CMDLINE;
4239 }
4240
4241 SetAllU32Arr(search_col_ct, priority_vals);
4242 SetAllU64Arr(search_col_ct, cols_and_types);
4243 for (uintptr_t col_idx = 0; ; ++col_idx) {
4244 const char* token_end = CurTokenEnd(header_line_iter);
4245 const uint32_t token_slen = token_end - header_line_iter;
4246 int32_t ii = bsearch_str(header_line_iter, merged_strbox, token_slen, max_blen, search_term_ct);
4247 if (ii != -1) {
4248 const uint32_t cur_map_idx = id_map[S_CAST(uint32_t, ii)];
4249 const uint32_t search_col_idx = cur_map_idx & 31;
4250 const uint32_t priority_idx = cur_map_idx >> 5;
4251 if (priority_vals[search_col_idx] >= priority_idx) {
4252 if (unlikely(priority_vals[search_col_idx] == priority_idx)) {
4253 logerrprintfww("Error: Duplicate column header '%s' in --%s file.\n", &(merged_strbox[max_blen * cur_map_idx]), flagname_p);
4254 goto SearchHeaderLine_ret_MALFORMED_INPUT;
4255 }
4256 priority_vals[search_col_idx] = priority_idx;
4257 cols_and_types[search_col_idx] = (S_CAST(uint64_t, col_idx) << 32) | search_col_idx;
4258 }
4259 }
4260 header_line_iter = FirstNonTspace(token_end);
4261 if (IsEolnKns(*header_line_iter)) {
4262 break;
4263 }
4264 }
4265 uint32_t found_type_bitset = 0;
4266 for (uint32_t search_col_idx = 0; search_col_idx != search_col_ct; ++search_col_idx) {
4267 if (priority_vals[search_col_idx] != UINT32_MAX) {
4268 found_type_bitset |= 1U << search_col_idx;
4269 }
4270 }
4271 const uint32_t found_col_ct = PopcountWord(found_type_bitset);
4272 *found_col_ct_ptr = found_col_ct;
4273 *found_type_bitset_ptr = found_type_bitset;
4274 if (found_col_ct) {
4275 STD_SORT(search_col_ct, u64cmp, cols_and_types);
4276 uint32_t prev_col_idx = cols_and_types[0] >> 32;
4277 col_skips[0] = prev_col_idx;
4278 col_types[0] = S_CAST(uint32_t, cols_and_types[0]);
4279 for (uint32_t found_col_idx = 1; found_col_idx != found_col_ct; ++found_col_idx) {
4280 const uint64_t cur_col_and_type = cols_and_types[found_col_idx];
4281 const uint32_t cur_col_idx = cur_col_and_type >> 32;
4282 col_skips[found_col_idx] = cur_col_idx - prev_col_idx;
4283 col_types[found_col_idx] = S_CAST(uint32_t, cur_col_and_type);
4284 prev_col_idx = cur_col_idx;
4285 }
4286 }
4287 }
4288 while (0) {
4289 SearchHeaderLine_ret_NOMEM:
4290 reterr = kPglRetNomem;
4291 break;
4292 SearchHeaderLine_ret_INVALID_CMDLINE:
4293 reterr = kPglRetInvalidCmdline;
4294 break;
4295 SearchHeaderLine_ret_MALFORMED_INPUT:
4296 reterr = kPglRetMalformedInput;
4297 break;
4298 }
4299 BigstackReset(bigstack_mark);
4300 return reterr;
4301 }
4302
// Parses a column-set descriptor (the text after "cols=") into a bitmask.
// * col_descriptor_iter: comma-separated column-set IDs.  Either every ID is
//   prefixed with '+'/'-' (start from default_cols_mask and add/remove), or
//   none are (explicit column set); mixing the two forms is an error.
// * supported_ids: multistr of recognized column-set IDs.
// * cur_flag_name: flag name, used only in error messages.
// * first_col_shifted: bit for the first supported ID; the i-th ID maps to
//   (first_col_shifted << i).
// * prohibit_empty: if nonzero, error out when no column bits end up set.
// * result_ptr: really a uint32_t*; updated in place on success.
PglErr ParseColDescriptor(const char* col_descriptor_iter, const char* supported_ids, const char* cur_flag_name, uint32_t first_col_shifted, uint32_t default_cols_mask, uint32_t prohibit_empty, void* result_ptr) {
  // col_descriptor is usually a pointer to argv[...][5] (first five characters
  // are "cols="). supported_ids is a multistr.
  // may need to switch first_col_shifted/default_cols_mask/result to uint64_t
  PglErr reterr = kPglRetSuccess;
  uint32_t* id_map = nullptr;
  {
    uint32_t max_id_blen = 0;
    uint32_t id_ct = 0;

    // work around strchr not returning a const char*?
    const char* supported_ids_iter = supported_ids;

    // can precompute this sorted index and avoid the dynamic
    // allocations/deallocations, but this is cheap enough that I'd rather make
    // it easier to extend functionality.
    do {
      const uint32_t blen = 1 + strlen(supported_ids_iter);
      if (blen > max_id_blen) {
        max_id_blen = blen;
      }
      ++id_ct;
      supported_ids_iter = &(supported_ids_iter[blen]);
    } while (*supported_ids_iter);
    // Single allocation holds id_map (id_ct uint32s), then the sorted ID
    // string-box, then one extra max_id_blen-sized slot used as scratch for
    // the "maybe"-prefixed search below.
    // max_id_blen + 4 extra bytes at the end, to support a "maybe" search
    // (yes, this can also be precomputed)
    if (unlikely(pgl_malloc((max_id_blen + 4) * (id_ct + 1) + 1, &id_map))) {
      goto ParseColDescriptor_ret_NOMEM;
    }
    char* sorted_ids = R_CAST(char*, &(id_map[id_ct]));
    supported_ids_iter = supported_ids;
    for (uint32_t id_idx = 0; id_idx != id_ct; ++id_idx) {
      const uint32_t blen = strlen(supported_ids_iter) + 1;
      memcpy(&(sorted_ids[id_idx * max_id_blen]), supported_ids_iter, blen);
      id_map[id_idx] = id_idx;
      supported_ids_iter = &(supported_ids_iter[blen]);
    }
    // After this sort, id_map[sorted_pos] recovers the original (bit-shift)
    // index of each ID.
    if (unlikely(SortStrboxIndexedMalloc(id_ct, max_id_blen, sorted_ids, id_map))) {
      goto ParseColDescriptor_ret_NOMEM;
    }
    uint32_t result = *S_CAST(uint32_t*, result_ptr);
    // might not want to bother splitting this into two loops
    if ((col_descriptor_iter[0] == '+') || (col_descriptor_iter[0] == '-')) {
      // '+'/'-' mode: start from the default column set and modify it.
      result |= default_cols_mask;
      char* maybebuf = &(sorted_ids[max_id_blen * id_ct]);
      memcpy_k(maybebuf, "maybe", 5);
      while (1) {
        const char* id_start = &(col_descriptor_iter[1]);
        const char* tok_end = strchrnul(id_start, ',');
        const uint32_t slen = tok_end - id_start;
        int32_t alpha_idx = bsearch_str(id_start, sorted_ids, slen, max_id_blen, id_ct);
        if (unlikely(alpha_idx == -1)) {
          char* write_iter = strcpya_k(g_logbuf, "Error: Unrecognized ID '");
          write_iter = memcpya(write_iter, id_start, slen);
          write_iter = strcpya_k(write_iter, "' in --");
          write_iter = strcpya(write_iter, cur_flag_name);
          snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " column set descriptor.\n");
          goto ParseColDescriptor_ret_INVALID_CMDLINE_WW;
        }
        uint32_t shift = id_map[S_CAST(uint32_t, alpha_idx)];
        if (col_descriptor_iter[0] == '+') {
          result |= first_col_shifted << shift;
        } else {
          if (result & (first_col_shifted << shift)) {
            result -= first_col_shifted << shift;
          } else if (slen + 5 < max_id_blen) {
            // special case: if default column set includes e.g. "maybesid",
            // and user types "-sid", that should work
            memcpy(&(maybebuf[5]), id_start, slen);
            alpha_idx = bsearch_str(maybebuf, sorted_ids, slen + 5, max_id_blen, id_ct);
            if (alpha_idx != -1) {
              shift = id_map[S_CAST(uint32_t, alpha_idx)];
              result &= ~(first_col_shifted << shift);
            }
          }
        }
        // bugfix (16 Oct 2017): forgot to switch from !tok_end to !(*tok_end)
        // after use of strchrnul().
        if (!(*tok_end)) {
          break;
        }
        col_descriptor_iter = &(tok_end[1]);
        // every subsequent ID must also carry a sign
        if (unlikely((col_descriptor_iter[0] != '+') && (col_descriptor_iter[0] != '-'))) {
          goto ParseColDescriptor_ret_MIXED_SIGN;
        }
      }
    } else if (*col_descriptor_iter) {
      // Unprefixed mode: explicit list of column-set IDs, OR'd into result.
      while (1) {
        const char* tok_end = strchrnul(col_descriptor_iter, ',');
        const uint32_t slen = tok_end - col_descriptor_iter;
        const int32_t alpha_idx = bsearch_str(col_descriptor_iter, sorted_ids, slen, max_id_blen, id_ct);
        if (unlikely(alpha_idx == -1)) {
          char* write_iter = strcpya_k(g_logbuf, "Error: Unrecognized ID '");
          write_iter = memcpya(write_iter, col_descriptor_iter, slen);
          write_iter = strcpya_k(write_iter, "' in --");
          write_iter = strcpya(write_iter, cur_flag_name);
          snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " column set descriptor.\n");
          goto ParseColDescriptor_ret_INVALID_CMDLINE_WW;
        }
        uint32_t shift = id_map[S_CAST(uint32_t, alpha_idx)];
        result |= first_col_shifted << shift;
        if (!(*tok_end)) {
          break;
        }
        col_descriptor_iter = &(tok_end[1]);
        // no signs allowed once an unsigned ID has been seen
        if (unlikely((col_descriptor_iter[0] == '+') || (col_descriptor_iter[0] == '-'))) {
          goto ParseColDescriptor_ret_MIXED_SIGN;
        }
      }
    }
    // (first_col_shifted * (UINT32_MAX >> (32 - id_ct))) is the mask of all
    // id_ct column bits.
    if (unlikely(prohibit_empty && (!(result & (first_col_shifted * (UINT32_MAX >> (32 - id_ct))))))) {
      char* write_iter = strcpya_k(g_logbuf, "Error: All columns excluded by --");
      write_iter = strcpya(write_iter, cur_flag_name);
      snprintf(write_iter, kLogbufSize - 128, " column set descriptor.\n");
      goto ParseColDescriptor_ret_INVALID_CMDLINE_WW;
    }
    *S_CAST(uint32_t*, result_ptr) = result;
  }
  while (0) {
  ParseColDescriptor_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  ParseColDescriptor_ret_MIXED_SIGN:
    snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s column set descriptor (either all column set IDs must be preceded by +/-, or none of them can be).\n", cur_flag_name);
    // fall through
  ParseColDescriptor_ret_INVALID_CMDLINE_WW:
    WordWrapB(0);
    logerrputsb();
    reterr = kPglRetInvalidCmdline;
    break;
  }
  free_cond(id_map);
  return reterr;
}
4436
4437 #ifdef __cplusplus
4438 } // namespace plink2
4439 #endif
4440