1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "plink2_cmdline.h"
19 
20 #ifdef __APPLE__
21   // needed for sysctl() call
22 #  include <sys/sysctl.h>
23 #endif
24 
25 #include <sys/types.h>  // open()
26 #include <sys/stat.h>  // open()
27 #include <fcntl.h>  // open()
28 #include <time.h>  // time(), ctime()
29 #include <unistd.h>  // getcwd(), gethostname(), sysconf(), fstat()
30 
31 #ifdef __cplusplus
32 namespace plink2 {
33 #endif
34 
// printf-style format strings shared by error reporting across this library.
const char kErrprintfFopen[] = "Error: Failed to open %s : %s.\n";
const char kErrprintfFread[] = "Error: %s read failure: %s.\n";
const char kErrprintfRewind[] = "Error: %s could not be scanned twice. (Process-substitution/named-pipe input is not permitted in this use case.)\n";

// General-purpose text scratch buffer.
char g_textbuf[kTextbufSize];

// now initialized by init_bigstack
// can't use std::array<char, 512>& since it's initially null
const char* g_one_char_strs = nullptr;
// If one-base indels become sufficiently common, might want to predefine
// g_two_char_strs[], and update allele string construction/destruction
// accordingly.  (Though that should either be programmatically initialized, or
// only cover a subset of the space; 192k is a lot to increase the binary image
// size for a single simple table.)

const char* g_input_missing_geno_ptr = nullptr;  // in addition to '.'
const char* g_output_missing_geno_ptr = nullptr;  // now '.'

// Log destination; opened by InitLogfile(), closed by CleanupLogfile().
FILE* g_logfile = nullptr;

// Staging buffer for messages emitted via logputsb()/logerrputsb().
char g_logbuf[kLogbufSize];

// Runtime logging state flags (0/1).
uint32_t g_debug_on = 0;  // when set, log writes are flushed immediately
uint32_t g_log_failed = 0;  // set once a write to g_logfile fails
uint32_t g_stderr_written_to = 0;  // set after anything is printed to stderr
60 
logputs_silent(const char * str)61 void logputs_silent(const char* str) {
62   if (!g_debug_on) {
63     fputs(str, g_logfile);
64     if (unlikely(ferror_unlocked(g_logfile))) {
65       putchar('\n');
66       fflush(stdout);
67       fprintf(stderr, "Warning: Logging failure on:\n%s\nFurther logging will not be attempted in this run.\n", str);
68       g_log_failed = 1;
69     }
70   } else {
71     if (g_log_failed) {
72       fflush(stdout);
73       fputs(str, stderr);
74     } else {
75       fputs(str, g_logfile);
76       if (unlikely(ferror_unlocked(g_logfile))) {
77         putchar('\n');
78         fflush(stdout);
79         fprintf(stderr, "Error: Debug logging failure.  Dumping to stderr:\n%s", str);
80         g_log_failed = 1;
81         g_stderr_written_to = 1;
82       } else {
83         fflush(g_logfile);
84       }
85     }
86   }
87 }
88 
// Writes str to both the logfile and stdout.
void logputs(const char* str) {
  logputs_silent(str);
  fputs(str, stdout);
}
93 
// Writes str to the logfile and to stderr, flushing stdout first so console
// output is not interleaved; records that stderr has been written to.
void logerrputs(const char* str) {
  logputs_silent(str);
  fflush(stdout);
  fputs(str, stderr);
  g_stderr_written_to = 1;
}
100 
// Writes the preformatted contents of g_logbuf to the logfile and stdout.
void logputsb() {
  logputs_silent(g_logbuf);
  fputs(g_logbuf, stdout);
}
105 
// Writes the preformatted contents of g_logbuf to the logfile and stderr
// (flushing stdout first); records that stderr has been written to.
void logerrputsb() {
  logputs_silent(g_logbuf);
  fflush(stdout);
  fputs(g_logbuf, stderr);
  g_stderr_written_to = 1;
}
112 
ForceNonFifo(const char * fname)113 PglErr ForceNonFifo(const char* fname) {
114   int32_t file_handle = open(fname, O_RDONLY);
115   if (unlikely(file_handle < 0)) {
116     return kPglRetOpenFail;
117   }
118   struct stat statbuf;
119   if (unlikely(fstat(file_handle, &statbuf) < 0)) {
120     close(file_handle);
121     return kPglRetOpenFail;
122   }
123   if (unlikely(S_ISFIFO(statbuf.st_mode))) {
124     close(file_handle);
125     return kPglRetRewindFail;
126   }
127   close(file_handle);
128   return kPglRetSuccess;
129 }
130 
fopen_checked(const char * fname,const char * mode,FILE ** target_ptr)131 BoolErr fopen_checked(const char* fname, const char* mode, FILE** target_ptr) {
132   /*
133   if (!strcmp(mode, FOPEN_WB)) {
134     // The fopen call may take a long time to return when overwriting in mode
135     // "w" in some scenarios on some OSes (I've seen 15+ sec).  On the other
136     // hand, sometimes it returns immediately and then writing takes less time.
137     // Since it goes both ways, I won't second-guess the OSes for now;
138     // presumably they're trying to minimize average amortized time.  But it
139     // might be worth retesting in the future; perhaps some common plink
140     // workflows violate normally-reasonable OS assumptions.
141     if (access(fname, W_OK) != -1) {
142       if (unlikely(unlink(fname))) {
143         logputs("\n");
144         logerrprintfww(kErrprintfFopen, fname, strerror(errno));
145         return 1;
146       }
147     }
148   }
149   */
150   *target_ptr = fopen(fname, mode);
151   if (unlikely(!(*target_ptr))) {
152     logputs("\n");
153     logerrprintfww(kErrprintfFopen, fname, strerror(errno));
154     return 1;
155   }
156   return 0;
157 }
158 
fwrite_flush2(char * buf_flush,FILE * outfile,char ** write_iter_ptr)159 BoolErr fwrite_flush2(char* buf_flush, FILE* outfile, char** write_iter_ptr) {
160   char* buf = &(buf_flush[-S_CAST(int32_t, kMaxMediumLine)]);
161   char* buf_end = *write_iter_ptr;
162   *write_iter_ptr = buf;
163   return fwrite_checked(buf, buf_end - buf, outfile);
164 }
165 
fclose_flush_null(char * buf_flush,char * write_iter,FILE ** outfile_ptr)166 BoolErr fclose_flush_null(char* buf_flush, char* write_iter, FILE** outfile_ptr) {
167   char* buf = &(buf_flush[-S_CAST(int32_t, kMaxMediumLine)]);
168   if (write_iter != buf) {
169     if (unlikely(fwrite_checked(buf, write_iter - buf, *outfile_ptr))) {
170       return 1;
171     }
172   }
173   return fclose_null(outfile_ptr);
174 }
175 
176 
// qsort() comparator: ascending float order.
int32_t float_cmp(const void* aa, const void* bb) {
  float va;
  float vb;
  memcpy(&va, aa, sizeof(float));
  memcpy(&vb, bb, sizeof(float));
  if (va > vb) {
    return 1;
  }
  // -1 when strictly smaller, 0 otherwise (including unordered/equal)
  return -(va < vb);
}
185 
// qsort() comparator: ascending double order.
int32_t double_cmp(const void* aa, const void* bb) {
  double va;
  double vb;
  memcpy(&va, aa, sizeof(double));
  memcpy(&vb, bb, sizeof(double));
  if (va > vb) {
    return 1;
  }
  return -(va < vb);
}
194 
// qsort() comparator: descending double order.
int32_t double_cmp_decr(const void* aa, const void* bb) {
  double va;
  double vb;
  memcpy(&va, aa, sizeof(double));
  memcpy(&vb, bb, sizeof(double));
  if (va < vb) {
    return 1;
  }
  return -(va > vb);
}
203 
// qsort() comparator: ascending uint64_t order.
int32_t u64cmp(const void* aa, const void* bb) {
  uint64_t va;
  uint64_t vb;
  memcpy(&va, aa, sizeof(uint64_t));
  memcpy(&vb, bb, sizeof(uint64_t));
  if (va > vb) {
    return 1;
  }
  return -(va < vb);
}
212 
213 #ifndef __cplusplus
// qsort() comparator: descending uint64_t order (C build only; the C++
// build uses a different mechanism).
int32_t u64cmp_decr(const void* aa, const void* bb) {
  uint64_t va;
  uint64_t vb;
  memcpy(&va, aa, sizeof(uint64_t));
  memcpy(&vb, bb, sizeof(uint64_t));
  if (va < vb) {
    return 1;
  }
  return -(va > vb);
}
222 #endif
223 
224 #ifdef __cplusplus
// Returns the median of unsorted_arr[0..len-1], partially reordering the
// array in the process; 0.0 for an empty array.
float DestructiveMedianF(uintptr_t len, float* unsorted_arr) {
  if (!len) {
    return 0.0;
  }
  const uintptr_t half_len = len / 2;
  float* upper_median_ptr = &(unsorted_arr[half_len]);
  std::nth_element(unsorted_arr, upper_median_ptr, &(unsorted_arr[len]));
  if (len % 2) {
    return *upper_median_ptr;
  }
  // Even length: average the upper median with the largest element of the
  // lower half, which nth_element left in the first half_len slots.
  return (*std::max_element(unsorted_arr, upper_median_ptr) + *upper_median_ptr) * 0.5f;
}
237 
// Returns the median of unsorted_arr[0..len-1], partially reordering the
// array in the process; 0.0 for an empty array.
double DestructiveMedianD(uintptr_t len, double* unsorted_arr) {
  if (!len) {
    return 0.0;
  }
  const uintptr_t half_len = len / 2;
  double* upper_median_ptr = &(unsorted_arr[half_len]);
  std::nth_element(unsorted_arr, upper_median_ptr, &(unsorted_arr[len]));
  if (len % 2) {
    return *upper_median_ptr;
  }
  // Even length: average the upper median with the largest element of the
  // lower half, which nth_element left in the first half_len slots.
  return (*std::max_element(unsorted_arr, upper_median_ptr) + *upper_median_ptr) * 0.5;
}
250 #else
251 // these will probably be used in __cplusplus case too
// Returns the median of sorted_arr[0..len-1] (ascending); 0.0 if empty.
float MedianF(const float* sorted_arr, uintptr_t len) {
  if (!len) {
    return 0.0;
  }
  const uintptr_t half_idx = len / 2;
  if (len % 2) {
    return sorted_arr[half_idx];
  }
  // even length: mean of the two middle elements
  return (sorted_arr[half_idx] + sorted_arr[half_idx - 1]) * 0.5f;
}
261 
// Returns the median of sorted_arr[0..len-1] (ascending); 0.0 if empty.
double MedianD(const double* sorted_arr, uintptr_t len) {
  if (!len) {
    return 0.0;
  }
  const uintptr_t half_idx = len / 2;
  if (len % 2) {
    return sorted_arr[half_idx];
  }
  // even length: mean of the two middle elements
  return (sorted_arr[half_idx] + sorted_arr[half_idx - 1]) * 0.5;
}
271 
DestructiveMedianF(uintptr_t len,float * unsorted_arr)272 float DestructiveMedianF(uintptr_t len, float* unsorted_arr) {
273   // no, I'm not gonna bother reimplementing introselect just for folks who
274   // insist on compiling this as pure C instead of C++
275   qsort(unsorted_arr, len, sizeof(float), float_cmp);
276   return MedianF(unsorted_arr, len);
277 }
278 
DestructiveMedianD(uintptr_t len,double * unsorted_arr)279 double DestructiveMedianD(uintptr_t len, double* unsorted_arr) {
280   qsort(unsorted_arr, len, sizeof(double), double_cmp);
281   return MedianD(unsorted_arr, len);
282 }
283 #endif
284 
285 
286 // Overread must be ok.
SortStrboxIndexed(uintptr_t str_ct,uintptr_t max_str_blen,uint32_t use_nsort,char * strbox,uint32_t * id_map)287 BoolErr SortStrboxIndexed(uintptr_t str_ct, uintptr_t max_str_blen, uint32_t use_nsort, char* strbox, uint32_t* id_map) {
288   if (str_ct < 2) {
289     return 0;
290   }
291   unsigned char* bigstack_mark = g_bigstack_base;
292   const uintptr_t wkspace_entry_blen = GetStrboxsortWentryBlen(max_str_blen);
293   unsigned char* sort_wkspace;
294   if (unlikely(bigstack_alloc_uc(str_ct * wkspace_entry_blen, &sort_wkspace))) {
295     return 1;
296   }
297   SortStrboxIndexed2(str_ct, max_str_blen, use_nsort, strbox, id_map, sort_wkspace);
298   BigstackReset(bigstack_mark);
299   return 0;
300 }
301 
302 /*
303 BoolErr StrptrArrIndexedSort(const char* const* unsorted_strptrs, uint32_t str_ct, uint32_t overread_ok, uint32_t use_nsort, uint32_t* id_map) {
304   if (str_ct < 2) {
305     if (str_ct) {
306       id_map[0] = 0;
307     }
308     return 0;
309   }
310   if (bigstack_left() < str_ct * sizeof(StrSortIndexedDeref)) {
311     return 1;
312   }
313   StrSortIndexedDeref* wkspace_alias = (StrSortIndexedDeref*)g_bigstack_base;
314   for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
315     wkspace_alias[str_idx].strptr = unsorted_strptrs[str_idx];
316     wkspace_alias[str_idx].orig_idx = str_idx;
317   }
318   StrptrArrSortMain(str_ct, overread_ok, use_nsort, wkspace_alias);
319   for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
320     id_map[str_idx] = wkspace_alias[str_idx].orig_idx;
321   }
322   BigstackReset(wkspace_alias);
323   return 0;
324 }
325 */
326 
327 
// Returns the number of entries of sorted_u32_arr (nondecreasing, nonempty,
// length < 2^31; useful for searching variant_bps[]) that are strictly
// smaller than needle.
// (strangely, this seems to be equal to or better than std::lower_bound with
// -O2 optimization, but can become much slower with -O3?)
uint32_t CountSortedSmallerU32(const uint32_t* sorted_u32_arr, uint32_t arr_length, uint32_t needle) {
  // Invariants: needle > sorted_u32_arr[lo - 1] when that entry exists (but
  // not necessarily sorted_u32_arr[lo]), and needle <= sorted_u32_arr[hi + 1]
  // when that entry exists.  hi is signed since it can reach -1, and lo is
  // signed in turn so the comparison below is safe.
  int32_t lo = 0;
  int32_t hi = arr_length - 1;
  while (lo < hi) {
    // both endpoints are nonnegative here, so unsigned midpoint math is exact
    const uint32_t lo_u = lo;
    const uint32_t hi_u = hi;
    const uint32_t mid = (lo_u + hi_u) / 2;
    if (needle > sorted_u32_arr[mid]) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  const uint32_t lo_final = lo;
  return lo_final + (needle > sorted_u32_arr[lo_final]);
}
352 
// Returns the index of the first element of sorted_u32_arr (ascending,
// indices [0, end_idx)) which is >= needle, or end_idx if there is none.
// Galloping/exponential probe followed by binary search; efficient when the
// answer lies near the start of the range.
uint32_t ExpsearchU32(const uint32_t* sorted_u32_arr, uint32_t end_idx, uint32_t needle) {
  uint32_t next_incr = 1;
  uint32_t start_idx = 0;
  uint32_t cur_idx = 0;
  // Probe at exponentially growing offsets to bracket the answer within
  // [start_idx, end_idx).
  while (cur_idx < end_idx) {
    if (sorted_u32_arr[cur_idx] >= needle) {
      end_idx = cur_idx;
      break;
    }
    start_idx = cur_idx + 1;
    cur_idx += next_incr;
    next_incr *= 2;
  }
  // Standard binary search within the bracketed range.
  while (start_idx < end_idx) {
    // Overflow-safe midpoint: the previous (start_idx + end_idx) / 2 form
    // wrapped around when the sum exceeded 2^32 - 1 (i.e. arrays longer
    // than 2^31).
    const uint32_t mid_idx = start_idx + (end_idx - start_idx) / 2;
    if (sorted_u32_arr[mid_idx] < needle) {
      start_idx = mid_idx + 1;
    } else {
      end_idx = mid_idx;
    }
  }
  return start_idx;
}
378 
// 64-bit analogue of CountSortedSmallerU32: number of entries of the
// nondecreasing, nonempty array that are strictly smaller than needle.
uintptr_t CountSortedSmallerU64(const uint64_t* sorted_u64_arr, uintptr_t arr_length, uint64_t needle) {
  intptr_t lo = 0;
  intptr_t hi = arr_length - 1;  // may go to -1; signed on purpose
  while (lo < hi) {
    // both endpoints nonnegative here, so unsigned midpoint math is exact
    const uintptr_t lo_u = lo;
    const uintptr_t hi_u = hi;
    const uintptr_t mid = (lo_u + hi_u) / 2;
    if (needle > sorted_u64_arr[mid]) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  const uintptr_t lo_final = lo;
  return lo_final + (needle > sorted_u64_arr[lo_final]);
}
392 
// Double analogue of CountSortedSmallerU64: number of entries of the
// nondecreasing, nonempty array that are strictly smaller than needle.
uintptr_t CountSortedSmallerD(const double* sorted_dbl_arr, uintptr_t arr_length, double needle) {
  intptr_t lo = 0;
  intptr_t hi = arr_length - 1;  // may go to -1; signed on purpose
  while (lo < hi) {
    // both endpoints nonnegative here, so unsigned midpoint math is exact
    const uintptr_t lo_u = lo;
    const uintptr_t hi_u = hi;
    const uintptr_t mid = (lo_u + hi_u) / 2;
    if (needle > sorted_dbl_arr[mid]) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  const uintptr_t lo_final = lo;
  return lo_final + (needle > sorted_dbl_arr[lo_final]);
}
406 
// Returns the number of entries of the nondecreasing, nonempty array that
// are less than or equal to needle.
uintptr_t CountSortedLeqU64(const uint64_t* sorted_u64_arr, uintptr_t arr_length, uint64_t needle) {
  intptr_t lo = 0;
  intptr_t hi = arr_length - 1;  // may go to -1; signed on purpose
  while (lo < hi) {
    // both endpoints nonnegative here, so unsigned midpoint math is exact
    const uintptr_t lo_u = lo;
    const uintptr_t hi_u = hi;
    const uintptr_t mid = (lo_u + hi_u) / 2;
    if (needle >= sorted_u64_arr[mid]) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  const uintptr_t lo_final = lo;
  return lo_final + (needle >= sorted_u64_arr[lo_final]);
}
420 
GetParamCt(const char * const * argvk,uint32_t argc,uint32_t flag_idx)421 uint32_t GetParamCt(const char* const* argvk, uint32_t argc, uint32_t flag_idx) {
422   // Counts the number of optional arguments given to the flag at position
423   // flag_idx, treating any nonnumeric argument beginning with "-" as optional.
424   ++flag_idx;
425   uint32_t cur_idx = flag_idx;
426   while ((cur_idx < argc) && (!IsCmdlineFlag(argvk[cur_idx]))) {
427     ++cur_idx;
428   }
429   return cur_idx - flag_idx;
430 }
431 
EnforceParamCtRange(const char * flag_name,uint32_t param_ct,uint32_t min_ct,uint32_t max_ct)432 BoolErr EnforceParamCtRange(const char* flag_name, uint32_t param_ct, uint32_t min_ct, uint32_t max_ct) {
433   if (unlikely(param_ct > max_ct)) {
434     if (max_ct > min_ct) {
435       snprintf(g_logbuf, kLogbufSize, "Error: %s accepts at most %u argument%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
436     } else {
437       snprintf(g_logbuf, kLogbufSize, "Error: %s only accepts %u argument%s.\n", flag_name, max_ct, (max_ct == 1)? "" : "s");
438     }
439     return 1;
440   }
441   if (likely(param_ct >= min_ct)) {
442     return 0;
443   }
444   if (min_ct == 1) {
445     snprintf(g_logbuf, kLogbufSize, "Error: Missing %s argument.\n", flag_name);
446   } else {
447     snprintf(g_logbuf, kLogbufSize, "Error: %s requires %s%u arguments.\n", flag_name, (min_ct < max_ct)? "at least " : "", min_ct);
448   }
449   return 1;
450 }
451 
SortCmdlineFlags(uint32_t max_flag_blen,uint32_t flag_ct,char * flag_buf,uint32_t * flag_map)452 PglErr SortCmdlineFlags(uint32_t max_flag_blen, uint32_t flag_ct, char* flag_buf, uint32_t* flag_map) {
453   // Assumes flag_ct is the number of flag (as opposed to value) arguments,
454   // flag_buf[] points to a rectangular char* array (width max_flag_blen) of
455   // flag names with leading dash(es) stripped, and flag_map[] maps flag_buf[]
456   // entries to argv[] entries.
457   // Lexicographically sorts flag_buf (updating flag_map in the process), and
458   // then checks for duplicates.
459   // Okay for flag_buf to contain entries with spaces (plink 1.9's alias
460   // resolution takes advantage of this).
461   assert(flag_ct);  // this must be skipped if there are no flags at all
462   if (unlikely(SortStrboxIndexedMalloc(flag_ct, max_flag_blen, flag_buf, flag_map))) {
463     return kPglRetNomem;
464   }
465   uint32_t prev_flag_len = strlen_se(flag_buf);
466   char* prev_flag_ptr = flag_buf;
467   for (uint32_t cur_flag_idx = 1; cur_flag_idx != flag_ct; ++cur_flag_idx) {
468     char* cur_flag_ptr = &(prev_flag_ptr[max_flag_blen]);
469     const uint32_t cur_flag_len = strlen_se(cur_flag_ptr);
470     if (unlikely((prev_flag_len == cur_flag_len) && memequal(prev_flag_ptr, cur_flag_ptr, cur_flag_len))) {
471       cur_flag_ptr[cur_flag_len] = '\0';  // just in case of aliases
472       fflush(stdout);
473       fprintf(stderr, "Error: Duplicate --%s flag.\n", cur_flag_ptr);
474       // g_stderr_written_to = 1;
475       return kPglRetInvalidCmdline;
476     }
477     prev_flag_ptr = cur_flag_ptr;
478     prev_flag_len = cur_flag_len;
479   }
480   return kPglRetSuccess;
481 }
482 
InitLogfile(uint32_t always_stderr,char * outname,char * outname_end)483 PglErr InitLogfile(uint32_t always_stderr, char* outname, char* outname_end) {
484   snprintf(outname_end, kMaxOutfnameExtBlen, ".log");
485   g_logfile = fopen(outname, "w");
486   if (unlikely(!g_logfile)) {
487     fflush(stdout);
488     fprintf(stderr, "Error: Failed to open %s for logging: %s.\n", outname, strerror(errno));
489     // g_stderr_written_to = 1;
490     return kPglRetOpenFail;
491   }
492   fprintf(always_stderr? stderr : stdout, "Logging to %s.\n", outname);
493   return kPglRetSuccess;
494 }
495 
CleanupLogfile(uint32_t print_end_time)496 BoolErr CleanupLogfile(uint32_t print_end_time) {
497   char* write_iter = strcpya_k(g_logbuf, "End time: ");
498   time_t rawtime;
499   time(&rawtime);
500   write_iter = Stpcpy(write_iter, ctime(&rawtime));  // has trailing \n
501   if (print_end_time) {
502     fputs(g_logbuf, stdout);
503   }
504   BoolErr ret_boolerr = 0;
505   if (g_logfile) {
506     if (!g_log_failed) {
507       logputs_silent("\n");
508       logputs_silent(g_logbuf);
509       if (unlikely(fclose(g_logfile))) {
510         fflush(stdout);
511         fprintf(stderr, "Error: Failed to finish writing to log: %s.\n", strerror(errno));
512         ret_boolerr = 1;
513       }
514     } else {
515       fclose(g_logfile);
516     }
517     g_logfile = nullptr;
518   }
519   return ret_boolerr;
520 }
521 
// manually managed, very large stack
// Arena bounds; both are set by InitBigstack() (initially null).
unsigned char* g_bigstack_base = nullptr;
unsigned char* g_bigstack_end = nullptr;
525 
// Returns total physical memory in MiB, or 0 if detection failed.
// see e.g. http://nadeausoftware.com/articles/2012/09/c_c_tip_how_get_physical_memory_size_system .
uintptr_t DetectMib() {
  int64_t total_mib;
#ifdef __APPLE__
  // query total memory via sysctl()
  int32_t mib[2];
  mib[0] = CTL_HW;
  mib[1] = HW_MEMSIZE;
  total_mib = 0;
  size_t sztmp = sizeof(int64_t);
  sysctl(&mib[0], 2, &total_mib, &sztmp, nullptr, 0);
  total_mib /= 1048576;
#else
#  ifdef _WIN32
  MEMORYSTATUSEX memstatus;
  memstatus.dwLength = sizeof(memstatus);
  GlobalMemoryStatusEx(&memstatus);
  total_mib = memstatus.ullTotalPhys / 1048576;
#  else
  // POSIX: page count * page size
  const uint64_t page_ct = sysconf(_SC_PHYS_PAGES);
  const uint64_t page_blen = sysconf(_SC_PAGESIZE);
  total_mib = (page_ct * page_blen) / 1048576;
#  endif
#endif
  return total_mib;
}
550 
GetDefaultAllocMib()551 uintptr_t GetDefaultAllocMib() {
552   const uintptr_t total_mib = DetectMib();
553   if (!total_mib) {
554     return kBigstackDefaultMib;
555   }
556   if (total_mib < (kBigstackMinMib * 2)) {
557     return kBigstackMinMib;
558   }
559   return (total_mib / 2);
560 }
561 
// Allocates the main workspace ("bigstack"): attempts malloc_size_mib MiB,
// retrying with 3/4 of the previous request on failure, down to
// kBigstackMinMib.  Initializes g_bigstack_base/g_bigstack_end (64-byte
// aligned), builds the g_one_char_strs table in the reserved tail, and
// returns the final size and raw allocation through the out-parameters.
// Returns kPglRetNomem if even the minimum allocation fails.
PglErr InitBigstack(uintptr_t malloc_size_mib, uintptr_t* malloc_mib_final_ptr, unsigned char** bigstack_ua_ptr) {
  // guarantee contiguous malloc space outside of main workspace
  unsigned char* bubble;

  // this is pointless with overcommit on, may want to conditionally compile
  // this out, and/or conditionally skip this
  if (unlikely(pgl_malloc(kNonBigstackMin, &bubble))) {
    return kPglRetNomem;
  }

  assert(malloc_size_mib >= kBigstackMinMib);
#ifndef __LP64__
  assert(malloc_size_mib <= 2047);
#endif
  // don't use pgl_malloc here since we don't automatically want to set
  // g_failed_alloc_attempt_size on failure
  unsigned char* bigstack_ua = S_CAST(unsigned char*, malloc(malloc_size_mib * 1048576 * sizeof(char)));
  // this is thwarted by overcommit, but still better than nothing...
  while (!bigstack_ua) {
    // shrink the request by 25% per retry, but never below the minimum
    malloc_size_mib = (malloc_size_mib * 3) / 4;
    if (malloc_size_mib < kBigstackMinMib) {
      malloc_size_mib = kBigstackMinMib;
    }
    bigstack_ua = S_CAST(unsigned char*, malloc(malloc_size_mib * 1048576 * sizeof(char)));
    if (unlikely((!bigstack_ua) && (malloc_size_mib == kBigstackMinMib))) {
      // switch to "goto cleanup" pattern if any more exit points are needed
      g_failed_alloc_attempt_size = kBigstackMinMib * 1048576;
      free(bubble);
      return kPglRetNomem;
    }
  }
  // force 64-byte align to make cache line sensitivity work
  unsigned char* bigstack_initial_base = R_CAST(unsigned char*, RoundUpPow2(R_CAST(uintptr_t, bigstack_ua), kCacheline));
  g_bigstack_base = bigstack_initial_base;
  // last 576 bytes now reserved for g_one_char_strs + overread buffer
  g_bigstack_end = &(bigstack_initial_base[RoundDownPow2(malloc_size_mib * 1048576 - 576 - S_CAST(uintptr_t, bigstack_initial_base - bigstack_ua), kCacheline)]);
  free(bubble);
  // Fill the reserved tail with every 1-byte string, 2 bytes apart:
  // g_one_char_strs[2*c] is then the null-terminated 1-character string for
  // byte value c.
  uintptr_t* one_char_iter = R_CAST(uintptr_t*, g_bigstack_end);
#ifdef __LP64__
  // assumes little-endian
  uintptr_t cur_word = 0x3000200010000LLU;
  for (uint32_t uii = 0; uii != 64; ++uii) {
    *one_char_iter++ = cur_word;
    cur_word += 0x4000400040004LLU;
  }
#else
  uintptr_t cur_word = 0x10000;
  for (uint32_t uii = 0; uii != 128; ++uii) {
    *one_char_iter++ = cur_word;
    cur_word += 0x20002;
  }
#endif
  g_one_char_strs = R_CAST(const char*, g_bigstack_end);

  // plink2 doesn't actually need these here, but short programs using
  // plink2_common benefit from this
  g_input_missing_geno_ptr = &(g_one_char_strs[96]);  // 96/2 == '0'
  g_output_missing_geno_ptr = &(g_one_char_strs[92]);  // 92/2 == '.'

  *malloc_mib_final_ptr = malloc_size_mib;
  *bigstack_ua_ptr = bigstack_ua;
  return kPglRetSuccess;
}
625 
626 
bigstack_calloc_uc(uintptr_t ct,unsigned char ** uc_arr_ptr)627 BoolErr bigstack_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
628   *uc_arr_ptr = S_CAST(unsigned char*, bigstack_alloc(ct));
629   if (unlikely(!(*uc_arr_ptr))) {
630     return 1;
631   }
632   memset(*uc_arr_ptr, 0, ct);
633   return 0;
634 }
635 
bigstack_calloc_d(uintptr_t ct,double ** d_arr_ptr)636 BoolErr bigstack_calloc_d(uintptr_t ct, double** d_arr_ptr) {
637   *d_arr_ptr = S_CAST(double*, bigstack_alloc(ct * sizeof(double)));
638   if (unlikely(!(*d_arr_ptr))) {
639     return 1;
640   }
641   ZeroDArr(ct, *d_arr_ptr);
642   return 0;
643 }
644 
bigstack_calloc_f(uintptr_t ct,float ** f_arr_ptr)645 BoolErr bigstack_calloc_f(uintptr_t ct, float** f_arr_ptr) {
646   *f_arr_ptr = S_CAST(float*, bigstack_alloc(ct * sizeof(float)));
647   if (unlikely(!(*f_arr_ptr))) {
648     return 1;
649   }
650   ZeroFArr(ct, *f_arr_ptr);
651   return 0;
652 }
653 
bigstack_calloc_u16(uintptr_t ct,uint16_t ** u16_arr_ptr)654 BoolErr bigstack_calloc_u16(uintptr_t ct, uint16_t** u16_arr_ptr) {
655   *u16_arr_ptr = S_CAST(uint16_t*, bigstack_alloc(ct * sizeof(int16_t)));
656   if (unlikely(!(*u16_arr_ptr))) {
657     return 1;
658   }
659   memset(*u16_arr_ptr, 0, ct * sizeof(int16_t));
660   return 0;
661 }
662 
bigstack_calloc_u32(uintptr_t ct,uint32_t ** u32_arr_ptr)663 BoolErr bigstack_calloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr) {
664   *u32_arr_ptr = S_CAST(uint32_t*, bigstack_alloc(ct * sizeof(int32_t)));
665   if (unlikely(!(*u32_arr_ptr))) {
666     return 1;
667   }
668   ZeroU32Arr(ct, *u32_arr_ptr);
669   return 0;
670 }
671 
bigstack_calloc_w(uintptr_t ct,uintptr_t ** w_arr_ptr)672 BoolErr bigstack_calloc_w(uintptr_t ct, uintptr_t** w_arr_ptr) {
673   *w_arr_ptr = S_CAST(uintptr_t*, bigstack_alloc(ct * sizeof(intptr_t)));
674   if (unlikely(!(*w_arr_ptr))) {
675     return 1;
676   }
677   ZeroWArr(ct, *w_arr_ptr);
678   return 0;
679 }
680 
bigstack_calloc_u64(uintptr_t ct,uint64_t ** u64_arr_ptr)681 BoolErr bigstack_calloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr) {
682   *u64_arr_ptr = S_CAST(uint64_t*, bigstack_alloc(ct * sizeof(int64_t)));
683   if (unlikely(!(*u64_arr_ptr))) {
684     return 1;
685   }
686   ZeroU64Arr(ct, *u64_arr_ptr);
687   return 0;
688 }
689 
bigstack_calloc_cp(uintptr_t ct,char *** cp_arr_ptr)690 BoolErr bigstack_calloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
691   *cp_arr_ptr = S_CAST(char**, bigstack_alloc(ct * sizeof(intptr_t)));
692   if (unlikely(!(*cp_arr_ptr))) {
693     return 1;
694   }
695   ZeroPtrArr(ct, *cp_arr_ptr);
696   return 0;
697 }
698 
bigstack_calloc_kcp(uintptr_t ct,const char *** kcp_arr_ptr)699 BoolErr bigstack_calloc_kcp(uintptr_t ct, const char*** kcp_arr_ptr) {
700   *kcp_arr_ptr = S_CAST(const char**, bigstack_alloc(ct * sizeof(intptr_t)));
701   if (unlikely(!(*kcp_arr_ptr))) {
702     return 1;
703   }
704   ZeroPtrArr(ct, *kcp_arr_ptr);
705   return 0;
706 }
707 
bigstack_end_calloc_uc(uintptr_t ct,unsigned char ** uc_arr_ptr)708 BoolErr bigstack_end_calloc_uc(uintptr_t ct, unsigned char** uc_arr_ptr) {
709   *uc_arr_ptr = S_CAST(unsigned char*, bigstack_end_alloc(ct));
710   if (unlikely(!(*uc_arr_ptr))) {
711     return 1;
712   }
713   memset(*uc_arr_ptr, 0, ct);
714   return 0;
715 }
716 
bigstack_end_calloc_d(uintptr_t ct,double ** d_arr_ptr)717 BoolErr bigstack_end_calloc_d(uintptr_t ct, double** d_arr_ptr) {
718   *d_arr_ptr = S_CAST(double*, bigstack_end_alloc(ct * sizeof(double)));
719   if (unlikely(!(*d_arr_ptr))) {
720     return 1;
721   }
722   ZeroDArr(ct, *d_arr_ptr);
723   return 0;
724 }
725 
bigstack_end_calloc_f(uintptr_t ct,float ** f_arr_ptr)726 BoolErr bigstack_end_calloc_f(uintptr_t ct, float** f_arr_ptr) {
727   *f_arr_ptr = S_CAST(float*, bigstack_end_alloc(ct * sizeof(float)));
728   if (unlikely(!(*f_arr_ptr))) {
729     return 1;
730   }
731   ZeroFArr(ct, *f_arr_ptr);
732   return 0;
733 }
734 
bigstack_end_calloc_u32(uintptr_t ct,uint32_t ** u32_arr_ptr)735 BoolErr bigstack_end_calloc_u32(uintptr_t ct, uint32_t** u32_arr_ptr) {
736   *u32_arr_ptr = S_CAST(uint32_t*, bigstack_end_alloc(ct * sizeof(int32_t)));
737   if (unlikely(!(*u32_arr_ptr))) {
738     return 1;
739   }
740   ZeroU32Arr(ct, *u32_arr_ptr);
741   return 0;
742 }
743 
bigstack_end_calloc_w(uintptr_t ct,uintptr_t ** w_arr_ptr)744 BoolErr bigstack_end_calloc_w(uintptr_t ct, uintptr_t** w_arr_ptr) {
745   *w_arr_ptr = S_CAST(uintptr_t*, bigstack_end_alloc(ct * sizeof(intptr_t)));
746   if (unlikely(!(*w_arr_ptr))) {
747     return 1;
748   }
749   ZeroWArr(ct, *w_arr_ptr);
750   return 0;
751 }
752 
bigstack_end_calloc_u64(uintptr_t ct,uint64_t ** u64_arr_ptr)753 BoolErr bigstack_end_calloc_u64(uintptr_t ct, uint64_t** u64_arr_ptr) {
754   *u64_arr_ptr = S_CAST(uint64_t*, bigstack_end_alloc(ct * sizeof(int64_t)));
755   if (unlikely(!(*u64_arr_ptr))) {
756     return 1;
757   }
758   ZeroU64Arr(ct, *u64_arr_ptr);
759   return 0;
760 }
761 
bigstack_end_calloc_cp(uintptr_t ct,char *** cp_arr_ptr)762 BoolErr bigstack_end_calloc_cp(uintptr_t ct, char*** cp_arr_ptr) {
763   *cp_arr_ptr = S_CAST(char**, bigstack_end_alloc(ct * sizeof(intptr_t)));
764   if (unlikely(!(*cp_arr_ptr))) {
765     return 1;
766   }
767   ZeroPtrArr(ct, *cp_arr_ptr);
768   return 0;
769 }
770 
771 
PushLlStr(const char * str,LlStr ** ll_stack_ptr)772 BoolErr PushLlStr(const char* str, LlStr** ll_stack_ptr) {
773   uintptr_t blen = strlen(str) + 1;
774   LlStr* new_llstr;
775   if (unlikely(pgl_malloc(sizeof(LlStr) + blen, &new_llstr))) {
776     return 1;
777   }
778   new_llstr->next = *ll_stack_ptr;
779   memcpy(new_llstr->str, str, blen);
780   *ll_stack_ptr = new_llstr;
781   return 0;
782 }
783 
784 /*
785 BoolErr push_llstr_counted(const char* str, uint32_t slen, LlStr** ll_stack_ptr) {
786   LlStr* new_llstr;
787   if (pgl_malloc(sizeof(LlStr) + slen + 1, &new_llstr)) {
788     return 1;
789   }
790   new_llstr->next = *ll_stack_ptr;
791   memcpy(new_llstr->str, str, slen);
792   new_llstr->str[slen] = '\0';
793   *ll_stack_ptr = new_llstr;
794   return 0;
795 }
796 */
797 
798 
CountAndMeasureMultistrReverseAlloc(const char * multistr,uintptr_t max_str_ct,uint32_t * str_ct_ptr,uintptr_t * max_blen_ptr,const char *** strptr_arrp)799 BoolErr CountAndMeasureMultistrReverseAlloc(const char* multistr, uintptr_t max_str_ct, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr, const char*** strptr_arrp) {
800   // assumes multistr is nonempty
801   assert(multistr[0]);
802   uintptr_t ct = 0;
803   uintptr_t max_blen = *max_blen_ptr;
804   const char** strptr_arr_iter = *strptr_arrp;
805   do {
806     if (unlikely(++ct > max_str_ct)) {
807       return 1;
808     }
809     const uintptr_t blen = strlen(multistr) + 1;
810     if (blen > max_blen) {
811       max_blen = blen;
812     }
813     *(--strptr_arr_iter) = multistr;
814     multistr = &(multistr[blen]);
815   } while (*multistr);
816   *str_ct_ptr = ct;
817   *max_blen_ptr = max_blen;
818   *strptr_arrp = strptr_arr_iter;
819   return 0;
820 }
821 
// Converts a multistr into a sorted, deduplicated strbox allocated from the
// arena delimited by [*arena_bottom_ptr, arena_top).  A temporary pointer
// array grows downward from arena_top while the strbox is written upward
// from *arena_bottom_ptr; *arena_bottom_ptr is advanced past the strbox on
// success.  Outputs: *sorted_strbox_ptr (start of the strbox), *str_ct_ptr
// (post-dedup string count), *max_blen_ptr (stride of the strbox).
// Returns 1 if the arena is too small.
BoolErr MultistrToStrboxDedupArenaAlloc(unsigned char* arena_top, const char* multistr, unsigned char** arena_bottom_ptr, char** sorted_strbox_ptr, uint32_t* str_ct_ptr, uintptr_t* max_blen_ptr) {
  const char** strptr_arr = R_CAST(const char**, arena_top);
  uintptr_t max_str_blen = 0;
  uint32_t str_ct;
  // pointer array may occupy at most the free span between bottom and top
  if (unlikely(CountAndMeasureMultistrReverseAlloc(multistr, (arena_top - (*arena_bottom_ptr)) / sizeof(intptr_t), &str_ct, &max_str_blen, &strptr_arr))) {
    return 1;
  }
  const uintptr_t strbox_byte_ct = RoundUpPow2(str_ct * max_str_blen, kCacheline);
  // ensure the strbox fits below the (downward-grown) pointer array
  if (unlikely((R_CAST(uintptr_t, strptr_arr) - R_CAST(uintptr_t, *arena_bottom_ptr)) < strbox_byte_ct)) {
    return 1;
  }
  // multistr can currently come from command line parser, which does not
  // guarantee safe overread
  StrptrArrSort(str_ct, strptr_arr);
  *sorted_strbox_ptr = R_CAST(char*, *arena_bottom_ptr);
  str_ct = CopyAndDedupSortedStrptrsToStrbox(strptr_arr, str_ct, max_str_blen, *sorted_strbox_ptr);
  *arena_bottom_ptr += RoundUpPow2(str_ct * max_str_blen, kCacheline);
  *str_ct_ptr = str_ct;
  *max_blen_ptr = max_str_blen;
  return 0;
}
843 
844 
void DivisionMagicNums(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp) {
  // Enables fast integer division by a constant not known until runtime.  See
  // http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html .
  // Assumes divisor is not zero, of course.
  // May want to populate a struct instead.
  assert(divisor);
  if (!(divisor & (divisor - 1))) {
    // power of 2: no multiply needed, just a shift.
    *multp = 1;
    *pre_shiftp = 0;
    *post_shiftp = ctzu32(divisor);
    *incrp = 0;
    return;
  }
  // Long division of 2^31 by divisor; the loop below extends the quotient by
  // one bit per iteration (i.e. tracks 2^(31+exponent+1) / divisor).
  uint32_t quotient = 0x80000000U / divisor;
  uint32_t remainder = 0x80000000U - (quotient * divisor);
  const uint32_t ceil_log_2_d = 1 + bsru32(divisor);
  // "Round-down" magic candidate (needs increment 1 at runtime); recorded the
  // first time its error bound is satisfied.
  uint32_t down_multiplier = 0;
  uint32_t down_exponent = 0;
  uint32_t has_magic_down = 0;
  for (uint32_t exponent = 0; ; ++exponent) {
    // Double quotient and remainder, keeping remainder in [0, divisor).
    if (remainder >= divisor - remainder) {
      quotient = quotient * 2 + 1;
      remainder = remainder * 2 - divisor;
    } else {
      quotient = quotient * 2;
      remainder = remainder * 2;
    }
    if ((exponent >= ceil_log_2_d) || (divisor - remainder) <= (1U << exponent)) {
      if (exponent < ceil_log_2_d) {
        // Round-up magic found: multiplier quotient+1, no increment, no
        // pre-shift.
        *multp = quotient + 1;
        *pre_shiftp = 0;
        *post_shiftp = 32 + exponent;
        *incrp = 0;
        return;
      }
      // No sufficiently small round-up magic exists; fall back below.
      break;
    }
    if ((!has_magic_down) && (remainder <= (1U << exponent))) {
      has_magic_down = 1;
      down_multiplier = quotient;
      down_exponent = exponent;
    }
  }
  if (divisor & 1) {
    // Odd divisor: use the round-down multiplier with increment 1.
    *multp = down_multiplier;
    *pre_shiftp = 0;
    *post_shiftp = 32 + down_exponent;
    *incrp = 1;
    return;
  }
  // Even divisor: factor the powers of 2 into a pre-shift, then recurse to
  // get magic numbers for the remaining odd factor (recursion depth is 1).
  *pre_shiftp = ctzu32(divisor);
  uint32_t dummy;
  DivisionMagicNums(divisor >> (*pre_shiftp), multp, &dummy, post_shiftp, incrp);
}
900 
901 
FillBitsNz(uintptr_t start_idx,uintptr_t end_idx,uintptr_t * bitarr)902 void FillBitsNz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr) {
903   assert(end_idx > start_idx);
904   uintptr_t maj_start = start_idx / kBitsPerWord;
905   uintptr_t maj_end = end_idx / kBitsPerWord;
906   uintptr_t minor;
907   if (maj_start == maj_end) {
908     bitarr[maj_start] |= (k1LU << (end_idx % kBitsPerWord)) - (k1LU << (start_idx % kBitsPerWord));
909   } else {
910     bitarr[maj_start] |= ~((k1LU << (start_idx % kBitsPerWord)) - k1LU);
911     SetAllWArr(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
912     minor = end_idx % kBitsPerWord;
913     if (minor) {
914       bitarr[maj_end] |= (k1LU << minor) - k1LU;
915     }
916   }
917 }
918 
ClearBitsNz(uintptr_t start_idx,uintptr_t end_idx,uintptr_t * bitarr)919 void ClearBitsNz(uintptr_t start_idx, uintptr_t end_idx, uintptr_t* bitarr) {
920   assert(end_idx > start_idx);
921   uintptr_t maj_start = start_idx / kBitsPerWord;
922   uintptr_t maj_end = end_idx / kBitsPerWord;
923   uintptr_t minor;
924   if (maj_start == maj_end) {
925     bitarr[maj_start] &= ~((k1LU << (end_idx % kBitsPerWord)) - (k1LU << (start_idx % kBitsPerWord)));
926   } else {
927     bitarr[maj_start] = bzhi(bitarr[maj_start], start_idx % kBitsPerWord);
928     ZeroWArr(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
929     minor = end_idx % kBitsPerWord;
930     if (minor) {
931       bitarr[maj_end] &= ~((k1LU << minor) - k1LU);
932     }
933   }
934 }
935 
936 // floor permitted to be -1, though not smaller than that.
int32_t FindLast1BitBeforeBounded(const uintptr_t* bitarr, uint32_t loc, int32_t floor) {
  // Returns the position of the last set bit strictly before loc, or floor if
  // no set bit exists at a position greater than floor.
  const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]);
  uint32_t remainder = loc % kBitsPerWord;
  uintptr_t ulii;
  if (remainder) {
    // First check the bits below loc within loc's own word.
    ulii = bzhi(*bitarr_ptr, remainder);
    if (ulii) {
      const uint32_t set_bit_loc = loc - remainder + bsrw(ulii);
      // Clamp: a hit at or below floor is reported as floor.
      return MAXV(S_CAST(int32_t, set_bit_loc), floor);
    }
  }
  // Scan whole words downward.  The guard below ensures we never read a word
  // below the one containing (floor + 1); bits at or below floor need not be
  // distinguished from "not found".
  const uintptr_t* bitarr_last = &(bitarr[S_CAST(uint32_t, floor + 1) / kBitsPerWord]);
  do {
    if (bitarr_ptr <= bitarr_last) {
      return floor;
    }
    ulii = *(--bitarr_ptr);
  } while (!ulii);
  // bsrw() gives the highest set bit in the first nonzero word found.
  const uint32_t set_bit_loc = S_CAST(uintptr_t, bitarr_ptr - bitarr) * kBitsPerWord + bsrw(ulii);
  return MAXV(S_CAST(int32_t, set_bit_loc), floor);
}
958 
void BitvecAndCopy(const uintptr_t* __restrict source1_bitvec, const uintptr_t* __restrict source2_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec) {
  // target_bitvec := source1_bitvec AND source2_bitvec, over word_ct words.
#ifdef __LP64__
  // Vectorized main loop.  The buffers are reinterpreted as VecW, so they are
  // assumed to satisfy the usual bitvec alignment convention -- TODO confirm
  // for any new caller.
  VecW* target_bitvvec = R_CAST(VecW*, target_bitvec);
  const VecW* source1_bitvvec = R_CAST(const VecW*, source1_bitvec);
  const VecW* source2_bitvvec = R_CAST(const VecW*, source2_bitvec);
  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
  for (uintptr_t ulii = 0; ulii != full_vec_ct; ++ulii) {
    target_bitvvec[ulii] = source1_bitvvec[ulii] & source2_bitvvec[ulii];
  }
#  ifdef USE_AVX2
  // Wider AVX2 vectors can leave a 2-word remainder; handle it here.
  if (word_ct & 2) {
    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
    target_bitvec[base_idx] = source1_bitvec[base_idx] & source2_bitvec[base_idx];
    target_bitvec[base_idx + 1] = source1_bitvec[base_idx + 1] & source2_bitvec[base_idx + 1];
  }
#  endif
  // Final odd word, if any.
  if (word_ct & 1) {
    target_bitvec[word_ct - 1] = source1_bitvec[word_ct - 1] & source2_bitvec[word_ct - 1];
  }
#else
  // 32-bit build: plain word-at-a-time loop.
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    target_bitvec[widx] = source1_bitvec[widx] & source2_bitvec[widx];
  }
#endif
}
984 
void BitvecInvmaskCopy(const uintptr_t* __restrict source_bitvec, const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* target_bitvec) {
  // target_bitvec := source_bitvec AND (~exclude_bitvec)
#ifdef __LP64__
  // Vectorized main loop over full vectors.
  VecW* target_bitvvec = R_CAST(VecW*, target_bitvec);
  const VecW* source_bitvvec = R_CAST(const VecW*, source_bitvec);
  const VecW* exclude_bitvvec = R_CAST(const VecW*, exclude_bitvec);
  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
  for (uintptr_t ulii = 0; ulii != full_vec_ct; ++ulii) {
    // vecw_and_notfirst(a, b) == (~a) & b; matches the scalar tails below.
    target_bitvvec[ulii] = vecw_and_notfirst(exclude_bitvvec[ulii], source_bitvvec[ulii]);
  }
#  ifdef USE_AVX2
  // Wider AVX2 vectors can leave a 2-word remainder; handle it here.
  if (word_ct & 2) {
    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
    target_bitvec[base_idx] = (~exclude_bitvec[base_idx]) & source_bitvec[base_idx];
    target_bitvec[base_idx + 1] = (~exclude_bitvec[base_idx + 1]) & source_bitvec[base_idx + 1];
  }
#  endif
  // Final odd word, if any.
  if (word_ct & 1) {
    target_bitvec[word_ct - 1] = source_bitvec[word_ct - 1] & (~exclude_bitvec[word_ct - 1]);
  }
#else
  // 32-bit build: plain word-at-a-time loop.
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    target_bitvec[widx] = source_bitvec[widx] & (~exclude_bitvec[widx]);
  }
#endif
}
1011 
void BitvecInvertCopy(const uintptr_t* __restrict source_bitvec, uintptr_t word_ct, uintptr_t* __restrict target_bitvec) {
  // target_bitvec := ~source_bitvec, over word_ct words.
#ifdef __LP64__
  const VecW* source_bitvvec_iter = R_CAST(const VecW*, source_bitvec);
  VecW* target_bitvvec_iter = R_CAST(VecW*, target_bitvec);
  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
  // XOR against all-ones implements bitwise NOT.
  const VecW all1 = VCONST_W(~k0LU);
  // As of Apple clang 11, this manual unroll is no longer relevant.  todo:
  // check Linux performance, and remove all of these unrolls if perf is good
  // enough without them.
  // Peel full_vec_ct % 4 vectors, then process 4 vectors per pass.
  if (full_vec_ct & 1) {
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
  }
  if (full_vec_ct & 2) {
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
  }
  for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
    *target_bitvvec_iter++ = (*source_bitvvec_iter++) ^ all1;
  }
#  ifdef USE_AVX2
  // Wider AVX2 vectors can leave a 2-word remainder; handle it here.
  if (word_ct & 2) {
    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
    target_bitvec[base_idx] = ~source_bitvec[base_idx];
    target_bitvec[base_idx + 1] = ~source_bitvec[base_idx + 1];
  }
#  endif
  // Final odd word, if any.
  if (word_ct & 1) {
    target_bitvec[word_ct - 1] = ~source_bitvec[word_ct - 1];
  }
#else
  // 32-bit build: plain word-at-a-time loop.
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    target_bitvec[widx] = ~source_bitvec[widx];
  }
#endif
}
1050 
1051 /*
1052 void BitvecXor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec) {
1053   // main_bitvec := main_bitvec XOR arg_bitvec
1054 #ifdef __LP64__
1055   VecW* main_bitvvec_iter = R_CAST(VecW*, main_bitvec);
1056   const VecW* arg_bitvvec_iter = R_CAST(const VecW*, arg_bitvec);
1057   const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
1058   if (full_vec_ct & 1) {
1059     *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1060   }
1061   if (full_vec_ct & 2) {
1062     *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1063     *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1064   }
1065   for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
1066     *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1067     *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1068     *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1069     *main_bitvvec_iter++ ^= (*arg_bitvvec_iter++);
1070   }
1071 #  ifdef USE_AVX2
1072   if (word_ct & 2) {
1073     const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
1074     main_bitvec[base_idx] ^= arg_bitvec[base_idx];
1075     main_bitvec[base_idx + 1] ^= arg_bitvec[base_idx + 1];
1076   }
1077 #  endif
1078   if (word_ct & 1) {
1079     main_bitvec[word_ct - 1] ^= arg_bitvec[word_ct - 1];
1080   }
1081 #else
1082   for (uintptr_t widx = 0; widx != word_ct; ++widx) {
1083     main_bitvec[widx] ^= arg_bitvec[widx];
1084   }
1085 #endif
1086 }
1087 */
1088 
void BitvecInvertAndMask(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := (~main_bitvec) AND include_bitvec
  // this corresponds _mm_andnot() operand order
#ifdef __LP64__
  VecW* main_bitvvec_iter = R_CAST(VecW*, main_bitvec);
  const VecW* include_bitvvec_iter = R_CAST(const VecW*, include_bitvec);
  const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
  // Manual unroll: peel full_vec_ct % 4 vectors, then 4 vectors per pass.
  // vecw_and_notfirst(a, b) == (~a) & b, matching the scalar tails below.
  if (full_vec_ct & 1) {
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
  }
  if (full_vec_ct & 2) {
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
  }
  for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
    *main_bitvvec_iter = vecw_and_notfirst(*main_bitvvec_iter, *include_bitvvec_iter++);
    ++main_bitvvec_iter;
  }
#  ifdef USE_AVX2
  // Wider AVX2 vectors can leave a 2-word remainder; handle it here.
  if (word_ct & 2) {
    const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
    main_bitvec[base_idx] = (~main_bitvec[base_idx]) & include_bitvec[base_idx];
    main_bitvec[base_idx + 1] = (~main_bitvec[base_idx + 1]) & include_bitvec[base_idx + 1];
  }
#  endif
  // Final odd word, if any.
  if (word_ct & 1) {
    main_bitvec[word_ct - 1] = (~main_bitvec[word_ct - 1]) & include_bitvec[word_ct - 1];
  }
#else
  // 32-bit build: plain word-at-a-time loop.
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    main_bitvec[widx] = (~main_bitvec[widx]) & include_bitvec[widx];
  }
#endif
}
1132 
1133 /*
1134 void BitvecOrNot(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec) {
1135   // main_bitvec := main_bitvec OR (~arg_bitvec)
1136 #ifdef __LP64__
1137   VecW* main_bitvvec_iter = (VecW*)main_bitvec;
1138   const VecW* arg_bitvvec_iter = (const VecW*)arg_bitvec;
1139   const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
1140   if (full_vec_ct & 1) {
1141     // todo: verify the compiler isn't dumb here
1142     *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1143   }
1144   if (full_vec_ct & 2) {
1145     *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1146     *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1147   }
1148   for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
1149     *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1150     *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1151     *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1152     *main_bitvvec_iter++ |= ~(*arg_bitvvec_iter++);
1153   }
1154 #  ifdef USE_AVX2
1155   if (word_ct & 2) {
1156     const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
1157     main_bitvec[base_idx] |= ~arg_bitvec[base_idx];
    main_bitvec[base_idx + 1] |= ~arg_bitvec[base_idx + 1];
1159   }
1160 #  endif
1161   if (word_ct & 1) {
1162     main_bitvec[word_ct - 1] |= ~arg_bitvec[word_ct - 1];
1163   }
1164 #else
1165   for (uintptr_t widx = 0; widx != word_ct; ++widx) {
1166     main_bitvec[widx] |= ~arg_bitvec[widx];
1167   }
1168 #endif
1169 }
1170 */
1171 
1172 
int32_t GetVariantUidxWithoutHtable(const char* idstr, const char* const* variant_ids, const uintptr_t* variant_include, uint32_t variant_ct) {
  // Brute-force ID lookup over the variant_ct set bits of variant_include.
  // Returns the matching variant_uidx, -1 if there is no match, or -2 if
  // idstr matches more than one included variant.
  const uint32_t id_blen = strlen(idstr) + 1;
  // Start at "word -1" so the pre-increment below begins the scan at word 0.
  uintptr_t widx = ~k0LU;
  int32_t retval = -1;
  for (uint32_t variant_idx = 0; variant_idx != variant_ct; ) {
    // Advance to the next word with at least one set bit.
    uintptr_t variant_include_bits;
    do {
      variant_include_bits = variant_include[++widx];
    } while (!variant_include_bits);
    const uintptr_t variant_uidx_base = widx * kBitsPerWord;
    const char* const* cur_variant_ids = &(variant_ids[variant_uidx_base]);
    variant_idx += PopcountWord(variant_include_bits);
    do {
      const uint32_t uidx_lowbits = ctzw(variant_include_bits);
      // Comparison covers the null terminator, so only exact matches pass.
      // NOTE(review): this reads id_blen bytes of each candidate, which can
      // overread a shorter candidate's terminator; presumably relies on the
      // same safe-overread storage property documented for
      // VariantIdDupflagHtableFind -- confirm.
      if (memequal(idstr, cur_variant_ids[uidx_lowbits], id_blen)) {
        if (retval != -1) {
          // duplicate
          return -2;
        }
        retval = S_CAST(int32_t, uidx_lowbits + variant_uidx_base);
      }
      // Clear the lowest set bit.
      variant_include_bits &= variant_include_bits - 1;
    } while (variant_include_bits);
  }
  return retval;
}
1199 
1200 
1201 // MurmurHash3, from
1202 // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
rotl32(uint32_t x,int8_t r)1203 CSINLINE uint32_t rotl32(uint32_t x, int8_t r) {
1204   return (x << r) | (x >> (32 - r));
1205 }
1206 
// Reads the i-th 32-bit block relative to p; i may be negative (the hash
// body walks its blocks via negative offsets from one-past-the-end).
static inline uint32_t getblock32(const uint32_t* p, int i) {
  const uint32_t* block_ptr = p + i;
  return *block_ptr;
}
1210 
1211 //-----------------------------------------------------------------------------
1212 // Finalization mix - force all bits of a hash block to avalanche
1213 
fmix32(uint32_t h)1214 CSINLINE2 uint32_t fmix32(uint32_t h) {
1215   h ^= h >> 16;
1216   h *= 0x85ebca6b;
1217   h ^= h >> 13;
1218   h *= 0xc2b2ae35;
1219   h ^= h >> 16;
1220 
1221   return h;
1222 }
1223 
uint32_t Hash32(const void* key, uint32_t len) {
  // MurmurHash3_x86_32 with the seed fixed at zero.
  const uint8_t* data = S_CAST(const uint8_t*, key);
  const int32_t nblocks = len / 4;

  uint32_t h1 = 0;
  // uint32_t h1 = seed;

  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  //----------
  // body

  // Points one past the last full 4-byte block; the loop below indexes it
  // with negative i, walking the blocks front to back.
  // NOTE(review): this performs 32-bit loads at arbitrary byte offsets, so it
  // appears to assume the target tolerates unaligned loads -- confirm before
  // porting to a strict-alignment platform.
  const uint32_t* blocks = R_CAST(const uint32_t*, data + nblocks*4);

  int32_t i;
  uint32_t k1;
  for(i = -nblocks; i; i++) {
      k1 = getblock32(blocks,i);

      k1 *= c1;
      k1 = rotl32(k1,15);
      k1 *= c2;

      h1 ^= k1;
      h1 = rotl32(h1,13);
      h1 = h1*5+0xe6546b64;
  }

  //----------
  // tail

  // Fold in the trailing (len % 4) bytes.
  const uint8_t* tail = R_CAST(const uint8_t*, data + nblocks*4);

  k1 = 0;

  switch(len & 3) {
    case 3:
      k1 ^= tail[2] << 16;
      // fall through
    case 2:
      k1 ^= tail[1] << 8;
      // fall through
    case 1:
      k1 ^= tail[0];
      k1 *= c1;
      k1 = rotl32(k1,15);
      k1 *= c2;
      h1 ^= k1;
  }

  //----------
  // finalization

  // Mix in the length, then force full avalanche.
  h1 ^= len;

  return fmix32(h1);
}
1282 
1283 
1284 /*
1285 uint32_t is_composite6(uintptr_t num) {
1286   // assumes num is congruent to 1 or 5 mod 6.
1287   // can speed this up by ~50% by hardcoding avoidance of multiples of 5/7,
1288   // but this isn't currently a bottleneck so I'll keep this simple
1289   assert((num % 6 == 1) || (num % 6 == 5));
1290   uintptr_t divisor = 5;
1291   while (divisor * divisor <= num) {
1292     if (!(num % divisor)) {
1293       return 1;
1294     }
1295     divisor += 2;
1296     if (!(num % divisor)) {
1297       return 1;
1298     }
1299     divisor += 4;
1300   }
1301   return 0;
1302 }
1303 
1304 uintptr_t geqprime(uintptr_t floor) {
1305   // assumes floor is odd and greater than 1.  Returns 5 if floor = 3,
1306   // otherwise returns the first prime >= floor.
1307   assert((floor % 2) && (floor > 1));
1308   uintptr_t ulii = floor % 3;
1309   if (!ulii) {
1310     floor += 2;
1311   } else if (ulii == 1) {
1312     goto geqprime_1mod6;
1313   }
1314   while (is_composite6(floor)) {
1315     floor += 2;
1316   geqprime_1mod6:
1317     if (!is_composite6(floor)) {
1318       return floor;
1319     }
1320     floor += 4;
1321   }
1322   return floor;
1323 }
1324 
1325 uintptr_t leqprime(uintptr_t ceil) {
1326   // assumes ceil is odd and greater than 4.  Returns the first prime <= ceil.
1327   assert((ceil % 2) && (ceil > 4));
1328   uintptr_t ulii = ceil % 3;
1329   if (!ulii) {
1330     ceil -= 2;
1331   } else if (ulii == 2) {
1332     goto leqprime_5mod6;
1333   }
1334   while (is_composite6(ceil)) {
1335     ceil -= 2;
1336   leqprime_5mod6:
1337     if (!is_composite6(ceil)) {
1338       return ceil;
1339     }
1340     ceil -= 4;
1341   }
1342   return ceil;
1343 }
1344 */
1345 
BoolErr HtableGoodSizeAlloc(uint32_t item_ct, uintptr_t bytes_avail, uint32_t** htable_ptr, uint32_t* htable_size_ptr) {
  // Allocates (via bigstack_alloc_raw_rd) a uint32 hash table sized for
  // item_ct items, writing the table to *htable_ptr and its entry count to
  // *htable_size_ptr.  Returns 1, without allocating, when the largest table
  // fitting in bytes_avail would still hold fewer than 2 * item_ct entries.
  // NOTE(review): assumes bytes_avail does not overstate the space actually
  // available to bigstack_alloc_raw_rd -- caller's responsibility.
  // Round down to a cacheline multiple, matching allocation granularity.
  bytes_avail &= (~(kCacheline - k1LU));
  uint32_t htable_size = GetHtableFastSize(item_ct);
  if (htable_size > bytes_avail / sizeof(int32_t)) {
    if (unlikely(!bytes_avail)) {
      return 1;
    }
    // Fall back to the largest table that fits.
    htable_size = bytes_avail / sizeof(int32_t);
    // htable_size = leqprime((bytes_avail / sizeof(int32_t)) - 1);
    if (unlikely(htable_size < item_ct * 2)) {
      // Load factor would exceed 50%; refuse.
      return 1;
    }
  }
  *htable_ptr = S_CAST(uint32_t*, bigstack_alloc_raw_rd(htable_size * sizeof(int32_t)));
  *htable_size_ptr = htable_size;
  return 0;
}
1363 
uint32_t PopulateStrboxHtable(const char* strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
  // Inserts the str_ct strings of a fixed-stride strbox into str_htable
  // (linear probing).  Returns 0 on success; if a duplicate string is
  // encountered, insertion stops and the index of the *second* occurrence is
  // returned (always nonzero, since index 0 is inserted first).
  // may want subset_mask parameter later
  // UINT32_MAX marks an empty slot.
  SetAllU32Arr(str_htable_size, str_htable);
  const char* strbox_iter = strbox;
  for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
    const uint32_t slen = strlen(strbox_iter);
    // previously used quadratic probing, but turns out that that isn't
    // meaningfully better than linear probing.
    for (uint32_t hashval = Hashceil(strbox_iter, slen, str_htable_size); ; ) {
      const uint32_t cur_htable_entry = str_htable[hashval];
      if (cur_htable_entry == UINT32_MAX) {
        str_htable[hashval] = str_idx;
        break;
      }
      // slen + 1 covers the null terminator, so this is an exact-match test.
      if (memequal(strbox_iter, &(strbox[cur_htable_entry * max_str_blen]), slen + 1)) {
        // guaranteed to be positive
        return str_idx;
      }
      if (++hashval == str_htable_size) {
        hashval = 0;
      }
    }
    strbox_iter = &(strbox_iter[max_str_blen]);
  }
  return 0;
}
1390 
1391 // could merge this with non-subset case, but this isn't much code
1392 /*
1393 uint32_t populate_strbox_subset_htable(const uintptr_t* __restrict subset_mask, const char* strbox, uintptr_t raw_str_ct, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t str_htable_size, uint32_t* str_htable) {
1394   // may want subset_mask parameter later
1395   SetAllU32Arr(str_htable_size, str_htable);
1396   uintptr_t str_uidx = 0;
1397   for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx, ++str_uidx) {
1398     MovWTo1Bit(subset_mask, &str_uidx);
1399     const char* cur_str = &(strbox[str_uidx * max_str_blen]);
1400     const uint32_t slen = strlen(cur_str);
1401     uint32_t hashval = Hashceil(cur_str, slen, str_htable_size);
1402     while (1) {
1403       const uint32_t cur_htable_entry = str_htable[hashval];
1404       if (cur_htable_entry == UINT32_MAX) {
1405         str_htable[hashval] = str_uidx;
1406         break;
1407       }
1408       if (memequal(cur_str, &(strbox[cur_htable_entry * max_str_blen]), slen + 1)) {
1409         // guaranteed to be positive
1410         return str_uidx;
1411       }
1412       if (++hashval == str_htable_size) {
1413         hashval = 0;
1414       }
1415     }
1416   }
1417   return 0;
1418 }
1419 */
1420 
IdHtableFind(const char * cur_id,const char * const * item_ids,const uint32_t * id_htable,uint32_t cur_id_slen,uint32_t id_htable_size)1421 uint32_t IdHtableFind(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size) {
1422   // returns UINT32_MAX on failure
1423   for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
1424     const uint32_t cur_htable_idval = id_htable[hashval];
1425     if ((cur_htable_idval == UINT32_MAX) || (!strcmp(cur_id, item_ids[cur_htable_idval]))) {
1426       return cur_htable_idval;
1427     }
1428     if (++hashval == id_htable_size) {
1429       hashval = 0;
1430     }
1431   }
1432 }
1433 
IdHtableFindNnt(const char * cur_id,const char * const * item_ids,const uint32_t * id_htable,uint32_t cur_id_slen,uint32_t id_htable_size)1434 uint32_t IdHtableFindNnt(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size) {
1435   // returns UINT32_MAX on failure
1436   for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
1437     const uint32_t cur_htable_idval = id_htable[hashval];
1438     if ((cur_htable_idval == UINT32_MAX) || (memequal(cur_id, item_ids[cur_htable_idval], cur_id_slen) && (!item_ids[cur_htable_idval][cur_id_slen]))) {
1439       return cur_htable_idval;
1440     }
1441     if (++hashval == id_htable_size) {
1442       hashval = 0;
1443     }
1444   }
1445 }
1446 
1447 // assumes cur_id_slen < max_str_blen.
1448 // requires cur_id to be null-terminated.
StrboxHtableFind(const char * cur_id,const char * strbox,const uint32_t * id_htable,uintptr_t max_str_blen,uint32_t cur_id_slen,uint32_t id_htable_size)1449 uint32_t StrboxHtableFind(const char* cur_id, const char* strbox, const uint32_t* id_htable, uintptr_t max_str_blen, uint32_t cur_id_slen, uint32_t id_htable_size) {
1450   const uint32_t cur_id_blen = cur_id_slen + 1;
1451   for (uint32_t hashval = Hashceil(cur_id, cur_id_slen, id_htable_size); ; ) {
1452     const uint32_t cur_htable_idval = id_htable[hashval];
1453     if ((cur_htable_idval == UINT32_MAX) || memequal(cur_id, &(strbox[cur_htable_idval * max_str_blen]), cur_id_blen)) {
1454       return cur_htable_idval;
1455     }
1456     if (++hashval == id_htable_size) {
1457       hashval = 0;
1458     }
1459   }
1460 }
1461 
uint32_t VariantIdDupflagHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen) {
  // assumes duplicate variant IDs are flagged, but full variant_uidx linked
  // lists are not stored
  // idbuf does not need to be null-terminated (note that this is currently
  // achieved in a way that forces variant_ids[] entries to not be too close
  // to the end of bigstack, otherwise memequal behavior is potentially
  // undefined)
  // returns UINT32_MAX on failure, value with bit 31 set on duplicate
  if (cur_id_slen > max_id_slen) {
    // No stored ID is this long, so it can't match; skip the probe entirely.
    return UINT32_MAX;
  }
  // Linear probing.
  for (uint32_t hashval = Hashceil(idbuf, cur_id_slen, id_htable_size); ; ) {
    const uint32_t cur_htable_idval = id_htable[hashval];
    // Bit 31 of a stored value is the duplicate flag; it's masked off below
    // to recover the variant_uidx.  (UINT32_MAX == empty slot.)
    if ((cur_htable_idval == UINT32_MAX) || (memequal(idbuf, variant_ids[cur_htable_idval & 0x7fffffff], cur_id_slen) && (!variant_ids[cur_htable_idval & 0x7fffffff][cur_id_slen]))) {
      return cur_htable_idval;
    }
    if (++hashval == id_htable_size) {
      hashval = 0;
    }
  }
}
1483 
uint32_t VariantIdDupHtableFind(const char* idbuf, const char* const* variant_ids, const uint32_t* id_htable, const uint32_t* htable_dup_base, uint32_t cur_id_slen, uint32_t id_htable_size, uint32_t max_id_slen, uint32_t* llidx_ptr) {
  // Permits duplicate entries.  Similar to plink 1.9
  // extract_exclude_process_token().
  // - Returns UINT32_MAX on failure (llidx currently unset in that case),
  //   otherwise returns the index of the first match (which will have the
  //   highest index, due to how the linked list is constructed)
  // - Sets llidx to UINT32_MAX if not a duplicate, otherwise it's the
  //   position in htable_dup_base[] of the next {variant_uidx, next_llidx}
  //   linked list entry.
  // - idbuf does not need to be null-terminated.
  if (cur_id_slen > max_id_slen) {
    // No stored ID is this long, so it can't match.
    return UINT32_MAX;
  }
  // Linear probing.
  for (uint32_t hashval = Hashceil(idbuf, cur_id_slen, id_htable_size); ; ) {
    const uint32_t cur_htable_idval = id_htable[hashval];
    // Bit 31 flags a duplicated ID.
    const uint32_t cur_dup = cur_htable_idval >> 31;
    uint32_t cur_llidx;
    uint32_t variant_uidx;
    if (cur_dup) {
      // UINT32_MAX empty-entry code has high bit set, so only need to check
      // here
      if (cur_htable_idval == UINT32_MAX) {
        return UINT32_MAX;
      }
      // A duplicate entry stores (llidx >> 1) in its low 31 bits; the left
      // shift reconstitutes the (even) llidx and discards the flag bit.
      cur_llidx = cur_htable_idval << 1;
      variant_uidx = htable_dup_base[cur_llidx];
    } else {
      cur_llidx = UINT32_MAX;
      variant_uidx = cur_htable_idval;
    }
    const char* sptr = variant_ids[variant_uidx];
    // Length-limited compare plus terminator check, since idbuf may be
    // unterminated.
    if (memequal(idbuf, sptr, cur_id_slen) && (!sptr[cur_id_slen])) {
      *llidx_ptr = cur_llidx;
      return variant_uidx;
    }
    if (++hashval == id_htable_size) {
      hashval = 0;
    }
  }
}
1524 
1525 
PglErr CopySortStrboxSubsetNoalloc(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char* __restrict sorted_strbox, uint32_t* __restrict id_map) {
  // Stores a lexicographically sorted list of IDs in sorted_strbox and the raw
  // positions of the corresponding markers/samples in *id_map_ptr.  Does not
  // include excluded markers/samples in the list.
  // Assumes sorted_strbox and id_map have been allocated; use the
  // CopySortStrboxSubset() wrapper if they haven't been.
  // Note that this DOES still perform a "stack" allocation; 'Noalloc' just
  // means that sorted_strbox/id_map must already be allocated.
  //
  // collapse_idxs: if nonzero, id_map entries are subset-relative indexes
  //   (0..str_ct-1) instead of raw subset_mask bit positions.
  // use_nsort: if nonzero, "natural" sort order is used instead of plain
  //   lexicographic order.
  // allow_dups: if zero, a duplicate ID is reported and
  //   kPglRetMalformedInput is returned.
  if (!str_ct) {
    return kPglRetSuccess;
  }
  unsigned char* bigstack_mark = g_bigstack_base;
  PglErr reterr = kPglRetSuccess;
  {
#ifdef __cplusplus
    // C++ build, short-string case: sort fixed-width records with the string
    // bytes inlined (Strbuf28Ui/Strbuf60Ui: string followed by a 4-byte
    // original index), so comparisons don't chase pointers.
    if (max_str_blen <= 60) {
      uintptr_t wkspace_entry_blen = (max_str_blen > 28)? sizeof(Strbuf60Ui) : sizeof(Strbuf28Ui);
      char* sort_wkspace;
      if (unlikely(bigstack_alloc_c(str_ct * wkspace_entry_blen, &sort_wkspace))) {
        goto CopySortStrboxSubsetNoalloc_ret_NOMEM;
      }
      // Number of bytes in each record preceding the trailing 4-byte index.
      const uint32_t wkspace_entry_blen_m4 = wkspace_entry_blen - 4;
      char* sort_wkspace_iter = sort_wkspace;
      uintptr_t str_uidx_base = 0;
      uintptr_t subset_mask_bits = subset_mask[0];
      for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
        const uint32_t str_uidx = BitIter1(subset_mask, &str_uidx_base, &subset_mask_bits);
        strcpy(sort_wkspace_iter, &(orig_strbox[str_uidx * max_str_blen]));
        sort_wkspace_iter = &(sort_wkspace_iter[wkspace_entry_blen_m4]);
        if (collapse_idxs) {
          memcpy(sort_wkspace_iter, &str_idx, sizeof(int32_t));
        } else {
          memcpy(sort_wkspace_iter, &str_uidx, sizeof(int32_t));
        }
        sort_wkspace_iter = &(sort_wkspace_iter[sizeof(int32_t)]);
      }
      if (wkspace_entry_blen == 32) {
        SortStrbox32bFinish(str_ct, max_str_blen, use_nsort, R_CAST(Strbuf28Ui*, sort_wkspace), sorted_strbox, id_map);
      } else {
        SortStrbox64bFinish(str_ct, max_str_blen, use_nsort, R_CAST(Strbuf60Ui*, sort_wkspace), sorted_strbox, id_map);
      }
    } else {
#endif
      // General case: sort {string pointer, original index} records, then
      // copy the strings into sorted_strbox afterward.
      StrSortIndexedDerefOverread* sort_wkspace;
      if (unlikely(BIGSTACK_ALLOC_X(StrSortIndexedDerefOverread, str_ct, &sort_wkspace))) {
        goto CopySortStrboxSubsetNoalloc_ret_NOMEM;
      }
      uintptr_t str_uidx_base = 0;
      uintptr_t cur_bits = subset_mask[0];
      for (uint32_t str_idx = 0; str_idx != str_ct; ++str_idx) {
        uintptr_t str_uidx = BitIter1(subset_mask, &str_uidx_base, &cur_bits);
        sort_wkspace[str_idx].strptr = &(orig_strbox[str_uidx * max_str_blen]);
        if (collapse_idxs) {
          sort_wkspace[str_idx].orig_idx = str_idx;
        } else {
          sort_wkspace[str_idx].orig_idx = str_uidx;
        }
      }
      if (!use_nsort) {
        STD_SORT_PAR_UNSEQ(str_ct, strcmp_overread_deref, sort_wkspace);
      } else {
#ifdef __cplusplus
        // NOTE(review): null comparator presumably selects the record type's
        // own operator<, which is assumed to implement natural sort order --
        // confirm against StrNsortIndexedDeref's definition.
        StrNsortIndexedDeref* wkspace_alias = R_CAST(StrNsortIndexedDeref*, sort_wkspace);
        STD_SORT_PAR_UNSEQ(str_ct, nullptr, wkspace_alias);
#else
        STD_SORT_PAR_UNSEQ(str_ct, strcmp_natural_deref, sort_wkspace);
#endif
      }
      // Materialize the sorted strings and their original indexes.
      for (uintptr_t str_idx = 0; str_idx != str_ct; ++str_idx) {
        strcpy(&(sorted_strbox[str_idx * max_str_blen]), sort_wkspace[str_idx].strptr);
        id_map[str_idx] = sort_wkspace[str_idx].orig_idx;
      }
#ifdef __cplusplus
    }
#endif
    if (!allow_dups) {
      // Duplicates are adjacent after sorting, so a single scan finds them.
      char* dup_id = ScanForDuplicateIds(sorted_strbox, str_ct, max_str_blen);
      if (unlikely(dup_id)) {
        TabsToSpaces(dup_id);
        logerrprintfww("Error: Duplicate ID '%s'.\n", dup_id);
        goto CopySortStrboxSubsetNoalloc_ret_MALFORMED_INPUT;
      }
    }
  }
  while (0) {
  CopySortStrboxSubsetNoalloc_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  CopySortStrboxSubsetNoalloc_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  }
  // Release the temporary sort workspace.
  BigstackReset(bigstack_mark);
  return reterr;
}
1621 
CopySortStrboxSubset(const uintptr_t * __restrict subset_mask,const char * __restrict orig_strbox,uintptr_t str_ct,uintptr_t max_str_blen,uint32_t allow_dups,uint32_t collapse_idxs,uint32_t use_nsort,char ** sorted_strbox_ptr,uint32_t ** id_map_ptr)1622 PglErr CopySortStrboxSubset(const uintptr_t* __restrict subset_mask, const char* __restrict orig_strbox, uintptr_t str_ct, uintptr_t max_str_blen, uint32_t allow_dups, uint32_t collapse_idxs, uint32_t use_nsort, char** sorted_strbox_ptr, uint32_t** id_map_ptr) {
1623   // id_map on bottom because --indiv-sort frees *sorted_strbox_ptr
1624   if (unlikely(
1625           bigstack_alloc_u32(str_ct, id_map_ptr) ||
1626           bigstack_alloc_c(str_ct * max_str_blen, sorted_strbox_ptr))) {
1627     return kPglRetNomem;
1628   }
1629   return CopySortStrboxSubsetNoalloc(subset_mask, orig_strbox, str_ct, max_str_blen, allow_dups, collapse_idxs, use_nsort, *sorted_strbox_ptr, *id_map_ptr);
1630 }
1631 
1632 
InitRangeList(RangeList * range_list_ptr)1633 void InitRangeList(RangeList* range_list_ptr) {
1634   range_list_ptr->names = nullptr;
1635   range_list_ptr->starts_range = nullptr;
1636   range_list_ptr->name_ct = 0;
1637   range_list_ptr->name_max_blen = 0;
1638 }
1639 
CleanupRangeList(RangeList * range_list_ptr)1640 void CleanupRangeList(RangeList* range_list_ptr) {
1641   free_cond(range_list_ptr->names);
1642   // starts_range now uses same allocation
1643 }
1644 
NumericRangeListToBitarr(const RangeList * range_list_ptr,uint32_t bitarr_size,uint32_t offset,uint32_t ignore_overflow,uintptr_t * bitarr)1645 BoolErr NumericRangeListToBitarr(const RangeList* range_list_ptr, uint32_t bitarr_size, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr) {
1646   // bitarr assumed to be initialized (but not necessarily zero-initialized)
1647   const char* names = range_list_ptr->names;
1648   const unsigned char* starts_range = range_list_ptr->starts_range;
1649   const uint32_t name_ct = range_list_ptr->name_ct;
1650   const uint32_t name_max_blen = range_list_ptr->name_max_blen;
1651   const uint32_t idx_max = bitarr_size + offset;
1652   for (uint32_t name_idx = 0; name_idx != name_ct; ++name_idx) {
1653     uint32_t idx1;
1654     if (ScanUintCapped(&(names[name_idx * name_max_blen]), idx_max, &idx1)) {
1655       if (likely(ignore_overflow)) {
1656         continue;
1657       }
1658       return 1;
1659     }
1660     if (unlikely(idx1 < offset)) {
1661       return 1;
1662     }
1663     if (starts_range[name_idx]) {
1664       ++name_idx;
1665       uint32_t idx2;
1666       if (ScanUintCapped(&(names[name_idx * name_max_blen]), idx_max, &idx2)) {
1667         if (unlikely(!ignore_overflow)) {
1668           return 1;
1669         }
1670         idx2 = idx_max - 1;
1671       }
1672       FillBitsNz(idx1 - offset, (idx2 - offset) + 1, bitarr);
1673     } else {
1674       SetBit(idx1 - offset, bitarr);
1675     }
1676   }
1677   return 0;
1678 }
1679 
PglErr StringRangeListToBitarr(const char* header_line, const RangeList* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t* bitarr, int32_t* __restrict seen_idxs) {
  // Sets bit k of bitarr iff header token k is selected by the names/ranges
  // in range_list_ptr.
  // bitarr assumed to be zero-initialized
  // if fixed_len is zero, header_line is assumed to be a list of
  // space-delimited unequal-length names
  // seen_idxs[] must have one entry per name, all initialized to -1; entry j
  // is set to the header-token index at which command-line name j was found.
  // sorted_ids/id_map come from sorting range_list_ptr->names (id_map may be
  // null when the sorted order equals the command-line order).
  assert(token_ct);
  assert(!PopcountWords(bitarr, BitCtToWordCt(token_ct)));
  PglErr reterr = kPglRetSuccess;
  {
    const char* header_line_iter = header_line;
    const uintptr_t name_ct = range_list_ptr->name_ct;
    const uintptr_t max_id_blen = range_list_ptr->name_max_blen;
    for (uint32_t item_idx = 0; ; ) {
      const char* token_end = CommaOrTspaceTokenEnd(header_line_iter, comma_delim);
      // Is this header token one of the command-line names?
      const int32_t sorted_pos = bsearch_str(header_line_iter, sorted_ids, token_end - header_line_iter, max_id_blen, name_ct);
      if (sorted_pos != -1) {
        uint32_t cmdline_pos = sorted_pos;
        if (id_map) {
          // Translate sorted position back to command-line order.
          cmdline_pos = id_map[S_CAST(uint32_t, sorted_pos)];
        }
        if (unlikely(seen_idxs[cmdline_pos] != -1)) {
          logerrprintfww("Error: --%s: '%s' appears multiple times in header line of %s.\n", range_list_flag, &(sorted_ids[S_CAST(uint32_t, sorted_pos) * max_id_blen]), file_descrip);
          goto StringRangeListToBitarr_ret_MALFORMED_INPUT;
        }
        seen_idxs[cmdline_pos] = item_idx;
        if (cmdline_pos && range_list_ptr->starts_range[cmdline_pos - 1]) {
          // This name is a range endpoint; its start must already have been
          // seen earlier in the header line.
          if (unlikely(seen_idxs[cmdline_pos - 1] == -1)) {
            uint32_t first_sorted_pos = cmdline_pos - 1;
            if (id_map) {
              // Reverse-lookup the range start's sorted position, only
              // needed to print its name in the error message.
              for (first_sorted_pos = 0; ; ++first_sorted_pos) {
                if (id_map[first_sorted_pos] == cmdline_pos - 1) {
                  break;
                }
              }
            }
            snprintf(g_logbuf, kLogbufSize, "Error: --%s: '%s' appears before '%s' in header line of %s.\n", range_list_flag, &(sorted_ids[S_CAST(uint32_t, sorted_pos) * max_id_blen]), &(sorted_ids[first_sorted_pos * max_id_blen]), file_descrip);
            goto StringRangeListToBitarr_ret_INVALID_CMDLINE_WW;
          }
          // Set every bit from the range start's token through this one.
          FillBitsNz(seen_idxs[cmdline_pos - 1], item_idx + 1, bitarr);
        } else if (!(range_list_ptr->starts_range[cmdline_pos])) {
          // Standalone name: set just this token's bit.  (Range starts are
          // deferred until the endpoint is found.)
          SetBit(item_idx, bitarr);
        }
      }
      if (++item_idx == token_ct) {
        break;
      }
      if (fixed_len) {
        header_line_iter = &(header_line_iter[fixed_len]);
      } else {
        header_line_iter = FirstNonTspace(&(token_end[1]));
      }
    }
    // Verify every command-line name was found in the header line.
    for (uint32_t cmdline_pos = 0; cmdline_pos != name_ct; ++cmdline_pos) {
      if (unlikely(seen_idxs[cmdline_pos] == -1)) {
        uint32_t sorted_pos = cmdline_pos;
        if (id_map) {
          for (sorted_pos = 0; ; ++sorted_pos) {
            if (id_map[sorted_pos] == cmdline_pos) {
              break;
            }
          }
        }
        snprintf(g_logbuf, kLogbufSize, "Error: --%s: '%s' does not appear in header line of %s.\n", range_list_flag, &(sorted_ids[sorted_pos * max_id_blen]), file_descrip);
        goto StringRangeListToBitarr_ret_INVALID_CMDLINE_WW;
      }
    }
  }
  while (0) {
  StringRangeListToBitarr_ret_INVALID_CMDLINE_WW:
    WordWrapB(0);
    logerrputsb();
    reterr = kPglRetInvalidCmdline;
    break;
  StringRangeListToBitarr_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  }
  return reterr;
}
1758 
StringRangeListToBitarrAlloc(const char * header_line,const RangeList * range_list_ptr,const char * __restrict range_list_flag,const char * __restrict file_descrip,uint32_t token_ct,uint32_t fixed_len,uint32_t comma_delim,uintptr_t ** bitarr_ptr)1759 PglErr StringRangeListToBitarrAlloc(const char* header_line, const RangeList* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uint32_t token_ct, uint32_t fixed_len, uint32_t comma_delim, uintptr_t** bitarr_ptr) {
1760   // wrapper for StringRangeListToBitarr which allocates the bitfield and
1761   // temporary buffers on the heap
1762   uintptr_t token_ctl = BitCtToWordCt(token_ct);
1763   uintptr_t name_ct = range_list_ptr->name_ct;
1764   int32_t* seen_idxs;
1765   char* sorted_ids;
1766   uint32_t* id_map;
1767   if (unlikely(
1768           bigstack_calloc_w(token_ctl, bitarr_ptr) ||
1769           bigstack_alloc_i32(name_ct, &seen_idxs))) {
1770     return kPglRetNomem;
1771   }
1772   // kludge to use CopySortStrboxSubset()
1773   SetAllBits(name_ct, R_CAST(uintptr_t*, seen_idxs));
1774   if (unlikely(CopySortStrboxSubset(R_CAST(uintptr_t*, seen_idxs), range_list_ptr->names, name_ct, range_list_ptr->name_max_blen, 0, 0, 0, &sorted_ids, &id_map))) {
1775     return kPglRetNomem;
1776   }
1777   SetAllI32Arr(name_ct, seen_idxs);
1778   PglErr reterr = StringRangeListToBitarr(header_line, range_list_ptr, sorted_ids, id_map, range_list_flag, file_descrip, token_ct, fixed_len, comma_delim, *bitarr_ptr, seen_idxs);
1779   BigstackReset(seen_idxs);
1780   return reterr;
1781 }
1782 
1783 
uintptr_t PopcountBitRange(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
  // Counts the set bits in bitvec[start_idx, end_idx).
  uintptr_t first_widx = start_idx / kBitsPerWord;
  const uintptr_t start_rem = start_idx % kBitsPerWord;
  const uintptr_t last_widx = end_idx / kBitsPerWord;
  const uintptr_t end_rem = end_idx % kBitsPerWord;
  if (first_widx == last_widx) {
    // Entire range within one word: mask off bits below start_idx and at or
    // above end_idx.
    return PopcountWord(bitvec[first_widx] & ((k1LU << end_rem) - (k1LU << start_rem)));
  }
  uintptr_t ct = 0;
  if (start_rem) {
    // Partial leading word.
    ct = PopcountWord(bitvec[first_widx] >> start_rem);
    ++first_widx;
  }
  if (last_widx > first_widx) {
    // Full interior words.
    ct += PopcountWordsNzbase(bitvec, first_widx, last_widx);
  }
  if (end_rem) {
    // Partial trailing word.
    ct += PopcountWord(bzhi(bitvec[last_widx], end_rem));
  }
  return ct;
}
1804 
1805 #ifdef USE_SSE42
PopcountWordsIntersect3val(const uintptr_t * __restrict bitvec1,const uintptr_t * __restrict bitvec2,uint32_t word_ct,uint32_t * __restrict popcount1_ptr,uint32_t * __restrict popcount2_ptr,uint32_t * __restrict popcount_intersect_ptr)1806 void PopcountWordsIntersect3val(const uintptr_t* __restrict bitvec1, const uintptr_t* __restrict bitvec2, uint32_t word_ct, uint32_t* __restrict popcount1_ptr, uint32_t* __restrict popcount2_ptr, uint32_t* __restrict popcount_intersect_ptr) {
1807   uint32_t ct1 = 0;
1808   uint32_t ct2 = 0;
1809   uint32_t ct3 = 0;
1810   for (uint32_t widx = 0; widx != word_ct; ++widx) {
1811     const uintptr_t word1 = bitvec1[widx];
1812     const uintptr_t word2 = bitvec2[widx];
1813     ct1 += PopcountWord(word1);
1814     ct2 += PopcountWord(word2);
1815     ct3 += PopcountWord(word1 & word2);
1816   }
1817   *popcount1_ptr = ct1;
1818   *popcount2_ptr = ct2;
1819   *popcount_intersect_ptr = ct3;
1820 }
1821 #else
static inline void PopcountVecsNoSse42Intersect3val(const VecW* __restrict vvec1_iter, const VecW* __restrict vvec2_iter, uint32_t vec_ct, uint32_t* __restrict popcount1_ptr, uint32_t* popcount2_ptr, uint32_t* __restrict popcount_intersect_ptr) {
  // Computes popcounts of vvec1, vvec2, and (vvec1 & vvec2) in a single
  // pass, using the classic bit-slicing technique (1-bit -> 2-bit -> 4-bit
  // partial sums, then byte sums via vecw_bytesum) instead of a hardware
  // popcount instruction.
  // ct must be a multiple of 3.
  assert(!(vec_ct % 3));
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  VecW acc1 = vecw_setzero();
  VecW acc2 = vecw_setzero();
  VecW acc3 = vecw_setzero();
  // At most 30 vectors per inner pass; presumably this bounds the packed
  // partial sums in inner_acc* so they can't overflow their lanes before
  // being folded into the full-width accumulators -- confirm if modifying.
  uintptr_t cur_incr = 30;
  for (; ; vec_ct -= cur_incr) {
    if (vec_ct < 30) {
      if (!vec_ct) {
        // All input consumed: reduce the accumulators to scalar totals.
        *popcount1_ptr = HsumW(acc1);
        *popcount2_ptr = HsumW(acc2);
        *popcount_intersect_ptr = HsumW(acc3);
        return;
      }
      // Final, short pass.
      cur_incr = vec_ct;
    }
    VecW inner_acc1 = vecw_setzero();
    VecW inner_acc2 = vecw_setzero();
    VecW inner_acc3 = vecw_setzero();
    const VecW* vvec1_stop = &(vvec1_iter[cur_incr]);
    do {
      // Each iteration consumes 3 vectors per input.  The first two pairs
      // (a/b) go through the usual reduction; the third pair is pre-split
      // into even/odd bit halves (half*a/half*b) that are merged into the
      // a/b counts at the 2-bit stage, saving a separate reduction.
      VecW count1a = *vvec1_iter++;
      VecW count2a = *vvec2_iter++;
      VecW count3a = count1a & count2a;
      VecW count1b = *vvec1_iter++;
      VecW count2b = *vvec2_iter++;
      VecW count3b = count1b & count2b;
      VecW half1a = *vvec1_iter++;
      VecW half2a = *vvec2_iter++;
      const VecW half1b = vecw_srli(half1a, 1) & m1;
      const VecW half2b = vecw_srli(half2a, 1) & m1;
      half1a = half1a & m1;
      half2a = half2a & m1;

      // 1-bit -> 2-bit partial sums.
      count1a = count1a - (vecw_srli(count1a, 1) & m1);
      count2a = count2a - (vecw_srli(count2a, 1) & m1);
      count3a = count3a - (vecw_srli(count3a, 1) & m1);
      count1b = count1b - (vecw_srli(count1b, 1) & m1);
      count2b = count2b - (vecw_srli(count2b, 1) & m1);
      count3b = count3b - (vecw_srli(count3b, 1) & m1);
      // Fold the third vector pair's bit-halves in; the intersection term
      // uses (half1 & half2).
      count1a = count1a + half1a;
      count2a = count2a + half2a;
      count3a = count3a + (half1a & half2a);
      count1b = count1b + half1b;
      count2b = count2b + half2b;
      count3b = count3b + (half1b & half2b);

      // 2-bit -> 4-bit partial sums, merging the a and b streams.
      count1a = (count1a & m2) + (vecw_srli(count1a, 2) & m2);
      count2a = (count2a & m2) + (vecw_srli(count2a, 2) & m2);
      count3a = (count3a & m2) + (vecw_srli(count3a, 2) & m2);
      count1a = count1a + (count1b & m2) + (vecw_srli(count1b, 2) & m2);
      count2a = count2a + (count2b & m2) + (vecw_srli(count2b, 2) & m2);
      count3a = count3a + (count3b & m2) + (vecw_srli(count3b, 2) & m2);
      // 4-bit -> 8-bit partial sums, accumulated across the inner pass.
      inner_acc1 = inner_acc1 + (count1a & m4) + (vecw_srli(count1a, 4) & m4);
      inner_acc2 = inner_acc2 + (count2a & m4) + (vecw_srli(count2a, 4) & m4);
      inner_acc3 = inner_acc3 + (count3a & m4) + (vecw_srli(count3a, 4) & m4);
    } while (vvec1_iter < vvec1_stop);
    // too much register pressure to use prev_sad_result pattern?
    // Byte-sum each inner accumulator into the wide accumulators.
    const VecW m0 = vecw_setzero();
    acc1 = acc1 + vecw_bytesum(inner_acc1, m0);
    acc2 = acc2 + vecw_bytesum(inner_acc2, m0);
    acc3 = acc3 + vecw_bytesum(inner_acc3, m0);
  }
}
1890 
PopcountWordsIntersect3val(const uintptr_t * __restrict bitvec1,const uintptr_t * __restrict bitvec2,uint32_t word_ct,uint32_t * __restrict popcount1_ptr,uint32_t * __restrict popcount2_ptr,uint32_t * __restrict popcount_intersect_ptr)1891 void PopcountWordsIntersect3val(const uintptr_t* __restrict bitvec1, const uintptr_t* __restrict bitvec2, uint32_t word_ct, uint32_t* __restrict popcount1_ptr, uint32_t* __restrict popcount2_ptr, uint32_t* __restrict popcount_intersect_ptr) {
1892   const uint32_t trivec_ct = word_ct / (3 * kWordsPerVec);
1893   uint32_t ct1;
1894   uint32_t ct2;
1895   uint32_t ct3;
1896   PopcountVecsNoSse42Intersect3val(R_CAST(const VecW*, bitvec1), R_CAST(const VecW*, bitvec2), trivec_ct * 3, &ct1, &ct2, &ct3);
1897   const uint32_t words_consumed = trivec_ct * (3 * kWordsPerVec);
1898   bitvec1 = &(bitvec1[words_consumed]);
1899   bitvec2 = &(bitvec2[words_consumed]);
1900   const uint32_t remainder = word_ct - words_consumed;
1901   for (uint32_t widx = 0; widx != remainder; ++widx) {
1902     const uintptr_t word1 = bitvec1[widx];
1903     const uintptr_t word2 = bitvec2[widx];
1904     ct1 += PopcountWord(word1);
1905     ct2 += PopcountWord(word2);
1906     ct3 += PopcountWord(word1 & word2);
1907   }
1908   *popcount1_ptr = ct1;
1909   *popcount2_ptr = ct2;
1910   *popcount_intersect_ptr = ct3;
1911 }
1912 #endif
1913 
AllBitsAreZero(const uintptr_t * bitarr,uintptr_t start_idx,uintptr_t end_idx)1914 uint32_t AllBitsAreZero(const uintptr_t* bitarr, uintptr_t start_idx, uintptr_t end_idx) {
1915   uintptr_t start_idxl = start_idx / kBitsPerWord;
1916   const uintptr_t start_idxlr = start_idx & (kBitsPerWord - 1);
1917   const uintptr_t end_idxl = end_idx / kBitsPerWord;
1918   const uintptr_t end_idxlr = end_idx & (kBitsPerWord - 1);
1919   if (start_idxl == end_idxl) {
1920     return !(bitarr[start_idxl] & ((k1LU << end_idxlr) - (k1LU << start_idxlr)));
1921   }
1922   if (start_idxlr) {
1923     if (bitarr[start_idxl++] >> start_idxlr) {
1924       return 0;
1925     }
1926   }
1927   for (; start_idxl != end_idxl; ++start_idxl) {
1928     if (bitarr[start_idxl]) {
1929       return 0;
1930     }
1931   }
1932   if (!end_idxlr) {
1933     return 1;
1934   }
1935   return !bzhi(bitarr[end_idxl], end_idxlr);
1936 }
1937 
void CopyBitarrRange(const uintptr_t* __restrict src_bitarr, uintptr_t src_start_bitidx, uintptr_t target_start_bitidx, uintptr_t len, uintptr_t* __restrict target_bitarr) {
  // Copies len bits starting at src_bitarr bit src_start_bitidx into
  // target_bitarr starting at bit target_start_bitidx, ORing into the
  // existing target words.
  // assumes len is positive, and relevant bits of target_bitarr are zero
  const uintptr_t* src_bitarr_iter = &(src_bitarr[src_start_bitidx / kBitsPerWord]);
  uint32_t src_rshift = src_start_bitidx % kBitsPerWord;
  uintptr_t* target_bitarr_iter = &(target_bitarr[target_start_bitidx / kBitsPerWord]);
  uint32_t target_initial_lshift = target_start_bitidx % kBitsPerWord;
  uintptr_t cur_src_word;
  if (target_initial_lshift) {
    // Fill the tail of the first (partial) target word.
    const uint32_t initial_copy_bitct = kBitsPerWord - target_initial_lshift;
    if (len <= initial_copy_bitct) {
      // Everything fits in this one partial word; share the tail logic.
      goto CopyBitarrRange_last_partial_word;
    }
    cur_src_word = (*src_bitarr_iter) >> src_rshift;
    if (src_rshift >= target_initial_lshift) {
      // Needed bits straddle a source-word boundary; stitch in the next
      // source word.
      ++src_bitarr_iter;
      cur_src_word |= (*src_bitarr_iter) << (kBitsPerWord - src_rshift);
    }
    *target_bitarr_iter++ |= cur_src_word << target_initial_lshift;
    src_rshift = (src_rshift + initial_copy_bitct) % kBitsPerWord;
    len -= initial_copy_bitct;
  }
  {
    // Copy full target words.  If source is now word-aligned with the
    // target, a plain memcpy suffices; otherwise each target word is
    // stitched from two adjacent source words.
    const uintptr_t fullword_ct = len / kBitsPerWord;
    if (!src_rshift) {
      memcpy(target_bitarr_iter, src_bitarr_iter, fullword_ct * sizeof(intptr_t));
      target_bitarr_iter = &(target_bitarr_iter[fullword_ct]);
      src_bitarr_iter = &(src_bitarr_iter[fullword_ct]);
    } else {
      const uint32_t src_lshift = kBitsPerWord - src_rshift;
      cur_src_word = *src_bitarr_iter;
      for (uintptr_t widx = 0; widx != fullword_ct; ++widx) {
        const uintptr_t next_src_word = *(++src_bitarr_iter);
        *target_bitarr_iter++ = (cur_src_word >> src_rshift) | (next_src_word << src_lshift);
        cur_src_word = next_src_word;
      }
    }
  }
  len %= kBitsPerWord;
  if (len) {
    // Final partial target word (also reached directly via goto when the
    // whole copy fits in one partial word).
    target_initial_lshift = 0;
  CopyBitarrRange_last_partial_word:
    cur_src_word = (*src_bitarr_iter) >> src_rshift;
    if (len + src_rshift > kBitsPerWord) {
      cur_src_word |= src_bitarr_iter[1] << (kBitsPerWord - src_rshift);
    }
    *target_bitarr_iter |= (cur_src_word & ((~k0LU) >> (kBitsPerWord - S_CAST(uint32_t, len)))) << target_initial_lshift;
  }
}
1986 
// advances forward_ct set bits; forward_ct must be positive.  (stays put if
// forward_ct == 1 and current bit is set.  may want to tweak this interface,
// easy to introduce off-by-one bugs...)
// In usual 64-bit case, also assumes bitvec is 16-byte aligned and the end of
// the trailing 16-byte block can be safely read from.
uintptr_t FindNth1BitFrom(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct) {
  assert(forward_ct);
  uintptr_t widx = cur_pos / kBitsPerWord;
  uintptr_t ulii = cur_pos % kBitsPerWord;
  const uintptr_t* bptr = &(bitvec[widx]);
  uintptr_t uljj;
  uintptr_t ulkk;
#ifdef __LP64__
  const VecW* vptr;
  assert(VecIsAligned(bitvec));
#endif
  if (ulii) {
    // Partial first word.
    uljj = (*bptr) >> ulii;
    ulkk = PopcountWord(uljj);
    if (ulkk >= forward_ct) {
    JumpForwardSetUnsafe_finish:
      // uljj holds the remaining bits of the word at widx, with ulii being
      // the in-word bit offset of uljj's bit 0.
      return widx * kBitsPerWord + ulii + WordBitIdxToUidx(uljj, forward_ct - 1);
    }
    forward_ct -= ulkk;
    ++widx;
    ++bptr;
  }
  ulii = 0;
#ifdef __LP64__
  // Advance word-by-word up to the next vector boundary.
  while (widx & (kWordsPerVec - k1LU)) {
    uljj = *bptr;
    ulkk = PopcountWord(uljj);
    if (ulkk >= forward_ct) {
      goto JumpForwardSetUnsafe_finish;
    }
    forward_ct -= ulkk;
    ++widx;
    ++bptr;
  }
  vptr = R_CAST(const VecW*, bptr);
#ifdef USE_AVX2
  // Skip ahead in large vectorized-popcount chunks.  A chunk of uljj
  // vectors contains at most uljj * kBitsPerWord * kWordsPerVec <=
  // forward_ct - 1 set bits, so the target bit can never be inside it.
  while (forward_ct > kBitsPerWord * (16 * kWordsPerVec)) {
    uljj = (forward_ct - 1) / (kBitsPerWord * kWordsPerVec);
    ulkk = PopcountVecsAvx2(vptr, uljj);
    vptr = &(vptr[uljj]);
    forward_ct -= ulkk;
  }
#else
  // Same chunked skipping, but the helper requires a multiple of 3 vectors.
  while (forward_ct > kBitsPerWord * (3 * kWordsPerVec)) {
    uljj = ((forward_ct - 1) / (kBitsPerWord * (3 * kWordsPerVec))) * 3;
    // yeah, yeah, this is suboptimal if we have SSE4.2
    ulkk = PopcountVecsNoAvx2(vptr, uljj);
    vptr = &(vptr[uljj]);
    forward_ct -= ulkk;
  }
#endif
  bptr = R_CAST(const uintptr_t*, vptr);
  // Word-at-a-time until the target is within reach of a single word.
  while (forward_ct > kBitsPerWord) {
    forward_ct -= PopcountWord(*bptr++);
  }
#else
  // 32-bit build: chunked word skipping only.
  while (forward_ct > kBitsPerWord) {
    uljj = (forward_ct - 1) / kBitsPerWord;
    ulkk = PopcountWords(bptr, uljj);
    bptr = &(bptr[uljj]);
    forward_ct -= ulkk;
  }
#endif
  // Final scan: locate the word containing the target bit.
  for (; ; ++bptr) {
    uljj = *bptr;
    ulkk = PopcountWord(uljj);
    if (ulkk >= forward_ct) {
      widx = bptr - bitvec;
      goto JumpForwardSetUnsafe_finish;
    }
    forward_ct -= ulkk;
  }
}
2065 
ComputeUidxStartPartition(const uintptr_t * variant_include,uint64_t variant_ct,uint32_t thread_ct,uint32_t first_variant_uidx,uint32_t * variant_uidx_starts)2066 void ComputeUidxStartPartition(const uintptr_t* variant_include, uint64_t variant_ct, uint32_t thread_ct, uint32_t first_variant_uidx, uint32_t* variant_uidx_starts) {
2067   assert(variant_ct);
2068   uint32_t cur_variant_uidx_start = AdvTo1Bit(variant_include, first_variant_uidx);
2069   uint32_t cur_variant_idx_start = 0;
2070   variant_uidx_starts[0] = cur_variant_uidx_start;
2071   for (uint32_t tidx = 1; tidx != thread_ct; ++tidx) {
2072     const uint32_t new_variant_idx_start = (tidx * variant_ct) / thread_ct;
2073     if (new_variant_idx_start != cur_variant_idx_start) {
2074       cur_variant_uidx_start = FindNth1BitFrom(variant_include, cur_variant_uidx_start + 1, new_variant_idx_start - cur_variant_idx_start);
2075       cur_variant_idx_start = new_variant_idx_start;
2076     }
2077     variant_uidx_starts[tidx] = cur_variant_uidx_start;
2078   }
2079 }
2080 
2081 // May want to have an multiallelic_set bitarray to accelerate this type of
2082 // operation?  Probably only want to conditionally initialize it, and only
2083 // after variant filtering is complete, though.
uintptr_t CountExtraAlleles(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t multiallelic_variant_ct_multiplier) {
  // Sums (allele_ct - 2 + multiallelic_variant_ct_multiplier) over the
  // multiallelic variants selected in [variant_uidx_start,
  // variant_uidx_end); biallelic variants contribute nothing.
  if (!allele_idx_offsets) {
    // Null offsets array means every variant is biallelic.
    return 0;
  }
  const uint32_t variant_ct = PopcountBitRange(variant_include, variant_uidx_start, variant_uidx_end);
  const uint32_t const_subtract = 2 - multiallelic_variant_ct_multiplier;
  uintptr_t extra_allele_ct = 0;
  uintptr_t variant_uidx_base;
  uintptr_t cur_bits;
  BitIter1Start(variant_include, variant_uidx_start, &variant_uidx_base, &cur_bits);
  for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
    const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
    const uintptr_t allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offsets[variant_uidx];
    if (allele_ct != 2) {
      extra_allele_ct += allele_ct - const_subtract;
    }
  }
  return extra_allele_ct;
}
2103 
MaxAlleleCtSubset(const uintptr_t * variant_include,const uintptr_t * allele_idx_offsets,uint32_t raw_variant_ct,uint32_t variant_ct,uint32_t max_allele_ct)2104 uint32_t MaxAlleleCtSubset(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct) {
2105   if (!allele_idx_offsets) {
2106     return 2;
2107   }
2108   if (raw_variant_ct == variant_ct) {
2109     return max_allele_ct;
2110   }
2111   uintptr_t variant_uidx_base = 0;
2112   uintptr_t cur_bits = variant_include[0];
2113   uintptr_t subset_max_allele_ct = 2;
2114   for (uint32_t uii = 0; uii != variant_ct; ++uii) {
2115     const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
2116     const uint32_t allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offsets[variant_uidx];
2117     if (allele_ct > subset_max_allele_ct) {
2118       if (allele_ct == max_allele_ct) {
2119         return max_allele_ct;
2120       }
2121       subset_max_allele_ct = allele_ct;
2122     }
2123   }
2124   return subset_max_allele_ct;
2125 }
2126 
void ComputePartitionAligned(const uintptr_t* variant_include, uint32_t orig_thread_ct, uint32_t first_variant_uidx, uint32_t cur_variant_idx, uint32_t cur_variant_ct, uint32_t alignment, uint32_t* variant_uidx_starts, uint32_t* vidx_starts) {
  // Minimize size of the largest chunk, under the condition that all
  // intermediate variant_idx values are divisible by alignment.
  //
  // There are three possibilities:
  // 1. The straightforward solution one gets from rounding cur_variant_idx
  //    down, and (cur_variant_idx + variant_ct) up, is optimal.  Call this
  //    chunk size C.
  // 2. C - leading_idx_ct is ideal.
  // 3. C - trailing_idx_ct is ideal.
  //
  // alignment must be a power of 2 (ctzu32() below extracts its log2).
  // On exit, vidx_starts[] has orig_thread_ct + 1 entries (the last one
  // being variant_idx_end) and variant_uidx_starts[] has orig_thread_ct
  // entries.

  assert(cur_variant_ct);
  const uint32_t variant_idx_end = cur_variant_idx + cur_variant_ct;
  const uint32_t log2_align = ctzu32(alignment);
  // Partial-block bit counts hanging off the front and back of the
  // [cur_variant_idx, variant_idx_end) window after alignment rounding.
  const uint32_t leading_idx_ct = cur_variant_idx % alignment;
  const uint32_t trailing_idx_ct = (-variant_idx_end) % alignment;
  const uint32_t block_ct = (cur_variant_ct + leading_idx_ct + trailing_idx_ct) >> log2_align;
  uint32_t cur_variant_uidx_start = AdvTo1Bit(variant_include, first_variant_uidx);
  variant_uidx_starts[0] = cur_variant_uidx_start;
  vidx_starts[0] = cur_variant_idx;
  // Never use more threads than alignment-blocks.
  const uint32_t thread_ct = MINV(orig_thread_ct, block_ct);
  if (thread_ct > 1) {
    const uint32_t std_blocks_per_thread = 1 + (block_ct - 1) / thread_ct;
    // Possibilities 2 and 3 are only live if
    //   block_ct == (std_blocks_per_thread - 1) * thread_ct + 1, or
    //   block_ct == (std_blocks_per_thread - 1) * thread_ct + 2,
    // In the first situation, the best solution is
    //    min(possibility 2, possibility 3).
    // In the second situation, the best solution is
    //    max(possibility 2, possibility 3).
    uint32_t central_variant_ct = std_blocks_per_thread * alignment;
    uint32_t first_block_variant_ct = central_variant_ct - leading_idx_ct;
    // remainder_m1 == (number of threads receiving std_blocks_per_thread
    // blocks) - 1; threads after that index get one block fewer.
    const uint32_t remainder_m1 = block_ct - (std_blocks_per_thread - 1) * thread_ct - 1;
    if (remainder_m1 <= 1) {
      central_variant_ct -= alignment;
      if ((!remainder_m1) && (leading_idx_ct < trailing_idx_ct)) {
        first_block_variant_ct -= alignment;
      }
    }
    cur_variant_uidx_start = FindNth1BitFrom(variant_include, cur_variant_uidx_start + 1, first_block_variant_ct);
    cur_variant_idx += first_block_variant_ct;
    variant_uidx_starts[1] = cur_variant_uidx_start;
    vidx_starts[1] = cur_variant_idx;
    for (uint32_t tidx = 2; tidx != thread_ct; ++tidx) {
      cur_variant_uidx_start = FindNth1BitFrom(variant_include, cur_variant_uidx_start + 1, central_variant_ct);
      cur_variant_idx += central_variant_ct;
      // bugfix (14 Nov 2017): this decrement was in the wrong place
      if (tidx == remainder_m1) {
        central_variant_ct -= alignment;
      }
      variant_uidx_starts[tidx] = cur_variant_uidx_start;
      vidx_starts[tidx] = cur_variant_idx;
    }
  }
  if (thread_ct < orig_thread_ct) {
    // More threads than blocks: surplus threads get empty ranges pinned to
    // the end of the window.
    uint32_t last_vidx_ct = variant_idx_end - vidx_starts[thread_ct - 1];
    cur_variant_uidx_start = FindNth1BitFrom(variant_include, cur_variant_uidx_start + 1, last_vidx_ct);
    for (uint32_t tidx = thread_ct; tidx != orig_thread_ct; ++tidx) {
      variant_uidx_starts[tidx] = cur_variant_uidx_start;
    }
    for (uint32_t tidx = thread_ct; tidx != orig_thread_ct; ++tidx) {
      vidx_starts[tidx] = variant_idx_end;
    }
  }
  vidx_starts[orig_thread_ct] = variant_idx_end;
}
2193 
BoolErr ParseNextRange(const char* const* argvk, uint32_t param_ct, char range_delim, uint32_t* cur_param_idx_ptr, const char** cur_arg_pptr, const char** range_start_ptr, uint32_t* rs_len_ptr, const char** range_end_ptr, uint32_t* re_len_ptr) {
  // Starts reading from argv[cur_param_idx][cur_pos].  If a valid range is
  // next, range_start + rs_len + range_end + re_len are updated.  If only a
  // single item is next, range_end is set to nullptr and range_start + rs_len
  // are updated.  If there are no items left, range_start is set to nullptr.
  // If the input is not well-formed, 1 is returned instead of 0.
  //
  // This is a resumable tokenizer: *cur_param_idx_ptr and *cur_arg_pptr carry
  // the scan position between calls, so the caller can iterate over items that
  // span multiple comma-separated command-line arguments.
  uint32_t cur_param_idx = *cur_param_idx_ptr;
  if (cur_param_idx > param_ct) {
    // Already consumed all arguments on a previous call.
    *cur_arg_pptr = nullptr;
    return 0;
  }
  const char* cur_arg_ptr = *cur_arg_pptr;
  while (1) {
    char cc = *cur_arg_ptr;
    if (!cc) {
      // End of the current argument; advance to the next one, if any.
      *cur_param_idx_ptr = ++cur_param_idx;
      if (cur_param_idx > param_ct) {
        *range_start_ptr = nullptr;
        return 0;
      }
      cur_arg_ptr = argvk[cur_param_idx];
      cc = *cur_arg_ptr;
    }
    if (unlikely(cc == range_delim)) {
      // A leading range-delimiter (item with empty range start) is malformed.
      return 1;
    }
    if (cc != ',') {
      break;
    }
    // Skip comma separators (possibly several in a row).
    ++cur_arg_ptr;
  }
  *range_start_ptr = cur_arg_ptr;
  char cc;
  do {
    cc = *(++cur_arg_ptr);
    if ((!cc) || (cc == ',')) {
      // Single item: terminated by end-of-argument or comma with no
      // range-delimiter in between.
      *rs_len_ptr = cur_arg_ptr - (*range_start_ptr);
      *cur_arg_pptr = cur_arg_ptr;
      *range_end_ptr = nullptr;
      return 0;
    }
  } while (cc != range_delim);
  *rs_len_ptr = cur_arg_ptr - (*range_start_ptr);
  cc = *(++cur_arg_ptr);
  // Malformed if the range start exceeds the ID length cap, or the range end
  // is empty / immediately followed by another delimiter.
  if (unlikely(((*rs_len_ptr) > kMaxIdSlen) || (!cc) || (cc == ',') || (cc == range_delim))) {
    return 1;
  }
  *range_end_ptr = cur_arg_ptr;
  do {
    cc = *(++cur_arg_ptr);
    if (unlikely(cc == range_delim)) {
      // Two range-delimiters within one item is malformed.
      return 1;
    }
  } while (cc && (cc != ','));
  *re_len_ptr = cur_arg_ptr - (*range_end_ptr);
  if (unlikely((*re_len_ptr) > kMaxIdSlen)) {
    return 1;
  }
  *cur_arg_pptr = cur_arg_ptr;
  return 0;
}
2255 
ParseNameRanges(const char * const * argvk,const char * errstr_append,uint32_t param_ct,uint32_t require_posint,char range_delim,RangeList * range_list_ptr)2256 PglErr ParseNameRanges(const char* const* argvk, const char* errstr_append, uint32_t param_ct, uint32_t require_posint, char range_delim, RangeList* range_list_ptr) {
2257   uint32_t name_ct = 0;
2258   uint32_t cur_param_idx = 1;
2259   uint32_t name_max_blen = 0;
2260   const char* cur_arg_ptr;
2261   const char* range_start;
2262   uint32_t rs_len;
2263   const char* range_end;
2264   uint32_t re_len;
2265   unsigned char* cur_name_starts_range;
2266   uint32_t last_val;
2267   uint32_t cur_val;
2268   // two passes.  first pass: count arguments, determine name_max_blen;
2269   // then allocate memory; then fill it.
2270   if (param_ct) {
2271     cur_arg_ptr = argvk[1];
2272     while (1) {
2273       if (unlikely(ParseNextRange(argvk, param_ct, range_delim, &cur_param_idx, &cur_arg_ptr, &range_start, &rs_len, &range_end, &re_len))) {
2274         logerrprintfww("Error: Invalid %s argument '%s'.\n", argvk[0], argvk[cur_param_idx]);
2275         logerrputs(errstr_append);
2276         return kPglRetInvalidCmdline;
2277       }
2278       if (!range_start) {
2279         break;
2280       }
2281       ++name_ct;
2282       if (rs_len > name_max_blen) {
2283         name_max_blen = rs_len;  // does NOT include trailing null yet
2284       }
2285       if (range_end) {
2286         ++name_ct;
2287         if (re_len > name_max_blen) {
2288           name_max_blen = re_len;
2289         }
2290       }
2291     }
2292   }
2293   if (unlikely(!name_ct)) {
2294     logerrprintf("Error: %s requires at least one value.\n%s", argvk[0], errstr_append);
2295     return kPglRetInvalidCmdline;
2296   }
2297   range_list_ptr->name_max_blen = ++name_max_blen;
2298   range_list_ptr->name_ct = name_ct;
2299   if (unlikely(pgl_malloc(name_ct * (S_CAST(uintptr_t, name_max_blen) + 1), &range_list_ptr->names))) {
2300     return kPglRetNomem;
2301   }
2302   range_list_ptr->starts_range = R_CAST(unsigned char*, &(range_list_ptr->names[name_ct * S_CAST(uintptr_t, name_max_blen)]));
2303   char* cur_name_str = range_list_ptr->names;
2304   cur_name_starts_range = range_list_ptr->starts_range;
2305   cur_param_idx = 1;
2306   cur_arg_ptr = argvk[1];
2307   while (1) {
2308     // second pass; this can't fail since we already validated
2309     ParseNextRange(argvk, param_ct, range_delim, &cur_param_idx, &cur_arg_ptr, &range_start, &rs_len, &range_end, &re_len);
2310     if (!range_start) {
2311       if (require_posint) {
2312         last_val = 0;
2313         for (cur_param_idx = 0; cur_param_idx != name_ct; ++cur_param_idx) {
2314           cur_name_str = &(range_list_ptr->names[cur_param_idx * S_CAST(uintptr_t, name_max_blen)]);
2315           const char* dup_check = cur_name_str;  // actually a numeric check
2316           do {
2317             if (unlikely(IsNotDigit(*dup_check))) {
2318               logerrprintfww("Error: Invalid %s argument '%s'.\n", argvk[0], cur_name_str);
2319               return kPglRetInvalidCmdline;
2320             }
2321           } while (*(++dup_check));
2322           if (unlikely(ScanPosintDefcapx(cur_name_str, &cur_val))) {
2323             logerrprintfww("Error: Invalid %s argument '%s'.\n", argvk[0], cur_name_str);
2324             return kPglRetInvalidCmdline;
2325           }
2326           if (range_list_ptr->starts_range[cur_param_idx]) {
2327             last_val = cur_val;
2328           } else {
2329             if (unlikely(cur_val <= last_val)) {
2330               logerrprintfww("Error: Invalid %s range '%s-%s'.\n", argvk[0], &(range_list_ptr->names[(cur_param_idx - 1) * name_max_blen]), cur_name_str);
2331               return kPglRetInvalidCmdline;
2332             }
2333             last_val = 0;
2334           }
2335         }
2336       }
2337       return kPglRetSuccess;
2338     }
2339     memcpyx(cur_name_str, range_start, rs_len, '\0');
2340     const char* dup_check = range_list_ptr->names;
2341     while (dup_check < cur_name_str) {
2342       if (unlikely(memequal(dup_check, cur_name_str, rs_len + 1))) {
2343         logerrprintfww("Error: Duplicate %s argument '%s'.\n", argvk[0], cur_name_str);
2344         return kPglRetInvalidCmdline;
2345       }
2346       dup_check = &(dup_check[name_max_blen]);
2347     }
2348     cur_name_str = &(cur_name_str[name_max_blen]);
2349     if (range_end) {
2350       *cur_name_starts_range++ = 1;
2351       memcpyx(cur_name_str, range_end, re_len, '\0');
2352       dup_check = range_list_ptr->names;
2353       while (dup_check < cur_name_str) {
2354         if (unlikely(memequal(dup_check, cur_name_str, rs_len + 1))) {
2355           logerrprintfww("Error: Duplicate %s argument '%s'.\n", argvk[0], cur_name_str);
2356           return kPglRetInvalidCmdline;
2357         }
2358         dup_check = &(dup_check[name_max_blen]);
2359       }
2360       cur_name_str = &(cur_name_str[name_max_blen]);
2361       *cur_name_starts_range++ = 0;
2362     } else {
2363       *cur_name_starts_range++ = 0;
2364     }
2365   }
2366 }
2367 
2368 
2369 uint32_t CubicRealRoots(double coef_a, double coef_b, double coef_c, STD_ARRAY_REF(double, 3) solutions) {
2370   // Additional research into numerical stability may be in order here...
2371   double a2 = coef_a * coef_a;
2372   double qq = (a2 - 3 * coef_b) * (1.0 / 9.0);
2373   double rr = (2 * a2 * coef_a - 9 * coef_a * coef_b + 27 * coef_c) * (1.0 / 54.0);
2374   double r2 = rr * rr;
2375   double q3 = qq * qq * qq;
2376   double adiv3 = coef_a * (1.0 / 3.0);
2377   double sq;
2378   double dxx;
2379   if (r2 < q3) {
2380     // three real roots
2381     sq = sqrt(qq);
2382     dxx = acos(rr / (qq * sq)) * (1.0 / 3.0);
2383     sq *= -2;
2384     solutions[0] = sq * cos(dxx) - adiv3;
2385     solutions[1] = sq * cos(dxx + (2.0 * kPi / 3.0)) - adiv3;
2386     solutions[2] = sq * cos(dxx - (2.0 * kPi / 3.0)) - adiv3;
2387     // now sort and check for within-epsilon equality
2388     if (solutions[0] > solutions[1]) {
2389       dxx = solutions[0];
2390       solutions[0] = solutions[1];
2391       if (dxx > solutions[2]) {
2392         solutions[1] = solutions[2];
2393         solutions[2] = dxx;
2394       } else {
2395         solutions[1] = dxx;
2396       }
2397       if (solutions[0] > solutions[1]) {
2398         dxx = solutions[0];
2399         solutions[0] = solutions[1];
2400         solutions[1] = dxx;
2401       }
2402     } else if (solutions[1] > solutions[2]) {
2403       dxx = solutions[1];
2404       solutions[1] = solutions[2];
2405       solutions[2] = dxx;
2406     }
2407     if (solutions[1] - solutions[0] < kEpsilon) {
2408       solutions[1] = solutions[2];
2409       return 2 - (solutions[1] - solutions[0] < kEpsilon);
2410     }
2411     return 3 - (solutions[2] - solutions[1] < kEpsilon);
2412   }
2413   dxx = -pow(fabs(rr) + sqrt(r2 - q3), 1.0 / 3.0);
2414   if (dxx == 0.0) {
2415     solutions[0] = -adiv3;
2416     return 1;
2417   }
2418   if (rr < 0.0) {
2419     dxx = -dxx;
2420   }
2421   sq = qq / dxx;
2422   solutions[0] = dxx + sq - adiv3;
2423   // use of regular epsilon here has actually burned us
2424   if (fabs(dxx - sq) >= (kEpsilon * 8)) {
2425     return 1;
2426   }
2427   if (dxx >= 0.0) {
2428     solutions[1] = solutions[0];
2429     solutions[0] = -dxx - adiv3;
2430   } else {
2431     solutions[1] = -dxx - adiv3;
2432   }
2433   return 2;
2434 }
2435 
2436 
CONSTI32(kMaxDupflagThreads, 16);

// Shared state for the parallel duplicate-flagging hash table build
// (MakeDupflagHtable and its workers).
typedef struct DupflagHtableMakerStruct {
  const uintptr_t* subset_mask;  // bitmask of item_uidx values to process
  const char* const* item_ids;  // ID strings, indexed by item_uidx
  uint32_t item_ct;  // number of selected items
  uint32_t id_htable_size;
  uint32_t item_uidx_starts[kMaxDupflagThreads];  // first item_uidx per thread

  // Entries hold an item_uidx in the low 31 bits; bit 31 is set when the
  // corresponding ID occurs more than once.  UINT32_MAX = empty slot.
  uint32_t* id_htable;
} DupflagHtableMaker;
2448 
// Inserts this thread's share of item IDs into the shared open-addressing
// (linear-probing) hash table.  Safe to run concurrently with the other
// workers: empty-slot claims go through a compare-exchange, and the
// duplicate-flag store is idempotent.
void DupflagHtableMakerMain(uint32_t tidx, uint32_t thread_ct, DupflagHtableMaker* ctx) {
  const uint32_t id_htable_size = ctx->id_htable_size;
  const uintptr_t* subset_mask = ctx->subset_mask;
  const char* const* item_ids = ctx->item_ids;
  const uint32_t item_ct = ctx->item_ct;
  const uint32_t item_uidx_start = ctx->item_uidx_starts[tidx];
  // This thread owns items [item_idx_start, item_idx_end).
  const uint32_t item_idx_start = (item_ct * S_CAST(uint64_t, tidx)) / thread_ct;
  const uint32_t item_idx_end = (item_ct * (S_CAST(uint64_t, tidx) + 1)) / thread_ct;
  uint32_t* id_htable = ctx->id_htable;

  uintptr_t cur_bits;
  uintptr_t item_uidx_base;
  BitIter1Start(subset_mask, item_uidx_start, &item_uidx_base, &cur_bits);
  for (uint32_t item_idx = item_idx_start; item_idx != item_idx_end; ++item_idx) {
    const uintptr_t item_uidx = BitIter1(subset_mask, &item_uidx_base, &cur_bits);
    const char* sptr = item_ids[item_uidx];
    const uint32_t slen = strlen(sptr);
    // Probe linearly from the initial hash position.
    for (uint32_t hashval = Hashceil(sptr, slen, id_htable_size); ; ) {
      uint32_t old_htable_entry = id_htable[hashval];
      if (old_htable_entry == UINT32_MAX) {
        if (ATOMIC_COMPARE_EXCHANGE_N_U32(&(id_htable[hashval]), &old_htable_entry, item_uidx, 0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
          // Claimed the empty slot.
          break;
        }
        // CAS failure refreshed old_htable_entry; fall through to compare IDs
        // against the entry which beat us to the slot.
      }
      if (strequal_overread(sptr, item_ids[old_htable_entry & 0x7fffffff])) {
        if (!(old_htable_entry & 0x80000000U)) {
          // ok if multiple threads do this
          id_htable[hashval] = old_htable_entry | 0x80000000U;
        }
        break;
      }
      if (++hashval == id_htable_size) {
        hashval = 0;
      }
    }
  }
}
2486 
DupflagHtableMakerThread(void * raw_arg)2487 THREAD_FUNC_DECL DupflagHtableMakerThread(void* raw_arg) {
2488   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
2489   const uint32_t tidx = arg->tidx;
2490   DupflagHtableMaker* ctx = S_CAST(DupflagHtableMaker*, arg->sharedp->context);
2491 
2492   // 1. Initialize id_htable with 1-bits in parallel.
2493   const uint32_t id_htable_size = ctx->id_htable_size;
2494   const uint32_t thread_ct = GetThreadCt(arg->sharedp) + 1;
2495   uint32_t* id_htable = ctx->id_htable;
2496   const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, tidx)) / thread_ct, kInt32PerCacheline);
2497   const uint32_t fill_end = RoundDownPow2((id_htable_size * (S_CAST(uint64_t, tidx) + 1)) / thread_ct, kInt32PerCacheline);
2498   SetAllU32Arr(fill_end - fill_start, &(id_htable[fill_start]));
2499 
2500   // 2. sync.Once
2501   if (THREAD_BLOCK_FINISH(arg)) {
2502     THREAD_RETURN;
2503   }
2504 
2505   // 3. Fill hash table in parallel, and then return.
2506   DupflagHtableMakerMain(tidx, thread_ct, ctx);
2507   THREAD_RETURN;
2508 }
2509 
// Builds the duplicate-flagging ID hash table: each entry is an item_uidx,
// with bit 31 set iff that ID appears more than once.  Work is split between
// up to (kMaxDupflagThreads - 1) spawned workers plus the calling thread,
// in two synchronized phases (table initialization, then insertion).
PglErr MakeDupflagHtable(const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t id_htable_size, uint32_t max_thread_ct, uint32_t* id_htable) {
  PglErr reterr = kPglRetSuccess;
  ThreadGroup tg;
  PreinitThreads(&tg);
  DupflagHtableMaker ctx;
  {
    // Roughly one thread per 64k items, capped by max_thread_ct and
    // kMaxDupflagThreads.
    uint32_t thread_ct = item_ct / 65536;
    if (!thread_ct) {
      thread_ct = 1;
    } else {
      if (thread_ct > max_thread_ct) {
        thread_ct = max_thread_ct;
      }
      // bugfix (26 Nov 2019): forgot to force this
      if (thread_ct > kMaxDupflagThreads) {
        thread_ct = kMaxDupflagThreads;
      }
    }
    // The calling thread serves as worker (thread_ct - 1), so only
    // (thread_ct - 1) threads are spawned.
    if (unlikely(SetThreadCt0(thread_ct - 1, &tg))) {
      goto MakeDupflagHtable_ret_NOMEM;
    }

    ctx.subset_mask = subset_mask;
    ctx.item_ids = item_ids;
    ctx.item_ct = item_ct;
    ctx.id_htable_size = id_htable_size;
    ctx.id_htable = id_htable;

    // Precompute each worker's starting item_uidx so they can iterate
    // independently over their slices of subset_mask.
    uint32_t item_uidx = AdvTo1Bit(subset_mask, 0);
    uint32_t item_idx = 0;
    ctx.item_uidx_starts[0] = item_uidx;
    for (uintptr_t tidx = 1; tidx != thread_ct; ++tidx) {
      const uint32_t item_idx_new = (item_ct * S_CAST(uint64_t, tidx)) / thread_ct;
      item_uidx = FindNth1BitFrom(subset_mask, item_uidx + 1, item_idx_new - item_idx);
      ctx.item_uidx_starts[tidx] = item_uidx;
      item_idx = item_idx_new;
    }

    if (thread_ct > 1) {
      SetThreadFuncAndData(DupflagHtableMakerThread, &ctx, &tg);
      if (unlikely(SpawnThreads(&tg))) {
        goto MakeDupflagHtable_ret_THREAD_CREATE_FAIL;
      }
    }
    // Calling thread initializes the final slice of the table...
    const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, thread_ct - 1)) / thread_ct, kInt32PerCacheline);
    SetAllU32Arr(id_htable_size - fill_start, &(id_htable[fill_start]));
    if (thread_ct > 1) {
      // ...waits for the workers' initialization pass, then releases them for
      // the final (insertion) pass...
      JoinThreads(&tg);
      DeclareLastThreadBlock(&tg);
      SpawnThreads(&tg);
    }
    // ...and participates in the insertion pass as the last worker.
    DupflagHtableMakerMain(thread_ct - 1, thread_ct, &ctx);
    JoinThreads0(&tg);
  }
  while (0) {
  MakeDupflagHtable_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  MakeDupflagHtable_ret_THREAD_CREATE_FAIL:
    reterr = kPglRetThreadCreateFail;
    break;
  }
  CleanupThreads(&tg);
  return reterr;
}
2575 
CONSTI32(kMaxDupstoreThreads, 15);  // probably reduce this after switching to XXH3
CONSTI32(kDupstoreBlockSize, 65536);
CONSTI32(kDupstoreThreadWkspace, kDupstoreBlockSize * 2 * sizeof(int32_t));

// Shared state for the store-all-duplicates hash table build
// (PopulateIdHtableMt): workers compute hash values in double-buffered
// blocks, while the parent thread performs all hash table mutation.
typedef struct DupstoreHtableMakerStruct {
  const uintptr_t* subset_mask;  // bitmask of item_uidx values to process
  const char* const* item_ids;  // ID strings, indexed by item_uidx
  uint32_t item_ct;
  uint32_t id_htable_size;
  uint32_t* id_htable;

  // Double buffers, indexed by parity.  item_uidx_start[] records where the
  // next block of items begins; hashes[] holds thread_ct * kDupstoreBlockSize
  // precomputed hash values for the parent thread to consume.
  uint32_t item_uidx_start[2];
  uint32_t* hashes[2];
} DupstoreHtableMaker;
2590 
// Worker entry point for the store-all-duplicates table build: repeatedly
// computes one kDupstoreBlockSize chunk of hash values per synchronization
// block, double-buffering through ctx->hashes[] so the parent thread can
// update the hash table with the previous chunk in parallel.
THREAD_FUNC_DECL DupstoreHtableMakerThread(void* raw_arg) {
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uint32_t tidx = arg->tidx;
  DupstoreHtableMaker* ctx = S_CAST(DupstoreHtableMaker*, arg->sharedp->context);

  // 1. Initialize id_htable with 1-bits in parallel.
  const uint32_t id_htable_size = ctx->id_htable_size;
  const uint32_t thread_ct = GetThreadCt(arg->sharedp);
  uint32_t* id_htable = ctx->id_htable;
  // Add 1 to thread_ct in denominator, since unlike the DupflagHtableMaker
  // case, the parent thread has separate logic to help out here.
  const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, tidx)) / (thread_ct + 1), kInt32PerCacheline);
  uint32_t fill_end = RoundDownPow2((id_htable_size * (S_CAST(uint64_t, tidx) + 1)) / (thread_ct + 1), kInt32PerCacheline);
  SetAllU32Arr(fill_end - fill_start, &(id_htable[fill_start]));

  const uintptr_t* subset_mask = ctx->subset_mask;
  const char* const* item_ids = ctx->item_ids;
  uint32_t items_left = ctx->item_ct;
  // Within each block of (thread_ct * kDupstoreBlockSize) items, this thread
  // handles offsets [idx_start_offset, idx_stop_offset).
  const uint32_t idx_start_offset = tidx * kDupstoreBlockSize;
  uint32_t idx_stop_offset = idx_start_offset + kDupstoreBlockSize;
  const uint32_t is_last_thread = (tidx + 1 == thread_ct);
  uint32_t parity = 0;
  while (!THREAD_BLOCK_FINISH(arg)) {
    // 2. Compute up to kDupstoreBlockSize hashes.  (Parent thread is
    //    responsible for using them to update the hash table.)
    if (items_left < idx_stop_offset) {
      if (items_left <= idx_start_offset) {
        // Must be last iteration.  Don't need to update parity.
        // (Could load-balance this iteration differently, but it shouldn't
        // really matter.)
        continue;
      }
      idx_stop_offset = items_left;
    }
    uint32_t* hashes = ctx->hashes[parity];
    // bugfix (24 Jan 2020): IsSet(subset_mask, item_uidx) is not always true.
    uintptr_t item_uidx = FindNth1BitFrom(subset_mask, ctx->item_uidx_start[parity], idx_start_offset + 1);

    uintptr_t cur_bits;
    uintptr_t item_uidx_base;
    BitIter1Start(subset_mask, item_uidx, &item_uidx_base, &cur_bits);
    for (uint32_t item_idx = idx_start_offset; item_idx != idx_stop_offset; ++item_idx) {
      item_uidx = BitIter1(subset_mask, &item_uidx_base, &cur_bits);
      const char* sptr = item_ids[item_uidx];
      const uint32_t slen = strlen(sptr);
      hashes[item_idx] = Hashceil(sptr, slen, id_htable_size);
    }
    items_left -= thread_ct * kDupstoreBlockSize;  // final-iteration underflow ok
    parity = 1 - parity;
    if (is_last_thread) {
      // Publish where the next block of items begins, for the next iteration.
      ctx->item_uidx_start[parity] = item_uidx + 1;
    }
  }
  THREAD_RETURN;
}
2646 
PopulateIdHtableMtDupstoreThreadCt(uint32_t max_thread_ct,uint32_t item_ct)2647 uint32_t PopulateIdHtableMtDupstoreThreadCt(uint32_t max_thread_ct, uint32_t item_ct) {
2648   uint32_t thread_ct = item_ct / (2 * kDupstoreBlockSize);
2649   if (thread_ct >= max_thread_ct) {
2650     // parent thread is sufficiently busy
2651     thread_ct = max_thread_ct - 1;
2652   }
2653   if (!thread_ct) {
2654     return 1;
2655   }
2656   return MINV(thread_ct, kMaxDupstoreThreads);
2657 }
2658 
2659 // dup_ct assumed to be initialized to 0 when dup_ct_ptr != nullptr.
2660 //
2661 // This currently has totally separate code paths for the store_all_dups and
2662 // !store_all_dups cases.  However, the table formats are nearly identical, so
2663 // the code may re-converge, and it's reasonable to have just this API entry
2664 // point.
2665 //
2666 // It will probably be moved out of plink2_cmdline soon.
PglErr PopulateIdHtableMt(unsigned char* arena_top, const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t store_all_dups, uint32_t id_htable_size, uint32_t thread_ct, unsigned char** arena_bottom_ptr, uint32_t* id_htable, uint32_t* dup_ct_ptr) {
  // Change from plink 1.9: if store_all_dups is false, we don't error out on
  // the first encountered duplicate ID; instead, we just flag it in the hash
  // table.  So if '.' is the only duplicate ID, and it never appears in a
  // variant ID list, plink2 never complains.
  //
  // When store_all_dups is true, additional linked lists are allocated past
  // the end of id_htable to track all raw indexes of duplicate names.
  if (!item_ct) {
    return kPglRetSuccess;
  }
  if (!store_all_dups) {
    // Simpler flag-only path; all the code below is the store-all-dups path.
    return MakeDupflagHtable(subset_mask, item_ids, item_ct, id_htable_size, thread_ct, id_htable);
  }
  PglErr reterr = kPglRetSuccess;
  unsigned char* arena_bottom_mark = *arena_bottom_ptr;
  ThreadGroup tg;
  PreinitThreads(&tg);
  DupstoreHtableMaker ctx;
  {
    thread_ct = PopulateIdHtableMtDupstoreThreadCt(thread_ct, item_ct);
    uint32_t item_idx_stop = thread_ct * kDupstoreBlockSize;
    // Double-buffered hash arrays carved off the top of the arena.
    if (arena_end_alloc_u32(arena_bottom_mark, item_idx_stop, &arena_top, &ctx.hashes[0]) ||
        arena_end_alloc_u32(arena_bottom_mark, item_idx_stop, &arena_top, &ctx.hashes[1]) ||
        SetThreadCt(thread_ct, &tg)) {
      goto PopulateIdHtableMt_ret_NOMEM;
    }
    uintptr_t cur_arena_left = arena_top - arena_bottom_mark;
    // We only want to check for out-of-memory once per sync-iteration.  The
    // maximum number of 2x-uint32 entries that might be added per iteration is
    // thread_ct * kDupstoreBlockSize * 2 (*2 is because, when we find the
    // first duplicate of a string, we need to add two new entries instead of
    // just one).
    // htable_dup_base grows from the bottom of the arena, and during the main
    // loop, extra_alloc is the next htable_dup_base[] array index that'll be
    // written to (i.e. it's always an even number).  So if extra_alloc plus
    // twice the aforementioned limit isn't larger than cur_arena_left /
    // sizeof(int32_t), we can't run out of arena space.
    uint32_t extra_alloc_stop;
#ifdef __LP64__
    if (cur_arena_left >= 0x400000000LLU + item_idx_stop * 4 * sizeof(int32_t)) {
      // this can never be hit
      extra_alloc_stop = 0xfffffffe;
    } else {
#endif
      if (unlikely(cur_arena_left < item_idx_stop * 4 * sizeof(int32_t))) {
        goto PopulateIdHtableMt_ret_NOMEM;
      }
      extra_alloc_stop = (cur_arena_left / sizeof(int32_t)) - item_idx_stop * 4;
#ifdef __LP64__
    }
#endif

    ctx.subset_mask = subset_mask;
    ctx.item_ids = item_ids;
    ctx.item_ct = item_ct;
    ctx.id_htable_size = id_htable_size;
    ctx.id_htable = id_htable;
    SetThreadFuncAndData(DupstoreHtableMakerThread, &ctx, &tg);
    if (unlikely(SpawnThreads(&tg))) {
      goto PopulateIdHtableMt_ret_THREAD_CREATE_FAIL;
    }

    // Parent thread initializes the final slice of the table while the
    // workers initialize theirs.
    const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, thread_ct)) / (thread_ct + 1), kInt32PerCacheline);
    SetAllU32Arr(id_htable_size - fill_start, &(id_htable[fill_start]));

    const uint32_t item_uidx_start = AdvTo1Bit(subset_mask, 0);
    JoinThreads(&tg);
    ctx.item_uidx_start[0] = item_uidx_start;
    if (item_idx_stop >= item_ct) {
      // Everything fits in one block, so the first worker pass is the last.
      DeclareLastThreadBlock(&tg);
      item_idx_stop = item_ct;
    }
    SpawnThreads(&tg);
    uint32_t items_left = item_ct;
    // Linked-list pairs grow upward from the arena bottom:
    // htable_dup_base[2k] = item_uidx, htable_dup_base[2k+1] = previous pair
    // index (UINT32_MAX terminates the list).
    uint32_t* htable_dup_base = R_CAST(uint32_t*, arena_bottom_mark);
    uint32_t extra_alloc = 0;
    uint32_t prev_llidx = 0;
    uintptr_t cur_bits;
    uintptr_t item_uidx_base;
    BitIter1Start(subset_mask, item_uidx_start, &item_uidx_base, &cur_bits);
    uint32_t parity = 0;
    do {
      // Wait for the workers' hash buffer for this block, then (possibly)
      // release them to compute the next block while we consume this one.
      JoinThreads(&tg);
      if (extra_alloc > extra_alloc_stop) {
        goto PopulateIdHtableMt_ret_NOMEM;
      }
      if (item_idx_stop < items_left) {
        if (item_idx_stop * 2 >= items_left) {
          DeclareLastThreadBlock(&tg);
        }
        SpawnThreads(&tg);
      } else {
        item_idx_stop = items_left;
      }
      uint32_t* hashes = ctx.hashes[parity];
      for (uint32_t item_idx = 0; item_idx != item_idx_stop; ++item_idx) {
        const uintptr_t item_uidx = BitIter1(subset_mask, &item_uidx_base, &cur_bits);
        const uint32_t hashval_base = hashes[item_idx];
        uint32_t old_htable_entry = id_htable[hashval_base];
        if (old_htable_entry == UINT32_MAX) {
          // Empty slot; no atomics needed since only this thread mutates the
          // table.
          id_htable[hashval_base] = item_uidx;
          continue;
        }
        const char* sptr = item_ids[item_uidx];
        for (uint32_t hashval = hashval_base; ; ) {
          // Bit 31 of an entry distinguishes "item_uidx" (clear) from
          // "linked-list pair index in htable_dup_base" (set).
          const uint32_t was_dup = old_htable_entry >> 31;
          uint32_t prev_uidx;
          if (was_dup) {
            prev_llidx = old_htable_entry * 2;
            prev_uidx = htable_dup_base[prev_llidx];
          } else {
            prev_uidx = old_htable_entry;
          }
          if (strequal_overread(sptr, item_ids[prev_uidx])) {
            // point to linked list entry instead
            if (!was_dup) {
              // First duplicate of this ID: move the existing uidx into a new
              // list-terminating pair.
              htable_dup_base[extra_alloc] = old_htable_entry;
              htable_dup_base[extra_alloc + 1] = UINT32_MAX;
              prev_llidx = extra_alloc;
              extra_alloc += 2;
            }
            // Prepend this uidx to the list and store the pair index (with
            // bit 31 set) in the table.
            htable_dup_base[extra_alloc] = item_uidx;
            htable_dup_base[extra_alloc + 1] = prev_llidx;
            id_htable[hashval] = 0x80000000U | (extra_alloc >> 1);
            extra_alloc += 2;
            break;  // bugfix
          }
          if (++hashval == id_htable_size) {
            hashval = 0;
          }
          // duplicate this code since we want to avoid the item_ids[item_uidx]
          // read when possible
          old_htable_entry = id_htable[hashval];
          if (old_htable_entry == UINT32_MAX) {
            id_htable[hashval] = item_uidx;
            break;
          }
        }
      }
      items_left -= item_idx_stop;
      parity = 1 - parity;
    } while (items_left);
    // bugfix (15 Oct 2019): no threads to join here!
    if (extra_alloc) {
      // Commit the linked-list storage by advancing the arena bottom.
      // bugfix: forgot to align this
      *arena_bottom_ptr += RoundUpPow2(extra_alloc * sizeof(int32_t), kCacheline);
      if (dup_ct_ptr) {
        *dup_ct_ptr = extra_alloc / 2;
      }
    }
  }
  while (0) {
  PopulateIdHtableMt_ret_NOMEM:
    *arena_bottom_ptr = arena_bottom_mark;
    reterr = kPglRetNomem;
    break;
  PopulateIdHtableMt_ret_THREAD_CREATE_FAIL:
    // not currently possible for this to happen after *arena_bottom_ptr moved
    reterr = kPglRetThreadCreateFail;
    break;
  }
  CleanupThreads(&tg);
  return reterr;
}
2832 
// Similar to DupflagHtableMaker, but we don't need to flag duplicates, we just
// "error out" when we find one.
typedef struct IdUniquenessCheckerStruct {
  const uintptr_t* subset_mask;  // bitmask of item_uidx values to check
  const char* const* item_ids;  // ID strings, indexed by item_uidx
  uint32_t item_ct;
  uint32_t id_htable_size;
  uint32_t item_uidx_starts[kMaxDupflagThreads];  // first item_uidx per thread

  uint32_t* id_htable;  // UINT32_MAX = empty slot, otherwise an item_uidx

  // Set to 1 by whichever thread first detects a duplicate; other threads
  // poll it to stop early.
  uint32_t dup_found;
} IdUniquenessChecker;
2846 
// Inserts this thread's share of item IDs into the shared hash table, setting
// ctx->dup_found and returning as soon as a duplicate ID is detected (either
// by this thread or, via periodic polling, by another one).
void IdUniquenessCheckerMain(uint32_t tidx, uint32_t thread_ct, IdUniquenessChecker* ctx) {
  const uint32_t id_htable_size = ctx->id_htable_size;
  const uintptr_t* subset_mask = ctx->subset_mask;
  const char* const* item_ids = ctx->item_ids;
  const uint32_t item_ct = ctx->item_ct;
  const uint32_t item_uidx_start = ctx->item_uidx_starts[tidx];
  // This thread owns items [item_idx_start, item_idx_end).
  const uint32_t item_idx_start = (item_ct * S_CAST(uint64_t, tidx)) / thread_ct;
  const uint32_t item_idx_end = (item_ct * (S_CAST(uint64_t, tidx) + 1)) / thread_ct;
  uint32_t* id_htable = ctx->id_htable;

  uintptr_t cur_bits;
  uintptr_t item_uidx_base;
  BitIter1Start(subset_mask, item_uidx_start, &item_uidx_base, &cur_bits);
  for (uint32_t item_idx = item_idx_start; item_idx != item_idx_end; ) {
    // Process in 65536-item batches so we can poll dup_found between batches.
    const uint32_t item_idx_cur_end = MINV(item_idx_end, item_idx + 65536);
    for (; item_idx != item_idx_cur_end; ++item_idx) {
      const uintptr_t item_uidx = BitIter1(subset_mask, &item_uidx_base, &cur_bits);
      const char* sptr = item_ids[item_uidx];
      const uint32_t slen = strlen(sptr);
      // Linear probing; empty-slot claims use compare-exchange since other
      // workers insert concurrently.
      for (uint32_t hashval = Hashceil(sptr, slen, id_htable_size); ; ) {
        uint32_t old_htable_entry = id_htable[hashval];
        if (old_htable_entry == UINT32_MAX) {
          if (ATOMIC_COMPARE_EXCHANGE_N_U32(&(id_htable[hashval]), &old_htable_entry, item_uidx, 0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
            break;
          }
          // CAS failure refreshed old_htable_entry; compare IDs below.
        }
        if (strequal_overread(sptr, item_ids[old_htable_entry & 0x7fffffff])) {
          // no synchronization needed since this variable can't change in any
          // other way
          ctx->dup_found = 1;
          return;
        }
        if (++hashval == id_htable_size) {
          hashval = 0;
        }
      }
    }
    if (ctx->dup_found) {
      // Another thread found a duplicate; stop early.
      return;
    }
  }
}
2889 
THREAD_FUNC_DECL IdUniquenessCheckerThread(void* raw_arg) {
  // Worker-thread entry point for CheckIdUniqueness().  Each worker first
  // initializes its own slice of the shared hash table, blocks until every
  // participant (including the main thread) has finished initialization, and
  // then inserts its share of the IDs.
  ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
  const uint32_t tidx = arg->tidx;
  IdUniquenessChecker* ctx = S_CAST(IdUniquenessChecker*, arg->sharedp->context);

  // 1. Initialize id_htable with 1-bits in parallel.
  //    (All-1-bits == UINT32_MAX == empty-slot sentinel.)
  const uint32_t id_htable_size = ctx->id_htable_size;
  // +1 since the main thread also acts as a participant; see
  // CheckIdUniqueness(), which passes thread_ct - 1 to SetThreadCt0().
  const uint32_t thread_ct = GetThreadCt(arg->sharedp) + 1;
  uint32_t* id_htable = ctx->id_htable;
  // Slice boundaries are cacheline-aligned so concurrent writers don't share
  // a cache line.
  const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, tidx)) / thread_ct, kInt32PerCacheline);
  const uint32_t fill_end = RoundDownPow2((id_htable_size * (S_CAST(uint64_t, tidx) + 1)) / thread_ct, kInt32PerCacheline);
  SetAllU32Arr(fill_end - fill_start, &(id_htable[fill_start]));

  // 2. sync.Once
  //    Rendezvous point: wait until all slices are initialized before any
  //    insertion begins.
  if (THREAD_BLOCK_FINISH(arg)) {
    THREAD_RETURN;
  }

  // 3. Fill hash table in parallel, and then return.
  IdUniquenessCheckerMain(tidx, thread_ct, ctx);
  THREAD_RETURN;
}
2912 
PglErr CheckIdUniqueness(unsigned char* arena_bottom, unsigned char* arena_top, const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uint32_t max_thread_ct, uint32_t* dup_found_ptr) {
  // Multithreaded duplicate-ID detection.  Builds a temporary open-addressing
  // hash table over the item_ct IDs selected by subset_mask, using
  // [arena_bottom, arena_top) as scratch space, and sets *dup_found_ptr to
  // nonzero iff two selected IDs compare equal.  Nothing is permanently
  // allocated from the arena.
  PglErr reterr = kPglRetSuccess;
  ThreadGroup tg;
  PreinitThreads(&tg);
  IdUniquenessChecker ctx;
  {
    // Heuristic: roughly one thread per 64k items, capped by the caller's
    // limit and kMaxDupflagThreads.
    uint32_t thread_ct = item_ct / 65536;
    if (!thread_ct) {
      thread_ct = 1;
    } else {
      if (thread_ct > max_thread_ct) {
        thread_ct = max_thread_ct;
      }
      if (thread_ct > kMaxDupflagThreads) {
        thread_ct = kMaxDupflagThreads;
      }
    }
    // thread_ct - 1 workers are spawned; the main thread acts as the last
    // participant.
    if (unlikely(SetThreadCt0(thread_ct - 1, &tg))) {
      goto CheckIdUniqueness_ret_NOMEM;
    }

    ctx.subset_mask = subset_mask;
    ctx.item_ids = item_ids;
    ctx.item_ct = item_ct;
    // Prefer the sparse "fast" table size; shrink toward the minimum if the
    // arena can't accommodate it.
    uint32_t id_htable_size = GetHtableFastSize(item_ct);
    {
      uintptr_t wkspace_left = S_CAST(uintptr_t, arena_top - arena_bottom);
      if (wkspace_left < id_htable_size * sizeof(int32_t)) {
        id_htable_size = wkspace_left / sizeof(int32_t);
        const uint32_t min_htable_size = GetHtableMinSize(item_ct);
        if (id_htable_size < min_htable_size) {
          goto CheckIdUniqueness_ret_NOMEM;
        }
      }
    }
    ctx.id_htable_size = id_htable_size;
    uint32_t* id_htable = R_CAST(uint32_t*, arena_bottom);
    ctx.id_htable = id_htable;
    ctx.dup_found = 0;

    // Precompute each thread's starting bit-index into subset_mask, so
    // threads can split the ID set evenly without rescanning the mask.
    uint32_t item_uidx = AdvTo1Bit(subset_mask, 0);
    uint32_t item_idx = 0;
    ctx.item_uidx_starts[0] = item_uidx;
    for (uintptr_t tidx = 1; tidx != thread_ct; ++tidx) {
      const uint32_t item_idx_new = (item_ct * S_CAST(uint64_t, tidx)) / thread_ct;
      item_uidx = FindNth1BitFrom(subset_mask, item_uidx + 1, item_idx_new - item_idx);
      ctx.item_uidx_starts[tidx] = item_uidx;
      item_idx = item_idx_new;
    }

    // Workers initialize their table slices, then block; see
    // IdUniquenessCheckerThread.
    if (thread_ct > 1) {
      SetThreadFuncAndData(IdUniquenessCheckerThread, &ctx, &tg);
      if (unlikely(SpawnThreads(&tg))) {
        goto CheckIdUniqueness_ret_THREAD_CREATE_FAIL;
      }
    }
    // The main thread initializes the last slice itself.
    const uint32_t fill_start = RoundDownPow2((id_htable_size * S_CAST(uint64_t, thread_ct - 1)) / thread_ct, kInt32PerCacheline);
    SetAllU32Arr(id_htable_size - fill_start, &(id_htable[fill_start]));
    if (thread_ct > 1) {
      // Wait for every slice to be initialized, then release the workers for
      // the table-filling step.
      JoinThreads(&tg);
      DeclareLastThreadBlock(&tg);
      SpawnThreads(&tg);
    }
    // Main thread participates in the fill as "thread" thread_ct - 1.
    IdUniquenessCheckerMain(thread_ct - 1, thread_ct, &ctx);
    JoinThreads0(&tg);
    *dup_found_ptr = ctx.dup_found;
  }
  while (0) {
  CheckIdUniqueness_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  CheckIdUniqueness_ret_THREAD_CREATE_FAIL:
    reterr = kPglRetThreadCreateFail;
    break;
  }
  CleanupThreads(&tg);
  return reterr;
}
2991 
PglErr AllocAndPopulateIdHtableMt(const uintptr_t* subset_mask, const char* const* item_ids, uintptr_t item_ct, uintptr_t fast_size_min_extra_bytes, uint32_t max_thread_ct, uint32_t** id_htable_ptr, uint32_t** htable_dup_base_ptr, uint32_t* id_htable_size_ptr, uint32_t* dup_ct_ptr) {
  // Allocates an ID hash table on the bigstack and fills it via
  // PopulateIdHtableMt().  When htable_dup_base_ptr is non-null, duplicate
  // IDs are additionally tracked in a region placed immediately after the
  // (cacheline-rounded) table.  fast_size_min_extra_bytes is the amount of
  // bigstack the caller needs to remain free for the "fast" (sparse) table
  // size to be acceptable.
  uint32_t id_htable_size = GetHtableFastSize(item_ct);
  // if store_all_dups, up to 8 bytes per variant in extra_alloc for duplicate
  //   tracking, as well as a small amount of per-thread temporary workspace
  const uint32_t store_all_dups = (htable_dup_base_ptr != nullptr);
  uintptr_t nonhtable_alloc = 0;
  if (store_all_dups) {
    const uint32_t actual_thread_ct = PopulateIdHtableMtDupstoreThreadCt(max_thread_ct, item_ct);
    // "2 *" in front of second term is temporary
    nonhtable_alloc = actual_thread_ct * kDupstoreThreadWkspace + 2 * RoundUpPow2(item_ct * 2 * sizeof(int32_t), kCacheline);
  }
  uintptr_t max_bytes = RoundDownPow2(bigstack_left(), kCacheline);
  // force max_bytes >= 5 so leqprime() doesn't fail
  // (not actually relevant any more, but whatever)
  if (nonhtable_alloc + (item_ct + 6) * sizeof(int32_t) > max_bytes) {
    return kPglRetNomem;
  }
  max_bytes -= nonhtable_alloc;
  // Shrink the table if it doesn't fit, but never below the minimum load
  // factor's size.
  if (id_htable_size * sizeof(int32_t) + fast_size_min_extra_bytes > max_bytes) {
    id_htable_size = max_bytes / sizeof(int32_t);
    // id_htable_size = leqprime((max_bytes / sizeof(int32_t)) - 1);
    const uint32_t min_htable_size = GetHtableMinSize(item_ct);
    if (id_htable_size < min_htable_size) {
      id_htable_size = min_htable_size;
    }
  }
  *id_htable_ptr = S_CAST(uint32_t*, bigstack_alloc_raw_rd(id_htable_size * sizeof(int32_t)));
  if (store_all_dups) {
    // Duplicate-tracking region starts on the cacheline boundary after the
    // table proper.
    *htable_dup_base_ptr = &((*id_htable_ptr)[RoundUpPow2(id_htable_size, kInt32PerCacheline)]);
  }
  *id_htable_size_ptr = id_htable_size;
  return PopulateIdHtableMt(g_bigstack_end, subset_mask, item_ids, item_ct, store_all_dups, id_htable_size, max_thread_ct, &g_bigstack_base, *id_htable_ptr, dup_ct_ptr);
}
3025 
3026 
Edit1Match(const char * s1,const char * s2,uint32_t len1,uint32_t len2)3027 uint32_t Edit1Match(const char* s1, const char* s2, uint32_t len1, uint32_t len2) {
3028   // Permit one difference of the following forms (Damerau-Levenshtein distance
3029   // 1):
3030   // - inserted/deleted character
3031   // - replaced character
3032   // - adjacent pair of swapped characters
3033   // Returns 1 on approximate match, 0 on nonmatch.
3034   // With only one difference allowed, it is unnecessary to use a
3035   // fully-powered quadratic-time DP algorithm; instead, given the lengths of
3036   // the two strings, we select the appropriate linear-time loop.
3037   if (abs_i32(len1 - len2) > 1) {
3038     return 0;
3039   }
3040   uintptr_t mismatch_pos = FirstUnequal(s1, s2, len1);
3041   if (mismatch_pos == len1) {
3042     // bugfix (5 Nov 2019): this was backwards...
3043     return 1;
3044   }
3045   if (len1 != len2) {
3046     // At least one mismatch guaranteed.
3047     if (len1 < len2) {
3048       return memequal(&(s1[mismatch_pos]), &(s2[mismatch_pos + 1]), len1 - mismatch_pos);
3049     }
3050     return memequal(&(s1[mismatch_pos + 1]), &(s2[mismatch_pos]), len2 - mismatch_pos);
3051   }
3052   // Strings are equal length, and we have at least one mismatch.
3053   // Two ways to still have an approximate match:
3054   // 1. Remainder of strings are equal.
3055   // 2. Next character also differs, but due to transposition.  Remainder of
3056   //    strings are equal.
3057   if (s1[mismatch_pos + 1] != s2[mismatch_pos + 1]) {
3058     if ((s1[mismatch_pos] != s2[mismatch_pos + 1]) || (s1[mismatch_pos + 1] != s2[mismatch_pos])) {
3059       return 0;
3060     }
3061     ++mismatch_pos;
3062   }
3063   const uintptr_t post_mismatch_pos = mismatch_pos + 1;
3064   return memequal(&(s1[post_mismatch_pos]), &(s2[post_mismatch_pos]), len1 - post_mismatch_pos);
3065 }
3066 
// Maximum number of '\0'-separated index terms HelpPrint() accepts in a
// single cur_params string (no runtime bounds check there).
CONSTI32(kMaxEqualHelpParams, 64);
3068 
void HelpPrint(const char* cur_params, HelpCtrl* help_ctrl_ptr, uint32_t postprint_newline, const char* payload) {
  // Conditionally prints one help-text entry.  cur_params is a sequence of
  // '\0'-terminated index terms for this entry, ended by an empty string
  // (double-null); at most kMaxEqualHelpParams terms are supported, with no
  // bounds check here.  When the user requested specific help topics
  // (param_ct != 0), this function is called in multiple passes, selected by
  // help_ctrl_ptr->iters_left:
  //   iters_left == 2: record exact matches between user topics and index
  //                    terms (no printing).
  //   iters_left == 1: record prefix matches (no printing).
  //   iters_left == 0: print payload iff some user topic matches an index
  //                    term exactly, by prefix, or within
  //                    Damerau-Levenshtein distance 1.
  // With no topic filter, payload is printed unconditionally.
  if (help_ctrl_ptr->param_ct) {
    // Split cur_params into (start, length) pairs.
    const char* params_iter = cur_params;
    uint32_t cur_param_ct = 0;
    const char* cur_param_starts[kMaxEqualHelpParams];
    uint32_t cur_param_slens[kMaxEqualHelpParams];
    while (*params_iter) {
      cur_param_starts[cur_param_ct] = params_iter;
      const uint32_t cur_slen = strlen(params_iter);
      cur_param_slens[cur_param_ct++] = cur_slen;
      params_iter = &(params_iter[cur_slen + 1]);
    }
    if (help_ctrl_ptr->iters_left) {
      // Matching passes: update the per-argument match bitmasks; nothing is
      // printed yet.  Only arguments still unmatched (0-bits in
      // all_match_arr) are considered.
      const uint32_t orig_unmatched_ct = help_ctrl_ptr->unmatched_ct;
      if (orig_unmatched_ct) {
        uint32_t arg_uidx = 0;
        if (help_ctrl_ptr->iters_left == 2) {
          // First pass: only exact matches.
          for (uint32_t arg_idx = 0; arg_idx != orig_unmatched_ct; ++arg_idx, ++arg_uidx) {
            arg_uidx = AdvTo0Bit(help_ctrl_ptr->all_match_arr, arg_uidx);
            const char* cur_arg = help_ctrl_ptr->argv[arg_uidx];
            const uint32_t cur_arg_slen = help_ctrl_ptr->param_slens[arg_uidx];
            for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
              if ((cur_arg_slen == cur_param_slens[cur_param_idx]) && memequal(cur_arg, cur_param_starts[cur_param_idx], cur_arg_slen)) {
                SetBit(arg_uidx, help_ctrl_ptr->perfect_match_arr);
                SetBit(arg_uidx, help_ctrl_ptr->prefix_match_arr);
                SetBit(arg_uidx, help_ctrl_ptr->all_match_arr);
                help_ctrl_ptr->unmatched_ct -= 1;
                break;
              }
            }
          }
        } else {
          // Second pass: Prefix matches.
          for (uint32_t arg_idx = 0; arg_idx != orig_unmatched_ct; ++arg_idx, ++arg_uidx) {
            arg_uidx = AdvTo0Bit(help_ctrl_ptr->all_match_arr, arg_uidx);
            const char* cur_arg = help_ctrl_ptr->argv[arg_uidx];
            const uint32_t cur_arg_slen = help_ctrl_ptr->param_slens[arg_uidx];
            for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
              if (cur_param_slens[cur_param_idx] > cur_arg_slen) {
                if (memequal(cur_arg, cur_param_starts[cur_param_idx], cur_arg_slen)) {
                  SetBit(arg_uidx, help_ctrl_ptr->prefix_match_arr);
                  SetBit(arg_uidx, help_ctrl_ptr->all_match_arr);
                  help_ctrl_ptr->unmatched_ct -= 1;
                  break;
                }
              }
            }
          }
        }
      }
    } else {
      // Final pass: decide whether this entry should be printed.
      uint32_t print_this = 0;
      for (uint32_t arg_uidx = 0; arg_uidx != help_ctrl_ptr->param_ct; ++arg_uidx) {
        if (IsSet(help_ctrl_ptr->prefix_match_arr, arg_uidx)) {
          // Current user-provided help argument has at least one prefix
          // match...
          if (!print_this) {
            const char* cur_arg = help_ctrl_ptr->argv[arg_uidx];
            const uint32_t cur_arg_slen = help_ctrl_ptr->param_slens[arg_uidx];
            if (IsSet(help_ctrl_ptr->perfect_match_arr, arg_uidx)) {
              // ...and at least one exact match.  So we only care about an
              // exact match.
              for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
                if ((cur_arg_slen == cur_param_slens[cur_param_idx]) && memequal(cur_arg, cur_param_starts[cur_param_idx], cur_arg_slen)) {
                  print_this = 1;
                  break;
                }
              }
            } else {
              for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
                if (cur_param_slens[cur_param_idx] > cur_arg_slen) {
                  if (memequal(cur_arg, cur_param_starts[cur_param_idx], cur_arg_slen)) {
                    print_this = 1;
                    break;
                  }
                }
              }
            }
          }
        } else {
          // Current user-provided help argument does not have an exact or
          // prefix match.  Print current help text if it's within
          // Damerau-Levenshtein distance 1 of any index term.
          for (uint32_t cur_param_idx = 0; cur_param_idx != cur_param_ct; ++cur_param_idx) {
            if (Edit1Match(cur_param_starts[cur_param_idx], help_ctrl_ptr->argv[arg_uidx], cur_param_slens[cur_param_idx], help_ctrl_ptr->param_slens[arg_uidx])) {
              print_this = 1;
              if (!IsSet(help_ctrl_ptr->all_match_arr, arg_uidx)) {
                SetBit(arg_uidx, help_ctrl_ptr->all_match_arr);
                help_ctrl_ptr->unmatched_ct -= 1;
              }
              break;
            }
          }
        }
      }
      if (print_this) {
        const uint32_t payload_slen = strlen(payload);
        const char* payload_end;
        // If payload ends with a blank line ("\n\n"), don't print the final
        // newline here; separation between entries is instead handled lazily
        // via preprint_newline below.
        if (payload[payload_slen - 2] == '\n') {
          payload_end = &(payload[payload_slen - 1]);
        } else {
          payload_end = &(payload[payload_slen]);
        }
        if (help_ctrl_ptr->preprint_newline) {
          putc_unlocked('\n', stdout);
        }
        help_ctrl_ptr->preprint_newline = postprint_newline;
        const char* payload_iter = payload;
        do {
          const char* line_end = AdvPastDelim(payload_iter, '\n');
          uint32_t line_slen = S_CAST(uint32_t, line_end - payload_iter);
          // Strip the two-character indentation prefix from each nonblank
          // line before printing.
          if (line_slen > 2) {
            payload_iter = &(payload_iter[2]);
            line_slen -= 2;
          }
          memcpyx(g_textbuf, payload_iter, line_slen, '\0');
          fputs(g_textbuf, stdout);
          payload_iter = line_end;
        } while (payload_iter < payload_end);
      }
    }
  } else {
    // No topic filter: print the entry verbatim.
    fputs(payload, stdout);
  }
}
3195 
3196 
PreinitPlink2CmdlineMeta(Plink2CmdlineMeta * pcmp)3197 void PreinitPlink2CmdlineMeta(Plink2CmdlineMeta* pcmp) {
3198   pcmp->subst_argv = nullptr;
3199   pcmp->script_buf = nullptr;
3200   pcmp->rerun_buf = nullptr;
3201   pcmp->flag_buf = nullptr;
3202   pcmp->flag_map = nullptr;
3203 }
3204 
// Canonical error strings, used by DispExitMsg() below (and available to
// other callers).
const char kErrstrNomem[] = "Error: Out of memory.  The --memory flag may be helpful.\n";
// printf-style format; the argument is typically strerror(errno).
const char kErrstrWrite[] = "Error: File write failure: %s.\n";
const char kErrstrThreadCreate[] = "Error: Failed to create thread.\n";
const char kErrstrVarRecordTooLarge[] = "Error: Variant record size exceeds ~4 GiB limit.\n";
3209 
3210 // assumes logfile is open
DispExitMsg(PglErr reterr)3211 void DispExitMsg(PglErr reterr) {
3212   if (reterr) {
3213     if (reterr == kPglRetNomem) {
3214       logputs("\n");
3215       logerrputs(kErrstrNomem);
3216       if (g_failed_alloc_attempt_size) {
3217         logerrprintf("Failed allocation size: %" PRIuPTR "\n", g_failed_alloc_attempt_size);
3218       }
3219       // kPglRetReadFail no longer gets a message here, for the same reason
3220       // kPglRetOpenFail doesn't: it's important to know which file we failed
3221       // to read.
3222     } else if (reterr == kPglRetWriteFail) {
3223       logputs("\n");
3224       if (errno) {
3225         logerrprintfww(kErrstrWrite, strerror(errno));
3226       } else {
3227         // Defensive.
3228         logerrputs("Error: File write failure: Untracked cause.\n");
3229       }
3230     } else if (reterr == kPglRetThreadCreateFail) {
3231       logputs("\n");
3232       logerrputs(kErrstrThreadCreate);
3233     } else if (reterr == kPglRetVarRecordTooLarge) {
3234       logputs("\n");
3235       logerrputs(kErrstrVarRecordTooLarge);
3236     } else if (reterr == kPglRetLongLine) {
3237       logputs("\n");
3238       logerrprintf("Error: Unhandled internal line-too-long message.\n");
3239     } else if (reterr == kPglRetEof) {
3240       logputs("\n");
3241       logerrprintf("Error: Unhandled internal EOF message.\n");
3242     }
3243   }
3244 }
3245 
// useful when there's e.g. a filename and an optional modifier, and we want
// to permit either parameter ordering
CheckExtraParam(const char * const * argvk,const char * permitted_modif,uint32_t * other_idx_ptr)3248 BoolErr CheckExtraParam(const char* const* argvk, const char* permitted_modif, uint32_t* other_idx_ptr) {
3249   const uint32_t idx_base = *other_idx_ptr;
3250   if (!strcmp(argvk[idx_base], permitted_modif)) {
3251     *other_idx_ptr = idx_base + 1;
3252   } else if (strcmp(argvk[idx_base + 1], permitted_modif)) {
3253     logerrprintf("Error: Invalid %s argument sequence.\n", argvk[0]);
3254     return 1;
3255   }
3256   return 0;
3257 }
3258 
char ExtractCharParam(const char* ss) {
  // Maps c, 'c', and "c" to c, and anything else to the null char.  This
  // lets e.g. '#' always designate a # argument regardless of how the
  // user's shell handles quoting.
  const char first_char = ss[0];
  if (!first_char) {
    return '\0';
  }
  if (!ss[1]) {
    // Unquoted single character.
    return first_char;
  }
  // Quoted single character: matching quote characters in positions 0 and 2,
  // any non-null character between them, then the terminator.
  const int is_quote_char = (first_char == '\'') || (first_char == '"');
  if (is_quote_char && (ss[2] == first_char) && (!ss[3])) {
    return ss[1];
  }
  return '\0';
}
3272 
CmdlineAllocString(const char * source,const char * flag_name,uint32_t max_slen,char ** sbuf_ptr)3273 PglErr CmdlineAllocString(const char* source, const char* flag_name, uint32_t max_slen, char** sbuf_ptr) {
3274   const uint32_t slen = strlen(source);
3275   if (slen > max_slen) {
3276     logerrprintf("Error: %s argument too long.\n", flag_name);
3277     return kPglRetInvalidCmdline;
3278   }
3279   const uint32_t blen = slen + 1;
3280   if (pgl_malloc(blen, sbuf_ptr)) {
3281     return kPglRetNomem;
3282   }
3283   memcpy(*sbuf_ptr, source, blen);
3284   return kPglRetSuccess;
3285 }
3286 
AllocFname(const char * source,const char * flagname_p,uint32_t extra_size,char ** fnbuf_ptr)3287 PglErr AllocFname(const char* source, const char* flagname_p, uint32_t extra_size, char** fnbuf_ptr) {
3288   const uint32_t blen = strlen(source) + 1;
3289   if (blen > (kPglFnamesize - extra_size)) {
3290     logerrprintf("Error: --%s filename too long.\n", flagname_p);
3291     return kPglRetOpenFail;
3292   }
3293   if (pgl_malloc(blen + extra_size, fnbuf_ptr)) {
3294     return kPglRetNomem;
3295   }
3296   memcpy(*fnbuf_ptr, source, blen);
3297   return kPglRetSuccess;
3298 }
3299 
AllocAndFlatten(const char * const * sources,uint32_t param_ct,uint32_t max_blen,char ** flattened_buf_ptr)3300 PglErr AllocAndFlatten(const char* const* sources, uint32_t param_ct, uint32_t max_blen, char** flattened_buf_ptr) {
3301   uintptr_t tot_blen = 1;
3302   for (uint32_t param_idx = 0; param_idx != param_ct; ++param_idx) {
3303     const uint32_t cur_blen = 1 + strlen(sources[param_idx]);
3304     if (cur_blen > max_blen) {
3305       return kPglRetInvalidCmdline;
3306     }
3307     tot_blen += cur_blen;
3308   }
3309   char* buf_iter;
3310   if (pgl_malloc(tot_blen, &buf_iter)) {
3311     return kPglRetNomem;
3312   }
3313   *flattened_buf_ptr = buf_iter;
3314   for (uint32_t param_idx = 0; param_idx != param_ct; ++param_idx) {
3315     buf_iter = strcpyax(buf_iter, sources[param_idx], '\0');
3316   }
3317   *buf_iter = '\0';
3318   return kPglRetSuccess;
3319 }
3320 
PglErr Rerun(const char* ver_str, const char* ver_str2, const char* prog_name_str, uint32_t rerun_argv_pos, uint32_t rerun_parameter_present, int32_t* argc_ptr, uint32_t* first_arg_idx_ptr, char*** argv_ptr, char*** subst_argv_ptr, char** rerun_buf_ptr) {
  // Implements --rerun: reads the "Options in effect:" section from a prior
  // log file, drops any logged flag (and its arguments) that also appears on
  // the current command line, and splices the surviving logged tokens into a
  // new argument vector, replacing *argv_ptr/*argc_ptr.
  //
  // caller is responsible for freeing rerun_buf

  // ok, requiring zlib/zstd here is totally not worth it
  FILE* rerunfile = nullptr;

  char** subst_argv2 = nullptr;
  char** argv = *argv_ptr;
  // Log filename either follows --rerun on the command line, or defaults to
  // "<prog_name_str>.log" built in g_textbuf below.
  char* rerun_fname = rerun_parameter_present? argv[rerun_argv_pos + 1] : g_textbuf;
  uintptr_t line_idx = 1;
  PglErr reterr = kPglRetSuccess;
  {
    if (!rerun_parameter_present) {
      char* write_iter = strcpya(rerun_fname, prog_name_str);
      snprintf(write_iter, kMaxOutfnameExtBlen, ".log");
    }
    rerunfile = fopen(rerun_fname, FOPEN_RB);
    if (unlikely(!rerunfile)) {
      goto Rerun_ret_OPEN_FAIL;
    }
    char* textbuf = g_textbuf;
    // Sentinel byte: still nonzero after fgets() iff the line fit in the
    // buffer.
    textbuf[kMaxMediumLine - 1] = ' ';
    if (unlikely(!fgets(textbuf, kMaxMediumLine, rerunfile))) {
      fputs(ver_str, stdout);
      fputs(ver_str2, stdout);
      fputs("Error: Empty log file for --rerun.\n", stderr);
      goto Rerun_ret_MALFORMED_INPUT;
    }
    if (unlikely(!textbuf[kMaxMediumLine - 1])) {
      goto Rerun_ret_LONG_LINE;
    }
    if (unlikely(!fgets(textbuf, kMaxMediumLine, rerunfile))) {
      fputs(ver_str, stdout);
      fputs(ver_str2, stdout);
      fputs("Error: Only one line in --rerun log file.\n", stderr);
      goto Rerun_ret_MALFORMED_INPUT;
    }
    ++line_idx;
    if (unlikely(!textbuf[kMaxMediumLine - 1])) {
      goto Rerun_ret_LONG_LINE;
    }
    // don't bother supporting "xx arguments: --aa bb --cc --dd" format
    // Skip lines until the exact "Options in effect:" header (the character
    // after the prefix must be a control char/newline, i.e. < ' ').
    while ((!StrStartsWithUnsafe(textbuf, "Options in effect:")) || (textbuf[strlen("Options in effect:")] >= ' ')) {
      ++line_idx;
      if (unlikely(!fgets(textbuf, kMaxMediumLine, rerunfile))) {
        fputs(ver_str, stdout);
        fputs(ver_str2, stdout);
        fputs("Error: Invalid log file for --rerun.\n", stderr);
        goto Rerun_ret_MALFORMED_INPUT;
      }
    }
    char* all_args_write_iter = textbuf;
    char* textbuf_limit = &(textbuf[kMaxMediumLine]);
    uint32_t loaded_arg_ct = 0;
    // We load each of the option lines in sequence into textbuf, always
    // overwriting the previous line's newline.  (Note that textbuf[] has
    // size > 2 * kMaxMediumLine; this lets us avoid additional
    // dynamic memory allocation as long as we impose the constraint that all
    // lines combined add up to less than kMaxMediumLine.)
    while (1) {
      all_args_write_iter[kMaxMediumLine - 1] = ' ';
      if (!fgets(all_args_write_iter, kMaxMediumLine, rerunfile)) {
        break;
      }
      ++line_idx;
      if (unlikely(!all_args_write_iter[kMaxMediumLine - 1])) {
        goto Rerun_ret_LONG_LINE;
      }
      char* arg_iter = FirstNonTspace(all_args_write_iter);
      // Blank line terminates the options section.
      if (IsEolnKns(*arg_iter)) {
        *all_args_write_iter = '\0';
        break;
      }
      // Count the whitespace-separated tokens on this line.
      char* token_end;
      do {
        token_end = CurTokenEnd(arg_iter);
        ++loaded_arg_ct;
        arg_iter = FirstNonTspace(token_end);
      } while (!IsEolnKns(*arg_iter));
      all_args_write_iter = token_end;
      if (unlikely(all_args_write_iter >= textbuf_limit)) {
        fputs(ver_str, stdout);
        fputs(ver_str2, stdout);
        fputs("Error: --rerun argument sequence too long.\n", stderr);
        goto Rerun_ret_MALFORMED_INPUT;
      }
    }
    fclose_null(&rerunfile);
    // Copy the accumulated option text out of textbuf, since textbuf is
    // about to be reused as a per-token keep/drop byte map.
    const uint32_t line_byte_ct = 1 + S_CAST(uintptr_t, all_args_write_iter - textbuf);
    char* rerun_buf;
    if (unlikely(pgl_malloc(line_byte_ct, &rerun_buf))) {
      goto Rerun_ret_NOMEM;
    }
    *rerun_buf_ptr = rerun_buf;
    memcpy(rerun_buf, textbuf, line_byte_ct);
    const uint32_t argc = *argc_ptr;
    const uint32_t first_arg_idx = *first_arg_idx_ptr;
    char* rerun_first_token = FirstNonTspace(rerun_buf);
    const char* arg_iter = rerun_first_token;
    // now use textbuf as a lame bitfield
    // (one byte per logged token: 1 = keep, 0 = overridden by the current
    // command line)
    memset(textbuf, 1, loaded_arg_ct);
    uint32_t loaded_arg_idx = 0;
    uint32_t duplicate_arg_ct = 0;
    do {
      if (unlikely(NoMoreTokensKns(arg_iter))) {
        fputs(ver_str, stdout);
        fputs(ver_str2, stdout);
        fputs("Error: Line 2 of --rerun log file has fewer tokens than expected.\n", stderr);
        goto Rerun_ret_MALFORMED_INPUT;
      }
      const char* flagname_p = IsCmdlineFlagStart(arg_iter);
      if (flagname_p) {
        // Logged token is a flag; check whether the same flag also appears on
        // the current command line.
        const uint32_t slen = strlen_se(flagname_p);
        uint32_t cmdline_arg_idx = first_arg_idx;
        for (; cmdline_arg_idx != argc; ++cmdline_arg_idx) {
          const char* later_flagname_p = IsCmdlineFlagStart(argv[cmdline_arg_idx]);
          if (later_flagname_p) {
            const uint32_t slen2 = strlen(later_flagname_p);
            if ((slen == slen2) && memequal(flagname_p, later_flagname_p, slen)) {
              cmdline_arg_idx = UINT32_MAX;
              break;
            }
          }
        }
        if (cmdline_arg_idx == UINT32_MAX) {
          // matching flag, override --rerun
          // Drop the logged flag and all of its arguments (every token up to
          // the next flag).
          do {
            ++duplicate_arg_ct;
            textbuf[loaded_arg_idx++] = 0;
            if (loaded_arg_idx == loaded_arg_ct) {
              break;
            }
            arg_iter = NextToken(arg_iter);
          } while (!IsCmdlineFlag(arg_iter));
        } else {
          ++loaded_arg_idx;
          arg_iter = NextToken(arg_iter);
        }
      } else {
        ++loaded_arg_idx;
        arg_iter = NextToken(arg_iter);
      }
    } while (loaded_arg_idx < loaded_arg_ct);
    // New argv layout: [current args before --rerun] + [surviving logged
    // tokens] + [current args after --rerun (and its parameter)].
    if (unlikely(pgl_malloc((argc + loaded_arg_ct - duplicate_arg_ct - rerun_parameter_present - 1 - first_arg_idx) * sizeof(intptr_t), &subst_argv2))) {
      goto Rerun_ret_NOMEM;
    }
    uint32_t new_arg_idx = rerun_argv_pos - first_arg_idx;
    memcpy(subst_argv2, &(argv[first_arg_idx]), new_arg_idx * sizeof(intptr_t));
    // Null-terminate each kept token in place within rerun_buf, and point the
    // new argv entries at them.
    char* arg_nullterminate_iter = rerun_first_token;
    for (loaded_arg_idx = 0; loaded_arg_idx != loaded_arg_ct; ++loaded_arg_idx) {
      arg_nullterminate_iter = FirstNonTspace(arg_nullterminate_iter);
      char* token_end = CurTokenEnd(arg_nullterminate_iter);
      if (textbuf[loaded_arg_idx]) {
        subst_argv2[new_arg_idx++] = arg_nullterminate_iter;
        *token_end = '\0';
      }
      arg_nullterminate_iter = &(token_end[1]);
    }
    const uint32_t final_copy_start_idx = rerun_argv_pos + rerun_parameter_present + 1;
    memcpy(&(subst_argv2[new_arg_idx]), &(argv[final_copy_start_idx]), (argc - final_copy_start_idx) * sizeof(intptr_t));
    *first_arg_idx_ptr = 0;
    *argc_ptr = new_arg_idx + argc - final_copy_start_idx;
    if (*subst_argv_ptr) {
      free(*subst_argv_ptr);
    }
    *subst_argv_ptr = subst_argv2;
    *argv_ptr = subst_argv2;
    // Ownership transferred; don't free on exit.
    subst_argv2 = nullptr;
  }
  while (0) {
  Rerun_ret_NOMEM:
    fputs(ver_str, stdout);
    fputs(ver_str2, stdout);
    reterr = kPglRetNomem;
    break;
  Rerun_ret_OPEN_FAIL:
    fputs(ver_str, stdout);
    fputs(ver_str2, stdout);
    fprintf(stderr, kErrprintfFopen, rerun_fname, strerror(errno));
    reterr = kPglRetOpenFail;
    break;
  Rerun_ret_LONG_LINE:
    fputs(ver_str, stdout);
    fputs(ver_str2, stdout);
    fprintf(stderr, "Error: Line %" PRIuPTR " of --rerun log file is pathologically long.\n", line_idx);
    // deliberate fall-through into the malformed-input case
  Rerun_ret_MALFORMED_INPUT:
    reterr = kPglRetMalformedInput;
    break;
  }
  free_cond(subst_argv2);
  fclose_cond(rerunfile);
  return reterr;
}
3514 
3515 
3516 // Handles --script, --rerun, --help, --version, and --silent.
3517 // subst_argv, script_buf, and rerun_buf must be initialized to nullptr.
CmdlineParsePhase1(const char * ver_str,const char * ver_str2,const char * prog_name_str,const char * notestr_null_calc2,const char * cmdline_format_str,const char * errstr_append,uint32_t max_flag_blen,PglErr (* disp_help_fn)(const char * const *,uint32_t),int * argc_ptr,char *** argv_ptr,Plink2CmdlineMeta * pcmp,uint32_t * first_arg_idx_ptr,uint32_t * flag_ct_ptr)3518 PglErr CmdlineParsePhase1(const char* ver_str, const char* ver_str2, const char* prog_name_str, const char* notestr_null_calc2, const char* cmdline_format_str, const char* errstr_append, uint32_t max_flag_blen, PglErr(* disp_help_fn)(const char* const*, uint32_t), int* argc_ptr, char*** argv_ptr, Plink2CmdlineMeta* pcmp, uint32_t* first_arg_idx_ptr, uint32_t* flag_ct_ptr) {
3519   FILE* scriptfile = nullptr;
3520   PglErr reterr = kPglRetSuccess;
3521   {
3522     int argc = *argc_ptr;
3523     const char* const* argvk = TO_CONSTCPCONSTP(*argv_ptr);
3524     char** subst_argv = nullptr;
3525     uint32_t first_arg_idx = 1;
3526     for (uint32_t arg_idx = 1; arg_idx != S_CAST(uint32_t, argc); ++arg_idx) {
3527       if ((!strcmp("-script", argvk[arg_idx])) || (!strcmp("--script", argvk[arg_idx]))) {
3528         const uint32_t param_ct = GetParamCt(argvk, argc, arg_idx);
3529         if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
3530           fputs(ver_str, stdout);
3531           fputs(ver_str2, stdout);
3532           fputs(g_logbuf, stderr);
3533           fputs(errstr_append, stderr);
3534           goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3535         }
3536         for (uint32_t arg_idx2 = arg_idx + 2; arg_idx2 != S_CAST(uint32_t, argc); ++arg_idx2) {
3537           if (unlikely((!strcmp("-script", argvk[arg_idx2])) || (!strcmp("--script", argvk[arg_idx2])))) {
3538             fputs(ver_str, stdout);
3539             fputs(ver_str2, stdout);
3540             fputs("Error: Multiple --script flags.  Merge the files into one.\n", stderr);
3541             fputs(errstr_append, stderr);
3542             goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3543           }
3544         }
3545         // logging not yet active, so don't use fopen_checked()
3546         scriptfile = fopen(argvk[arg_idx + 1], FOPEN_RB);
3547         if (unlikely(!scriptfile)) {
3548           fputs(ver_str, stdout);
3549           fputs(ver_str2, stdout);
3550           fprintf(stderr, kErrprintfFopen, argvk[arg_idx + 1], strerror(errno));
3551           goto CmdlineParsePhase1_ret_OPEN_FAIL;
3552         }
3553         if (unlikely(fseeko(scriptfile, 0, SEEK_END))) {
3554           fputs(ver_str, stdout);
3555           fputs(ver_str2, stdout);
3556           fprintf(stderr, kErrprintfFread, argvk[arg_idx + 1], strerror(errno));
3557           goto CmdlineParsePhase1_ret_READ_FAIL;
3558         }
3559         int64_t fsize = ftello(scriptfile);
3560         if (unlikely(fsize < 0)) {
3561           fputs(ver_str, stdout);
3562           fputs(ver_str2, stdout);
3563           fprintf(stderr, kErrprintfFread, argvk[arg_idx + 1], strerror(errno));
3564           goto CmdlineParsePhase1_ret_READ_FAIL;
3565         }
3566         if (unlikely(fsize > 0x7ffffffe)) {
3567           // could actually happen if user enters parameters in the wrong
3568           // order, so may as well catch it and print a somewhat informative
3569           // error message
3570           fputs(ver_str, stdout);
3571           fputs(ver_str2, stdout);
3572           fputs("Error: --script file too large.", stderr);
3573           goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3574         }
3575         rewind(scriptfile);
3576         const uint32_t fsize_ui = fsize;
3577         if (unlikely(pgl_malloc(fsize_ui + 1, &pcmp->script_buf))) {
3578           goto CmdlineParsePhase1_ret_NOMEM;
3579         }
3580         char* script_buf = pcmp->script_buf;
3581         if (unlikely(!fread_unlocked(script_buf, fsize_ui, 1, scriptfile))) {
3582           fputs(ver_str, stdout);
3583           fputs(ver_str2, stdout);
3584           fprintf(stderr, kErrprintfFread, argvk[arg_idx + 1], strerror(errno));
3585           goto CmdlineParsePhase1_ret_READ_FAIL;
3586         }
3587         script_buf[fsize_ui] = '\0';
3588         fclose_null(&scriptfile);
3589         uint32_t num_script_params = 0;
3590         char* script_buf_iter = script_buf;
3591         uint32_t char_code;
3592         do {
3593           uint32_t char_code_m1;
3594           do {
3595             char_code_m1 = ctou32(*script_buf_iter++) - 1;
3596           } while (char_code_m1 < 32);
3597           if (char_code_m1 == UINT32_MAX) {
3598             break;
3599           }
3600           ++num_script_params;
3601           do {
3602             char_code = ctou32(*script_buf_iter++);
3603           } while (char_code > 32);
3604         } while (char_code);
3605         if (unlikely(script_buf_iter != (&(script_buf[fsize_ui + 1])))) {
3606           fputs(ver_str, stdout);
3607           fputs(ver_str2, stdout);
3608           fputs("Error: Null byte in --script file.\n", stderr);
3609           goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3610         }
3611         // probable todo: detect duplicate flags in the same manner as --rerun
3612         const uint32_t new_param_ct = num_script_params + argc - 3;
3613         if (unlikely(pgl_malloc(new_param_ct * sizeof(intptr_t), &subst_argv))) {
3614           goto CmdlineParsePhase1_ret_NOMEM;
3615         }
3616         pcmp->subst_argv = subst_argv;
3617         memcpy(subst_argv, &(argvk[1]), arg_idx * sizeof(intptr_t));
3618         const uint32_t load_param_idx_end = arg_idx + num_script_params;
3619         script_buf_iter = &(script_buf[-1]);
3620         for (uint32_t param_idx = arg_idx; param_idx != load_param_idx_end; ++param_idx) {
3621           while (ctou32(*(++script_buf_iter)) <= 32);
3622           subst_argv[param_idx] = script_buf_iter;
3623           while (ctou32(*(++script_buf_iter)) > 32);
3624           // could enforce some sort of length limit here
3625           *script_buf_iter = '\0';
3626         }
3627         memcpy(&(subst_argv[load_param_idx_end]), &(argvk[arg_idx + 2]), (argc - arg_idx - 2) * sizeof(intptr_t));
3628         argc = new_param_ct;
3629         *argc_ptr = argc;
3630         first_arg_idx = 0;
3631         argvk = TO_CONSTCPCONSTP(subst_argv);
3632         *argv_ptr = subst_argv;
3633         break;
3634       }
3635     }
3636     for (uint32_t arg_idx = first_arg_idx; arg_idx != S_CAST(uint32_t, argc); ++arg_idx) {
3637       if ((!strcmp("-rerun", argvk[arg_idx])) || (!strcmp("--rerun", argvk[arg_idx]))) {
3638         const uint32_t param_ct = GetParamCt(argvk, argc, arg_idx);
3639         if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 0, 1))) {
3640           fputs(ver_str, stdout);
3641           fputs(ver_str2, stdout);
3642           fputs(g_logbuf, stderr);
3643           fputs(errstr_append, stderr);
3644           goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3645         }
3646         for (uint32_t arg_idx2 = arg_idx + param_ct + 1; arg_idx2 != S_CAST(uint32_t, argc); ++arg_idx2) {
3647           if (unlikely((!strcmp("-rerun", argvk[arg_idx2])) || (!strcmp("--rerun", argvk[arg_idx2])))) {
3648             fputs(ver_str, stdout);
3649             fputs(ver_str2, stdout);
3650             fputs("Error: Duplicate --rerun flag.\n", stderr);
3651             goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3652           }
3653         }
3654         reterr = Rerun(ver_str, ver_str2, prog_name_str, arg_idx, param_ct, &argc, &first_arg_idx, argv_ptr, &pcmp->subst_argv, &pcmp->rerun_buf);
3655         if (unlikely(reterr)) {
3656           goto CmdlineParsePhase1_ret_1;
3657         }
3658         *argc_ptr = argc;
3659         subst_argv = pcmp->subst_argv;
3660         argvk = TO_CONSTCPCONSTP(*argv_ptr);
3661         break;
3662       }
3663     }
3664     if (unlikely((first_arg_idx < S_CAST(uint32_t, argc)) && (!IsCmdlineFlag(argvk[first_arg_idx])))) {
3665       fputs("Error: First argument must be a flag.\n", stderr);
3666       fputs(errstr_append, stderr);
3667       goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3668     }
3669     uint32_t flag_ct = 0;
3670     uint32_t version_present = 0;
3671     uint32_t silent_present = 0;
3672     for (uint32_t arg_idx = first_arg_idx; arg_idx != S_CAST(uint32_t, argc); ++arg_idx) {
3673       const char* flagname_p = IsCmdlineFlagStart(argvk[arg_idx]);
3674       if (flagname_p) {
3675         const uint32_t flagname_p_slen = strlen(flagname_p);
3676         if (strequal_k(flagname_p, "help", flagname_p_slen)) {
3677           fputs(ver_str, stdout);
3678           fputs(ver_str2, stdout);
3679           if ((!first_arg_idx) || (arg_idx != 1) || subst_argv) {
3680             fputs("--help present, ignoring other flags.\n", stdout);
3681           }
3682           if ((arg_idx == S_CAST(uint32_t, argc) - 1) && flag_ct) {
3683             // make "plink [valid flags/arguments] --help" work, and skip the
3684             // arguments
3685             const char** help_argv;
3686             if (unlikely(pgl_malloc(flag_ct * sizeof(intptr_t), &help_argv))) {
3687               goto CmdlineParsePhase1_ret_NOMEM2;
3688             }
3689             uint32_t arg_idx2 = 0;
3690             for (uint32_t flag_idx = 0; flag_idx != flag_ct; ++flag_idx) {
3691               while (!IsCmdlineFlagStart(argvk[++arg_idx2]));
3692               help_argv[flag_idx] = argvk[arg_idx2];
3693             }
3694             reterr = disp_help_fn(help_argv, flag_ct);
3695             free(help_argv);
3696           } else {
3697             reterr = disp_help_fn(&(argvk[arg_idx + 1]), argc - arg_idx - 1);
3698           }
3699           if (!reterr) {
3700             reterr = kPglRetHelp;
3701           }
3702           goto CmdlineParsePhase1_ret_1;
3703         }
3704         if (strequal_k(flagname_p, "h", flagname_p_slen) ||
3705             strequal_k(flagname_p, "?", flagname_p_slen)) {
3706           // these just act like the no-argument case
3707           fputs(ver_str, stdout);
3708           fputs(ver_str2, stdout);
3709           if ((!first_arg_idx) || (arg_idx != 1) || subst_argv) {
3710             printf("-%c present, ignoring other flags.\n", *flagname_p);
3711           }
3712           fputs(cmdline_format_str, stdout);
3713           fputs(notestr_null_calc2, stdout);
3714           reterr = kPglRetHelp;
3715           goto CmdlineParsePhase1_ret_1;
3716         }
3717         if (strequal_k(flagname_p, "version", flagname_p_slen)) {
3718           version_present = 1;
3719         } else if (strequal_k(flagname_p, "silent", flagname_p_slen)) {
3720           silent_present = 1;
3721         } else if (unlikely(flagname_p_slen >= max_flag_blen)) {
3722           fputs(ver_str, stdout);
3723           fputs(ver_str2, stdout);
3724           // shouldn't be possible for this to overflow the buffer...
3725           snprintf(g_logbuf, kLogbufSize, "Error: Unrecognized flag ('%s').\n", argvk[arg_idx]);
3726           WordWrapB(0);
3727           fputs(g_logbuf, stderr);
3728           fputs(errstr_append, stderr);
3729           goto CmdlineParsePhase1_ret_INVALID_CMDLINE;
3730         }
3731         ++flag_ct;
3732       }
3733     }
3734     if (version_present) {
3735       fputs(ver_str, stdout);
3736       putc_unlocked('\n', stdout);
3737       reterr = kPglRetHelp;
3738       goto CmdlineParsePhase1_ret_1;
3739     }
3740     if (silent_present) {
3741       if (unlikely(!freopen(NULL_STREAM_NAME, "w", stdout))) {
3742         fputs("Warning: --silent failed.", stderr);
3743         g_stderr_written_to = 1;
3744       }
3745     }
3746     fputs(ver_str, stdout);
3747     fputs(ver_str2, stdout);
3748     *first_arg_idx_ptr = first_arg_idx;
3749     *flag_ct_ptr = flag_ct;
3750   }
3751   while (0) {
3752   CmdlineParsePhase1_ret_NOMEM:
3753     fputs(ver_str, stdout);
3754     fputs(ver_str2, stdout);
3755   CmdlineParsePhase1_ret_NOMEM2:
3756     fputs(kErrstrNomem, stderr);
3757     if (g_failed_alloc_attempt_size) {
3758       fprintf(stderr, "Failed allocation size: %" PRIuPTR "\n", g_failed_alloc_attempt_size);
3759     }
3760     reterr = kPglRetNomem;
3761     break;
3762   CmdlineParsePhase1_ret_OPEN_FAIL:
3763     reterr = kPglRetOpenFail;
3764     break;
3765   CmdlineParsePhase1_ret_READ_FAIL:
3766     reterr = kPglRetReadFail;
3767     break;
3768   CmdlineParsePhase1_ret_INVALID_CMDLINE:
3769     reterr = kPglRetInvalidCmdline;
3770     break;
3771   }
3772  CmdlineParsePhase1_ret_1:
3773   fclose_cond(scriptfile);
3774   return reterr;
3775 }
3776 
3777 // Assumes CmdlineParsePhase1() has completed, flag names have been copied to
3778 // flag_buf/flag_map, aliases handled, and PROG_NAME_STR has been copied to
3779 // outname (no null-terminator needed).  outname_end must be initialized to
3780 // nullptr.
3781 // This sorts the flag names so they're processed in a predictable order,
3782 // handles --out if present, initializes the log, and determines the number of
3783 // processors the OS wants us to think the machine has.
CmdlineParsePhase2(const char * ver_str,const char * errstr_append,const char * const * argvk,uint32_t prog_name_str_slen,uint32_t max_flag_blen,int32_t argc,uint32_t flag_ct,Plink2CmdlineMeta * pcmp,char * outname,char ** outname_end_ptr,char * range_delim_ptr,int32_t * known_procs_ptr,uint32_t * max_thread_ct_ptr)3784 PglErr CmdlineParsePhase2(const char* ver_str, const char* errstr_append, const char* const* argvk, uint32_t prog_name_str_slen, uint32_t max_flag_blen, int32_t argc, uint32_t flag_ct, Plink2CmdlineMeta* pcmp, char* outname, char** outname_end_ptr, char* range_delim_ptr, int32_t* known_procs_ptr, uint32_t* max_thread_ct_ptr) {
3785   PglErr reterr = kPglRetSuccess;
3786   {
3787     char* flag_buf = pcmp->flag_buf;
3788     uint32_t* flag_map = pcmp->flag_map;
3789     reterr = SortCmdlineFlags(max_flag_blen, flag_ct, flag_buf, flag_map);
3790     if (unlikely(reterr)) {
3791       if (reterr == kPglRetNomem) {
3792         goto CmdlineParsePhase2_ret_NOMEM_NOLOG;
3793       }
3794       goto CmdlineParsePhase2_ret_1;
3795     }
3796 
3797     *range_delim_ptr = '-';
3798     for (uint32_t cur_flag_idx = 0; cur_flag_idx != flag_ct; ++cur_flag_idx) {
3799       char* cur_flag = &(flag_buf[cur_flag_idx * max_flag_blen]);
3800       if (strequal_k_unsafe(cur_flag, "d")) {
3801         // Must be here, instead of in the main parse loop, for --covar-name +
3802         // --d to work.
3803         const uint32_t arg_idx = flag_map[cur_flag_idx];
3804         const uint32_t param_ct = GetParamCt(argvk, argc, arg_idx);
3805         if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
3806           fputs(g_logbuf, stderr);
3807           fputs(errstr_append, stderr);
3808           goto CmdlineParsePhase2_ret_INVALID_CMDLINE;
3809         }
3810         const char cc = ExtractCharParam(argvk[arg_idx + 1]);
3811         if (unlikely(!cc)) {
3812           fputs("Error: --d argument too long (must be a single character).\n", stderr);
3813           goto CmdlineParsePhase2_ret_INVALID_CMDLINE;
3814         }
3815         if ((cc == '-') || (cc == ',')) {
3816           fputs("Error: --d argument cannot be '-' or ','.\n", stderr);
3817           goto CmdlineParsePhase2_ret_INVALID_CMDLINE;
3818         }
3819         *range_delim_ptr = cc;
3820         // bugfix (31 Jul 2019): can't set *cur_flag = '\0' to mark the flag as
3821         // already-processed, since we haven't written the start of the .log
3822         // file yet.
3823         continue;
3824       }
3825       const int32_t memcmp_out_result = Memcmp("out", cur_flag, 4);
3826       if (!memcmp_out_result) {
3827         const uint32_t arg_idx = flag_map[cur_flag_idx];
3828         const uint32_t param_ct = GetParamCt(argvk, argc, arg_idx);
3829         if (unlikely(EnforceParamCtRange(argvk[arg_idx], param_ct, 1, 1))) {
3830           fputs(g_logbuf, stderr);
3831           fputs(errstr_append, stderr);
3832           goto CmdlineParsePhase2_ret_INVALID_CMDLINE;
3833         }
3834         if (unlikely(strlen(argvk[arg_idx + 1]) > (kPglFnamesize - kMaxOutfnameExtBlen))) {
3835           fflush(stdout);
3836           fputs("Error: --out argument too long.\n", stderr);
3837           goto CmdlineParsePhase2_ret_OPEN_FAIL;
3838         }
3839         const uint32_t slen = strlen(argvk[arg_idx + 1]);
3840         memcpy(outname, argvk[arg_idx + 1], slen + 1);
3841         *outname_end_ptr = &(outname[slen]);
3842       }
3843       if (memcmp_out_result <= 0) {
3844         break;
3845       }
3846     }
3847     if (unlikely(InitLogfile(0, outname, (*outname_end_ptr)? (*outname_end_ptr) : &(outname[prog_name_str_slen])))) {
3848       goto CmdlineParsePhase2_ret_OPEN_FAIL;
3849     }
3850     logputs_silent(ver_str);
3851     logputs_silent("\n");
3852     logputs("Options in effect:\n");
3853     for (uint32_t cur_flag_idx = 0; cur_flag_idx != flag_ct; ++cur_flag_idx) {
3854       logputs("  --");
3855       logputs(&(flag_buf[cur_flag_idx * max_flag_blen]));
3856       uint32_t arg_idx = flag_map[cur_flag_idx] + 1;
3857       while ((arg_idx < S_CAST(uint32_t, argc)) && (!IsCmdlineFlag(argvk[arg_idx]))) {
3858         logputs(" ");
3859         // Thought about special-casing argvk[arg_idx] == " ", so that
3860         // "--id-delim ' '" actually works with --rerun, but that adds too much
3861         // complexity to the rereader to be worth it.  Instead we just document
3862         // the incompatibility.
3863         logputs(argvk[arg_idx++]);
3864       }
3865       logputs("\n");
3866     }
3867     logputs("\n");
3868 
3869 #ifdef _WIN32
3870     DWORD windows_dw = kTextbufSize;
3871     if (GetComputerName(g_textbuf, &windows_dw))
3872 #else
3873     if (gethostname(g_textbuf, kTextbufSize) != -1)
3874 #endif
3875     {
3876       logputs_silent("Hostname: ");
3877       logputs_silent(g_textbuf);
3878     }
3879     logputs_silent("\nWorking directory: ");
3880     if (unlikely(!getcwd(g_textbuf, kPglFnamesize))) {
3881       logputs_silent("\n");
3882       logerrprintfww("Error: Failed to get current working directory: %s.\n", strerror(errno));
3883       // debatable what error code applies here
3884       goto CmdlineParsePhase2_ret_OPEN_FAIL;
3885     }
3886     logputs_silent(g_textbuf);
3887     logputs_silent("\n");
3888     logputs("Start time: ");
3889     time_t rawtime;
3890     time(&rawtime);
3891     logputs(ctime(&rawtime));
3892     // ctime string always has a newline at the end
3893     logputs_silent("\n");
3894 
3895     *max_thread_ct_ptr = NumCpu(known_procs_ptr);
3896     // don't subtract 1 any more since, when max_thread_ct > 2, one of the
3897     // (virtual) cores will be dedicated to I/O and have lots of downtime.
3898     //
3899     // may make kMaxThreads a parameter later.
3900     if (*max_thread_ct_ptr > kMaxThreads) {
3901       *max_thread_ct_ptr = kMaxThreads;
3902     }
3903   }
3904   while (0) {
3905   CmdlineParsePhase2_ret_NOMEM_NOLOG:
3906     fputs(kErrstrNomem, stderr);
3907     if (g_failed_alloc_attempt_size) {
3908       fprintf(stderr, "Failed allocation size: %" PRIuPTR "\n", g_failed_alloc_attempt_size);
3909     }
3910     reterr = kPglRetNomem;
3911     break;
3912   CmdlineParsePhase2_ret_OPEN_FAIL:
3913     reterr = kPglRetOpenFail;
3914     break;
3915   CmdlineParsePhase2_ret_INVALID_CMDLINE:
3916     reterr = kPglRetInvalidCmdline;
3917     break;
3918   }
3919  CmdlineParsePhase2_ret_1:
3920   return reterr;
3921 }
3922 
CmdlineParsePhase3(uintptr_t max_default_mib,uintptr_t malloc_size_mib,uint32_t memory_require,Plink2CmdlineMeta * pcmp,unsigned char ** bigstack_ua_ptr)3923 PglErr CmdlineParsePhase3(uintptr_t max_default_mib, uintptr_t malloc_size_mib, uint32_t memory_require, Plink2CmdlineMeta* pcmp, unsigned char** bigstack_ua_ptr) {
3924   PglErr reterr = kPglRetSuccess;
3925   {
3926     if (pcmp->subst_argv) {
3927       free(pcmp->subst_argv);
3928       pcmp->subst_argv = nullptr;
3929     }
3930     if (pcmp->script_buf) {
3931       free(pcmp->script_buf);
3932       pcmp->script_buf = nullptr;
3933     }
3934     if (pcmp->rerun_buf) {
3935       free(pcmp->rerun_buf);
3936       pcmp->rerun_buf = nullptr;
3937     }
3938     if (pcmp->flag_buf) {
3939       free(pcmp->flag_buf);
3940       pcmp->flag_buf = nullptr;
3941     }
3942     if (pcmp->flag_map) {
3943       free(pcmp->flag_map);
3944       pcmp->flag_map = nullptr;
3945     }
3946 
3947     uint64_t total_mib = DetectMib();
3948     if (!malloc_size_mib) {
3949       if (!total_mib) {
3950         malloc_size_mib = max_default_mib? max_default_mib : kBigstackDefaultMib;
3951       } else if (total_mib < (kBigstackMinMib * 2)) {
3952         malloc_size_mib = kBigstackMinMib;
3953       } else {
3954         malloc_size_mib = total_mib / 2;
3955         if (max_default_mib && (malloc_size_mib > max_default_mib)) {
3956           malloc_size_mib = max_default_mib;
3957         }
3958       }
3959     }
3960     assert(malloc_size_mib >= kBigstackMinMib);
3961 #ifndef __LP64__
3962     if (malloc_size_mib > kMalloc32bitMibMax) {
3963       malloc_size_mib = kMalloc32bitMibMax;
3964     }
3965 #endif
3966     if (total_mib) {
3967       snprintf(g_logbuf, kLogbufSize, "%" PRIu64 " MiB RAM detected; reserving %" PRIuPTR " MiB for main workspace.\n", total_mib, malloc_size_mib);
3968     } else {
3969       snprintf(g_logbuf, kLogbufSize, "Failed to determine total system memory.  Attempting to reserve %" PRIuPTR " MiB.\n", malloc_size_mib);
3970     }
3971     logputsb();
3972     uintptr_t malloc_mib_final;
3973     if (unlikely(InitBigstack(malloc_size_mib, &malloc_mib_final, bigstack_ua_ptr))) {
3974       goto CmdlineParsePhase3_ret_NOMEM;
3975     }
3976     if (malloc_size_mib != malloc_mib_final) {
3977       if (unlikely(memory_require)) {
3978         goto CmdlineParsePhase3_ret_NOMEM;
3979       }
3980       logprintf("Allocated %" PRIuPTR " MiB successfully, after larger attempt(s) failed.\n", malloc_mib_final);
3981     }
3982   }
3983   while (0) {
3984   CmdlineParsePhase3_ret_NOMEM:
3985     reterr = kPglRetNomem;
3986     break;
3987   }
3988   return reterr;
3989 }
3990 
CleanupPlink2CmdlineMeta(Plink2CmdlineMeta * pcmp)3991 void CleanupPlink2CmdlineMeta(Plink2CmdlineMeta* pcmp) {
3992   free_cond(pcmp->subst_argv);
3993   free_cond(pcmp->script_buf);
3994   free_cond(pcmp->rerun_buf);
3995   free_cond(pcmp->flag_buf);
3996   free_cond(pcmp->flag_map);
3997 }
3998 
3999 
InitCmpExpr(CmpExpr * cmp_expr_ptr)4000 void InitCmpExpr(CmpExpr* cmp_expr_ptr) {
4001   cmp_expr_ptr->pheno_name = nullptr;
4002 }
4003 
CleanupCmpExpr(CmpExpr * cmp_expr_ptr)4004 void CleanupCmpExpr(CmpExpr* cmp_expr_ptr) {
4005   free_cond(cmp_expr_ptr->pheno_name);
4006 }
4007 
4008 // may want CXXCONST_CP treatment later
ParseNextBinaryOp(const char * expr_str,uint32_t expr_slen,const char ** op_start_ptr,CmpBinaryOp * binary_op_ptr)4009 const char* ParseNextBinaryOp(const char* expr_str, uint32_t expr_slen, const char** op_start_ptr, CmpBinaryOp* binary_op_ptr) {
4010   // !=, <>: kCmpOperatorNoteq
4011   // <: kCmpOperatorLe
4012   // <=: kCmpOperatorLeq
4013   // =, ==: kCmpOperatorEq
4014   // >=: kCmpOperatorGeq
4015   // >: kCmpOperatorGe
4016   const char* next_eq = S_CAST(const char*, memchr(expr_str, '=', expr_slen));
4017   const char* next_lt = S_CAST(const char*, memchr(expr_str, '<', expr_slen));
4018   const char* next_gt = S_CAST(const char*, memchr(expr_str, '>', expr_slen));
4019   if (!next_eq) {
4020     if (!next_lt) {
4021       // may want to remove unlikely() later
4022       if (unlikely(!next_gt)) {
4023         return nullptr;
4024       }
4025       *op_start_ptr = next_gt;
4026       *binary_op_ptr = kCmpOperatorGe;
4027       return &(next_gt[1]);
4028     }
4029     if (next_gt == (&(next_lt[1]))) {
4030       *op_start_ptr = next_lt;
4031       *binary_op_ptr = kCmpOperatorNoteq;
4032       return &(next_lt[2]);
4033     }
4034     if ((!next_gt) || (next_gt > next_lt)) {
4035       *op_start_ptr = next_lt;
4036       *binary_op_ptr = kCmpOperatorLe;
4037       return &(next_lt[1]);
4038     }
4039     *op_start_ptr = next_gt;
4040     *binary_op_ptr = kCmpOperatorGe;
4041     return &(next_gt[1]);
4042   }
4043   if ((!next_lt) || (next_lt > next_eq)) {
4044     if ((!next_gt) || (next_gt > next_eq)) {
4045       if ((next_eq != expr_str) && (next_eq[-1] == '!')) {
4046         *op_start_ptr = &(next_eq[-1]);
4047         *binary_op_ptr = kCmpOperatorNoteq;
4048         return &(next_eq[1]);
4049       }
4050       *op_start_ptr = next_eq;
4051       *binary_op_ptr = kCmpOperatorEq;
4052       return (next_eq[1] == '=')? (&(next_eq[2])) : (&(next_eq[1]));
4053     }
4054     *op_start_ptr = next_gt;
4055     if (next_eq == (&(next_gt[1]))) {
4056       *binary_op_ptr = kCmpOperatorGeq;
4057       return &(next_gt[2]);
4058     }
4059     *binary_op_ptr = kCmpOperatorGe;
4060     return &(next_gt[1]);
4061   }
4062   if (next_gt == (&(next_lt[1]))) {
4063     *op_start_ptr = next_lt;
4064     *binary_op_ptr = kCmpOperatorNoteq;
4065     return &(next_lt[2]);
4066   }
4067   if ((!next_gt) || (next_gt > next_lt)) {
4068     *op_start_ptr = next_lt;
4069     if (next_eq == (&(next_lt[1]))) {
4070       *binary_op_ptr = kCmpOperatorLeq;
4071       return &(next_lt[2]);
4072     }
4073     *binary_op_ptr = kCmpOperatorLe;
4074     return &(next_lt[1]);
4075   }
4076   *op_start_ptr = next_gt;
4077   if (next_eq == (&(next_gt[1]))) {
4078     *binary_op_ptr = kCmpOperatorGeq;
4079     return &(next_gt[2]);
4080   }
4081   *binary_op_ptr = kCmpOperatorGe;
4082   return &(next_gt[1]);
4083 }
4084 
ValidateAndAllocCmpExpr(const char * const * sources,const char * flag_name,uint32_t param_ct,CmpExpr * cmp_expr_ptr)4085 PglErr ValidateAndAllocCmpExpr(const char* const* sources, const char* flag_name, uint32_t param_ct, CmpExpr* cmp_expr_ptr) {
4086   // Currently four use cases:
4087   //   [pheno/covar name] [operator] [pheno val]: regular comparison
4088   //   [pheno/covar name]: existence check
4089   //   [INFO key] [operator] [val]: regular comparison
4090   //   [INFO key]: existence check
4091   // Some key/value validation is deferred to LoadPvar()/KeepRemoveIf(),
4092   // since the requirements are different (e.g. no semicolons in anything
4093   // INFO-related, categorical phenotypes can be assumed to not start with a
4094   // valid number).
4095   // May support or/and, parentheses later, but need to be careful to not slow
4096   // down LoadPvar() too much in the no-INFO-filter case.
4097   PglErr reterr = kPglRetSuccess;
4098   {
4099     if (unlikely((param_ct != 1) && (param_ct != 3))) {
4100       goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
4101     }
4102     const char* pheno_name_start = sources[0];
4103     const char* pheno_val_start;
4104     uint32_t pheno_name_slen;
4105     uint32_t pheno_val_slen;
4106     if (param_ct == 3) {
4107       pheno_name_slen = strlen(pheno_name_start);
4108       const char* op_str = sources[1];
4109       uint32_t op_slen = strlen(op_str);
4110       // ok to have single/double quotes around operator
4111       if (op_slen > 2) {
4112         const char cc = op_str[0];
4113         if (((cc == '\'') || (cc == '"')) && (op_str[op_slen - 1] == cc)) {
4114           ++op_str;
4115           op_slen -= 2;
4116         }
4117       }
4118       const char* op_start;
4119       const char* op_end = ParseNextBinaryOp(op_str, op_slen, &op_start, &cmp_expr_ptr->binary_op);
4120       if (unlikely((!op_end) || (*op_end) || (op_start != op_str))) {
4121         goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
4122       }
4123       pheno_val_start = sources[2];
4124       pheno_val_slen = strlen(pheno_val_start);
4125     } else {
4126       // permit param_ct == 1 as long as tokens are unambiguous
4127       uint32_t expr_slen = strlen(pheno_name_start);
4128       const char* op_start;
4129       pheno_val_start = ParseNextBinaryOp(pheno_name_start, expr_slen, &op_start, &cmp_expr_ptr->binary_op);
4130       if (unlikely((!pheno_val_start) || (!(*pheno_val_start)) || (op_start == pheno_name_start))) {
4131         goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
4132       }
4133       pheno_name_slen = op_start - pheno_name_start;
4134       // quasi-bugfix (13 Dec 2017): allow single argument to contain internal
4135       // spaces;
4136       //   --keep-if "PHENO1 > 1"
4137       // is more intuitive and consistent with usage of other command-line
4138       // tools than
4139       //   --keep-if PHENO1 '>' 1
4140       //
4141       // To prevent --rerun from breaking, if there's a space after the
4142       // operator, there must be a space before the operator as well, etc.
4143       if (*pheno_val_start == ' ') {
4144         if (unlikely(pheno_name_start[pheno_name_slen - 1] != ' ')) {
4145           goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
4146         }
4147         do {
4148           ++pheno_val_start;
4149         } while (*pheno_val_start == ' ');
4150         do {
4151           --pheno_name_slen;
4152           if (unlikely(!pheno_name_slen)) {
4153             goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
4154           }
4155         } while (pheno_name_start[pheno_name_slen - 1] == ' ');
4156       }
4157       pheno_val_slen = expr_slen - S_CAST(uintptr_t, pheno_val_start - pheno_name_start);
4158     }
4159     if (unlikely(memchr(pheno_name_start, ' ', pheno_name_slen) || memchr(pheno_val_start, ' ', pheno_val_slen))) {
4160       goto ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC;
4161     }
4162     if (unlikely((pheno_name_slen > kMaxIdSlen) || (pheno_val_slen > kMaxIdSlen))) {
4163       logerrprintf("Error: ID too long in %s expression.\n", flag_name);
4164       goto ValidateAndAllocCmpExpr_ret_INVALID_CMDLINE;
4165     }
4166     if ((cmp_expr_ptr->binary_op != kCmpOperatorNoteq) && (cmp_expr_ptr->binary_op != kCmpOperatorEq)) {
4167       double dxx;
4168       if (unlikely(!ScantokDouble(pheno_val_start, &dxx))) {
4169         logerrprintfww("Error: Invalid %s value '%s' (finite number expected).\n", flag_name, pheno_val_start);
4170         goto ValidateAndAllocCmpExpr_ret_INVALID_CMDLINE;
4171       }
4172     }
4173     char* new_pheno_name_buf;
4174     if (unlikely(pgl_malloc(2 + pheno_name_slen + pheno_val_slen, &new_pheno_name_buf))) {
4175       goto ValidateAndAllocCmpExpr_ret_NOMEM;
4176     }
4177     memcpyx(new_pheno_name_buf, pheno_name_start, pheno_name_slen, '\0');
4178     // pheno_val_start guaranteed to be null-terminated for now
4179     memcpy(&(new_pheno_name_buf[pheno_name_slen + 1]), pheno_val_start, pheno_val_slen + 1);
4180     cmp_expr_ptr->pheno_name = new_pheno_name_buf;
4181   }
4182   while (0) {
4183   ValidateAndAllocCmpExpr_ret_NOMEM:
4184     reterr = kPglRetNomem;
4185     break;
4186   ValidateAndAllocCmpExpr_ret_INVALID_EXPR_GENERIC:
4187     logerrprintf("Error: Invalid %s expression.\n", flag_name);
4188   ValidateAndAllocCmpExpr_ret_INVALID_CMDLINE:
4189     reterr = kPglRetInvalidCmdline;
4190     break;
4191   }
4192   return reterr;
4193 }
4194 
4195 // See e.g. meta_analysis_open_and_read_header() in plink 1.9.
4196 // Assumes at least one search term.
SearchHeaderLine(const char * header_line_iter,const char * const * search_multistrs,const char * flagname_p,uint32_t search_col_ct,uint32_t * found_col_ct_ptr,uint32_t * found_type_bitset_ptr,uint32_t * col_skips,uint32_t * col_types)4197 PglErr SearchHeaderLine(const char* header_line_iter, const char* const* search_multistrs, const char* flagname_p, uint32_t search_col_ct, uint32_t* found_col_ct_ptr, uint32_t* found_type_bitset_ptr, uint32_t* col_skips, uint32_t* col_types) {
4198   assert(search_col_ct <= 32);
4199   unsigned char* bigstack_mark = g_bigstack_base;
4200   PglErr reterr = kPglRetSuccess;
4201   {
4202     uint32_t search_term_ct = 0;
4203     uintptr_t max_blen = 0;
4204     for (uint32_t search_col_idx = 0; search_col_idx != search_col_ct; ++search_col_idx) {
4205       const uint32_t cur_search_term_ct = CountAndMeasureMultistr(search_multistrs[search_col_idx], &max_blen);
4206       assert(cur_search_term_ct <= (1 << 26));
4207       search_term_ct += cur_search_term_ct;
4208     }
4209     char* merged_strbox;
4210     uint32_t* id_map;
4211     uint32_t* priority_vals;
4212     uint64_t* cols_and_types;
4213     if (unlikely(
4214             bigstack_alloc_c(search_term_ct * max_blen, &merged_strbox) ||
4215             bigstack_alloc_u32(search_term_ct, &id_map) ||
4216             bigstack_alloc_u32(search_col_ct, &priority_vals) ||
4217             bigstack_alloc_u64(search_col_ct, &cols_and_types))) {
4218       goto SearchHeaderLine_ret_NOMEM;
4219     }
4220     uint32_t search_term_idx = 0;
4221     for (uint32_t search_col_idx = 0; search_col_idx != search_col_ct; ++search_col_idx) {
4222       const char* multistr_iter = search_multistrs[search_col_idx];
4223       uint32_t priority_idx = 0;  // 0 = highest priority
4224       while (*multistr_iter) {
4225         const uint32_t cur_blen = strlen(multistr_iter) + 1;
4226         memcpy(&(merged_strbox[search_term_idx * max_blen]), multistr_iter, cur_blen);
4227         id_map[search_term_idx] = search_col_idx + (priority_idx * 32);
4228         ++search_term_idx;
4229         ++priority_idx;
4230         multistr_iter = &(multistr_iter[cur_blen]);
4231       }
4232     }
4233     SortStrboxIndexed(search_term_ct, max_blen, 0, merged_strbox, id_map);
4234     assert(search_term_ct);
4235     const char* duplicate_search_term = ScanForDuplicateIds(merged_strbox, search_term_ct, max_blen);
4236     if (unlikely(duplicate_search_term)) {
4237       logerrprintfww("Error: Duplicate term '%s' in --%s column search order.\n", duplicate_search_term, flagname_p);
4238       goto SearchHeaderLine_ret_INVALID_CMDLINE;
4239     }
4240 
4241     SetAllU32Arr(search_col_ct, priority_vals);
4242     SetAllU64Arr(search_col_ct, cols_and_types);
4243     for (uintptr_t col_idx = 0; ; ++col_idx) {
4244       const char* token_end = CurTokenEnd(header_line_iter);
4245       const uint32_t token_slen = token_end - header_line_iter;
4246       int32_t ii = bsearch_str(header_line_iter, merged_strbox, token_slen, max_blen, search_term_ct);
4247       if (ii != -1) {
4248         const uint32_t cur_map_idx = id_map[S_CAST(uint32_t, ii)];
4249         const uint32_t search_col_idx = cur_map_idx & 31;
4250         const uint32_t priority_idx = cur_map_idx >> 5;
4251         if (priority_vals[search_col_idx] >= priority_idx) {
4252           if (unlikely(priority_vals[search_col_idx] == priority_idx)) {
4253             logerrprintfww("Error: Duplicate column header '%s' in --%s file.\n", &(merged_strbox[max_blen * cur_map_idx]), flagname_p);
4254             goto SearchHeaderLine_ret_MALFORMED_INPUT;
4255           }
4256           priority_vals[search_col_idx] = priority_idx;
4257           cols_and_types[search_col_idx] = (S_CAST(uint64_t, col_idx) << 32) | search_col_idx;
4258         }
4259       }
4260       header_line_iter = FirstNonTspace(token_end);
4261       if (IsEolnKns(*header_line_iter)) {
4262         break;
4263       }
4264     }
4265     uint32_t found_type_bitset = 0;
4266     for (uint32_t search_col_idx = 0; search_col_idx != search_col_ct; ++search_col_idx) {
4267       if (priority_vals[search_col_idx] != UINT32_MAX) {
4268         found_type_bitset |= 1U << search_col_idx;
4269       }
4270     }
4271     const uint32_t found_col_ct = PopcountWord(found_type_bitset);
4272     *found_col_ct_ptr = found_col_ct;
4273     *found_type_bitset_ptr = found_type_bitset;
4274     if (found_col_ct) {
4275       STD_SORT(search_col_ct, u64cmp, cols_and_types);
4276       uint32_t prev_col_idx = cols_and_types[0] >> 32;
4277       col_skips[0] = prev_col_idx;
4278       col_types[0] = S_CAST(uint32_t, cols_and_types[0]);
4279       for (uint32_t found_col_idx = 1; found_col_idx != found_col_ct; ++found_col_idx) {
4280         const uint64_t cur_col_and_type = cols_and_types[found_col_idx];
4281         const uint32_t cur_col_idx = cur_col_and_type >> 32;
4282         col_skips[found_col_idx] = cur_col_idx - prev_col_idx;
4283         col_types[found_col_idx] = S_CAST(uint32_t, cur_col_and_type);
4284         prev_col_idx = cur_col_idx;
4285       }
4286     }
4287   }
4288   while (0) {
4289   SearchHeaderLine_ret_NOMEM:
4290     reterr = kPglRetNomem;
4291     break;
4292   SearchHeaderLine_ret_INVALID_CMDLINE:
4293     reterr = kPglRetInvalidCmdline;
4294     break;
4295   SearchHeaderLine_ret_MALFORMED_INPUT:
4296     reterr = kPglRetMalformedInput;
4297     break;
4298   }
4299   BigstackReset(bigstack_mark);
4300   return reterr;
4301 }
4302 
ParseColDescriptor(const char * col_descriptor_iter,const char * supported_ids,const char * cur_flag_name,uint32_t first_col_shifted,uint32_t default_cols_mask,uint32_t prohibit_empty,void * result_ptr)4303 PglErr ParseColDescriptor(const char* col_descriptor_iter, const char* supported_ids, const char* cur_flag_name, uint32_t first_col_shifted, uint32_t default_cols_mask, uint32_t prohibit_empty, void* result_ptr) {
4304   // col_descriptor is usually a pointer to argv[...][5] (first five characters
4305   // are "cols=").  supported_ids is a multistr.
4306   // may need to switch first_col_shifted/default_cols_mask/result to uint64_t
4307   PglErr reterr = kPglRetSuccess;
4308   uint32_t* id_map = nullptr;
4309   {
4310     uint32_t max_id_blen = 0;
4311     uint32_t id_ct = 0;
4312 
4313     // work around strchr not returning a const char*?
4314     const char* supported_ids_iter = supported_ids;
4315 
4316     // can precompute this sorted index and avoid the dynamic
4317     // allocations/deallocations, but this is cheap enough that I'd rather make
4318     // it easier to extend functionality.
4319     do {
4320       const uint32_t blen = 1 + strlen(supported_ids_iter);
4321       if (blen > max_id_blen) {
4322         max_id_blen = blen;
4323       }
4324       ++id_ct;
4325       supported_ids_iter = &(supported_ids_iter[blen]);
4326     } while (*supported_ids_iter);
4327     // max_id_blen + 4 extra bytes at the end, to support a "maybe" search
4328     // (yes, this can also be precomputed)
4329     if (unlikely(pgl_malloc((max_id_blen + 4) * (id_ct + 1) + 1, &id_map))) {
4330       goto ParseColDescriptor_ret_NOMEM;
4331     }
4332     char* sorted_ids = R_CAST(char*, &(id_map[id_ct]));
4333     supported_ids_iter = supported_ids;
4334     for (uint32_t id_idx = 0; id_idx != id_ct; ++id_idx) {
4335       const uint32_t blen = strlen(supported_ids_iter) + 1;
4336       memcpy(&(sorted_ids[id_idx * max_id_blen]), supported_ids_iter, blen);
4337       id_map[id_idx] = id_idx;
4338       supported_ids_iter = &(supported_ids_iter[blen]);
4339     }
4340     if (unlikely(SortStrboxIndexedMalloc(id_ct, max_id_blen, sorted_ids, id_map))) {
4341       goto ParseColDescriptor_ret_NOMEM;
4342     }
4343     uint32_t result = *S_CAST(uint32_t*, result_ptr);
4344     // might not want to bother splitting this into two loops
4345     if ((col_descriptor_iter[0] == '+') || (col_descriptor_iter[0] == '-')) {
4346       result |= default_cols_mask;
4347       char* maybebuf = &(sorted_ids[max_id_blen * id_ct]);
4348       memcpy_k(maybebuf, "maybe", 5);
4349       while (1) {
4350         const char* id_start = &(col_descriptor_iter[1]);
4351         const char* tok_end = strchrnul(id_start, ',');
4352         const uint32_t slen = tok_end - id_start;
4353         int32_t alpha_idx = bsearch_str(id_start, sorted_ids, slen, max_id_blen, id_ct);
4354         if (unlikely(alpha_idx == -1)) {
4355           char* write_iter = strcpya_k(g_logbuf, "Error: Unrecognized ID '");
4356           write_iter = memcpya(write_iter, id_start, slen);
4357           write_iter = strcpya_k(write_iter, "' in --");
4358           write_iter = strcpya(write_iter, cur_flag_name);
4359           snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " column set descriptor.\n");
4360           goto ParseColDescriptor_ret_INVALID_CMDLINE_WW;
4361         }
4362         uint32_t shift = id_map[S_CAST(uint32_t, alpha_idx)];
4363         if (col_descriptor_iter[0] == '+') {
4364           result |= first_col_shifted << shift;
4365         } else {
4366           if (result & (first_col_shifted << shift)) {
4367             result -= first_col_shifted << shift;
4368           } else if (slen + 5 < max_id_blen) {
4369             // special case: if default column set includes e.g. "maybesid",
4370             // and user types "-sid", that should work
4371             memcpy(&(maybebuf[5]), id_start, slen);
4372             alpha_idx = bsearch_str(maybebuf, sorted_ids, slen + 5, max_id_blen, id_ct);
4373             if (alpha_idx != -1) {
4374               shift = id_map[S_CAST(uint32_t, alpha_idx)];
4375               result &= ~(first_col_shifted << shift);
4376             }
4377           }
4378         }
4379         // bugfix (16 Oct 2017): forgot to switch from !tok_end to !(*tok_end)
4380         // after use of strchrnul().
4381         if (!(*tok_end)) {
4382           break;
4383         }
4384         col_descriptor_iter = &(tok_end[1]);
4385         if (unlikely((col_descriptor_iter[0] != '+') && (col_descriptor_iter[0] != '-'))) {
4386           goto ParseColDescriptor_ret_MIXED_SIGN;
4387         }
4388       }
4389     } else if (*col_descriptor_iter) {
4390       while (1) {
4391         const char* tok_end = strchrnul(col_descriptor_iter, ',');
4392         const uint32_t slen = tok_end - col_descriptor_iter;
4393         const int32_t alpha_idx = bsearch_str(col_descriptor_iter, sorted_ids, slen, max_id_blen, id_ct);
4394         if (unlikely(alpha_idx == -1)) {
4395           char* write_iter = strcpya_k(g_logbuf, "Error: Unrecognized ID '");
4396           write_iter = memcpya(write_iter, col_descriptor_iter, slen);
4397           write_iter = strcpya_k(write_iter, "' in --");
4398           write_iter = strcpya(write_iter, cur_flag_name);
4399           snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " column set descriptor.\n");
4400           goto ParseColDescriptor_ret_INVALID_CMDLINE_WW;
4401         }
4402         uint32_t shift = id_map[S_CAST(uint32_t, alpha_idx)];
4403         result |= first_col_shifted << shift;
4404         if (!(*tok_end)) {
4405           break;
4406         }
4407         col_descriptor_iter = &(tok_end[1]);
4408         if (unlikely((col_descriptor_iter[0] == '+') || (col_descriptor_iter[0] == '-'))) {
4409           goto ParseColDescriptor_ret_MIXED_SIGN;
4410         }
4411       }
4412     }
4413     if (unlikely(prohibit_empty && (!(result & (first_col_shifted * (UINT32_MAX >> (32 - id_ct))))))) {
4414       char* write_iter = strcpya_k(g_logbuf, "Error: All columns excluded by --");
4415       write_iter = strcpya(write_iter, cur_flag_name);
4416       snprintf(write_iter, kLogbufSize - 128, " column set descriptor.\n");
4417       goto ParseColDescriptor_ret_INVALID_CMDLINE_WW;
4418     }
4419     *S_CAST(uint32_t*, result_ptr) = result;
4420   }
4421   while (0) {
4422   ParseColDescriptor_ret_NOMEM:
4423     reterr = kPglRetNomem;
4424     break;
4425   ParseColDescriptor_ret_MIXED_SIGN:
4426     snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s column set descriptor (either all column set IDs must be preceded by +/-, or none of them can be).\n", cur_flag_name);
4427   ParseColDescriptor_ret_INVALID_CMDLINE_WW:
4428     WordWrapB(0);
4429     logerrputsb();
4430     reterr = kPglRetInvalidCmdline;
4431     break;
4432   }
4433   free_cond(id_map);
4434   return reterr;
4435 }
4436 
4437 #ifdef __cplusplus
4438 }  // namespace plink2
4439 #endif
4440