1 // This file is part of PLINK 1.90, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "plink_common.h"
19 
20 // #include "pigz.h"
21 
// no leading \n since this is used in LOGPRINTFWW expressions
const char g_errstr_fopen[] = "Error: Failed to open %s.\n";

// usage synopsis template; PROG_NAME_STR is substituted at compile time
const char g_cmdline_format_str[] = "\n  " PROG_NAME_STR " <input flag(s)...> [command flag(s)...] [other flag(s)...]\n  " PROG_NAME_STR " --help [flag name(s)...]\n\n";

// general-purpose scratch buffer, shared by line readers and formatters
char g_textbuf[TEXTBUF_SIZE];
28 
// note that \xxx character constants are interpreted in octal.
// technically no need to represent 0-31, but 64 extra bytes of data is
// probably cheaper than the code to subtract 32 everywhere.
// g_one_char_strs[2 * c] is a null-terminated one-character string for the
// character with code c.
const char g_one_char_strs[] = "\0\0\1\0\2\0\3\0\4\0\5\0\6\0\7\0\10\0\11\0\12\0\13\0\14\0\15\0\16\0\17\0\20\0\21\0\22\0\23\0\24\0\25\0\26\0\27\0\30\0\31\0\32\0\33\0\34\0\35\0\36\0\37\0\40\0\41\0\42\0\43\0\44\0\45\0\46\0\47\0\50\0\51\0\52\0\53\0\54\0\55\0\56\0\57\0\60\0\61\0\62\0\63\0\64\0\65\0\66\0\67\0\70\0\71\0\72\0\73\0\74\0\75\0\76\0\77\0\100\0\101\0\102\0\103\0\104\0\105\0\106\0\107\0\110\0\111\0\112\0\113\0\114\0\115\0\116\0\117\0\120\0\121\0\122\0\123\0\124\0\125\0\126\0\127\0\130\0\131\0\132\0\133\0\134\0\135\0\136\0\137\0\140\0\141\0\142\0\143\0\144\0\145\0\146\0\147\0\150\0\151\0\152\0\153\0\154\0\155\0\156\0\157\0\160\0\161\0\162\0\163\0\164\0\165\0\166\0\167\0\170\0\171\0\172\0\173\0\174\0\175\0\176\0\177\0\200\0\201\0\202\0\203\0\204\0\205\0\206\0\207\0\210\0\211\0\212\0\213\0\214\0\215\0\216\0\217\0\220\0\221\0\222\0\223\0\224\0\225\0\226\0\227\0\230\0\231\0\232\0\233\0\234\0\235\0\236\0\237\0\240\0\241\0\242\0\243\0\244\0\245\0\246\0\247\0\250\0\251\0\252\0\253\0\254\0\255\0\256\0\257\0\260\0\261\0\262\0\263\0\264\0\265\0\266\0\267\0\270\0\271\0\272\0\273\0\274\0\275\0\276\0\277\0\300\0\301\0\302\0\303\0\304\0\305\0\306\0\307\0\310\0\311\0\312\0\313\0\314\0\315\0\316\0\317\0\320\0\321\0\322\0\323\0\324\0\325\0\326\0\327\0\330\0\331\0\332\0\333\0\334\0\335\0\336\0\337\0\340\0\341\0\342\0\343\0\344\0\345\0\346\0\347\0\350\0\351\0\352\0\353\0\354\0\355\0\356\0\357\0\360\0\361\0\362\0\363\0\364\0\365\0\366\0\367\0\370\0\371\0\372\0\373\0\374\0\375\0\376\0\377";
// both point at the "0" entry (2 * 48 == 96)
const char* g_missing_geno_ptr = &(g_one_char_strs[96]);
const char* g_output_missing_geno_ptr = &(g_one_char_strs[96]);
35 
// size of the most recent failed allocation attempt, for error reporting
uintptr_t g_failed_alloc_attempt_size = 0;

// SFMT random number generator state
sfmt_t g_sfmt;

// log file handle; set during log initialization
FILE* g_logfile = nullptr;

// staging buffer for log messages (used by LOGPRINTF/logprintb etc.)
char g_logbuf[MAXLINELEN * 2];

uint32_t g_debug_on = 0;    // when set, the log is flushed after every write
uint32_t g_log_failed = 0;  // set once a log write has failed
uint32_t g_thread_ct;
47 
aligned_malloc(uintptr_t size,uintptr_t ** aligned_pp)48 uint32_t aligned_malloc(uintptr_t size, uintptr_t** aligned_pp) {
49 #if defined __LP64__ && !defined __APPLE__
50   // Avoid random segfaults on 64-bit machines which have 8-byte- instead of
51   // 16-byte-aligned malloc().  (Slightly different code is needed if malloc()
52   // does not even guarantee 8-byte alignment.)
53   uintptr_t* malloc_ptr = (uintptr_t*)malloc(size + VEC_BYTES);
54   if (!malloc_ptr) {
55     g_failed_alloc_attempt_size = size + VEC_BYTES;
56     return 1;
57   }
58   *aligned_pp = (uintptr_t*)((((uintptr_t)malloc_ptr) + VEC_BYTES) & (~(VEC_BYTES_M1 * ONELU)));
59   (*aligned_pp)[-1] = (uintptr_t)malloc_ptr;
60 #else
61   // no SSE2 concerns here
62   *aligned_pp = (uintptr_t*)malloc(size);
63   if (!(*aligned_pp)) {
64     g_failed_alloc_attempt_size = size;
65     return 1;
66   }
67 #endif
68   return 0;
69 }
70 
void aligned_free(uintptr_t* aligned_pp) {
  // Releases a block obtained from aligned_malloc().
#if defined __LP64__ && !defined __APPLE__
  // the raw malloc() pointer was stored just below the aligned region
  uintptr_t raw_addr = aligned_pp[-1];
  free((uintptr_t*)raw_addr);
#else
  free(aligned_pp);
#endif
}
78 
uint32_t push_ll_str(const char* ss, Ll_str** ll_stack_ptr) {
  // Pushes a copy of ss onto the given linked-list string stack.
  // Returns 1 on allocation failure, 0 on success.
  const uintptr_t str_bytes = strlen(ss) + 1;
  const uintptr_t alloc_size = sizeof(Ll_str) + str_bytes;
  Ll_str* new_entry = (Ll_str*)malloc(alloc_size);
  if (!new_entry) {
    g_failed_alloc_attempt_size = alloc_size;
    return 1;
  }
  memcpy(new_entry->ss, ss, str_bytes);
  new_entry->next = *ll_stack_ptr;
  *ll_stack_ptr = new_entry;
  return 0;
}
91 
logstr(const char * ss)92 void logstr(const char* ss) {
93   if (!g_debug_on) {
94     fputs(ss, g_logfile);
95     if (ferror(g_logfile)) {
96       putc_unlocked('\n', stdout);
97       fflush(stdout);
98       fprintf(stderr, "Warning: Logging failure on:\n%s\nFurther logging will not be attempted in this run.\n", ss);
99       g_log_failed = 1;
100     }
101   } else {
102     if (g_log_failed) {
103       fflush(stdout);
104       fputs(ss, stderr);
105     } else {
106       fputs(ss, g_logfile);
107       if (ferror(g_logfile)) {
108 	putc_unlocked('\n', stdout);
109 	fflush(stdout);
110         fprintf(stderr, "Error: Debug logging failure.  Dumping to stderr:\n%s", ss);
111 	g_log_failed = 1;
112       } else {
113 	fflush(g_logfile);
114       }
115     }
116   }
117 }
118 
void logprint(const char* ss) {
  // Writes ss to the log and echoes it to stdout.
  logstr(ss);
  fputs(ss, stdout);
}
123 
void logerrprint(const char* ss) {
  // Writes ss to the log and echoes it to stderr (flushing stdout first so
  // the two streams don't interleave on a terminal).
  logstr(ss);
  fflush(stdout);
  fputs(ss, stderr);
}
129 
void logprintb() {
  // Writes the preformatted contents of g_logbuf to the log and to stdout.
  logstr(g_logbuf);
  fputs(g_logbuf, stdout);
}
134 
void logerrprintb() {
  // Writes the preformatted contents of g_logbuf to the log and to stderr.
  logstr(g_logbuf);
  fflush(stdout);
  fputs(g_logbuf, stderr);
}
140 
void wordwrap(uint32_t suffix_len, char* ss) {
  // Input: A null-terminated string with no intermediate newlines.  If
  //        suffix_len is zero, there should be a terminating \n; otherwise,
  //        the last character should be a space.
  // Effect: Spaces are replaced with newlines in a manner that plays well with
  //         80 column terminal windows.  (Multi-space blocks are never
  //         collapsed.)
  char* token_start = ss;
  char* line_end = &(ss[79]);
  char* token_end;
  while (1) {
    // advance past delimiting spaces (runs of spaces are preserved as-is)
    while (*token_start == ' ') {
      token_start++;
    }
    if (token_start > line_end) {
      // spaces alone carried us past the line boundary; emit newlines until
      // the current line window contains token_start
      do {
        *line_end = '\n';
        line_end = &(line_end[80]);
      } while (token_start > line_end);
    }
    token_end = strchr(token_start, ' ');
    if (!token_end) {
      // this is the final token of the string
      if (&(token_start[79]) == line_end) {
        // token already starts its own line; nothing more can be done
        return;
      }
      token_end = strchr(token_start, '\0');
      if (!suffix_len) {
        if (token_end <= &(line_end[1])) {
          // okay if end-of-string is one past the end, because function
          // assumes last character is \n in suffix_len == 0 case
          assert(token_end[-1] == '\n');
          return;
        }
      } else {
        if (&(token_end[suffix_len]) <= line_end) {
          // token plus suffix still fits on the current line
          return;
        }
        // because of terminal space assumption, token_start actually points
        // to the end of the string
        assert(token_start[-1] == ' ');
      }
      // push the final token (or just the suffix) onto its own line
      token_start[-1] = '\n';
      return;
    }
    if (token_end > line_end) {
      // current token overflows the line
      if (&(token_start[79]) != line_end) {
        // wrap: start a fresh line with this token
        token_start[-1] = '\n';
        line_end = &(token_start[79]);
        if (token_end > line_end) {
          // single really long token, can't do anything beyond putting it on
          // its own line
          *token_end = '\n';
          line_end = &(token_end[80]);
        }
      } else {
        // single really long token, *and* previous token was either
        // nonexistent or long
        *token_end = '\n';
        line_end = &(token_end[80]);
      }
    }
    token_start = &(token_end[1]);
  }
}
205 
void wordwrapb(uint32_t suffix_len) {
  // wordwrap() applied in place to the shared log buffer.
  wordwrap(suffix_len, g_logbuf);
}
209 
fopen_checked(const char * fname,const char * mode,FILE ** target_ptr)210 int32_t fopen_checked(const char* fname, const char* mode, FILE** target_ptr) {
211   *target_ptr = fopen(fname, mode);
212   if (!(*target_ptr)) {
213     LOGERRPRINTFWW(g_errstr_fopen, fname);
214     return -1;
215   }
216   return 0;
217 }
218 
int32_t fwrite_checked(const void* buf, size_t len, FILE* outfile) {
  // fwrite() wrapper which splits large writes into sub-2GB chunks (OS X
  // can't perform 2GB+ writes) and reports failure via ferror().
  // 0x7ffff000 is the largest multiple of the typical 4kb disk block size
  // below 2^31.
  const unsigned char* read_ptr = (const unsigned char*)buf;
  while (len > 0x7ffff000) {
    fwrite(read_ptr, 1, 0x7ffff000, outfile);
    read_ptr = &(read_ptr[0x7ffff000]);
    len -= 0x7ffff000;
  }
  fwrite(read_ptr, 1, len, outfile);
  return ferror(outfile);
}
231 
int32_t gzopen_read_checked(const char* fname, gzFile* gzf_ptr) {
  // Opens a (possibly gzipped) file for reading, logging an error message on
  // failure.  Returns 0, RET_OPEN_FAIL, or RET_NOMEM.
  gzFile gzf = gzopen(fname, FOPEN_RB);
  *gzf_ptr = gzf;
  if (!gzf) {
    LOGERRPRINTFWW(g_errstr_fopen, fname);
    return RET_OPEN_FAIL;
  }
  // enlarge the decompression buffer to 128KB
  if (gzbuffer(gzf, 131072)) {
    return RET_NOMEM;
  }
  return 0;
}
243 
// manually managed, very large stack
unsigned char* g_bigstack_base;  // allocation frontier, grows upward
unsigned char* g_bigstack_end;   // end-allocation frontier, grows downward
247 
bigstack_alloc(uintptr_t size)248 unsigned char* bigstack_alloc(uintptr_t size) {
249   unsigned char* alloc_ptr;
250   size = round_up_pow2(size, CACHELINE);
251   if (bigstack_left() < size) {
252     g_failed_alloc_attempt_size = size;
253     return nullptr;
254   }
255   alloc_ptr = g_bigstack_base;
256   g_bigstack_base += size;
257   return alloc_ptr;
258 }
259 
bigstack_shrink_top(const void * rebase,uintptr_t new_size)260 void bigstack_shrink_top(const void* rebase, uintptr_t new_size) {
261   uintptr_t freed_bytes = ((uintptr_t)(g_bigstack_base - ((unsigned char*)rebase))) - round_up_pow2(new_size, CACHELINE);
262   g_bigstack_base -= freed_bytes;
263 }
264 
bigstack_end_alloc_presized(uintptr_t size)265 unsigned char* bigstack_end_alloc_presized(uintptr_t size) {
266   assert(!(size & END_ALLOC_CHUNK_M1));
267   uintptr_t cur_bigstack_left = bigstack_left();
268   if (size > cur_bigstack_left) {
269     g_failed_alloc_attempt_size = size;
270     return nullptr;
271   } else {
272     g_bigstack_end -= size;
273     return g_bigstack_end;
274   }
275 }
276 
uint32_t match_upper(const char* ss, const char* fixed_str) {
  // Returns 1 iff ss case-insensitively equals fixed_str, which must consist
  // entirely of uppercase letters, and has exactly the same length.
  for (char cc = *fixed_str++; cc; cc = *fixed_str++) {
    // clearing bit 5 maps lowercase ASCII letters onto uppercase
    if ((((unsigned char)(*ss++)) & 0xdf) != ((unsigned char)cc)) {
      return 0;
    }
  }
  // require ss to end where fixed_str does
  return !(*ss);
}
287 
uint32_t match_upper_counted(const char* ss, const char* fixed_str, uint32_t ct) {
  // Returns 1 iff the first ct bytes of ss case-insensitively match the
  // uppercase string fixed_str.  ct must be positive.
  for (uint32_t uii = 0; uii < ct; uii++) {
    // clearing bit 5 maps lowercase ASCII letters onto uppercase
    if ((((unsigned char)ss[uii]) & 0xdf) != ((unsigned char)fixed_str[uii])) {
      return 0;
    }
  }
  return 1;
}
296 
297 #ifdef __LP64__
static inline uint32_t scan_uint_capped_finish(const char* ss, uint64_t cap, uint32_t* valp) {
  // Finishes parsing a nonnegative decimal integer whose leading digit(s)
  // are already stored in *valp.  Stops at the first nondigit character.
  // Returns 1 (leaving *valp unchanged) if the value exceeds cap.
  uint64_t cur_val = *valp;
  while (1) {
    // a little bit of unrolling seems to help: consume digits in pairs
    const uint64_t digit_hi = (uint64_t)((unsigned char)(*ss++)) - 48;
    if (digit_hi >= 10) {
      break;
    }
    const uint64_t digit_lo = (uint64_t)((unsigned char)(*ss++)) - 48;
    if (digit_lo >= 10) {
      // odd digit count; fold in the last digit and stop
      cur_val = cur_val * 10 + digit_hi;
      if (cur_val > cap) {
        return 1;
      }
      break;
    }
    cur_val = cur_val * 100 + digit_hi * 10 + digit_lo;
    if (cur_val > cap) {
      return 1;
    }
  }
  *valp = cur_val;
  return 0;
}
323 
uint32_t scan_posint_capped(const char* ss, uint64_t cap, uint32_t* valp) {
  // Reads a positive integer in [1, cap]; returns 1 on failure.
  // '0' has ascii code 48, so a nondigit maps to a value >= 10 here.
  *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
  if (*valp >= 10) {
    // permit leading '+' (ascii 43 -> 0xfffffffbU), but not '++' or '+-'
    if (*valp != 0xfffffffbU) {
      return 1;
    }
    *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (*valp >= 10) {
      return 1;
    }
  }
  // skip leading zeroes; at least one nonzero digit is required
  while (!(*valp)) {
    *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
    if ((*valp) >= 10) {
      return 1;
    }
  }
  return scan_uint_capped_finish(ss, cap, valp);
}
345 
uint32_t scan_uint_capped(const char* ss, uint64_t cap, uint32_t* valp) {
  // Reads an integer in [0, cap].  Assumes first character is nonspace.
  // Accepts a single leading '+', and "-0"/"-00"/... .
  uint32_t val = (uint32_t)((unsigned char)(*ss++)) - 48;
  if (val >= 10) {
    if (val == 0xfffffffdU) {
      // '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU;
      // only "-0" (with any number of zeroes) is acceptable
      if (*ss != '0') {
        return 1;
      }
      do {
        ss++;
      } while (*ss == '0');
      *valp = 0;
      // fail iff another digit follows the zeroes
      return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
    }
    // otherwise only a leading '+' is permitted
    if (val != 0xfffffffbU) {
      return 1;
    }
    val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (val >= 10) {
      return 1;
    }
  }
  *valp = val;
  return scan_uint_capped_finish(ss, cap, valp);
}
369 
uint32_t scan_int_abs_bounded(const char* ss, uint64_t bound, int32_t* valp) {
  // Reads an integer in [-bound, bound].  Assumes first character is nonspace.
  // Returns 1 on failure.
  *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
  int32_t sign = 1;
  if (((uint32_t)*valp) >= 10) {
    // '-' is ascii 45, so 45 - 48 == -3; '+' is ascii 43, so 43 - 48 == -5
    if (*valp == -3) {
      sign = -1;
    } else if (*valp != -5) {
      return 1;
    }
    *valp = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (((uint32_t)*valp) >= 10) {
      return 1;
    }
  }
  // parse the magnitude as unsigned, then apply the sign
  if (scan_uint_capped_finish(ss, bound, (uint32_t*)valp)) {
    return 1;
  }
  *valp *= sign;
  return 0;
}
391 #else // not __LP64__
uint32_t scan_posint_capped32(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
  // 32-bit variant of scan_posint_capped(): reads an integer in [1, cap],
  // where cap == cap_div_10 * 10 + cap_mod_10.  Returns 1 on failure.
  // ('0' has ascii code 48, so a nondigit maps to a value >= 10.)
  uint32_t val = (uint32_t)((unsigned char)(*ss++)) - 48;
  if (val >= 10) {
    // permit a single leading '+' (ascii 43 -> 0xfffffffbU)
    if (val != 0xfffffffbU) {
      return 1;
    }
    val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (val >= 10) {
      return 1;
    }
  }
  // skip leading zeroes; at least one nonzero digit is required
  while (!val) {
    val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (val >= 10) {
      return 1;
    }
  }
  for (uint32_t cur_digit = (uint32_t)((unsigned char)(*ss++)) - 48; cur_digit < 10; cur_digit = (uint32_t)((unsigned char)(*ss++)) - 48) {
    // split comparison avoids integer overflow in middle of computation
    if ((val > cap_div_10) || ((val == cap_div_10) && (cur_digit > cap_mod_10))) {
      return 1;
    }
    val = val * 10 + cur_digit;
  }
  *valp = val;
  return 0;
}
423 
uint32_t scan_uint_capped32(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
  // 32-bit variant of scan_uint_capped(): reads an integer in [0, cap],
  // where cap == cap_div_10 * 10 + cap_mod_10.  Assumes first character is
  // nonspace.  Accepts a leading '+', and "-0"/"-00"/... .
  uint32_t val = (uint32_t)((unsigned char)(*ss++)) - 48;
  if (val >= 10) {
    if (val == 0xfffffffdU) {
      // '-' (ascii 45): only minus-zero is acceptable
      if (*ss != '0') {
        return 1;
      }
      do {
        ss++;
      } while (*ss == '0');
      *valp = 0;
      // fail iff another digit follows the zeroes
      return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
    }
    // otherwise only a leading '+' (ascii 43) is permitted
    if (val != 0xfffffffbU) {
      return 1;
    }
    val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (val >= 10) {
      return 1;
    }
  }
  for (uint32_t cur_digit = (uint32_t)((unsigned char)(*ss++)) - 48; cur_digit < 10; cur_digit = (uint32_t)((unsigned char)(*ss++)) - 48) {
    // split comparison avoids integer overflow in middle of computation
    if ((val > cap_div_10) || ((val == cap_div_10) && (cur_digit > cap_mod_10))) {
      return 1;
    }
    val = val * 10 + cur_digit;
  }
  *valp = val;
  return 0;
}
453 
uint32_t scan_int_abs_bounded32(const char* ss, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp) {
  // 32-bit variant of scan_int_abs_bounded(): reads an integer in
  // [-bound, bound], where bound == bound_div_10 * 10 + bound_mod_10.
  // Assumes first character is nonspace.
  int32_t sign = 1;
  uint32_t val = (uint32_t)((unsigned char)(*ss++)) - 48;
  if (val >= 10) {
    if (val == 0xfffffffdU) {
      // leading '-' (ascii 45)
      sign = -1;
    } else if (val != 0xfffffffbU) {
      // not a leading '+' (ascii 43) either
      return 1;
    }
    val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (val >= 10) {
      return 1;
    }
  }
  for (uint32_t cur_digit = (uint32_t)((unsigned char)(*ss++)) - 48; cur_digit < 10; cur_digit = (uint32_t)((unsigned char)(*ss++)) - 48) {
    // split comparison avoids integer overflow in middle of computation
    if ((val > bound_div_10) || ((val == bound_div_10) && (cur_digit > bound_mod_10))) {
      return 1;
    }
    val = val * 10 + cur_digit;
  }
  *valp = sign * ((int32_t)val);
  return 0;
}
481 #endif
482 
uint32_t scan_posintptr(const char* ss, uintptr_t* valp) {
  // Reads an integer in [1, 2^BITCT - 1].  Assumes first character is
  // nonspace.  Returns 1 on failure.
  uintptr_t val = (uintptr_t)((unsigned char)(*ss++)) - 48;
  if (val >= 10) {
#ifdef __LP64__
    // permit a single leading '+' (ascii 43; 43 - 48 wraps around)
    if (val != 0xfffffffffffffffbLLU) {
      return 1;
    }
#else
    if (val != 0xfffffffbU) {
      return 1;
    }
#endif
    val = (uintptr_t)((unsigned char)(*ss++)) - 48;
    if (val >= 10) {
      return 1;
    }
  }
  // skip leading zeroes; at least one nonzero digit is required
  while (!val) {
    val = (uintptr_t)((unsigned char)(*ss++)) - 48;
    if (val >= 10) {
      return 1;
    }
  }
  // limit is 20 digits, we've already read one
#ifdef __LP64__
  const char* ss_limit = &(ss[20]);
#else
  const char* ss_limit = &(ss[10]);
#endif
  while (1) {
    // consume digits in pairs
    const uintptr_t cur_digit = (uintptr_t)((unsigned char)(*ss++)) - 48;
    if (cur_digit >= 10) {
      *valp = val;
      return 0;
    }
    const uintptr_t cur_digit2 = (uintptr_t)((unsigned char)(*ss++)) - 48;
    if (ss == ss_limit) {
      // at the maximum digit count: accept only if cur_digit is the final
      // digit and appending it does not overflow the word
      if ((cur_digit2 < 10) || ((val >= (~ZEROLU) / 10) && ((val > (~ZEROLU) / 10) || (cur_digit > (~ZEROLU) % 10)))) {
        return 1;
      }
      *valp = val * 10 + cur_digit;
      return 0;
    }
    if (cur_digit2 >= 10) {
      // odd digit count; fold in the final digit
      *valp = val * 10 + cur_digit;
      return 0;
    }
    val = val * 100 + cur_digit * 10 + cur_digit2;
  }
}
535 
536 /*
537 uint32_t scan_uintptr(char* ss, uintptr_t* valp) {
538   // [0, 2^BITCT - 1].
539   uintptr_t val = (uint32_t)((unsigned char)*ss) - 48;
540   uintptr_t cur_digit;
541   if (val < 10) {
542     while (1) {
543     scan_uintptr_main_loop:
544       cur_digit = (uint32_t)((unsigned char)(*(++ss))) - 48;
545       if (cur_digit >= 10) {
546 	*valp = val;
547 	return 0;
548       }
549       if ((val >= (~ZEROLU) / 10) && ((val > (~ZEROLU) / 10) || (cur_digit > (~ZEROLU) % 10))) {
550 	return 1;
551       }
552       val = val * 10 + cur_digit;
553     }
554   }
555   ss++;
556   if (val != 0xfffffffdU) {
557     if (val == 0xfffffffbU) {
558       val = (uint32_t)((unsigned char)(*ss)) - 48;
559       if (val < 10) {
560 	goto scan_uintptr_main_loop;
561       }
562     }
563     return 1;
564   }
565   if (*ss != '0') {
566     return 1;
567   }
568   while (*(++ss) == '0');
569   *valp = 0;
570   return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
571 }
572 */
573 
uint32_t scan_posint_cappedx(const char* ss, uint64_t cap, uint32_t* valp) {
  // scan_posint_capped() variant which parses via scan_doublex().
  // Fails unless the value is an exact integer in [1, cap].
  double dxx;
  if (scan_doublex(ss, &dxx) || (dxx < 1.0) || (dxx > ((double)cap))) {
    return 1;
  }
  const uint32_t converted = (uint32_t)dxx;
  *valp = converted;
  // reject nonintegral values
  return (dxx != ((double)converted));
}
582 
uint32_t scan_uint_cappedx(const char* ss, uint64_t cap, uint32_t* valp) {
  // scan_uint_capped() variant which parses via scan_doublex().
  // Fails unless the value is an exact integer in [0, cap].
  double dxx;
  if (scan_doublex(ss, &dxx) || (dxx < 0.0) || (dxx > ((double)cap))) {
    return 1;
  }
  const uint32_t converted = (uint32_t)dxx;
  *valp = converted;
  // reject nonintegral values
  return (dxx != ((double)converted));
}
591 
uint32_t scan_int_abs_boundedx(const char* ss, uint64_t bound, int32_t* valp) {
  // scan_int_abs_bounded() variant which parses via scan_doublex().
  // Fails unless the value is an exact integer in [-bound, bound].
  const double bound_d = (double)bound;
  double dxx;
  if (scan_doublex(ss, &dxx) || (dxx < -bound_d) || (dxx > bound_d)) {
    return 1;
  }
  const int32_t converted = (int32_t)dxx;
  *valp = converted;
  // reject nonintegral values
  return (dxx != ((double)converted));
}
601 
scan_posintptrx(const char * ss,uintptr_t * valp)602 uint32_t scan_posintptrx(const char* ss, uintptr_t* valp) {
603   double val;
604   if (scan_doublex(ss, &val) || (val < 1.0) || (val > ((double)(~ZEROLU)))) {
605     return 1;
606   }
607   *valp = (uintptr_t)val;
608   return (val != ((double)(*valp)));
609 }
610 
611 
uint32_t scan_two_doubles(char* ss, double* __restrict val1p, double* __restrict val2p) {
  // Parses two whitespace-separated doubles starting at ss.
  // Returns 1 if either parse fails, 0 on success.
  char* parse_end;
  *val1p = strtod(ss, &parse_end);
  if (parse_end == ss) {
    return 1;
  }
  ss = skip_initial_spaces(parse_end);
  *val2p = strtod(ss, &parse_end);
  return (parse_end == ss)? 1 : 0;
}
622 
int32_t scan_token_ct_len(uintptr_t half_bufsize, FILE* infile, char* buf, uintptr_t* __restrict token_ct_ptr, uintptr_t* __restrict max_token_len_ptr) {
  // buf must be of size >= (2 * half_bufsize + 2)
  // max_token_len includes trailing null
  // Streams infile through the second half of buf in half_bufsize-byte
  // chunks, adding the number of whitespace-delimited tokens to *token_ct_ptr
  // and raising *max_token_len_ptr to cover the longest token seen.
  uintptr_t full_bufsize = half_bufsize * 2;
  uintptr_t curtoklen = 0;  // length of a token straddling a chunk boundary
  uintptr_t token_ct = *token_ct_ptr;
  uintptr_t max_token_len = *max_token_len_ptr;
  char* midbuf = &(buf[half_bufsize]);
  char* bufptr;
  char* bufptr2;
  char* buf_end;
  uintptr_t bufsize;
  while (1) {
    if (fread_checked(midbuf, half_bufsize, infile, &bufsize)) {
      return RET_READ_FAIL;
    }
    if (!bufsize) {
      if (curtoklen) {
        // corner case: file ended in the middle of a token
        if (curtoklen >= max_token_len) {
          max_token_len = curtoklen + 1;
        }
        token_ct++;
      }
      break;
    }
    buf_end = &(midbuf[bufsize]);
    // sentinels: the space ends the token-end scan at buf_end, and the
    // following nonspace character ('0') stops the token-start scan
    *buf_end = ' ';
    buf_end[1] = '0';
    // position bufptr so that (bufptr2 - bufptr) yields the total length of
    // a token which straddled the previous chunk boundary
    bufptr = &(buf[half_bufsize - curtoklen]);
    bufptr2 = midbuf;
    if (curtoklen) {
      goto scan_token_ct_len_tok_start;
    }
    while (1) {
      // find the start of the next token
      while (*bufptr <= ' ') {
        bufptr++;
      }
      if (bufptr >= buf_end) {
        curtoklen = 0;
        break;
      }
      bufptr2 = &(bufptr[1]);
    scan_token_ct_len_tok_start:
      // find the end of the current token
      while (*bufptr2 > ' ') {
        bufptr2++;
      }
      curtoklen = (uintptr_t)(bufptr2 - bufptr);
      if ((bufptr2 == buf_end) && (buf_end == &(buf[full_bufsize]))) {
        // token may continue into the next chunk
        if (curtoklen >= half_bufsize) {
          return RET_INVALID_FORMAT;
        }
        break;
      }
      if (curtoklen >= max_token_len) {
        if (curtoklen >= half_bufsize) {
          return RET_INVALID_FORMAT;
        }
        max_token_len = curtoklen + 1;
      }
      token_ct++;
      bufptr = &(bufptr2[1]);
    }
  }
  if (!feof(infile)) {
    return RET_READ_FAIL;
  }
  *max_token_len_ptr = max_token_len;
  *token_ct_ptr = token_ct;
  return 0;
}
694 
int32_t read_tokens(uintptr_t half_bufsize, uintptr_t token_ct, uintptr_t max_token_len, FILE* infile, char* __restrict buf, char* __restrict token_name_buf) {
  // buf must be of size >= (2 * half_bufsize + 2).
  // max_token_len includes trailing null
  // Reads exactly token_ct whitespace-delimited tokens from infile into
  // successive max_token_len-wide slots of token_name_buf, streaming the
  // file through buf in half_bufsize-byte chunks.
  uintptr_t full_bufsize = half_bufsize * 2;
  uintptr_t curtoklen = 0;  // length of a token straddling a chunk boundary
  uintptr_t token_idx = 0;
  char* midbuf = &(buf[half_bufsize]);
  char* bufptr = midbuf;
  char* bufptr2;
  char* bufptr3;
  char* buf_end;
  uintptr_t bufsize;
  while (1) {
    if (fread_checked(midbuf, half_bufsize, infile, &bufsize)) {
      return RET_READ_FAIL;
    }
    if (!bufsize) {
      if (curtoklen) {
        // EOF in the middle of the final token
        if (token_idx + 1 == token_ct) {
          memcpyx(&(token_name_buf[token_idx * max_token_len]), bufptr, curtoklen, '\0');
          return 0;
        }
      }
      // something very strange has to happen to get here
      return RET_READ_FAIL;
    }
    buf_end = &(midbuf[bufsize]);
    // sentinels: the space ends the token-end scan at buf_end, and the
    // following nonspace character ('0') stops the token-start scan
    *buf_end = ' ';
    buf_end[1] = '0';
    bufptr2 = midbuf;
    if (curtoklen) {
      // resume scanning the token which straddles the chunk boundary
      goto read_tokens_tok_start;
    }
    while (1) {
      // find the start of the next token
      while (*bufptr <= ' ') {
        bufptr++;
      }
      if (bufptr >= buf_end) {
        curtoklen = 0;
        bufptr = midbuf;
        break;
      }
      bufptr2 = &(bufptr[1]);
    read_tokens_tok_start:
      // find the end of the current token
      while (*bufptr2 > ' ') {
        bufptr2++;
      }
      curtoklen = (uintptr_t)(bufptr2 - bufptr);
      if ((bufptr2 == buf_end) && (buf_end == &(buf[full_bufsize]))) {
        // token may continue into the next chunk; copy what we have so far
        // back so that it ends exactly at midbuf
        bufptr3 = &(buf[half_bufsize - curtoklen]);
        memcpy(bufptr3, bufptr, curtoklen);
        bufptr = bufptr3;
        break;
      }
      memcpyx(&(token_name_buf[token_idx * max_token_len]), bufptr, curtoklen, '\0');
      if (++token_idx == token_ct) {
        return 0;
      }
      bufptr = &(bufptr2[1]);
    }
  }
}
757 
int32_t gzputs_w4(gzFile gz_outfile, const char* ss) {
  // Writes ss to gz_outfile, left-padded with spaces to a minimum width of
  // 4 characters.  Returns -1 on write failure.
  if (!ss[1]) {
    // single-character string
    if (gzputs(gz_outfile, "   ") == -1) {
      return -1;
    }
    return gzputc(gz_outfile, ss[0]);
  }
  if (!ss[2]) {
    // two characters: two spaces of padding
    if (gzputs(gz_outfile, "  ") == -1) {
      return -1;
    }
  } else if (!ss[3]) {
    // three characters: one space of padding
    if (gzputc(gz_outfile, ' ') == -1) {
      return -1;
    }
  }
  return gzputs(gz_outfile, ss);
}
776 
int32_t get_next_noncomment(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr) {
  // Reads lines into g_textbuf until one is neither blank nor a comment,
  // storing a pointer to its first nonspace character in *lptr_ptr and
  // incrementing *line_idx_ptr once per line read.  Returns -1 on EOF or
  // read failure, 0 on success.
  char* lptr;
  do {
    if (!fgets(g_textbuf, MAXLINELEN, fptr)) {
      return -1;
    }
    *line_idx_ptr += 1;
    lptr = skip_initial_spaces(g_textbuf);
  } while (is_eoln_or_comment_kns(*lptr));
  *lptr_ptr = lptr;
  return 0;
}
789 
int32_t get_next_noncomment_excl(const uintptr_t* __restrict marker_exclude, FILE* fptr, char** lptr_ptr, uintptr_t* __restrict line_idx_ptr, uintptr_t* __restrict marker_uidx_ptr) {
  // get_next_noncomment() variant which skips lines whose associated marker
  // index (*marker_uidx_ptr, advanced once per skipped line) is set in the
  // marker_exclude bitarray.  Returns -1 when the file is exhausted.
  while (!get_next_noncomment(fptr, lptr_ptr, line_idx_ptr)) {
    if (!is_set_ul(marker_exclude, *marker_uidx_ptr)) {
      return 0;
    }
    *marker_uidx_ptr += 1;
  }
  return -1;
}
799 
get_top_two_ui(const uint32_t * __restrict uint_arr,uintptr_t uia_size,uintptr_t * __restrict top_idx_ptr,uintptr_t * __restrict second_idx_ptr)800 void get_top_two_ui(const uint32_t* __restrict uint_arr, uintptr_t uia_size, uintptr_t* __restrict top_idx_ptr, uintptr_t* __restrict second_idx_ptr) {
801   assert(uia_size > 1);
802   uintptr_t top_idx = (uint_arr[1] > uint_arr[0])? 1 : 0;
803   uintptr_t second_idx = 1 ^ top_idx;
804   uint32_t top_val = uint_arr[top_idx];
805   uint32_t second_val = uint_arr[second_idx];
806   uintptr_t cur_idx;
807   uintptr_t cur_val;
808   for (cur_idx = 2; cur_idx < uia_size; ++cur_idx) {
809     cur_val = uint_arr[cur_idx];
810     if (cur_val > second_val) {
811       if (cur_val > top_val) {
812 	second_val = top_val;
813 	second_idx = top_idx;
814 	top_val = cur_val;
815 	top_idx = cur_idx;
816       } else {
817 	second_val = cur_val;
818 	second_idx = cur_idx;
819       }
820     }
821   }
822   *top_idx_ptr = top_idx;
823   *second_idx_ptr = second_idx;
824 }
825 
uint32_t intlen(int32_t num) {
  // Returns the number of characters in the base-10 representation of num,
  // including the '-' sign when num is negative.
  uint32_t retval = 1;
  uint32_t absnum;
  if (num < 0) {
    // negate in unsigned arithmetic: plain -num is undefined behavior
    // (signed overflow) when num == INT32_MIN
    absnum = 0U - ((uint32_t)num);
    retval++;
  } else {
    absnum = (uint32_t)num;
  }
  while (absnum > 99) {
    // division by a constant is faster for unsigned ints
    absnum /= 100;
    retval += 2;
  }
  if (absnum > 9) {
    retval++;
  }
  return retval;
}
845 
// Returns zero iff the first s_const_len bytes of s_read match s_const AND
// the s_read token ends there (next character is space/EOLN/null).
int32_t strcmp_se(const char* s_read, const char* s_const, uint32_t s_const_len) {
  return memcmp(s_read, s_const, s_const_len) || (!is_space_or_eoln(s_read[s_const_len]));
}
849 
char* next_token(char* sptr) {
  // Returns a pointer to the token following the one sptr points into, or
  // NULL if there is none.  Space and tab are the only delimiters; any other
  // character <= 32 ends the line.
  if (!sptr) {
    return NULL;
  }
  // skip the remainder of the current token
  while (((unsigned char)(*sptr)) > 32) {
    sptr++;
  }
  // skip the delimiting spaces/tabs
  while ((*sptr == ' ') || (*sptr == '\t')) {
    sptr++;
  }
  return (((unsigned char)(*sptr)) > 32)? sptr : NULL;
}
863 
next_token_mult(char * sptr,uint32_t ct)864 char* next_token_mult(char* sptr, uint32_t ct) {
865   assert(ct);
866   if (!sptr) {
867     return nullptr;
868   }
869   unsigned char ucc = *sptr;
870   do {
871     while (ucc > 32) {
872       ucc = *(++sptr);
873     }
874     while ((ucc == ' ') || (ucc == '\t')) {
875       ucc = *(++sptr);
876     }
877     if (ucc <= 32) {
878       return nullptr;
879     }
880   } while (--ct);
881   return sptr;
882 }
883 
uint32_t count_tokens(const char* bufptr) {
  // Returns the number of space/tab-delimited tokens before end-of-line.
  uint32_t token_ct = 0;
  // skip leading delimiters
  while ((*bufptr == ' ') || (*bufptr == '\t')) {
    bufptr++;
  }
  while (!is_eoln_kns(*bufptr)) {
    token_ct++;
    // skip the token body...
    while (!is_space_or_eoln(*(++bufptr)));
    // ...and the delimiters which follow it
    while ((*bufptr == ' ') || (*bufptr == '\t')) {
      bufptr++;
    }
  }
  return token_ct;
}
898 
uint32_t count_and_measure_multistr(const char* multistr, uintptr_t* max_slen_ptr) {
  // Counts the strings in a multistring (consecutive null-terminated strings
  // ending with an empty string), raising *max_slen_ptr to cover the longest
  // one.  max_slen includes the null terminator.  multistr must contain at
  // least one string.
  uint32_t str_ct = 0;
  uintptr_t running_max = *max_slen_ptr;
  do {
    const uintptr_t cur_blen = strlen(multistr) + 1;
    if (cur_blen > running_max) {
      running_max = cur_blen;
    }
    multistr = &(multistr[cur_blen]);
    str_ct++;
  } while (*multistr);
  *max_slen_ptr = running_max;
  return str_ct;
}
916 
917 // number-to-string encoders
918 
// lookup table mapping each value in 0..99 to its two-character decimal
// representation (digit2_table[2*n] and digit2_table[2*n + 1])
static const char digit2_table[200] = {
  '0', '0', '0', '1', '0', '2', '0', '3', '0', '4',
  '0', '5', '0', '6', '0', '7', '0', '8', '0', '9',
  '1', '0', '1', '1', '1', '2', '1', '3', '1', '4',
  '1', '5', '1', '6', '1', '7', '1', '8', '1', '9',
  '2', '0', '2', '1', '2', '2', '2', '3', '2', '4',
  '2', '5', '2', '6', '2', '7', '2', '8', '2', '9',
  '3', '0', '3', '1', '3', '2', '3', '3', '3', '4',
  '3', '5', '3', '6', '3', '7', '3', '8', '3', '9',
  '4', '0', '4', '1', '4', '2', '4', '3', '4', '4',
  '4', '5', '4', '6', '4', '7', '4', '8', '4', '9',
  '5', '0', '5', '1', '5', '2', '5', '3', '5', '4',
  '5', '5', '5', '6', '5', '7', '5', '8', '5', '9',
  '6', '0', '6', '1', '6', '2', '6', '3', '6', '4',
  '6', '5', '6', '6', '6', '7', '6', '8', '6', '9',
  '7', '0', '7', '1', '7', '2', '7', '3', '7', '4',
  '7', '5', '7', '6', '7', '7', '7', '8', '7', '9',
  '8', '0', '8', '1', '8', '2', '8', '3', '8', '4',
  '8', '5', '8', '6', '8', '7', '8', '8', '8', '9',
  '9', '0', '9', '1', '9', '2', '9', '3', '9', '4',
  '9', '5', '9', '6', '9', '7', '9', '8', '9', '9'};
940 
char* uint32toa(uint32_t uii, char* start) {
  // Memory-efficient fast integer writer.  (You can do a bit better sometimes
  // by using a larger lookup table, but on average I doubt that pays off.)
  // Returns a pointer to the end of the integer (not null-terminated).
  //
  // Strategy: branch on the magnitude of uii, write the leading digit(s),
  // then fall through shared goto labels that emit the remaining digits two
  // at a time via digit2_table.
  uint32_t quotient;
  if (uii < 1000) {
    if (uii < 10) {
      *start++ = '0' + uii;
      return start;
    }
    if (uii < 100) {
      goto uint32toa_2;
    }
    quotient = uii / 100;
    *start++ = '0' + quotient;
  } else {
    if (uii < 10000000) {
      if (uii >= 100000) {
	if (uii < 1000000) {
	  goto uint32toa_6;
	}
	quotient = uii / 1000000;
	*start++ = '0' + quotient;
	goto uint32toa_6b;
      }
      if (uii < 10000) {
	goto uint32toa_4;
      }
      quotient = uii / 10000;
      *start++ = '0' + quotient;
    } else {
      if (uii >= 100000000) {
	quotient = uii / 100000000;
	if (uii >= 1000000000) {
	  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
	} else {
	  *start++ = '0' + quotient;
	}
	uii -= 100000000 * quotient;
      }
      quotient = uii / 1000000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uint32toa_6b:
      // 6 digits remain in uii
      uii -= 1000000 * quotient;
    uint32toa_6:
      quotient = uii / 10000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    }
    uii -= 10000 * quotient;
  uint32toa_4:
    // could make a uitoa_z4() call here, but that's slightly slower
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
  }
  uii -= 100 * quotient;
 uint32toa_2:
  // final two digits
  return memcpya(start, &(digit2_table[uii * 2]), 2);
}
999 
char* int32toa(int32_t ii, char* start) {
  // Writes ii in decimal, returning a pointer just past the last character
  // written (not null-terminated).
  uint32_t absval = (uint32_t)ii;
  if (ii < 0) {
    // -INT_MIN is undefined, but negating the unsigned int equivalent works
    *start++ = '-';
    absval = -absval;
  }
  return uint32toa(absval, start);
}
1009 
uitoa_z4(uint32_t uii,char * start)1010 char* uitoa_z4(uint32_t uii, char* start) {
1011   uint32_t quotient = uii / 100;
1012   assert(quotient < 100);
1013   uii -= 100 * quotient;
1014   start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1015   return memcpya(start, &(digit2_table[uii * 2]), 2);
1016 }
1017 
uitoa_z6(uint32_t uii,char * start)1018 char* uitoa_z6(uint32_t uii, char* start) {
1019   uint32_t quotient = uii / 10000;
1020   start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1021   return uitoa_z4(uii - 10000 * quotient, start);
1022 }
1023 
uitoa_z8(uint32_t uii,char * start)1024 char* uitoa_z8(uint32_t uii, char* start) {
1025   uint32_t quotient = uii / 1000000;
1026   start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1027   return uitoa_z6(uii - 1000000 * quotient, start);
1028 }
1029 
char* int64toa(int64_t llii, char* start) {
  // Writes llii in decimal, returning a pointer just past the last character
  // (not null-terminated).
  uint64_t ullii = (uint64_t)llii;
  if (llii < 0) {
    // negate via the unsigned representation; -INT64_MIN itself is undefined
    *start++ = '-';
    ullii = -ullii;
  }
  if (ullii <= 0xffffffffLLU) {
    return uint32toa((uint32_t)ullii, start);
  }
  // Split off the low 8 decimal digits; only the leading chunk may omit
  // leading zeroes.
  const uint64_t top_digits = ullii / 100000000;
  const uint32_t bottom_eight = (uint32_t)(ullii - top_digits * 100000000);
  if (top_digits <= 0xffffffffLLU) {
    start = uint32toa((uint32_t)top_digits, start);
    return uitoa_z8(bottom_eight, start);
  }
  // 17+ digits: peel off another 8-digit chunk.
  const uint64_t top_four = top_digits / 100000000;
  const uint32_t middle_eight = (uint32_t)(top_digits - top_four * 100000000);
  start = uint32toa((uint32_t)top_four, start);
  start = uitoa_z8(middle_eight, start);
  return uitoa_z8(bottom_eight, start);
}
1054 
uint32toa_w4(uint32_t uii,char * start)1055 char* uint32toa_w4(uint32_t uii, char* start) {
1056   uint32_t quotient;
1057   if (uii < 1000) {
1058     if (uii < 10) {
1059       // assumes little-endian
1060       *((uint32_t*)start) = 0x30202020 + (uii << 24);
1061       return &(start[4]);
1062     }
1063     if (uii < 100) {
1064       memset(start, 32, 2);
1065     } else {
1066       quotient = uii / 100;
1067       *start++ = ' ';
1068       *start++ = '0' + quotient;
1069       uii -= quotient * 100;
1070     }
1071     return memcpya(start, &(digit2_table[uii * 2]), 2);
1072   } else {
1073     // presumably the field width is 4 for a reason; don't bother optimizing
1074     // this
1075     return uint32toa(uii, start);
1076   }
1077 }
1078 
char* uint32toa_w6(uint32_t uii, char* start) {
  // Writes uii right-justified in a (minimum) 6-character space-padded field,
  // returning a pointer just past the written text.  Branches on magnitude,
  // pads with spaces, then falls through shared goto labels which emit the
  // remaining digit pairs via digit2_table.
  uint32_t quotient;
  if (uii < 1000) {
    if (uii < 10) {
      start = memseta(start, 32, 5);
      *start++ = '0' + uii;
      return start;
    }
    if (uii < 100) {
      start = memseta(start, 32, 4);
      goto uint32toa_w6_2;
    }
    quotient = uii / 100;
    // the little-endian trick doesn't seem to help here.  possibly relevant
    // differences from uint32toa_w4() and _w8(): sequential dependence on
    // quotient, need to interpret pointer as a char* again
    start = memseta(start, 32, 3);
    *start++ = '0' + quotient;
  } else {
    if (uii < 10000000) {
      if (uii >= 100000) {
	if (uii < 1000000) {
	  goto uint32toa_w6_6;
	}
	quotient = uii / 1000000;
	*start++ = '0' + quotient;
	goto uint32toa_w6_6b;
      } else if (uii >= 10000) {
	*start++ = ' ';
	quotient = uii / 10000;
	*start++ = '0' + quotient;
      } else {
	start = memseta(start, 32, 2);
	goto uint32toa_w6_4;
      }
    } else {
      if (uii >= 100000000) {
	quotient = uii / 100000000;
	if (uii >= 1000000000) {
	  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
	} else {
	  *start++ = '0' + quotient;
	}
	uii -= 100000000 * quotient;
      }
      quotient = uii / 1000000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uint32toa_w6_6b:
      // 6 digits remain in uii
      uii -= 1000000 * quotient;
    uint32toa_w6_6:
      quotient = uii / 10000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    }
    uii -= 10000 * quotient;
  uint32toa_w6_4:
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
  }
  uii -= 100 * quotient;
 uint32toa_w6_2:
  // final two digits
  return memcpya(start, &(digit2_table[uii * 2]), 2);
}
1141 
char* uint32toa_w7(uint32_t uii, char* start) {
  // Writes uii right-justified in a (minimum) 7-character space-padded field,
  // returning a pointer just past the written text.  Same label-fallthrough
  // structure as uint32toa_w6().
  uint32_t quotient;
  if (uii < 1000) {
    if (uii < 10) {
      start = memseta(start, 32, 6);
      *start++ = '0' + uii;
      return start;
    }
    if (uii < 100) {
      start = memseta(start, 32, 5);
      goto uint32toa_w7_2;
    }
    quotient = uii / 100;
    start = memseta(start, 32, 4);
    *start++ = '0' + quotient;
  } else {
    if (uii < 10000000) {
      if (uii >= 100000) {
	if (uii >= 1000000) {
	  quotient = uii / 1000000;
	  *start++ = '0' + quotient;
	  goto uint32toa_w7_6b;
	}
	*start++ = ' ';
	goto uint32toa_w7_6;
      } else if (uii >= 10000) {
	start = memseta(start, 32, 2);
	quotient = uii / 10000;
	*start++ = '0' + quotient;
      } else {
	start = memseta(start, 32, 3);
	goto uint32toa_w7_4;
      }
    } else {
      if (uii >= 100000000) {
	quotient = uii / 100000000;
	if (uii >= 1000000000) {
	  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
	} else {
	  *start++ = '0' + quotient;
	}
	uii -= 100000000 * quotient;
      }
      quotient = uii / 1000000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uint32toa_w7_6b:
      // 6 digits remain in uii
      uii -= 1000000 * quotient;
    uint32toa_w7_6:
      quotient = uii / 10000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    }
    uii -= 10000 * quotient;
  uint32toa_w7_4:
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
  }
  uii -= 100 * quotient;
 uint32toa_w7_2:
  // final two digits
  return memcpya(start, &(digit2_table[uii * 2]), 2);
}
1202 
uint32toa_w8(uint32_t uii,char * start)1203 char* uint32toa_w8(uint32_t uii, char* start) {
1204   uint32_t quotient;
1205   if (uii < 1000) {
1206     if (uii < 10) {
1207 #ifdef __LP64__
1208       *((uintptr_t*)start) = 0x3020202020202020LLU + (((uintptr_t)uii) << 56);
1209       return &(start[8]);
1210 #else
1211       start = memseta(start, 32, 7);
1212       *start++ = '0' + uii;
1213       return start;
1214 #endif
1215     }
1216     if (uii < 100) {
1217       start = memseta(start, 32, 6);
1218       goto uint32toa_w8_2;
1219     }
1220     quotient = uii / 100;
1221     start = memseta(start, 32, 5);
1222     *start++ = '0' + quotient;
1223   } else {
1224     if (uii < 10000000) {
1225       if (uii >= 100000) {
1226 	if (uii < 1000000) {
1227 	  start = memseta(start, 32, 2);
1228 	  goto uint32toa_w8_6;
1229 	}
1230 	quotient = uii / 1000000;
1231 	*start = ' ';
1232 	start[1] = '0' + quotient;
1233 	start += 2;
1234 	goto uint32toa_w8_6b;
1235       } else if (uii < 10000) {
1236 	start = memseta(start, 32, 4);
1237 	goto uint32toa_w8_4;
1238       }
1239       memset(start, 32, 3);
1240       quotient = uii / 10000;
1241       start[3] = '0' + quotient;
1242       start += 4;
1243     } else {
1244       if (uii >= 100000000) {
1245 	quotient = uii / 100000000;
1246 	if (uii >= 1000000000) {
1247 	  start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1248 	} else {
1249 	  *start++ = '0' + quotient;
1250 	}
1251 	uii -= 100000000 * quotient;
1252       }
1253       quotient = uii / 1000000;
1254       start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1255     uint32toa_w8_6b:
1256       uii -= 1000000 * quotient;
1257     uint32toa_w8_6:
1258       quotient = uii / 10000;
1259       start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1260     }
1261     uii -= 10000 * quotient;
1262   uint32toa_w8_4:
1263     quotient = uii / 100;
1264     start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1265   }
1266   uii -= 100 * quotient;
1267  uint32toa_w8_2:
1268   return memcpya(start, &(digit2_table[uii * 2]), 2);
1269 }
1270 
char* uint32toa_w10(uint32_t uii, char* start) {
  // Writes uii right-justified in a 10-character space-padded field (ten
  // digits is the uint32 maximum), returning a pointer just past the field.
  // if we decide to reduce code size and optimize only one field width, this
  // should be it
  uint32_t quotient;
  if (uii < 1000) {
    if (uii < 10) {
      start = memseta(start, 32, 9);
      *start++ = '0' + uii;
      return start;
    }
    if (uii < 100) {
      start = memseta(start, 32, 8);
      goto uint32toa_w10_2;
    }
    quotient = uii / 100;
    start = memseta(start, 32, 7);
    *start++ = '0' + quotient;
  } else {
    if (uii < 10000000) {
      if (uii >= 100000) {
	if (uii < 1000000) {
	  start = memseta(start, 32, 4);
	  goto uint32toa_w10_6;
	}
	quotient = uii / 1000000;
	memset(start, 32, 3);
	start[3] = '0' + quotient;
	start += 4;
	goto uint32toa_w10_6b;
      } else if (uii < 10000) {
	start = memseta(start, 32, 6);
	goto uint32toa_w10_4;
      }
      memset(start, 32, 5);
      quotient = uii / 10000;
      start[5] = '0' + quotient;
      start += 6;
    } else {
      if (uii >= 100000000) {
	quotient = uii / 100000000;
	if (uii >= 1000000000) {
	  memcpy(start, &(digit2_table[quotient * 2]), 2);
	} else {
	  *start = ' ';
	  start[1] = '0' + quotient;
	}
	uii -= 100000000 * quotient;
      } else {
	memset(start, 32, 2);
      }
      quotient = uii / 1000000;
      memcpy(&(start[2]), &(digit2_table[quotient * 2]), 2);
      start += 4;
    uint32toa_w10_6b:
      // 6 digits remain in uii
      uii -= 1000000 * quotient;
    uint32toa_w10_6:
      quotient = uii / 10000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    }
    uii -= 10000 * quotient;
  uint32toa_w10_4:
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
  }
  uii -= 100 * quotient;
 uint32toa_w10_2:
  // final two digits
  return memcpya(start, &(digit2_table[uii * 2]), 2);
}
1339 
uitoa_trunc2(uint32_t uii,char * start)1340 static inline char* uitoa_trunc2(uint32_t uii, char* start) {
1341   // Given 0 < uii < 100, writes uii without *trailing* zeroes.  (I.e. this is
1342   // for floating-point encoder use.)
1343   memcpy(start, &(digit2_table[uii * 2]), 2);
1344   if (start[1] != '0') {
1345     return &(start[2]);
1346   }
1347   return &(start[1]);
1348 }
1349 
uitoa_trunc3(uint32_t uii,char * start)1350 static inline char* uitoa_trunc3(uint32_t uii, char* start) {
1351   *start++ = '0' + (uii / 100);
1352   uii %= 100;
1353   if (!uii) {
1354     return start;
1355   }
1356   memcpy(start, &(digit2_table[uii * 2]), 2);
1357   if (start[1] != '0') {
1358     return &(start[2]);
1359   }
1360   return &(start[1]);
1361 }
1362 
uitoa_trunc4(uint32_t uii,char * start)1363 static inline char* uitoa_trunc4(uint32_t uii, char* start) {
1364   uint32_t quotient = uii / 100;
1365   memcpy(start, &(digit2_table[quotient * 2]), 2);
1366   uii -= 100 * quotient;
1367   if (uii) {
1368     start += 2;
1369     memcpy(start, &(digit2_table[uii * 2]), 2);
1370   }
1371   if (start[1] != '0') {
1372     return &(start[2]);
1373   }
1374   return &(start[1]);
1375 }
1376 
uitoa_trunc6(uint32_t uii,char * start)1377 static inline char* uitoa_trunc6(uint32_t uii, char* start) {
1378   uint32_t quotient = uii / 10000;
1379   memcpy(start, &(digit2_table[quotient * 2]), 2);
1380   uii -= 10000 * quotient;
1381   if (uii) {
1382     quotient = uii / 100;
1383     start += 2;
1384     memcpy(start, &(digit2_table[quotient * 2]), 2);
1385     uii -= 100 * quotient;
1386     if (uii) {
1387       start += 2;
1388       memcpy(start, &(digit2_table[uii * 2]), 2);
1389     }
1390   }
1391   if (start[1] != '0') {
1392     return &(start[2]);
1393   }
1394   return &(start[1]);
1395 }
1396 
uitoa_trunc8(uint32_t uii,char * start)1397 static inline char* uitoa_trunc8(uint32_t uii, char* start) {
1398   uint32_t quotient = uii / 1000000;
1399   memcpy(start, &(digit2_table[quotient * 2]), 2);
1400   uii -= 1000000 * quotient;
1401   if (uii) {
1402     quotient = uii / 10000;
1403     start += 2;
1404     memcpy(start, &(digit2_table[quotient * 2]), 2);
1405     uii -= 10000 * quotient;
1406     if (uii) {
1407       quotient = uii / 100;
1408       start += 2;
1409       memcpy(start, &(digit2_table[quotient * 2]), 2);
1410       uii -= 100 * quotient;
1411       if (uii) {
1412 	start += 2;
1413 	memcpy(start, &(digit2_table[uii * 2]), 2);
1414       }
1415     }
1416   }
1417   if (start[1] != '0') {
1418     return &(start[2]);
1419   }
1420   return &(start[1]);
1421 }
1422 
static inline char* qrtoa_1p1(uint32_t quotient, uint32_t remainder, char* start) {
  // Writes "q" or "q.r" (one integer digit, at most one fractional digit,
  // zero fraction suppressed); returns a pointer past the last char written.
  *start++ = '0' + quotient;
  if (!remainder) {
    return start;
  }
  *start++ = '.';
  *start++ = '0' + remainder;
  return start;
}
1432 
qrtoa_1p2(uint32_t quotient,uint32_t remainder,char * start)1433 static inline char* qrtoa_1p2(uint32_t quotient, uint32_t remainder, char* start) {
1434   *start++ = '0' + quotient;
1435   if (!remainder) {
1436     return start;
1437   }
1438   *start++ = '.';
1439   memcpy(start, &(digit2_table[remainder * 2]), 2);
1440   if (start[1] != '0') {
1441     return &(start[2]);
1442   }
1443   return &(start[1]);
1444 }
1445 
qrtoa_1p3(uint32_t quotient,uint32_t remainder,char * start)1446 static inline char* qrtoa_1p3(uint32_t quotient, uint32_t remainder, char* start) {
1447   // quotient = (int32_t)dxx;
1448   // remainder = ((int32_t)(dxx * 1000)) - (quotient * 1000);
1449   *start++ = '0' + quotient;
1450   if (!remainder) {
1451     return start;
1452   }
1453   *start++ = '.';
1454   quotient = remainder / 10;
1455   memcpy(start, &(digit2_table[quotient * 2]), 2);
1456   remainder -= 10 * quotient;
1457   if (remainder) {
1458     start[2] = '0' + remainder;
1459     return &(start[3]);
1460   }
1461   if (start[1] != '0') {
1462     return &(start[2]);
1463   }
1464   return &(start[1]);
1465 }
1466 
qrtoa_1p5(uint32_t quotient,uint32_t remainder,char * start)1467 static inline char* qrtoa_1p5(uint32_t quotient, uint32_t remainder, char* start) {
1468   *start++ = '0' + quotient;
1469   if (!remainder) {
1470     return start;
1471   }
1472   *start++ = '.';
1473   quotient = remainder / 1000;
1474   memcpy(start, &(digit2_table[quotient * 2]), 2);
1475   remainder -= 1000 * quotient;
1476   if (remainder) {
1477     quotient = remainder / 10;
1478     start += 2;
1479     memcpy(start, &(digit2_table[quotient * 2]), 2);
1480     remainder -= 10 * quotient;
1481     if (remainder) {
1482       start[2] = '0' + remainder;
1483       return &(start[3]);
1484     }
1485   }
1486   if (start[1] != '0') {
1487     return &(start[2]);
1488   }
1489   return &(start[1]);
1490 }
1491 
qrtoa_1p7(uint32_t quotient,uint32_t remainder,char * start)1492 static inline char* qrtoa_1p7(uint32_t quotient, uint32_t remainder, char* start) {
1493   *start++ = '0' + quotient;
1494   if (!remainder) {
1495     return start;
1496   }
1497   *start++ = '.';
1498   quotient = remainder / 100000;
1499   memcpy(start, &(digit2_table[quotient * 2]), 2);
1500   remainder -= 100000 * quotient;
1501   if (remainder) {
1502     quotient = remainder / 1000;
1503     start += 2;
1504     memcpy(start, &(digit2_table[quotient * 2]), 2);
1505     remainder -= 1000 * quotient;
1506     if (remainder) {
1507       quotient = remainder / 10;
1508       start += 2;
1509       memcpy(start, &(digit2_table[quotient * 2]), 2);
1510       remainder -= 10 * quotient;
1511       if (remainder) {
1512 	start[2] = '0' + remainder;
1513 	return &(start[3]);
1514       }
1515     }
1516   }
1517   if (start[1] != '0') {
1518     return &(start[2]);
1519   }
1520   return &(start[1]);
1521 }
1522 
1523 // Okay, time to do banker's rounding when printing doubles.  14 digits of
1524 // precision are used in judging equality to 0.5 (actual precision of doubles
1525 // is 15-17 digits); the intention is to capture all directly loaded or exactly
1526 // computed edge cases (so enough tolerance is needed to survive the internal
1527 // multiplications by powers of 10, etc.), while rounding a negligible number
1528 // of honest-to-god 0.4999999s up and 0.5000001s down.
1529 // To avoid inadvertent printing of an extra digit, there's a deliberate gap
1530 // between the 99.9994999...-type bounds and the largest numbers that would
1531 // actually round down.
// Bias pairs for the double_bround*() functions below: index 0 is added when
// the truncated integer part is even, index 1 when it is odd; banker_roundK
// leaves K digits of tolerance around exact .5 ties.
static const double banker_round5[] = {0.499995, 0.500005};
static const double banker_round6[] = {0.4999995, 0.5000005};
static const double banker_round7[] = {0.49999995, 0.50000005};
static const double banker_round8[] = {0.499999995, 0.500000005};
static const double banker_round9[] = {0.4999999995, 0.5000000005};
static const double banker_round10[] = {0.49999999995, 0.50000000005};
static const double banker_round11[] = {0.499999999995, 0.500000000005};
static const double banker_round12[] = {0.4999999999995, 0.5000000000005};
1540 
static inline uint32_t double_bround(double dxx, const double* banker_round) {
  // Banker's rounding of nonnegative dxx to the nearest integer: ties (within
  // the tolerance encoded in banker_round[]) go to the even neighbor.
  const uint32_t truncated = (int32_t)dxx;
  const double frac_plus_bias = (dxx - ((int32_t)truncated)) + banker_round[truncated & 1];
  return truncated + (int32_t)frac_plus_bias;
}
1545 
1546 // These are separate functions so the compiler can optimize the integer
1547 // divisions.
static inline void double_bround1(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  // Banker-rounds dxx to one decimal place; stores the integer part in
  // *quotientp and the fractional digit in *remainderp.
  dxx *= 10;
  uint32_t scaled = (int32_t)dxx;
  scaled += (int32_t)((dxx - ((int32_t)scaled)) + banker_round[scaled & 1]);
  *quotientp = scaled / 10;
  *remainderp = scaled % 10;
}
1555 
static inline void double_bround2(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  // Banker-rounds dxx to two decimal places; stores the integer part in
  // *quotientp and the scaled fraction (0..99) in *remainderp.
  dxx *= 100;
  uint32_t scaled = (int32_t)dxx;
  scaled += (int32_t)((dxx - ((int32_t)scaled)) + banker_round[scaled & 1]);
  *quotientp = scaled / 100;
  *remainderp = scaled % 100;
}
1563 
static inline void double_bround3(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  // Banker-rounds dxx to three decimal places; stores the integer part in
  // *quotientp and the scaled fraction (0..999) in *remainderp.
  dxx *= 1000;
  uint32_t scaled = (int32_t)dxx;
  scaled += (int32_t)((dxx - ((int32_t)scaled)) + banker_round[scaled & 1]);
  *quotientp = scaled / 1000;
  *remainderp = scaled % 1000;
}
1571 
static inline void double_bround4(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  // Banker-rounds dxx to four decimal places; stores the integer part in
  // *quotientp and the scaled fraction (0..9999) in *remainderp.
  dxx *= 10000;
  uint32_t scaled = (int32_t)dxx;
  scaled += (int32_t)((dxx - ((int32_t)scaled)) + banker_round[scaled & 1]);
  *quotientp = scaled / 10000;
  *remainderp = scaled % 10000;
}
1579 
static inline void double_bround5(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  // Banker-rounds dxx to five decimal places; stores the integer part in
  // *quotientp and the scaled fraction (0..99999) in *remainderp.
  dxx *= 100000;
  uint32_t scaled = (int32_t)dxx;
  scaled += (int32_t)((dxx - ((int32_t)scaled)) + banker_round[scaled & 1]);
  *quotientp = scaled / 100000;
  *remainderp = scaled % 100000;
}
1587 
static inline void double_bround6(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  // Banker-rounds dxx to six decimal places; stores the integer part in
  // *quotientp and the scaled fraction (0..999999) in *remainderp.
  dxx *= 1000000;
  uint32_t scaled = (int32_t)dxx;
  scaled += (int32_t)((dxx - ((int32_t)scaled)) + banker_round[scaled & 1]);
  *quotientp = scaled / 1000000;
  *remainderp = scaled % 1000000;
}
1595 
static inline void double_bround7(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  // Banker-rounds dxx to seven decimal places; stores the integer part in
  // *quotientp and the scaled fraction (0..9999999) in *remainderp.
  dxx *= 10000000;
  uint32_t scaled = (int32_t)dxx;
  scaled += (int32_t)((dxx - ((int32_t)scaled)) + banker_round[scaled & 1]);
  *quotientp = scaled / 10000000;
  *remainderp = scaled % 10000000;
}
1603 
char* dtoa_so6(double dxx, char* start) {
  // 6 sig fig number, 0.999995 <= dxx < 999999.5
  // 'so' = "significand only"
  // Returns a pointer just past the last character written (not
  // null-terminated); trailing fractional zeroes are dropped.
  // Just hardcoding all six cases, in the absence of a better approach...
  uint32_t uii;
  uint32_t quotient;
  uint32_t remainder;
  // threshold constants are deliberately slightly below d.dd...5 so a value
  // that would print an extra digit can't slip through (see comment above
  // banker_round5[])
  if (dxx < 99.999949999999) {
    if (dxx < 9.9999949999999) {
      // 1 integer digit + up to 5 fractional digits
      double_bround5(dxx, banker_round8, &quotient, &remainder);
      return qrtoa_1p5(quotient, remainder, start);
    }
    // 2 integer digits + up to 4 fractional digits
    double_bround4(dxx, banker_round8, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    quotient = remainder / 100;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    remainder -= 100 * quotient;
    if (remainder) {
      start += 2;
    dtoa_so6_pretail:
      // write the final fractional digit pair
      memcpy(start, &(digit2_table[remainder * 2]), 2);
    }
  dtoa_so6_tail:
    // drop a trailing zero in the last pair written
    if (start[1] != '0') {
      return &(start[2]);
    }
    return &(start[1]);
  } else if (dxx < 9999.9949999999) {
    if (dxx < 999.99949999999) {
      // 3 integer digits + up to 3 fractional digits
      double_bround3(dxx, banker_round8, &uii, &remainder);
      quotient = uii / 100;
      *start++ = '0' + quotient;
      quotient = uii - 100 * quotient;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
      if (!remainder) {
	return start;
      }
      *start++ = '.';
      quotient = remainder / 10;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= quotient * 10;
      if (!remainder) {
        goto dtoa_so6_tail;
      }
      start[2] = '0' + remainder;
      return &(start[3]);
    }
    // 4 integer digits + up to 2 fractional digits
    double_bround2(dxx, banker_round8, &uii, &remainder);
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    quotient = uii - (100 * quotient);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    goto dtoa_so6_pretail;
  } else if (dxx < 99999.949999999) {
    // 5 integer digits + up to 1 fractional digit
    double_bround1(dxx, banker_round8, &uii, &remainder);
    quotient = uii / 10000;
    *start = '0' + quotient;
    uii -= 10000 * quotient;
    quotient = uii / 100;
    start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
    uii = uii - 100 * quotient;
    start = memcpya(start, &(digit2_table[uii * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    *start = '0' + remainder;
    return &(start[1]);
  } else {
    // 6 integer digits, no fraction
    return uitoa_z6(double_bround(dxx, banker_round8), start);
  }
}
1684 
1685 // Briefly had banker's rounding for floats, but then I realized that the only
1686 // float-printing function calls are --make-grm related, they all request 6-7
1687 // digits of precision, and at that point it's impossible to distinguish exact
1688 // 0.5-matches in the remainder.  So we just have generic rounding functions
1689 // here, with similar interfaces to the double-rounding functions to minimize
1690 // the need for separate reasoning about this code.
static inline uint32_t float_round(float fxx) {
  // Rounds nonnegative fxx to the nearest integer (exact halves round up).
  const int32_t rounded = (int32_t)(fxx + 0.5);
  return (uint32_t)rounded;
}
1694 
float_round1(float fxx,uint32_t * quotientp,uint32_t * remainderp)1695 static inline void float_round1(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
1696   uint32_t remainder = float_round(fxx * 10);
1697   *quotientp = remainder / 10;
1698   *remainderp = remainder - (*quotientp) * 10;
1699 }
1700 
float_round2(float fxx,uint32_t * quotientp,uint32_t * remainderp)1701 static inline void float_round2(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
1702   uint32_t remainder = float_round(fxx * 100);
1703   *quotientp = remainder / 100;
1704   *remainderp = remainder - (*quotientp) * 100;
1705 }
1706 
float_round3(float fxx,uint32_t * quotientp,uint32_t * remainderp)1707 static inline void float_round3(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
1708   uint32_t remainder = float_round(fxx * 1000);
1709   *quotientp = remainder / 1000;
1710   *remainderp = remainder - (*quotientp) * 1000;
1711 }
1712 
float_round4(float fxx,uint32_t * quotientp,uint32_t * remainderp)1713 static inline void float_round4(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
1714   uint32_t remainder = float_round(fxx * 10000);
1715   *quotientp = remainder / 10000;
1716   *remainderp = remainder - (*quotientp) * 10000;
1717 }
1718 
float_round5(float fxx,uint32_t * quotientp,uint32_t * remainderp)1719 static inline void float_round5(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
1720   uint32_t remainder = float_round(fxx * 100000);
1721   *quotientp = remainder / 100000;
1722   *remainderp = remainder - (*quotientp) * 100000;
1723 }
1724 
float_round6(float fxx,uint32_t * quotientp,uint32_t * remainderp)1725 static inline void float_round6(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
1726   uint32_t remainder = float_round(fxx * 1000000);
1727   *quotientp = remainder / 1000000;
1728   *remainderp = remainder - (*quotientp) * 1000000;
1729 }
1730 
char* ftoa_so6(float fxx, char* start) {
  // 6-significant-figure float counterpart of dtoa_so6(); returns a pointer
  // just past the last character written (not null-terminated), with
  // trailing fractional zeroes dropped.
  uint32_t uii;
  uint32_t quotient;
  uint32_t remainder;
  // difference between consecutive floats near 10 can be as large as
  // 10 * 2^{-23}, which is just under 1.2e-6.  So, to avoid printing an extra
  // digit, we have to set this bound to be robust to an addition error of size
  // 6e-7.
  // (possible todo: just brute-force test this on all <2^32 possible floats
  // and look for a better threshold)
  if (fxx < 99.999944) {
    if (fxx < 9.9999944) {
      // 1 integer digit + up to 5 fractional digits
      float_round5(fxx, &quotient, &remainder);
      return qrtoa_1p5(quotient, remainder, start);
    }
    // 2 integer digits + up to 4 fractional digits
    float_round4(fxx, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    quotient = remainder / 100;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    remainder -= 100 * quotient;
    if (remainder) {
      start += 2;
    ftoa_so6_pretail:
      // write the final fractional digit pair
      memcpy(start, &(digit2_table[remainder * 2]), 2);
    }
  ftoa_so6_tail:
    // drop a trailing zero in the last pair written
    if (start[1] != '0') {
      return &(start[2]);
    }
    return &(start[1]);
  } else if (fxx < 9999.9944) {
    if (fxx < 999.99944) {
      // 3 integer digits + up to 3 fractional digits
      float_round3(fxx, &uii, &remainder);
      quotient = uii / 100;
      *start = '0' + quotient;
      quotient = uii - 100 * quotient;
      start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
      if (!remainder) {
	return start;
      }
      *start++ = '.';
      quotient = remainder / 10;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= quotient * 10;
      if (!remainder) {
        goto ftoa_so6_tail;
      }
      start[2] = '0' + remainder;
      return &(start[3]);
    }
    // 4 integer digits + up to 2 fractional digits
    float_round2(fxx, &uii, &remainder);
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    quotient = uii - (100 * quotient);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    goto ftoa_so6_pretail;
  } else if (fxx < 99999.944) {
    // 5 integer digits + up to 1 fractional digit
    float_round1(fxx, &uii, &remainder);
    quotient = uii / 10000;
    *start = '0' + quotient;
    uii -= 10000 * quotient;
    quotient = uii / 100;
    start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
    uii = uii - 100 * quotient;
    start = memcpya(start, &(digit2_table[uii * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start = '.';
    start[1] = '0' + remainder;
    return &(start[2]);
  } else {
    // 6 integer digits, no fraction
    return uitoa_z6(float_round(fxx), start);
  }
}
1814 
dtoa_so2(double dxx,char * start)1815 char* dtoa_so2(double dxx, char* start) {
1816   // 2 sig fig number, 0.95 <= dxx < 99.5
1817   uint32_t quotient;
1818   uint32_t remainder;
1819   if (dxx < 9.9499999999999) {
1820     double_bround1(dxx, banker_round12, &quotient, &remainder);
1821     return qrtoa_1p1(quotient, remainder, start);
1822   }
1823   return memcpya(start, &(digit2_table[(double_bround(dxx, banker_round12)) * 2]), 2);
1824 }
1825 
// Writes dxx with 3 significant figures to start; caller guarantees
// 0.995 <= dxx < 999.5.  Returns one past the last written character;
// trailing fractional zeroes are not written.
char* dtoa_so3(double dxx, char* start) {
  // 3 sig fig number, 0.995 <= dxx < 999.5
  uint32_t quotient;
  uint32_t remainder;
  if (dxx < 99.949999999999) {
    if (dxx < 9.9949999999999) {
      // d.dd
      double_bround2(dxx, banker_round11, &quotient, &remainder);
      return qrtoa_1p2(quotient, remainder, start);
    }
    // dd.d
    double_bround1(dxx, banker_round11, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
  } else {
    // ddd: write the leading pair, leave the last digit for the shared tail
    quotient = double_bround(dxx, banker_round11);
    start = memcpya(start, &(digit2_table[(quotient / 10) * 2]), 2);
    remainder = quotient % 10;
  }
  // shared tail: final digit (fractional in the dd.d case, integer otherwise)
  *start = '0' + remainder;
  return &(start[1]);
}
1849 
// Writes dxx with 4 significant figures to start; caller guarantees
// 0.9995 <= dxx < 9999.5.  Returns one past the last written character;
// trailing fractional zeroes are not written.
char* dtoa_so4(double dxx, char* start) {
  // 4 sig fig number, 0.9995 <= dxx < 9999.5
  uint32_t uii;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx < 99.994999999999) {
    if (dxx < 9.9994999999999) {
      // d.ddd
      double_bround3(dxx, banker_round10, &quotient, &remainder);
      return qrtoa_1p3(quotient, remainder, start);
    }
    // dd.dd
    double_bround2(dxx, banker_round10, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    memcpy(start, &(digit2_table[remainder * 2]), 2);
    // strip a single trailing '0' from the fraction
    if (start[1] != '0') {
      return &(start[2]);
    }
    return &(start[1]);
  } else if (dxx < 999.94999999999) {
    // ddd.d
    double_bround1(dxx, banker_round10, &uii, &remainder);
    quotient = uii / 100;
    *start = '0' + quotient;
    quotient = uii - 100 * quotient;
    start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start = '.';
    start[1] = '0' + remainder;
    return &(start[2]);
  } else {
    // dddd: all four digits are integer digits
    uitoa_z4(double_bround(dxx, banker_round10), start);
    return &(start[4]);
  }
}
1888 
// Writes dxx with 8 significant figures to start; caller guarantees
// 0.99999995 <= dxx < 99999999.5.  Returns one past the last written
// character; trailing fractional zeroes are not written.  The labels are
// shared digit-emission tails entered from multiple magnitude branches.
char* dtoa_so8(double dxx, char* start) {
  // 8 sig fig number, 0.99999995 <= dxx < 99999999.5
  uint32_t uii;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx < 99.999999499999) {
    if (dxx < 9.9999999499999) {
      // d.ddddddd
      double_bround7(dxx, banker_round6, &quotient, &remainder);
      return qrtoa_1p7(quotient, remainder, start);
    }
    // dd.dddddd
    double_bround6(dxx, banker_round6, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    quotient = remainder / 10000;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    remainder -= 10000 * quotient;
    if (remainder) {
      start += 2;
    dtoa_so8_pretail4:
      // emit next 2 of the remaining 4 fractional digits
      quotient = remainder / 100;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= 100 * quotient;
      if (remainder) {
	start += 2;
      dtoa_so8_pretail2:
        // emit the final 2 fractional digits
        memcpy(start, &(digit2_table[remainder * 2]), 2);
      }
    }
  dtoa_so8_tail:
    // strip a single trailing '0' from the two digits just written
    if (start[1] != '0') {
      return &(start[2]);
    }
    return &(start[1]);
  } else if (dxx < 9999.9999499999) {
    if (dxx < 999.99999499999) {
      // ddd.ddddd
      double_bround5(dxx, banker_round6, &uii, &remainder);
      quotient = uii / 100;
      *start++ = '0' + quotient;
      quotient = uii - 100 * quotient;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
      if (!remainder) {
	return start;
      }
      *start++ = '.';
      quotient = remainder / 1000;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= quotient * 1000;
      if (!remainder) {
        goto dtoa_so8_tail;
      }
      start += 2;
    dtoa_so8_pretail3:
      // emit 2 of the remaining 3 fractional digits, then the last one
      quotient = remainder / 10;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= quotient * 10;
      if (!remainder) {
	goto dtoa_so8_tail;
      }
      start[2] = '0' + remainder;
      return &(start[3]);
    }
    // dddd.dddd
    double_bround4(dxx, banker_round6, &uii, &remainder);
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    quotient = uii - (100 * quotient);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    goto dtoa_so8_pretail4;
  } else if (dxx < 999999.99499999) {
    if (dxx < 99999.999499999) {
      // ddddd.ddd
      double_bround3(dxx, banker_round6, &uii, &remainder);
      quotient = uii / 10000;
      *start = '0' + quotient;
      uii -= 10000 * quotient;
      quotient = uii / 100;
      start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
      uii -= 100 * quotient;
      start = memcpya(start, &(digit2_table[uii * 2]), 2);
      if (!remainder) {
	return start;
      }
      *start++ = '.';
      goto dtoa_so8_pretail3;
    }
    // dddddd.dd
    double_bround2(dxx, banker_round6, &uii, &remainder);
    quotient = uii / 10000;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uii -= 10000 * quotient;
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uii -= 100 * quotient;
    start = memcpya(start, &(digit2_table[uii * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    goto dtoa_so8_pretail2;
  } else if (dxx < 9999999.9499999) {
    // ddddddd.d
    double_bround1(dxx, banker_round6, &uii, &remainder);
    quotient = uii / 1000000;
    *start = '0' + quotient;
    uii -= 1000000 * quotient;
    quotient = uii / 10000;
    start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
    uii -= 10000 * quotient;
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uii -= 100 * quotient;
    start = memcpya(start, &(digit2_table[uii * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start = '.';
    start[1] = '0' + remainder;
    return &(start[2]);
  } else {
    // all 8 digits are integer digits
    return uitoa_z8(double_bround(dxx, banker_round6), start);
  }
}
2014 
dtoa_e(double dxx,char * start)2015 char* dtoa_e(double dxx, char* start) {
2016   uint32_t xp10 = 0;
2017   uint32_t quotient;
2018   uint32_t remainder;
2019   char sign;
2020   if (dxx != dxx) {
2021     // do this first to avoid generating exception
2022     return memcpyl3a(start, "nan");
2023   } else if (dxx < 0) {
2024     *start++ = '-';
2025     dxx = -dxx;
2026   }
2027   if (dxx >= 9.9999994999999e-1) {
2028     if (dxx >= 9.9999994999999e7) {
2029       if (dxx >= 9.9999994999999e127) {
2030 	if (dxx == INFINITY) {
2031 	  return memcpyl3a(start, "inf");
2032 	} else if (dxx >= 9.9999994999999e255) {
2033 	  dxx *= 1.0e-256;
2034 	  xp10 |= 256;
2035 	} else {
2036 	  dxx *= 1.0e-128;
2037 	  xp10 |= 128;
2038 	}
2039       }
2040       if (dxx >= 9.9999994999999e63) {
2041 	dxx *= 1.0e-64;
2042 	xp10 |= 64;
2043       }
2044       if (dxx >= 9.9999994999999e31) {
2045 	dxx *= 1.0e-32;
2046 	xp10 |= 32;
2047       }
2048       if (dxx >= 9.9999994999999e15) {
2049 	dxx *= 1.0e-16;
2050 	xp10 |= 16;
2051       }
2052       if (dxx >= 9.9999994999999e7) {
2053 	dxx *= 1.0e-8;
2054 	xp10 |= 8;
2055       }
2056     }
2057     if (dxx >= 9.9999994999999e3) {
2058       dxx *= 1.0e-4;
2059       xp10 |= 4;
2060     }
2061     if (dxx >= 9.9999994999999e1) {
2062       dxx *= 1.0e-2;
2063       xp10 |= 2;
2064     }
2065     if (dxx >= 9.9999994999999) {
2066       dxx *= 1.0e-1;
2067       xp10++;
2068     }
2069     sign = '+';
2070   } else {
2071     if (dxx < 9.9999994999999e-8) {
2072       // general case
2073       if (dxx < 9.9999994999999e-128) {
2074 	if (dxx == 0.0) {
2075 	  return memcpya(start, "0.000000e+00", 12);
2076 	}
2077 	if (dxx < 9.9999994999999e-256) {
2078 	  dxx *= 1.0e256;
2079 	  xp10 |= 256;
2080 	} else {
2081 	  dxx *= 1.0e128;
2082 	  xp10 |= 128;
2083 	}
2084       }
2085       if (dxx < 9.9999994999999e-64) {
2086 	dxx *= 1.0e64;
2087 	xp10 |= 64;
2088       }
2089       if (dxx < 9.9999994999999e-32) {
2090 	dxx *= 1.0e32;
2091 	xp10 |= 32;
2092       }
2093       if (dxx < 9.9999994999999e-16) {
2094 	dxx *= 1.0e16;
2095 	xp10 |= 16;
2096       }
2097       if (dxx < 9.9999994999999e-8) {
2098 	dxx *= 100000000;
2099 	xp10 |= 8;
2100       }
2101     }
2102     if (dxx < 9.999994999999e-4) {
2103       dxx *= 10000;
2104       xp10 |= 4;
2105     }
2106     if (dxx < 9.9999994999999e-2) {
2107       dxx *= 100;
2108       xp10 |= 2;
2109     }
2110     if (dxx < 9.9999994999999e-1) {
2111       dxx *= 10;
2112       xp10++;
2113     }
2114     sign = '-';
2115   }
2116   double_bround6(dxx, banker_round7, &quotient, &remainder);
2117   *start++ = '0' + quotient;
2118   *start++ = '.';
2119   start = uitoa_z6(remainder, start);
2120   *start++ = 'e';
2121   *start++ = sign;
2122   if (xp10 >= 100) {
2123     quotient = xp10 / 100;
2124     *start++ = '0' + quotient;
2125     xp10 -= quotient * 100;
2126   }
2127   return memcpya(start, &(digit2_table[xp10 * 2]), 2);
2128 }
2129 
// Float analogue of dtoa_e(): writes fxx in exponential notation
// ("d.ddddddesXX"), handling nan/inf/0/negatives.  Returns one past the
// last written character.  Float exponents never reach 100, so no
// three-digit-exponent handling is needed here.
char* ftoa_e(float fxx, char* start) {
  uint32_t xp10 = 0;
  uint32_t quotient;
  uint32_t remainder;
  char sign;
  if (fxx != fxx) {
    // do this first to avoid generating exception
    return memcpyl3a(start, "nan");
  } else if (fxx < 0) {
    *start++ = '-';
    fxx = -fxx;
  }
  if (fxx >= 9.9999995e-1) {
    // nonnegative exponent: scale down into [1, 10), accumulating xp10 bits
    if (fxx >= 9.9999995e15) {
      if (fxx == INFINITY) {
	return memcpyl3a(start, "inf");
      } else if (fxx >= 9.9999995e31) {
	fxx *= 1.0e-32;
	xp10 |= 32;
      } else {
	fxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (fxx >= 9.9999995e7) {
      fxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (fxx >= 9.9999995e3) {
      fxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (fxx >= 9.9999995e1) {
      fxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (fxx >= 9.9999995) {
      fxx *= 1.0e-1;
      xp10++;
    }
    sign = '+';
  } else {
    // negative exponent: scale up into [1, 10)
    if (fxx < 9.9999995e-16) {
      if (fxx == 0.0) {
	return memcpya(start, "0.000000e+00", 12);
      } else if (fxx < 9.9999995e-32) {
	fxx *= 1.0e32;
	xp10 |= 32;
      } else {
	fxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (fxx < 9.9999995e-8) {
      fxx *= 100000000;
      xp10 |= 8;
    }
    if (fxx < 9.9999995e-4) {
      fxx *= 10000;
      xp10 |= 4;
    }
    if (fxx < 9.9999995e-2) {
      fxx *= 100;
      xp10 |= 2;
    }
    if (fxx < 9.9999995e-1) {
      fxx *= 10;
      xp10++;
    }
    sign = '-';
  }
  // fxx is now in [1, 10); emit mantissa digit, 6 fractional digits, exponent
  float_round6(fxx, &quotient, &remainder);
  *start++ = '0' + quotient;
  *start++ = '.';
  start = uitoa_z6(remainder, start);
  *start++ = 'e';
  *start++ = sign;
  return memcpya(start, &(digit2_table[xp10 * 2]), 2);
}
2209 
// "%.2f"-equivalent: writes dxx with exactly 2 decimal places, handling
// nan/inf/negatives.  Hand-optimized for |dxx| < ~10^7; larger magnitudes
// are punted to sprintf.  Returns one past the last written character.
char* dtoa_f_p2(double dxx, char* start) {
  const double* br_ptr;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    return memcpyl3a(start, "nan");
  } else if (dxx < 9.9949999999999) {
    if (dxx < 0) {
      *start++ = '-';
      dxx = -dxx;
      if (dxx >= 9.9949999999999) {
        // magnitude has >= 2 integer digits after negation
        goto dtoa_f_p2_10;
      }
    }
    // single integer digit
    double_bround2(dxx, banker_round11, &quotient, &remainder);
    *start++ = '0' + quotient;
  dtoa_f_p2_dec:
    // shared tail: '.' plus exactly two fractional digits
    *start++ = '.';
    return memcpya(start, &(digit2_table[remainder * 2]), 2);
  }
 dtoa_f_p2_10:
  if (dxx < 9999999.9949999) {
    // pick the banker-rounding table matching the number of integer digits,
    // so total precision stays within double's reliable range
    if (dxx < 999.99499999999) {
      if (dxx < 99.994999999999) {
	br_ptr = banker_round10;
      } else {
        br_ptr = banker_round9;
      }
    } else if (dxx < 99999.994999999) {
      if (dxx < 9999.9949999999) {
	br_ptr = banker_round8;
      } else {
	br_ptr = banker_round7;
      }
    } else if (dxx < 999999.99499999) {
      br_ptr = banker_round6;
    } else {
      br_ptr = banker_round5;
    }
    double_bround2(dxx, br_ptr, &quotient, &remainder);
    start = uint32toa(quotient, start);
    goto dtoa_f_p2_dec;
  }
  if (dxx == INFINITY) {
    return memcpyl3a(start, "inf");
  }
  // just punt larger numbers to glibc for now, this isn't a bottleneck
  start += sprintf(start, "%.2f", dxx);
  return start;
}
2260 
// "%.3f"-equivalent: writes dxx with exactly 3 decimal places, handling
// nan/inf/negatives.  Hand-optimized for |dxx| < ~10^6; larger magnitudes
// are punted to sprintf.  Returns one past the last written character.
char* dtoa_f_p3(double dxx, char* start) {
  const double* br_ptr;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    return memcpyl3a(start, "nan");
  } else if (dxx < 9.9994999999999) {
    if (dxx < 0) {
      *start++ = '-';
      dxx = -dxx;
      if (dxx >= 9.9994999999999) {
        // magnitude has >= 2 integer digits after negation
        goto dtoa_f_p3_10;
      }
    }
    // single integer digit
    double_bround3(dxx, banker_round10, &quotient, &remainder);
    *start++ = '0' + quotient;
  dtoa_f_p3_dec:
    // shared tail: '.' plus exactly three fractional digits
    *start++ = '.';
    quotient = remainder / 100;
    remainder -= 100 * quotient;
    *start++ = '0' + quotient;
    return memcpya(start, &(digit2_table[remainder * 2]), 2);
  }
 dtoa_f_p3_10:
  if (dxx < 999999.99949999) {
    // pick the banker-rounding table matching the number of integer digits
    if (dxx < 999.99949999999) {
      if (dxx < 99.999499999999) {
	br_ptr = banker_round9;
      } else {
        br_ptr = banker_round8;
      }
    } else if (dxx < 99999.999499999) {
      if (dxx < 9999.9994999999) {
	br_ptr = banker_round7;
      } else {
	br_ptr = banker_round6;
      }
    } else {
      br_ptr = banker_round5;
    }
    double_bround3(dxx, br_ptr, &quotient, &remainder);
    start = uint32toa(quotient, start);
    goto dtoa_f_p3_dec;
  }
  if (dxx == INFINITY) {
    return memcpyl3a(start, "inf");
  }
  // punt larger numbers to the libc formatter; not a bottleneck
  start += sprintf(start, "%.3f", dxx);
  return start;
}
2311 
// "%9.6f"-equivalent: writes dxx right-padded into (at least) width 9 with
// exactly 6 decimal places; nan/inf are space-padded to width 9.  Values
// with one integer digit get a leading space (or '-').  Returns one past
// the last written character.
char* dtoa_f_w9p6(double dxx, char* start) {
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    return memcpya(start, "      nan", 9);
  } else if (dxx < 9.9999994999999) {
    if (dxx < 0) {
      *start++ = '-';
      dxx = -dxx;
      if (dxx >= 9.9999994999999) {
        // magnitude has >= 2 integer digits after negation
	goto dtoa_f_w9p6_10;
      }
    } else {
      // pad single-integer-digit values to the full field width
      *start++ = ' ';
    }
    double_bround6(dxx, banker_round7, &quotient, &remainder);
    *start++ = '0' + quotient;
  dtoa_f_w9p6_dec:
    // shared tail: '.' plus exactly six fractional digits
    *start++ = '.';
    return uitoa_z6(remainder, start);
  }
 dtoa_f_w9p6_10:
  if (dxx < 999.99999949999) {
    // 2-3 integer digits; choose rounding table by magnitude
    double_bround6(dxx, (dxx < 99.999999499999)? banker_round6 : banker_round5, &quotient, &remainder);
    start = uint32toa(quotient, start);
    goto dtoa_f_w9p6_dec;
  }
  if (dxx == INFINITY) {
    return memcpya(start, "      inf", 9);
  }
  // punt larger numbers to the libc formatter; not a bottleneck
  start += sprintf(start, "%.6f", dxx);
  return start;
}
2345 
// "%7.4f"-equivalent: writes dxx right-padded into (at least) width 7 with
// exactly 4 decimal places; nan/inf are space-padded to width 7.  Values
// with one integer digit get a leading space (or '-').  Returns one past
// the last written character.
char* dtoa_f_w7p4(double dxx, char* start) {
  const double* br_ptr;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    return memcpya(start, "    nan", 7);
  } else if (dxx < 9.9999499999999) {
    if (dxx < 0) {
      *start++ = '-';
      dxx = -dxx;
      if (dxx >= 9.9999499999999) {
        // magnitude has >= 2 integer digits after negation
	goto dtoa_f_w7p4_10;
      }
    } else {
      // pad single-integer-digit values to the full field width
      *start++ = ' ';
    }
    double_bround4(dxx, banker_round9, &quotient, &remainder);
    *start++ = '0' + quotient;
  dtoa_f_w7p4_dec:
    // shared tail: '.' plus exactly four fractional digits
    *start++ = '.';
    quotient = remainder / 100;
    remainder -= 100 * quotient;
    return memcpya(memcpya(start, &(digit2_table[quotient * 2]), 2), &(digit2_table[remainder * 2]), 2);
  }
 dtoa_f_w7p4_10:
  if (dxx < 99999.999949999) {
    // pick the banker-rounding table matching the number of integer digits
    if (dxx < 999.99994999999) {
      if (dxx < 99.999949999999) {
	br_ptr = banker_round8;
      } else {
	br_ptr = banker_round7;
      }
    } else if (dxx < 9999.9999499999) {
      br_ptr = banker_round6;
    } else {
      br_ptr = banker_round5;
    }
    double_bround4(dxx, br_ptr, &quotient, &remainder);
    start = uint32toa(quotient, start);
    goto dtoa_f_w7p4_dec;
  }
  if (dxx == INFINITY) {
    return memcpya(start, "    inf", 7);
  }
  // punt larger numbers to the libc formatter; not a bottleneck
  start += sprintf(start, "%.4f", dxx);
  return start;
}
2393 
// Prettier fixed-width decimal: removes trailing zero(es) if and only if the
// match appears to be exact.
// Does not detect exact matches when abs(dxx) > 2^31 / 10^5.
char* dtoa_f_w9p6_spaced(double dxx, char* start) {
  const double scaled = dxx * 100000 + 0.00000005;
  char* write_end = dtoa_f_w9p6(dxx, start);
  if (scaled - ((double)((int32_t)scaled)) >= 0.0000001) {
    // residue too large: not an exact multiple of 10^{-5}, keep all digits
    return write_end;
  }
  // appears exact; blank the trailing zeroes, preserving the column width
  trailing_zeroes_to_spaces(write_end);
  return write_end;
}
2406 
// Same exactness test as dtoa_f_w9p6_spaced(), but trailing zeroes are
// clipped (removed) rather than replaced with spaces.
char* dtoa_f_w9p6_clipped(double dxx, char* start) {
  const double scaled = dxx * 100000 + 0.00000005;
  char* write_end = dtoa_f_w9p6(dxx, start);
  if (scaled - ((double)((int32_t)scaled)) >= 0.0000001) {
    // residue too large: not an exact multiple of 10^{-5}, keep all digits
    return write_end;
  }
  return clip_trailing_zeroes(write_end);
}
2416 
// "%g"-style formatter with 6 significant figures: picks exponential
// notation for very small/large magnitudes, fixed decimal otherwise, and
// never writes trailing fractional zeroes.  Handles nan/inf/0/negatives.
// Returns one past the last written character.
char* dtoa_g(double dxx, char* start) {
  uint32_t xp10 = 0;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    return memcpyl3a(start, "nan");
  } else if (dxx < 0) {
    *start++ = '-';
    dxx = -dxx;
  }
  if (dxx < 9.9999949999999e-5) {
    // 6 sig fig exponential notation, small
    // (exponent extracted via power-of-two decimal scalings into xp10)
    if (dxx < 9.9999949999999e-16) {
      if (dxx < 9.9999949999999e-128) {
	if (dxx == 0.0) {
	  *start = '0';
	  return &(start[1]);
	} else if (dxx < 9.9999949999999e-256) {
	  dxx *= 1.0e256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e128;
	  xp10 |= 128;
	}
      }
      if (dxx < 9.9999949999999e-64) {
	dxx *= 1.0e64;
	xp10 |= 64;
      }
      if (dxx < 9.9999949999999e-32) {
	dxx *= 1.0e32;
	xp10 |= 32;
      }
      if (dxx < 9.9999949999999e-16) {
	dxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (dxx < 9.9999949999999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9999949999999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9999949999999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9999949999999e-1) {
      dxx *= 10;
      xp10++;
    }
    double_bround5(dxx, banker_round8, &quotient, &remainder);
    start = memcpya(qrtoa_1p5(quotient, remainder, start), "e-", 2);
    if (xp10 >= 100) {
      // three-digit exponent
      quotient = xp10 / 100;
      *start++ = '0' + quotient;
      xp10 -= 100 * quotient;
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 999999.49999999) {
    // 6 sig fig exponential notation, large
    if (dxx >= 9.9999949999999e15) {
      if (dxx >= 9.9999949999999e127) {
	if (dxx == INFINITY) {
	  return memcpyl3a(start, "inf");
	} else if (dxx >= 9.9999949999999e255) {
	  dxx *= 1.0e-256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e-128;
	  xp10 |= 128;
	}
      }
      if (dxx >= 9.9999949999999e63) {
	dxx *= 1.0e-64;
	xp10 |= 64;
      }
      if (dxx >= 9.9999949999999e31) {
	dxx *= 1.0e-32;
	xp10 |= 32;
      }
      if (dxx >= 9.9999949999999e15) {
	dxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (dxx >= 9.9999949999999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9999949999999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9999949999999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9999949999999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    double_bround5(dxx, banker_round8, &quotient, &remainder);
    start = memcpya(qrtoa_1p5(quotient, remainder, start), "e+", 2);
    if (xp10 >= 100) {
      // three-digit exponent
      quotient = xp10 / 100;
      *start++ = '0' + quotient;
      xp10 -= 100 * quotient;
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 0.99999949999999) {
    // fixed decimal with integer part, delegate to the 6-sig-fig writer
    return dtoa_so6(dxx, start);
  } else {
    // 6 sig fig decimal, no less than ~0.0001
    start = memcpya(start, "0.", 2);
    if (dxx < 9.9999949999999e-3) {
      dxx *= 100;
      start = memcpya(start, "00", 2);
    }
    if (dxx < 9.9999949999999e-2) {
      dxx *= 10;
      *start++ = '0';
    }
    return uitoa_trunc6(double_bround(dxx * 1000000, banker_round8), start);
  }
}
2546 
// Float analogue of dtoa_g(): "%g"-style, 6 significant figures, no trailing
// fractional zeroes.  Handles nan/inf/0/negatives.  Float exponents never
// reach 100, so only two exponent digits are written.  Returns one past the
// last written character.
char* ftoa_g(float fxx, char* start) {
  uint32_t xp10 = 0;
  uint32_t quotient;
  uint32_t remainder;
  if (fxx != fxx) {
    return memcpyl3a(start, "nan");
  } else if (fxx < 0) {
    *start++ = '-';
    fxx = -fxx;
  }
  if (fxx < 9.9999944e-5) {
    // 6 sig fig exponential notation, small
    if (fxx < 9.9999944e-16) {
      if (fxx == 0.0) {
	*start = '0';
	return &(start[1]);
      } else if (fxx < 9.9999944e-32) {
	fxx *= 1.0e32;
	xp10 |= 32;
      } else {
	fxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (fxx < 9.9999944e-8) {
      fxx *= 100000000;
      xp10 |= 8;
    }
    if (fxx < 9.9999944e-4) {
      fxx *= 10000;
      xp10 |= 4;
    }
    if (fxx < 9.9999944e-2) {
      fxx *= 100;
      xp10 |= 2;
    }
    if (fxx < 9.9999944e-1) {
      fxx *= 10;
      xp10++;
    }
    float_round5(fxx, &quotient, &remainder);
    return memcpya(memcpya(qrtoa_1p5(quotient, remainder, start), "e-", 2), &(digit2_table[xp10 * 2]), 2);
  } else if (fxx >= 999999.44) {
    // 6 sig fig exponential notation, large
    if (fxx >= 9.9999944e15) {
      if (fxx == INFINITY) {
	return memcpyl3a(start, "inf");
      } else if (fxx >= 9.9999944e31) {
	fxx *= 1.0e-32;
	xp10 |= 32;
      } else {
	fxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (fxx >= 9.9999944e7) {
      fxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (fxx >= 9.9999944e3) {
      fxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (fxx >= 9.9999944e1) {
      fxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (fxx >= 9.9999944e0) {
      fxx *= 1.0e-1;
      xp10++;
    }
    float_round5(fxx, &quotient, &remainder);
    return memcpya(memcpya(qrtoa_1p5(quotient, remainder, start), "e+", 2), &(digit2_table[xp10 * 2]), 2);
  } else if (fxx >= 0.99999944) {
    // fixed decimal with integer part, delegate to the 6-sig-fig writer
    return ftoa_so6(fxx, start);
  } else {
    // 6 sig fig decimal, no less than ~0.0001
    start = memcpya(start, "0.", 2);
    if (fxx < 9.9999944e-3) {
      fxx *= 100;
      start = memcpya(start, "00", 2);
    }
    if (fxx < 9.9999944e-2) {
      fxx *= 10;
      *start++ = '0';
    }
    return uitoa_trunc6(float_round(fxx * 1000000), start);
  }
}
2634 
// Like dtoa_g() but with 2 significant figures, right-justified in a field
// of at least min_width characters (space-padded on the left; longer output
// is not truncated).  The digits are first rendered into wbuf so the final
// length is known before padding.  Caller guarantees min_width >= 5.
// Returns one past the last written character.
char* dtoa_g_wxp2(double dxx, uint32_t min_width, char* start) {
  assert(min_width >= 5);
  uint32_t xp10 = 0;
  char wbuf[16];
  char* wpos = wbuf;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    memcpy(memseta(start, 32, min_width - 4), " nan", 4);
    return &(start[min_width]);
  } else if (dxx < 0) {
    *wpos++ = '-';
    dxx = -dxx;
  }
  if (dxx < 9.9499999999999e-5) {
    // 2 sig fig exponential notation, small
    if (dxx < 9.9499999999999e-16) {
      if (dxx < 9.9499999999999e-128) {
	if (dxx == 0.0) {
          memset(start, 32, min_width - 1);
	  start[min_width - 1] = '0';
	  return &(start[min_width]);
        } else if (dxx < 9.9499999999999e-256) {
	  dxx *= 1.0e256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e128;
	  xp10 |= 128;
	}
      }
      if (dxx < 9.9499999999999e-64) {
	dxx *= 1.0e64;
	xp10 |= 64;
      }
      if (dxx < 9.9499999999999e-32) {
	dxx *= 1.0e32;
	xp10 |= 32;
      }
      if (dxx < 9.9499999999999e-16) {
	dxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (dxx < 9.9499999999999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9499999999999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9499999999999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9499999999999e-1) {
      dxx *= 10;
      xp10++;
    }
    double_bround1(dxx, banker_round12, &quotient, &remainder);
    wpos = qrtoa_1p1(quotient, remainder, wpos);
    // remainder now reused as the mantissa length in wbuf
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      // 5 chars of exponent suffix ("e-ddd")
      if (remainder < min_width - 5) {
	memcpy(memseta(start, 32, min_width - 5 - remainder), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e-", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      // 4 chars of exponent suffix ("e-dd")
      if (remainder < min_width - 4) {
	memcpy(memseta(start, 32, min_width - 4 - remainder), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e-", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 99.499999999999) {
    // 2 sig fig exponential notation, large
    if (dxx >= 9.9499999999999e15) {
      if (dxx >= 9.9499999999999e127) {
	if (dxx == INFINITY) {
	  start = memseta(start, 32, min_width - 4);
	  // wpos != wbuf iff a '-' was recorded above
	  if (wpos == wbuf) {
	    return memcpya(start, " inf", 4);
	  } else {
	    return memcpya(start, "-inf", 4);
	  }
	} else if (dxx >= 9.9499999999999e255) {
	  dxx *= 1.0e-256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e-128;
	  xp10 |= 128;
	}
      }
      if (dxx >= 9.9499999999999e63) {
	dxx *= 1.0e-64;
	xp10 |= 64;
      }
      if (dxx >= 9.9499999999999e31) {
	dxx *= 1.0e-32;
	xp10 |= 32;
      }
      if (dxx >= 9.9499999999999e15) {
	dxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (dxx >= 9.9499999999999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9499999999999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9499999999999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9499999999999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    double_bround1(dxx, banker_round12, &quotient, &remainder);
    wpos = qrtoa_1p1(quotient, remainder, wpos);
    // remainder now reused as the mantissa length in wbuf
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      // 5 chars of exponent suffix ("e+ddd")
      if (remainder < min_width - 5) {
	memcpy(memseta(start, 32, min_width - 5 - remainder), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e+", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      // 4 chars of exponent suffix ("e+dd")
      if (remainder < min_width - 4) {
	memcpy(memseta(start, 32, min_width - 4 - remainder), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e+", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else {
    // fixed decimal notation
    if (dxx >= 0.99499999999999) {
      wpos = dtoa_so2(dxx, wpos);
    } else {
      // 2 sig fig decimal, no less than ~0.0001
      wpos = memcpya(wpos, "0.", 2);
      if (dxx < 9.9499999999999e-3) {
	dxx *= 100;
	wpos = memcpya(wpos, "00", 2);
      }
      if (dxx < 9.9499999999999e-2) {
	dxx *= 10;
	*wpos++ = '0';
      }
      wpos = uitoa_trunc2(double_bround(dxx * 100, banker_round12), wpos);
    }
    // left-pad with spaces up to min_width
    remainder = wpos - wbuf;
    if (remainder < min_width) {
      memcpy(memseta(start, 32, min_width - remainder), wbuf, remainder);
      return &(start[min_width]);
    } else {
      return memcpya(start, wbuf, remainder);
    }
  }
}
2813 
char* dtoa_g_wxp3(double dxx, uint32_t min_width, char* start) {
  // %g-style rendering of dxx with 3 significant figures: scientific
  // notation outside roughly [0.001, 999.5), fixed-point inside, left-padded
  // with spaces to at least min_width bytes.  Returns one past the last byte
  // written; no null terminator is appended.  min_width must be >= 5 so the
  // "e[+-]NN" suffix plus at least one significand byte always fits.
  assert(min_width >= 5);
  uint32_t xp10 = 0;  // absolute value of the decimal exponent
  char wbuf[16];      // staging buffer for optional '-' plus significand
  char* wpos = wbuf;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN renders as right-justified " nan".
    memcpy(memseta(start, 32, min_width - 4), " nan", 4);
    return &(start[min_width]);
  } else if (dxx < 0) {
    *wpos++ = '-';
    dxx = -dxx;
  }
  // The comparison constants sit just below the 3-sig-fig round-up
  // boundaries (9.995 * 10^k) so values that will banker-round upward are
  // pre-scaled into the next bucket.
  if (dxx < 9.9949999999999e-5) {
    // 3 sig fig exponential notation, small
    // Binary-search the exponent: each step scales dxx toward [1, 10) and
    // ORs the corresponding power of ten into xp10.
    if (dxx < 9.9949999999999e-16) {
      if (dxx < 9.9949999999999e-128) {
	if (dxx == 0.0) {
          // exact zero: right-justified "0"
          memset(start, 32, min_width - 1);
	  start[min_width - 1] = '0';
	  return &(start[min_width]);
        } else if (dxx < 9.9949999999999e-256) {
	  dxx *= 1.0e256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e128;
	  xp10 |= 128;
	}
      }
      if (dxx < 9.9949999999999e-64) {
	dxx *= 1.0e64;
	xp10 |= 64;
      }
      if (dxx < 9.9949999999999e-32) {
	dxx *= 1.0e32;
	xp10 |= 32;
      }
      if (dxx < 9.9949999999999e-16) {
	dxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (dxx < 9.9949999999999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9949999999999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9949999999999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9949999999999e-1) {
      dxx *= 10;
      xp10++;
    }
    // dxx is now in [1, 10); banker-round to 1 integer + 2 fractional digits.
    double_bround2(dxx, banker_round11, &quotient, &remainder);
    wpos = qrtoa_1p2(quotient, remainder, wpos);
    // remainder is reused below as the significand's byte count
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      // 3-digit exponent: "e-NNN" suffix occupies 5 bytes
      if (remainder < min_width - 5) {
	memcpy(memseta(start, 32, min_width - 5 - remainder), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e-", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      // 2-digit exponent: "e-NN" suffix occupies 4 bytes
      if (remainder < min_width - 4) {
	memcpy(memseta(start, 32, min_width - 4 - remainder), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e-", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 999.49999999999) {
    // 3 sig fig exponential notation, large
    // Mirror image of the small case: divide down toward [1, 10).
    if (dxx >= 9.9949999999999e15) {
      if (dxx >= 9.9949999999999e127) {
	if (dxx == INFINITY) {
	  // wpos != wbuf iff a '-' was staged above
	  start = memseta(start, 32, min_width - 4);
	  if (wpos == wbuf) {
	    return memcpya(start, " inf", 4);
	  } else {
	    return memcpya(start, "-inf", 4);
	  }
	} else if (dxx >= 9.9949999999999e255) {
	  dxx *= 1.0e-256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e-128;
	  xp10 |= 128;
	}
      }
      if (dxx >= 9.9949999999999e63) {
	dxx *= 1.0e-64;
	xp10 |= 64;
      }
      if (dxx >= 9.9949999999999e31) {
	dxx *= 1.0e-32;
	xp10 |= 32;
      }
      if (dxx >= 9.9949999999999e15) {
	dxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (dxx >= 9.9949999999999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9949999999999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9949999999999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9949999999999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    double_bround2(dxx, banker_round11, &quotient, &remainder);
    wpos = qrtoa_1p2(quotient, remainder, wpos);
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      if (remainder < min_width - 5) {
	memcpy(memseta(start, 32, min_width - 5 - remainder), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e+", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      if (remainder < min_width - 4) {
	memcpy(memseta(start, 32, min_width - 4 - remainder), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e+", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else {
    // fixed-point range
    if (dxx >= 0.99949999999999) {
      wpos = dtoa_so3(dxx, wpos);
    } else {
      // 3 sig fig decimal, no less than ~0.001
      wpos = memcpya(wpos, "0.", 2);
      if (dxx < 9.9949999999999e-3) {
	dxx *= 100;
	wpos = memcpya(wpos, "00", 2);
      }
      if (dxx < 9.9949999999999e-2) {
	dxx *= 10;
	*wpos++ = '0';
      }
      wpos = uitoa_trunc3(double_bround(dxx * 1000, banker_round11), wpos);
    }
    remainder = wpos - wbuf;
    if (remainder < min_width) {
      memcpy(memseta(start, 32, min_width - remainder), wbuf, remainder);
      return &(start[min_width]);
    } else {
      return memcpya(start, wbuf, remainder);
    }
  }
}
2992 
char* dtoa_g_wxp4(double dxx, uint32_t min_width, char* start) {
  // %g-style rendering of dxx with 4 significant figures: scientific
  // notation outside roughly [0.0001, 9999.5), fixed-point inside,
  // left-padded with spaces to at least min_width bytes.  Returns one past
  // the last byte written; no null terminator.
  // NOTE(review): unlike dtoa_g_wxp3 there is no assert(min_width >= 5)
  // here; callers presumably guarantee a sufficient width (min_width == 0
  // would underflow the "min_width - 1" below) -- confirm.
  uint32_t xp10 = 0;  // absolute value of the decimal exponent
  char wbuf[16];      // staging buffer for optional '-' plus significand
  char* wpos = wbuf;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN renders as right-justified "nan"
    if (min_width > 3) {
      start = memseta(start, 32, min_width - 3);
    }
    return memcpyl3a(start, "nan");
  } else if (dxx < 0) {
    *wpos++ = '-';
    dxx = -dxx;
  }
  // Comparison constants sit just below the 4-sig-fig round-up boundaries
  // (9.9995 * 10^k) so values that will banker-round upward are pre-scaled.
  if (dxx < 9.9994999999999e-5) {
    // 4 sig fig exponential notation, small
    // Binary-search the exponent, scaling dxx toward [1, 10).
    if (dxx < 9.9994999999999e-16) {
      if (dxx < 9.9994999999999e-128) {
	if (dxx == 0.0) {
          // exact zero: right-justified "0"
          memset(start, 32, min_width - 1);
	  start[min_width - 1] = '0';
	  return &(start[min_width]);
        } else if (dxx < 9.9994999999999e-256) {
	  dxx *= 1.0e256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e128;
	  xp10 |= 128;
	}
      }
      if (dxx < 9.9994999999999e-64) {
	dxx *= 1.0e64;
	xp10 |= 64;
      }
      if (dxx < 9.9994999999999e-32) {
	dxx *= 1.0e32;
	xp10 |= 32;
      }
      if (dxx < 9.9994999999999e-16) {
	dxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (dxx < 9.9994999999999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9994999999999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9994999999999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9994999999999e-1) {
      dxx *= 10;
      xp10++;
    }
    // dxx now in [1, 10); banker-round to 1 integer + 3 fractional digits.
    double_bround3(dxx, banker_round10, &quotient, &remainder);
    wpos = qrtoa_1p3(quotient, remainder, wpos);
    // remainder is reused below as the significand's byte count
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      // 3-digit exponent: "e-NNN" occupies 5 bytes.  The "remainder + 5"
      // form avoids unsigned underflow for small min_width.
      if (remainder + 5 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 5)), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e-", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      // 2-digit exponent: "e-NN" occupies 4 bytes
      if (remainder + 4 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 4)), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e-", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 9999.4999999999) {
    // 4 sig fig exponential notation, large
    // Mirror image of the small case: divide down toward [1, 10).
    if (dxx >= 9.9994999999999e15) {
      if (dxx >= 9.9994999999999e127) {
	if (dxx == INFINITY) {
	  // wpos != wbuf iff a '-' was staged above
	  if (min_width > 4) {
	    start = memseta(start, 32, min_width - 4);
	  }
	  if (wpos == wbuf) {
	    return memcpya(start, " inf", 4);
	  } else {
	    return memcpya(start, "-inf", 4);
	  }
	} else if (dxx >= 9.9994999999999e255) {
	  dxx *= 1.0e-256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e-128;
	  xp10 |= 128;
	}
      }
      if (dxx >= 9.9994999999999e63) {
	dxx *= 1.0e-64;
	xp10 |= 64;
      }
      if (dxx >= 9.9994999999999e31) {
	dxx *= 1.0e-32;
	xp10 |= 32;
      }
      if (dxx >= 9.9994999999999e15) {
	dxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (dxx >= 9.9994999999999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9994999999999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9994999999999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9994999999999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    double_bround3(dxx, banker_round10, &quotient, &remainder);
    wpos = qrtoa_1p3(quotient, remainder, wpos);
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      if (remainder + 5 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 5)), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e+", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      if (remainder + 4 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 4)), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e+", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else {
    // fixed-point range
    if (dxx >= 0.99994999999999) {
      wpos = dtoa_so4(dxx, wpos);
    } else {
      // 4 sig fig decimal, no less than ~0.0001
      wpos = memcpya(wpos, "0.", 2);
      if (dxx < 9.9994999999999e-3) {
	dxx *= 100;
	wpos = memcpya(wpos, "00", 2);
      }
      if (dxx < 9.9994999999999e-2) {
	dxx *= 10;
	*wpos++ = '0';
      }
      wpos = uitoa_trunc4(double_bround(dxx * 10000, banker_round10), wpos);
    }
    remainder = wpos - wbuf;
    if (remainder < min_width) {
      memcpy(memseta(start, 32, min_width - remainder), wbuf, remainder);
      return &(start[min_width]);
    } else {
      return memcpya(start, wbuf, remainder);
    }
  }
}
3174 
char* dtoa_g_wxp8(double dxx, uint32_t min_width, char* start) {
  // %g-style rendering of dxx with 8 significant figures: scientific
  // notation outside roughly [0.0001, 99999999.5), fixed-point inside,
  // left-padded with spaces to at least min_width bytes.  Returns one past
  // the last byte written; no null terminator.
  // NOTE(review): like dtoa_g_wxp4, no assert on min_width; min_width == 0
  // would underflow "min_width - 1" in the zero branch -- confirm callers.
  uint32_t xp10 = 0;  // absolute value of the decimal exponent
  char wbuf[16];      // staging buffer for optional '-' plus significand
  char* wpos = wbuf;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN renders as right-justified "nan"
    if (min_width > 3) {
      start = memseta(start, 32, min_width - 3);
    }
    return memcpyl3a(start, "nan");
  } else if (dxx < 0) {
    *wpos++ = '-';
    dxx = -dxx;
  }
  // Comparison constants sit just below the 8-sig-fig round-up boundaries
  // (9.99999995 * 10^k) so values that will banker-round upward are
  // pre-scaled.
  if (dxx < 9.9999999499999e-5) {
    // 8 sig fig exponential notation, small
    // Binary-search the exponent, scaling dxx toward [1, 10).
    if (dxx < 9.9999999499999e-16) {
      if (dxx < 9.9999999499999e-128) {
	if (dxx == 0.0) {
          // exact zero: right-justified "0"
          memset(start, 32, min_width - 1);
	  start[min_width - 1] = '0';
	  return &(start[min_width]);
        } else if (dxx < 9.9999999499999e-256) {
	  dxx *= 1.0e256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e128;
	  xp10 |= 128;
	}
      }
      if (dxx < 9.9999999499999e-64) {
	dxx *= 1.0e64;
	xp10 |= 64;
      }
      if (dxx < 9.9999999499999e-32) {
	dxx *= 1.0e32;
	xp10 |= 32;
      }
      if (dxx < 9.9999999499999e-16) {
	dxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (dxx < 9.9999999499999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9999999499999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9999999499999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9999999499999e-1) {
      dxx *= 10;
      xp10++;
    }
    // dxx now in [1, 10); banker-round to 1 integer + 7 fractional digits.
    double_bround7(dxx, banker_round6, &quotient, &remainder);
    wpos = qrtoa_1p7(quotient, remainder, wpos);
    // remainder is reused below as the significand's byte count
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      // 3-digit exponent: "e-NNN" occupies 5 bytes
      if (remainder + 5 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 5)), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e-", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      // 2-digit exponent: "e-NN" occupies 4 bytes
      if (remainder + 4 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 4)), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e-", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 99999999.499999) {
    // 8 sig fig exponential notation, large
    // Mirror image of the small case: divide down toward [1, 10).
    if (dxx >= 9.9999999499999e15) {
      if (dxx >= 9.9999999499999e127) {
	if (dxx == INFINITY) {
	  // wpos != wbuf iff a '-' was staged above
	  if (min_width > 4) {
	    start = memseta(start, 32, min_width - 4);
	  }
	  if (wpos == wbuf) {
	    return memcpya(start, " inf", 4);
	  } else {
	    return memcpya(start, "-inf", 4);
	  }
	} else if (dxx >= 9.9999999499999e255) {
	  dxx *= 1.0e-256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e-128;
	  xp10 |= 128;
	}
      }
      if (dxx >= 9.9999999499999e63) {
	dxx *= 1.0e-64;
	xp10 |= 64;
      }
      if (dxx >= 9.9999999499999e31) {
	dxx *= 1.0e-32;
	xp10 |= 32;
      }
      if (dxx >= 9.9999999499999e15) {
	dxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (dxx >= 9.9999999499999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9999999499999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9999999499999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9999999499999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    double_bround7(dxx, banker_round6, &quotient, &remainder);
    wpos = qrtoa_1p7(quotient, remainder, wpos);
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      if (remainder + 5 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 5)), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e+", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      if (remainder + 4 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 4)), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e+", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else {
    // fixed-point range
    if (dxx >= 0.99999999499999) {
      wpos = dtoa_so8(dxx, wpos);
    } else {
      // 8 sig fig decimal, no less than ~0.0001
      wpos = memcpya(wpos, "0.", 2);
      if (dxx < 9.9999999499999e-3) {
	dxx *= 100;
	wpos = memcpya(wpos, "00", 2);
      }
      if (dxx < 9.9999999499999e-2) {
	dxx *= 10;
	*wpos++ = '0';
      }
      wpos = uitoa_trunc8(double_bround(dxx * 100000000, banker_round6), wpos);
    }
    remainder = wpos - wbuf;
    if (remainder < min_width) {
      memcpy(memseta(start, 32, min_width - remainder), wbuf, remainder);
      return &(start[min_width]);
    } else {
      return memcpya(start, wbuf, remainder);
    }
  }
}
3356 
char* chrom_print_human(uint32_t num, char* buf) {
  // Render chromosome code num in human-readable form: 0-22 as decimal,
  // 23 -> "X", 24 -> "Y", 25 -> "XY", 26 -> "MT", anything larger as "0"
  // (the --allow-extra-chr 0 convention).  Returns one past the last byte
  // written; no null terminator.
  if (num < 10) {
    buf[0] = '0' + num;
    return &(buf[1]);
  }
  if (num < 23) {
    const uint32_t tens_digit = num / 10;
    buf[0] = '0' + tens_digit;
    buf[1] = '0' + (num - tens_digit * 10);
    return &(buf[2]);
  }
  if (num < 25) {
    // X is the 24th letter of the alphabet and the 23rd chromosome, so
    // 'A' + 23 == 'X' and 'A' + 24 == 'Y'.
    buf[0] = 'A' + num;
    return &(buf[1]);
  }
  if (num > 26) {
    // --allow-extra-chr 0
    buf[0] = '0';
    return &(buf[1]);
  }
  memcpy(buf, (num == 25)? "XY" : "MT", 2);
  return &(buf[2]);
}
3383 
void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp) {
  // Enables fast integer division by a constant not known until runtime.  See
  // http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html .
  // Assumes divisor is not zero, of course.
  // May want to populate a struct instead.
  //
  // Outputs: *multp (magic multiplier), *pre_shiftp / *post_shiftp (shifts
  // applied before/after the multiply), *incrp (1 iff the saturating-
  // increment variant must be used).
  uint32_t down_multiplier = 0;  // best round-down magic seen so far
  uint32_t down_exponent = 0;
  uint32_t has_magic_down = 0;
  uint32_t quotient;
  uint32_t remainder;
  uint32_t ceil_log_2_d;
  uint32_t exponent;
  uint32_t uii;
  if (divisor & (divisor - 1)) {
    // Not a power of 2.  Incrementally compute floor(2^(32+exponent) /
    // divisor) one bit per iteration, looking for the smallest exponent
    // with a usable round-up (or round-down) multiplier.
    quotient = 0x80000000U / divisor;
    remainder = 0x80000000U - (quotient * divisor);
    ceil_log_2_d = 32 - __builtin_clz(divisor);
    for (exponent = 0; ; exponent++) {
      // double quotient/remainder, propagating the carry out of remainder
      if (remainder >= divisor - remainder) {
        quotient = quotient * 2 + 1;
	remainder = remainder * 2 - divisor;
      } else {
	quotient = quotient * 2;
	remainder = remainder * 2;
      }
      // stop once the round-up multiplier's error is small enough, or
      // unconditionally at exponent == ceil(log2(divisor))
      if ((exponent >= ceil_log_2_d) || (divisor - remainder) <= (1U << exponent)) {
	break;
      }
      // record the first viable round-down multiplier as a fallback
      if ((!has_magic_down) && (remainder <= (1U << exponent))) {
	has_magic_down = 1;
	down_multiplier = quotient;
	down_exponent = exponent;
      }
    }
    if (exponent < ceil_log_2_d) {
      // round-up magic number fits in 32 bits
      *multp = quotient + 1;
      *pre_shiftp = 0;
      *post_shiftp = 32 + exponent;
      *incrp = 0;
      return;
    } else if (divisor & 1) {
      // odd divisor: fall back to the round-down multiplier with the
      // saturating-increment trick
      *multp = down_multiplier;
      *pre_shiftp = 0;
      *post_shiftp = 32 + down_exponent;
      *incrp = 1;
      return;
    } else {
      // even divisor: strip the power-of-2 factor into a pre-shift and
      // recurse on the odd part (uii receives the recursive call's unused
      // pre-shift, which is always 0 there)
      *pre_shiftp = __builtin_ctz(divisor);
      magic_num(divisor >> (*pre_shiftp), multp, &uii, post_shiftp, incrp);
      return;
    }
  } else {
    // power of 2: a plain shift suffices
    *multp = 1;
    *pre_shiftp = 0;
    *post_shiftp = __builtin_ctz(divisor);
    *incrp = 0;
  }
}
3443 
fill_bits(uintptr_t loc_start,uintptr_t len,uintptr_t * bitarr)3444 void fill_bits(uintptr_t loc_start, uintptr_t len, uintptr_t* bitarr) {
3445   assert(len);
3446   uintptr_t maj_start = loc_start / BITCT;
3447   uintptr_t maj_end = (loc_start + len) / BITCT;
3448   uintptr_t minor;
3449   if (maj_start == maj_end) {
3450     bitarr[maj_start] |= (ONELU << ((loc_start + len) % BITCT)) - (ONELU << (loc_start % BITCT));
3451   } else {
3452     bitarr[maj_start] |= ~((ONELU << (loc_start % BITCT)) - ONELU);
3453     fill_ulong_one(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
3454     minor = (loc_start + len) % BITCT;
3455     if (minor) {
3456       bitarr[maj_end] |= (ONELU << minor) - ONELU;
3457     }
3458   }
3459 }
3460 
clear_bits(uintptr_t loc_start,uintptr_t len,uintptr_t * bitarr)3461 void clear_bits(uintptr_t loc_start, uintptr_t len, uintptr_t* bitarr) {
3462   assert(len);
3463   uintptr_t maj_start = loc_start / BITCT;
3464   uintptr_t maj_end = (loc_start + len) / BITCT;
3465   uintptr_t minor;
3466   if (maj_start == maj_end) {
3467     bitarr[maj_start] &= ~((ONELU << ((loc_start + len) % BITCT)) - (ONELU << (loc_start % BITCT)));
3468   } else {
3469     bitarr[maj_start] &= ((ONELU << (loc_start % BITCT)) - ONELU);
3470     fill_ulong_zero(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
3471     minor = (loc_start + len) % BITCT;
3472     if (minor) {
3473       bitarr[maj_end] &= ~((ONELU << minor) - ONELU);
3474     }
3475   }
3476 }
3477 
next_unset_unsafe(const uintptr_t * bitarr,uint32_t loc)3478 uint32_t next_unset_unsafe(const uintptr_t* bitarr, uint32_t loc) {
3479   const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3480   uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % BITCT);
3481   if (ulii) {
3482     return loc + CTZLU(ulii);
3483   }
3484   do {
3485     ulii = *(++bitarr_ptr);
3486   } while (ulii == ~ZEROLU);
3487   return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(~ulii);
3488 }
3489 
3490 #ifdef __LP64__
next_unset_ul_unsafe(const uintptr_t * bitarr,uintptr_t loc)3491 uintptr_t next_unset_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
3492   const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3493   uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % BITCT);
3494   if (ulii) {
3495     return loc + CTZLU(ulii);
3496   }
3497   do {
3498     ulii = *(++bitarr_ptr);
3499   } while (ulii == ~ZEROLU);
3500   return (((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(~ulii));
3501 }
3502 #endif
3503 
next_unset(const uintptr_t * bitarr,uint32_t loc,uint32_t ceil)3504 uint32_t next_unset(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) {
3505   // safe version.
3506   assert(ceil >= 1);
3507   const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3508   uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % BITCT);
3509   const uintptr_t* bitarr_last;
3510   if (ulii) {
3511     loc += CTZLU(ulii);
3512     return MINV(loc, ceil);
3513   }
3514   bitarr_last = &(bitarr[(ceil - 1) / BITCT]);
3515   do {
3516     if (bitarr_ptr >= bitarr_last) {
3517       return ceil;
3518     }
3519     ulii = *(++bitarr_ptr);
3520   } while (ulii == ~ZEROLU);
3521   loc = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(~ulii);
3522   return MINV(loc, ceil);
3523 }
3524 
3525 #ifdef __LP64__
next_unset_ul(const uintptr_t * bitarr,uintptr_t loc,uintptr_t ceil)3526 uintptr_t next_unset_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) {
3527   const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3528   uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % BITCT);
3529   const uintptr_t* bitarr_last;
3530   if (ulii) {
3531     ulii = loc + CTZLU(ulii);
3532     return MINV(ulii, ceil);
3533   }
3534   bitarr_last = &(bitarr[(ceil - 1) / BITCT]);
3535   do {
3536     if (bitarr_ptr >= bitarr_last) {
3537       return ceil;
3538     }
3539     ulii = *(++bitarr_ptr);
3540   } while (ulii == ~ZEROLU);
3541   ulii = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(~ulii);
3542   return MINV(ulii, ceil);
3543 }
3544 #endif
3545 
next_set_unsafe(const uintptr_t * bitarr,uint32_t loc)3546 uint32_t next_set_unsafe(const uintptr_t* bitarr, uint32_t loc) {
3547   const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3548   uintptr_t ulii = (*bitarr_ptr) >> (loc % BITCT);
3549   if (ulii) {
3550     return loc + CTZLU(ulii);
3551   }
3552   do {
3553     ulii = *(++bitarr_ptr);
3554   } while (!ulii);
3555   return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(ulii);
3556 }
3557 
3558 #ifdef __LP64__
next_set_ul_unsafe(const uintptr_t * bitarr,uintptr_t loc)3559 uintptr_t next_set_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
3560   const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3561   uintptr_t ulii = (*bitarr_ptr) >> (loc % BITCT);
3562   if (ulii) {
3563     return loc + CTZLU(ulii);
3564   }
3565   do {
3566     ulii = *(++bitarr_ptr);
3567   } while (!ulii);
3568   return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(ulii);
3569 }
3570 #endif
3571 
next_set(const uintptr_t * bitarr,uint32_t loc,uint32_t ceil)3572 uint32_t next_set(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) {
3573   const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3574   uintptr_t ulii = (*bitarr_ptr) >> (loc % BITCT);
3575   const uintptr_t* bitarr_last;
3576   uint32_t rval;
3577   if (ulii) {
3578     rval = loc + CTZLU(ulii);
3579     return MINV(rval, ceil);
3580   }
3581   bitarr_last = &(bitarr[(ceil - 1) / BITCT]);
3582   do {
3583     if (bitarr_ptr >= bitarr_last) {
3584       return ceil;
3585     }
3586     ulii = *(++bitarr_ptr);
3587   } while (!ulii);
3588   rval = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(ulii);
3589   return MINV(rval, ceil);
3590 }
3591 
3592 #ifdef __LP64__
next_set_ul(const uintptr_t * bitarr,uintptr_t loc,uintptr_t ceil)3593 uintptr_t next_set_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) {
3594   const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3595   uintptr_t ulii = (*bitarr_ptr) >> (loc % BITCT);
3596   const uintptr_t* bitarr_last;
3597   if (ulii) {
3598     ulii = loc + CTZLU(ulii);
3599     return MINV(ulii, ceil);
3600   }
3601   bitarr_last = &(bitarr[(ceil - 1) / BITCT]);
3602   do {
3603     if (bitarr_ptr >= bitarr_last) {
3604       return ceil;
3605     }
3606     ulii = *(++bitarr_ptr);
3607   } while (!ulii);
3608   ulii = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + CTZLU(ulii);
3609   return MINV(ulii, ceil);
3610 }
3611 #endif
3612 
last_set_bit(const uintptr_t * bitarr,uint32_t word_ct)3613 int32_t last_set_bit(const uintptr_t* bitarr, uint32_t word_ct) {
3614   const uintptr_t* bitarr_ptr = &(bitarr[word_ct]);
3615   uintptr_t ulii;
3616   do {
3617     ulii = *(--bitarr_ptr);
3618     if (ulii) {
3619       return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
3620     }
3621   } while (bitarr_ptr > bitarr);
3622   return -1;
3623 }
3624 
last_clear_bit(const uintptr_t * bitarr,uint32_t ceil)3625 int32_t last_clear_bit(const uintptr_t* bitarr, uint32_t ceil) {
3626   // can return ceil or any lower number
3627   const uintptr_t* bitarr_ptr = &(bitarr[ceil / BITCT]);
3628   uint32_t remainder = ceil % BITCT;
3629   uintptr_t ulii;
3630   if (remainder) {
3631     ulii = (~(*bitarr_ptr)) & ((ONELU << remainder) - ONELU);
3632     if (ulii) {
3633       return (ceil | (BITCT - 1)) - CLZLU(ulii);
3634     }
3635   }
3636   while (bitarr_ptr > bitarr) {
3637     ulii = ~(*(--bitarr_ptr));
3638     if (ulii) {
3639       return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
3640     }
3641   }
3642   return -1;
3643 }
3644 
prev_unset_unsafe(const uintptr_t * bitarr,uint32_t loc)3645 uint32_t prev_unset_unsafe(const uintptr_t* bitarr, uint32_t loc) {
3646   // unlike the next_{un}set family, this always returns a STRICTLY earlier
3647   // position
3648   const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3649   uint32_t remainder = loc % BITCT;
3650   uintptr_t ulii;
3651   if (remainder) {
3652     ulii = (~(*bitarr_ptr)) & ((ONELU << remainder) - ONELU);
3653     if (ulii) {
3654       return (loc | (BITCT - 1)) - CLZLU(ulii);
3655     }
3656   }
3657   do {
3658     ulii = ~(*(--bitarr_ptr));
3659   } while (!ulii);
3660   return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
3661 }
3662 
3663 /*
3664 uint32_t prev_unset(uintptr_t* bitarr, uint32_t loc, uint32_t floor) {
3665   uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3666   uint32_t remainder = loc % BITCT;
3667   uintptr_t* bitarr_first;
3668   uintptr_t ulii;
3669   if (remainder) {
3670     ulii = (~(*bitarr_ptr)) & ((ONELU << remainder) - ONELU);
3671     if (ulii) {
3672       loc = (loc | (BITCT - 1)) - CLZLU(ulii);
3673       return MAXV(loc, floor);
3674     }
3675   }
3676   bitarr_first = &(bitarr[floor / BITCT]);
3677   do {
3678     if (bitarr_ptr == bitarr_first) {
3679       return floor;
3680     }
3681     ulii = ~(*(--bitarr_ptr));
3682   } while (!ulii);
3683   loc = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
3684   return MAXV(loc, floor);
3685 }
3686 */
3687 
3688 
int32_t bigstack_calloc_uc(uintptr_t ct, unsigned char** ucp_ptr) {
  // Allocate ct bytes from the main bigstack arena, zero them, and store the
  // pointer in *ucp_ptr.  Returns 1 on failure (*ucp_ptr left NULL), else 0.
  unsigned char* new_alloc = (unsigned char*)bigstack_alloc(ct);
  *ucp_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  memset(new_alloc, 0, ct);
  return 0;
}
3697 
int32_t bigstack_calloc_d(uintptr_t ct, double** dp_ptr) {
  // Allocate ct zero-initialized doubles from the main bigstack arena.
  // Returns 1 on failure (*dp_ptr left NULL), else 0.
  double* new_alloc = (double*)bigstack_alloc(ct * sizeof(double));
  *dp_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  fill_double_zero(ct, new_alloc);
  return 0;
}
3706 
int32_t bigstack_calloc_f(uintptr_t ct, float** fp_ptr) {
  // Allocate ct zero-initialized floats from the main bigstack arena.
  // Returns 1 on failure (*fp_ptr left NULL), else 0.
  float* new_alloc = (float*)bigstack_alloc(ct * sizeof(float));
  *fp_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  fill_float_zero(ct, new_alloc);
  return 0;
}
3715 
int32_t bigstack_calloc_ui(uintptr_t ct, uint32_t** uip_ptr) {
  // Allocate ct zero-initialized uint32_ts from the main bigstack arena.
  // Returns 1 on failure (*uip_ptr left NULL), else 0.
  uint32_t* new_alloc = (uint32_t*)bigstack_alloc(ct * sizeof(int32_t));
  *uip_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  fill_uint_zero(ct, new_alloc);
  return 0;
}
3724 
int32_t bigstack_calloc_ul(uintptr_t ct, uintptr_t** ulp_ptr) {
  // Allocate ct zero-initialized machine words from the main bigstack arena.
  // Returns 1 on failure (*ulp_ptr left NULL), else 0.
  uintptr_t* new_alloc = (uintptr_t*)bigstack_alloc(ct * sizeof(intptr_t));
  *ulp_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  fill_ulong_zero(ct, new_alloc);
  return 0;
}
3733 
int32_t bigstack_calloc_ull(uintptr_t ct, uint64_t** ullp_ptr) {
  // Allocate ct zero-initialized uint64_ts from the main bigstack arena.
  // Returns 1 on failure (*ullp_ptr left NULL), else 0.
  uint64_t* new_alloc = (uint64_t*)bigstack_alloc(ct * sizeof(int64_t));
  *ullp_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  fill_ull_zero(ct, new_alloc);
  return 0;
}
3742 
int32_t bigstack_end_calloc_uc(uintptr_t ct, unsigned char** ucp_ptr) {
  // Allocate ct bytes from the END of the bigstack arena, zero them, and
  // store the pointer in *ucp_ptr.  Returns 1 on failure (*ucp_ptr left
  // NULL), else 0.
  unsigned char* new_alloc = (unsigned char*)bigstack_end_alloc(ct);
  *ucp_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  memset(new_alloc, 0, ct);
  return 0;
}
3751 
int32_t bigstack_end_calloc_d(uintptr_t ct, double** dp_ptr) {
  // Allocate ct zero-initialized doubles from the END of the bigstack arena.
  // Returns 1 on failure (*dp_ptr left NULL), else 0.
  double* new_alloc = (double*)bigstack_end_alloc(ct * sizeof(double));
  *dp_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  fill_double_zero(ct, new_alloc);
  return 0;
}
3760 
int32_t bigstack_end_calloc_f(uintptr_t ct, float** fp_ptr) {
  // Allocate ct zero-initialized floats from the END of the bigstack arena.
  // Returns 1 on failure (*fp_ptr left NULL), else 0.
  float* new_alloc = (float*)bigstack_end_alloc(ct * sizeof(float));
  *fp_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  fill_float_zero(ct, new_alloc);
  return 0;
}
3769 
int32_t bigstack_end_calloc_ui(uintptr_t ct, uint32_t** uip_ptr) {
  // Allocate ct zero-initialized uint32_ts from the END of the bigstack
  // arena.  Returns 1 on failure (*uip_ptr left NULL), else 0.
  uint32_t* new_alloc = (uint32_t*)bigstack_end_alloc(ct * sizeof(int32_t));
  *uip_ptr = new_alloc;
  if (!new_alloc) {
    return 1;
  }
  fill_uint_zero(ct, new_alloc);
  return 0;
}
3778 
int32_t bigstack_end_calloc_ul(uintptr_t ct, uintptr_t** ulp_ptr) {
  // Allocates space for ct uintptr_t values at the end of the bigstack arena
  // and zero-fills it.  Returns 0 on success, 1 on out-of-memory.
  uintptr_t* new_arr = (uintptr_t*)bigstack_end_alloc(ct * sizeof(intptr_t));
  *ulp_ptr = new_arr;
  if (!new_arr) {
    return 1;
  }
  fill_ulong_zero(ct, new_arr);
  return 0;
}
3787 
int32_t bigstack_end_calloc_ull(uintptr_t ct, uint64_t** ullp_ptr) {
  // Allocates space for ct uint64_t values at the end of the bigstack arena
  // and zero-fills it.  Returns 0 on success, 1 on out-of-memory.
  uint64_t* new_arr = (uint64_t*)bigstack_end_alloc(ct * sizeof(int64_t));
  *ullp_ptr = new_arr;
  if (!new_arr) {
    return 1;
  }
  fill_ull_zero(ct, new_arr);
  return 0;
}
3796 
3797 
3798 // MurmurHash3, from
3799 // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
3800 
static inline uint32_t rotl32(uint32_t x, int8_t r) {
  // Rotates x left by r bits.  The shift counts are masked so that r == 0
  // (or 32) no longer triggers a shift-by-32, which is undefined behavior in
  // C; results for the 1..31 range used by the MurmurHash3 code below are
  // unchanged.
  return (x << (r & 31)) | (x >> ((32 - r) & 31));
}
3804 
static inline uint32_t getblock32(const uint32_t* p, int i) {
  // MurmurHash3 block read; i may be negative (the caller indexes backward
  // from an end-of-body pointer).
  return *(p + i);
}
3808 
3809 //-----------------------------------------------------------------------------
3810 // Finalization mix - force all bits of a hash block to avalanche
3811 
static inline uint32_t fmix32(uint32_t h) {
  // MurmurHash3 finalization mix: the same xorshift/multiply sequence as the
  // reference implementation, written as fused expressions.
  h = (h ^ (h >> 16)) * 0x85ebca6b;
  h = (h ^ (h >> 13)) * 0xc2b2ae35;
  return h ^ (h >> 16);
}
3821 
uint32_t murmurhash3_32(const void* key, uint32_t len) {
  // MurmurHash3_x86_32 with a fixed seed of 0, adapted from
  // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
  // Returns a 32-bit hash of the first len bytes at key.
  //
  // Bugfix vs. the direct port: 4-byte blocks are now loaded with memcpy
  // instead of dereferencing a casted uint32_t pointer.  Keys here are
  // arbitrary strings with no alignment guarantee, so the old read was
  // undefined behavior (unaligned access / strict aliasing); compilers emit
  // a single 32-bit load for the memcpy, and results are identical.
  const uint8_t* data = (const uint8_t*)key;
  const uint32_t nblocks = len / 4;

  uint32_t h1 = 0;
  // uint32_t h1 = seed;

  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  //----------
  // body
  uint32_t k1;
  uint32_t block_idx;
  for (block_idx = 0; block_idx < nblocks; block_idx++) {
    memcpy(&k1, &(data[block_idx * 4]), sizeof(k1));

    k1 *= c1;
    k1 = (k1 << 15) | (k1 >> 17); // rotl32(k1, 15)
    k1 *= c2;

    h1 ^= k1;
    h1 = (h1 << 13) | (h1 >> 19); // rotl32(h1, 13)
    h1 = h1 * 5 + 0xe6546b64;
  }

  //----------
  // tail: 0-3 leftover bytes
  const uint8_t* tail = &(data[nblocks * 4]);

  k1 = 0;

  switch (len & 3) {
    case 3:
      k1 ^= ((uint32_t)tail[2]) << 16;
      // fall through
    case 2:
      k1 ^= ((uint32_t)tail[1]) << 8;
      // fall through
    case 1:
      k1 ^= tail[0];
      k1 *= c1;
      k1 = (k1 << 15) | (k1 >> 17);
      k1 *= c2;
      h1 ^= k1;
  }

  //----------
  // finalization (identical operation sequence to fmix32())
  h1 ^= len;
  h1 ^= h1 >> 16;
  h1 *= 0x85ebca6b;
  h1 ^= h1 >> 13;
  h1 *= 0xc2b2ae35;
  h1 ^= h1 >> 16;
  return h1;
}
3880 
uint32_t is_composite6(uintptr_t num) {
  // Trial-division compositeness test over divisors congruent to 1 or 5 mod
  // 6 (5, 7, 11, 13, 17, ...).  Assumes num itself is congruent to 1 or 5
  // mod 6, so factors of 2 and 3 need not be considered.  Returns 1 if num
  // is composite, 0 otherwise.
  // Can be sped up ~50% by hardcoding avoidance of multiples of 5/7, but
  // this isn't currently a bottleneck so it's kept simple.
  uintptr_t trial = 5;
  uintptr_t step = 2;
  while (trial * trial <= num) {
    if (!(num % trial)) {
      return 1;
    }
    trial += step;
    step = 6 - step; // alternate +2, +4, +2, +4, ...
  }
  return 0;
}
3898 
uintptr_t geqprime(uintptr_t floor) {
  // assumes floor is odd and greater than 1.  Returns 5 if floor = 3,
  // otherwise returns the first prime >= floor.
  //
  // Primes > 3 are congruent to 1 or 5 mod 6.  This walks candidates in
  // (5 mod 6, 1 mod 6) pairs; the goto enters mid-loop so a floor that is
  // already 1 mod 6 is tested first without a wasted is_composite6() call.
  uintptr_t ulii = floor % 3;
  if (!ulii) {
    // floor divisible by 3: first candidate is floor + 2 (5 mod 6)
    floor += 2;
  } else if (ulii == 1) {
    // floor is odd and 1 mod 3, hence 1 mod 6: test it immediately
    goto geqprime_1mod6;
  }
  while (is_composite6(floor)) {
    // floor was 5 mod 6 and composite; advance to the 1-mod-6 partner
    floor += 2;
  geqprime_1mod6:
    if (!is_composite6(floor)) {
      return floor;
    }
    // advance to the next 5-mod-6 candidate
    floor += 4;
  }
  return floor;
}
3918 
int32_t populate_id_htable(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t store_dups, uint32_t id_htable_size, uint32_t* id_htable) {
  // Builds an open-addressing hash table (quadratic probing) mapping the
  // item_ct non-excluded null-terminated IDs in item_ids (stride max_id_len)
  // to their unfiltered indexes.
  //
  // While unique IDs are normally assumed (and enforced) here, --extract and
  // --exclude are an exception, since we want to be able to e.g. exclude all
  // variants named '.'.  Since there could be millions of them, ordinary
  // O(n^2) hash table duplicate resolution is unacceptably slow; instead, we
  // allocate additional linked lists past the end of id_htable to track all
  // unfiltered indexes of duplicate names.  (This requires the
  // alloc_and_populate_id_htable interface; bigstack_end_alloc doesn't work
  // there.)
  //
  // Entry encoding in store_dups mode: high bit clear = plain unfiltered
  // index; high bit set = low 31 bits are half the uint32 offset of a
  // linked-list node in extra_alloc_base.  Each node is a
  // {unfiltered index, offset of previous node or 0xffffffffU} pair.
  uintptr_t item_uidx = 0;
  uint32_t extra_alloc = 0; // uint32s consumed so far by duplicate lists
  uint32_t prev_llidx = 0;
  // needs to be synced with extract_exclude_flag_norange()
  uint32_t* extra_alloc_base = (uint32_t*)g_bigstack_base;
  uint32_t item_idx = 0;
  const char* sptr;
  uintptr_t prev_uidx;
  uintptr_t cur_bigstack_left;
  uint32_t max_extra_alloc;
  uint32_t slen;
  uint32_t hashval;
  uint32_t next_incr;
  uint32_t top_diff;
  uint32_t hash_result;
  uint32_t cur_dup;
  // 0xffffffffU marks an empty bucket
  fill_uint_one(id_htable_size, id_htable);
  if (!store_dups) {
    for (; item_idx < item_ct; item_uidx++, item_idx++) {
      next_unset_ul_unsafe_ck(exclude_arr, &item_uidx);
      sptr = &(item_ids[item_uidx * max_id_len]);
      slen = strlen(sptr);
      hashval = murmurhash3_32(sptr, slen) % id_htable_size;
      next_incr = 1;
      while (1) {
        hash_result = id_htable[hashval];
        if (hash_result == 0xffffffffU) {
          id_htable[hashval] = item_uidx;
          break;
        } else if (!memcmp(sptr, &(item_ids[hash_result * max_id_len]), slen + 1)) {
          // could add an allow_dups parameter which controls whether this is
          // an error
          LOGERRPRINTFWW("Error: Duplicate ID '%s'.\n", sptr);
          return RET_INVALID_FORMAT;
        }
        // defend against overflow
        top_diff = id_htable_size - hashval;
        if (top_diff > next_incr) {
          hashval += next_incr;
        } else {
          hashval = next_incr - top_diff;
        }
        next_incr += 2; // quadratic probing
      }
    }
  } else {
    // cap the duplicate-list region at the available bigstack space (and at
    // the 32-bit offset encoding limit on 64-bit builds)
    cur_bigstack_left = bigstack_left();
#ifdef __LP64__
    if (cur_bigstack_left >= 0x400000000LLU) {
      max_extra_alloc = 0xfffffffeU;
    } else {
      max_extra_alloc = cur_bigstack_left / sizeof(int32_t);
    }
#else
    max_extra_alloc = cur_bigstack_left / sizeof(int32_t);
#endif
    for (; item_idx < item_ct; item_uidx++, item_idx++) {
      next_unset_ul_unsafe_ck(exclude_arr, &item_uidx);
      sptr = &(item_ids[item_uidx * max_id_len]);
      slen = strlen(sptr);
      hashval = murmurhash3_32(sptr, slen) % id_htable_size;
      next_incr = 1;
      while (1) {
        hash_result = id_htable[hashval];
        if (hash_result == 0xffffffffU) {
          id_htable[hashval] = item_uidx;
          break;
        } else {
          // occupied bucket: compare against the stored ID (the list head's
          // uidx when this bucket already tracks duplicates)
          cur_dup = hash_result >> 31;
          if (cur_dup) {
            prev_llidx = hash_result << 1;
            prev_uidx = extra_alloc_base[prev_llidx];
          } else {
            prev_uidx = hash_result;
          }
          if (!memcmp(sptr, &(item_ids[prev_uidx * max_id_len]), slen + 1)) {
            // duplicate name: may need up to 4 more uint32s (one node to
            // convert a singleton into a list, plus the new node)
            if (extra_alloc + 4 > max_extra_alloc) {
              return RET_NOMEM;
            }
            // point to linked list entry instead
            if (!cur_dup) {
              extra_alloc_base[extra_alloc] = hash_result;
              extra_alloc_base[extra_alloc + 1] = 0xffffffffU; // list end
              prev_llidx = extra_alloc;
              extra_alloc += 2;
            }
            // prepend the new node and store its encoded offset
            extra_alloc_base[extra_alloc] = item_uidx;
            extra_alloc_base[extra_alloc + 1] = prev_llidx;
            id_htable[hashval] = 0x80000000U | (extra_alloc >> 1);
            extra_alloc += 2;
            break; // bugfix
          }
        }
        top_diff = id_htable_size - hashval;
        if (top_diff > next_incr) {
          hashval += next_incr;
        } else {
          hashval = next_incr - top_diff;
        }
        next_incr += 2;
      }
    }
    if (extra_alloc) {
      // formally claim the linked-list region so later allocations don't
      // clobber it
      bigstack_alloc(extra_alloc * sizeof(int32_t));
    }
  }
  return 0;
}
4036 
uint32_t id_htable_find(const char* id_buf, uintptr_t cur_id_len, const uint32_t* id_htable, uint32_t id_htable_size, const char* item_ids, uintptr_t max_id_len) {
  // Looks up the cur_id_len-character ID at id_buf in a hash table built over
  // item_ids (stride max_id_len).  Assumes no duplicate entries and nonzero
  // id_htable_size.  Returns the stored item index, or 0xffffffffU if the ID
  // is absent.
  if (cur_id_len >= max_id_len) {
    return 0xffffffffU;
  }
  uint32_t bucket = murmurhash3_32(id_buf, cur_id_len) % id_htable_size;
  uint32_t probe_incr = 1;
  for (;;) {
    const uint32_t entry = id_htable[bucket];
    if (entry == 0xffffffffU) {
      // empty bucket: not present
      return 0xffffffffU;
    }
    const char* candidate = &(item_ids[entry * max_id_len]);
    if ((!memcmp(id_buf, candidate, cur_id_len)) && (!candidate[cur_id_len])) {
      return entry;
    }
    // quadratic probing; the subtraction form wraps around without uint32
    // overflow
    const uint32_t wrap_distance = id_htable_size - bucket;
    if (wrap_distance > probe_incr) {
      bucket += probe_incr;
    } else {
      bucket = probe_incr - wrap_distance;
    }
    probe_incr += 2;
  }
}
4066 
void fill_idx_to_uidx(const uintptr_t* exclude_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx) {
  // Writes the unfiltered index of each of the item_ct non-excluded items,
  // in order, into idx_to_uidx[].
  uint32_t* write_iter = idx_to_uidx;
  uint32_t* write_end = &(idx_to_uidx[item_ct]);
  uint32_t cur_uidx = 0;
  while (write_iter < write_end) {
    // handle one run of consecutive non-excluded items per iteration
    cur_uidx = next_unset_unsafe(exclude_arr, cur_uidx);
    const uint32_t run_end = next_set(exclude_arr, cur_uidx, unfiltered_item_ct);
    do {
      *write_iter++ = cur_uidx++;
    } while (cur_uidx < run_end);
  }
}
4079 
void fill_idx_to_uidx_incl(const uintptr_t* include_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx) {
  // Inclusion-mask variant of fill_idx_to_uidx(): writes the unfiltered
  // index of each of the item_ct included items, in order, into
  // idx_to_uidx[].
  uint32_t* write_iter = idx_to_uidx;
  uint32_t* write_end = &(idx_to_uidx[item_ct]);
  uint32_t cur_uidx = 0;
  while (write_iter < write_end) {
    // handle one run of consecutive included items per iteration
    cur_uidx = next_set_unsafe(include_arr, cur_uidx);
    const uint32_t run_end = next_unset(include_arr, cur_uidx, unfiltered_item_ct);
    do {
      *write_iter++ = cur_uidx++;
    } while (cur_uidx < run_end);
  }
}
4092 
void fill_uidx_to_idx(const uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
  // Inverse of fill_idx_to_uidx(): for every non-excluded unfiltered index,
  // stores the corresponding filtered index in uidx_to_idx[].  Slots for
  // excluded indexes are left untouched.
  uint32_t cur_uidx = 0;
  uint32_t cur_idx = 0;
  while (cur_idx < item_ct) {
    cur_uidx = next_unset_unsafe(exclude_arr, cur_uidx);
    uint32_t* write_iter = &(uidx_to_idx[cur_uidx]);
    cur_uidx = next_set(exclude_arr, cur_uidx, unfiltered_item_ct);
    uint32_t* write_stop = &(uidx_to_idx[cur_uidx]);
    do {
      *write_iter++ = cur_idx++;
    } while (write_iter < write_stop);
  }
}
4108 
void fill_uidx_to_idx_incl(const uintptr_t* include_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
  // Inclusion-mask variant of fill_uidx_to_idx(): for every included
  // unfiltered index, stores the corresponding filtered index in
  // uidx_to_idx[].  Slots for non-included indexes are left untouched.
  uint32_t cur_uidx = 0;
  uint32_t cur_idx = 0;
  while (cur_idx < item_ct) {
    cur_uidx = next_set_unsafe(include_arr, cur_uidx);
    uint32_t* write_iter = &(uidx_to_idx[cur_uidx]);
    cur_uidx = next_unset(include_arr, cur_uidx, unfiltered_item_ct);
    uint32_t* write_stop = &(uidx_to_idx[cur_uidx]);
    do {
      *write_iter++ = cur_idx++;
    } while (write_iter < write_stop);
  }
}
4124 
void fill_midx_to_idx(const uintptr_t* exclude_arr_orig, const uintptr_t* exclude_arr, uint32_t item_ct, uint32_t* midx_to_idx) {
  // Assumes item_ct is nonzero.
  //
  // Maps "middle" indexes (positions among items surviving the first filter,
  // exclude_arr_orig) to final filtered indexes (items also surviving
  // exclude_arr).  Slots for items removed by the second filter are left
  // untouched.
  //
  // May want to switch to alternate behavior: when current midx is excluded,
  // fill midx_to_idx[] with the next item_idx.
  uint32_t cur_uidx = next_unset_unsafe(exclude_arr_orig, 0);
  uint32_t cur_idx = 0;
  for (uint32_t cur_midx = 0; cur_idx < item_ct; cur_uidx++, cur_midx++) {
    next_unset_unsafe_ck(exclude_arr_orig, &cur_uidx);
    if (!IS_SET(exclude_arr, cur_uidx)) {
      midx_to_idx[cur_midx] = cur_idx++;
    }
  }
}
4140 
void fill_quatervec_55(uint32_t ct, uintptr_t* quatervec) {
  // Sets the first ct 2-bit entries of quatervec to binary 01 (the
  // 0x5555... pattern) and zeroes any remaining entries of the final 128-bit
  // vector.  quatervec is assumed to be 16-byte aligned with capacity
  // rounded up to a full vector.
  uint32_t rem = ct & (BITCT - 1); // entries beyond the last full vector
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i* vecp = (__m128i*)quatervec;
  __m128i* vec_end = (__m128i*)(&(quatervec[2 * (ct / BITCT)]));
  uintptr_t* second_to_last;
  // fill all complete vectors (each holds BITCT 2-bit entries)
  while (vecp < vec_end) {
    *vecp++ = m1;
  }
  if (rem) {
    second_to_last = (uintptr_t*)vecp;
    if (rem > BITCT2) {
      // leftover entries span both words of the final vector
      second_to_last[0] = FIVEMASK;
      second_to_last[1] = FIVEMASK >> ((BITCT - rem) * 2);
    } else {
      // leftover entries fit in the first word; zero the second
      second_to_last[0] = FIVEMASK >> ((BITCT2 - rem) * 2);
      second_to_last[1] = 0;
    }
  }
#else
  uintptr_t* vec_end = &(quatervec[2 * (ct / BITCT)]);
  // fill all complete 2-word groups
  while (quatervec < vec_end) {
    *quatervec++ = FIVEMASK;
  }
  if (rem) {
    if (rem > BITCT2) {
      quatervec[0] = FIVEMASK;
      quatervec[1] = FIVEMASK >> ((BITCT - rem) * 2);
    } else {
      quatervec[0] = FIVEMASK >> ((BITCT2 - rem) * 2);
      quatervec[1] = 0;
    }
  }
#endif
}
4177 
void quaterarr_collapse_init(const uintptr_t* __restrict unfiltered_bitarr, uint32_t unfiltered_ct, const uintptr_t* __restrict filter_bitarr, uint32_t filtered_ct, uintptr_t* __restrict output_quaterarr) {
  // Used to unpack e.g. unfiltered sex_male to a filtered quaterarr usable as
  // a raw input bitmask.
  // Assumes output_quaterarr is sized to a multiple of 16 bytes.
  //
  // For each set bit of filter_bitarr, in increasing index order, the
  // corresponding bit of unfiltered_bitarr is appended to output_quaterarr
  // as a 2-bit entry (00 or 01).
  uintptr_t cur_write = 0;  // 2-bit entries accumulated for the current word
  uint32_t item_uidx = 0;
  uint32_t write_bit = 0;   // next entry slot within cur_write
  uint32_t item_idx = 0;
  uint32_t item_uidx_stop;
  while (item_idx < filtered_ct) {
    // process one run of consecutive set filter bits per iteration
    item_uidx = next_set_unsafe(filter_bitarr, item_uidx);
    item_uidx_stop = next_unset(filter_bitarr, item_uidx, unfiltered_ct);
    item_idx += item_uidx_stop - item_uidx;
    do {
      cur_write |= ((unfiltered_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << (write_bit * 2);
      if (++write_bit == BITCT2) {
	*output_quaterarr++ = cur_write;
        cur_write = 0;
	write_bit = 0;
      }
    } while (++item_uidx < item_uidx_stop);
  }
  if (write_bit) {
    // flush the final partial word
    *output_quaterarr++ = cur_write;
  }
  // this expression is nonzero exactly when an odd number of words was
  // written (it tests the low bit of ceil(filtered_ct / BITCT2)); zero one
  // more word to complete the last 16-byte vector
  if ((filtered_ct + (BITCT2 - 1)) & BITCT2) {
    *output_quaterarr = 0;
  }
}
4207 
void quaterarr_collapse_init_exclude(const uintptr_t* __restrict unfiltered_bitarr, uint32_t unfiltered_ct, const uintptr_t* __restrict filter_exclude_bitarr, uint32_t filtered_ct, uintptr_t* __restrict output_quaterarr) {
  // Exclusion-mask counterpart of quaterarr_collapse_init(): for each CLEAR
  // bit of filter_exclude_bitarr, in increasing index order, appends the
  // corresponding bit of unfiltered_bitarr to output_quaterarr as a 2-bit
  // entry.  Assumes output_quaterarr is sized to a multiple of 16 bytes.
  uintptr_t cur_write = 0;  // 2-bit entries accumulated for the current word
  uint32_t item_uidx = 0;
  uint32_t write_bit = 0;   // next entry slot within cur_write
  uint32_t item_idx = 0;
  uint32_t item_uidx_stop;
  while (item_idx < filtered_ct) {
    // process one run of consecutive non-excluded items per iteration
    item_uidx = next_unset_unsafe(filter_exclude_bitarr, item_uidx);
    item_uidx_stop = next_set(filter_exclude_bitarr, item_uidx, unfiltered_ct);
    item_idx += item_uidx_stop - item_uidx;
    do {
      cur_write |= ((unfiltered_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << (write_bit * 2);
      if (++write_bit == BITCT2) {
	*output_quaterarr++ = cur_write;
        cur_write = 0;
	write_bit = 0;
      }
    } while (++item_uidx < item_uidx_stop);
  }
  if (write_bit) {
    // flush the final partial word
    *output_quaterarr++ = cur_write;
  }
  // nonzero exactly when an odd number of words was written; zero one more
  // word to complete the last 16-byte vector
  if ((filtered_ct + (BITCT2 - 1)) & BITCT2) {
    *output_quaterarr = 0;
  }
}
4234 
alloc_collapsed_haploid_filters(const uintptr_t * __restrict sample_bitarr,const uintptr_t * __restrict sex_male,uint32_t unfiltered_sample_ct,uint32_t sample_ct,uint32_t hh_exists,uint32_t is_include,uintptr_t ** sample_include_quatervec_ptr,uintptr_t ** sample_male_include_quatervec_ptr)4235 uint32_t alloc_collapsed_haploid_filters(const uintptr_t* __restrict sample_bitarr, const uintptr_t* __restrict sex_male, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t** sample_include_quatervec_ptr, uintptr_t** sample_male_include_quatervec_ptr) {
4236   uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
4237   if (hh_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
4238     // if already allocated, we assume this is fully initialized
4239     if (!(*sample_include_quatervec_ptr)) {
4240       if (bigstack_alloc_ul(sample_ctv2, sample_include_quatervec_ptr)) {
4241 	return 1;
4242       }
4243       fill_quatervec_55(sample_ct, *sample_include_quatervec_ptr);
4244     }
4245   }
4246   if (hh_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
4247     // if already allocated, we assume it's been bigstack_end_alloc'd but not
4248     // initialized
4249     if (!(*sample_male_include_quatervec_ptr)) {
4250       if (bigstack_alloc_ul(sample_ctv2, sample_male_include_quatervec_ptr)) {
4251 	return 1;
4252       }
4253     }
4254     if (is_include) {
4255       quaterarr_collapse_init(sex_male, unfiltered_sample_ct, sample_bitarr, sample_ct, *sample_male_include_quatervec_ptr);
4256     } else {
4257       quaterarr_collapse_init_exclude(sex_male, unfiltered_sample_ct, sample_bitarr, sample_ct, *sample_male_include_quatervec_ptr);
4258     }
4259   }
4260   return 0;
4261 }
4262 
void sample_delim_convert(uintptr_t unfiltered_sample_ct, const uintptr_t* sample_exclude, uint32_t sample_ct, uintptr_t max_sample_id_len, char oldc, char newc, char* sample_ids) {
  // Replaces the first occurrence of oldc with newc in each retained sample
  // ID.  (Normally each name contains exactly one delimiter.)
  uintptr_t sample_uidx = 0;
  uint32_t sample_idx;
  char* delim_ptr;
  for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
    next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
    delim_ptr = (char*)memchr(&(sample_ids[sample_uidx * max_sample_id_len]), (unsigned char)oldc, max_sample_id_len);
    // bugfix: previously a malformed name with no delimiter caused a null
    // pointer dereference; skip such names instead
    if (delim_ptr) {
      *delim_ptr = newc;
    }
  }
}
4274 
void get_set_wrange_align(const uintptr_t* __restrict bitarr, uintptr_t word_ct, uintptr_t* __restrict firstw_ptr, uintptr_t* __restrict wlen_ptr) {
  // Computes a word range covering all set bits of bitarr: on return, every
  // word outside [*firstw_ptr, *firstw_ptr + *wlen_ptr) is zero.  If no bit
  // is set, both outputs are zero.  On 64-bit builds the start index is
  // rounded down to an even (16-byte vector) boundary.
  const uintptr_t* bitarr_ptr = bitarr;
  const uintptr_t* bitarr_end = &(bitarr[word_ct]);
#ifdef __LP64__
  // scan two words (one vector) at a time; bitarr_end2 excludes any odd
  // trailing word
  const uintptr_t* bitarr_end2 = &(bitarr[word_ct & (~ONELU)]);
  while (bitarr_ptr < bitarr_end2) {
    if (bitarr_ptr[0] || bitarr_ptr[1]) {
      *firstw_ptr = (uintptr_t)(bitarr_ptr - bitarr);
      // a nonzero word is guaranteed to exist at or after bitarr_ptr, so
      // this backward scan for the last nonzero word terminates
      while (!(*(--bitarr_end)));
      *wlen_ptr = 1 + (uintptr_t)(bitarr_end - bitarr_ptr);
      return;
    }
    bitarr_ptr = &(bitarr_ptr[2]);
  }
  // check the odd trailing word, if present
  if ((bitarr_end2 != bitarr_end) && (*bitarr_end2)) {
    *firstw_ptr = word_ct - 1;
    *wlen_ptr = 1;
    return;
  }
#else
  while (bitarr_ptr < bitarr_end) {
    if (*bitarr_ptr) {
      *firstw_ptr = (uintptr_t)(bitarr_ptr - bitarr);
      // backward scan for the last nonzero word; terminates since a nonzero
      // word exists at or after bitarr_ptr
      while (!(*(--bitarr_end)));
      *wlen_ptr = 1 + (uintptr_t)(bitarr_end - bitarr_ptr);
      return;
    }
    bitarr_ptr++;
  }
#endif
  // no set bits
  *firstw_ptr = 0;
  *wlen_ptr = 0;
}
4308 
4309 // hashval computation left to caller since this is frequently used with
4310 // chromosome IDs, where the compiler can optimize the integer modulus
4311 // operation since the hash table size is preset
uint32_t unklen_id_htable_find(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t hashval, uint32_t id_htable_size) {
  // Hash-table lookup for a null-terminated ID of unknown length; the caller
  // supplies the initial bucket (hashval).  Returns the stored item index,
  // or 0xffffffffU if the ID is absent.
  uint32_t probe_incr = 1;
  for (;;) {
    const uint32_t entry = id_htable[hashval];
    if (entry == 0xffffffffU) {
      // empty bucket: not present
      return 0xffffffffU;
    }
    if (!strcmp(cur_id, item_ids[entry])) {
      return entry;
    }
    // quadratic probing; the subtraction form wraps around without uint32
    // overflow
    const uint32_t wrap_distance = id_htable_size - hashval;
    if (wrap_distance > probe_incr) {
      hashval += probe_incr;
    } else {
      hashval = probe_incr - wrap_distance;
    }
    probe_incr += 2;
  }
}
4333 
nonstd_chrom_name_htable_find(const char * chrom_name,const char * const * nonstd_names,const uint32_t * nonstd_id_htable,uint32_t name_slen)4334 static inline uint32_t nonstd_chrom_name_htable_find(const char* chrom_name, const char* const* nonstd_names, const uint32_t* nonstd_id_htable, uint32_t name_slen) {
4335   const uint32_t hashval = murmurhash3_32(chrom_name, name_slen) % CHROM_NAME_HTABLE_SIZE;
4336   return unklen_id_htable_find(chrom_name, nonstd_names, nonstd_id_htable, hashval, CHROM_NAME_HTABLE_SIZE);
4337 }
4338 
4339 
4340 // Global since species_str() may be called by functions which don't actually
4341 // care about chrom_info.  (chrom_info is really a global variable too, but I
4342 // find it easier to maintain this code when chrom_info dependencies are made
4343 // explicit in the function signatures; in contrast, g_species_singular and
4344 // g_species_plural are just for pretty printing and lend no insight into what
4345 // the functions which reference them are doing.)
// Both are assigned by init_species(); they remain nullptr until a species
// is chosen.
const char* g_species_singular = nullptr;
const char* g_species_plural = nullptr;
4348 
int32_t init_chrom_info(Chrom_info* chrom_info_ptr) {
  // "constructor".  initializes with maximum capacity.  doesn't use bigstack.
  // chrom_mask, haploid_mask: bits
  // chrom_file_order, chrom_idx_to_foidx: int32s
  // chrom_fo_vidx_start: int32s, with an extra trailing element
  // nonstd_names: intptr_ts
  // nonstd_id_htable: CHROM_NAME_HTABLE_SIZE int32s
  //
  // All of these arrays are carved out of a single aligned_malloc'd arena
  // anchored at chrom_mask; finalize_chrom_info() later replaces it with a
  // right-sized copy.  Returns RET_NOMEM on allocation failure, 0 on
  // success.

  assert(!(MAX_POSSIBLE_CHROM % VEC_BYTES));
  // total size of all the arrays listed above, in VEC_BYTES units
  const uintptr_t vecs_required = 2 * BITCT_TO_VECCT(MAX_POSSIBLE_CHROM) + 3 * (MAX_POSSIBLE_CHROM / VEC_INT32) + 1 + (MAX_POSSIBLE_CHROM / VEC_WORDS) + (CHROM_NAME_HTABLE_SIZE + (VEC_INT32 - 1)) / VEC_INT32;

  // needed for proper cleanup
  chrom_info_ptr->name_ct = 0;
  chrom_info_ptr->incl_excl_name_stack = nullptr;
  if (aligned_malloc(vecs_required * VEC_BYTES, &(chrom_info_ptr->chrom_mask))) {
    return RET_NOMEM;
  }
  // carve the arena into the individual arrays, in the order listed above
  uintptr_t* alloc_iter = &(chrom_info_ptr->chrom_mask[BITCT_TO_VECCT(MAX_POSSIBLE_CHROM) * VEC_WORDS]);
  chrom_info_ptr->haploid_mask = alloc_iter;
  alloc_iter = &(alloc_iter[BITCT_TO_VECCT(MAX_POSSIBLE_CHROM) * VEC_WORDS]);
  chrom_info_ptr->chrom_file_order = (uint32_t*)alloc_iter;
  alloc_iter = &(alloc_iter[(MAX_POSSIBLE_CHROM / VEC_INT32) * VEC_WORDS]);
  chrom_info_ptr->chrom_fo_vidx_start = (uint32_t*)alloc_iter;
  // extra vector accommodates chrom_fo_vidx_start's trailing element
  alloc_iter = &(alloc_iter[((MAX_POSSIBLE_CHROM / VEC_INT32) + 1) * VEC_WORDS]);
  chrom_info_ptr->chrom_idx_to_foidx = (uint32_t*)alloc_iter;
  alloc_iter = &(alloc_iter[(MAX_POSSIBLE_CHROM / VEC_INT32) * VEC_WORDS]);
  chrom_info_ptr->nonstd_names = (char**)alloc_iter;
  alloc_iter = &(alloc_iter[MAX_POSSIBLE_CHROM]);
  chrom_info_ptr->nonstd_id_htable = (uint32_t*)alloc_iter;
  // alloc_iter = &(alloc_iter[((CHROM_NAME_HTABLE_SIZE + (VEC_INT32 - 1)) / VEC_INT32) * VEC_WORDS]);
  // postpone nonstd_id_htable initialization until first nonstandard ID is
  // loaded
  // fill_uint_one(CHROM_NAME_HTABLE_SIZE, chrom_info_ptr->nonstd_id_htable);
  return 0;
}
4384 
4385 // if these are defined within init_species(), they may not persist after
4386 // function exit
// order must match the species codes used by init_species() (human first,
// then cow, dog, horse, mouse, plant, sheep, unknown)
static const char species_singular_constants[][7] = {"person", "cow", "dog", "horse", "mouse", "plant", "sheep", "sample"};
static const char species_plural_constants[][8] = {"people", "cattle", "dogs", "horses", "mice", "plants", "sheep", "samples"};
4389 
void init_species(uint32_t species_code, Chrom_info* chrom_info_ptr) {
  // Initializes chromosome-set fields of *chrom_info_ptr for the given
  // species, and points g_species_singular/g_species_plural at the matching
  // pretty-printing strings.
  //
  // human: 22, X, Y, XY, MT
  // cow: 29, X, Y, MT
  // dog: 38, X, Y, XY, MT
  // horse: 31, X, Y
  // mouse: 19, X, Y
  // rice: 12
  // sheep: 26, X, Y
  const int32_t species_xymt_codes[] = {
    // X, Y, XY, MT codes per species; -2 = code does not exist
    23, 24, 25, 26,  // human
    30, 31, -2, 33,  // cow
    39, 40, 41, 42,  // dog
    32, 33, -2, -2,  // horse
    20, 21, -2, -2,  // mouse
    -2, -2, -2, -2,  // rice
    27, 28, -2, -2}; // sheep
  const uint32_t species_autosome_ct[] = {22, 29, 38, 31, 19, 12, 26};
  const uint32_t species_max_code[] = {26, 33, 42, 33, 21, 12, 28};
  fill_ulong_zero(CHROM_MASK_WORDS, chrom_info_ptr->chrom_mask);
  chrom_info_ptr->output_encoding = 0;
  chrom_info_ptr->zero_extra_chroms = 0;
  chrom_info_ptr->species = species_code;
  chrom_info_ptr->is_include_stack = 0;
  g_species_singular = species_singular_constants[species_code];
  g_species_plural = species_plural_constants[species_code];
  if (species_code != SPECIES_UNKNOWN) {
    // these are assumed to be already initialized in the SPECIES_UNKNOWN case

    // bugfix: haploid_mask was being cleared in --chr-set case
    fill_ulong_zero(CHROM_MASK_WORDS, chrom_info_ptr->haploid_mask);
    memcpy(chrom_info_ptr->xymt_codes, &(species_xymt_codes[species_code * XYMT_OFFSET_CT]), XYMT_OFFSET_CT * sizeof(int32_t));
    chrom_info_ptr->autosome_ct = species_autosome_ct[species_code];
    chrom_info_ptr->max_code = species_max_code[species_code];
    // each haploid_mask constant sets the bits for the species' X and Y
    // codes from the table above (rice: all chromosome bits)
    switch (species_code) {
    case SPECIES_HUMAN:
      // bits 23 (X), 24 (Y)
      chrom_info_ptr->haploid_mask[0] = 0x1800000;
      break;
    case SPECIES_COW:
      // bits 30 (X), 31 (Y)
      chrom_info_ptr->haploid_mask[0] = 0xc0000000LU;
      break;
    case SPECIES_DOG:
      // bits 39 (X), 40 (Y)
#ifdef __LP64__
      chrom_info_ptr->haploid_mask[0] = 0x18000000000LLU;
#else
      chrom_info_ptr->haploid_mask[1] = 0x180;
#endif
      break;
    case SPECIES_HORSE:
      // bits 32 (X), 33 (Y)
#ifdef __LP64__
      chrom_info_ptr->haploid_mask[0] = 0x300000000LLU;
#else
      chrom_info_ptr->haploid_mask[1] = 3;
#endif
      break;
    case SPECIES_MOUSE:
      // bits 20 (X), 21 (Y)
      chrom_info_ptr->haploid_mask[0] = 0x300000;
      break;
    case SPECIES_RICE:
      // bits 0-12: all chromosomes treated as haploid
      chrom_info_ptr->haploid_mask[0] = 0x1fff;
      break;
    case SPECIES_SHEEP:
      // bits 27 (X), 28 (Y)
      chrom_info_ptr->haploid_mask[0] = 0x18000000;
      break;
    }
  }
  // runs in the SPECIES_UNKNOWN case too (max_code assumed preinitialized
  // there); marks all chromosome codes as not-yet-loaded
  fill_uint_one(chrom_info_ptr->max_code + 1, chrom_info_ptr->chrom_idx_to_foidx);
}
4457 
void init_default_chrom_mask(Chrom_info* chrom_info_ptr) {
  // Selects every chromosome code when no explicit chromosome filter was
  // requested.
  if (chrom_info_ptr->species != SPECIES_UNKNOWN) {
    fill_all_bits(chrom_info_ptr->max_code + 1, chrom_info_ptr->chrom_mask);
    return;
  }
  // --chr-set support: include all autosomes, plus whichever X/Y/XY/MT codes
  // exist for the custom chromosome set
  fill_all_bits(chrom_info_ptr->autosome_ct + 1, chrom_info_ptr->chrom_mask);
  for (uint32_t xymt_idx = 0; xymt_idx < XYMT_OFFSET_CT; ++xymt_idx) {
    const int32_t cur_code = chrom_info_ptr->xymt_codes[xymt_idx];
    if (cur_code != -2) {
      set_bit(cur_code, chrom_info_ptr->chrom_mask);
    }
  }
}
4472 
void forget_extra_chrom_names(uint32_t reinitialize, Chrom_info* chrom_info_ptr) {
  // Frees all nonstandard chromosome name strings.  If reinitialize is set,
  // also resets the name hash table and name count so new names can be
  // loaded.
  const uint32_t name_ct = chrom_info_ptr->name_ct;
  if (!name_ct) {
    // guard against init_species() not being called yet
    return;
  }
  char** nonstd_names = chrom_info_ptr->nonstd_names;
  const uint32_t first_nonstd_idx = chrom_info_ptr->max_code + 1;
  const uint32_t nonstd_idx_end = first_nonstd_idx + name_ct;
  for (uint32_t chrom_idx = first_nonstd_idx; chrom_idx < nonstd_idx_end; ++chrom_idx) {
    free(nonstd_names[chrom_idx]);
    nonstd_names[chrom_idx] = nullptr;
  }
  if (reinitialize) {
    fill_uint_one(CHROM_NAME_HTABLE_SIZE, chrom_info_ptr->nonstd_id_htable);
    chrom_info_ptr->name_ct = 0;
  }
}
4489 
int32_t finalize_chrom_info(Chrom_info* chrom_info_ptr) {
  // Repacks all Chrom_info arrays (the two masks, the file-order tables,
  // and, when nonstandard names exist, the name pointer array plus its hash
  // table) into one right-sized aligned allocation, then frees the original
  // combined allocation.  Returns 0 on success, RET_NOMEM on failure.
  const uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
  const uint32_t name_ct = chrom_info_ptr->name_ct;
  const uint32_t chrom_code_end = chrom_info_ptr->max_code + 1 + name_ct;
  // per-array sizes, each rounded up to a whole number of vectors so every
  // sub-array stays vector-aligned within the combined allocation
  const uint32_t chrom_code_bitvec_ct = BITCT_TO_VECCT(chrom_code_end);
  const uint32_t chrom_ct_int32vec_ct = (chrom_ct + (VEC_INT32 - 1)) / VEC_INT32;
  const uint32_t chrom_ct_p1_int32vec_ct = 1 + (chrom_ct / VEC_INT32);
  const uint32_t chrom_code_end_int32vec_ct = (chrom_code_end + (VEC_INT32 - 1)) / VEC_INT32;
  const uint32_t chrom_code_end_wordvec_ct = (chrom_code_end + (VEC_WORDS - 1)) / VEC_WORDS;
  uint32_t final_vecs_required = 2 * chrom_code_bitvec_ct + chrom_ct_int32vec_ct + chrom_ct_p1_int32vec_ct + chrom_code_end_int32vec_ct;
  if (name_ct) {
    // nonstandard names present: also reserve room for the name pointer
    // array and the name hash table
    final_vecs_required += chrom_code_end_wordvec_ct + (CHROM_NAME_HTABLE_SIZE + (VEC_INT32 - 1)) / VEC_INT32;
  }
  uintptr_t* new_alloc;
  if (aligned_malloc(final_vecs_required * VEC_BYTES, &new_alloc)) {
    return RET_NOMEM;
  }
  // chrom_mask points at the base of the original combined allocation
  uintptr_t* old_alloc = chrom_info_ptr->chrom_mask;
  uintptr_t* new_alloc_iter = new_alloc;

  memcpy(new_alloc_iter, chrom_info_ptr->chrom_mask, chrom_code_bitvec_ct * VEC_BYTES);
  chrom_info_ptr->chrom_mask = new_alloc_iter;
  new_alloc_iter = &(new_alloc_iter[chrom_code_bitvec_ct * VEC_WORDS]);

  memcpy(new_alloc_iter, chrom_info_ptr->haploid_mask, chrom_code_bitvec_ct * VEC_BYTES);
  chrom_info_ptr->haploid_mask = new_alloc_iter;
  new_alloc_iter = &(new_alloc_iter[chrom_code_bitvec_ct * VEC_WORDS]);

  memcpy(new_alloc_iter, chrom_info_ptr->chrom_file_order, chrom_ct_int32vec_ct * VEC_BYTES);
  chrom_info_ptr->chrom_file_order = (uint32_t*)new_alloc_iter;
  new_alloc_iter = &(new_alloc_iter[chrom_ct_int32vec_ct * VEC_WORDS]);

  memcpy(new_alloc_iter, chrom_info_ptr->chrom_fo_vidx_start, chrom_ct_p1_int32vec_ct * VEC_BYTES);
  chrom_info_ptr->chrom_fo_vidx_start = (uint32_t*)new_alloc_iter;
  new_alloc_iter = &(new_alloc_iter[chrom_ct_p1_int32vec_ct * VEC_WORDS]);

  memcpy(new_alloc_iter, chrom_info_ptr->chrom_idx_to_foidx, chrom_code_end_int32vec_ct * VEC_BYTES);
  chrom_info_ptr->chrom_idx_to_foidx = (uint32_t*)new_alloc_iter;

  if (!name_ct) {
    chrom_info_ptr->nonstd_names = nullptr;
    chrom_info_ptr->nonstd_id_htable = nullptr;
  } else {
    new_alloc_iter = &(new_alloc_iter[chrom_code_end_int32vec_ct * VEC_WORDS]);

    memcpy(new_alloc_iter, chrom_info_ptr->nonstd_names, chrom_code_end_wordvec_ct * VEC_BYTES);
    chrom_info_ptr->nonstd_names = (char**)new_alloc_iter;
    new_alloc_iter = &(new_alloc_iter[chrom_code_end_wordvec_ct * VEC_WORDS]);

    memcpy(new_alloc_iter, chrom_info_ptr->nonstd_id_htable, CHROM_NAME_HTABLE_SIZE * sizeof(int32_t));
    chrom_info_ptr->nonstd_id_htable = (uint32_t*)new_alloc_iter;
  }
  aligned_free(old_alloc);
  return 0;
}
4545 
void cleanup_chrom_info(Chrom_info* chrom_info_ptr) {
  // Frees everything owned by a Chrom_info: the nonstandard name strings,
  // the aligned combined allocation, and the include/exclude name stack.
  if (chrom_info_ptr->chrom_mask) {
    // The nonstandard name strings must be freed before the aligned block:
    // after finalize_chrom_info(), the nonstd_names pointer array lives
    // inside the allocation based at chrom_mask.
    forget_extra_chrom_names(0, chrom_info_ptr);
    aligned_free(chrom_info_ptr->chrom_mask);
    chrom_info_ptr->chrom_mask = nullptr;
  }
  Ll_str* cur_node = chrom_info_ptr->incl_excl_name_stack;
  while (cur_node) {
    Ll_str* doomed_node = cur_node;
    cur_node = cur_node->next;
    free(doomed_node);
  }
  chrom_info_ptr->incl_excl_name_stack = nullptr;
}
4562 
char* chrom_name_std(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, char* buf) {
  // Writes the display form of a standard chromosome code (autosome number,
  // or X/Y/XY/MT) to buf, honoring the configured --output-chr encoding,
  // and returns a pointer just past the last written character.  No null
  // terminator is appended.
  const uint32_t output_encoding = chrom_info_ptr->output_encoding;
  if (output_encoding & (CHR_OUTPUT_PREFIX | CHR_OUTPUT_0M)) {
    if (output_encoding == CHR_OUTPUT_0M) {
      // force two chars
      if (chrom_idx <= chrom_info_ptr->autosome_ct) {
        // zero-padded two-digit autosome number
        buf = (char*)memcpya(buf, &(digit2_table[chrom_idx * 2]), 2);
      } else if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[XY_OFFSET]) {
        buf = (char*)memcpya(buf, "XY", 2);
      } else {
        // X, Y, and MT get a leading '0' to fill the two-char field
        *buf++ = '0';
        if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[X_OFFSET]) {
          *buf++ = 'X';
        } else {
          // assumes only X/Y/XY/MT defined
          *buf++ = ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[Y_OFFSET])? 'Y' : 'M';
        }
      }
      return buf;
    }
    // "chr" prefix requested
    buf = memcpyl3a(buf, "chr");
  }
  if ((!(output_encoding & (CHR_OUTPUT_M | CHR_OUTPUT_MT))) || (chrom_idx <= chrom_info_ptr->autosome_ct)) {
    // purely numeric rendering
    return uint32toa(chrom_idx, buf);
  } else if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[X_OFFSET]) {
    *buf++ = 'X';
  } else if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[Y_OFFSET]) {
    *buf++ = 'Y';
  } else if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[XY_OFFSET]) {
    buf = (char*)memcpya(buf, "XY", 2);
  } else {
    // mitochondrial: "M" or "MT" depending on the encoding flag
    *buf++ = 'M';
    if (output_encoding & CHR_OUTPUT_MT) {
      *buf++ = 'T';
    }
  }
  return buf;
}
4601 
char* chrom_name_write(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, char* buf) {
  // Writes the display form of a chromosome code (assumed valid) to buf and
  // returns a pointer just past the written text.
  if (chrom_idx) {
    if (chrom_idx <= chrom_info_ptr->max_code) {
      // standard code: delegate to the --output-chr-aware formatter
      return chrom_name_std(chrom_info_ptr, chrom_idx, buf);
    }
    if (!chrom_info_ptr->zero_extra_chroms) {
      return strcpya(buf, chrom_info_ptr->nonstd_names[chrom_idx]);
    }
  }
  // code 0, or a nonstandard code with zero_extra_chroms forcing '0' output
  *buf = '0';
  return &(buf[1]);
}
4616 
char* chrom_name_buf5w4write(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t* chrom_name_len_ptr, char* buf5) {
  // Produces a chromosome name suitable for fixed-width (4-character,
  // right-aligned) table output.  Usually writes into buf5 and returns buf5;
  // when a nonstandard name is too long to fit, the stored name itself is
  // returned instead.  *chrom_name_len_ptr is set to the length of the
  // returned text (4 whenever padding was applied).
  uint32_t slen;
  *chrom_name_len_ptr = 4;
  if (!chrom_idx) {
    memcpy(buf5, "   0", 4);
  } else if (chrom_idx <= chrom_info_ptr->max_code) {
    if (chrom_info_ptr->output_encoding & CHR_OUTPUT_PREFIX) {
      // "chr"-prefixed names aren't padded; report the actual length
      *chrom_name_len_ptr = (uintptr_t)(chrom_name_std(chrom_info_ptr, chrom_idx, buf5) - buf5);
    } else {
      // right-align the freshly written name within 4 characters
      width_force(4, buf5, chrom_name_std(chrom_info_ptr, chrom_idx, buf5));
    }
  } else if (chrom_info_ptr->zero_extra_chroms) {
    memcpy(buf5, "   0", 4);
  } else {
    slen = strlen(chrom_info_ptr->nonstd_names[chrom_idx]);
    if (slen < 4) {
      fw_strcpyn(4, slen, chrom_info_ptr->nonstd_names[chrom_idx], buf5);
    } else {
      // too long for buf5's padded field; hand back the stored name directly
      *chrom_name_len_ptr = slen;
      return chrom_info_ptr->nonstd_names[chrom_idx];
    }
  }
  return buf5;
}
4641 
get_max_chrom_slen(const Chrom_info * chrom_info_ptr)4642 uint32_t get_max_chrom_slen(const Chrom_info* chrom_info_ptr) {
4643   // does not include trailing null
4644   // can be overestimate
4645   // if more functions start calling this, it should just be built into
4646   // load_bim() instead
4647   if (chrom_info_ptr->zero_extra_chroms) {
4648     return 3 + MAX_CHROM_TEXTNUM_SLEN;
4649   }
4650   const uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
4651   const uint32_t max_code = chrom_info_ptr->max_code;
4652   uint32_t max_chrom_slen = 3 + MAX_CHROM_TEXTNUM_SLEN;
4653   for (uint32_t chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
4654     const uint32_t chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
4655     if (!is_set(chrom_info_ptr->chrom_mask, chrom_idx)) {
4656       continue;
4657     }
4658     if (chrom_idx > max_code) {
4659       const uint32_t name_slen = strlen(chrom_info_ptr->nonstd_names[chrom_idx]);
4660       if (name_slen > max_chrom_slen) {
4661 	max_chrom_slen = name_slen;
4662       }
4663     }
4664   }
4665   return max_chrom_slen;
4666 }
4667 
haploid_chrom_present(const Chrom_info * chrom_info_ptr)4668 uint32_t haploid_chrom_present(const Chrom_info* chrom_info_ptr) {
4669   const uintptr_t* chrom_mask = chrom_info_ptr->chrom_mask;
4670   const uintptr_t* haploid_mask = chrom_info_ptr->haploid_mask;
4671   for (uint32_t widx = 0; widx < CHROM_MASK_INITIAL_WORDS; widx++) {
4672     if (chrom_mask[widx] & haploid_mask[widx]) {
4673       return 1;
4674     }
4675   }
4676   return 0;
4677 }
4678 
single_letter_chrom(uint32_t letter)4679 static inline int32_t single_letter_chrom(uint32_t letter) {
4680   letter &= 0xdf;
4681   if (letter == 'X') {
4682     return CHROM_X;
4683   } else if (letter == 'Y') {
4684     return CHROM_Y;
4685   } else if (letter == 'M') {
4686     return CHROM_MT;
4687   } else {
4688     return -1;
4689   }
4690 }
4691 
int32_t get_chrom_code_raw(const char* sptr) {
  // Parses the chromosome code at the start of sptr with no Chrom_info
  // context: returns a nonnegative code on success, -1 on parse failure.
  // Accepts an optional case-insensitive "chr" prefix, one- or two-digit
  // numbers, X/Y/M(T)/XY, and Oxford-style 0X/0Y/0M.
  // any character <= ' ' is considered a terminator
  // note that char arithmetic tends to be compiled to int32 operations, so we
  // mostly work with ints here
  // assumes MAX_CHROM_TEXTNUM_SLEN == 2
  uint32_t first_char_code = (unsigned char)sptr[0];
  uint32_t second_char_code = (unsigned char)sptr[1];
  if ((first_char_code & 0xdf) == 'C') {
    // skip a "chr"/"CHR"/... prefix; a lone 'C' start is invalid
    if (((second_char_code & 0xdf) == 'H') && ((((unsigned char)sptr[2]) & 0xdf) == 'R')) {
      sptr = &(sptr[3]);
      first_char_code = (unsigned char)sptr[0];
      second_char_code = (unsigned char)sptr[1];
    } else {
      return -1;
    }
  }
  if (second_char_code > ' ') {
    // exactly-two-character code; a third non-terminator char is invalid
    if (sptr[2] > ' ') {
      return -1;
    }
    const uint32_t first_char_toi = first_char_code - '0';
    if (first_char_toi < 10) {
      const uint32_t second_char_toi = second_char_code - '0';
      if (second_char_toi < 10) {
	return first_char_toi * 10 + second_char_toi;
      } else if (!first_char_toi) {
	// accept '0X', '0Y', '0M' emitted by Oxford software
	return single_letter_chrom(second_char_code);
      }
    } else {
      // nonnumeric two-character codes: XY and MT (any case)
      first_char_code &= 0xdf;
      if (first_char_code == 'X') {
        if ((second_char_code == 'Y') || (second_char_code == 'y')) {
	  return CHROM_XY;
	}
      } else if (first_char_code == 'M') {
        if ((second_char_code == 'T') || (second_char_code == 't')) {
	  return CHROM_MT;
	}
      }
    }
  } else {
    // single-character code: a digit, or X/Y/M
    const uint32_t first_char_toi = first_char_code - '0';
    if (first_char_toi < 10) {
      return first_char_toi;
    } else {
      return single_letter_chrom(first_char_code);
    }
  }
  return -1;
}
4743 
int32_t get_chrom_code(const char* chrom_name, const Chrom_info* chrom_info_ptr, uint32_t name_slen) {
  // Resolves a chromosome name to this dataset's numeric code, checking the
  // nonstandard-name hash table when the raw parse fails.
  // requires chrom_name to be null-terminated
  // in practice, name_slen will usually already be known, may as well avoid
  // redundant strlen() calls even though this uglifies the interface
  // does not perform exhaustive error-checking
  // -1 = --allow-extra-chr ok, -2 = total fail
  const int32_t chrom_code_raw = get_chrom_code_raw(chrom_name);
  if (((const uint32_t)chrom_code_raw) <= chrom_info_ptr->max_code) {
    // in-range standard code (the unsigned cast makes negatives fall through)
    return chrom_code_raw;
  }
  if (chrom_code_raw != -1) {
    if (chrom_code_raw >= MAX_POSSIBLE_CHROM) {
      // X/Y/XY/MT: translate to this species' code via xymt_codes
      return chrom_info_ptr->xymt_codes[chrom_code_raw - MAX_POSSIBLE_CHROM];
    }
    // numeric code beyond this species' range
    return -2;
  }
  if (!chrom_info_ptr->name_ct) {
    return -1;
  }
  // 0xffffffffU gets casted to -1
  return (int32_t)nonstd_chrom_name_htable_find(chrom_name, (const char* const*)chrom_info_ptr->nonstd_names, chrom_info_ptr->nonstd_id_htable, name_slen);
}
4766 
get_chrom_code_counted(const Chrom_info * chrom_info_ptr,uint32_t name_slen,char * chrom_name)4767 int32_t get_chrom_code_counted(const Chrom_info* chrom_info_ptr, uint32_t name_slen, char* chrom_name) {
4768   // when the chromosome name isn't null-terminated
4769   char* s_end = &(chrom_name[name_slen]);
4770   const char tmpc = *s_end;
4771   *s_end = '\0';
4772   const int32_t retval = get_chrom_code(chrom_name, chrom_info_ptr, name_slen);
4773   *s_end = tmpc;
4774   return retval;
4775 }
4776 
get_variant_chrom_fo_idx(const Chrom_info * chrom_info_ptr,uintptr_t variant_uidx)4777 uint32_t get_variant_chrom_fo_idx(const Chrom_info* chrom_info_ptr, uintptr_t variant_uidx) {
4778   const uint32_t* variant_binsearch = chrom_info_ptr->chrom_fo_vidx_start;
4779   uint32_t chrom_fo_min = 0;
4780   uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
4781   while (chrom_ct - chrom_fo_min > 1) {
4782     const uint32_t chrom_fo_cur = (chrom_ct + chrom_fo_min) / 2;
4783     if (variant_binsearch[chrom_fo_cur] > variant_uidx) {
4784       chrom_ct = chrom_fo_cur;
4785     } else {
4786       chrom_fo_min = chrom_fo_cur;
4787     }
4788   }
4789   return chrom_fo_min;
4790 }
4791 
void chrom_error(const char* chrom_name, const char* file_descrip, const Chrom_info* chrom_info_ptr, uintptr_t line_idx, int32_t error_code) {
  // Logs an "invalid chromosome code" error, with extra guidance when the
  // name parses as a standard code excluded by the current species /
  // --chr-set configuration.  error_code is the get_chrom_code() result;
  // only -1 triggers the --allow-extra-chr hint.
  // assumes chrom_name is null-terminated
  const int32_t raw_code = get_chrom_code_raw(chrom_name);
  logprint("\n");
  // line_idx == 0 means no line number is available
  if (line_idx) {
    LOGERRPRINTFWW("Error: Invalid chromosome code '%s' on line %" PRIuPTR " of %s.\n", chrom_name, line_idx, file_descrip);
  } else {
    LOGERRPRINTFWW("Error: Invalid chromosome code '%s' in %s.\n", chrom_name, file_descrip);
  }
  // does the name parse as a standard code that this configuration rejects?
  if ((raw_code > ((int32_t)chrom_info_ptr->max_code)) && ((raw_code <= MAX_CHROM_TEXTNUM + XYMT_OFFSET_CT) || (raw_code >= MAX_POSSIBLE_CHROM))) {
    if (chrom_info_ptr->species != SPECIES_UNKNOWN) {
      if (chrom_info_ptr->species == SPECIES_HUMAN) {
	logerrprint("(This is disallowed for humans.  Check if the problem is with your data, or if\nyou forgot to define a different chromosome set with e.g. --chr-set.)\n");
      } else {
	logerrprint("(This is disallowed by the PLINK 1.07 species flag you used.  You can\ntemporarily work around this restriction with --chr-set; contact the developers\nif you want the flag to be permanently redefined.)\n");
      }
    } else {
      logerrprint("(This is disallowed by your --chr-set/--autosome-num parameters.  Check if the\nproblem is with your data, or your command line.)\n");
    }
  } else if (error_code == -1) {
    logerrprint("(Use --allow-extra-chr to force it to be accepted.)\n");
  }
}
4815 
int32_t try_to_add_chrom_name(const char* chrom_name, const char* file_descrip, uintptr_t line_idx, uint32_t name_slen, uint32_t allow_extra_chroms, int32_t* chrom_idx_ptr, Chrom_info* chrom_info_ptr) {
  // Registers a new nonstandard chromosome/contig name: assigns it the next
  // free code (max_code + 1 + name_ct), updates chrom_mask/haploid_mask
  // according to the --chr/--not-chr name stack, and inserts it into the
  // name hash table.  On success writes the new code to *chrom_idx_ptr and
  // returns 0; otherwise returns RET_MALFORMED_INPUT or RET_NOMEM.
  // assumes chrom_name is nonstandard (i.e. not "2", "chr2", "chrX", etc.)
  // requires chrom_name to be null-terminated
  // assumes chrom_idx currently has the return value of get_chrom_code()
  if ((!allow_extra_chroms) || ((*chrom_idx_ptr) == -2)) {
    chrom_error(chrom_name, file_descrip, chrom_info_ptr, line_idx, *chrom_idx_ptr);
    return RET_MALFORMED_INPUT;
  }

  // quasi-bugfix: remove redundant hash table check

  if (chrom_name[0] == '#') {
    // redundant with some of the comment-skipping loaders, but this isn't
    // performance-critical
    logprint("\n");
    logerrprint("Error: Chromosome/contig names may not begin with '#'.\n");
    return RET_MALFORMED_INPUT;
  }
  if (name_slen > MAX_ID_SLEN) {
    logprint("\n");
    if (line_idx) {
      LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has an excessively long chromosome/contig name. (The " PROG_NAME_CAPS " limit is " MAX_ID_SLEN_STR " characters.)\n", line_idx, file_descrip);
    } else {
      LOGERRPRINTFWW("Error: Excessively long chromosome/contig name in %s. (The " PROG_NAME_CAPS " limit is " MAX_ID_SLEN_STR " characters.)\n", file_descrip);
    }
    return RET_MALFORMED_INPUT;
  }
  const uint32_t max_code_p1 = chrom_info_ptr->max_code + 1;
  const uint32_t name_ct = chrom_info_ptr->name_ct;
  const uint32_t chrom_code_end = max_code_p1 + name_ct;
  if (chrom_code_end == MAX_POSSIBLE_CHROM) {
    logprint("\n");
    logerrprint("Error: Too many distinct nonstandard chromosome/contig names.\n");
    return RET_MALFORMED_INPUT;
  }
  if (!name_ct) {
    // lazy initialization of the hash table on first nonstandard name
    fill_uint_one(CHROM_NAME_HTABLE_SIZE, chrom_info_ptr->nonstd_id_htable);
  }
  char** nonstd_names = chrom_info_ptr->nonstd_names;
  nonstd_names[chrom_code_end] = (char*)malloc(name_slen + 1);
  if (!nonstd_names[chrom_code_end]) {
    return RET_NOMEM;
  }
  // decide whether the new chromosome is selected, based on whether it
  // appears in the include/exclude name stack
  Ll_str* name_stack_ptr = chrom_info_ptr->incl_excl_name_stack;
  uint32_t in_name_stack = 0;
  while (name_stack_ptr) {
    // there shouldn't be many of these, so sorting is unimportant
    if (!strcmp(chrom_name, name_stack_ptr->ss)) {
      in_name_stack = 1;
      break;
    }
    name_stack_ptr = name_stack_ptr->next;
  }
  if ((in_name_stack && chrom_info_ptr->is_include_stack) || ((!in_name_stack) && (!chrom_info_ptr->is_include_stack))) {
    SET_BIT(chrom_code_end, chrom_info_ptr->chrom_mask);
    // NOTE(review): bit 0 of haploid_mask appears to flag an all-haploid
    // chromosome set, making new contigs haploid too — confirm against
    // init_species()
    if (chrom_info_ptr->haploid_mask[0] & 1) {
      SET_BIT(chrom_code_end, chrom_info_ptr->haploid_mask);
    }
  }
  memcpy(nonstd_names[chrom_code_end], chrom_name, name_slen + 1);
  *chrom_idx_ptr = (int32_t)chrom_code_end;
  chrom_info_ptr->name_ct = name_ct + 1;
  // insert into the open-addressing hash table; 0xffffffffU marks an empty
  // slot
  uint32_t* id_htable = chrom_info_ptr->nonstd_id_htable;
  uint32_t hashval = murmurhash3_32(chrom_name, name_slen) % CHROM_NAME_HTABLE_SIZE;
  uint32_t next_incr = 1;
  while (1) {
    if (id_htable[hashval] == 0xffffffffU) {
      id_htable[hashval] = chrom_code_end;
      return 0;
    }
    // no overflow danger here
    hashval += next_incr;
    if (hashval >= CHROM_NAME_HTABLE_SIZE) {
      hashval -= CHROM_NAME_HTABLE_SIZE;
    }
    next_incr += 2; // quadratic probing
  }
}
4895 
uint32_t allele_set(const char* newval, uint32_t slen, char** allele_ptr) {
  // Points *allele_ptr at a null-terminated copy of newval (length slen).
  // Single-character alleles alias the static g_one_char_strs table instead
  // of being heap-allocated.  Returns 1 on out-of-memory, 0 on success.
  char* new_allele;
  if (slen == 1) {
    // entry k of the table is the 2-byte string {k, '\0'}
    new_allele = (char*)(&(g_one_char_strs[2 * ((unsigned char)(*newval))]));
  } else {
    new_allele = (char*)malloc(slen + 1);
    if (!new_allele) {
      return 1;
    }
    memcpy(new_allele, newval, slen);
    new_allele[slen] = '\0';
  }
  *allele_ptr = new_allele;
  return 0;
}
4910 
uint32_t allele_reset(const char* newval, uint32_t slen, char** allele_ptr) {
  // Like allele_set(), but frees the previous heap-allocated value of
  // *allele_ptr first.  Returns 1 on out-of-memory, 0 on success.
  char* newptr;
  if (slen == 1) {
    // single-character alleles alias the static g_one_char_strs table
    newptr = (char*)(&(g_one_char_strs[((uint8_t)*newval) * 2]));
  } else {
    newptr = (char*)malloc(slen + 1);
    if (!newptr) {
      return 1;
    }
    memcpyx(newptr, newval, slen, '\0');
  }
  if (allele_ptr[0][1]) {
    // second byte nonzero <=> old allele had length >= 2 <=> it was
    // heap-allocated rather than a g_one_char_strs pointer, so free it
    free(*allele_ptr);
  }
  *allele_ptr = newptr;
  return 0;
}
4928 
void cleanup_allele_storage(uint32_t max_allele_slen, uintptr_t allele_storage_entry_ct, char** allele_storage) {
  // Frees every heap-allocated allele string.  Pointers into the 512-byte
  // g_one_char_strs table (used for all single-character alleles) are
  // skipped; when max_allele_slen <= 1 every entry is such a pointer, so
  // there is nothing to free.
  if ((!allele_storage) || (max_allele_slen < 2)) {
    return;
  }
  const uintptr_t table_base = (uintptr_t)g_one_char_strs;
  for (uintptr_t entry_idx = 0; entry_idx != allele_storage_entry_ct; ++entry_idx) {
    char* cur_allele = allele_storage[entry_idx];
    assert(cur_allele);
    // unsigned wraparound turns the table-membership test into one compare
    if ((((uintptr_t)cur_allele) - table_base) >= 512) {
      free(cur_allele);
    }
  }
}
4942 
void cleanup_allele_storage2(uintptr_t allele_storage_entry_ct, char** allele_storage) {
  // Variant of cleanup_allele_storage() which tolerates null entries: a null
  // pointer marks a --merge-equal-pos hacked entry and causes that entry and
  // its successor to be skipped together.
  if (!allele_storage) {
    return;
  }
  const uintptr_t table_base = (uintptr_t)g_one_char_strs;
  uintptr_t entry_idx = 0;
  while (entry_idx < allele_storage_entry_ct) {
    char* cur_allele = allele_storage[entry_idx];
    if (!cur_allele) {
      // --merge-equal-pos hacked entry: skip the pair
      entry_idx += 2;
      continue;
    }
    // unsigned wraparound: one compare excludes the g_one_char_strs table
    if ((((uintptr_t)cur_allele) - table_base) >= 512) {
      free(cur_allele);
    }
    ++entry_idx;
  }
}
4961 
void refresh_chrom_info(const Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint32_t* __restrict chrom_end_ptr, uint32_t* __restrict chrom_fo_idx_ptr, uint32_t* __restrict is_x_ptr, uint32_t* __restrict is_y_ptr, uint32_t* __restrict is_mt_ptr, uint32_t* __restrict is_haploid_ptr) {
  // Advances *chrom_fo_idx_ptr / *chrom_end_ptr until the chromosome
  // containing marker_uidx is reached, then refreshes the
  // is_x/is_y/is_mt/is_haploid flags for that chromosome.
  // assumes we are at the end of the chromosome denoted by chrom_fo_idx.  Ok
  // for chrom_fo_idx == 0xffffffffU (the +1 below wraps to index 0).
  // assumes marker_uidx < unfiltered_marker_ct
  *chrom_end_ptr = chrom_info_ptr->chrom_fo_vidx_start[(*chrom_fo_idx_ptr) + 1];
  while (marker_uidx >= (*chrom_end_ptr)) {
    *chrom_end_ptr = chrom_info_ptr->chrom_fo_vidx_start[(++(*chrom_fo_idx_ptr)) + 1];
  }
  const int32_t chrom_idx = chrom_info_ptr->chrom_file_order[*chrom_fo_idx_ptr];
  *is_x_ptr = (chrom_idx == chrom_info_ptr->xymt_codes[X_OFFSET]);
  *is_y_ptr = (chrom_idx == chrom_info_ptr->xymt_codes[Y_OFFSET]);
  *is_mt_ptr = (chrom_idx == chrom_info_ptr->xymt_codes[MT_OFFSET]);
  *is_haploid_ptr = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
}
4976 
single_chrom_start(const Chrom_info * chrom_info_ptr,const uintptr_t * marker_exclude,uint32_t unfiltered_marker_ct)4977 int32_t single_chrom_start(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t unfiltered_marker_ct) {
4978   // Assumes there is at least one marker, and there are no split chromosomes.
4979   // Returns first marker_uidx in chromosome if there is only one, or -1 if
4980   // there's more than one chromosome.
4981   uint32_t first_marker_uidx = next_unset_unsafe(marker_exclude, 0);
4982   uint32_t last_marker_chrom = get_variant_chrom(chrom_info_ptr, last_clear_bit(marker_exclude, unfiltered_marker_ct));
4983   return (get_variant_chrom(chrom_info_ptr, first_marker_uidx) == last_marker_chrom)? first_marker_uidx : -1;
4984 }
4985 
double get_dmedian(const double* sorted_arr, uintptr_t len) {
  // Median of a presorted double array; returns 0.0 for an empty array.
  if (!len) {
    return 0.0;
  }
  const uintptr_t half = len / 2;
  if (len & 1) {
    return sorted_arr[half];
  }
  // even length: average the two central elements
  return (sorted_arr[half - 1] + sorted_arr[half]) * 0.5;
}
4997 
#ifdef __cplusplus
double destructive_get_dmedian(uintptr_t len, double* unsorted_arr) {
  // Median of unsorted_arr, 0.0 when len == 0; may reorder the array.
  // C++ build: partial selection via std::nth_element instead of a full sort.
  if (!len) {
    return 0.0;
  }
  uintptr_t len_d2 = len / 2;
  std::nth_element(unsorted_arr, &(unsorted_arr[len_d2]), &(unsorted_arr[len]));
  if (!(len % 2)) {
    // even length: also locate the largest element of the lower half
    std::nth_element(unsorted_arr, &(unsorted_arr[len_d2 - 1]), &(unsorted_arr[len_d2]));
    return (unsorted_arr[len_d2 - 1] + unsorted_arr[len_d2]) * 0.5;
  } else {
    return unsorted_arr[len_d2];
  }
}
#else
double destructive_get_dmedian(uintptr_t len, double* unsorted_arr) {
  // C build: sort-then-pick fallback (O(n log n) instead of O(n) expected).
  // no, I'm not gonna bother reimplementing introselect just for folks who
  // insist on using gcc over g++
  qsort(unsorted_arr, len, sizeof(double), double_cmp);
  return get_dmedian(unsorted_arr, len);
}
#endif
5020 
int32_t strcmp_casted(const void* s1, const void* s2) {
  // qsort()-compatible wrapper around strcmp() for arrays of fixed-width
  // character buffers.
  return strcmp((const char*)s1, (const char*)s2);
}
5024 
5025 // PLINK 2's natural sort uses the following logic:
5026 // - All alphabetic characters act as if they are capitalized, except for
5027 // tiebreaking purposes (where ASCII is used).
5028 // - Numbers are compared by magnitude, with the exception of...
5029 // - Numbers with leading zero(es).  If you're putting extraneous zeroes in
5030 // front of IDs, we assume they're there to force particular items to be sorted
5031 // earlier, rather than just appearing at random.  So, unlike many natural sort
5032 // implementations, we sort 00200 < 021 < 20: all numbers with n leading zeroes
5033 // are sorted before all numbers with (n-1) leading zeroes; magnitude only
5034 // applies if the leading zero counts match.  This handles e.g. subbasement
5035 // room numbering properly.
5036 //
5037 // This won't always do what you want if your IDs have variable-length decimals
5038 // in them (e.g. it yields 0.99 < 0.101); if you don't want to fall back on
5039 // ASCII sort, enforce a fixed number of digits after the decimal point.  Also
5040 // note that ASCII sort is outright better for e.g. numbers represented in
5041 // hexadecimal or base 36.  In principle, it's possible to reliably autodetect
5042 // some of these cases (especially hexadecimal numbers beginning with "0x"),
5043 // but that'll never be perfect so we just let the user toggle the sort method.
int32_t strcmp_natural_scan_forward(const unsigned char* s1, const unsigned char* s2) {
  // assumes s1 and s2 currently point to the middle of a mismatching number,
  // where s1 < s2.
  // Returns -1 unless s1's digit run outlasts s2's: when both numbers have
  // the same remaining digit count the earlier mismatch decides (s1 smaller),
  // but a strictly longer number has greater magnitude.
  unsigned char c1;
  unsigned char c2;
  do {
    c1 = *(++s1);
    c2 = *(++s2);
    if (is_not_digit(c1)) {
      // s1's number ended no later than s2's
      return -1;
    }
  } while (is_digit(c2));
  // s2's number ended while s1 still has digits: s1 is longer, hence larger
  return 1;
}
5058 
5059 // We have the following major states:
5060 //   0 (initial): strings perfectly match so far, last char (if any) is
5061 //                nonnumeric.
5062 //   1: strings perfectly match so far, last char is numeric.
5063 //   2: strings match except for capitalization, last char is nonnumeric.
5064 //   3: strings match except for capitalization, last char is numeric.
5065 // strcmp_natural_tiebroken() expresses the logic for states 2 and 3, while
5066 // strcmp_natural_uncasted() handles states 0 and 1.
int32_t strcmp_natural_tiebroken(const unsigned char* s1, const unsigned char* s2) {
  // Natural-sort comparison continuation for strings already known to match
  // except for capitalization (major states 2 and 3; see the state list
  // above).  Never returns 0: on a full-length tie the earlier case
  // difference wins.
  // assumes ties should be broken in favor of s2.
  unsigned char c1 = *(++s1);
  unsigned char c2 = *(++s2);
  while (is_not_nzdigit(c1) && is_not_nzdigit(c2)) {
    // state 2: nonnumeric comparison, case-insensitive
  strcmp_natural_tiebroken_state_2:
    if (c1 != c2) {
      // uppercase both before comparing; ties were already broken above
      if ((c1 >= 'a') && (c1 <= 'z')) {
	c1 -= 32;
      }
      if ((c2 >= 'a') && (c2 <= 'z')) {
	c2 -= 32;
      }
      if (c1 < c2) {
	return -1;
      } else if (c1 > c2) {
	return 1;
      }
    } else if (!c1) {
      // both strings ended: s2 wins the tie by assumption
      return -1;
    }
    c1 = *(++s1);
    c2 = *(++s2);
  }
  if (is_not_nzdigit(c1) || is_not_nzdigit(c2)) {
    // exactly one side starts a (nonzero-leading) number here
    return (c1 < c2)? -1 : 1;
  }
  do {
    // state 3: digit-by-digit number comparison
    if (c1 != c2) {
      if (is_digit(c2)) {
	// both digits: magnitude decided by remaining digit-run lengths
	if (c1 < c2) {
	  return strcmp_natural_scan_forward(s1, s2);
	} else {
	  return -strcmp_natural_scan_forward(s2, s1);
	}
      }
      // s2's number ended first, so s1's number is longer/larger
      return 1;
    }
    c1 = *(++s1);
    c2 = *(++s2);
  } while (is_digit(c1));
  if (is_digit(c2)) {
    return -1;
  }
  // numbers matched exactly; resume nonnumeric comparison,
  // skip the while (is_not_digit...) check
  goto strcmp_natural_tiebroken_state_2;
}
5116 
static inline int32_t strcmp_natural_uncasted(const unsigned char* s1, const unsigned char* s2) {
  // Core natural-sort comparison (major states 0 and 1; see the state list
  // above).  Hands off to strcmp_natural_tiebroken() the first time the
  // strings differ only in capitalization.
  unsigned char c1 = *s1;
  unsigned char c2 = *s2;
  while (is_not_nzdigit(c1) && is_not_nzdigit(c2)) {
    // state 0: nonnumeric comparison, strings identical so far
  strcmp_natural_uncasted_state_0:
    if (c1 != c2) {
      if ((c1 >= 'a') && (c1 <= 'z')) {
	if (c2 + 32 == c1) {
	  // same letter, c1 lowercase: case-insensitive tie, broken later
	  return -strcmp_natural_tiebroken(s2, s1);
	} else if ((c2 < 'a') || (c2 > 'z')) {
	  c1 -= 32;
	}
      } else if ((c2 >= 'a') && (c2 <= 'z')) {
	c2 -= 32;
	if (c1 == c2) {
	  // same letter, c2 lowercase: case-insensitive tie, broken later
	  return strcmp_natural_tiebroken(s1, s2);
	}
      }
      return (c1 < c2)? -1 : 1;
    } else if (!c1) {
      // identical strings
      return 0;
    }
    c1 = *(++s1);
    c2 = *(++s2);
  }
  if (is_not_nzdigit(c1) || is_not_nzdigit(c2)) {
    // exactly one side starts a (nonzero-leading) number here
    return (c1 < c2)? -1 : 1;
  }
  do {
    // state 1: digit-by-digit number comparison
    if (c1 != c2) {
      if (is_digit(c2)) {
	// both digits: magnitude decided by remaining digit-run lengths
	if (c1 < c2) {
	  return strcmp_natural_scan_forward(s1, s2);
	} else {
	  return -strcmp_natural_scan_forward(s2, s1);
	}
      }
      // s2's number ended first, so s1's number is longer/larger
      return 1;
    }
    c1 = *(++s1);
    c2 = *(++s2);
  } while (is_digit(c1));
  if (is_digit(c2)) {
    return -1;
  }
  // numbers matched exactly; resume nonnumeric comparison
  goto strcmp_natural_uncasted_state_0;
}
5166 
strcmp_natural(const void * s1,const void * s2)5167 int32_t strcmp_natural(const void* s1, const void* s2) {
5168   return strcmp_natural_uncasted((unsigned char*)s1, (unsigned char*)s2);
5169 }
5170 
int32_t strcmp_deref(const void* s1, const void* s2) {
  // qsort()-compatible strcmp() for arrays of char*: compares the pointed-to
  // strings, not the pointer values.
  const char* str1 = *(const char* const*)s1;
  const char* str2 = *(const char* const*)s2;
  return strcmp(str1, str2);
}
5174 
strcmp_natural_deref(const void * s1,const void * s2)5175 int32_t strcmp_natural_deref(const void* s1, const void* s2) {
5176   return strcmp_natural_uncasted(*(unsigned char**)s1, *(unsigned char**)s2);
5177 }
5178 
get_uidx_from_unsorted(const char * idstr,const uintptr_t * exclude_arr,uint32_t id_ct,const char * unsorted_ids,uintptr_t max_id_len)5179 int32_t get_uidx_from_unsorted(const char* idstr, const uintptr_t* exclude_arr, uint32_t id_ct, const char* unsorted_ids, uintptr_t max_id_len) {
5180   uintptr_t id_uidx = 0;
5181   uintptr_t slen_p1 = strlen(idstr) + 1;
5182   uint32_t id_idx;
5183   if (slen_p1 > max_id_len) {
5184     return -1;
5185   }
5186   for (id_idx = 0; id_idx < id_ct; id_uidx++, id_idx++) {
5187     id_uidx = next_unset_ul_unsafe(exclude_arr, id_uidx);
5188     if (!memcmp(idstr, &(unsorted_ids[id_uidx * max_id_len]), slen_p1)) {
5189       return (int32_t)((uint32_t)id_uidx);
5190     }
5191   }
5192   return -1;
5193 }
5194 
char* scan_for_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len) {
  // Returns a pointer to the first of two adjacent identical IDs in the
  // lexicographically sorted array sorted_ids (fixed stride max_id_len), or
  // NULL if all IDs are distinct.
  if (!id_ct) {
    // bugfix-style guard: the decrement below would wrap a uintptr_t to
    // SIZE_MAX on an empty array, causing out-of-bounds reads
    return NULL;
  }
  uintptr_t id_idx;
  id_ct--;
  for (id_idx = 0; id_idx < id_ct; id_idx++) {
    if (!strcmp(&(sorted_ids[id_idx * max_id_len]), &(sorted_ids[(id_idx + 1) * max_id_len]))) {
      return &(sorted_ids[id_idx * max_id_len]);
    }
  }
  return NULL;
}
5205 
char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, const char* sorted_nonoverlap_ids, uintptr_t nonoverlap_id_ct, uintptr_t max_nonoverlap_id_len) {
  // extended scan_for_duplicate_ids() which also verifies that no entry in
  // sorted_ids matches any entry in sorted_nonoverlap_ids.
  // nonoverlap_id_ct == 0 and sorted_nonoverlap_ids == nullptr ok.  id_ct
  // cannot be zero, though.
  // Both arrays must be sorted; the two lists are merged-walked in parallel.
  // Returns a pointer to the offending sorted_ids entry, or nullptr if there
  // are no duplicates or overlaps.
  uintptr_t nonoverlap_id_idx = 0;
  uintptr_t id_idx = 0;
  char* cur_id_ptr = sorted_ids;
  const char* nonoverlap_id_ptr;
  char* other_id_ptr;
  int32_t ii;
  while (1) {
    if (nonoverlap_id_idx == nonoverlap_id_ct) {
      // nonoverlap list exhausted; only duplicate detection remains
      return scan_for_duplicate_ids(cur_id_ptr, id_ct - id_idx, max_id_len);
    }
    nonoverlap_id_ptr = &(sorted_nonoverlap_ids[nonoverlap_id_idx * max_nonoverlap_id_len]);
    ii = strcmp(cur_id_ptr, nonoverlap_id_ptr);
    if (ii < 0) {
      // cur_id precedes the next nonoverlap entry: advance cur_id, checking
      // for an adjacent duplicate on the way
      if (++id_idx == id_ct) {
	return nullptr;
      }
      other_id_ptr = &(cur_id_ptr[max_id_len]);
      if (!strcmp(cur_id_ptr, other_id_ptr)) {
	return cur_id_ptr;
      }
      cur_id_ptr = other_id_ptr;
      continue;
    } else if (!ii) {
      // overlap with the nonoverlap list
      return cur_id_ptr;
    }
    nonoverlap_id_idx++;
  }
}
5239 
int32_t eval_affection(const char* bufptr, double missing_phenod) {
  // Decides whether a phenotype token should get case/control treatment:
  // returns 1 for unparseable or missing values and for "0"/"1"/"2" tokens,
  // 0 otherwise.
  // turns out --1 had the side-effect of *forcing* case/control
  // interpretation in 1.07.  We replicate that for backward compatibility, and
  // no longer call this function in that context.
  char* parse_end;
  // parse as floating point rather than integer, so e.g. a file where all
  // phenotypes are -9.xxx... is still handled correctly
  const double dxx = strtod(bufptr, &parse_end);
  if ((parse_end == bufptr) || (dxx == missing_phenod)) {
    return 1;
  }
  const char first_char = bufptr[0];
  if ((first_char != '0') && (first_char != '1') && (first_char != '2')) {
    return 0;
  }
  // single-character 0/1/2 token only
  return is_space_or_eoln(bufptr[1])? 1 : 0;
}
5254 
triangle_divide(int64_t cur_prod,int32_t modif)5255 uint32_t triangle_divide(int64_t cur_prod, int32_t modif) {
5256   // return smallest integer vv for which (vv * (vv + modif)) is no smaller
5257   // than cur_prod, and neither term in the product is negative.  (Note the
5258   // lack of a divide by two; cur_prod should also be double its "true" value
5259   // as a result.)
5260   int64_t vv;
5261   if (cur_prod == 0) {
5262     if (modif < 0) {
5263       return -modif;
5264     } else {
5265       return 0;
5266     }
5267   }
5268   vv = (int64_t)sqrt((double)cur_prod);
5269   while ((vv - 1) * (vv + modif - 1) >= cur_prod) {
5270     vv--;
5271   }
5272   while (vv * (vv + modif) < cur_prod) {
5273     vv++;
5274   }
5275   return vv;
5276 }
5277 
void parallel_bounds(uint32_t ct, int32_t start, uint32_t parallel_idx, uint32_t parallel_tot, int32_t* __restrict bound_start_ptr, int32_t* __restrict bound_end_ptr) {
  // Splits the triangular computation over rows [start, ct) into
  // parallel_tot equal-area shares and writes the row boundaries of share
  // parallel_idx to *bound_start_ptr / *bound_end_ptr.
  const int32_t modif = 1 - start * 2;
  const int64_t ct_tot = ((int64_t)ct) * (ct + modif);
  // doubled triangle areas at the two share boundaries
  const int64_t lower_area = (ct_tot * parallel_idx) / parallel_tot;
  const int64_t upper_area = (ct_tot * (parallel_idx + 1)) / parallel_tot;
  *bound_start_ptr = triangle_divide(lower_area, modif);
  *bound_end_ptr = triangle_divide(upper_area, modif);
}
5284 
5285 // this might belong in plink_calc instead, not being used anywhere else
5286 // set align to 1 for no alignment
void triangle_fill(uint32_t ct, uint32_t pieces, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t start, uint32_t align, uint32_t* target_arr) {
  // Splits this job's share of the triangular computation (see
  // parallel_bounds()) into `pieces` subranges of roughly equal area,
  // writing the pieces+1 row boundaries to target_arr[0..pieces].  Interior
  // boundaries are nudged so that (boundary - start) is
  // alignment-compatible, and clamped to ct.
  int32_t modif = 1 - start * 2;
  uint32_t cur_piece = 1;
  int64_t ct_tr;
  int64_t cur_prod;
  int32_t lbound;
  int32_t ubound;
  uint32_t uii;
  uint32_t align_m1;
  parallel_bounds(ct, start, parallel_idx, parallel_tot, &lbound, &ubound);
  // x(x+1)/2 is divisible by y iff (x % (2y)) is 0 or (2y - 1).
  align *= 2;
  align_m1 = align - 1;
  target_arr[0] = lbound;
  target_arr[pieces] = ubound;
  // cur_prod tracks the (doubled) triangle area consumed so far; ct_tr is
  // the target area per piece
  cur_prod = ((int64_t)lbound) * (lbound + modif);
  ct_tr = (((int64_t)ubound) * (ubound + modif) - cur_prod) / pieces;
  while (cur_piece < pieces) {
    cur_prod += ct_tr;
    lbound = triangle_divide(cur_prod, modif);
    uii = (lbound - ((int32_t)start)) & align_m1;
    if ((uii) && (uii != align_m1)) {
      // round up to the next alignment-compatible boundary
      lbound = start + ((lbound - ((int32_t)start)) | align_m1);
    }
    // lack of this check caused a nasty bug earlier
    if (((uint32_t)lbound) > ct) {
      lbound = ct;
    }
    target_arr[cur_piece++] = lbound;
  }
}
5318 
relationship_req(uint64_t calculation_type)5319 int32_t relationship_req(uint64_t calculation_type) {
5320   return (calculation_type & (CALC_RELATIONSHIP | CALC_UNRELATED_HERITABILITY | CALC_REL_CUTOFF | CALC_REGRESS_REL | CALC_PCA))? 1 : 0;
5321 }
5322 
distance_req(const char * read_dists_fname,uint64_t calculation_type)5323 int32_t distance_req(const char* read_dists_fname, uint64_t calculation_type) {
5324   return ((calculation_type & CALC_DISTANCE) || ((calculation_type & (CALC_PLINK1_DISTANCE_MATRIX | CALC_PLINK1_IBS_MATRIX)) && (!(calculation_type & CALC_GENOME))) || ((!read_dists_fname) && (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE))));
5325 }
5326 
int32_t double_cmp(const void* aa, const void* bb) {
  // qsort comparator: ascending doubles.
  // Compare directly instead of via subtraction: a - b can underflow to
  // exactly zero for distinct tiny values under flush-to-zero/DAZ modes, and
  // produces NaN for two like-signed infinities; direct comparison is exact.
  const double da = *((const double*)aa);
  const double db = *((const double*)bb);
  if (da > db) {
    return 1;
  }
  if (da < db) {
    return -1;
  }
  return 0;
}
5337 
int32_t double_cmp_decr(const void* aa, const void* bb) {
  // qsort comparator: descending doubles.
  // Compare directly instead of via subtraction: a - b can underflow to
  // exactly zero for distinct tiny values under flush-to-zero/DAZ modes, and
  // produces NaN for two like-signed infinities; direct comparison is exact.
  const double da = *((const double*)aa);
  const double db = *((const double*)bb);
  if (da < db) {
    return 1;
  }
  if (da > db) {
    return -1;
  }
  return 0;
}
5348 
int32_t double_cmp_deref(const void* aa, const void* bb) {
  // qsort comparator: ascending doubles, with an extra level of indirection
  // (arguments are pointers to double*).
  // Compare directly instead of via subtraction: a - b can underflow to
  // exactly zero for distinct tiny values under flush-to-zero/DAZ modes, and
  // produces NaN for two like-signed infinities; direct comparison is exact.
  const double da = **((const double**)aa);
  const double db = **((const double**)bb);
  if (da > db) {
    return 1;
  }
  if (da < db) {
    return -1;
  }
  return 0;
}
5359 
int32_t char_cmp_deref(const void* aa, const void* bb) {
  // qsort comparator: compares the single chars addressed by two char*
  // (arguments are pointers to char*).
  const char ca = **((const char**)aa);
  const char cb = **((const char**)bb);
  return (int32_t)(ca - cb);
}
5363 
int32_t intcmp(const void* aa, const void* bb) {
  // qsort comparator: ascending int32s.
  // Compare directly instead of returning *aa - *bb: that subtraction
  // overflows (undefined behavior) for operands of opposite sign, e.g.
  // INT32_MIN vs. a positive value, and then reports the wrong order.
  const int32_t ia = *((const int32_t*)aa);
  const int32_t ib = *((const int32_t*)bb);
  if (ia < ib) {
    return -1;
  }
  return (ia > ib);
}
5367 
int32_t uintcmp(const void* aa, const void* bb) {
  // qsort comparator: ascending uint32s.
  const uint32_t ua = *((const uint32_t*)aa);
  const uint32_t ub = *((const uint32_t*)bb);
  if (ua < ub) {
    return -1;
  }
  return (ua > ub)? 1 : 0;
}
5375 
int32_t intcmp2(const void* aa, const void* bb) {
  // qsort comparator: ascending int32s, overflow-safe (no subtraction).
  const int32_t ia = *((const int32_t*)aa);
  const int32_t ib = *((const int32_t*)bb);
  if (ia < ib) {
    return -1;
  }
  return (ia > ib)? 1 : 0;
}
5383 
int32_t intcmp3_decr(const void* aa, const void* bb) {
  // qsort comparator: records of three int32s, sorted in decreasing order,
  // fields compared left to right.
  // Compare directly instead of returning bb[i] - aa[i]: that subtraction
  // overflows (undefined behavior) for fields of opposite sign and then
  // reports the wrong order.
  const int32_t* ia = (const int32_t*)aa;
  const int32_t* ib = (const int32_t*)bb;
  uint32_t field_idx;
  for (field_idx = 0; field_idx < 3; field_idx++) {
    if (ib[field_idx] != ia[field_idx]) {
      return (ib[field_idx] > ia[field_idx])? 1 : -1;
    }
  }
  return 0;
}
5395 
5396 #ifndef __cplusplus
int32_t llcmp(const void* aa, const void* bb) {
  // qsort comparator: ascending int64s.
  // Compare directly instead of via subtraction: *aa - *bb overflows
  // (undefined behavior) for operands of opposite sign, e.g. INT64_MIN vs. a
  // positive value.
  const int64_t lla = *((const int64_t*)aa);
  const int64_t llb = *((const int64_t*)bb);
  if (lla > llb) {
    return 1;
  }
  if (lla < llb) {
    return -1;
  }
  return 0;
}
5407 #endif
5408 
5409 // alas, qsort_r not available on some Linux distributions
5410 
5411 // Normally use qsort_ext(), but this version is necessary before g_bigstack
5412 // has been allocated.
void qsort_ext2(char* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, uintptr_t secondary_item_len, char* proxy_arr, uintptr_t proxy_len) {
  // Core of qsort_ext(): sorts main_arr while applying the same permutation
  // to secondary_arr.  Caller supplies the proxy workspace (arr_length *
  // proxy_len bytes); each proxy record is laid out as
  // [char* into main_arr][secondary payload], so comparator_deref receives
  // pointers-to-pointers into main_arr.
  uintptr_t ulii;
  // build the proxy records
  for (ulii = 0; ulii < arr_length; ulii++) {
    *(char**)(&(proxy_arr[ulii * proxy_len])) = &(main_arr[ulii * item_length]);
    memcpy(&(proxy_arr[ulii * proxy_len + sizeof(void*)]), &(secondary_arr[ulii * secondary_item_len]), secondary_item_len);
  }
  qsort(proxy_arr, arr_length, proxy_len, comparator_deref);
  for (ulii = 0; ulii < arr_length; ulii++) {
    // write the permuted secondary payloads back...
    memcpy(&(secondary_arr[ulii * secondary_item_len]), &(proxy_arr[ulii * proxy_len + sizeof(void*)]), secondary_item_len);
    // ...and replace each proxy record in-place with the main_arr item it
    // points to (the stored pointer is read before being overwritten; the
    // pointed-to data lives in main_arr, so source and destination never
    // overlap)
    memcpy(&(proxy_arr[ulii * proxy_len]), *(char**)(&(proxy_arr[ulii * proxy_len])), item_length);
  }
  // finally copy the reordered items back into main_arr
  for (ulii = 0; ulii < arr_length; ulii++) {
    memcpy(&(main_arr[ulii * item_length]), &(proxy_arr[ulii * proxy_len]), item_length);
  }
}
5428 
5429 // This actually tends to be faster than just sorting an array of indices,
5430 // because of memory locality issues.
qsort_ext(char * main_arr,uintptr_t arr_length,uintptr_t item_length,int (* comparator_deref)(const void *,const void *),char * secondary_arr,intptr_t secondary_item_len)5431 int32_t qsort_ext(char* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, intptr_t secondary_item_len) {
5432   // main_arr = packed array of equal-length items to sort
5433   // arr_length = number of items
5434   // item_length = byte count of each main_arr item
5435   // comparator_deref = returns positive if *first > *second, 0 if equal,
5436   //                    negative if *first < *second.  Note the extra
5437   //                    dereference.
5438   // secondary_arr = packed array of fixed-length records associated with the
5439   //                 main_arr items, to be resorted in the same way.  (e.g.
5440   //                 if one is building an index, this could start as a sorted
5441   //                 0..(n-1) sequence of integers; then, post-sort, this would
5442   //                 be a lookup table for the original position of each
5443   //                 main_arr item.)
5444   // secondary_item_len = byte count of each secondary_arr item
5445   uintptr_t proxy_len = secondary_item_len + sizeof(void*);
5446   unsigned char* bigstack_mark = g_bigstack_base;
5447   char* proxy_arr;
5448   if (!arr_length) {
5449     return 0;
5450   }
5451   if (proxy_len < item_length) {
5452     proxy_len = item_length;
5453   }
5454   if (bigstack_alloc_c(arr_length * proxy_len, &proxy_arr)) {
5455     return -1;
5456   }
5457   qsort_ext2(main_arr, arr_length, item_length, comparator_deref, secondary_arr, secondary_item_len, proxy_arr, proxy_len);
5458   bigstack_reset(bigstack_mark);
5459   return 0;
5460 }
5461 
int32_t sort_item_ids_noalloc(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t item_ct, const char* __restrict item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*), char* __restrict sorted_ids, uint32_t* id_map) {
  // Stores a lexicographically sorted list of IDs in sorted_ids and the raw
  // positions of the corresponding markers/samples in *id_map_ptr.  Does not
  // include excluded markers/samples in the list.
  // Assumes sorted_ids and id_map have been allocated; use the sort_item_ids()
  // wrapper if they haven't been.
  // Note that this DOES still perform a "stack" allocation (in the qsort_ext()
  // call).
  // Returns 0 on success, RET_NOMEM on allocation failure, or
  // RET_INVALID_FORMAT (after logging) when allow_dups is false and a
  // duplicate ID is found.
  uint32_t uii = 0;
  char* dup_id;
  char* tptr;
  uint32_t ujj;
  if (!item_ct) {
    return 0;
  }
  if (!collapse_idxs) {
    // id_map entries are raw (unfiltered) indices
    for (ujj = 0; ujj < item_ct; uii++, ujj++) {
      next_unset_unsafe_ck(exclude_arr, &uii);
      memcpy(&(sorted_ids[ujj * max_id_len]), &(item_ids[uii * max_id_len]), max_id_len);
      id_map[ujj] = uii;
    }
  } else {
    // id_map entries are post-filtering (collapsed) indices
    for (ujj = 0; ujj < item_ct; uii++, ujj++) {
      next_unset_unsafe_ck(exclude_arr, &uii);
      memcpy(&(sorted_ids[ujj * max_id_len]), &(item_ids[uii * max_id_len]), max_id_len);
      id_map[ujj] = ujj;
    }
  }
  // sort the IDs and permute id_map the same way
  if (qsort_ext(sorted_ids, item_ct, max_id_len, comparator_deref, (char*)id_map, sizeof(int32_t))) {
    return RET_NOMEM;
  }
  if (!allow_dups) {
    dup_id = scan_for_duplicate_ids(sorted_ids, item_ct, max_id_len);
    if (dup_id) {
      // print "FID IID" rather than "FID<tab>IID" in the error message
      tptr = strchr(dup_id, '\t');
      if (tptr) {
        *tptr = ' ';
      }
      LOGERRPRINTFWW("Error: Duplicate ID '%s'.\n", dup_id);
      return RET_INVALID_FORMAT;
    }
  }
  return 0;
}
5506 
sort_item_ids(uintptr_t unfiltered_ct,const uintptr_t * exclude_arr,uintptr_t exclude_ct,const char * __restrict item_ids,uintptr_t max_id_len,uint32_t allow_dups,uint32_t collapse_idxs,int (* comparator_deref)(const void *,const void *),char ** sorted_ids_ptr,uint32_t ** id_map_ptr)5507 int32_t sort_item_ids(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t exclude_ct, const char* __restrict item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*), char** sorted_ids_ptr, uint32_t** id_map_ptr) {
5508   uintptr_t item_ct = unfiltered_ct - exclude_ct;
5509   // id_map on bottom because --indiv-sort frees *sorted_ids_ptr
5510   if (bigstack_alloc_ui(item_ct, id_map_ptr) ||
5511       bigstack_alloc_c(item_ct * max_id_len, sorted_ids_ptr)) {
5512     return RET_NOMEM;
5513   }
5514   return sort_item_ids_noalloc(unfiltered_ct, exclude_arr, item_ct, item_ids, max_id_len, allow_dups, collapse_idxs, comparator_deref, *sorted_ids_ptr, *id_map_ptr);
5515 }
5516 
uint32_t uint32arr_greater_than(const uint32_t* sorted_uint32_arr, uint32_t arr_length, uint32_t uii) {
  // Returns the number of elements of sorted_uint32_arr (nondecreasing,
  // nonempty) which are strictly less than uii, i.e. the lower-bound
  // insertion index.  (useful for searching marker_pos.)
  uint32_t lo = 0;
  uint32_t hi = arr_length;
  while (lo < hi) {
    // invariant: arr[0..lo) < uii <= arr[hi..arr_length)
    const uint32_t mid_idx = (lo + hi) / 2;
    if (uii > sorted_uint32_arr[mid_idx]) {
      lo = mid_idx + 1;
    } else {
      hi = mid_idx;
    }
  }
  return lo;
}
5542 
uint32_t int32arr_greater_than(const int32_t* sorted_int32_arr, uint32_t arr_length, int32_t ii) {
  // Returns the number of elements of sorted_int32_arr (nondecreasing,
  // nonempty) which are strictly less than ii.
  uint32_t lo = 0;
  uint32_t hi = arr_length;
  while (lo < hi) {
    // invariant: arr[0..lo) < ii <= arr[hi..arr_length)
    const uint32_t mid_idx = (lo + hi) / 2;
    if (ii > sorted_int32_arr[mid_idx]) {
      lo = mid_idx + 1;
    } else {
      hi = mid_idx;
    }
  }
  return lo;
}
5561 
uintptr_t uint64arr_greater_than(const uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii) {
  // Returns the number of elements of sorted_uint64_arr (nondecreasing,
  // nonempty) which are strictly less than ullii.
  uintptr_t lo = 0;
  uintptr_t hi = arr_length;
  while (lo < hi) {
    // invariant: arr[0..lo) < ullii <= arr[hi..arr_length)
    const uintptr_t mid_idx = (lo + hi) / 2;
    if (ullii > sorted_uint64_arr[mid_idx]) {
      lo = mid_idx + 1;
    } else {
      hi = mid_idx;
    }
  }
  return lo;
}
5580 
uintptr_t doublearr_greater_than(const double* sorted_dbl_arr, uintptr_t arr_length, double dxx) {
  // returns number of items in sorted_dbl_arr which dxx is greater than.
  // assumes array is nonempty and sorted in nondecreasing order
  uintptr_t lo = 0;
  uintptr_t hi = arr_length;
  while (lo < hi) {
    // invariant: arr[0..lo) < dxx <= arr[hi..arr_length)
    const uintptr_t mid_idx = (lo + hi) / 2;
    if (dxx > sorted_dbl_arr[mid_idx]) {
      lo = mid_idx + 1;
    } else {
      hi = mid_idx;
    }
  }
  return lo;
}
5601 
uintptr_t nonincr_doublearr_leq_stride(const double* nonincr_dbl_arr, uintptr_t arr_length, uintptr_t stride, double dxx) {
  // Counterpart of doublearr_greater_than() for a NONINCREASING sequence
  // whose relevant elements are spaced stride units apart: returns the
  // number of elements which are >= dxx.
  uintptr_t lo = 0;
  uintptr_t hi = arr_length;
  while (lo < hi) {
    // invariant: elements [0..lo) >= dxx > elements [hi..arr_length)
    const uintptr_t mid_idx = (lo + hi) / 2;
    if (dxx <= nonincr_dbl_arr[mid_idx * stride]) {
      lo = mid_idx + 1;
    } else {
      hi = mid_idx;
    }
  }
  return lo;
}
5622 
int32_t bsearch_str(const char* id_buf, uintptr_t cur_id_len, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
  // Binary search for id_buf (cur_id_len bytes, not necessarily
  // null-terminated) in the sorted packed array lptr (end_idx entries,
  // max_id_len bytes apart, each null-terminated).  Returns the entry index
  // or -1 if absent; empty array ok.
  // N.B. max_id_len includes the null terminator as usual, while cur_id_len
  // does NOT.
  uintptr_t lo = 0;
  uintptr_t hi = end_idx;
  if (cur_id_len >= max_id_len) {
    // query cannot fit in any entry
    return -1;
  }
  while (lo < hi) {
    const uintptr_t mid_idx = (lo + hi) / 2;
    const char* entry = &(lptr[mid_idx * max_id_len]);
    const int32_t cmp = memcmp(id_buf, entry, cur_id_len);
    if (cmp > 0) {
      lo = mid_idx + 1;
    } else if ((cmp < 0) || entry[cur_id_len]) {
      // query sorts earlier, or is a strict prefix of the entry (which also
      // sorts earlier)
      hi = mid_idx;
    } else {
      return ((uint32_t)mid_idx);
    }
  }
  return -1;
}
5646 
int32_t bsearch_str_natural(const char* id_buf, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
  // Binary search using "natural" string ordering; see bsearch_str().
  // unlike bsearch_str(), caller is responsible for the slen > max_id_len
  // check if appropriate here.
  uintptr_t lo = 0;
  uintptr_t hi = end_idx;
  while (lo < hi) {
    const uintptr_t mid_idx = (lo + hi) / 2;
    const int32_t cmp = strcmp_natural(id_buf, &(lptr[mid_idx * max_id_len]));
    if (cmp > 0) {
      lo = mid_idx + 1;
    } else if (cmp < 0) {
      hi = mid_idx;
    } else {
      return ((uint32_t)mid_idx);
    }
  }
  return -1;
}
5666 
uintptr_t bsearch_str_lb(const char* id_buf, uintptr_t cur_id_len, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
  // Returns the number of entries in the sorted packed array lptr which
  // compare less than the first cur_id_len bytes of id_buf.
  uintptr_t lo = 0;
  uintptr_t hi = end_idx;
  if (cur_id_len > max_id_len) {
    // never read past an entry's storage
    cur_id_len = max_id_len;
  }
  while (lo < hi) {
    const uintptr_t mid_idx = (lo + hi) / 2;
    if (memcmp(id_buf, &(lptr[mid_idx * max_id_len]), cur_id_len) > 0) {
      lo = mid_idx + 1;
    } else {
      hi = mid_idx;
    }
  }
  return lo;
}
5684 
uint32_t bsearch_read_fam_indiv(char* __restrict read_ptr, const char* __restrict lptr, uintptr_t max_id_len, uintptr_t filter_line_ct, char** read_pp_new, int32_t* retval_ptr, char* __restrict id_buf) {
  // id_buf = workspace
  // lptr = packed, sorted list of ID strings to search over
  // read_ptr is assumed to point to beginning of FID.  FID is terminated by
  // any space/eoln character, then IID is assumed to follow it (and is also
  // terminated by any space/eoln).  Nonzero error value is returned if IID
  // does not exist.
  // On success (return 0), *retval_ptr holds the bsearch_str() result for
  // "FID\tIID" (-1 when absent or too long), and *read_pp_new, if non-null,
  // points past the IID token.
  char* iid_ptr;
  uintptr_t slen_fid;
  uintptr_t slen_iid;
  uintptr_t slen_final;
  slen_fid = strlen_se(read_ptr);
  iid_ptr = skip_initial_spaces(&(read_ptr[slen_fid]));
  if (is_eoln_kns(*iid_ptr)) {
    // no IID token on this line
    return 1;
  }
  slen_iid = strlen_se(iid_ptr);
  if (read_pp_new) {
    *read_pp_new = skip_initial_spaces(&(iid_ptr[slen_iid]));
  }
  // length of "FID\tIID" (tab included, null terminator not)
  slen_final = slen_fid + slen_iid + 1;
  if (slen_final >= max_id_len) {
    // avoid buffer overflow
    *retval_ptr = -1;
    return 0;
  }
  // error message bugfix: null-terminate this string
  memcpyx(memcpyax(id_buf, read_ptr, slen_fid, '\t'), iid_ptr, slen_iid, '\0');
  *retval_ptr = bsearch_str(id_buf, slen_final, lptr, max_id_len, filter_line_ct);
  return 0;
}
5716 
void bsearch_fam(const char* __restrict fam_id, const char* __restrict lptr, uintptr_t max_id_len, uint32_t filter_line_ct, uint32_t* __restrict first_idx_ptr, uint32_t* __restrict last_idx_ptr, char* __restrict id_buf) {
  // Finds the range of entries in the sorted "FID\tIID" array lptr whose FID
  // equals fam_id, writing the half-open interval to
  // [*first_idx_ptr, *last_idx_ptr).  Both outputs are zero when there is no
  // match.
  uint32_t slen;
  uint32_t fidx;
  uint32_t loff;
  *first_idx_ptr = 0;
  *last_idx_ptr = 0;
  if (!filter_line_ct) {
    return;
  }
  slen = strlen_se(fam_id);
  if (slen + 3 > max_id_len) {
    // not even a one-character IID would fit
    return;
  }
  memcpy(id_buf, fam_id, slen);
  // '\t' sorts before every printable character, so "FID\t" is a lower
  // bound for all "FID\t..." entries
  id_buf[slen] = '\t';
  fidx = bsearch_str_lb(id_buf, slen + 1, lptr, max_id_len, filter_line_ct);
  if (fidx == filter_line_ct) {
    return;
  }
  // ' ' sorts just after '\t', so "FID " is an upper bound for the same
  // entries; the difference is the match count
  id_buf[slen] = ' ';
  loff = bsearch_str_lb(id_buf, slen + 1, &(lptr[fidx * max_id_len]), max_id_len, filter_line_ct - fidx);
  if (loff) {
    *first_idx_ptr = fidx;
    *last_idx_ptr = fidx + loff;
  }
}
5746 
bitarr_invert(uintptr_t bit_ct,uintptr_t * bitarr)5747 void bitarr_invert(uintptr_t bit_ct, uintptr_t* bitarr) {
5748   uintptr_t* bitarr_stop = &(bitarr[bit_ct / BITCT]);
5749   while (bitarr < bitarr_stop) {
5750     *bitarr = ~(*bitarr);
5751     bitarr++;
5752   }
5753   if (bit_ct % BITCT) {
5754     *bitarr = (~(*bitarr)) & ((ONELU << (bit_ct % BITCT)) - ONELU);
5755   }
5756 }
5757 
bitarr_invert_copy(const uintptr_t * input_bitarr,uintptr_t bit_ct,uintptr_t * output_bitarr)5758 void bitarr_invert_copy(const uintptr_t* input_bitarr, uintptr_t bit_ct, uintptr_t* output_bitarr) {
5759   const uintptr_t* input_stop = &(input_bitarr[bit_ct / BITCT]);
5760   while (input_bitarr < input_stop) {
5761     *output_bitarr++ = ~(*input_bitarr++);
5762   }
5763   if (bit_ct % BITCT) {
5764     *output_bitarr = (~(*input_bitarr)) & ((ONELU << (bit_ct % BITCT)) - ONELU);
5765   }
5766 }
5767 
void bitvec_and(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := main_bitvec AND arg_bitvec
#ifdef __LP64__
  // SSE2 path: process two words per vector, then a scalar tail word if
  // word_ct is odd
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* arg_vec = (const __m128i*)arg_bitvec;
  const uintptr_t vec_ct = word_ct / 2;
  uintptr_t vec_idx;
  for (vec_idx = 0; vec_idx < vec_ct; vec_idx++) {
    main_vec[vec_idx] = _mm_and_si128(arg_vec[vec_idx], main_vec[vec_idx]);
  }
  if (word_ct & 1) {
    main_bitvec[word_ct - 1] &= arg_bitvec[word_ct - 1];
  }
#else
  uintptr_t widx;
  for (widx = 0; widx < word_ct; widx++) {
    main_bitvec[widx] &= arg_bitvec[widx];
  }
#endif
}
5789 
void bitvec_andnot(const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := main_bitvec ANDNOT exclude_bitvec
  // note that this is the reverse of the _mm_andnot() operand order
#ifdef __LP64__
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* excl_vec = (const __m128i*)exclude_bitvec;
  const uintptr_t vec_ct = word_ct / 2;
  uintptr_t vec_idx;
  for (vec_idx = 0; vec_idx < vec_ct; vec_idx++) {
    // _mm_andnot_si128 complements its FIRST operand
    main_vec[vec_idx] = _mm_andnot_si128(excl_vec[vec_idx], main_vec[vec_idx]);
  }
  if (word_ct & 1) {
    main_bitvec[word_ct - 1] &= ~(exclude_bitvec[word_ct - 1]);
  }
#else
  uintptr_t widx;
  for (widx = 0; widx < word_ct; widx++) {
    main_bitvec[widx] &= ~(exclude_bitvec[widx]);
  }
#endif
}
5812 
void bitvec_andnot_reversed_args(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := (~main_bitvec) AND include_bitvec
#ifdef __LP64__
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* incl_vec = (const __m128i*)include_bitvec;
  const uintptr_t vec_ct = word_ct / 2;
  uintptr_t vec_idx;
  for (vec_idx = 0; vec_idx < vec_ct; vec_idx++) {
    // here main_bitvec is the operand being complemented, so the
    // _mm_andnot_si128 argument order matches directly
    main_vec[vec_idx] = _mm_andnot_si128(main_vec[vec_idx], incl_vec[vec_idx]);
  }
  if (word_ct & 1) {
    main_bitvec[word_ct - 1] = (~main_bitvec[word_ct - 1]) & include_bitvec[word_ct - 1];
  }
#else
  uintptr_t widx;
  for (widx = 0; widx < word_ct; widx++) {
    main_bitvec[widx] = (~main_bitvec[widx]) & include_bitvec[widx];
  }
#endif
}
5835 
void bitvec_or(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec) {
  // main_bitvec := main_bitvec OR arg_bitvec
#ifdef __LP64__
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* arg_vec = (const __m128i*)arg_bitvec;
  const uintptr_t vec_ct = word_ct / 2;
  uintptr_t vec_idx;
  for (vec_idx = 0; vec_idx < vec_ct; vec_idx++) {
    main_vec[vec_idx] = _mm_or_si128(arg_vec[vec_idx], main_vec[vec_idx]);
  }
  if (word_ct & 1) {
    // odd word count: scalar tail word
    main_bitvec[word_ct - 1] |= arg_bitvec[word_ct - 1];
  }
#else
  uintptr_t widx;
  for (widx = 0; widx < word_ct; widx++) {
    main_bitvec[widx] |= arg_bitvec[widx];
  }
#endif
}
5857 
void bitvec_ornot(const uintptr_t* __restrict inverted_or_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := main_bitvec OR (~inverted_or_bitvec)
#ifdef __LP64__
#ifdef __APPLE__
  // some Apple compilers reject the {-1LL, -1LL} initializer form
  const __m128i all1 = {0xffffffffffffffffLLU, 0xffffffffffffffffLLU};
#else
  const __m128i all1 = {-1LL, -1LL};
#endif
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* inv_vec = (const __m128i*)inverted_or_bitvec;
  const uintptr_t vec_ct = word_ct / 2;
  uintptr_t vec_idx;
  for (vec_idx = 0; vec_idx < vec_ct; vec_idx++) {
    // x OR (NOT y) computed as x OR (y XOR all-ones); SSE2 has no vector NOT
    main_vec[vec_idx] = _mm_or_si128(_mm_xor_si128(inv_vec[vec_idx], all1), main_vec[vec_idx]);
  }
  if (word_ct & 1) {
    main_bitvec[word_ct - 1] |= ~(inverted_or_bitvec[word_ct - 1]);
  }
#else
  uintptr_t widx;
  for (widx = 0; widx < word_ct; widx++) {
    main_bitvec[widx] |= ~(inverted_or_bitvec[widx]);
  }
#endif
}
5884 
void bitvec_xor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := main_bitvec XOR arg_bitvec
#ifdef __LP64__
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* arg_vec = (const __m128i*)arg_bitvec;
  const uintptr_t vec_ct = word_ct / 2;
  uintptr_t vec_idx;
  for (vec_idx = 0; vec_idx < vec_ct; vec_idx++) {
    main_vec[vec_idx] = _mm_xor_si128(arg_vec[vec_idx], main_vec[vec_idx]);
  }
  if (word_ct & 1) {
    // odd word count: scalar tail word
    main_bitvec[word_ct - 1] ^= arg_bitvec[word_ct - 1];
  }
#else
  uintptr_t widx;
  for (widx = 0; widx < word_ct; widx++) {
    main_bitvec[widx] ^= arg_bitvec[widx];
  }
#endif
}
5906 
is_monomorphic_a2(const uintptr_t * geno_arr,uint32_t sample_ct)5907 uint32_t is_monomorphic_a2(const uintptr_t* geno_arr, uint32_t sample_ct) {
5908   const uintptr_t* loop_end = &(geno_arr[sample_ct / BITCT2]);
5909   uint32_t sample_rem = sample_ct % BITCT2;
5910   for (; geno_arr < loop_end; geno_arr++) {
5911     if ((~(*geno_arr)) & FIVEMASK) {
5912       return 0;
5913     }
5914   }
5915   return (sample_rem && ((~(*geno_arr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
5916 }
5917 
uint32_t is_monomorphic(const uintptr_t* geno_arr, uint32_t sample_ct) {
  // Scans sample_ct 2-bit genotypes and returns 0 as soon as the data is
  // proven polymorphic (per the in-loop comments: a heterozygote, or both a
  // homozygous-A2 and a low-bit-clear genotype, etc.), 1 otherwise.
  uint32_t sample_ctd2 = sample_ct / BITCT2;
  uint32_t sample_rem = sample_ct % BITCT2;
  uintptr_t ulii;
  uintptr_t uljj;
  while (sample_ctd2) {
    ulii = *geno_arr++;
    uljj = (ulii >> 1) & FIVEMASK;
    ulii = ~ulii;
    // now ulii & FIVEMASK = low bit zero, uljj = high bit one
    if (uljj) {
      if (uljj & ulii) {
        // heterozygote observed
        return 0;
      }
      // homozyg A2 observed
      while (1) {
	// 00 and 10 now both demonstrate marker is polymorphic
	if (ulii & FIVEMASK) {
	  return 0;
	}
	if (!(--sample_ctd2)) {
	  // trailing partial word: restrict the mask to live genotype slots
	  return (sample_rem && ((~(*geno_arr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
	}
	ulii = ~(*geno_arr++);
      }
    } else if (ulii & FIVEMASK) {
      // low-bit-zero genotype observed; from here on, any set high bit
      // (AAAAMASK) demonstrates the marker is polymorphic
      do {
        if (!(--sample_ctd2)) {
          return (sample_rem && ((*geno_arr) & (AAAAMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
	}
	ulii = *geno_arr++;
      } while (!(ulii & AAAAMASK));
      return 0;
    }
    sample_ctd2--;
  }
  if (sample_rem) {
    // nothing conclusive in the full words; inspect the trailing partial
    // word on its own
    ulii = *geno_arr;
    uljj = (ulii >> 1) & FIVEMASK;
    ulii = ~ulii;
    if ((uljj & ulii) || (uljj && (ulii & (~uljj) & (FIVEMASK >> (BITCT - sample_rem * 2))))) {
      return 0;
    }
  }
  return 1;
}
5965 
uint32_t less_than_two_genotypes(const uintptr_t* geno_arr, uint32_t sample_ct) {
  // Returns 1 iff fewer than two distinct genotype classes (hom A1 = 00,
  // het = 10, hom A2 = 11) appear in geno_arr[0..sample_ct); the 01 pattern
  // is never counted, consistent with it encoding a missing call.
  uint32_t sample_ctd2 = sample_ct / BITCT2; // full data words
  uint32_t sample_rem = sample_ct % BITCT2;  // genotypes in trailing word
  uintptr_t ulii;
  uintptr_t uljj; // high bits of each pair
  uintptr_t ulkk; // complement of the raw word
  uint32_t distinct_genotype_ct;
  while (sample_ctd2) {
    ulii = *geno_arr++;
    uljj = (ulii >> 1) & FIVEMASK;
    ulkk = ~ulii;
    if (uljj) {
      if (uljj & ulii) {
	// homozygote major observed; either 00 or 10 now demonstrate marker
	// is polymorphic
	while (1) {
	  if (ulkk & FIVEMASK) {
	    return 0;
	  }
	  if (!(--sample_ctd2)) {
	    return (sample_rem && ((~(*geno_arr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
	  }
	  ulkk = ~(*geno_arr++);
	}
      } else {
        // heterozygote observed; either 00 or 11 now means we have 2+
	// genotypes
	while (1) {
	  ulii = ~(*geno_arr++);
	  if (!(--sample_ctd2)) {
	    // ((~ulii) ^ (ulii >> 1)) has a low pair-bit set exactly for the
	    // 00 and 11 patterns
	    return (sample_rem && (((~ulii) ^ (ulii >> 1)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
	  }
	  if (((~ulii) ^ (ulii >> 1)) & FIVEMASK) {
	    return 0;
	  }
	}
      }
    } else if (ulkk & FIVEMASK) {
      // homozygous minor observed; either 10 or 11 now demonstrate marker is
      // polymorphic
      do {
        if (!(--sample_ctd2)) {
          return (sample_rem && ((*geno_arr) & (AAAAMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
	}
	ulii = *geno_arr++;
      } while (!(ulii & AAAAMASK));
      return 0;
    }
    sample_ctd2--;
  }
  if (sample_rem) {
    // no informative genotype in the full words; classify the trailing word
    ulii = *geno_arr;
    uljj = (ulii >> 1) & FIVEMASK;
    ulkk = ~ulii;
    // homozygous minor present?
    distinct_genotype_ct = (ulkk & (~uljj) & (FIVEMASK >> (BITCT - sample_rem * 2)))? 1 : 0;
    // heterozygous present?
    distinct_genotype_ct += (uljj & ulkk)? 1 : 0;
    // homozygous major present?
    distinct_genotype_ct += (uljj & ulii)? 1 : 0;
    if (distinct_genotype_ct > 1) {
      return 0;
    }
  }
  return 1;
}
6032 
6033 /*
6034 uint32_t has_three_genotypes(uintptr_t* lptr, uint32_t sample_ct) {
6035   uintptr_t* lptr_end = &(lptr[sample_ct / BITCT2]);
6036   uint32_t sample_rem = sample_ct % BITCT2;
6037   uintptr_t* cur_lptr;
6038   uintptr_t ulii;
6039   uintptr_t uljj;
6040   cur_lptr = lptr;
6041   while (1) {
6042     ulii = ~(*cur_lptr);
6043     uljj = ulii & (ulii >> 1) & FIVEMASK;
6044     if (cur_lptr == lptr_end) {
6045       if ((!sample_rem) || (!(uljj << (BITCT - sample_rem * 2)))) {
6046 	return 0;
6047       }
6048       break;
6049     }
6050     if (uljj) {
6051       // found hom A1
6052       break;
6053     }
6054     cur_lptr++;
6055   }
6056   cur_lptr = lptr;
6057   // zero-padding is benign for het and hom A2 checks
6058   lptr_end = &(lptr[QUATERCT_TO_WORDCT(sample_ct)]);
6059   while (1) {
6060     ulii = *cur_lptr;
6061     uljj = (ulii >> 1) & FIVEMASK;
6062     if ((~ulii) & uljj) {
6063       break;
6064     }
6065     if (++cur_lptr == lptr_end) {
6066       return 0;
6067     }
6068   }
6069   cur_lptr = lptr;
6070   do {
6071     ulii = *cur_lptr;
6072     uljj = (ulii >> 1) & FIVEMASK;
6073     if (ulii & uljj) {
6074       return 1;
6075     }
6076   } while (++cur_lptr < lptr_end);
6077   return 0;
6078 }
6079 */
6080 
6081 #ifdef __LP64__
6082 // Basic SSE2 implementation of Lauradoux/Walisch popcount.
static inline uintptr_t popcount_vecs(const __m128i* vptr, uintptr_t ct) {
  // popcounts vptr[0..(ct-1)].  Assumes ct is a multiple of 3 (0 ok).
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend;
  __m128i count1;
  __m128i count2;
  __m128i half1;
  __m128i half2;
  __univec acc;

  // Process 30 vectors at a time so the 8-bit lanes of acc cannot overflow:
  // each 3-vector inner iteration adds at most 24 to a lane, and
  // 10 iterations * 24 = 240 < 256.
  while (ct >= 30) {
    ct -= 30;
    vend = &(vptr[30]);
  popcount_vecs_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      count1 = *vptr++;
      count2 = *vptr++;
      half1 = *vptr++;
      // split the third vector into its odd and even bits so each can be
      // absorbed into the 2-bit partial counts below
      half2 = _mm_and_si128(_mm_srli_epi64(half1, 1), m1);
      half1 = _mm_and_si128(half1, m1);
      // Two bits can represent values from 0-3, so make each pair in count1
      // count2 store a partial bitcount covering themselves AND another bit
      // from elsewhere.
      count1 = _mm_sub_epi64(count1, _mm_and_si128(_mm_srli_epi64(count1, 1), m1));
      count2 = _mm_sub_epi64(count2, _mm_and_si128(_mm_srli_epi64(count2, 1), m1));
      count1 = _mm_add_epi64(count1, half1);
      count2 = _mm_add_epi64(count2, half2);
      // Four bits represent 0-15, so we can safely add four 0-3 partial
      // bitcounts together.
      count1 = _mm_add_epi64(_mm_and_si128(count1, m2), _mm_and_si128(_mm_srli_epi64(count1, 2), m2));
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(count2, m2), _mm_and_si128(_mm_srli_epi64(count2, 2), m2)));
      // Accumulator stores sixteen 0-255 counts in parallel.
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
    } while (vptr < vend);
    // fold the 8-bit lanes into 16-bit lanes, then sum via multiply-shift
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (ct) {
    // final partial chunk (fewer than 30 vectors, still a multiple of 3)
    vend = &(vptr[ct]);
    ct = 0;
    goto popcount_vecs_main_loop;
  }
  return tot;
}
6132 
static inline uintptr_t popcount2_vecs(const __m128i* vptr, uintptr_t ct) {
  // Sums the 2-bit fields of vptr[0..(ct-1)].
  // assumes ct is a multiple of 6.
  // NOTE(review): each 4-bit lane below accumulates up to three pairwise
  // 2-bit sums; this stays below 16 when the 2-bit fields are <= 2, which
  // callers appear to guarantee -- confirm before reusing on arbitrary data.
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend;
  __m128i loader1;
  __m128i loader2;
  __m128i count1;
  __m128i count2;
  __univec acc;

  // 30-vector chunks (5 inner iterations of 6 vectors) keep the 8-bit lanes
  // of acc from overflowing before they are folded below
  while (ct >= 30) {
    ct -= 30;
    vend = &(vptr[30]);
  popcount2_vecs_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(_mm_and_si128(loader1, m2), _mm_and_si128(_mm_srli_epi64(loader1, 2), m2));
      count2 = _mm_add_epi64(_mm_and_si128(loader2, m2), _mm_and_si128(_mm_srli_epi64(loader2, 2), m2));

      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(loader1, m2), _mm_and_si128(_mm_srli_epi64(loader1, 2), m2)));
      count2 = _mm_add_epi64(count2, _mm_add_epi64(_mm_and_si128(loader2, m2), _mm_and_si128(_mm_srli_epi64(loader2, 2), m2)));

      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(loader1, m2), _mm_and_si128(_mm_srli_epi64(loader1, 2), m2)));
      count2 = _mm_add_epi64(count2, _mm_add_epi64(_mm_and_si128(loader2, m2), _mm_and_si128(_mm_srli_epi64(loader2, 2), m2)));

      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count2, m4), _mm_and_si128(_mm_srli_epi64(count2, 4), m4)));
    } while (vptr < vend);
    // fold the 8-bit lanes into 16-bit lanes, then sum via multiply-shift
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (ct) {
    // final partial chunk (fewer than 30 vectors, still a multiple of 6)
    vend = &(vptr[ct]);
    ct = 0;
    goto popcount2_vecs_main_loop;
  }
  return tot;
}
6180 
static inline uintptr_t popcount_vecs_exclude(const __m128i* __restrict vptr, const __m128i* __restrict exclude_ptr, uintptr_t ct) {
  // popcounts vptr ANDNOT exclude_ptr[0..(ct-1)].  ct is a multiple of 3.
  // Same Lauradoux/Walisch accumulation as popcount_vecs(), with the
  // exclusion mask applied at load time.
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend;
  __m128i count1, count2, half1, half2;
  __univec acc;

  // 30-vector chunks keep the 8-bit lanes of acc from overflowing
  while (ct >= 30) {
    ct -= 30;
    vend = &(vptr[30]);
  popcount_vecs_exclude_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      // nots the FIRST value
      count1 = _mm_andnot_si128(*exclude_ptr++, *vptr++);
      count2 = _mm_andnot_si128(*exclude_ptr++, *vptr++);
      half1 = _mm_andnot_si128(*exclude_ptr++, *vptr++);
      half2 = _mm_and_si128(_mm_srli_epi64(half1, 1), m1);
      half1 = _mm_and_si128(half1, m1);
      count1 = _mm_sub_epi64(count1, _mm_and_si128(_mm_srli_epi64(count1, 1), m1));
      count2 = _mm_sub_epi64(count2, _mm_and_si128(_mm_srli_epi64(count2, 1), m1));
      count1 = _mm_add_epi64(count1, half1);
      count2 = _mm_add_epi64(count2, half2);
      count1 = _mm_add_epi64(_mm_and_si128(count1, m2), _mm_and_si128(_mm_srli_epi64(count1, 2), m2));
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(count2, m2), _mm_and_si128(_mm_srli_epi64(count2, 2), m2)));
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
    } while (vptr < vend);
    // fold the 8-bit lanes into 16-bit lanes, then sum via multiply-shift
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (ct) {
    // final partial chunk (fewer than 30 vectors, still a multiple of 3)
    vend = &(vptr[ct]);
    ct = 0;
    goto popcount_vecs_exclude_main_loop;
  }
  return tot;
}
6222 
static inline uintptr_t popcount_vecs_intersect(const __m128i* __restrict vptr1, const __m128i* __restrict vptr2, uintptr_t ct) {
  // popcounts vptr1 AND vptr2[0..(ct-1)].  ct is a multiple of 3.
  // Same Lauradoux/Walisch accumulation as popcount_vecs(), with the
  // intersection applied at load time.
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend1;
  __m128i count1, count2, half1, half2;
  __univec acc;

  // 30-vector chunks keep the 8-bit lanes of acc from overflowing
  while (ct >= 30) {
    ct -= 30;
    vend1 = &(vptr1[30]);
  popcount_vecs_intersect_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      count1 = _mm_and_si128(*vptr2++, *vptr1++);
      count2 = _mm_and_si128(*vptr2++, *vptr1++);
      half1 = _mm_and_si128(*vptr2++, *vptr1++);
      half2 = _mm_and_si128(_mm_srli_epi64(half1, 1), m1);
      half1 = _mm_and_si128(half1, m1);
      count1 = _mm_sub_epi64(count1, _mm_and_si128(_mm_srli_epi64(count1, 1), m1));
      count2 = _mm_sub_epi64(count2, _mm_and_si128(_mm_srli_epi64(count2, 1), m1));
      count1 = _mm_add_epi64(count1, half1);
      count2 = _mm_add_epi64(count2, half2);
      count1 = _mm_add_epi64(_mm_and_si128(count1, m2), _mm_and_si128(_mm_srli_epi64(count1, 2), m2));
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(count2, m2), _mm_and_si128(_mm_srli_epi64(count2, 2), m2)));
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
    } while (vptr1 < vend1);
    // fold the 8-bit lanes into 16-bit lanes, then sum via multiply-shift
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (ct) {
    // final partial chunk (fewer than 30 vectors, still a multiple of 3)
    vend1 = &(vptr1[ct]);
    ct = 0;
    goto popcount_vecs_intersect_main_loop;
  }
  return tot;
}
6263 #endif
6264 
uintptr_t popcount_longs(const uintptr_t* lptr, uintptr_t word_ct) {
  // Efficiently popcounts lptr[0..(word_ct - 1)].  In the 64-bit case, lptr[]
  // must be 16-byte aligned.
  // The popcount_longs_nzbase() wrapper takes care of starting from a later
  // index.
  uintptr_t tot = 0;
  const uintptr_t* lptr_end = &(lptr[word_ct]);
#ifdef __LP64__
  uintptr_t six_ct;
  const __m128i* vptr;
  vptr = (const __m128i*)lptr;
  // hand whole 3-vector (6-word) groups to the vectorized popcount
  six_ct = word_ct / 6;
  tot += popcount_vecs(vptr, six_ct * 3);
  lptr = &(lptr[six_ct * 6]);
#else
  // The humble 16-bit lookup table actually beats
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // on my development machine by a hair.
  // However, if we take the hint from Lauradoux/Walisch and postpone the
  // multiply and right shift, this is no longer true.  Ah well.
  const uintptr_t* lptr_six_end;
  uintptr_t tmp_stor;
  uintptr_t loader;
  uintptr_t ulii;
  uintptr_t uljj;
  lptr_six_end = &(lptr[word_ct - (word_ct % 6)]);
  while (lptr < lptr_six_end) {
    // fold six 32-bit words into per-byte subtotals (each byte <= 48)
    loader = *lptr++;
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = *lptr++;
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = *lptr++;
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor = (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    loader = *lptr++;
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = *lptr++;
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = *lptr++;
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor += (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    // Each 8-bit slot stores a number in 0..48.  Multiplying by 0x01010101 is
    // equivalent to the left-shifts and adds we need to sum those four 8-bit
    // numbers in the high-order slot.
    tot += (tmp_stor * 0x01010101) >> 24;
  }
#endif
  // remaining (fewer than 6) words, one at a time
  while (lptr < lptr_end) {
    tot += popcount_long(*lptr++);
  }
  return tot;
}
6325 
uintptr_t popcount2_longs(const uintptr_t* lptr, uintptr_t word_ct) {
  // treats lptr[] as an array of two-bit instead of one-bit numbers
  uintptr_t tot = 0;
  const uintptr_t* lptr_end = &(lptr[word_ct]);
#ifdef __LP64__
  uintptr_t twelve_ct;
  const __m128i* vptr;
  vptr = (const __m128i*)lptr;
  // hand whole 6-vector (12-word) groups to the vectorized routine
  twelve_ct = word_ct / 12;
  tot += popcount2_vecs(vptr, twelve_ct * 6);
  lptr = &(lptr[twelve_ct * 12]);
#else
  // NOTE(review): each 4-bit lane below accumulates up to three pairwise
  // 2-bit sums; this stays below 16 when the 2-bit fields are <= 2, which
  // callers appear to guarantee -- confirm before reusing on arbitrary data.
  const uintptr_t* lptr_six_end;
  uintptr_t loader1;
  uintptr_t loader2;
  uintptr_t ulii;
  uintptr_t uljj;
  lptr_six_end = &(lptr[word_ct - (word_ct % 6)]);
  while (lptr < lptr_six_end) {
    loader1 = *lptr++;
    loader2 = *lptr++;
    ulii = (loader1 & 0x33333333) + ((loader1 >> 2) & 0x33333333);
    uljj = (loader2 & 0x33333333) + ((loader2 >> 2) & 0x33333333);
    loader1 = *lptr++;
    loader2 = *lptr++;
    ulii += (loader1 & 0x33333333) + ((loader1 >> 2) & 0x33333333);
    uljj += (loader2 & 0x33333333) + ((loader2 >> 2) & 0x33333333);
    loader1 = *lptr++;
    loader2 = *lptr++;
    ulii += (loader1 & 0x33333333) + ((loader1 >> 2) & 0x33333333);
    uljj += (loader2 & 0x33333333) + ((loader2 >> 2) & 0x33333333);
    ulii = (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);
    ulii += (uljj & 0x0f0f0f0f) + ((uljj >> 4) & 0x0f0f0f0f);

    // Each 8-bit slot stores a number in 0..48.  Multiplying by 0x01010101 is
    // equivalent to the left-shifts and adds we need to sum those four 8-bit
    // numbers in the high-order slot.
    tot += (ulii * 0x01010101) >> 24;
  }
#endif
  // remaining (fewer than 6) words, one at a time
  while (lptr < lptr_end) {
    tot += popcount2_long(*lptr++);
  }
  return tot;
}
6371 
popcount_bit_idx(const uintptr_t * lptr,uintptr_t start_idx,uintptr_t end_idx)6372 uintptr_t popcount_bit_idx(const uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
6373   uintptr_t start_idxl = start_idx / BITCT;
6374   uintptr_t start_idxlr = start_idx & (BITCT - 1);
6375   uintptr_t end_idxl = end_idx / BITCT;
6376   uintptr_t end_idxlr = end_idx & (BITCT - 1);
6377   uintptr_t ct = 0;
6378   if (start_idxl == end_idxl) {
6379     return popcount_long(lptr[start_idxl] & ((ONELU << end_idxlr) - (ONELU << start_idxlr)));
6380   }
6381   if (start_idxlr) {
6382     ct = popcount_long(lptr[start_idxl++] >> start_idxlr);
6383   }
6384   if (end_idxl > start_idxl) {
6385     ct += popcount_longs_nzbase(lptr, start_idxl, end_idxl);
6386   }
6387   if (end_idxlr) {
6388     ct += popcount_long(lptr[end_idxl] & ((ONELU << end_idxlr) - ONELU));
6389   }
6390   return ct;
6391 }
6392 
uint32_t chrom_window_max(const uint32_t* marker_pos, const uintptr_t* marker_exclude, const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t ct_max, uint32_t bp_max, uint32_t cur_window_max) {
  // Returns the largest number of non-excluded markers on chromosome
  // chrom_idx that lie within bp_max base pairs at or before a single marker,
  // capped at ct_max and floored at the incoming cur_window_max running
  // maximum.
  // okay, it's absurd to keep rewriting this from scratch, especially given
  // that makes it likely that some reimplementations suck (--indep{-pairwise}
  // version was O(n^2) instead of O(n); sure, it didn't really matter because
  // the main calculation was more expensive, but still, ugh).

  if (cur_window_max >= ct_max) {
    return ct_max;
  }
  // assumes chrom_idx exists
  uint32_t chrom_fo_idx = chrom_info_ptr->chrom_idx_to_foidx[chrom_idx];
  uint32_t chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
  uint32_t marker_uidx = next_unset(marker_exclude, chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx], chrom_end);
  uint32_t marker_ct = chrom_end - marker_uidx - popcount_bit_idx(marker_exclude, marker_uidx, chrom_end);
  if (marker_ct <= cur_window_max) {
    // not enough markers on this chromosome to beat the current maximum
    return cur_window_max;
  }
  uint32_t window_idx_first = 0;            // filtered index of window start
  uint32_t window_uidx_first = marker_uidx; // unfiltered index of window start
  uint32_t window_pos_first = marker_pos[marker_uidx];
  uint32_t marker_idx;
  uint32_t marker_pos_thresh;
  for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
    next_unset_unsafe_ck(marker_exclude, &marker_uidx);
    // earliest base-pair position still within bp_max of the current marker
    marker_pos_thresh = marker_pos[marker_uidx];
    if (marker_pos_thresh < bp_max) {
      marker_pos_thresh = 0;
    } else {
      marker_pos_thresh -= bp_max;
    }
    if (marker_pos_thresh > window_pos_first) {
      // slide the window start forward until it is back in range
      do {
        window_uidx_first++;
        next_unset_unsafe_ck(marker_exclude, &window_uidx_first);
        window_pos_first = marker_pos[window_uidx_first];
        window_idx_first++;
      } while (marker_pos_thresh > window_pos_first);
    } else if (marker_idx - window_idx_first == cur_window_max) {
      // window now holds cur_window_max + 1 markers; bump the maximum
      if (++cur_window_max == ct_max) {
	return cur_window_max;
      }
    }
  }
  return cur_window_max;
}
6438 
uint32_t window_back(const uint32_t* __restrict marker_pos, const double* __restrict marker_cms, const uintptr_t* marker_exclude, uint32_t marker_uidx_min, uint32_t marker_uidx_start, uint32_t count_max, uint32_t bp_max, double cm_max, uint32_t* __restrict window_trail_ct_ptr) {
  // Finds the earliest location which is within count_max sites, bp_max bps,
  // and (if marker_cms != nullptr) cm_max centimorgans.
  // count_max must be positive.
  // Returns the unfiltered index of the window start (>= marker_uidx_min),
  // and stores the number of non-excluded indices in
  // [return value, marker_uidx_start) in *window_trail_ct_ptr.
  if (marker_uidx_min == marker_uidx_start) {
    // special-case this since it happens frequently
    *window_trail_ct_ptr = 0;
    return marker_uidx_min;
  }
  double min_cm = marker_cms? (marker_cms[marker_uidx_start] - cm_max) : 0.0;
  uint32_t min_pos = 0;
  uint32_t marker_uwidx_cur = marker_uidx_start / BITCT;
  uint32_t uii = marker_uidx_start % BITCT;
  uint32_t marker_uidx_last = marker_uidx_start;
  uint32_t remaining_count = count_max;
  const uintptr_t* marker_exclude_cur = &(marker_exclude[marker_uwidx_cur]);
  uintptr_t cur_word;
  uint32_t ujj;
  uint32_t ukk;
  // marker_uwidx_cur now tracks the bit index of the current word's start
  marker_uwidx_cur *= BITCT;
  if (bp_max <= marker_pos[marker_uidx_start]) {
    min_pos = marker_pos[marker_uidx_start] - bp_max;
  }
  if (!uii) {
    goto window_back_zstart;
  }
  // set bits of cur_word = non-excluded markers strictly before
  // marker_uidx_start within the current word
  cur_word = (~(*marker_exclude_cur)) & ((ONELU << uii) - ONELU);
  while (1) {
    if (marker_uwidx_cur <= marker_uidx_min) {
      // reached the lower bound: discard candidates before marker_uidx_min
      cur_word &= ~((ONELU << (marker_uidx_min % BITCT)) - ONELU);
      marker_uwidx_cur = marker_uidx_min;
      uii = popcount_long(cur_word);
      if (uii >= remaining_count) {
	goto window_back_count;
      } else if ((marker_pos[marker_uwidx_cur] < min_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] < min_cm))) {
	goto window_back_find_first_pos;
      } else {
	goto window_back_min;
      }
    }
    uii = popcount_long(cur_word);
    if (uii >= remaining_count) {
      // the count_max'th trailing marker lies inside this word
    window_back_count:
      uii -= remaining_count; // now a count of number of bits to advance
      while (uii) {
	cur_word &= cur_word - 1;
        uii--;
      }
      // bugfix (7 May 2017): forgot to round marker_uwidx_cur down to word
      //   boundary, before adding CTZLU(cur_word) offset
      marker_uwidx_cur = (marker_uwidx_cur & (~(BITCT - ONELU))) + CTZLU(cur_word);
      if ((marker_pos[marker_uwidx_cur] < min_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] < min_cm))) {
	goto window_back_find_first_pos;
      }
      *window_trail_ct_ptr = count_max;
      return marker_uwidx_cur;
    }
    if ((marker_pos[marker_uwidx_cur] < min_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] < min_cm))) {
      // bp/cM constraint binds before the count does: locate the first
      // position satisfying both thresholds
    window_back_find_first_pos:
      ujj = uint32arr_greater_than(&(marker_pos[marker_uwidx_cur]), marker_uidx_last - marker_uwidx_cur, min_pos);
      if (marker_cms) {
	ukk = doublearr_greater_than(&(marker_cms[marker_uwidx_cur]), marker_uidx_last - marker_uwidx_cur, min_cm);
	if (ujj < ukk) {
	  ujj = ukk;
	}
      }
      marker_uwidx_cur += ujj;
      if (marker_uwidx_cur > marker_uidx_min) {
	next_unset_unsafe_ck(marker_exclude, &marker_uwidx_cur);
      }
    window_back_min:
      *window_trail_ct_ptr = marker_uidx_start - marker_uwidx_cur - popcount_bit_idx(marker_exclude, marker_uwidx_cur, marker_uidx_start);
      return marker_uwidx_cur;
    }
    remaining_count -= uii;
    marker_uidx_last = marker_uwidx_cur;
  window_back_zstart:
    // step back to the previous word; all of its set bits are candidates
    cur_word = ~(*(--marker_exclude_cur));
    marker_uwidx_cur -= BITCT;
  }
}
6520 
uint32_t window_forward(const uint32_t* __restrict marker_pos, const double* __restrict marker_cms, const uintptr_t* marker_exclude, uint32_t marker_uidx_start, uint32_t marker_uidx_last, uint32_t count_max, uint32_t bp_max, double cm_max, uint32_t* __restrict window_lead_ct_ptr) {
  // Forward analogue of window_back(): finds the latest location within
  // count_max sites, bp_max bps, and (if marker_cms != nullptr) cm_max
  // centimorgans after marker_uidx_start, without passing marker_uidx_last.
  // *window_lead_ct_ptr receives the number of non-excluded indices in
  // [marker_uidx_start, return value).
  // window_lead_ct_ptr currently cannot be nullptr
  if (marker_uidx_start == marker_uidx_last) {
    *window_lead_ct_ptr = 0;
    return marker_uidx_start;
  }
  double max_cm = marker_cms? (cm_max + marker_cms[marker_uidx_start]) : 0.0;
  uint32_t marker_uwidx_prev = marker_uidx_start;
  uint32_t max_pos = bp_max + marker_pos[marker_uidx_start];
  uint32_t marker_uwidx_cur = (marker_uidx_start + 1) / BITCT;
  uint32_t uii = (marker_uidx_start + 1) % BITCT;
  uint32_t remaining_count = count_max;
  const uintptr_t* marker_exclude_cur = &(marker_exclude[marker_uwidx_cur]);
  uintptr_t cur_word;
  uint32_t ujj;
  uint32_t ukk;
  // marker_uwidx_cur now tracks the bit index of the current word's start
  marker_uwidx_cur *= BITCT;
  // set bits of cur_word = non-excluded markers strictly after
  // marker_uidx_start within the current word
  cur_word = ~((*marker_exclude_cur) | ((ONELU << uii) - ONELU));
  while (1) {
    uii = popcount_long(cur_word);
    if (uii >= remaining_count) {
      // the count_max'th leading marker lies inside this word
      while (--remaining_count) {
	cur_word &= cur_word - 1;
      }
      marker_uwidx_cur += CTZLU(cur_word);
      if (marker_uwidx_cur <= marker_uidx_last) {
	if ((marker_pos[marker_uwidx_cur] > max_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] > max_cm))) {
	  break;
	}
	*window_lead_ct_ptr = count_max;
	return marker_uwidx_cur;
      }
      if ((marker_pos[marker_uidx_last] <= max_pos) && ((!marker_cms) || (marker_cms[marker_uidx_last] <= max_cm))) {
	marker_uwidx_prev = marker_uidx_last;
	goto window_forward_return;
      }
      marker_uwidx_cur = marker_uidx_last;
      break;
    }
    marker_uwidx_cur += BITCT;
    if (marker_uwidx_cur >= marker_uidx_last) {
      // hit the upper bound before exhausting the count
      if ((marker_pos[marker_uidx_last] <= max_pos) && ((!marker_cms) || (marker_cms[marker_uidx_last] <= max_cm))) {
	marker_uwidx_prev = marker_uidx_last;
	goto window_forward_return;
      }
      marker_uwidx_cur = marker_uidx_last;
      break;
    } else if ((marker_pos[marker_uwidx_cur] > max_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] > max_cm))) {
      // bp/cM constraint binds before the count does
      break;
    }
    marker_uwidx_prev = marker_uwidx_cur;
    remaining_count -= uii;
    cur_word = ~(*(++marker_exclude_cur));
  }
  // locate the last in-range position between marker_uwidx_prev and
  // marker_uwidx_cur
  ujj = uint32arr_greater_than(&(marker_pos[marker_uwidx_prev]), marker_uwidx_cur - marker_uwidx_prev, max_pos + 1);
  if (marker_cms) {
    ukk = doublearr_greater_than(&(marker_cms[marker_uwidx_prev]), marker_uwidx_cur - marker_uwidx_prev, max_cm * (1 + SMALL_EPSILON));
    if (ujj > ukk) {
      ujj = ukk;
    }
  }
  marker_uwidx_prev += ujj;
  prev_unset_unsafe_ck(marker_exclude, &marker_uwidx_prev);
 window_forward_return:
  *window_lead_ct_ptr = marker_uwidx_prev - marker_uidx_start - popcount_bit_idx(marker_exclude, marker_uidx_start, marker_uwidx_prev);
  return marker_uwidx_prev;
}
6588 
uintptr_t jump_forward_unset_unsafe(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct) {
  // advances forward_ct unset bits; forward_ct must be positive.  (stays put
  // if forward_ct == 1 and current bit is unset.  may want to tweak this
  // interface, easy to introduce off-by-one bugs...)
  // In usual 64-bit case, also assumes bitvec is 16-byte aligned and the end
  // of the trailing 16-byte block can be safely read from.
  uintptr_t widx = cur_pos / BITCT;
  uintptr_t ulii = cur_pos % BITCT;
  const uintptr_t* bptr = &(bitvec[widx]);
  uintptr_t uljj;
  uintptr_t ulkk;
#ifdef __LP64__
  const __m128i* vptr;
#endif
  if (ulii) {
    // partial first word: set bits of uljj = unset bitvec bits at/after
    // cur_pos
    uljj = (~(*bptr)) >> ulii;
    ulkk = popcount_long(uljj);
    if (ulkk >= forward_ct) {
      // target bit is inside this word
    jump_forward_unset_unsafe_finish:
      // ulii is the in-word shift applied above (it is reset to 0 before the
      // full-word paths below jump here), so the sum is an absolute bit index
      ulkk = CTZLU(uljj);
      while (--forward_ct) {
        uljj &= uljj - 1;
        ulkk = CTZLU(uljj);
      }
      return widx * BITCT + ulii + ulkk;
    }
    forward_ct -= ulkk;
    widx++;
    bptr++;
  }
  ulii = 0;
#ifdef __LP64__
  if (widx & 1) {
    // consume one more word to reach a 16-byte boundary for the vector scan
    uljj = ~(*bptr);
    ulkk = popcount_long(uljj);
    if (ulkk >= forward_ct) {
      goto jump_forward_unset_unsafe_finish;
    }
    forward_ct -= ulkk;
    bptr++;
  }
  vptr = (const __m128i*)bptr;
  // skip whole vector groups while the target is guaranteed to lie beyond
  // them (a group of uljj vectors covers uljj * BITCT * 2 bits)
  while (forward_ct > BITCT * 6) {
    uljj = ((forward_ct - 1) / (BITCT * 6)) * 3;
    ulkk = popcount_vecs(vptr, uljj);
    vptr = &(vptr[uljj]);
    forward_ct -= uljj * BITCT * 2 - ulkk;
  }
  bptr = (const uintptr_t*)vptr;
  while (forward_ct > BITCT) {
    forward_ct -= popcount_long(~(*bptr++));
  }
#else
  // 32-bit: skip whole word groups while the target lies beyond them
  while (forward_ct > BITCT) {
    uljj = (forward_ct - 1) / BITCT;
    ulkk = popcount_longs(bptr, uljj);
    bptr = &(bptr[uljj]);
    forward_ct -= uljj * BITCT - ulkk;
  }
#endif
  // scan the remaining words one at a time
  while (1) {
    uljj = ~(*bptr);
    ulkk = popcount_long(uljj);
    if (ulkk >= forward_ct) {
      widx = (uintptr_t)(bptr - bitvec);
      goto jump_forward_unset_unsafe_finish;
    }
    forward_ct -= ulkk;
    bptr++;
  }
}
6660 
uintptr_t popcount_longs_exclude(const uintptr_t* __restrict lptr, const uintptr_t* __restrict exclude_arr, uintptr_t end_idx) {
  // popcounts lptr ANDNOT exclude_arr[0..(end_idx-1)].
  // N.B. on 64-bit systems, assumes lptr and exclude_arr are 16-byte aligned.
  uintptr_t tot = 0;
  const uintptr_t* lptr_end = &(lptr[end_idx]);
#ifdef __LP64__
  // hand whole 3-vector (6-word) groups to the vectorized popcount
  uintptr_t six_ct = end_idx / 6;
  tot += popcount_vecs_exclude((const __m128i*)lptr, (const __m128i*)exclude_arr, six_ct * 3);
  lptr = &(lptr[six_ct * 6]);
  exclude_arr = &(exclude_arr[six_ct * 6]);
#else
  // 32-bit scalar fallback: same folding as popcount_longs(), with the
  // exclusion mask applied at load time
  const uintptr_t* lptr_six_end;
  uintptr_t tmp_stor;
  uintptr_t loader;
  uintptr_t ulii;
  uintptr_t uljj;
  lptr_six_end = &(lptr[end_idx - (end_idx % 6)]);
  while (lptr < lptr_six_end) {
    loader = (*lptr++) & (~(*exclude_arr++));
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr++) & (~(*exclude_arr++));
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr++) & (~(*exclude_arr++));
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor = (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    loader = (*lptr++) & (~(*exclude_arr++));
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr++) & (~(*exclude_arr++));
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr++) & (~(*exclude_arr++));
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor += (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    // Each 8-bit slot stores a number in 0..48.  Multiplying by 0x01010101 is
    // equivalent to the left-shifts and adds we need to sum those four 8-bit
    // numbers in the high-order slot.
    tot += (tmp_stor * 0x01010101) >> 24;
  }
#endif
  // remaining (fewer than 6) words, one at a time
  while (lptr < lptr_end) {
    tot += popcount_long((*lptr++) & (~(*exclude_arr++)));
  }
  return tot;
}
6712 
uintptr_t popcount_longs_intersect(const uintptr_t* __restrict lptr1, const uintptr_t* __restrict lptr2, uintptr_t word_ct) {
  // popcounts lptr1 AND lptr2 over the first word_ct words.
  // NOTE(review): the sibling popcount_longs_exclude assumes 16-byte
  // alignment of its operands on 64-bit systems; the same presumably holds
  // here since the vector helper is used identically — confirm at callers.
  uintptr_t tot = 0;
  const uintptr_t* lptr1_end = &(lptr1[word_ct]);
#ifdef __LP64__
  // Bulk of the work is done 6 words (3 __m128i vectors) at a time by the
  // SSE2 helper; any leftover words fall through to the scalar loop below.
  uintptr_t six_ct = word_ct / 6;
  tot += popcount_vecs_intersect((const __m128i*)lptr1, (const __m128i*)lptr2, six_ct * 3);
  lptr1 = &(lptr1[six_ct * 6]);
  lptr2 = &(lptr2[six_ct * 6]);
#else
  // 32-bit scalar fallback: SWAR popcount over groups of 6 words, reducing
  // to 2-bit, 4-bit, then 8-bit per-slot counts accumulated in tmp_stor.
  const uintptr_t* lptr1_six_end;
  uintptr_t tmp_stor;
  uintptr_t loader;
  uintptr_t ulii;
  uintptr_t uljj;
  lptr1_six_end = &(lptr1[word_ct - (word_ct % 6)]);
  while (lptr1 < lptr1_six_end) {
    // First triple of intersected words.
    loader = (*lptr1++) & (*lptr2++);
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr1++) & (*lptr2++);
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr1++) & (*lptr2++);
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor = (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    // Second triple, folded into the same byte-wise accumulator.
    loader = (*lptr1++) & (*lptr2++);
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr1++) & (*lptr2++);
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr1++) & (*lptr2++);
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor += (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    // Each 8-bit slot stores a number in 0..48.  Multiplying by 0x01010101 is
    // equivalent to the left-shifts and adds we need to sum those four 8-bit
    // numbers in the high-order slot.
    tot += (tmp_stor * 0x01010101) >> 24;
  }
#endif
  // Residual words not covered by the 6-word groups above.
  while (lptr1 < lptr1_end) {
    tot += popcount_long((*lptr1++) & (*lptr2++));
  }
  return tot;
}
6762 
void vertical_bitct_subtract(const uintptr_t* bitarr, uint32_t item_ct, uint32_t* sum_arr) {
  // Decrements sum_arr[i] by one for every set bit i of bitarr.
  // Assumes trailing bits (past item_ct) are zeroed out.
  uint32_t word_base = 0;
  while (word_base < item_ct) {
    uintptr_t remaining_bits = *bitarr++;
    while (remaining_bits) {
      // Peel off the lowest set bit and decrement its counter.
      sum_arr[word_base + CTZLU(remaining_bits)] -= 1;
      remaining_bits &= remaining_bits - ONELU;
    }
    word_base += BITCT;
  }
}
6777 
6778 #ifdef __LP64__
void count_2freq_dbl_960b(const VECITYPE* geno_vvec, const VECITYPE* geno_vvec_end, const VECITYPE* __restrict mask1vp, const VECITYPE* __restrict mask2vp, uint32_t* __restrict ct1abp, uint32_t* __restrict ct1cp, uint32_t* __restrict ct2abp, uint32_t* __restrict ct2cp) {
  // Scans genotype vectors in [geno_vvec, geno_vvec_end) under two masks,
  // accumulating for mask k (k = 1, 2):
  //   *ct<k>abp: popcount((geno >> 1) & mask) + popcount(geno & mask)
  //   *ct<k>cp:  popcount((geno & mask) & ~((geno >> 1) & mask))
  // i.e. the "ab" counts sum both bits of each masked 2-bit slot, while the
  // "c" counts cover slots with the low bit set and the high bit clear.
  // Assumes the masks use the interleaved 01-per-slot form so loader2 +
  // loader3 cannot carry across slots — TODO(review): confirm at callers.
  // "960b" presumably refers to the data block size handled per call
  // (bounded by 8-bit accumulator capacity) — confirm against call sites.
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  __m128i loader;
  __m128i loader2;
  __m128i loader3;
  __m128i to_ct1_ab;
  __m128i to_ct_abtmp;
  __m128i to_ct1_c;
  __m128i to_ct2_ab;
  __m128i to_ct2_c;
  __univec acc1_ab;
  __univec acc1_c;
  __univec acc2_ab;
  __univec acc2_c;

  acc1_ab.vi = _mm_setzero_si128();
  acc1_c.vi = _mm_setzero_si128();
  acc2_ab.vi = _mm_setzero_si128();
  acc2_c.vi = _mm_setzero_si128();
  do {
    // Vector 1 of 3: initialize the per-iteration partial sums.
    loader = *geno_vvec++;
    loader2 = *mask1vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct1_ab = _mm_add_epi64(loader3, loader2);
    to_ct1_c = _mm_andnot_si128(loader3, loader2);
    loader2 = *mask2vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct2_ab = _mm_add_epi64(loader3, loader2);
    to_ct2_c = _mm_andnot_si128(loader3, loader2);
    // Reduce the ab partials to 2-bit slot counts before more adds.
    to_ct1_ab = _mm_add_epi64(_mm_and_si128(to_ct1_ab, m2), _mm_and_si128(_mm_srli_epi64(to_ct1_ab, 2), m2));
    to_ct2_ab = _mm_add_epi64(_mm_and_si128(to_ct2_ab, m2), _mm_and_si128(_mm_srli_epi64(to_ct2_ab, 2), m2));

    // Vector 2 of 3.
    loader = *geno_vvec++;
    loader2 = *mask1vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct_abtmp = _mm_add_epi64(loader3, loader2);
    to_ct1_c = _mm_add_epi64(to_ct1_c, _mm_andnot_si128(loader3, loader2));
    to_ct1_ab = _mm_add_epi64(to_ct1_ab, _mm_add_epi64(_mm_and_si128(to_ct_abtmp, m2), _mm_and_si128(_mm_srli_epi64(to_ct_abtmp, 2), m2)));
    loader2 = *mask2vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct_abtmp = _mm_add_epi64(loader3, loader2);
    to_ct2_c = _mm_add_epi64(to_ct2_c, _mm_andnot_si128(loader3, loader2));
    to_ct2_ab = _mm_add_epi64(to_ct2_ab, _mm_add_epi64(_mm_and_si128(to_ct_abtmp, m2), _mm_and_si128(_mm_srli_epi64(to_ct_abtmp, 2), m2)));

    // Vector 3 of 3.
    loader = *geno_vvec++;
    loader2 = *mask1vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct_abtmp = _mm_add_epi64(loader3, loader2);
    to_ct1_c = _mm_add_epi64(to_ct1_c, _mm_andnot_si128(loader3, loader2));
    to_ct1_ab = _mm_add_epi64(to_ct1_ab, _mm_add_epi64(_mm_and_si128(to_ct_abtmp, m2), _mm_and_si128(_mm_srli_epi64(to_ct_abtmp, 2), m2)));
    loader2 = *mask2vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct_abtmp = _mm_add_epi64(loader3, loader2);
    to_ct2_c = _mm_add_epi64(to_ct2_c, _mm_andnot_si128(loader3, loader2));
    to_ct2_ab = _mm_add_epi64(to_ct2_ab, _mm_add_epi64(_mm_and_si128(to_ct_abtmp, m2), _mm_and_si128(_mm_srli_epi64(to_ct_abtmp, 2), m2)));

    // The c partials were accumulated as raw bitmasks; reduce to 2-bit now.
    to_ct1_c = _mm_add_epi64(_mm_and_si128(to_ct1_c, m2), _mm_and_si128(_mm_srli_epi64(to_ct1_c, 2), m2));
    to_ct2_c = _mm_add_epi64(_mm_and_si128(to_ct2_c, m2), _mm_and_si128(_mm_srli_epi64(to_ct2_c, 2), m2));

    // Fold everything into 4-bit slots in the running accumulators.
    acc1_ab.vi = _mm_add_epi64(acc1_ab.vi, _mm_add_epi64(_mm_and_si128(to_ct1_ab, m4), _mm_and_si128(_mm_srli_epi64(to_ct1_ab, 4), m4)));
    acc1_c.vi = _mm_add_epi64(acc1_c.vi, _mm_add_epi64(_mm_and_si128(to_ct1_c, m4), _mm_and_si128(_mm_srli_epi64(to_ct1_c, 4), m4)));
    acc2_ab.vi = _mm_add_epi64(acc2_ab.vi, _mm_add_epi64(_mm_and_si128(to_ct2_ab, m4), _mm_and_si128(_mm_srli_epi64(to_ct2_ab, 4), m4)));
    acc2_c.vi = _mm_add_epi64(acc2_c.vi, _mm_add_epi64(_mm_and_si128(to_ct2_c, m4), _mm_and_si128(_mm_srli_epi64(to_ct2_c, 4), m4)));
  } while (geno_vvec < geno_vvec_end);
  // Final horizontal reduction: 4-bit -> 8-bit slots, then sum the two
  // 64-bit lanes and use the multiply trick to add the 16-bit slots.
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  acc1_ab.vi = _mm_add_epi64(_mm_and_si128(acc1_ab.vi, m8), _mm_and_si128(_mm_srli_epi64(acc1_ab.vi, 8), m8));
  acc1_c.vi = _mm_and_si128(_mm_add_epi64(acc1_c.vi, _mm_srli_epi64(acc1_c.vi, 8)), m8);
  acc2_ab.vi = _mm_add_epi64(_mm_and_si128(acc2_ab.vi, m8), _mm_and_si128(_mm_srli_epi64(acc2_ab.vi, 8), m8));
  acc2_c.vi = _mm_and_si128(_mm_add_epi64(acc2_c.vi, _mm_srli_epi64(acc2_c.vi, 8)), m8);
  *ct1abp += ((acc1_ab.u8[0] + acc1_ab.u8[1]) * 0x1000100010001LLU) >> 48;
  *ct1cp += ((acc1_c.u8[0] + acc1_c.u8[1]) * 0x1000100010001LLU) >> 48;
  *ct2abp += ((acc2_ab.u8[0] + acc2_ab.u8[1]) * 0x1000100010001LLU) >> 48;
  *ct2cp += ((acc2_c.u8[0] + acc2_c.u8[1]) * 0x1000100010001LLU) >> 48;
}
6860 
void count_3freq_1920b(const VECITYPE* geno_vvec, const VECITYPE* geno_vvec_end, const VECITYPE* __restrict maskvp, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict homset_ctp) {
  // Scans genotype vectors in [geno_vvec, geno_vvec_end) under maskvp,
  // accumulating three popcounts:
  //   *even_ctp:   popcount(geno & mask)            (low bit of each slot)
  //   *odd_ctp:    popcount((geno >> 1) & mask)     (high bit of each slot)
  //   *homset_ctp: popcount(geno & (geno >> 1) & mask)  (both bits set)
  // "1920b" presumably refers to the data block size handled per call
  // (bounded by 8-bit accumulator capacity) — confirm against call sites.
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  __m128i loader;
  __m128i loader2;
  __m128i loader3;
  __m128i even1;
  __m128i odd1;
  __m128i homset1;
  __m128i even2;
  __m128i odd2;
  __m128i homset2;
  __univec acc_even;
  __univec acc_odd;
  __univec acc_homset;

  acc_even.vi = _mm_setzero_si128();
  acc_odd.vi = _mm_setzero_si128();
  acc_homset.vi = _mm_setzero_si128();
  do {
    // First group of three vectors -> even1/odd1/homset1 raw partials.
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    odd1 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even1 = _mm_and_si128(loader2, loader);
    homset1 = _mm_and_si128(odd1, loader);
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even1 = _mm_add_epi64(even1, _mm_and_si128(loader2, loader));
    odd1 = _mm_add_epi64(odd1, loader3);
    homset1 = _mm_add_epi64(homset1, _mm_and_si128(loader3, loader));
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even1 = _mm_add_epi64(even1, _mm_and_si128(loader2, loader));
    odd1 = _mm_add_epi64(odd1, loader3);
    homset1 = _mm_add_epi64(homset1, _mm_and_si128(loader3, loader));

    // Reduce first-group partials to 2-bit slot counts.
    even1 = _mm_add_epi64(_mm_and_si128(even1, m2), _mm_and_si128(_mm_srli_epi64(even1, 2), m2));
    odd1 = _mm_add_epi64(_mm_and_si128(odd1, m2), _mm_and_si128(_mm_srli_epi64(odd1, 2), m2));
    homset1 = _mm_add_epi64(_mm_and_si128(homset1, m2), _mm_and_si128(_mm_srli_epi64(homset1, 2), m2));

    // Second group of three vectors -> even2/odd2/homset2 raw partials.
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    odd2 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even2 = _mm_and_si128(loader2, loader);
    homset2 = _mm_and_si128(odd2, loader);
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even2 = _mm_add_epi64(even2, _mm_and_si128(loader2, loader));
    odd2 = _mm_add_epi64(odd2, loader3);
    homset2 = _mm_add_epi64(homset2, _mm_and_si128(loader3, loader));
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even2 = _mm_add_epi64(even2, _mm_and_si128(loader2, loader));
    odd2 = _mm_add_epi64(odd2, loader3);
    homset2 = _mm_add_epi64(homset2, _mm_and_si128(loader3, loader));

    // Merge group 2's 2-bit counts into group 1's.
    even1 = _mm_add_epi64(even1, _mm_add_epi64(_mm_and_si128(even2, m2), _mm_and_si128(_mm_srli_epi64(even2, 2), m2)));
    odd1 = _mm_add_epi64(odd1, _mm_add_epi64(_mm_and_si128(odd2, m2), _mm_and_si128(_mm_srli_epi64(odd2, 2), m2)));
    homset1 = _mm_add_epi64(homset1, _mm_add_epi64(_mm_and_si128(homset2, m2), _mm_and_si128(_mm_srli_epi64(homset2, 2), m2)));

    // Fold into 4-bit slots in the running accumulators.
    acc_even.vi = _mm_add_epi64(acc_even.vi, _mm_add_epi64(_mm_and_si128(even1, m4), _mm_and_si128(_mm_srli_epi64(even1, 4), m4)));
    acc_odd.vi = _mm_add_epi64(acc_odd.vi, _mm_add_epi64(_mm_and_si128(odd1, m4), _mm_and_si128(_mm_srli_epi64(odd1, 4), m4)));
    acc_homset.vi = _mm_add_epi64(acc_homset.vi, _mm_add_epi64(_mm_and_si128(homset1, m4), _mm_and_si128(_mm_srli_epi64(homset1, 4), m4)));
  } while (geno_vvec < geno_vvec_end);
  // Final horizontal reduction: 4-bit -> 8-bit slots, sum the two 64-bit
  // lanes, then use the multiply trick to add the 16-bit slots.
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  acc_even.vi = _mm_add_epi64(_mm_and_si128(acc_even.vi, m8), _mm_and_si128(_mm_srli_epi64(acc_even.vi, 8), m8));
  acc_odd.vi = _mm_add_epi64(_mm_and_si128(acc_odd.vi, m8), _mm_and_si128(_mm_srli_epi64(acc_odd.vi, 8), m8));
  acc_homset.vi = _mm_add_epi64(_mm_and_si128(acc_homset.vi, m8), _mm_and_si128(_mm_srli_epi64(acc_homset.vi, 8), m8));
  *even_ctp += ((acc_even.u8[0] + acc_even.u8[1]) * 0x1000100010001LLU) >> 48;
  *odd_ctp += ((acc_odd.u8[0] + acc_odd.u8[1]) * 0x1000100010001LLU) >> 48;
  *homset_ctp += ((acc_homset.u8[0] + acc_homset.u8[1]) * 0x1000100010001LLU) >> 48;
}
6937 #else
void count_2freq_dbl_24b(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict mask1p, const uintptr_t* __restrict mask2p, uint32_t* __restrict ct1abp, uint32_t* __restrict ct1cp, uint32_t* __restrict ct2abp, uint32_t* __restrict ct2cp) {
  // 32-bit scalar counterpart of count_2freq_dbl_960b: reads six words
  // (24 bytes) of genotype data and, for mask k (k = 1, 2), accumulates
  //   *ct<k>abp: popcount((geno >> 1) & mask) + popcount(geno & mask)
  //   *ct<k>cp:  popcount((geno & mask) & ~((geno >> 1) & mask))
  // Assumes the masks use the interleaved 01-per-slot form so
  // loader2 + loader3 cannot carry across 2-bit slots — TODO(review):
  // confirm at callers.
  uintptr_t loader = *geno_vec++;
  uintptr_t loader2 = *mask1p++;
  uintptr_t loader3 = (loader >> 1) & loader2;
  uintptr_t to_ct1_ab;
  uintptr_t to_ct1_c;
  uintptr_t to_ct2_ab;
  uintptr_t to_ct2_c;
  uintptr_t to_ct_abtmp;
  uintptr_t partial1_ab;
  uintptr_t partial1_c;
  uintptr_t partial2_ab;
  uintptr_t partial2_c;
  // Word 1: initialize the ab/c partials for both masks.
  loader2 &= loader;
  to_ct1_ab = loader2 + loader3;
  to_ct1_c = loader2 & (~loader3);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct2_ab = loader2 + loader3;
  to_ct2_c = loader2 & (~loader3);

  to_ct1_ab = (to_ct1_ab & 0x33333333) + ((to_ct1_ab >> 2) & 0x33333333);
  to_ct2_ab = (to_ct2_ab & 0x33333333) + ((to_ct2_ab >> 2) & 0x33333333);

  // Word 2.
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct1_c += loader2 & (~loader3);
  to_ct1_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct2_c += loader2 & (~loader3);
  to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);

  // Word 3.
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct1_c += loader2 & (~loader3);
  to_ct1_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct2_c += loader2 & (~loader3);
  to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);

  // Reduce the first three words to 8-bit (ab) and 4-bit (c) slot counts.
  partial1_ab = (to_ct1_ab & 0x0f0f0f0f) + ((to_ct1_ab >> 4) & 0x0f0f0f0f);
  partial1_c = (to_ct1_c & 0x33333333) + ((to_ct1_c >> 2) & 0x33333333);
  partial2_ab = (to_ct2_ab & 0x0f0f0f0f) + ((to_ct2_ab >> 4) & 0x0f0f0f0f);
  partial2_c = (to_ct2_c & 0x33333333) + ((to_ct2_c >> 2) & 0x33333333);

  // Word 4: restart the per-group partials.
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct1_ab = loader2 + loader3;
  to_ct1_c = loader2 & (~loader3);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct2_ab = loader2 + loader3;
  to_ct2_c = loader2 & (~loader3);

  to_ct1_ab = (to_ct1_ab & 0x33333333) + ((to_ct1_ab >> 2) & 0x33333333);
  to_ct2_ab = (to_ct2_ab & 0x33333333) + ((to_ct2_ab >> 2) & 0x33333333);

  // Word 5.
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct1_c += loader2 & (~loader3);
  to_ct1_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct2_c += loader2 & (~loader3);
  to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);

  // Word 6.
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct1_c += loader2 & (~loader3);
  to_ct1_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct2_c += loader2 & (~loader3);
  to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);

  // Merge the second group into the byte-wise partial sums.
  partial1_ab += (to_ct1_ab & 0x0f0f0f0f) + ((to_ct1_ab >> 4) & 0x0f0f0f0f);
  partial1_c += (to_ct1_c & 0x33333333) + ((to_ct1_c >> 2) & 0x33333333);
  partial2_ab += (to_ct2_ab & 0x0f0f0f0f) + ((to_ct2_ab >> 4) & 0x0f0f0f0f);
  partial2_c += (to_ct2_c & 0x33333333) + ((to_ct2_c >> 2) & 0x33333333);

  partial1_c = (partial1_c & 0x0f0f0f0f) + ((partial1_c >> 4) & 0x0f0f0f0f);
  partial2_c = (partial2_c & 0x0f0f0f0f) + ((partial2_c >> 4) & 0x0f0f0f0f);

  // Multiply trick sums the four byte-wide slots into the top byte.
  *ct1abp += (partial1_ab * 0x01010101) >> 24;
  *ct1cp += (partial1_c * 0x01010101) >> 24;
  *ct2abp += (partial2_ab * 0x01010101) >> 24;
  *ct2cp += (partial2_c * 0x01010101) >> 24;
}
7052 
void count_3freq_48b(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict maskp, uint32_t* __restrict ctap, uint32_t* __restrict ctbp, uint32_t* __restrict ctcp) {
  // 32-bit scalar counterpart of count_3freq_1920b: reads twelve words
  // (48 bytes) of genotype data and accumulates
  //   *ctap: popcount(geno & mask)           (low bit of each slot)
  //   *ctbp: popcount((geno >> 1) & mask)    (high bit of each slot)
  //   *ctcp: popcount(geno & (geno >> 1) & mask)  (both bits set)
  uintptr_t loader = *geno_vec++;
  uintptr_t loader2 = *maskp++;
  uint32_t to_ct_a1 = loader & loader2;
  uint32_t to_ct_b1 = (loader >> 1) & loader2;
  uint32_t to_ct_c1 = loader & to_ct_b1;
  uintptr_t loader3;
  uint32_t to_ct_a2;
  uint32_t to_ct_b2;
  uint32_t to_ct_c2;
  uintptr_t partial_a;
  uintptr_t partial_b;
  uintptr_t partial_c;
  // Words 2-3 folded into the first raw-bitmask partials.  Raw masks can
  // be added at most three deep before 2-bit slot reduction is required.
  loader = *geno_vec++;
  loader2 = *maskp++;
  loader3 = (loader >> 1) & loader2;
  to_ct_a1 += loader & loader2;
  to_ct_b1 += loader3;
  to_ct_c1 += loader & loader3;
  loader = *geno_vec++;
  loader2 = *maskp++;
  loader3 = (loader >> 1) & loader2;
  to_ct_a1 += loader & loader2;
  to_ct_b1 += loader3;
  to_ct_c1 += loader & loader3;

  // Words 4-6 -> second set of partials.
  loader = *geno_vec++;
  loader2 = *maskp++;
  to_ct_a2 = loader & loader2;
  to_ct_b2 = (loader >> 1) & loader2;
  to_ct_c2 = loader & to_ct_b2;
  loader = *geno_vec++;
  loader2 = *maskp++;
  loader3 = (loader >> 1) & loader2;
  to_ct_a2 += loader & loader2;
  to_ct_b2 += loader3;
  to_ct_c2 += loader & loader3;
  loader = *geno_vec++;
  loader2 = *maskp++;
  loader3 = (loader >> 1) & loader2;
  to_ct_a2 += loader & loader2;
  to_ct_b2 += loader3;
  to_ct_c2 += loader & loader3;

  // Reduce words 1-6 to byte-wise partial sums.
  to_ct_a1 = (to_ct_a1 & 0x33333333) + ((to_ct_a1 >> 2) & 0x33333333);
  to_ct_a1 += (to_ct_a2 & 0x33333333) + ((to_ct_a2 >> 2) & 0x33333333);
  partial_a = (to_ct_a1 & 0x0f0f0f0f) + ((to_ct_a1 >> 4) & 0x0f0f0f0f);
  to_ct_b1 = (to_ct_b1 & 0x33333333) + ((to_ct_b1 >> 2) & 0x33333333);
  to_ct_b1 += (to_ct_b2 & 0x33333333) + ((to_ct_b2 >> 2) & 0x33333333);
  partial_b = (to_ct_b1 & 0x0f0f0f0f) + ((to_ct_b1 >> 4) & 0x0f0f0f0f);
  to_ct_c1 = (to_ct_c1 & 0x33333333) + ((to_ct_c1 >> 2) & 0x33333333);
  to_ct_c1 += (to_ct_c2 & 0x33333333) + ((to_ct_c2 >> 2) & 0x33333333);
  partial_c = (to_ct_c1 & 0x0f0f0f0f) + ((to_ct_c1 >> 4) & 0x0f0f0f0f);

  // Words 7-9 -> first set of partials, second pass.
  loader = *geno_vec++;
  loader2 = *maskp++;
  to_ct_a1 = loader & loader2;
  to_ct_b1 = (loader >> 1) & loader2;
  to_ct_c1 = loader & to_ct_b1;
  loader = *geno_vec++;
  loader2 = *maskp++;
  loader3 = (loader >> 1) & loader2;
  to_ct_a1 += loader & loader2;
  to_ct_b1 += loader3;
  to_ct_c1 += loader & loader3;
  loader = *geno_vec++;
  loader2 = *maskp++;
  loader3 = (loader >> 1) & loader2;
  to_ct_a1 += loader & loader2;
  to_ct_b1 += loader3;
  to_ct_c1 += loader & loader3;

  // Words 10-12 -> second set of partials, second pass (note: last word is
  // read without advancing the pointers).
  loader = *geno_vec++;
  loader2 = *maskp++;
  to_ct_a2 = loader & loader2;
  to_ct_b2 = (loader >> 1) & loader2;
  to_ct_c2 = loader & to_ct_b2;
  loader = *geno_vec++;
  loader2 = *maskp++;
  loader3 = (loader >> 1) & loader2;
  to_ct_a2 += loader & loader2;
  to_ct_b2 += loader3;
  to_ct_c2 += loader & loader3;
  loader = *geno_vec;
  loader2 = *maskp;
  loader3 = (loader >> 1) & loader2;
  to_ct_a2 += loader & loader2;
  to_ct_b2 += loader3;
  to_ct_c2 += loader & loader3;

  // Reduce words 7-12 and merge into the byte-wise partial sums.
  to_ct_a1 = (to_ct_a1 & 0x33333333) + ((to_ct_a1 >> 2) & 0x33333333);
  to_ct_a1 += (to_ct_a2 & 0x33333333) + ((to_ct_a2 >> 2) & 0x33333333);
  partial_a += (to_ct_a1 & 0x0f0f0f0f) + ((to_ct_a1 >> 4) & 0x0f0f0f0f);
  to_ct_b1 = (to_ct_b1 & 0x33333333) + ((to_ct_b1 >> 2) & 0x33333333);
  to_ct_b1 += (to_ct_b2 & 0x33333333) + ((to_ct_b2 >> 2) & 0x33333333);
  partial_b += (to_ct_b1 & 0x0f0f0f0f) + ((to_ct_b1 >> 4) & 0x0f0f0f0f);
  to_ct_c1 = (to_ct_c1 & 0x33333333) + ((to_ct_c1 >> 2) & 0x33333333);
  to_ct_c1 += (to_ct_c2 & 0x33333333) + ((to_ct_c2 >> 2) & 0x33333333);
  partial_c += (to_ct_c1 & 0x0f0f0f0f) + ((to_ct_c1 >> 4) & 0x0f0f0f0f);

  // Multiply trick sums the four byte-wide slots into the top byte.
  *ctap += (partial_a * 0x01010101) >> 24;
  *ctbp += (partial_b * 0x01010101) >> 24;
  *ctcp += (partial_c * 0x01010101) >> 24;
}
7157 #endif
7158 
7159 #ifdef __LP64__
void count_set_freq_60v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Scans [vptr, vend) three vectors at a time under include_vec,
  // accumulating:
  //   *set_ctp:     popcount((geno >> 1) & inc) + popcount(geno & (geno >> 1) & inc)
  //   *missing_ctp: popcount(geno & ~(geno >> 1) & inc)
  // i.e. per 2-bit slot, the "set" count adds the high bit plus the
  // both-bits-set indicator, and "missing" covers slots with only the low
  // bit set.  The "60v" suffix presumably reflects the maximum number of
  // vectors processable before the 8-bit accumulator slots could overflow
  // (see the <= 120 bound noted below) — confirm against call sites.
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  __m128i loader;
  __m128i loader2;
  __m128i loader3;
  __m128i odds;
  __m128i evens;
  __m128i missings;
  __univec acc;
  __univec accm;
  acc.vi = _mm_setzero_si128();
  accm.vi = _mm_setzero_si128();
  do {
    // Vector 1 of 3: initialize the raw partials.
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    odds = _mm_and_si128(loader2, loader3);
    evens = _mm_and_si128(odds, loader);
    missings = _mm_and_si128(loader, _mm_andnot_si128(loader2, loader3));

    // Vector 2 of 3.
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    odds = _mm_add_epi64(odds, _mm_and_si128(loader2, loader3));
    loader3 = _mm_and_si128(loader, loader3);
    evens = _mm_add_epi64(evens, _mm_and_si128(loader2, loader3));
    missings = _mm_add_epi64(missings, _mm_andnot_si128(loader2, loader3));

    // Vector 3 of 3.
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    odds = _mm_add_epi64(odds, _mm_and_si128(loader2, loader3));
    loader3 = _mm_and_si128(loader, loader3);
    evens = _mm_add_epi64(evens, _mm_and_si128(loader2, loader3));
    missings = _mm_add_epi64(missings, _mm_andnot_si128(loader2, loader3));

    // Reduce to 2-bit slot counts; evens is folded into odds so a single
    // accumulator tracks the combined set count.
    odds = _mm_add_epi64(_mm_and_si128(odds, m2), _mm_and_si128(_mm_srli_epi64(odds, 2), m2));
    missings = _mm_add_epi64(_mm_and_si128(missings, m2), _mm_and_si128(_mm_srli_epi64(missings, 2), m2));
    odds = _mm_add_epi64(odds, _mm_add_epi64(_mm_and_si128(evens, m2), _mm_and_si128(_mm_srli_epi64(evens, 2), m2)));

    // each 4-bit value here <= 6, so safe to add before m4 mask
    accm.vi = _mm_add_epi64(accm.vi, _mm_and_si128(_mm_add_epi64(missings, _mm_srli_epi64(missings, 4)), m4));

    acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(odds, m4), _mm_and_si128(_mm_srli_epi64(odds, 4), m4)));
  } while (vptr < vend);
  // and each 8-bit value here <= 120
  accm.vi = _mm_and_si128(_mm_add_epi64(accm.vi, _mm_srli_epi64(accm.vi, 8)), m8);

  // Final horizontal reduction via lane sum + multiply trick.
  acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
  *set_ctp += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
}
7214 
count_set_freq_hap_120v(const __m128i * vptr,const __m128i * vend,const __m128i * __restrict include_vec,uint32_t * __restrict set_ctp,uint32_t * __restrict missing_ctp)7215 void count_set_freq_hap_120v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
7216   const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
7217   const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
7218   const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
7219   __univec acc;
7220   __univec accm;
7221   __m128i loader;
7222   __m128i loader2;
7223   __m128i loader3;
7224   __m128i partial;
7225   __m128i partialm;
7226   __m128i partial2;
7227   __m128i partial2m;
7228   acc.vi = _mm_setzero_si128();
7229   accm.vi = _mm_setzero_si128();
7230   do {
7231     loader = *vptr++;
7232     loader2 = _mm_srli_epi64(loader, 1);
7233     loader3 = *include_vec++;
7234     partial = _mm_and_si128(loader3, _mm_and_si128(loader, loader2));
7235     partialm = _mm_and_si128(loader3, _mm_xor_si128(loader, loader2));
7236     loader = *vptr++;
7237     loader2 = _mm_srli_epi64(loader, 1);
7238     loader3 = *include_vec++;
7239     partial = _mm_add_epi64(partial, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7240     partialm = _mm_add_epi64(partialm, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7241     loader = *vptr++;
7242     loader2 = _mm_srli_epi64(loader, 1);
7243     loader3 = *include_vec++;
7244     partial = _mm_add_epi64(partial, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7245     partialm = _mm_add_epi64(partialm, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7246     partial2 = _mm_add_epi64(_mm_and_si128(partial, m2), _mm_and_si128(_mm_srli_epi64(partial, 2), m2));
7247     partial2m = _mm_add_epi64(_mm_and_si128(partialm, m2), _mm_and_si128(_mm_srli_epi64(partialm, 2), m2));
7248 
7249     loader = *vptr++;
7250     loader2 = _mm_srli_epi64(loader, 1);
7251     loader3 = *include_vec++;
7252     partial = _mm_and_si128(loader3, _mm_and_si128(loader, loader2));
7253     partialm = _mm_and_si128(loader3, _mm_xor_si128(loader, loader2));
7254     loader = *vptr++;
7255     loader2 = _mm_srli_epi64(loader, 1);
7256     loader3 = *include_vec++;
7257     partial = _mm_add_epi64(partial, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7258     partialm = _mm_add_epi64(partialm, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7259     loader = *vptr++;
7260     loader2 = _mm_srli_epi64(loader, 1);
7261     loader3 = *include_vec++;
7262     partial = _mm_add_epi64(partial, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7263     partialm = _mm_add_epi64(partialm, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7264     partial2 = _mm_add_epi64(partial2, _mm_add_epi64(_mm_and_si128(partial, m2), _mm_and_si128(_mm_srli_epi64(partial, 2), m2)));
7265     partial2m = _mm_add_epi64(partial2m, _mm_add_epi64(_mm_and_si128(partialm, m2), _mm_and_si128(_mm_srli_epi64(partialm, 2), m2)));
7266     acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(partial2, m4), _mm_and_si128(_mm_srli_epi64(partial2, 4), m4)));
7267     accm.vi = _mm_add_epi64(accm.vi, _mm_add_epi64(_mm_and_si128(partial2m, m4), _mm_and_si128(_mm_srli_epi64(partial2m, 4), m4)));
7268   } while (vptr < vend);
7269   acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
7270   accm.vi = _mm_add_epi64(_mm_and_si128(accm.vi, m8), _mm_and_si128(_mm_srli_epi64(accm.vi, 8), m8));
7271   *set_ctp += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
7272   *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
7273 }
7274 
void count_set_freq_x_60v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, const __m128i* __restrict male_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Chromosome X SSE2 counting kernel: tallies set alleles and missing
  // genotypes for the 2-bit genotype vectors in [vptr, vend), treating
  // nonmale samples as diploid and male samples as haploid.  The missing
  // total is weighted as male_obs + male_missing + 2 * female_missing,
  // matching the scalar tail of genovec_set_freq_x(), which drives this
  // function.  vend - vptr must be a positive multiple of 3 (the loop body
  // consumes 3 vectors per iteration); the "60" in the name refers to the
  // at-most-60 uintptr_t words (30 vectors) the 64-bit driver passes per
  // call, presumably to keep the 4-bit partial sums in acc/accm from
  // overflowing (standard chunked-popcount structure).
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  __m128i loader;
  __m128i loader2;
  __m128i loader3;
  __m128i loader4;
  __m128i set_odds;
  __m128i set_evens;
  __m128i missings_nm;
  __m128i missings_m;
  __m128i males;
  __univec acc;
  __univec accm;
  acc.vi = _mm_setzero_si128();
  accm.vi = _mm_setzero_si128();
  do {
    // Vector 1 of the group: initialize the bit-level partials.
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    // loader4 = included nonmales; males (below) = included males.
    loader4 = _mm_andnot_si128(*male_vec, loader3);
    set_evens = _mm_and_si128(loader, loader4); // subtract missings_nm later
    set_odds = _mm_and_si128(loader2, loader4);
    // 01 pattern (low bit set, high bit clear) = nonmale missing call.
    missings_nm = _mm_andnot_si128(loader2, set_evens);
    males = _mm_and_si128(loader3, *male_vec++);
    // Male set call = 11 pattern; male missing/het call = differing bits.
    set_evens = _mm_or_si128(set_evens, _mm_and_si128(_mm_and_si128(loader, loader2), males));
    missings_m = _mm_and_si128(_mm_xor_si128(loader, loader2), males);

    // Vector 2: same extraction, accumulated via adds.
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    loader4 = _mm_andnot_si128(*male_vec, loader3);
    set_odds = _mm_add_epi64(set_odds, _mm_and_si128(loader2, loader4));
    loader4 = _mm_and_si128(loader, loader4);
    set_evens = _mm_add_epi64(set_evens, loader4);
    missings_nm = _mm_add_epi64(missings_nm, _mm_andnot_si128(loader2, loader4));
    loader4 = _mm_and_si128(loader3, *male_vec++);
    set_evens = _mm_add_epi64(set_evens, _mm_and_si128(_mm_and_si128(loader, loader2), loader4));
    missings_m = _mm_add_epi64(missings_m, _mm_and_si128(_mm_xor_si128(loader, loader2), loader4));
    males = _mm_add_epi64(males, loader4);

    // Vector 3.
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    loader4 = _mm_andnot_si128(*male_vec, loader3);
    set_odds = _mm_add_epi64(set_odds, _mm_and_si128(loader2, loader4));
    loader4 = _mm_and_si128(loader, loader4);
    set_evens = _mm_add_epi64(set_evens, loader4);
    missings_nm = _mm_add_epi64(missings_nm, _mm_andnot_si128(loader2, loader4));
    loader4 = _mm_and_si128(loader3, *male_vec++);
    set_evens = _mm_add_epi64(set_evens, _mm_and_si128(_mm_and_si128(loader, loader2), loader4));
    missings_m = _mm_add_epi64(missings_m, _mm_and_si128(_mm_xor_si128(loader, loader2), loader4));
    males = _mm_add_epi64(males, loader4);

    // Fold to per-4-bit-field counts.  set_evens counted the nonmale
    // missing pattern as a set allele; remove it first.
    set_evens = _mm_sub_epi64(set_evens, missings_nm);
    // Female (nonmale) missings are doubled: 2-bit fold then shift left 1.
    missings_nm = _mm_slli_epi64(_mm_add_epi64(_mm_and_si128(missings_nm, m2), _mm_and_si128(_mm_srli_epi64(missings_nm, 2), m2)), 1);
    set_odds = _mm_add_epi64(_mm_and_si128(set_odds, m2), _mm_and_si128(_mm_srli_epi64(set_odds, 2), m2));
    missings_nm = _mm_add_epi64(missings_nm, _mm_add_epi64(_mm_and_si128(missings_m, m2), _mm_and_si128(_mm_srli_epi64(missings_m, 2), m2)));
    set_odds = _mm_add_epi64(set_odds, _mm_add_epi64(_mm_and_si128(set_evens, m2), _mm_and_si128(_mm_srli_epi64(set_evens, 2), m2)));
    // Every included male observation contributes 1 to the missing total.
    missings_nm = _mm_add_epi64(missings_nm, _mm_add_epi64(_mm_and_si128(males, m2), _mm_and_si128(_mm_srli_epi64(males, 2), m2)));
    acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(set_odds, m4), _mm_and_si128(_mm_srli_epi64(set_odds, 4), m4)));
    accm.vi = _mm_add_epi64(accm.vi, _mm_add_epi64(_mm_and_si128(missings_nm, m4), _mm_and_si128(_mm_srli_epi64(missings_nm, 4), m4)));
  } while (vptr < vend);
  // Fold to bytes, then sum the byte fields with a SWAR multiply.
  acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
  accm.vi = _mm_add_epi64(_mm_and_si128(accm.vi, m8), _mm_and_si128(_mm_srli_epi64(accm.vi, 8), m8));
  *set_ctp += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
}
7344 
void count_set_freq_y_120v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, const __m128i* __restrict nonmale_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Chromosome Y SSE2 counting kernel for the 2-bit genotype vectors in
  // [vptr, vend).  A set allele is counted only for included samples
  // outside nonmale_vec carrying the homozygous-set (11) pattern; the
  // missing count covers every included nonmale, plus any included sample
  // whose two genotype bits differ -- see genovec_set_freq_y(), which
  // drives this function.  vend - vptr must be a positive multiple of 6
  // (6 vectors consumed per iteration); the 64-bit driver passes at most
  // 120 words (60 vectors) per call, presumably to keep the 4-bit partial
  // sums from overflowing (standard chunked-popcount structure).
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  __m128i loader;
  __m128i loader2;
  __m128i loader3;
  __m128i loader4;
  __m128i sets1;
  __m128i missings1;
  __m128i sets2;
  __m128i missings2;
  __univec acc;
  __univec accm;
  acc.vi = _mm_setzero_si128();
  accm.vi = _mm_setzero_si128();
  do {
    // First group of 3 vectors: bit-level partials in sets1/missings1.
    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets1 = _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2));
    missings1 = _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2)));

    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets1 = _mm_add_epi64(sets1, _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2)));
    missings1 = _mm_add_epi64(missings1, _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2))));

    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets1 = _mm_add_epi64(sets1, _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2)));
    missings1 = _mm_add_epi64(missings1, _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2))));
    // Fold first group to per-2-bit-field counts.
    sets1 = _mm_add_epi64(_mm_and_si128(sets1, m2), _mm_and_si128(_mm_srli_epi64(sets1, 2), m2));
    missings1 = _mm_add_epi64(_mm_and_si128(missings1, m2), _mm_and_si128(_mm_srli_epi64(missings1, 2), m2));

    // Second group of 3 vectors: same extraction into sets2/missings2.
    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets2 = _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2));
    missings2 = _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2)));

    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets2 = _mm_add_epi64(sets2, _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2)));
    missings2 = _mm_add_epi64(missings2, _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2))));

    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets2 = _mm_add_epi64(sets2, _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2)));
    missings2 = _mm_add_epi64(missings2, _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2))));
    // Merge the two groups, fold to 4-bit fields, and accumulate.
    sets1 = _mm_add_epi64(sets1, _mm_add_epi64(_mm_and_si128(sets2, m2), _mm_and_si128(_mm_srli_epi64(sets2, 2), m2)));
    missings1 = _mm_add_epi64(missings1, _mm_add_epi64(_mm_and_si128(missings2, m2), _mm_and_si128(_mm_srli_epi64(missings2, 2), m2)));
    acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(sets1, m4), _mm_and_si128(_mm_srli_epi64(sets1, 4), m4)));
    accm.vi = _mm_add_epi64(accm.vi, _mm_add_epi64(_mm_and_si128(missings1, m4), _mm_and_si128(_mm_srli_epi64(missings1, 4), m4)));
  } while (vptr < vend);
  // Fold to bytes, then sum the byte fields with a SWAR multiply.
  acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
  accm.vi = _mm_add_epi64(_mm_and_si128(accm.vi, m8), _mm_and_si128(_mm_srli_epi64(accm.vi, 8), m8));
  *set_ctp += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
}
7415 
uintptr_t count_01_vecs(const __m128i* vptr, uintptr_t vct) {
  // counts number of aligned 01s (i.e. PLINK missing genotypes) in
  // [vptr, vend).  Assumes number of words in interval is a multiple of 12
  // (i.e. vct, the vector count, is a multiple of 6: each inner-loop
  // iteration consumes 6 vectors).
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend;
  __m128i loader1;
  __m128i loader2;
  __m128i loader3;  // NOTE(review): unused here -- appears to be leftover
  __m128i count1;
  __m128i count2;
  __univec acc;

  // Process in chunks of at most 60 vectors; the goto below re-enters this
  // loop body once more for the final sub-60 chunk.  The 60-vector cap
  // presumably bounds the 4-bit partial sums in acc so they cannot
  // overflow (standard chunked-popcount structure).
  while (vct >= 60) {
    vct -= 60;
    vend = &(vptr[60]);
  count_01_vecs_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      // 01 pattern: low bit set AND high bit clear, isolated via
      // x & ~(x >> 1) & 0x5555...
      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader1, 1), loader1), m1);
      count2 = _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2), m1);
      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(count1, _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader1, 1), loader1), m1));
      count2 = _mm_add_epi64(count2, _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2), m1));
      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(count1, _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader1, 1), loader1), m1));
      count2 = _mm_add_epi64(count2, _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2), m1));
      // Fold 2-bit fields to 4-bit fields and accumulate.
      count1 = _mm_add_epi64(_mm_and_si128(count1, m2), _mm_and_si128(_mm_srli_epi64(count1, 2), m2));
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(count2, m2), _mm_and_si128(_mm_srli_epi64(count2, 2), m2)));
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
    } while (vptr < vend);
    // Fold to bytes, then sum the byte fields with a SWAR multiply.
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (vct) {
    // Handle the final partial chunk by jumping back into the loop body.
    vend = &(vptr[vct]);
    vct = 0;
    goto count_01_vecs_main_loop;
  }
  return tot;
}
7463 
7464 #else
void count_set_freq_6(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Scalar (32-bit build) counterpart of count_set_freq_60v: consumes
  // exactly 6 genotype words from lptr, masked by the corresponding
  // include_vec words, adding the number of set alleles to *set_ctp and
  // the number of missing (01-pattern) genotypes to *missing_ctp.
  // Two passes of three words each; each pass accumulates bit-level
  // partials, folds them to 2-bit then 4-bit fields, and the final
  // 0x01010101 multiply sums the byte fields.
  uintptr_t acc = 0;
  uintptr_t accm_partial = 0;
  uint32_t pass_idx;
  uint32_t word_idx;
  for (pass_idx = 0; pass_idx < 2; pass_idx++) {
    uintptr_t geno_word = *lptr++;
    uintptr_t geno_shifted = geno_word >> 1;
    uintptr_t include_word = *include_vec++;
    uintptr_t odds = geno_shifted & include_word;
    uintptr_t evens = odds & geno_word;
    // 01 pattern (low bit set, high bit clear) = missing genotype.
    uintptr_t missings = (~geno_shifted) & include_word & geno_word;
    for (word_idx = 0; word_idx < 2; word_idx++) {
      geno_word = *lptr++;
      geno_shifted = geno_word >> 1;
      include_word = *include_vec++;
      odds += geno_shifted & include_word;
      include_word &= geno_word;
      evens += geno_shifted & include_word;
      missings += (~geno_shifted) & include_word;
    }
    // Fold 2-bit fields to 4-bit fields and accumulate.
    odds = (odds & 0x33333333) + ((odds >> 2) & 0x33333333);
    odds += (evens & 0x33333333) + ((evens >> 2) & 0x33333333);
    accm_partial += (missings & 0x33333333) + ((missings >> 2) & 0x33333333);
    acc += (odds & 0x0f0f0f0f) + ((odds >> 4) & 0x0f0f0f0f);
  }
  accm_partial = (accm_partial & 0x0f0f0f0f) + ((accm_partial >> 4) & 0x0f0f0f0f);
  *set_ctp += (acc * 0x01010101) >> 24;
  *missing_ctp += (accm_partial * 0x01010101) >> 24;
}
7527 
void count_set_freq_hap_12(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Scalar (32-bit build) haploid counting kernel: consumes exactly 12
  // genotype words.  For included samples, the 11 bit pattern counts as a
  // set call and the differing-bits (XOR) pattern feeds the missing count.
  // Structure: 2 halves x 2 groups x 3 words, folding to 2-bit fields per
  // group and 4-bit fields per half, then a SWAR byte-sum multiply.
  uintptr_t acc = 0;
  uintptr_t accm = 0;
  uint32_t half_idx;
  uint32_t grp_idx;
  uint32_t word_idx;
  for (half_idx = 0; half_idx < 2; half_idx++) {
    uintptr_t partial2 = 0;
    uintptr_t partial2m = 0;
    for (grp_idx = 0; grp_idx < 2; grp_idx++) {
      uintptr_t partial = 0;
      uintptr_t partialm = 0;
      for (word_idx = 0; word_idx < 3; word_idx++) {
        uintptr_t geno_word = *lptr++;
        uintptr_t geno_shifted = geno_word >> 1;
        uintptr_t include_word = *include_vec++;
        partial += geno_word & geno_shifted & include_word;
        partialm += (geno_word ^ geno_shifted) & include_word;
      }
      partial2 += (partial & 0x33333333) + ((partial >> 2) & 0x33333333);
      partial2m += (partialm & 0x33333333) + ((partialm >> 2) & 0x33333333);
    }
    acc += (partial2 & 0x0f0f0f0f) + ((partial2 >> 4) & 0x0f0f0f0f);
    accm += (partial2m & 0x0f0f0f0f) + ((partial2m >> 4) & 0x0f0f0f0f);
  }
  *set_ctp += (acc * 0x01010101) >> 24;
  *missing_ctp += (accm * 0x01010101) >> 24;
}
7618 
void count_set_freq_x_6(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, const uintptr_t* __restrict male_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Scalar (32-bit build) chromosome X counting kernel: consumes exactly 6
  // genotype words, counting nonmales as diploid and males as haploid.
  // The missing total is weighted as
  //   male_obs + male_missing + 2 * female_missing
  // (matching the tail loop of genovec_set_freq_x()).
  uintptr_t acc = 0;
  uintptr_t accm = 0;
  uint32_t pass_idx;
  uint32_t word_idx;
  for (pass_idx = 0; pass_idx < 2; pass_idx++) {
    uintptr_t set_odds = 0;
    uintptr_t set_evens = 0;
    uintptr_t missings_nm = 0; // nonmale missing calls (01 pattern)
    uintptr_t missings_m = 0;  // male missing/het calls (bits differ)
    uintptr_t males = 0;       // included-male observations
    for (word_idx = 0; word_idx < 3; word_idx++) {
      uintptr_t geno_word = *lptr++;
      uintptr_t geno_shifted = geno_word >> 1;
      uintptr_t include_word = *include_vec++;
      // Split the include mask into nonmale and male parts (disjoint).
      uintptr_t nonmale_mask = include_word & (~(*male_vec));
      uintptr_t male_mask = include_word & (*male_vec++);
      uintptr_t evens_nm = geno_word & nonmale_mask;
      set_odds += geno_shifted & nonmale_mask;
      set_evens += evens_nm;
      missings_nm += evens_nm & (~geno_shifted);
      // Male set call = 11 pattern (counted once, haploid).
      set_evens += geno_word & geno_shifted & male_mask;
      missings_m += (geno_word ^ geno_shifted) & male_mask;
      males += male_mask;
    }
    // The missing pattern must not count as a set allele.
    set_evens -= missings_nm;
    set_odds = (set_odds & 0x33333333) + ((set_odds >> 2) & 0x33333333);
    set_odds += (set_evens & 0x33333333) + ((set_evens >> 2) & 0x33333333);
    // Female missings are doubled; male missings and male observations
    // each contribute once.
    missings_nm = ((missings_nm & 0x33333333) + ((missings_nm >> 2) & 0x33333333)) * 2;
    missings_nm += (missings_m & 0x33333333) + ((missings_m >> 2) & 0x33333333);
    missings_nm += (males & 0x33333333) + ((males >> 2) & 0x33333333);
    acc += (set_odds & 0x0f0f0f0f) + ((set_odds >> 4) & 0x0f0f0f0f);
    accm += (missings_nm & 0x0f0f0f0f) + ((missings_nm >> 4) & 0x0f0f0f0f);
  }
  *set_ctp += (acc * 0x01010101) >> 24;
  *missing_ctp += (accm * 0x01010101) >> 24;
}
7718 
void count_set_freq_y_12(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, const uintptr_t* __restrict nonmale_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Scalar (32-bit build) chromosome Y counting kernel: consumes exactly
  // 12 genotype words.  Set alleles are counted only for included samples
  // outside nonmale_vec with the 11 pattern; every included nonmale, plus
  // any included sample whose two genotype bits differ, contributes to the
  // missing count (see genovec_set_freq_y()).
  // Structure: 2 halves x 2 groups x 3 words with the usual 2-bit/4-bit
  // folds, finished by a SWAR byte-sum multiply.
  uintptr_t acc = 0;
  uintptr_t accm = 0;
  uint32_t half_idx;
  uint32_t grp_idx;
  uint32_t word_idx;
  for (half_idx = 0; half_idx < 2; half_idx++) {
    uintptr_t sets_fold = 0;
    uintptr_t missings_fold = 0;
    for (grp_idx = 0; grp_idx < 2; grp_idx++) {
      uintptr_t sets = 0;
      uintptr_t missings = 0;
      for (word_idx = 0; word_idx < 3; word_idx++) {
        uintptr_t geno_word = *lptr++;
        uintptr_t geno_shifted = geno_word >> 1;
        uintptr_t include_word = *include_vec++;
        uintptr_t nonmale_word = *nonmale_vec++;
        sets += include_word & geno_word & geno_shifted & (~nonmale_word);
        missings += include_word & (nonmale_word | (geno_word ^ geno_shifted));
      }
      sets_fold += (sets & 0x33333333) + ((sets >> 2) & 0x33333333);
      missings_fold += (missings & 0x33333333) + ((missings >> 2) & 0x33333333);
    }
    acc += (sets_fold & 0x0f0f0f0f) + ((sets_fold >> 4) & 0x0f0f0f0f);
    accm += (missings_fold & 0x0f0f0f0f) + ((missings_fold >> 4) & 0x0f0f0f0f);
  }
  *set_ctp += (acc * 0x01010101) >> 24;
  *missing_ctp += (accm * 0x01010101) >> 24;
}
7822 
count_01_12(const uintptr_t * lptr)7823 uintptr_t count_01_12(const uintptr_t* lptr) {
7824   uintptr_t loader1 = *lptr++;
7825   uintptr_t loader2 = *lptr++;
7826   uintptr_t count1 = loader1 & (~(loader1 >> 1)) & FIVEMASK;
7827   uintptr_t count2 = loader2 & (~(loader2 >> 1)) & FIVEMASK;
7828   uintptr_t partial1;
7829   uintptr_t partial2;
7830   loader1 = *lptr++;
7831   loader2 = *lptr++;
7832   count1 += loader1 & (~(loader1 >> 1)) & FIVEMASK;
7833   count2 += loader2 & (~(loader2 >> 1)) & FIVEMASK;
7834   loader1 = *lptr++;
7835   loader2 = *lptr++;
7836   count1 += loader1 & (~(loader1 >> 1)) & FIVEMASK;
7837   count2 += loader2 & (~(loader2 >> 1)) & FIVEMASK;
7838   partial1 = (count1 & 0x33333333) + ((count1 >> 2) & 0x33333333);
7839   partial2 = (count2 & 0x33333333) + ((count2 >> 2) & 0x33333333);
7840 
7841   loader1 = *lptr++;
7842   loader2 = *lptr++;
7843   count1 = loader1 & (~(loader1 >> 1)) & FIVEMASK;
7844   count2 = loader2 & (~(loader2 >> 1)) & FIVEMASK;
7845   loader1 = *lptr++;
7846   loader2 = *lptr++;
7847   count1 += loader1 & (~(loader1 >> 1)) & FIVEMASK;
7848   count2 += loader2 & (~(loader2 >> 1)) & FIVEMASK;
7849   loader1 = *lptr++;
7850   loader2 = *lptr++;
7851   count1 += loader1 & (~(loader1 >> 1)) & FIVEMASK;
7852   count2 += loader2 & (~(loader2 >> 1)) & FIVEMASK;
7853   partial1 += (count1 & 0x33333333) + ((count1 >> 2) & 0x33333333);
7854   partial2 += (count2 & 0x33333333) + ((count2 >> 2) & 0x33333333);
7855 
7856   partial1 = (partial1 & 0x0f0f0f0f) + ((partial1 >> 4) & 0x0f0f0f0f);
7857   partial1 += (partial2 & 0x0f0f0f0f) + ((partial2 >> 4) & 0x0f0f0f0f);
7858   return (partial1 * 0x01010101) >> 24;
7859 }
7860 #endif
7861 
void genovec_set_freq(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Assuming include_quatervec describes e.g. cases, and an autosomal
  // marker, this counts the number of case set alleles loaded in
  // geno_vec[], as well as the number of cases with missing genotype info.
  // See single_marker_freqs_and_hwe() for discussion.
  // missing count: popcount2(genotype & (~(genotype >> 1)) & 0x5555...)
  // set allele count: popcount(genotype) - missing count
  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
  uintptr_t cur_geno;
  uintptr_t cur_include;
  uintptr_t missing_incr;
  uint32_t set_acc = 0;
  uint32_t missing_acc = 0;
#ifdef __LP64__
  // Hand the largest possible multiple-of-6-word prefix to the vectorized
  // kernel, 60 words at a time.
  uintptr_t chunk_size;
  sample_ctl2 -= sample_ctl2 % 6;
  while (sample_ctl2) {
    chunk_size = (sample_ctl2 > 60)? 60 : sample_ctl2;
    count_set_freq_60v((const __m128i*)geno_vec, (const __m128i*)(&(geno_vec[chunk_size])), (const __m128i*)include_quatervec, &set_acc, &missing_acc);
    geno_vec = &(geno_vec[chunk_size]);
    include_quatervec = &(include_quatervec[chunk_size]);
    sample_ctl2 -= chunk_size;
  }
#else
  const uintptr_t* geno_vec_six_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 6)]);
  while (geno_vec < geno_vec_six_end) {
    count_set_freq_6(geno_vec, include_quatervec, &set_acc, &missing_acc);
    geno_vec = &(geno_vec[6]);
    include_quatervec = &(include_quatervec[6]);
  }
#endif
  // Word-at-a-time cleanup for the sub-6-word tail.
  while (geno_vec < geno_vec_end) {
    cur_geno = *geno_vec++;
    cur_include = *include_quatervec++;
    missing_incr = popcount2_long(cur_geno & (~(cur_geno >> 1)) & cur_include);
    missing_acc += missing_incr;
    set_acc += popcount_long(cur_geno & (cur_include * 3)) - missing_incr;
  }
  *set_ctp = set_acc;
  *missing_ctp = missing_acc;
}
7909 
void genovec_set_freq_x(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, const uintptr_t* __restrict male_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // diploid counting for nonmales, haploid counting for males
  // missing_ct := male_obs + male_missing + 2 * female_missing
  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
  uintptr_t cur_geno;
  uintptr_t geno_shifted;
  uintptr_t cur_include;
  uintptr_t cur_mask;
  uintptr_t missing_incr;
  uint32_t set_acc = 0;
  uint32_t missing_acc = 0;
#ifdef __LP64__
  // Hand the largest possible multiple-of-6-word prefix to the vectorized
  // kernel, 60 words at a time.
  uintptr_t chunk_size;
  sample_ctl2 -= sample_ctl2 % 6;
  while (sample_ctl2) {
    chunk_size = (sample_ctl2 > 60)? 60 : sample_ctl2;
    count_set_freq_x_60v((const __m128i*)geno_vec, (const __m128i*)(&(geno_vec[chunk_size])), (const __m128i*)include_quatervec, (const __m128i*)male_quatervec, &set_acc, &missing_acc);
    geno_vec = &(geno_vec[chunk_size]);
    include_quatervec = &(include_quatervec[chunk_size]);
    male_quatervec = &(male_quatervec[chunk_size]);
    sample_ctl2 -= chunk_size;
  }
#else
  const uintptr_t* geno_vec_six_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 6)]);
  while (geno_vec < geno_vec_six_end) {
    count_set_freq_x_6(geno_vec, include_quatervec, male_quatervec, &set_acc, &missing_acc);
    geno_vec = &(geno_vec[6]);
    include_quatervec = &(include_quatervec[6]);
    male_quatervec = &(male_quatervec[6]);
  }
#endif
  // Word-at-a-time cleanup for the sub-6-word tail.
  while (geno_vec < geno_vec_end) {
    cur_geno = *geno_vec++;
    geno_shifted = cur_geno >> 1;
    cur_include = *include_quatervec++;
    // Nonmale (diploid) part: female missings count double.
    cur_mask = cur_include & (~(*male_quatervec));
    missing_incr = popcount2_long(cur_geno & (~geno_shifted) & cur_mask);
    missing_acc += 2 * missing_incr;
    set_acc += popcount_long(cur_geno & (cur_mask * 3)) - missing_incr;
    // Male (haploid) part: 11 pattern is one set call; every male
    // observation and every male missing/het pattern adds to missing_acc.
    cur_mask = cur_include & (*male_quatervec++);
    set_acc += popcount2_long(cur_geno & geno_shifted & cur_mask);
    missing_acc += popcount_long(((cur_geno ^ geno_shifted) & cur_mask) | (cur_mask << 1));
  }
  *set_ctp = set_acc;
  *missing_ctp = missing_acc;
}
7963 
void genovec_set_freq_y(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, const uintptr_t* __restrict nonmale_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Chromosome Y set-allele / missing counter over 2-bit genotype words.
  // Haploid counting for males; all nonmales contribute to missing_ct here.
  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t loader3;
  uintptr_t loader4;
  uint32_t acc = 0;
  uint32_t accm = 0;
#ifdef __LP64__
  // Vectorized main loop: hand 120-word chunks to the SSE2 kernel.  The goto
  // re-enters the loop body exactly once more for the final partial
  // (<120-word, multiple-of-12) chunk.
  uintptr_t cur_decr = 120;
  const uintptr_t* geno_vec_12x_end;
  sample_ctl2 -= sample_ctl2 % 12;
  while (sample_ctl2 >= 120) {
  genovec_set_freq_y_loop:
    geno_vec_12x_end = &(geno_vec[cur_decr]);
    // const-qualified casts, for consistency with genovec_set_freq_x() and
    // genovec_3freq(); the inputs are const to begin with
    count_set_freq_y_120v((const __m128i*)geno_vec, (const __m128i*)geno_vec_12x_end, (const __m128i*)include_quatervec, (const __m128i*)nonmale_quatervec, &acc, &accm);
    geno_vec = geno_vec_12x_end;
    include_quatervec = &(include_quatervec[cur_decr]);
    nonmale_quatervec = &(nonmale_quatervec[cur_decr]);
    sample_ctl2 -= cur_decr;
  }
  if (sample_ctl2) {
    cur_decr = sample_ctl2;
    goto genovec_set_freq_y_loop;
  }
#else
  // 32-bit fallback: scalar helper processes 12 words at a time.
  const uintptr_t* geno_vec_twelve_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 12)]);
  while (geno_vec < geno_vec_twelve_end) {
    count_set_freq_y_12(geno_vec, include_quatervec, nonmale_quatervec, &acc, &accm);
    geno_vec = &(geno_vec[12]);
    include_quatervec = &(include_quatervec[12]);
    nonmale_quatervec = &(nonmale_quatervec[12]);
  }
#endif
  // Scalar cleanup over the remaining (<12) words.
  while (geno_vec < geno_vec_end) {
    loader = *geno_vec++;
    loader2 = loader >> 1;
    loader3 = *include_quatervec++;
    loader4 = *nonmale_quatervec++;
    // set count: 0b11 entries among included males
    acc += popcount2_long(loader & loader2 & loader3 & (~loader4));
    // missing: included slots that are nonmale, or carry a 0b01/0b10 call
    accm += popcount2_long(loader3 & ((loader ^ loader2) | loader4));
  }
  *set_ctp = acc;
  *missing_ctp = accm;
}
8010 
void genovec_3freq(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict missing_ctp, uint32_t* __restrict het_ctp, uint32_t* __restrict homset_ctp) {
  // generic routine for getting all counts.
  // Accumulates three popcounts over included 2-bit genotype slots (low bit
  // set, high bit set, both bits set), then converts them to missing / het /
  // hom-set counts at the end.
  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t loader3;
  uint32_t acc_even = 0;
  uint32_t acc_odd = 0;
  uint32_t acc_and = 0;
#ifdef __LP64__
  // Vectorized main loop: 120-word chunks; the goto re-enters the body once
  // more for the final partial (multiple-of-12) chunk.
  uintptr_t cur_decr = 120;
  const uintptr_t* geno_vec_12x_end;
  sample_ctl2 -= sample_ctl2 % 12;
  while (sample_ctl2 >= 120) {
  genovec_3freq_loop:
    geno_vec_12x_end = &(geno_vec[cur_decr]);
    count_3freq_1920b((const __m128i*)geno_vec, (const __m128i*)geno_vec_12x_end, (const __m128i*)include_quatervec, &acc_even, &acc_odd, &acc_and);
    geno_vec = geno_vec_12x_end;
    include_quatervec = &(include_quatervec[cur_decr]);
    sample_ctl2 -= cur_decr;
  }
  if (sample_ctl2) {
    cur_decr = sample_ctl2;
    goto genovec_3freq_loop;
  }
#else
  // 32-bit fallback: scalar helper processes 12 words (48 bytes) at a time.
  const uintptr_t* geno_vec_twelve_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 12)]);
  while (geno_vec < geno_vec_twelve_end) {
    count_3freq_48b(geno_vec, include_quatervec, &acc_even, &acc_odd, &acc_and);
    geno_vec = &(geno_vec[12]);
    include_quatervec = &(include_quatervec[12]);
  }
#endif
  // Scalar cleanup over the remaining words.
  while (geno_vec < geno_vec_end) {
    loader = *geno_vec++;
    loader2 = *include_quatervec++;
    loader3 = loader2 & (loader >> 1);
    // low genotype bit set (patterns 0b01 and 0b11) among included slots
    acc_even += popcount2_long(loader & loader2);
    // high genotype bit set (patterns 0b10 and 0b11)
    acc_odd += popcount2_long(loader3);
    // both bits set (pattern 0b11)
    acc_and += popcount2_long(loader & loader3);
  }
  // 0b01 -> missing, 0b10 -> het, 0b11 -> hom-set
  *missing_ctp = acc_even - acc_and;
  *het_ctp = acc_odd - acc_and;
  *homset_ctp = acc_and;
}
8056 
count_01(const uintptr_t * quatervec,uintptr_t word_ct)8057 uintptr_t count_01(const uintptr_t* quatervec, uintptr_t word_ct) {
8058   // really just for getting a missing count
8059   // unlike popcount01_longs, this does not assume quatervec[] has no 11s
8060   const uintptr_t* quatervec_end = &(quatervec[word_ct]);
8061   uintptr_t loader;
8062 #ifdef __LP64__
8063   uintptr_t acc;
8064   word_ct -= word_ct % 12;
8065   acc = count_01_vecs((__m128i*)quatervec, word_ct / 2);
8066   quatervec = &(quatervec[word_ct]);
8067 #else
8068   const uintptr_t* quatervec_twelve_end = &(quatervec[word_ct - (word_ct % 12)]);
8069   uintptr_t acc = 0;
8070   while (quatervec < quatervec_twelve_end) {
8071     acc += count_01_12(quatervec);
8072     quatervec = &(quatervec[12]);
8073   }
8074 #endif
8075   while (quatervec < quatervec_end) {
8076     loader = *quatervec++;
8077     acc += popcount2_long(loader & (~(loader >> 1)) & FIVEMASK);
8078   }
8079   return acc;
8080 }
8081 
fill_all_bits(uintptr_t ct,uintptr_t * bitarr)8082 void fill_all_bits(uintptr_t ct, uintptr_t* bitarr) {
8083   // leaves bits beyond the end unset
8084   // ok for ct == 0
8085   uintptr_t quotient = ct / BITCT;
8086   uintptr_t remainder = ct % BITCT;
8087   fill_ulong_one(quotient, bitarr);
8088   if (remainder) {
8089     bitarr[quotient] = (ONELU << remainder) - ONELU;
8090   }
8091 }
8092 
numeric_range_list_to_bitarr(const Range_list * range_list_ptr,uint32_t item_ct,uint32_t offset,uint32_t ignore_overflow,uintptr_t * bitarr)8093 uint32_t numeric_range_list_to_bitarr(const Range_list* range_list_ptr, uint32_t item_ct, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr) {
8094   // bitarr assumed to be initialized
8095   const char* names = range_list_ptr->names;
8096   const unsigned char* starts_range = range_list_ptr->starts_range;
8097   uint32_t name_ct = range_list_ptr->name_ct;
8098   uint32_t name_max_len = range_list_ptr->name_max_len;
8099   uint32_t idx_max = item_ct + offset;
8100   uint32_t name_idx;
8101   uint32_t idx1;
8102   uint32_t idx2;
8103   for (name_idx = 0; name_idx < name_ct; name_idx++) {
8104     if (scan_uint_capped(&(names[name_idx * name_max_len]), idx_max, &idx1)) {
8105       if (ignore_overflow) {
8106 	continue;
8107       }
8108       return 1;
8109     }
8110     if (starts_range[name_idx]) {
8111       name_idx++;
8112       if (scan_uint_capped(&(names[name_idx * name_max_len]), idx_max, &idx2)) {
8113 	if (!ignore_overflow) {
8114 	  return 1;
8115 	}
8116         idx2 = idx_max - 1;
8117       }
8118       fill_bits(idx1 - offset, (idx2 - idx1) + 1, bitarr);
8119     } else {
8120       set_bit(idx1 - offset, bitarr);
8121     }
8122   }
8123   return 0;
8124 }
8125 
int32_t string_range_list_to_bitarr(char* header_line, uint32_t item_ct, uint32_t fixed_len, const Range_list* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uintptr_t* bitarr, int32_t* __restrict seen_idxs) {
  // Walks the item_ct tokens of header_line and sets a bit in bitarr for
  // every item selected by the --{range_list_flag} name/range list.
  // bitarr assumed to be initialized; seen_idxs must hold -1 for every name
  // on entry (it records the item index where each name was matched).
  // if fixed_len is zero, header_line is assumed to be a list of
  // space-delimited unequal-length names; otherwise names start at fixed_len
  // byte intervals.
  // Returns 0 on success, RET_INVALID_CMDLINE or RET_INVALID_FORMAT on error.
  uintptr_t max_id_len = range_list_ptr->name_max_len;
  uintptr_t name_ct = range_list_ptr->name_ct;
  uint32_t item_idx = 0;
  int32_t retval = 0;
  char* bufptr;
  uint32_t cmdline_pos;
  int32_t ii;
  while (1) {
    bufptr = token_endnn(header_line);
    // look up the current header token among the sorted command-line names
    ii = bsearch_str(header_line, (uintptr_t)(bufptr - header_line), sorted_ids, max_id_len, name_ct);
    if (ii != -1) {
      cmdline_pos = id_map[(uint32_t)ii];
      if (seen_idxs[cmdline_pos] != -1) {
	sprintf(g_logbuf, "Error: Duplicate --%s token in %s.\n", range_list_flag, file_descrip);
        goto string_range_list_to_bitarr_ret_INVALID_FORMAT_2;
      }
      seen_idxs[cmdline_pos] = item_idx;
      if (cmdline_pos && range_list_ptr->starts_range[cmdline_pos - 1]) {
        // this name closes a range; its opening partner must already have
        // matched an earlier token
        if (seen_idxs[cmdline_pos - 1] == -1) {
          LOGPREPRINTFWW("Error: Second element of --%s range appears before first element in %s.\n", range_list_flag, file_descrip);
          goto string_range_list_to_bitarr_ret_INVALID_CMDLINE_2;
	}
	fill_bits(seen_idxs[cmdline_pos - 1], (item_idx - seen_idxs[cmdline_pos - 1]) + 1, bitarr);
      } else if (!(range_list_ptr->starts_range[cmdline_pos])) {
        // standalone name; range-opening names set no bits until their
        // closing partner is found
	SET_BIT(item_idx, bitarr);
      }
    }
    if (++item_idx == item_ct) {
      break;
    }
    if (fixed_len) {
      header_line = &(header_line[fixed_len]);
    } else {
      header_line = skip_initial_spaces(&(bufptr[1]));
    }
  }
  // every command-line name must have matched some header token
  for (cmdline_pos = 0; cmdline_pos < name_ct; cmdline_pos++) {
    if (seen_idxs[cmdline_pos] == -1) {
      goto string_range_list_to_bitarr_ret_INVALID_CMDLINE_3;
    }
  }
  // shared error-exit ladder (while(0) so normal flow skips it)
  while (0) {
  string_range_list_to_bitarr_ret_INVALID_CMDLINE_3:
    sprintf(g_logbuf, "Error: Missing --%s token in %s.\n", range_list_flag, file_descrip);
  string_range_list_to_bitarr_ret_INVALID_CMDLINE_2:
    logerrprintb();
    retval = RET_INVALID_CMDLINE;
    break;
  string_range_list_to_bitarr_ret_INVALID_FORMAT_2:
    logerrprintb();
    retval = RET_INVALID_FORMAT;
    break;
  }
  return retval;
}
8185 
int32_t string_range_list_to_bitarr_alloc(char* header_line, uint32_t item_ct, uint32_t fixed_len, const Range_list* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uintptr_t** bitarr_ptr) {
  // wrapper for string_range_list_to_bitarr which allocates the bitfield and
  // temporary buffers on the heap.  *bitarr_ptr remains allocated on the
  // bigstack after return; the temporaries are released via bigstack_reset.
  uintptr_t item_ctl = BITCT_TO_WORDCT(item_ct);
  uintptr_t name_ct = range_list_ptr->name_ct;
  int32_t retval = 0;
  int32_t* seen_idxs;
  char* sorted_ids;
  uint32_t* id_map;
  if (bigstack_calloc_ul(item_ctl, bitarr_ptr) ||
      bigstack_alloc_i(name_ct, &seen_idxs)) {
    return RET_NOMEM;
  }
  // kludge to use sort_item_ids(): seen_idxs temporarily doubles as the
  // all-zero exclusion bitfield that function expects
  fill_ulong_zero(BITCT_TO_WORDCT(name_ct), (uintptr_t*)seen_idxs);
  if (sort_item_ids(name_ct, (uintptr_t*)seen_idxs, 0, range_list_ptr->names, range_list_ptr->name_max_len, 0, 0, strcmp_deref, &sorted_ids, &id_map)) {
    return RET_NOMEM;
  }
  // now give seen_idxs its real initial state: -1 == "name not seen yet",
  // as required by string_range_list_to_bitarr()
  fill_int_one(name_ct, seen_idxs);
  retval = string_range_list_to_bitarr(header_line, item_ct, fixed_len, range_list_ptr, sorted_ids, id_map, range_list_flag, file_descrip, *bitarr_ptr, seen_idxs);
  bigstack_reset(seen_idxs);
  return retval;
}
8209 
int32_t string_range_list_to_bitarr2(const char* __restrict sorted_ids, const uint32_t* id_map, uintptr_t item_ct, uintptr_t max_id_len, const Range_list* __restrict range_list_ptr, const char* __restrict range_list_flag, uintptr_t* bitfield_excl) {
  // Clears the exclusion bit for every item selected by the
  // --{range_list_flag} name/range list.
  // sorted_ids/id_map is for e.g. marker IDs instead of command line
  // parameters.  bitfield_excl is assumed to be initialized (since its length
  // is not known by this function).
  // Returns 0 on success, RET_INVALID_CMDLINE if a name is not found or a
  // range is reversed.
  char* names = range_list_ptr->names;
  const unsigned char* starts_range = range_list_ptr->starts_range;
  uintptr_t name_max_len = range_list_ptr->name_max_len;
  uint32_t name_ct = range_list_ptr->name_ct;
  int32_t retval = 0;
  uint32_t param_idx;
  char* bufptr;
  uint32_t item_uidx;
  uint32_t item_uidx2;
  int32_t ii;
  for (param_idx = 0; param_idx < name_ct; param_idx++) {
    bufptr = &(names[param_idx * name_max_len]);
    ii = bsearch_str_nl(bufptr, sorted_ids, max_id_len, item_ct);
    if (ii == -1) {
      goto string_range_list_to_bitarr2_ret_INVALID_CMDLINE_3;
    }
    item_uidx = id_map[(uint32_t)ii];
    if (starts_range[param_idx]) {
      // range entry: consume the paired end name as well
      param_idx++;
      bufptr = &(names[param_idx * name_max_len]);
      ii = bsearch_str_nl(bufptr, sorted_ids, max_id_len, item_ct);
      if (ii == -1) {
        goto string_range_list_to_bitarr2_ret_INVALID_CMDLINE_3;
      }
      item_uidx2 = id_map[(uint32_t)ii];
      if (item_uidx2 < item_uidx) {
	sprintf(g_logbuf, "Error: Second element of --%s range appears before first.\n", range_list_flag);
	goto string_range_list_to_bitarr2_ret_INVALID_CMDLINE_2;
      }
      clear_bits(item_uidx, item_uidx2 - item_uidx + 1, bitfield_excl);
    } else {
      // standalone name
      clear_bit(item_uidx, bitfield_excl);
    }
  }
  // shared error-exit ladder (while(0) so normal flow skips it)
  while (0) {
  string_range_list_to_bitarr2_ret_INVALID_CMDLINE_3:
    sprintf(g_logbuf, "Error: --%s ID not found.\n", range_list_flag);
  string_range_list_to_bitarr2_ret_INVALID_CMDLINE_2:
    logerrprintb();
    retval = RET_INVALID_CMDLINE;
    break;
  }
  return retval;
}
8258 
count_non_autosomal_markers(const Chrom_info * chrom_info_ptr,const uintptr_t * marker_exclude,uint32_t count_x,uint32_t count_mt)8259 uint32_t count_non_autosomal_markers(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t count_x, uint32_t count_mt) {
8260   // for backward compatibility, unplaced markers are considered to be
8261   // autosomal here
8262   const int32_t x_code = chrom_info_ptr->xymt_codes[X_OFFSET];
8263   const int32_t y_code = chrom_info_ptr->xymt_codes[Y_OFFSET];
8264   const int32_t mt_code = chrom_info_ptr->xymt_codes[MT_OFFSET];
8265   uint32_t ct = 0;
8266   if (count_x && (x_code != -2)) {
8267     ct += count_chrom_markers(chrom_info_ptr, marker_exclude, x_code);
8268   }
8269   if (y_code != -2) {
8270     ct += count_chrom_markers(chrom_info_ptr, marker_exclude, y_code);
8271   }
8272   if (count_mt && (mt_code != -2)) {
8273     ct += count_chrom_markers(chrom_info_ptr, marker_exclude, mt_code);
8274   }
8275   return ct;
8276 }
8277 
int32_t conditional_allocate_non_autosomal_markers(const Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, const uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr) {
  // if all markers are autosomal (or pseudoautosomal) diploid, nothing
  // happens.  otherwise, this creates a marker_exclude copy with
  // non-{autosomal diploid} markers excluded for the caller to use.
  // Returns 0 on success (*marker_exclude_ptr is only allocated when
  // *newly_excluded_ct_ptr ends up nonzero), RET_INVALID_CMDLINE if no
  // variants would remain, RET_NOMEM on allocation failure.
  const uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
  const int32_t* xymt_codes = chrom_info_ptr->xymt_codes;
  uint32_t xymt_cts[XYMT_OFFSET_CT];
  fill_uint_zero(XYMT_OFFSET_CT, xymt_cts);
  if (is_set(chrom_info_ptr->haploid_mask, 0)) {
    // bit 0 of haploid_mask set: presumably an all-haploid genome, so every
    // marker counts as non-{autosomal diploid} -- NOTE(review): confirm
    *newly_excluded_ct_ptr = marker_ct;
  } else {
    // xymt code -2 presumably marks a chromosome absent from the dataset
    if (count_x && (xymt_codes[X_OFFSET] != -2)) {
      xymt_cts[X_OFFSET] = count_chrom_markers(chrom_info_ptr, marker_exclude_orig, xymt_codes[X_OFFSET]);
    }
    if (xymt_codes[Y_OFFSET] != -2) {
      xymt_cts[Y_OFFSET] = count_chrom_markers(chrom_info_ptr, marker_exclude_orig, xymt_codes[Y_OFFSET]);
    }
    if (count_mt && (xymt_codes[MT_OFFSET] != -2)) {
      xymt_cts[MT_OFFSET] = count_chrom_markers(chrom_info_ptr, marker_exclude_orig, xymt_codes[MT_OFFSET]);
    }
    *newly_excluded_ct_ptr = xymt_cts[X_OFFSET] + xymt_cts[Y_OFFSET] + xymt_cts[MT_OFFSET];
  }
  if (*newly_excluded_ct_ptr) {
    LOGPRINTF("Excluding %u variant%s on non-autosomes from %s.\n", *newly_excluded_ct_ptr, (*newly_excluded_ct_ptr == 1)? "" : "s", calc_descrip);
  }
  if (*newly_excluded_ct_ptr == marker_ct) {
    logerrprint("Error: No variants remaining.\n");
    return RET_INVALID_CMDLINE;
  }
  if (!(*newly_excluded_ct_ptr)) {
    // nothing to exclude; caller keeps using marker_exclude_orig
    return 0;
  }
  if (bigstack_alloc_ul(unfiltered_marker_ctl, marker_exclude_ptr)) {
    return RET_NOMEM;
  }
  // copy original exclusions, then additionally exclude each counted
  // chromosome's full variant range
  memcpy(*marker_exclude_ptr, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
  for (uint32_t xymt_idx = 0; xymt_idx < XYMT_OFFSET_CT; ++xymt_idx) {
    if (xymt_cts[xymt_idx]) {
      const uint32_t chrom_fo_idx = chrom_info_ptr->chrom_idx_to_foidx[xymt_codes[xymt_idx]];
      fill_bits(chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx], chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1] - chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx], *marker_exclude_ptr);
    }
  }
  return 0;
}
8322 
get_max_chrom_size(const Chrom_info * chrom_info_ptr,const uintptr_t * marker_exclude,uint32_t * last_chrom_fo_idx_ptr)8323 uint32_t get_max_chrom_size(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr) {
8324   const uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
8325   uint32_t max_chrom_size = 0;
8326   uint32_t last_chrom_fo_idx = 0;
8327   for (uint32_t chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
8328     const uint32_t cur_chrom_size = count_chrom_markers(chrom_info_ptr, marker_exclude, chrom_info_ptr->chrom_file_order[chrom_fo_idx]);
8329     if (cur_chrom_size) {
8330       last_chrom_fo_idx = chrom_fo_idx;
8331       if (cur_chrom_size > max_chrom_size) {
8332         max_chrom_size = cur_chrom_size;
8333       }
8334     }
8335   }
8336   if (last_chrom_fo_idx_ptr) {
8337     *last_chrom_fo_idx_ptr = last_chrom_fo_idx;
8338   }
8339   return max_chrom_size;
8340 }
8341 
void count_genders(const uintptr_t* __restrict sex_nm, const uintptr_t* __restrict sex_male, const uintptr_t* __restrict sample_exclude, uintptr_t unfiltered_sample_ct, uint32_t* __restrict male_ct_ptr, uint32_t* __restrict female_ct_ptr, uint32_t* __restrict unk_ct_ptr) {
  // Counts male, female, and unknown-sex samples among non-excluded samples.
  // unfiltered_sample_ct can be zero
  uint32_t male_ct = 0;
  uint32_t female_ct = 0;
  uint32_t unk_ct = 0;
  uint32_t unfiltered_sample_ctld = unfiltered_sample_ct / BITCT;
  uint32_t unfiltered_sample_ct_rem = unfiltered_sample_ct & (BITCT - 1);
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t sample_bidx;
  for (sample_bidx = 0; sample_bidx < unfiltered_sample_ctld; sample_bidx++) {
    // ulii := inclusion mask for this word
    ulii = ~(*sample_exclude++);
    // the trailing-word handler below jumps here with its own masked ulii;
    // the label sits after the load above so that load is skipped.  After
    // the jumped-to pass, the for condition (sample_bidx already ==
    // unfiltered_sample_ctld) terminates the loop.
  count_genders_last_loop:
    uljj = *sex_nm++;
    // included samples whose sex is not known
    unk_ct += popcount_long(ulii & (~uljj));
    // restrict to included samples with known sex, then split by sex_male
    ulii &= uljj;
    uljj = *sex_male++;
    male_ct += popcount_long(ulii & uljj);
    female_ct += popcount_long(ulii & (~uljj));
  }
  if (unfiltered_sample_ct_rem) {
    // final partial word: mask off bits past the sample count, then reuse
    // the loop body once (rem zeroed so we can't re-enter)
    ulii = (~(*sample_exclude)) & ((ONELU << unfiltered_sample_ct_rem) - ONELU);
    unfiltered_sample_ct_rem = 0;
    goto count_genders_last_loop;
  }
  *male_ct_ptr = male_ct;
  *female_ct_ptr = female_ct;
  *unk_ct_ptr = unk_ct;
}
8371 
void reverse_loadbuf(uintptr_t unfiltered_sample_ct, unsigned char* loadbuf) {
  // Flips homozygote codes in a 2-bit genotype buffer: 00 <-> 11, while
  // 01/10 entries are left untouched.  Dispatches on buffer alignment
  // (16-byte SSE2 path, 4-byte uint32 path, byte-at-a-time fallback).
  // unfiltered_sample_ct can be zero
  uintptr_t sample_bidx = 0;
  unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
  unsigned char ucc;
  unsigned char ucc2;
  uintptr_t unfiltered_sample_ctd;
  uint32_t* loadbuf_alias32;
  uint32_t uii;
  uint32_t ujj;
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i* loadbuf_alias;
  __m128i vii;
  __m128i vjj;
  // todo: use this vector loop even when loadbuf is unaligned, so stuff like
  // recode_load_to() is faster
  if (!(((uintptr_t)loadbuf) & 15)) {
    loadbuf_alias = (__m128i*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / 64;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      vii = *loadbuf_alias;
      // we want to exchange 00 and 11, and leave 01/10 untouched.  So make
      // vjj := 11 iff vii is 00/11, and vjj := 00 otherwise; then xor.
      vjj = _mm_andnot_si128(_mm_xor_si128(vii, _mm_srli_epi64(vii, 1)), m1);
      vjj = _mm_or_si128(vjj, _mm_slli_epi64(vjj, 1));
      *loadbuf_alias++ = _mm_xor_si128(vii, vjj);
    }
    loadbuf = (unsigned char*)loadbuf_alias;
  } else if (!(((uintptr_t)loadbuf) & 3)) {
    // 4-byte-aligned fallback: same trick with 32-bit words
    loadbuf_alias32 = (uint32_t*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      // ujj: 01 in each slot where genotype bits are equal (00 or 11)
      ujj = 0x55555555 & (~(uii ^ (uii >> 1)));
      // *3 spreads that to 11 per slot; xor flips those slots
      ujj *= 3;
      *loadbuf_alias32++ = uii ^ ujj;
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
  }
#else
  if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      ujj = 0x55555555 & (~(uii ^ (uii >> 1)));
      ujj *= 3;
      *loadbuf_alias32++ = uii ^ ujj;
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
  }
#endif
  // byte-at-a-time cleanup for the unaligned prefix-free remainder
  for (; loadbuf < loadbuf_end;) {
    ucc = *loadbuf;
    ucc2 = 0x55 & (~(ucc ^ (ucc >> 1)));
    ucc2 *= 3;
    *loadbuf++ = ucc ^ ucc2;
  }
  uii = unfiltered_sample_ct & 3;
  if (uii) {
    // clear the pad genotypes in the final byte (00 padding became 11 above)
    loadbuf[-1] &= (0xff >> (8 - 2 * uii));
  }
}
8436 
// deprecated, try to just use copy_quaterarr_nonempty_subset()
void copy_quaterarr_nonempty_subset_excl(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_excl, uint32_t raw_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict output_quaterarr) {
  // Compacts raw_quaterarr (2 bits per entry) into output_quaterarr, keeping
  // only entries whose bit in subset_excl is CLEAR.  raw_quaterarr_size and
  // subset_size are entry counts, not word counts.  subset_size must be
  // nonzero; output is written through the final (possibly partial) word.
  assert(subset_size);
  assert(raw_quaterarr_size >= subset_size);
  uintptr_t cur_output_word = 0;
  uintptr_t* output_quaterarr_last = &(output_quaterarr[subset_size / BITCT2]);
  const uint32_t word_write_halfshift_end = subset_size % BITCT2;
  uint32_t word_write_halfshift = 0;
  // if < 2/3-filled, use sparse copy algorithm
  if (subset_size * (3 * ONELU) < raw_quaterarr_size * (2 * ONELU)) {
    // Sparse path: visit each included entry individually, two bits at a
    // time, via ctz over the inverted exclusion halfwords.
    const uint32_t subset_excl_widx_last = raw_quaterarr_size / BITCT;
    uint32_t subset_excl_widx = 0;
    while (1) {
      uintptr_t cur_include_word = ~subset_excl[subset_excl_widx];

      // this, kiddies, is why exclude masks were a mistake.
      // (the inverted final word has garbage 1-bits past raw_quaterarr_size
      // which must be masked off)
      if (subset_excl_widx == subset_excl_widx_last) {
	cur_include_word &= (ONELU << (raw_quaterarr_size % BITCT)) - ONELU;
      }

      if (cur_include_word) {
	uint32_t wordhalf_idx = 0;
	// one exclusion word covers two quaterarr words; process each half
#ifdef __LP64__
	uint32_t cur_include_halfword = (uint32_t)cur_include_word;
#else
	uint32_t cur_include_halfword = (uint16_t)cur_include_word;
#endif
	while (1) {
	  if (cur_include_halfword) {
	    uintptr_t raw_quaterarr_word = raw_quaterarr[subset_excl_widx * 2 + wordhalf_idx];
	    do {
	      // copy the 2-bit entry at the lowest remaining included index
	      uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
	      cur_output_word |= ((raw_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
	      if (++word_write_halfshift == BITCT2) {
		// output word full; flush it
		*output_quaterarr++ = cur_output_word;
		word_write_halfshift = 0;
		cur_output_word = 0;
	      }
	      // clear the lowest set bit
	      cur_include_halfword &= cur_include_halfword - 1;
	    } while (cur_include_halfword);
	  }
	  if (wordhalf_idx) {
	    break;
	  }
	  wordhalf_idx++;
#ifdef __LP64__
	  cur_include_halfword = cur_include_word >> 32;
#else
	  cur_include_halfword = cur_include_word >> 16;
#endif
	}
	// done when the output cursor reaches its final position (including
	// the partial-word state)
	if (output_quaterarr == output_quaterarr_last) {
	  if (word_write_halfshift == word_write_halfshift_end) {
            if (word_write_halfshift_end) {
	      *output_quaterarr_last = cur_output_word;
	    }
	    return;
	  }
	}
      }
      subset_excl_widx++;
    }
  }
  // blocked copy: move maximal runs of consecutive included entries at once
  const uintptr_t* subset_excl_last = &(subset_excl[raw_quaterarr_size / BITCT]);
  while (1) {
    uintptr_t cur_include_word = ~(*subset_excl);
    if (subset_excl == subset_excl_last) {
      // mask garbage 1-bits past raw_quaterarr_size in the inverted word
      cur_include_word &= (ONELU << (raw_quaterarr_size % BITCT)) - ONELU;
    }
    subset_excl++;
    uint32_t wordhalf_idx = 0;
#ifdef __LP64__
    uintptr_t cur_include_halfword = (uint32_t)cur_include_word;
#else
    uint32_t cur_include_halfword = (uint16_t)cur_include_word;
#endif
    while (1) {
      uintptr_t raw_quaterarr_word = *raw_quaterarr++;
      while (cur_include_halfword) {
	// locate the next run of included entries and its length
	uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
	uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
	uintptr_t raw_quaterarr_curblock_unmasked = raw_quaterarr_word >> (rqa_idx_lowbits * 2);
	uint32_t rqa_block_len = CTZLU(halfword_invshifted);
	uint32_t block_len_limit = BITCT2 - word_write_halfshift;
	cur_output_word |= raw_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
	if (rqa_block_len < block_len_limit) {
	  // run fits in the current output word; mask off copied-in garbage
	  word_write_halfshift += rqa_block_len;
	  cur_output_word &= (ONELU << (word_write_halfshift * 2)) - ONELU;
	} else {
	  // no need to mask, extra bits vanish off the high end
	  *output_quaterarr++ = cur_output_word;
	  word_write_halfshift = rqa_block_len - block_len_limit;
	  cur_output_word = (raw_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((ONELU << (2 * word_write_halfshift)) - ONELU);
	}
	// clear the bits of the run just consumed
	cur_include_halfword &= (~(ONELU << (rqa_block_len + rqa_idx_lowbits))) + ONELU;
      }
      if (wordhalf_idx) {
	break;
      }
      wordhalf_idx++;
#ifdef __LP64__
      cur_include_halfword = cur_include_word >> 32;
#else
      cur_include_halfword = cur_include_word >> 16;
#endif
    }
    if (output_quaterarr == output_quaterarr_last) {
      if (word_write_halfshift == word_write_halfshift_end) {
	if (word_write_halfshift_end) {
	  *output_quaterarr_last = cur_output_word;
	}
	return;
      }
    }
  }
}
8554 
load_and_collapse(uint32_t unfiltered_sample_ct,uint32_t sample_ct,const uintptr_t * __restrict sample_exclude,uintptr_t final_mask,uint32_t do_reverse,FILE * bedfile,uintptr_t * __restrict rawbuf,uintptr_t * __restrict mainbuf)8555 uint32_t load_and_collapse(uint32_t unfiltered_sample_ct, uint32_t sample_ct, const uintptr_t* __restrict sample_exclude, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict mainbuf) {
8556   assert(unfiltered_sample_ct);
8557   uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
8558   if (unfiltered_sample_ct == sample_ct) {
8559     rawbuf = mainbuf;
8560   }
8561   if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
8562     return RET_READ_FAIL;
8563   }
8564   if (unfiltered_sample_ct != sample_ct) {
8565     copy_quaterarr_nonempty_subset_excl(rawbuf, sample_exclude, unfiltered_sample_ct, sample_ct, mainbuf);
8566   } else {
8567     rawbuf[(unfiltered_sample_ct - 1) / BITCT2] &= final_mask;
8568   }
8569   if (do_reverse) {
8570     reverse_loadbuf(sample_ct, (unsigned char*)mainbuf);
8571   }
8572   return 0;
8573 }
8574 
void copy_quaterarr_nonempty_subset(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_mask, uint32_t raw_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict output_quaterarr) {
  // Copies the 2-bit entries of raw_quaterarr selected by the set bits of
  // subset_mask into output_quaterarr, packed contiguously.  Each subset_mask
  // bit corresponds to one 2-bit entry, so one mask word covers two quaterarr
  // words.  subset_size is the number of set mask bits and must be positive;
  // raw_quaterarr_size is the total entry count of raw_quaterarr.
  //
  // in plink 2.0, we probably want (0-based) bit raw_quaterarr_size of
  // subset_mask to be always allocated and unset.  This removes a few special
  // cases re: iterating past the end of arrays.
  assert(subset_size);
  assert(raw_quaterarr_size >= subset_size);
  uintptr_t cur_output_word = 0;
  // address of the final (possibly partial) output word
  uintptr_t* output_quaterarr_last = &(output_quaterarr[subset_size / BITCT2]);
  // number of entries in that final word (0 means it is exactly full)
  const uint32_t word_write_halfshift_end = subset_size % BITCT2;
  uint32_t word_write_halfshift = 0;
  // if < 2/3-filled, use sparse copy algorithm
  if (subset_size * (3 * ONELU) < raw_quaterarr_size * (2 * ONELU)) {
    // sparse path: iterate over individual set mask bits, copying one 2-bit
    // entry per bit
    uint32_t subset_mask_widx = 0;
    while (1) {
      const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
      if (cur_include_word) {
        uint32_t wordhalf_idx = 0;
        // process the mask word a halfword at a time, since each halfword
        // lines up with exactly one raw_quaterarr word
#ifdef __LP64__
        uint32_t cur_include_halfword = (uint32_t)cur_include_word;
#else
        uint32_t cur_include_halfword = (uint16_t)cur_include_word;
#endif
        while (1) {
          if (cur_include_halfword) {
            uintptr_t raw_quaterarr_word = raw_quaterarr[subset_mask_widx * 2 + wordhalf_idx];
            do {
              // append the selected 2-bit entry to the output accumulator
              uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
              cur_output_word |= ((raw_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
              if (++word_write_halfshift == BITCT2) {
                // accumulator full; flush it
                *output_quaterarr++ = cur_output_word;
                word_write_halfshift = 0;
                cur_output_word = 0;
              }
              // clear lowest set bit
              cur_include_halfword &= cur_include_halfword - 1;
            } while (cur_include_halfword);
          }
          if (wordhalf_idx) {
            break;
          }
          // switch to the high halfword of the same mask word
          wordhalf_idx++;
#ifdef __LP64__
          cur_include_halfword = cur_include_word >> 32;
#else
          cur_include_halfword = cur_include_word >> 16;
#endif
        }
        if (output_quaterarr == output_quaterarr_last) {
          if (word_write_halfshift == word_write_halfshift_end) {
            if (word_write_halfshift_end) {
              // flush the final partial word
              *output_quaterarr_last = cur_output_word;
            }
            return;
          }
        }
      }
      subset_mask_widx++;
    }
  }
  // blocked copy: grab each maximal run of consecutive set mask bits (and
  // the corresponding run of 2-bit entries) in one shot
  while (1) {
    const uintptr_t cur_include_word = *subset_mask++;
    uint32_t wordhalf_idx = 0;
#ifdef __LP64__
    uintptr_t cur_include_halfword = (uint32_t)cur_include_word;
#else
    uint32_t cur_include_halfword = (uint16_t)cur_include_word;
#endif
    while (1) {
      uintptr_t raw_quaterarr_word = *raw_quaterarr++;
      while (cur_include_halfword) {
        // start of the next run of set bits
        uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
        uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
        uintptr_t raw_quaterarr_curblock_unmasked = raw_quaterarr_word >> (rqa_idx_lowbits * 2);
        // length of the run, in entries
        uint32_t rqa_block_len = CTZLU(halfword_invshifted);
        // remaining space in the current output word, in entries
        uint32_t block_len_limit = BITCT2 - word_write_halfshift;
        cur_output_word |= raw_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
        if (rqa_block_len < block_len_limit) {
          word_write_halfshift += rqa_block_len;
          // mask off garbage bits copied past the end of the run
          cur_output_word &= (ONELU << (word_write_halfshift * 2)) - ONELU;
        } else {
          // no need to mask, extra bits vanish off the high end
          *output_quaterarr++ = cur_output_word;
          word_write_halfshift = rqa_block_len - block_len_limit;
          if (word_write_halfshift) {
            // carry the overflow portion of the run into the next word
            cur_output_word = (raw_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((ONELU << (2 * word_write_halfshift)) - ONELU);
          } else {
            // avoid potential right-shift-64
            cur_output_word = 0;
          }
        }
        // clear the run we just copied
        cur_include_halfword &= (~(ONELU << (rqa_block_len + rqa_idx_lowbits))) + ONELU;
      }
      if (wordhalf_idx) {
        break;
      }
      wordhalf_idx++;
#ifdef __LP64__
      cur_include_halfword = cur_include_word >> 32;
#else
      cur_include_halfword = cur_include_word >> 16;
#endif
    }
    if (output_quaterarr == output_quaterarr_last) {
      if (word_write_halfshift == word_write_halfshift_end) {
        if (word_write_halfshift_end) {
          // flush the final partial word
          *output_quaterarr_last = cur_output_word;
        }
        return;
      }
    }
  }
}
8687 
8688 /*
8689 void inplace_quaterarr_proper_subset(const uintptr_t* __restrict subset_mask, uint32_t orig_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict main_quaterarr) {
8690   assert(orig_quaterarr_size > subset_size);
8691   // worthwhile to special-case this since we get to entirely skip
8692   // reading/writing these words
8693   if (!(~subset_mask[0])) {
8694     const uintptr_t* subset_mask_initial = subset_mask;
8695     // guaranteed to terminate since orig_quaterarr_size > subset_size.
8696     do {
8697       subset_mask++;
8698     } while (!(~subset_mask[0]));
8699     const uint32_t quaterarr_word_skip_ct = 2 * ((uintptr_t)(subset_mask - subset_mask_initial));
8700     main_quaterarr = &(main_quaterarr[quaterarr_word_skip_ct]);
8701     const uint32_t item_skip_ct = quaterarr_word_skip_ct * BITCT2;
8702     orig_quaterarr_size -= item_skip_ct;
8703     subset_size -= item_skip_ct;
8704   }
8705   uintptr_t cur_output_word = 0;
8706   uintptr_t* main_quaterarr_writer = main_quaterarr;
8707   uintptr_t* main_quaterarr_write_last = &(main_quaterarr[subset_size / BITCT2]);
8708   const uint32_t word_write_halfshift_end = subset_size % BITCT2;
8709   uint32_t word_write_halfshift = 0;
8710   // if <= 2/3-filled, use sparse copy algorithm
8711   if (subset_size * (3 * ONELU) <= orig_quaterarr_size * (2 * ONELU)) {
8712     uint32_t subset_mask_widx = 0;
8713     while (1) {
8714       const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
8715       if (cur_include_word) {
8716 	uint32_t wordhalf_idx = 0;
8717 #ifdef __LP64__
8718 	uint32_t cur_include_halfword = (uint32_t)cur_include_word;
8719 #else
8720 	uint32_t cur_include_halfword = (uint16_t)cur_include_word;
8721 #endif
8722 	while (1) {
8723 	  if (cur_include_halfword) {
8724 	    uintptr_t orig_quaterarr_word = main_quaterarr[subset_mask_widx * 2 + wordhalf_idx];
8725 	    do {
8726 	      uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
8727 	      cur_output_word |= ((orig_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
8728 	      if (++word_write_halfshift == BITCT2) {
8729 		*main_quaterarr_writer++ = cur_output_word;
8730 		word_write_halfshift = 0;
8731 		cur_output_word = 0;
8732 	      }
8733 	      cur_include_halfword &= cur_include_halfword - 1;
8734 	    } while (cur_include_halfword);
8735 	  }
8736 	  if (wordhalf_idx) {
8737 	    break;
8738 	  }
8739 	  wordhalf_idx++;
8740 #ifdef __LP64__
8741 	  cur_include_halfword = cur_include_word >> 32;
8742 #else
8743 	  cur_include_halfword = cur_include_word >> 16;
8744 #endif
8745 	}
8746 	if (main_quaterarr_writer == main_quaterarr_write_last) {
8747 	  if (word_write_halfshift == word_write_halfshift_end) {
8748             if (word_write_halfshift_end) {
8749 	      *main_quaterarr_writer = cur_output_word;
8750 	    }
8751 	    return;
8752 	  }
8753 	}
8754       }
8755       subset_mask_widx++;
8756     }
8757   }
8758   // blocked copy
8759   while (1) {
8760     const uintptr_t cur_include_word = *subset_mask++;
8761     uint32_t wordhalf_idx = 0;
8762 #ifdef __LP64__
8763     uintptr_t cur_include_halfword = (uint32_t)cur_include_word;
8764 #else
8765     uint32_t cur_include_halfword = (uint16_t)cur_include_word;
8766 #endif
8767     while (1) {
8768       uintptr_t orig_quaterarr_word = *main_quaterarr++;
8769       while (cur_include_halfword) {
8770 	uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
8771 	uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
8772 	uintptr_t orig_quaterarr_curblock_unmasked = orig_quaterarr_word >> (rqa_idx_lowbits * 2);
8773 	uint32_t rqa_block_len = CTZLU(halfword_invshifted);
8774 	uint32_t block_len_limit = BITCT2 - word_write_halfshift;
8775 	cur_output_word |= orig_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
8776 	if (rqa_block_len < block_len_limit) {
8777 	  word_write_halfshift += rqa_block_len;
8778 	  cur_output_word &= (ONELU << (word_write_halfshift * 2)) - ONELU;
8779 	} else {
8780 	  // no need to mask, extra bits vanish off the high end
8781 
8782 	  *main_quaterarr_writer++ = cur_output_word;
8783 	  word_write_halfshift = rqa_block_len - block_len_limit;
8784 	  if (word_write_halfshift) {
8785 	    cur_output_word = (orig_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((ONELU << (2 * word_write_halfshift)) - ONELU);
8786 	  } else {
8787 	    cur_output_word = 0;
8788 	  }
8789 	}
8790 	cur_include_halfword &= (~(ONELU << (rqa_block_len + rqa_idx_lowbits))) + ONELU;
8791       }
8792       if (wordhalf_idx) {
8793 	break;
8794       }
8795       wordhalf_idx++;
8796 #ifdef __LP64__
8797       cur_include_halfword = cur_include_word >> 32;
8798 #else
8799       cur_include_halfword = cur_include_word >> 16;
8800 #endif
8801     }
8802     if (main_quaterarr_writer == main_quaterarr_write_last) {
8803       if (word_write_halfshift == word_write_halfshift_end) {
8804 	if (word_write_halfshift_end) {
8805 	  *main_quaterarr_writer = cur_output_word;
8806 	}
8807 	return;
8808       }
8809     }
8810   }
8811 }
8812 */
8813 
load_and_collapse_incl(uint32_t unfiltered_sample_ct,uint32_t sample_ct,const uintptr_t * __restrict sample_include,uintptr_t final_mask,uint32_t do_reverse,FILE * bedfile,uintptr_t * __restrict rawbuf,uintptr_t * __restrict mainbuf)8814 uint32_t load_and_collapse_incl(uint32_t unfiltered_sample_ct, uint32_t sample_ct, const uintptr_t* __restrict sample_include, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict mainbuf) {
8815   assert(unfiltered_sample_ct);
8816   uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
8817   if (unfiltered_sample_ct == sample_ct) {
8818     rawbuf = mainbuf;
8819   }
8820   if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
8821     return RET_READ_FAIL;
8822   }
8823   if (unfiltered_sample_ct != sample_ct) {
8824     copy_quaterarr_nonempty_subset(rawbuf, sample_include, unfiltered_sample_ct, sample_ct, mainbuf);
8825   } else {
8826     mainbuf[(unfiltered_sample_ct - 1) / BITCT2] &= final_mask;
8827   }
8828   if (do_reverse) {
8829     reverse_loadbuf(sample_ct, (unsigned char*)mainbuf);
8830   }
8831   return 0;
8832 }
8833 
8834 /*
8835 uint32_t load_and_collapse_incl_inplace(const uintptr_t* __restrict sample_include, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict mainbuf) {
8836   // mainbuf must be large enough to store unfiltered data
8837   uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
8838   if (load_raw(unfiltered_sample_ct4, bedfile, mainbuf)) {
8839     return RET_READ_FAIL;
8840   }
8841   if (unfiltered_sample_ct == sample_ct) {
8842     mainbuf[(unfiltered_sample_ct - 1) / BITCT2] &= final_mask;
8843   } else {
8844     inplace_quaterarr_proper_subset(sample_include, unfiltered_sample_ct, sample_ct, mainbuf);
8845   }
8846   if (do_reverse) {
8847     reverse_loadbuf(sample_ct, (unsigned char*)mainbuf);
8848   }
8849   return 0;
8850 }
8851 */
8852 
uint32_t load_and_split(uint32_t unfiltered_sample_ct, const uintptr_t* __restrict pheno_nm, const uintptr_t* __restrict pheno_c, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict casebuf, uintptr_t* __restrict ctrlbuf) {
  // Reads one variant record from bedfile into rawbuf, then splits the 2-bit
  // genotype entries of samples with nonmissing phenotypes (pheno_nm) into
  // casebuf (pheno_c bit set) and ctrlbuf (pheno_c bit clear), each packed
  // contiguously.  Returns 0 on success, RET_READ_FAIL on read failure.
  // add do_reverse later if needed
  uintptr_t* rawbuf_end = &(rawbuf[unfiltered_sample_ct / BITCT2]);
  uintptr_t case_word = 0;
  uintptr_t ctrl_word = 0;
  uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uint32_t case_shift2 = 0;
  uint32_t ctrl_shift2 = 0;
  uint32_t read_shift_max = BITCT2;
  uint32_t sample_uidx = 0;
  uint32_t read_shift;
  uintptr_t read_word;
  uintptr_t ulii;
  if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
    return RET_READ_FAIL;
  }
  while (1) {
    // first pass over full input words; a second pass (see rawbuf_end++
    // below) handles the final partial word, if any
    while (rawbuf < rawbuf_end) {
      read_word = *rawbuf++;
      for (read_shift = 0; read_shift < read_shift_max; sample_uidx++, read_shift++) {
	if (is_set(pheno_nm, sample_uidx)) {
	  // current 2-bit genotype entry
	  ulii = read_word & 3;
	  if (is_set(pheno_c, sample_uidx)) {
	    case_word |= ulii << case_shift2;
	    case_shift2 += 2;
	    if (case_shift2 == BITCT) {
	      // case output word full; flush
	      *casebuf++ = case_word;
	      case_word = 0;
	      case_shift2 = 0;
	    }
	  } else {
	    ctrl_word |= ulii << ctrl_shift2;
	    ctrl_shift2 += 2;
	    if (ctrl_shift2 == BITCT) {
	      // control output word full; flush
	      *ctrlbuf++ = ctrl_word;
	      ctrl_word = 0;
	      ctrl_shift2 = 0;
	    }
	  }
	}
	read_word >>= 2;
      }
    }
    if (sample_uidx == unfiltered_sample_ct) {
      // flush partial output words
      if (case_shift2) {
	*casebuf = case_word;
      }
      if (ctrl_shift2) {
	*ctrlbuf = ctrl_word;
      }
      return 0;
    }
    // extend the loop bound by one word and restrict the per-word entry
    // count, so the next iteration processes just the final partial word
    rawbuf_end++;
    read_shift_max = unfiltered_sample_ct % BITCT2;
  }
}
8909 
void init_quaterarr_from_bitarr(const uintptr_t* __restrict bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict new_quaterarr) {
  // Expands bitarr into a quaterarr: each set bit becomes a 01 entry, each
  // clear bit a 00 entry.  One bitarr word expands into two quaterarr words.
  // Entries past unfiltered_sample_ct in the last word pair are zeroed.
  // allows unfiltered_sample_ct == 0
  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  uintptr_t ulmm;
  uint32_t bit_idx;
  while (unfiltered_sample_ctl) {
    // start from all-01 words and clear entries for the ZERO bits of bitarr,
    // i.e. the set bits of its complement
    ulii = ~(*bitarr++);
    ulkk = FIVEMASK;
    ulmm = FIVEMASK;
    if (ulii) {
      // high halfword drives the second output word
      uljj = ulii >> BITCT2;
#ifdef __LP64__
      ulii &= 0xffffffffLLU;
#else
      ulii &= 0xffffLU;
#endif
      if (ulii) {
	do {
	  bit_idx = CTZLU(ulii);
	  ulkk &= ~(ONELU << (bit_idx * 2));
	  ulii &= ulii - 1;
	} while (ulii);
      }
      if (uljj) {
	do {
	  bit_idx = CTZLU(uljj);
	  ulmm &= ~(ONELU << (bit_idx * 2));
	  uljj &= uljj - 1;
	} while (uljj);
      }
    }
    *new_quaterarr++ = ulkk;
    *new_quaterarr++ = ulmm;
    --unfiltered_sample_ctl;
  }
  // zero out entries past the end in the final word pair
  ulii = unfiltered_sample_ct & (BITCT - 1);
  if (ulii) {
    new_quaterarr--;
    if (ulii < BITCT2) {
      // entire second word of the pair is past the end
      *new_quaterarr-- = 0;
    } else {
      ulii -= BITCT2;
    }
    *new_quaterarr &= (ONELU << (ulii * 2)) - ONELU;
  }
}
8959 
void init_quaterarr_from_inverted_bitarr(const uintptr_t* __restrict inverted_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict new_quaterarr) {
  // Same as init_quaterarr_from_bitarr(), but the input bitarr is already
  // inverted: each CLEAR bit becomes a 01 entry, each set bit a 00 entry.
  // allows unfiltered_sample_ct == 0
  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  uintptr_t ulmm;
  uint32_t bit_idx;
  while (unfiltered_sample_ctl) {
    // start from all-01 words and clear entries for the set bits of the
    // inverted bitarr
    ulii = *inverted_bitarr++;
    ulkk = FIVEMASK;
    ulmm = FIVEMASK;
    if (ulii) {
      // high halfword drives the second output word
      uljj = ulii >> BITCT2;
#ifdef __LP64__
      ulii &= 0xffffffffLLU;
#else
      ulii &= 0xffffLU;
#endif
      if (ulii) {
	do {
	  bit_idx = CTZLU(ulii);
	  ulkk &= ~(ONELU << (bit_idx * 2));
	  ulii &= ulii - 1;
	} while (ulii);
      }
      if (uljj) {
	do {
	  bit_idx = CTZLU(uljj);
	  ulmm &= ~(ONELU << (bit_idx * 2));
	  uljj &= uljj - 1;
	} while (uljj);
      }
    }
    *new_quaterarr++ = ulkk;
    *new_quaterarr++ = ulmm;
    --unfiltered_sample_ctl;
  }
  // zero out entries past the end in the final word pair
  ulii = unfiltered_sample_ct & (BITCT - 1);
  if (ulii) {
    new_quaterarr--;
    if (ulii < BITCT2) {
      // entire second word of the pair is past the end
      *new_quaterarr-- = 0;
    } else {
      ulii -= BITCT2;
    }
    *new_quaterarr &= (ONELU << (ulii * 2)) - ONELU;
  }
}
9009 
void quatervec_01_init_invert(const uintptr_t* __restrict source_quatervec, uintptr_t entry_ct, uintptr_t* __restrict target_quatervec) {
  // Initializes a quatervec as the inverse of another.
  // For a 00/01-valued source: 01 entries become 00 and vice versa
  // (target = FIVEMASK & ~source); entries past entry_ct are zeroed.
  // Some modifications needed for AVX2.
  uint32_t vec_wsize = QUATERCT_TO_ALIGNED_WORDCT(entry_ct);
  // number of entries in the final (partial) word pair, 0 if exactly full
  uint32_t rem = entry_ct & (BITCT - 1);
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i* tptr = (__m128i*)target_quatervec;
  __m128i* sptr = (__m128i*)source_quatervec;
  __m128i* tptr_end = (__m128i*)(&(target_quatervec[vec_wsize]));
  uintptr_t* second_to_last;
  while (tptr < tptr_end) {
    // m1 & ~source
    *tptr++ = _mm_andnot_si128(*sptr++, m1);
  }
  if (rem) {
    // mask out inverted entries past entry_ct in the last two words
    second_to_last = &(((uintptr_t*)tptr_end)[-2]);
    if (rem > BITCT2) {
      second_to_last[1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
    } else {
      *second_to_last &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
      second_to_last[1] = 0;
    }
  }
#else
  uintptr_t* tptr_end = &(target_quatervec[vec_wsize]);
  while (target_quatervec < tptr_end) {
    *target_quatervec++ = FIVEMASK & (~(*source_quatervec++));
  }
  if (rem) {
    // mask out inverted entries past entry_ct in the last two words
    if (rem > BITCT2) {
      target_quatervec[-1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
    } else {
      target_quatervec[-2] &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
      target_quatervec[-1] = 0;
    }
  }

#endif
}
9049 
bitvec_andnot_copy(const uintptr_t * __restrict source_vec,const uintptr_t * __restrict exclude_vec,uintptr_t word_ct,uintptr_t * __restrict target_vec)9050 void bitvec_andnot_copy(const uintptr_t* __restrict source_vec, const uintptr_t* __restrict exclude_vec, uintptr_t word_ct, uintptr_t* __restrict target_vec) {
9051   // target_vec := source_vec ANDNOT exclude_vec
9052   // may write an extra word
9053   assert(word_ct);
9054 #ifdef __LP64__
9055   __m128i* tptr = (__m128i*)target_vec;
9056   __m128i* sptr = (__m128i*)source_vec;
9057   __m128i* xptr = (__m128i*)exclude_vec;
9058   __m128i* tptr_end = (__m128i*)(&(target_vec[round_up_pow2(word_ct, VEC_WORDS)]));
9059   do {
9060     *tptr++ = _mm_andnot_si128(*xptr++, *sptr++);
9061   } while (tptr < tptr_end);
9062 #else
9063   uintptr_t* tptr_end = &(target_vec[word_ct]);
9064   do {
9065     *target_vec++ = (*source_vec++) & (~(*exclude_vec++));
9066   } while (target_vec < tptr_end);
9067 #endif
9068 }
9069 
void apply_bitarr_mask_to_quaterarr_01(const uintptr_t* __restrict mask_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* main_quaterarr) {
  // Zeroes out the 2-bit entries of main_quaterarr whose corresponding
  // mask_bitarr bit is CLEAR.  One bitarr word covers two quaterarr words.
  // allows unfiltered_sample_ct == 0
  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  uintptr_t ulmm;
  uint32_t bit_idx;
  while (unfiltered_sample_ctl) {
    // iterate over set bits of the complemented mask word
    ulii = ~(*mask_bitarr++);
    ulkk = *main_quaterarr;
    ulmm = main_quaterarr[1];
    if (ulii) {
      // high halfword drives the second quaterarr word
      uljj = ulii >> BITCT2;
#ifdef __LP64__
      ulii &= 0xffffffffLLU;
#else
      ulii &= 0xffffLU;
#endif
      if (ulii) {
	do {
	  bit_idx = CTZLU(ulii);
	  ulkk &= ~(ONELU << (bit_idx * 2));
	  ulii &= ulii - 1;
	} while (ulii);
      }
      if (uljj) {
	do {
	  bit_idx = CTZLU(uljj);
	  ulmm &= ~(ONELU << (bit_idx * 2));
	  uljj &= uljj - 1;
	} while (uljj);
      }
    }
    *main_quaterarr++ = ulkk;
    *main_quaterarr++ = ulmm;
    --unfiltered_sample_ctl;
  }
}
9109 
apply_bitarr_excl_to_quaterarr_01(const uintptr_t * __restrict excl_bitarr,uintptr_t unfiltered_sample_ct,uintptr_t * __restrict main_quaterarr)9110 void apply_bitarr_excl_to_quaterarr_01(const uintptr_t* __restrict excl_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict main_quaterarr) {
9111   assert(unfiltered_sample_ct);
9112   uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
9113   uintptr_t ulii;
9114   uintptr_t uljj;
9115   uintptr_t ulkk;
9116   uintptr_t ulmm;
9117   uint32_t bit_idx;
9118   do {
9119     ulii = *excl_bitarr++;
9120     ulkk = *main_quaterarr;
9121     ulmm = main_quaterarr[1];
9122     if (ulii) {
9123       uljj = ulii >> BITCT2;
9124 #ifdef __LP64__
9125       ulii &= 0xffffffffLLU;
9126 #else
9127       ulii &= 0xffffLU;
9128 #endif
9129       if (ulii) {
9130 	do {
9131 	  bit_idx = CTZLU(ulii);
9132 	  ulkk &= ~(ONELU << (bit_idx * 2));
9133 	  ulii &= ulii - 1;
9134 	} while (ulii);
9135       }
9136       if (uljj) {
9137 	do {
9138 	  bit_idx = CTZLU(uljj);
9139 	  ulmm &= ~(ONELU << (bit_idx * 2));
9140 	  uljj &= uljj - 1;
9141 	} while (uljj);
9142       }
9143     }
9144     *main_quaterarr++ = ulkk;
9145     *main_quaterarr++ = ulmm;
9146   } while (--unfiltered_sample_ctl);
9147 }
9148 
apply_excl_intersect_to_quaterarr_01(const uintptr_t * __restrict excl_bitarr_1,const uintptr_t * __restrict excl_bitarr_2,uintptr_t unfiltered_sample_ct,uintptr_t * __restrict main_quaterarr)9149 void apply_excl_intersect_to_quaterarr_01(const uintptr_t* __restrict excl_bitarr_1, const uintptr_t* __restrict excl_bitarr_2, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict main_quaterarr) {
9150   assert(unfiltered_sample_ct);
9151   uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
9152   uintptr_t ulii;
9153   uintptr_t uljj;
9154   uintptr_t ulkk;
9155   uintptr_t ulmm;
9156   uint32_t bit_idx;
9157   do {
9158     ulii = (*excl_bitarr_1++) & (*excl_bitarr_2++);
9159     ulkk = *main_quaterarr;
9160     ulmm = main_quaterarr[1];
9161     if (ulii) {
9162       uljj = ulii >> BITCT2;
9163 #ifdef __LP64__
9164       ulii &= 0xffffffffLLU;
9165 #else
9166       ulii &= 0xffffLU;
9167 #endif
9168       if (ulii) {
9169 	do {
9170 	  bit_idx = CTZLU(ulii);
9171 	  ulkk &= ~(ONELU << (bit_idx * 2));
9172 	  ulii &= ulii - 1;
9173 	} while (ulii);
9174       }
9175       if (uljj) {
9176 	do {
9177 	  bit_idx = CTZLU(uljj);
9178 	  ulmm &= ~(ONELU << (bit_idx * 2));
9179 	  uljj &= uljj - 1;
9180 	} while (uljj);
9181       }
9182     }
9183     *main_quaterarr++ = ulkk;
9184     *main_quaterarr++ = ulmm;
9185   } while (--unfiltered_sample_ctl);
9186 }
9187 
quatervec_copy_only_01(const uintptr_t * __restrict input_quatervec,uintptr_t unfiltered_sample_ct,uintptr_t * __restrict output_quatervec)9188 void quatervec_copy_only_01(const uintptr_t* __restrict input_quatervec, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict output_quatervec) {
9189   // initializes result_ptr bits 01 iff input_quatervec bits are 01
9190   assert(unfiltered_sample_ct);
9191 #ifdef __LP64__
9192   const __m128i m1 = {FIVEMASK, FIVEMASK};
9193   __m128i* vec2_read = (__m128i*)input_quatervec;
9194   __m128i* read_end = &(vec2_read[QUATERCT_TO_VECCT(unfiltered_sample_ct)]);
9195   __m128i* vec2_write = (__m128i*)output_quatervec;
9196   __m128i loader;
9197   do {
9198     loader = *vec2_read++;
9199     *vec2_write++ = _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader, 1), loader), m1);
9200   } while (vec2_read < read_end);
9201 #else
9202   const uintptr_t* read_end = &(input_quatervec[QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct)]);
9203   uintptr_t loader;
9204   do {
9205     loader = *input_quatervec++;
9206     *output_quatervec++ = loader & (~(loader >> 1)) & FIVEMASK;
9207   } while (input_quatervec < read_end);
9208 #endif
9209 }
9210 
void quatervec_01_invert(uintptr_t unfiltered_sample_ct, uintptr_t* main_quatervec) {
  // In-place XOR with 01...01: for a 00/01-valued quatervec this flips every
  // entry (00 <-> 01).  Entries past unfiltered_sample_ct in the last word
  // are left untouched.
  uintptr_t* vec2_last = &(main_quatervec[unfiltered_sample_ct / BITCT2]);
  // entry count in the final partial word, 0 if exactly full
  uint32_t remainder = unfiltered_sample_ct & (BITCT2 - 1);
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i* vec2_128 = (__m128i*)main_quatervec;
  __m128i* vec2_last128 = &(vec2_128[unfiltered_sample_ct / BITCT]);
  while (vec2_128 < vec2_last128) {
    *vec2_128 = _mm_xor_si128(*vec2_128, m1);
    vec2_128++;
  }
  main_quatervec = (uintptr_t*)vec2_128;
  // at most one full word remains after the vector loop
  if (main_quatervec != vec2_last) {
    *main_quatervec = (*main_quatervec) ^ FIVEMASK;
    main_quatervec++;
  }
#else
  while (main_quatervec != vec2_last) {
    *main_quatervec = (*main_quatervec) ^ FIVEMASK;
    main_quatervec++;
  }
#endif
  if (remainder) {
    // flip only the first 'remainder' entries of the final word
    *vec2_last = *vec2_last ^ (FIVEMASK >> (2 * (BITCT2 - remainder)));
  }
}
9237 
void vec_datamask(uintptr_t unfiltered_sample_ct, uint32_t matchval, uintptr_t* data_ptr, uintptr_t* mask_ptr, uintptr_t* result_ptr) {
  // vec_ptr assumed to be standard 00/01 bit vector
  // sets result_vec bits to 01 iff data_ptr bits are equal to matchval and
  // vec_ptr bit is set, 00 otherwise.
  // currently assumes matchval is not 1.
  assert(unfiltered_sample_ct);
#ifdef __LP64__
  __m128i* data_read = (__m128i*)data_ptr;
  __m128i* mask_read = (__m128i*)mask_ptr;
  __m128i* data_read_end = &(data_read[QUATERCT_TO_VECCT(unfiltered_sample_ct)]);
  __m128i* writer = (__m128i*)result_ptr;
  __m128i loader;
#else
  uintptr_t* data_read_end = &(data_ptr[QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct)]);
  uintptr_t loader;
#endif
  if (matchval) {
    if (matchval == 2) {
      // entry == 2 (binary 10): high bit set, low bit clear
#ifdef __LP64__
      do {
        loader = *data_read++;
        *writer++ = _mm_and_si128(_mm_andnot_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
      } while (data_read < data_read_end);
#else
      do {
	loader = *data_ptr++;
        *result_ptr++ = (~loader) & (loader >> 1) & (*mask_ptr++);
      } while (data_ptr < data_read_end);
#endif
    } else {
      // entry == 3 (binary 11): both bits set
#ifdef __LP64__
      do {
        loader = *data_read++;
        *writer++ = _mm_and_si128(_mm_and_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
      } while (data_read < data_read_end);
#else
      do {
        loader = *data_ptr++;
        *result_ptr++ = loader & (loader >> 1) & (*mask_ptr++);
      } while (data_ptr < data_read_end);
#endif
    }
  } else {
    // entry == 0 (binary 00): both bits clear
#ifdef __LP64__
    do {
      loader = *data_read++;
      *writer++ = _mm_andnot_si128(_mm_or_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
    } while (data_read < data_read_end);
#else
    do {
      loader = *data_ptr++;
      *result_ptr++ = (~(loader | (loader >> 1))) & (*mask_ptr++);
    } while (data_ptr < data_read_end);
#endif
  }
}
9294 
9295 /*
9296 void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct) {
9297 #ifdef __LP64__
9298   const __m128i m1 = {FIVEMASK, FIVEMASK};
9299   __m128i* vptr = (__m128i*)lptr;
9300   __m128i* vend = (__m128i*)(&(lptr[word_ct]));
9301   __m128i vii;
9302   __m128i vjj;
9303   do {
9304     // new high bit set iff old low bit was set
9305     // new low bit set iff old bits differed
9306     vii = *vptr;
9307     vjj = _mm_and_si128(vii, m1); // old low bit
9308     vii = _mm_and_si128(_mm_srli_epi64(vii, 1), m1); // old high bit, shifted
9309     *vptr = _mm_or_si128(_mm_slli_epi64(vjj, 1), _mm_xor_si128(vii, vjj));
9310   } while (++vptr != vend);
9311 #else
9312   uintptr_t* lend = &(lptr[word_ct]);
9313   uintptr_t ulii;
9314   uintptr_t uljj;
9315   do {
9316     ulii = *lptr;
9317     uljj = ulii & FIVEMASK;
9318     ulii = (ulii >> 1) & FIVEMASK;
9319     *lptr = ulii ^ (uljj * 3);
9320   } while (++lptr != lend);
9321 #endif
9322 }
9323 */
9324 
9325 // this was "rotate_plink1_to_plink2_...", until I noticed that the plink2
9326 // format should store alt allele counts instead of ref allele counts.
rotate_plink1_to_a2ct_and_copy(uintptr_t * loadbuf,uintptr_t * writebuf,uintptr_t word_ct)9327 void rotate_plink1_to_a2ct_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, uintptr_t word_ct) {
9328   // assumes positive word_ct
9329   uintptr_t* loadbuf_end = &(loadbuf[word_ct]);
9330   uintptr_t ulii;
9331   uintptr_t uljj;
9332   do {
9333     ulii = *loadbuf++;
9334     uljj = ulii & FIVEMASK;
9335     ulii = (ulii >> 1) & FIVEMASK;
9336     *writebuf++ = ulii ^ (uljj * 3);
9337   } while (loadbuf < loadbuf_end);
9338 }
9339 
extract_collapsed_missing_bitfield(uintptr_t * lptr,uintptr_t unfiltered_sample_ct,uintptr_t * sample_include_quaterarr,uintptr_t sample_ct,uintptr_t * missing_bitfield)9340 void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_include_quaterarr, uintptr_t sample_ct, uintptr_t* missing_bitfield) {
9341   uint32_t word_ct = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
9342   uintptr_t sample_idx;
9343   uintptr_t cur_word;
9344   uintptr_t cur_mask;
9345   uintptr_t cur_write;
9346   uint32_t woffset;
9347   uint32_t widx;
9348   uint32_t uii;
9349   if (unfiltered_sample_ct == sample_ct) {
9350     cur_write = 0;
9351     woffset = 0;
9352     for (widx = 0; widx < word_ct; widx++) {
9353       cur_word = *lptr++;
9354       cur_word = cur_word & ((~cur_word) >> 1) & (*sample_include_quaterarr++);
9355       while (cur_word) {
9356         uii = CTZLU(cur_word) / 2;
9357         cur_write |= ONELU << (woffset + uii);
9358 	cur_word &= cur_word - 1;
9359       }
9360       if (woffset) {
9361         *missing_bitfield++ = cur_write;
9362         cur_write = 0;
9363         woffset = 0;
9364       } else {
9365 	woffset = BITCT2;
9366       }
9367     }
9368     if (woffset) {
9369       *missing_bitfield++ = cur_write;
9370     }
9371   } else {
9372     fill_ulong_zero(BITCT_TO_WORDCT(sample_ct), missing_bitfield);
9373     sample_idx = 0;
9374     for (widx = 0; sample_idx < sample_ct; widx++, lptr++) {
9375       cur_mask = *sample_include_quaterarr++;
9376       if (cur_mask) {
9377         cur_word = *lptr;
9378         cur_word = cur_word & ((~cur_word) >> 1) & cur_mask;
9379 	if (cur_mask == FIVEMASK) {
9380           if (cur_word) {
9381 	    uii = sample_idx;
9382             do {
9383               set_bit((CTZLU(cur_word) / 2) + uii, missing_bitfield);
9384               cur_word &= cur_word - 1;
9385 	    } while (cur_word);
9386 	  }
9387 	  sample_idx += BITCT2;
9388 	} else {
9389 	  if (cur_word) {
9390 	    do {
9391 	      uii = CTZLU(cur_mask);
9392 	      if ((cur_word >> uii) & 1) {
9393                 set_bit_ul(sample_idx, missing_bitfield);
9394 	      }
9395 	      sample_idx++;
9396 	      cur_mask &= cur_mask - 1;
9397 	    } while (cur_mask);
9398 	  } else {
9399             sample_idx += popcount2_long(cur_mask);
9400 	  }
9401         }
9402       }
9403     }
9404   }
9405 }
9406 
hh_reset(unsigned char * loadbuf,uintptr_t * sample_include_quaterarr,uintptr_t unfiltered_sample_ct)9407 void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include_quaterarr, uintptr_t unfiltered_sample_ct) {
9408   uintptr_t sample_bidx = 0;
9409   unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
9410   unsigned char* iicp;
9411   unsigned char ucc;
9412   unsigned char ucc2;
9413   uintptr_t unfiltered_sample_ctd;
9414   uint32_t* loadbuf_alias32;
9415   uint32_t uii;
9416   uint32_t ujj;
9417 #ifdef __LP64__
9418   uint32_t* sample_include_quaterarr_alias32;
9419   __m128i* loadbuf_alias;
9420   __m128i* iivp;
9421   __m128i vii;
9422   __m128i vjj;
9423   if (!(((uintptr_t)loadbuf) & 15)) {
9424     loadbuf_alias = (__m128i*)loadbuf;
9425     iivp = (__m128i*)sample_include_quaterarr;
9426     unfiltered_sample_ctd = unfiltered_sample_ct / 64;
9427     for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
9428       vii = *loadbuf_alias;
9429       vjj = _mm_and_si128(_mm_andnot_si128(vii, _mm_srli_epi64(vii, 1)), *iivp++);
9430       *loadbuf_alias++ = _mm_sub_epi64(vii, vjj);
9431     }
9432     loadbuf = (unsigned char*)loadbuf_alias;
9433     iicp = (unsigned char*)iivp;
9434   } else if (!(((uintptr_t)loadbuf) & 3)) {
9435     loadbuf_alias32 = (uint32_t*)loadbuf;
9436     sample_include_quaterarr_alias32 = (uint32_t*)sample_include_quaterarr;
9437     unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
9438     for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
9439       uii = *loadbuf_alias32;
9440       ujj = ((uii >> 1) & (~uii)) & (*sample_include_quaterarr_alias32++);
9441       *loadbuf_alias32++ = uii - ujj;
9442     }
9443     loadbuf = (unsigned char*)loadbuf_alias32;
9444     iicp = (unsigned char*)sample_include_quaterarr_alias32;
9445   } else {
9446     iicp = (unsigned char*)sample_include_quaterarr;
9447   }
9448 #else
9449   if (!(((uintptr_t)loadbuf) & 3)) {
9450     loadbuf_alias32 = (uint32_t*)loadbuf;
9451     unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
9452     for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
9453       uii = *loadbuf_alias32;
9454       ujj = ((uii >> 1) & (~uii)) & (*sample_include_quaterarr++);
9455       *loadbuf_alias32++ = uii - ujj;
9456     }
9457     loadbuf = (unsigned char*)loadbuf_alias32;
9458   }
9459   iicp = (unsigned char*)sample_include_quaterarr;
9460 #endif
9461   for (; loadbuf < loadbuf_end;) {
9462     ucc = *loadbuf;
9463     ucc2 = ((ucc >> 1) & (~ucc)) & (*iicp++);
9464     *loadbuf++ = ucc - ucc2;
9465   }
9466 }
9467 
// Y-chromosome genotype cleanup (2-bit genotypes, 4 samples/byte):
// * samples set (01) in sample_include_quaterarr but NOT in
//   sample_male_include_quaterarr are forced to 01 (missing);
// * male samples keep their genotype, except het (10) which becomes 01;
// * samples in neither mask have their genotype cleared to 00.
// SSE2 fast path when loadbuf is 16-byte aligned, then 32-bit words, then
// single bytes for the remainder.
void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include_quaterarr, uintptr_t* sample_male_include_quaterarr, uintptr_t unfiltered_sample_ct) {
  uintptr_t sample_bidx = 0;
  unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
  unsigned char* iicp;
  unsigned char* imicp;
  unsigned char ucc;
  unsigned char ucc2;
  unsigned char ucc3;
  uintptr_t unfiltered_sample_ctd;
  uint32_t* loadbuf_alias32;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
#ifdef __LP64__
  // m1: 01 pattern in every 2-bit slot
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  uint32_t* sample_include_quaterarr_alias32;
  uint32_t* sample_male_include_quaterarr_alias32;
  __m128i* loadbuf_alias;
  __m128i* iivp;
  __m128i* imivp;
  __m128i vii;
  __m128i vjj;
  __m128i vkk;
  if (!(((uintptr_t)loadbuf) & 15)) {
    loadbuf_alias = (__m128i*)loadbuf;
    iivp = (__m128i*)sample_include_quaterarr;
    imivp = (__m128i*)sample_male_include_quaterarr;
    unfiltered_sample_ctd = unfiltered_sample_ct / 64;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      // sample_include_quaterarr & ~sample_male_include_quaterarr: force to 01
      // sample_male_include_quaterarr: convert 10 to 01, keep everything else
      vii = *imivp++; // male mask, 01 pattern
      vjj = *iivp++;  // full include mask, 01 pattern
      // vkk: male samples' genotypes; everyone else zeroed
      vkk = _mm_and_si128(*loadbuf_alias, _mm_or_si128(vii, _mm_slli_epi64(vii, 1)));
      // subtraction converts male 10 pairs to 01; the OR forces non-male
      // included samples to 01
      *loadbuf_alias++ = _mm_or_si128(_mm_andnot_si128(vii, vjj), _mm_sub_epi64(vkk, _mm_and_si128(_mm_andnot_si128(vkk, _mm_srli_epi64(vkk, 1)), m1)));
    }
    loadbuf = (unsigned char*)loadbuf_alias;
    iicp = (unsigned char*)iivp;
    imicp = (unsigned char*)imivp;
  } else if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    sample_include_quaterarr_alias32 = (uint32_t*)sample_include_quaterarr;
    sample_male_include_quaterarr_alias32 = (uint32_t*)sample_male_include_quaterarr;
    unfiltered_sample_ctd = unfiltered_sample_ct / 16; // 16 genotypes per uint32
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *sample_male_include_quaterarr_alias32++;
      ujj = *sample_include_quaterarr_alias32++;
      // uii * 3 widens the 01 male mask to 11 pairs
      ukk = (*loadbuf_alias32) & (uii * 3);
      *loadbuf_alias32++ = ((~uii) & ujj) | (ukk - ((~ukk) & (ukk >> 1) & 0x55555555));
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
    iicp = (unsigned char*)sample_include_quaterarr_alias32;
    imicp = (unsigned char*)sample_male_include_quaterarr_alias32;
  } else {
    iicp = (unsigned char*)sample_include_quaterarr;
    imicp = (unsigned char*)sample_male_include_quaterarr;
  }
#else
  if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / 16;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *sample_male_include_quaterarr++;
      ujj = *sample_include_quaterarr++;
      ukk = (*loadbuf_alias32) & (uii * 3);
      *loadbuf_alias32++ = ((~uii) & ujj) | (ukk - ((~ukk) & (ukk >> 1) & 0x55555555));
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
  }
  iicp = (unsigned char*)sample_include_quaterarr;
  imicp = (unsigned char*)sample_male_include_quaterarr;
#endif
  // byte-at-a-time cleanup for the remainder
  for (; loadbuf < loadbuf_end;) {
    ucc = *imicp++;
    ucc2 = *iicp++;
    ucc3 = (*loadbuf) & (ucc * 3);
    *loadbuf++ = ((~ucc) & ucc2) | (ucc3 - ((~ucc3) & (ucc3 >> 1) & 0x55));
  }
}
9547 
alloc_raw_haploid_filters(uint32_t unfiltered_sample_ct,uint32_t hh_exists,uint32_t is_include,uintptr_t * sample_bitarr,uintptr_t * sex_male,uintptr_t ** sample_raw_include_quatervec_ptr,uintptr_t ** sample_raw_male_include_quatervec_ptr)9548 uint32_t alloc_raw_haploid_filters(uint32_t unfiltered_sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t* sample_bitarr, uintptr_t* sex_male, uintptr_t** sample_raw_include_quatervec_ptr, uintptr_t** sample_raw_male_include_quatervec_ptr) {
9549   uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
9550   uintptr_t* sample_raw_male_include_quatervec;
9551   if (hh_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
9552     if (bigstack_alloc_ul(unfiltered_sample_ctv2, sample_raw_include_quatervec_ptr)) {
9553       return 1;
9554     }
9555     if (is_include) {
9556       init_quaterarr_from_bitarr(sample_bitarr, unfiltered_sample_ct, *sample_raw_include_quatervec_ptr);
9557     } else {
9558       init_quaterarr_from_inverted_bitarr(sample_bitarr, unfiltered_sample_ct, *sample_raw_include_quatervec_ptr);
9559     }
9560   }
9561   if (hh_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
9562     if (bigstack_alloc_ul(unfiltered_sample_ctv2, sample_raw_male_include_quatervec_ptr)) {
9563       return 1;
9564     }
9565     sample_raw_male_include_quatervec = *sample_raw_male_include_quatervec_ptr;
9566     if (hh_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
9567       memcpy(sample_raw_male_include_quatervec, *sample_raw_include_quatervec_ptr, unfiltered_sample_ctv2 * sizeof(intptr_t));
9568     } else {
9569       if (is_include) {
9570 	init_quaterarr_from_bitarr(sample_bitarr, unfiltered_sample_ct, sample_raw_male_include_quatervec);
9571       } else {
9572 	init_quaterarr_from_inverted_bitarr(sample_bitarr, unfiltered_sample_ct, sample_raw_male_include_quatervec);
9573       }
9574     }
9575     apply_bitarr_mask_to_quaterarr_01(sex_male, unfiltered_sample_ct, sample_raw_male_include_quatervec);
9576   }
9577   return 0;
9578 }
9579 
// Applies missing-genotype fixups to marker_ct consecutive variants stored
// in loadbuf (byte_ct_per_marker bytes per variant, starting at variant
// marker_uidx_start and skipping marker_exclude bits), walking chromosomes
// via chrom_info_ptr:
// * haploid chromosome and set_hh_missing:
//   - X: hh_reset with the male mask (male het calls -> missing), if
//     XMHH_EXISTS;
//   - Y: hh_reset_y (non-male calls and male het calls -> missing), if
//     Y_FIX_NEEDED;
//   - other haploid: hh_reset with the full include mask (het -> missing),
//     if NXMHH_EXISTS.
// * MT and set_mixed_mt_missing: hh_reset with the full include mask.
void haploid_fix_multiple(uintptr_t* marker_exclude, uintptr_t marker_uidx_start, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, uint32_t hh_exists, uint32_t set_hh_missing, uint32_t set_mixed_mt_missing, uintptr_t* sample_raw_include2, uintptr_t* sample_raw_male_include2, uintptr_t unfiltered_sample_ct, uintptr_t byte_ct_per_marker, unsigned char* loadbuf) {
  uintptr_t marker_idx = 0;
  // first non-excluded variant at or after marker_uidx_start
  uintptr_t marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx_start);
  uint32_t chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx);
  uint32_t chrom_idx;
  uint32_t is_x;
  uint32_t is_y;
  uint32_t is_mt;
  uint32_t is_haploid;
  uintptr_t chrom_end;
  uintptr_t marker_idx_chrom_end;

  while (marker_idx < marker_ct) {
    chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
    // first raw variant index past the current chromosome
    chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
    is_x = (chrom_info_ptr->xymt_codes[X_OFFSET] == (int32_t)chrom_idx);
    is_y = (chrom_info_ptr->xymt_codes[Y_OFFSET] == (int32_t)chrom_idx);
    is_mt = (chrom_info_ptr->xymt_codes[MT_OFFSET] == (int32_t)chrom_idx);
    is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
    // included-variant index just past this chromosome, capped at marker_ct.
    // NOTE(review): marker_uidx is never advanced past its initial value, so
    // for the second and later chromosomes this adds marker_idx to a count
    // that already spans the earlier chromosomes; the marker_ct cap masks
    // this unless loadbuf spans 3+ chromosomes -- confirm against upstream.
    marker_idx_chrom_end = marker_idx + chrom_end - marker_uidx - popcount_bit_idx(marker_exclude, marker_uidx, chrom_end);
    if (marker_idx_chrom_end > marker_ct) {
      marker_idx_chrom_end = marker_ct;
    }
    if (is_haploid && set_hh_missing) {
      if (is_x) {
	if (hh_exists & XMHH_EXISTS) {
	  for (; marker_idx < marker_idx_chrom_end; marker_idx++) {
	    hh_reset(&(loadbuf[marker_idx * byte_ct_per_marker]), sample_raw_male_include2, unfiltered_sample_ct);
	  }
	}
      } else if (is_y) {
	if (hh_exists & Y_FIX_NEEDED) {
	  for (; marker_idx < marker_idx_chrom_end; marker_idx++) {
	    hh_reset_y(&(loadbuf[marker_idx * byte_ct_per_marker]), sample_raw_include2, sample_raw_male_include2, unfiltered_sample_ct);
	  }
	}
      } else if (hh_exists & NXMHH_EXISTS) {
	for (; marker_idx < marker_idx_chrom_end; marker_idx++) {
	  hh_reset(&(loadbuf[marker_idx * byte_ct_per_marker]), sample_raw_include2, unfiltered_sample_ct);
	}
      }
    } else if (is_mt && set_mixed_mt_missing) {
      for (; marker_idx < marker_idx_chrom_end; marker_idx++) {
	hh_reset(&(loadbuf[marker_idx * byte_ct_per_marker]), sample_raw_include2, unfiltered_sample_ct);
      }
    }
    marker_idx = marker_idx_chrom_end;
    chrom_fo_idx++;
  }
}
9630 
// Overwrites the 2-bit genotype with the missing value (01) for every sample
// whose entry in force_missing_include2 is 01; all other samples are left
// untouched.  Per 2-bit pair: (g | mask) & ~(mask << 1) yields 01 when
// mask == 01, and g unchanged when mask == 00.
// Same alignment-based SSE2 / 32-bit / byte dispatch as hh_reset.
void force_missing(unsigned char* loadbuf, uintptr_t* force_missing_include2, uintptr_t unfiltered_sample_ct) {
  uintptr_t sample_bidx = 0;
  unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
  unsigned char* fmicp;
  unsigned char ucc;
  unsigned char ucc2;
  uintptr_t unfiltered_sample_ctd;
  uint32_t* loadbuf_alias32;
  uint32_t uii;
  uint32_t ujj;
#ifdef __LP64__
  uint32_t* force_missing_include2_alias32;
  __m128i* loadbuf_alias;
  __m128i* fmivp;
  __m128i vii;
  __m128i vjj;
  if (!(((uintptr_t)loadbuf) & 15)) {
    loadbuf_alias = (__m128i*)loadbuf;
    fmivp = (__m128i*)force_missing_include2;
    unfiltered_sample_ctd = unfiltered_sample_ct / 64;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      vii = *loadbuf_alias;
      vjj = *fmivp++;
      // set the low genotype bit for flagged samples...
      vii = _mm_or_si128(vii, vjj);
      // ...then clear their high bit, producing 01
      vjj = _mm_slli_epi64(vjj, 1);
      *loadbuf_alias++ = _mm_andnot_si128(vjj, vii);
    }
    loadbuf = (unsigned char*)loadbuf_alias;
    fmicp = (unsigned char*)fmivp;
  } else if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    force_missing_include2_alias32 = (uint32_t*)force_missing_include2;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      ujj = *force_missing_include2_alias32++;
      uii |= ujj;
      ujj <<= 1;
      *loadbuf_alias32++ = uii & (~ujj);
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
    fmicp = (unsigned char*)force_missing_include2_alias32;
  } else {
    fmicp = (unsigned char*)force_missing_include2;
  }
#else
  if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      ujj = *force_missing_include2++;
      uii |= ujj;
      ujj <<= 1;
      *loadbuf_alias32++ = uii & (~ujj);
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
  }
  fmicp = (unsigned char*)force_missing_include2;
#endif
  // byte-at-a-time cleanup for the remainder
  for (; loadbuf < loadbuf_end;) {
    ucc = *loadbuf;
    ucc2 = *fmicp++;
    ucc |= ucc2;
    ucc2 <<= 1;
    *loadbuf++ = ucc & (~ucc2);
  }
}
9699 
open_and_size_string_list(char * fname,FILE ** infile_ptr,uintptr_t * list_len_ptr,uintptr_t * max_str_len_ptr)9700 int32_t open_and_size_string_list(char* fname, FILE** infile_ptr, uintptr_t* list_len_ptr, uintptr_t* max_str_len_ptr) {
9701   // assumes file is not open yet, and g_textbuf is safe to clobber
9702   uint32_t max_len = 0;
9703   uintptr_t line_idx = 0;
9704   uintptr_t list_len = 0;
9705   int32_t retval = 0;
9706   char* bufptr;
9707   uint32_t cur_len;
9708   if (fopen_checked(fname, "r", infile_ptr)) {
9709     goto open_and_size_string_list_ret_OPEN_FAIL;
9710   }
9711   g_textbuf[MAXLINELEN - 1] = ' ';
9712   while (fgets(g_textbuf, MAXLINELEN, *infile_ptr)) {
9713     line_idx++;
9714     if (!g_textbuf[MAXLINELEN - 1]) {
9715       LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
9716       goto open_and_size_string_list_ret_INVALID_FORMAT;
9717     }
9718     bufptr = skip_initial_spaces(g_textbuf);
9719     if (is_eoln_kns(*bufptr)) {
9720       continue;
9721     }
9722     // don't complain about more than one entry on a line for now
9723     list_len++;
9724     cur_len = strlen_se(bufptr);
9725     if (cur_len >= max_len) {
9726       max_len = cur_len + 1;
9727     }
9728   }
9729   if (!feof(*infile_ptr)) {
9730     goto open_and_size_string_list_ret_READ_FAIL;
9731   }
9732   *list_len_ptr = list_len;
9733   *max_str_len_ptr = max_len;
9734   while (0) {
9735   open_and_size_string_list_ret_OPEN_FAIL:
9736     retval = RET_OPEN_FAIL;
9737     break;
9738   open_and_size_string_list_ret_READ_FAIL:
9739     retval = RET_READ_FAIL;
9740     break;
9741   open_and_size_string_list_ret_INVALID_FORMAT:
9742     retval = RET_INVALID_FORMAT;
9743     break;
9744   }
9745   return retval;
9746 }
9747 
load_string_list(FILE ** infile_ptr,uintptr_t max_str_len,char * str_list)9748 int32_t load_string_list(FILE** infile_ptr, uintptr_t max_str_len, char* str_list) {
9749   // assumes file is open (probably by open_and_size_string_list), and
9750   // g_textbuf is safe to clobber
9751   int32_t retval = 0;
9752   char* bufptr;
9753   uint32_t cur_len;
9754   rewind(*infile_ptr);
9755   while (fgets(g_textbuf, MAXLINELEN, *infile_ptr)) {
9756     bufptr = skip_initial_spaces(g_textbuf);
9757     if (is_eoln_kns(*bufptr)) {
9758       continue;
9759     }
9760     cur_len = strlen_se(bufptr);
9761     memcpy(str_list, bufptr, cur_len);
9762     str_list[cur_len] = '\0';
9763     str_list = &(str_list[max_str_len]);
9764   }
9765   if (!feof(*infile_ptr)) {
9766     goto load_string_list_ret_READ_FAIL;
9767   }
9768   while (0) {
9769   load_string_list_ret_READ_FAIL:
9770     retval = RET_READ_FAIL;
9771     break;
9772   }
9773   return retval;
9774 }
9775 
open_and_skip_first_lines(FILE ** infile_ptr,char * fname,char * loadbuf,uintptr_t loadbuf_size,uint32_t lines_to_skip)9776 int32_t open_and_skip_first_lines(FILE** infile_ptr, char* fname, char* loadbuf, uintptr_t loadbuf_size, uint32_t lines_to_skip) {
9777   uint32_t line_idx;
9778   loadbuf[loadbuf_size - 1] = ' ';
9779   if (fopen_checked(fname, "r", infile_ptr)) {
9780     return RET_OPEN_FAIL;
9781   }
9782   for (line_idx = 1; line_idx <= lines_to_skip; line_idx++) {
9783     if (!fgets(loadbuf, loadbuf_size, *infile_ptr)) {
9784       if (feof(*infile_ptr)) {
9785 	LOGERRPRINTFWW("Error: Fewer lines than expected in %s.\n", fname);
9786 	return RET_INVALID_FORMAT;
9787       } else {
9788 	return RET_READ_FAIL;
9789       }
9790     }
9791     if (!(loadbuf[loadbuf_size - 1])) {
9792       if ((loadbuf_size == MAXLINELEN) || (loadbuf_size == MAXLINEBUFLEN)) {
9793 	LOGERRPRINTFWW("Error: Line %u of %s is pathologically long.\n", line_idx, fname);
9794 	return RET_INVALID_FORMAT;
9795       } else {
9796         return RET_NOMEM;
9797       }
9798     }
9799   }
9800   return 0;
9801 }
9802 
load_to_first_token(FILE * infile,uintptr_t loadbuf_size,char comment_char,const char * file_descrip,char * loadbuf,char ** bufptr_ptr,uintptr_t * line_idx_ptr)9803 int32_t load_to_first_token(FILE* infile, uintptr_t loadbuf_size, char comment_char, const char* file_descrip, char* loadbuf, char** bufptr_ptr, uintptr_t* line_idx_ptr) {
9804   uintptr_t line_idx = 0;
9805   while (fgets(loadbuf, loadbuf_size, infile)) {
9806     line_idx++;
9807     if (!(loadbuf[loadbuf_size - 1])) {
9808       // PLINK 1.9 has two text line loading modes: "regular" and "long".
9809       // * "Regular" mode limits lines to about MAXLINELEN (about 128k as of
9810       //   this writing) characters.
9811       // * "Long" mode theoretically accepts lines up to about MAXLINEBUFLEN
9812       //   (~2 GB) characters but degrades gracefully if less memory is
9813       //   available (in that case, an out-of-memory instead of an
9814       //   invalid-format error is reported on fgets overflow).  Any long
9815       //   buffer size larger than MAXLINELEN should work properly with
9816       //   plink_common.
9817       if ((loadbuf_size == MAXLINELEN) || (loadbuf_size == MAXLINEBUFLEN)) {
9818 	LOGERRPRINTF("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, file_descrip);
9819 	return RET_INVALID_FORMAT;
9820       } else {
9821 	return RET_NOMEM;
9822       }
9823     }
9824     *bufptr_ptr = skip_initial_spaces(loadbuf);
9825     if (!is_eoln_kns(**bufptr_ptr)) {
9826       if ((**bufptr_ptr) != comment_char) {
9827 	*line_idx_ptr = line_idx;
9828         return 0;
9829       }
9830     }
9831   }
9832   if (!feof(infile)) {
9833     return RET_READ_FAIL;
9834   }
9835   LOGERRPRINTF("Error: Empty %s.\n", file_descrip);
9836   return RET_INVALID_FORMAT;
9837 }
9838 
// Convenience wrapper: opens fname for reading (handle stored in
// *infile_ptr), plants the buffer-overflow sentinel, and forwards to
// load_to_first_token.  The caller is responsible for closing *infile_ptr.
int32_t open_and_load_to_first_token(FILE** infile_ptr, char* fname, uintptr_t loadbuf_size, char comment_char, const char* file_descrip, char* loadbuf, char** bufptr_ptr, uintptr_t* line_idx_ptr) {
  // sentinel: cleared by fgets iff a line fills the entire buffer
  loadbuf[loadbuf_size - 1] = ' ';
  if (fopen_checked(fname, "r", infile_ptr)) {
    return RET_OPEN_FAIL;
  }
  return load_to_first_token(*infile_ptr, loadbuf_size, comment_char, file_descrip, loadbuf, bufptr_ptr, line_idx_ptr);
}
9846 
// Scans whitespace-delimited file fname and raises *max_str_len_ptr (and,
// if colnum2 is nonzero, *max_str2_len_ptr) to the length of the longest
// token in the corresponding column.
int32_t scan_max_strlen(char* fname, uint32_t colnum, uint32_t colnum2, uint32_t headerskip, char skipchar, uintptr_t* max_str_len_ptr, uintptr_t* max_str2_len_ptr) {
  // colnum and colnum2 are 1-based indices.  If colnum2 is zero, only colnum
  // is scanned.
  // Includes terminating null in lengths.
  FILE* infile = nullptr;
  uintptr_t loadbuf_size = bigstack_left();
  uintptr_t max_str_len = *max_str_len_ptr;
  uintptr_t max_str2_len = 0;
  char* loadbuf = (char*)g_bigstack_base;
  uint32_t colmin;   // 0-based index of the earlier column
  uint32_t coldiff;  // token distance from the earlier to the later column
  char* str1_ptr;
  char* str2_ptr;
  char cc;
  uintptr_t cur_str_len;
  uintptr_t line_idx;
  int32_t retval;
  if (loadbuf_size > MAXLINEBUFLEN) {
    loadbuf_size = MAXLINEBUFLEN;
  } else if (loadbuf_size <= MAXLINELEN) {
    goto scan_max_strlen_ret_NOMEM;
  }
  retval = open_and_skip_first_lines(&infile, fname, loadbuf, loadbuf_size, headerskip);
  if (retval) {
    goto scan_max_strlen_ret_1;
  }
  if (colnum < colnum2) {
    max_str2_len = *max_str2_len_ptr;
    colmin = colnum - 1;
    coldiff = colnum2 - colnum;
  } else if (colnum2) {
    // colnum2 precedes colnum: scan the earlier column as "column 1" and
    // swap the two running maxima here; they are swapped back on exit.
    max_str2_len = max_str_len;
    max_str_len = *max_str2_len_ptr;
    colmin = colnum2 - 1;
    coldiff = colnum - colnum2;
  } else {
    // single-column mode; the sentinel value forces the (colnum < colnum2)
    // branch at the end, and coldiff == 0 suppresses the second write.
    colmin = colnum - 1;
    coldiff = 0;
    colnum2 = 0xffffffffU;
  }
  line_idx = headerskip;
  while (fgets(loadbuf, loadbuf_size, infile)) {
    line_idx++;
    if (!(loadbuf[loadbuf_size - 1])) {
      // loadbuf_size > MAXLINELEN is guaranteed above, so only the maximal
      // long-mode buffer blames the file; anything smaller is our OOM.
      if (loadbuf_size == MAXLINEBUFLEN) {
        LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
	goto scan_max_strlen_ret_INVALID_FORMAT_2;
      } else {
        goto scan_max_strlen_ret_NOMEM;
      }
    }
    str1_ptr = skip_initial_spaces(loadbuf);
    cc = *str1_ptr;
    // skip blank lines and lines starting with skipchar
    if (is_eoln_kns(cc) || (cc == skipchar)) {
      continue;
    }
    str1_ptr = next_token_multz(str1_ptr, colmin);
    str2_ptr = next_token_multz(str1_ptr, coldiff);
    if (no_more_tokens_kns(str2_ptr)) {
      // probably want option for letting this slide in the future
      LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, fname);
      goto scan_max_strlen_ret_INVALID_FORMAT_2;
    }
    cur_str_len = strlen_se(str1_ptr);
    if (cur_str_len >= max_str_len) {
      max_str_len = cur_str_len + 1;
    }
    if (coldiff) {
      cur_str_len = strlen_se(str2_ptr);
      if (cur_str_len >= max_str2_len) {
	max_str2_len = cur_str_len + 1;
      }
    }
  }
  if (!feof(infile)) {
    goto scan_max_strlen_ret_READ_FAIL;
  }
  if (colnum < colnum2) {
    *max_str_len_ptr = max_str_len;
    if (coldiff) {
      *max_str2_len_ptr = max_str2_len;
    }
  } else {
    // undo the swap performed before the scan
    *max_str_len_ptr = max_str2_len;
    *max_str2_len_ptr = max_str_len;
  }
  while (0) {
  scan_max_strlen_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  scan_max_strlen_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  scan_max_strlen_ret_INVALID_FORMAT_2:
    logerrprintb();
    retval = RET_INVALID_FORMAT;
    break;
  }
 scan_max_strlen_ret_1:
  fclose_cond(infile);
  return retval;
}
9949 
scan_max_fam_indiv_strlen(char * fname,uint32_t colnum,uintptr_t * max_sample_id_len_ptr)9950 int32_t scan_max_fam_indiv_strlen(char* fname, uint32_t colnum, uintptr_t* max_sample_id_len_ptr) {
9951   // colnum is a 1-based index with the FID column number; IID column is
9952   // assumed to follow.
9953   // Includes terminating null in lengths.
9954   FILE* infile = nullptr;
9955   uintptr_t loadbuf_size = bigstack_left();
9956   uintptr_t max_sample_id_len = *max_sample_id_len_ptr;
9957   uintptr_t line_idx = 0;
9958   char* loadbuf = (char*)g_bigstack_base;
9959   char* bufptr;
9960   char* bufptr2;
9961   uintptr_t cur_sample_id_len;
9962   int32_t retval;
9963   colnum--;
9964   if (loadbuf_size > MAXLINEBUFLEN) {
9965     loadbuf_size = MAXLINEBUFLEN;
9966   } else if (loadbuf_size <= MAXLINELEN) {
9967     goto scan_max_fam_indiv_strlen_ret_NOMEM;
9968   }
9969   retval = open_and_skip_first_lines(&infile, fname, loadbuf, loadbuf_size, 0);
9970   if (retval) {
9971     goto scan_max_fam_indiv_strlen_ret_1;
9972   }
9973   while (fgets(loadbuf, loadbuf_size, infile)) {
9974     line_idx++;
9975     if (!(loadbuf[loadbuf_size - 1])) {
9976       if (loadbuf_size == MAXLINEBUFLEN) {
9977 	LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
9978 	goto scan_max_fam_indiv_strlen_ret_INVALID_FORMAT_2;
9979       } else {
9980         goto scan_max_fam_indiv_strlen_ret_NOMEM;
9981       }
9982     }
9983     bufptr = skip_initial_spaces(loadbuf);
9984     if (is_eoln_kns(*bufptr)) {
9985       continue;
9986     }
9987     bufptr = next_token_multz(bufptr, colnum);
9988     bufptr2 = next_token(bufptr);
9989     if (no_more_tokens_kns(bufptr2)) {
9990       LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, fname);
9991       goto scan_max_fam_indiv_strlen_ret_INVALID_FORMAT_2;
9992     }
9993     cur_sample_id_len = strlen_se(bufptr) + strlen_se(bufptr2) + 2;
9994     if (cur_sample_id_len > max_sample_id_len) {
9995       max_sample_id_len = cur_sample_id_len;
9996     }
9997   }
9998   if (!feof(infile)) {
9999     goto scan_max_fam_indiv_strlen_ret_READ_FAIL;
10000   }
10001   *max_sample_id_len_ptr = max_sample_id_len;
10002   while (0) {
10003   scan_max_fam_indiv_strlen_ret_NOMEM:
10004     retval = RET_NOMEM;
10005     break;
10006   scan_max_fam_indiv_strlen_ret_READ_FAIL:
10007     retval = RET_READ_FAIL;
10008     break;
10009   scan_max_fam_indiv_strlen_ret_INVALID_FORMAT_2:
10010     logerrprintb();
10011     retval = RET_INVALID_FORMAT;
10012     break;
10013   }
10014  scan_max_fam_indiv_strlen_ret_1:
10015   fclose_cond(infile);
10016   return retval;
10017 }
10018 
10019 /*
10020 void inplace_collapse_uint32(uint32_t* item_arr, uint32_t unfiltered_ct, uintptr_t* exclude_arr, uint32_t filtered_ct) {
10021   if (unfiltered_ct == filtered_ct) {
10022     return;
10023   }
10024   uint32_t item_uidx = next_set_unsafe(exclude_arr, 0);
10025   uint32_t item_idx = item_uidx;
10026   for (; item_idx < filtered_ct; item_idx++, item_uidx++) {
10027     next_unset_unsafe_ck(exclude_arr, &item_uidx);
10028     item_arr[item_idx] = item_arr[item_uidx];
10029   }
10030 }
10031 */
10032 
void inplace_collapse_uint32_incl(uint32_t* item_arr, uint32_t unfiltered_ct, uintptr_t* incl_arr, uint32_t filtered_ct) {
  // Compacts item_arr in place, keeping only entries whose bit is set in
  // incl_arr.  No-op when nothing is filtered out.
  uint32_t read_uidx;
  uint32_t write_idx;
  if (unfiltered_ct == filtered_ct) {
    return;
  }
  // entries before the first unset bit are already in place
  read_uidx = next_unset_unsafe(incl_arr, 0);
  write_idx = read_uidx;
  while (write_idx < filtered_ct) {
    next_set_unsafe_ck(incl_arr, &read_uidx);
    item_arr[write_idx++] = item_arr[read_uidx++];
  }
}
10044 
alloc_and_init_collapsed_arr(char * item_arr,uintptr_t item_len,uintptr_t unfiltered_ct,uintptr_t * exclude_arr,uintptr_t filtered_ct,uint32_t read_only)10045 char* alloc_and_init_collapsed_arr(char* item_arr, uintptr_t item_len, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t filtered_ct, uint32_t read_only) {
10046   uint32_t item_uidx = 0;
10047   char* new_arr;
10048   char* wptr;
10049   char* wptr_end;
10050   uintptr_t item_uidx_stop;
10051   uintptr_t delta;
10052   if (read_only && (unfiltered_ct == filtered_ct)) {
10053     return item_arr;
10054   }
10055   if (bigstack_alloc_c(filtered_ct * item_len, &new_arr)) {
10056     return nullptr;
10057   }
10058   wptr = new_arr;
10059   wptr_end = &(new_arr[filtered_ct * item_len]);
10060   while (wptr < wptr_end) {
10061     item_uidx = next_unset_ul_unsafe(exclude_arr, item_uidx);
10062     item_uidx_stop = next_set_ul(exclude_arr, item_uidx, unfiltered_ct);
10063     delta = item_uidx_stop - item_uidx;
10064     memcpy(wptr, &(item_arr[item_uidx * item_len]), delta * item_len);
10065     wptr = &(wptr[delta * item_len]);
10066     item_uidx = item_uidx_stop;
10067   }
10068   return new_arr;
10069 }
10070 
alloc_and_init_collapsed_arr_incl(char * item_arr,uintptr_t item_len,uintptr_t unfiltered_ct,uintptr_t * include_arr,uintptr_t filtered_ct,uint32_t read_only)10071 char* alloc_and_init_collapsed_arr_incl(char* item_arr, uintptr_t item_len, uintptr_t unfiltered_ct, uintptr_t* include_arr, uintptr_t filtered_ct, uint32_t read_only) {
10072   uint32_t item_uidx = 0;
10073   char* new_arr;
10074   char* wptr;
10075   char* wptr_end;
10076   uintptr_t item_uidx_stop;
10077   uintptr_t delta;
10078   if (read_only && (unfiltered_ct == filtered_ct)) {
10079     return item_arr;
10080   }
10081   if (bigstack_alloc_c(filtered_ct * item_len, &new_arr)) {
10082     return nullptr;
10083   }
10084   wptr = new_arr;
10085   wptr_end = &(new_arr[filtered_ct * item_len]);
10086   do {
10087     item_uidx = next_set_ul_unsafe(include_arr, item_uidx);
10088     item_uidx_stop = next_unset_ul(include_arr, item_uidx, unfiltered_ct);
10089     delta = item_uidx_stop - item_uidx;
10090     memcpy(wptr, &(item_arr[item_uidx * item_len]), delta * item_len);
10091     wptr = &(wptr[delta * item_len]);
10092     item_uidx = item_uidx_stop;
10093   } while (wptr < wptr_end);
10094   return new_arr;
10095 }
10096 
// Re-collapses item_arr in place: on entry it holds the filtered_ct_orig
// entries surviving exclude_orig; on exit it holds the filtered_ct_new
// entries surviving exclude_new.  exclude_new appears to be assumed a
// superset of exclude_orig (every originally-excluded item stays excluded)
// -- the first-differing-word scan below relies on this; confirm at call
// sites.
void inplace_delta_collapse_arr(char* item_arr, uintptr_t item_len, uintptr_t filtered_ct_orig, uintptr_t filtered_ct_new, uintptr_t* exclude_orig, uintptr_t* exclude_new) {
  // if this sort of collapse function is ever in an important loop, check
  // whether specialized 4-byte and 8-byte versions are much faster
  uintptr_t* exclude_orig_start = exclude_orig;
  char* write_end = &(item_arr[filtered_ct_new * item_len]);
  uintptr_t read_idx = 1;
  uint32_t uii = 0;
  char* write_ptr;
  uintptr_t ulii;
  uintptr_t uljj;
  uint32_t read_uidx;
  uint32_t ujj;
  if (filtered_ct_new == filtered_ct_orig) {
    return;
  }
  // find location of first newly excluded item: scan for the first word
  // where the two exclusion masks differ, accumulating in uii the number of
  // originally-excluded items before that word
  while (1) {
    ulii = *exclude_orig;
    uljj = *exclude_new;
    if (ulii != uljj) {
      break;
    }
    uii += popcount_long(ulii);
    exclude_orig++;
    exclude_new++;
  }
  exclude_new -= ((uintptr_t)(exclude_orig - exclude_orig_start));
  read_uidx = BITCT * ((uintptr_t)(exclude_orig - exclude_orig_start));
  // first differing bit == raw index of the first newly excluded item
  ujj = CTZLU(ulii ^ uljj);
  read_uidx += ujj;
  uii += popcount_long(ulii & ((ONELU << ujj) - ONELU));
  uii = read_uidx - uii; // now equal to # initial filtered indices skipped
  filtered_ct_new -= uii;
  // rebase item_arr so index 0 is the first removed entry; everything before
  // it is already in its final position
  item_arr = &(item_arr[uii * item_len]);
  write_ptr = item_arr;
  read_uidx++;
  // read_idx counts orig-filtered entries relative to the rebase point; it
  // advances every iteration (skipped or kept), mirroring read_uidx
  for (; write_ptr < write_end; read_uidx++, read_idx++) {
    next_unset_unsafe_ck(exclude_orig_start, &read_uidx);
    if (IS_SET(exclude_new, read_uidx)) {
      continue;
    }
    memcpy(write_ptr, &(item_arr[read_idx * item_len]), item_len);
    write_ptr = &(write_ptr[item_len]);
  }
}
10142 
// Bitfield counterpart of inplace_delta_collapse_arr: read_ptr holds one bit
// per item surviving exclude_orig; on exit it holds one bit per item
// surviving exclude_new (filtered_ct_new of them), compacted in place.
// As above, exclude_new appears to be assumed a superset of exclude_orig --
// confirm at call sites.
void inplace_delta_collapse_bitfield(uintptr_t* read_ptr, uint32_t filtered_ct_new, uintptr_t* exclude_orig, uintptr_t* exclude_new) {
  // only guaranteed to zero out trailing bits up to the nearest 16-byte
  // boundary on 64-bit systems
  uintptr_t* write_ptr = read_ptr;
  uintptr_t readw = *read_ptr++; // current source word
  uintptr_t writew = 0;          // destination word being assembled
  uint32_t item_uidx = 0;        // raw (unfiltered) item index
  uint32_t item_mwidx = 0;       // bit position of the current item in readw
  uint32_t item_idx = 0;         // new-filtered (output) bit index
  // each iteration consumes exactly one orig-filtered item
  for (; item_idx < filtered_ct_new; item_uidx++) {
    next_unset_unsafe_ck(exclude_orig, &item_uidx);
    if (!is_set(exclude_new, item_uidx)) {
      // item survives the new filter: copy its bit
      if ((readw >> item_mwidx) & 1) {
	writew |= ONELU << (item_idx % BITCT);
      }
      if (!((++item_idx) % BITCT)) {
	*write_ptr++ = writew;
	writew = 0;
      }
    }
    if (++item_mwidx == BITCT) {
      item_mwidx = 0;
      readw = *read_ptr++;
    }
  }
  // flush the final partial word, and zero one extra word if the compaction
  // freed it (see 16-byte-boundary caveat above)
  if (write_ptr < read_ptr) {
    *write_ptr++ = writew;
    if (write_ptr < read_ptr) {
      *write_ptr = 0;
    }
  }
}
10175 
copy_bitarr_subset_excl(const uintptr_t * __restrict raw_bitarr,const uintptr_t * __restrict subset_excl,uint32_t raw_bitarr_size,uint32_t subset_size,uintptr_t * __restrict output_bitarr)10176 void copy_bitarr_subset_excl(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_excl, uint32_t raw_bitarr_size, uint32_t subset_size, uintptr_t* __restrict output_bitarr) {
10177   uintptr_t cur_write = 0;
10178   uint32_t item_uidx = 0;
10179   uint32_t write_bit = 0;
10180   uint32_t item_idx = 0;
10181   uint32_t item_uidx_stop;
10182   if (!subset_excl[0]) {
10183     item_uidx = next_set(subset_excl, 0, raw_bitarr_size & (~(BITCT - 1))) & (~(BITCT - 1));
10184     memcpy(output_bitarr, raw_bitarr, item_uidx / 8);
10185     item_idx = item_uidx;
10186     output_bitarr = &(output_bitarr[item_uidx / BITCT]);
10187   }
10188   while (item_idx < subset_size) {
10189     item_uidx = next_unset_unsafe(subset_excl, item_uidx);
10190     item_uidx_stop = next_set(subset_excl, item_uidx, raw_bitarr_size);
10191     item_idx += item_uidx_stop - item_uidx;
10192     do {
10193       cur_write |= ((raw_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << write_bit;
10194       if (++write_bit == BITCT) {
10195 	*output_bitarr++ = cur_write;
10196         cur_write = 0;
10197 	write_bit = 0;
10198       }
10199     } while (++item_uidx < item_uidx_stop);
10200   }
10201   if (write_bit) {
10202     *output_bitarr = cur_write;
10203   }
10204 }
10205 
copy_bitarr_subset(const uintptr_t * __restrict raw_bitarr,const uintptr_t * __restrict subset_mask,uint32_t raw_bitarr_size,uint32_t subset_size,uintptr_t * __restrict output_bitarr)10206 void copy_bitarr_subset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_mask, uint32_t raw_bitarr_size, uint32_t subset_size, uintptr_t* __restrict output_bitarr) {
10207   // full-blown blocked copy not worth it due to undefined CTZLU(0), >> 64,
10208   // << 64
10209   uintptr_t cur_output_word = 0;
10210   uint32_t item_uidx = 0;
10211   uint32_t word_write_shift = 0;
10212   uint32_t item_idx = 0;
10213   uint32_t item_uidx_stop;
10214   if (!(~subset_mask[0])) {
10215     item_uidx = next_unset(subset_mask, 0, raw_bitarr_size & (~(BITCT - 1))) & (~(BITCT - 1));
10216     memcpy(output_bitarr, raw_bitarr, item_uidx / 8);
10217     item_idx = item_uidx;
10218     output_bitarr = &(output_bitarr[item_uidx / BITCT]);
10219   }
10220   while (item_idx < subset_size) {
10221     item_uidx = next_set_unsafe(subset_mask, item_uidx);
10222 
10223     // can speed this up a bit once we have a guaranteed unset bit at the end
10224     item_uidx_stop = next_unset(subset_mask, item_uidx, raw_bitarr_size);
10225 
10226     item_idx += item_uidx_stop - item_uidx;
10227     do {
10228       cur_output_word |= ((raw_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << word_write_shift;
10229       if (++word_write_shift == BITCT) {
10230 	*output_bitarr++ = cur_output_word;
10231         cur_output_word = 0;
10232 	word_write_shift = 0;
10233       }
10234     } while (++item_uidx < item_uidx_stop);
10235   }
10236   if (word_write_shift) {
10237     *output_bitarr = cur_output_word;
10238   }
10239 }
10240 
uncollapse_copy_flip_include_arr(uintptr_t * collapsed_include_arr,uintptr_t unfiltered_ct,uintptr_t * exclude_arr,uintptr_t * output_exclude_arr)10241 void uncollapse_copy_flip_include_arr(uintptr_t* collapsed_include_arr, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* output_exclude_arr) {
10242   uintptr_t unfiltered_ctl = BITCT_TO_WORDCT(unfiltered_ct);
10243   uintptr_t* output_exclude_true_end = &(output_exclude_arr[unfiltered_ctl]);
10244   uintptr_t* output_exclude_end = &(output_exclude_arr[unfiltered_ct / BITCT]);
10245   uintptr_t cea_read = 0;
10246   uint32_t read_bit = BITCT;
10247   uint32_t write_bit;
10248   uintptr_t cur_write;
10249   uintptr_t cur_read = 0;
10250   if (!exclude_arr[0]) {
10251     // copy-with-possible-offset is substantially slower, so treat initial lack
10252     // of offset as a special case
10253     for (cur_read = 0; cur_read < unfiltered_ctl; cur_read++) {
10254       *output_exclude_arr++ = ~(*collapsed_include_arr++);
10255       if (*(++exclude_arr)) {
10256 	break;
10257       }
10258     }
10259   }
10260   while (output_exclude_arr < output_exclude_end) {
10261     cur_write = *exclude_arr++;
10262     // want efficient handling of all-zeroes and all-ones here
10263     if (cur_write) {
10264       cur_read = ~cur_write;
10265     uncollapse_copy_flip_include_arr_loop:
10266       while (cur_read) {
10267         write_bit = CTZLU(cur_read);
10268         if (read_bit == BITCT) {
10269           cea_read = ~(*collapsed_include_arr++);
10270 	  read_bit = 0;
10271         }
10272         cur_write |= (cea_read & ONELU) << write_bit;
10273         cea_read >>= 1;
10274         read_bit++;
10275         cur_read &= cur_read - ONELU;
10276       }
10277       *output_exclude_arr = cur_write;
10278     } else {
10279       if (read_bit == BITCT) {
10280         *output_exclude_arr = ~(*collapsed_include_arr++);
10281       } else {
10282         cur_write = cea_read;
10283         cea_read = ~(*collapsed_include_arr++);
10284         *output_exclude_arr = cur_write | (cea_read << (BITCT - read_bit));
10285 	cea_read >>= read_bit;
10286       }
10287     }
10288     output_exclude_arr++;
10289   }
10290   if (output_exclude_arr < output_exclude_true_end) {
10291     cur_write = *exclude_arr++;
10292     cur_read = (~cur_write) & ((ONELU << (unfiltered_ct % BITCT)) - ONELU);
10293     goto uncollapse_copy_flip_include_arr_loop;
10294   }
10295 }
10296 
copy_when_nonmissing(uintptr_t * loadbuf,char * source,uintptr_t elem_size,uintptr_t unfiltered_sample_ct,uintptr_t missing_ct,char * dest)10297 void copy_when_nonmissing(uintptr_t* loadbuf, char* source, uintptr_t elem_size, uintptr_t unfiltered_sample_ct, uintptr_t missing_ct, char* dest) {
10298   uintptr_t* loadbuf_end = &(loadbuf[QUATERCT_TO_WORDCT(unfiltered_sample_ct)]);
10299   uintptr_t last_missing_p1 = 0;
10300   uintptr_t sample_idx_offset = 0;
10301   uintptr_t cur_word;
10302   uintptr_t new_missing_idx;
10303   uintptr_t diff;
10304   if (!missing_ct) {
10305     memcpy(dest, source, unfiltered_sample_ct * elem_size);
10306     return;
10307   }
10308   do {
10309     cur_word = *loadbuf++;
10310     cur_word = cur_word & (~(cur_word >> 1)) & FIVEMASK;
10311     while (cur_word) {
10312       new_missing_idx = sample_idx_offset + (CTZLU(cur_word) / 2);
10313       diff = new_missing_idx - last_missing_p1;
10314       if (diff) {
10315 	dest = memcpya(dest, &(source[last_missing_p1 * elem_size]), diff * elem_size);
10316       }
10317       last_missing_p1 = new_missing_idx + 1;
10318       cur_word &= cur_word - 1;
10319     }
10320     sample_idx_offset += BITCT2;
10321   } while (loadbuf < loadbuf_end);
10322   diff = unfiltered_sample_ct - last_missing_p1;
10323   if (diff) {
10324     memcpy(dest, &(source[last_missing_p1 * elem_size]), diff * elem_size);
10325   }
10326 }
10327 
collapse_duplicate_ids(char * sorted_ids,uintptr_t id_ct,uintptr_t max_id_len,uint32_t * id_starts)10328 uint32_t collapse_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, uint32_t* id_starts) {
10329   // Collapses array of sorted IDs to remove duplicates, and writes
10330   // pre-collapse positions to id_starts (so e.g. duplication count of any
10331   // sample ID can be determined via subtraction) if it isn't nullptr.
10332   // Returns id_ct of collapsed array.
10333   uintptr_t read_idx;
10334   uintptr_t write_idx;
10335   if (!id_ct) {
10336     return 0;
10337   }
10338   if (id_starts) {
10339     id_starts[0] = 0;
10340     for (read_idx = 1; read_idx < id_ct; read_idx++) {
10341       if (!strcmp(&(sorted_ids[(read_idx - 1) * max_id_len]), &(sorted_ids[read_idx * max_id_len]))) {
10342 	break;
10343       }
10344       id_starts[read_idx] = read_idx;
10345     }
10346     write_idx = read_idx;
10347     while (++read_idx < id_ct) {
10348       if (strcmp(&(sorted_ids[(write_idx - 1) * max_id_len]), &(sorted_ids[read_idx * max_id_len]))) {
10349 	strcpy(&(sorted_ids[write_idx * max_id_len]), &(sorted_ids[read_idx * max_id_len]));
10350 	id_starts[write_idx++] = read_idx;
10351       }
10352     }
10353   } else {
10354     for (read_idx = 1; read_idx < id_ct; read_idx++) {
10355       if (!strcmp(&(sorted_ids[(read_idx - 1) * max_id_len]), &(sorted_ids[read_idx * max_id_len]))) {
10356 	break;
10357       }
10358     }
10359     write_idx = read_idx;
10360     while (++read_idx < id_ct) {
10361       if (strcmp(&(sorted_ids[(write_idx - 1) * max_id_len]), &(sorted_ids[read_idx * max_id_len]))) {
10362 	strcpy(&(sorted_ids[write_idx * max_id_len]), &(sorted_ids[read_idx * max_id_len]));
10363 	write_idx++;
10364       }
10365     }
10366   }
10367   return write_idx;
10368 }
10369 
range_list_init(Range_list * range_list_ptr)10370 void range_list_init(Range_list* range_list_ptr) {
10371   range_list_ptr->names = nullptr;
10372   range_list_ptr->starts_range = nullptr;
10373   range_list_ptr->name_ct = 0;
10374   range_list_ptr->name_max_len = 0;
10375 }
10376 
free_range_list(Range_list * range_list_ptr)10377 void free_range_list(Range_list* range_list_ptr) {
10378   free_cond(range_list_ptr->names);
10379   free_cond(range_list_ptr->starts_range);
10380 }
10381 
10382 // implementation used in PLINK 1.07 stats.cpp
10383 // probably want to remove this function and use erf() calls in the future
normdist(double zz)10384 double normdist(double zz) {
10385   double sqrt2pi = 2.50662827463;
10386   double t0;
10387   double z1;
10388   double p0;
10389   t0 = 1 / (1 + 0.2316419 * fabs(zz));
10390   z1 = exp(-0.5 * zz * zz) / sqrt2pi;
10391   p0 = z1 * t0 * (0.31938153 + t0 * (-0.356563782 + t0 * (1.781477937 + t0 * (-1.821255978 + 1.330274429 * t0))));
10392   return zz >= 0 ? 1 - p0 : p0;
10393 }
10394 
rand_normal(double * secondval_ptr)10395 double rand_normal(double* secondval_ptr) {
10396   // N(0, 1)
10397   double dxx = sqrt(-2 * log(rand_unif()));
10398   double dyy = 2 * PI * rand_unif();
10399   *secondval_ptr = dxx * cos(dyy);
10400   return dxx * sin(dyy);
10401 }
10402 
init_sfmt64_from_sfmt32(sfmt_t * sfmt32,sfmt_t * sfmt64)10403 void init_sfmt64_from_sfmt32(sfmt_t* sfmt32, sfmt_t* sfmt64) {
10404   // sfmt_genrand_uint64() is not supposed to be called after
10405   // sfmt_genrand_uint32() is called on the same generator.  To work around
10406   // this, we initialize a new sfmt64 generator with this function when
10407   // necessary, and stick to genrand_uint32() calls with the main generator.
10408   uint32_t init_arr[4];
10409   uint32_t uii;
10410   for (uii = 0; uii < 4; uii++) {
10411     init_arr[uii] = sfmt_genrand_uint32(sfmt32);
10412   }
10413   sfmt_init_by_array(sfmt64, init_arr, 4);
10414 }
10415 
generate_perm1_interleaved(uint32_t tot_ct,uint32_t set_ct,uintptr_t perm_idx,uintptr_t perm_ct,uintptr_t * perm_buf)10416 void generate_perm1_interleaved(uint32_t tot_ct, uint32_t set_ct, uintptr_t perm_idx, uintptr_t perm_ct, uintptr_t* perm_buf) {
10417   uintptr_t tot_ctl = BITCT_TO_WORDCT(tot_ct);
10418   uintptr_t tot_rem = tot_ct & (BITCT - 1);
10419   uint32_t tot_quotient = (uint32_t)(0x100000000LLU / tot_ct);
10420   uint32_t upper_bound = tot_ct * tot_quotient - 1;
10421   uintptr_t uljj = perm_ct - perm_idx;
10422   uint32_t totq_preshift;
10423   uint64_t totq_magic;
10424   uint32_t totq_postshift;
10425   uint32_t totq_incr;
10426   uintptr_t* pbptr;
10427   uint32_t num_set;
10428   uint32_t urand;
10429   uintptr_t ulii;
10430   // seeing as how we're gonna divide by the same number a billion times or so,
10431   // it just might be worth optimizing that division...
10432   magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
10433   if (set_ct * 2 < tot_ct) {
10434     for (ulii = 0; ulii < tot_ctl; ulii++) {
10435       fill_ulong_zero(uljj, &(perm_buf[perm_idx + (ulii * perm_ct)]));
10436     }
10437     for (; perm_idx < perm_ct; perm_idx++) {
10438       pbptr = &(perm_buf[perm_idx]);
10439       for (num_set = 0; num_set < set_ct; num_set++) {
10440 	do {
10441 	  do {
10442 	    urand = sfmt_genrand_uint32(&g_sfmt);
10443 	  } while (urand > upper_bound);
10444 	  // this is identical to ulii = urand / tot_quotient
10445 	  ulii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
10446 	  uljj = ulii / BITCT;
10447 	  ulii &= (BITCT - 1);
10448 	} while ((pbptr[uljj * perm_ct] >> ulii) & 1);
10449 	pbptr[uljj * perm_ct] |= (ONELU << ulii);
10450       }
10451     }
10452   } else {
10453     for (ulii = 0; ulii < tot_ctl; ulii++) {
10454       fill_ulong_one(uljj, &(perm_buf[perm_idx + (ulii * perm_ct)]));
10455     }
10456     // "set" has reversed meaning here
10457     set_ct = tot_ct - set_ct;
10458     for (; perm_idx < perm_ct; perm_idx++) {
10459       pbptr = &(perm_buf[perm_idx]);
10460       for (num_set = 0; num_set < set_ct; num_set++) {
10461 	do {
10462 	  do {
10463 	    urand = sfmt_genrand_uint32(&g_sfmt);
10464 	  } while (urand > upper_bound);
10465 	  ulii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
10466 	  uljj = ulii / BITCT;
10467 	  ulii &= (BITCT - 1);
10468 	} while (!((pbptr[uljj * perm_ct] >> ulii) & 1));
10469 	pbptr[uljj * perm_ct] &= ~(ONELU << ulii);
10470       }
10471     }
10472     if (tot_rem) {
10473       uljj = (~ZEROLU) >> (BITCT - tot_rem);
10474       pbptr = &(perm_buf[(tot_ctl - 1) * perm_ct + perm_idx]);
10475       for (ulii = perm_idx; ulii < perm_ct; ulii++) {
10476 	*pbptr &= uljj;
10477 	pbptr++;
10478       }
10479     }
10480   }
10481 }
10482 
cubic_real_roots(double coef_a,double coef_b,double coef_c,double * solutions)10483 uint32_t cubic_real_roots(double coef_a, double coef_b, double coef_c, double* solutions) {
10484   // Analytically finds all real roots of x^3 + ax^2 + bx + c, saving them in
10485   // solutions[] (sorted from smallest to largest), and returning the count.
10486   // Multiple roots are only returned/counted once.
10487   // Additional research into numerical stability may be in order here.
10488   double a2 = coef_a * coef_a;
10489   double qq = (a2 - 3 * coef_b) * (1.0 / 9.0);
10490   double rr = (2 * a2 * coef_a - 9 * coef_a * coef_b + 27 * coef_c) * (1.0 / 54.0);
10491   double r2 = rr * rr;
10492   double q3 = qq * qq * qq;
10493   double adiv3 = coef_a * (1.0 / 3.0);
10494   double sq;
10495   double dxx;
10496   if (r2 < q3) {
10497     // three real roots
10498     sq = sqrt(qq);
10499     dxx = acos(rr / (qq * sq)) * (1.0 / 3.0);
10500     sq *= -2;
10501     solutions[0] = sq * cos(dxx) - adiv3;
10502     solutions[1] = sq * cos(dxx + (2.0 * PI / 3.0)) - adiv3;
10503     solutions[2] = sq * cos(dxx - (2.0 * PI / 3.0)) - adiv3;
10504     // now sort and check for within-epsilon equality
10505     if (solutions[0] > solutions[1]) {
10506       dxx = solutions[0];
10507       solutions[0] = solutions[1];
10508       if (dxx > solutions[2]) {
10509         solutions[1] = solutions[2];
10510 	solutions[2] = dxx;
10511       } else {
10512 	solutions[1] = dxx;
10513       }
10514       if (solutions[0] > solutions[1]) {
10515 	dxx = solutions[0];
10516 	solutions[0] = solutions[1];
10517 	solutions[1] = dxx;
10518       }
10519     } else if (solutions[1] > solutions[2]) {
10520       dxx = solutions[1];
10521       solutions[1] = solutions[2];
10522       solutions[2] = dxx;
10523     }
10524     if (solutions[1] - solutions[0] < EPSILON) {
10525       solutions[1] = solutions[2];
10526       return (solutions[1] - solutions[0] < EPSILON)? 1 : 2;
10527     }
10528     return (solutions[2] - solutions[1] < EPSILON)? 2 : 3;
10529   }
10530   dxx = -pow(fabs(rr) + sqrt(r2 - q3), 1.0 / 3.0);
10531   if (dxx == 0.0) {
10532     solutions[0] = -adiv3;
10533     return 1;
10534   }
10535   if (rr < 0.0) {
10536     dxx = -dxx;
10537   }
10538   sq = qq / dxx;
10539   solutions[0] = dxx + sq - adiv3;
10540   // use of regular epsilon here has actually burned us
10541   if (fabs(dxx - sq) >= (EPSILON * 8)) {
10542     return 1;
10543   }
10544   if (dxx >= 0.0) {
10545     solutions[1] = solutions[0];
10546     solutions[0] = -dxx - adiv3;
10547   } else {
10548     solutions[1] = -dxx - adiv3;
10549   }
10550   return 2;
10551 }
10552 
join_threads(pthread_t * threads,uint32_t ctp1)10553 void join_threads(pthread_t* threads, uint32_t ctp1) {
10554   if (!(--ctp1)) {
10555     return;
10556   }
10557 #ifdef _WIN32
10558   WaitForMultipleObjects(ctp1, threads, 1, INFINITE);
10559   for (uint32_t uii = 0; uii < ctp1; ++uii) {
10560     CloseHandle(threads[uii]);
10561   }
10562 #else
10563   for (uint32_t uii = 0; uii < ctp1; uii++) {
10564     pthread_join(threads[uii], nullptr);
10565   }
10566 #endif
10567 }
10568 
#ifdef _WIN32
int32_t spawn_threads(pthread_t* threads, unsigned (__stdcall *start_routine)(void*), uintptr_t ct)
#else
int32_t spawn_threads(pthread_t* threads, void* (*start_routine)(void*), uintptr_t ct)
#endif
{
  // Launches ct - 1 worker threads running start_routine, passing thread
  // indices 1..ct-1 (cast to void*) as the argument; index 0 is reserved for
  // the calling thread.  Returns 0 on success.  On failure, joins the
  // already-launched workers via join_threads() and returns -1.
  uintptr_t ulii;
  if (ct == 1) {
    return 0;
  }
  for (ulii = 1; ulii < ct; ulii++) {
#ifdef _WIN32
    // requests a minimal (4096-byte) stack for each worker
    threads[ulii - 1] = (HANDLE)_beginthreadex(nullptr, 4096, start_routine, (void*)ulii, 0, nullptr);
    if (!threads[ulii - 1]) {
      join_threads(threads, ulii);
      return -1;
    }
#else
    if (pthread_create(&(threads[ulii - 1]), nullptr, start_routine, (void*)ulii)) {
      join_threads(threads, ulii);
      return -1;
    }
#endif
  }
  return 0;
}
10595 
10596 // Okay, it's time to bite the bullet and stop creating and destroying threads
10597 // like crazy, at least in the small-block-size GRM calculation; Intel
10598 // MKL-powered GCTA 1.24 blew away our code on the NIH 512-core test machine
10599 // when the maximum number of threads was used.  Mostly because threads were
10600 // actually costing much more in creation/destruction time than they saved;
10601 // much better wall-clock times would have resulted from manually setting
10602 // --threads to a low number.  That's not cool.
10603 //
10604 // New framework:
10605 // * On all operating systems, g_is_last_thread_block indicates whether all
10606 //   threads should terminate upon completion of the current block.  (Initially
10607 //   had this volatile, then realized that the presence of the sync-wait should
10608 //   be enough to force the global variable to be reread.)
10609 // * On Linux and OS X, if we aren't dealing with the final block,
10610 //   spawn_threads2() also reinitializes g_thread_active_ct.
10611 // * On Linux and OS X, spawn_threads2() checks if g_thread_mutex_initialized
//   is set.  If not, it is set; then g_thread_sync_mutex,
10613 //   g_thread_cur_block_done_condvar and g_thread_start_next_condvar are
10614 //   initialized, then threads are launched.
10615 //   If it has, pthread_cond_broadcast() acts on g_thread_start_next_condvar.
10616 // * On Windows, spawn_threads2() checks if g_thread_mutex_initialized is set.
10617 //   If it has not, it, along with g_thread_start_next_event[] and
10618 //   g_thread_cur_block_done_events[], are initialized, then the threads are
10619 //   launched.  If it has, SetEvent() acts on g_thread_start_next_event[].
10620 //   (It used to act on only one event; then I realized that safely dealing
10621 //   with a manual-reset event could be a pain if the first thread finishes
10622 //   before the last one wakes up...)
10623 // * Thread functions are expected to be of the form
10624 //     THREAD_RET_TYPE function_name(void* arg) {
10625 //       uintptr_t tidx = (uintptr_t)arg;
10626 //       ...
10627 //       while (1) {
10628 //         ... // process current block
10629 //         if ((!tidx) || g_is_last_thread_block) {
10630 //           THREAD_RETURN;
10631 //         }
10632 //         THREAD_BLOCK_FINISH(tidx);
10633 //       }
10634 //     }
10635 // * On Linux and OS X, THREAD_BLOCK_FINISH() acquires a mutex, decrements
10636 //   g_thread_active_ct, calls pthread_cond_signal() on
10637 //   g_thread_cur_block_done_condvar iff g_thread_active_ct is now zero, then
10638 //   unconditionally calls pthread_cond_wait on g_thread_start_next_condvar and
10639 //   the mutex.
10640 // * On Windows, THREAD_BLOCK_FINISH() calls SetEvent() on
10641 //   g_thread_cur_block_done_events[tidx - 1], then waits on
10642 //   g_thread_start_next_event[tidx - 1].
10643 // * If the termination variable is set, join_threads2() waits for all threads
10644 //   to complete, then cleans up all multithreading objects.  Otherwise, on
10645 //   Linux and OS X, it acquires the mutex and calls pthread_cond_wait() on
10646 //   g_thread_cur_block_done_condvar and the mutex; and on Windows, it calls
10647 //   WaitForMultipleObjects() on g_thread_cur_block_done_events[].
10648 //   WaitForMultipleObjects has a 64 object limit, and for now it doesn't seem
10649 //   too important to use a for loop to handle more objects?... well, we can
10650 //   add that if anyone wants it, but for now the Windows thread limit is 65
10651 //   (the main thread isn't part of the wait).
10652 //
10653 // This is only very slightly better than the original approach on my old
10654 // MacBook Pro (since threading overhead was never high to begin with, there
10655 // being only 2 cores...), but the impact should be more noticeable on heavily
10656 // multicore machines.
10657 //
10658 // The next performance improvement to make is double-buffering; tricky to
10659 // estimate how much (if any) "consumption" the main I/O thread should be
10660 // doing, though, so it may want a job queue to go with it.
10661 
// Incremented once per reactivated block; workers compare against a saved
// copy in THREAD_BLOCK_FINISH() to filter out spurious condvar wakeups.
uintptr_t g_thread_spawn_ct;
// Nonzero when the current block is the final one, telling workers to
// terminate after finishing it (see the framework description above).
uint32_t g_is_last_thread_block = 0;
#ifdef _WIN32
// Per-worker auto-reset events, indexed by tidx - 1: "start next block" and
// "current block done" signals.
HANDLE g_thread_start_next_event[MAX_THREADS];
HANDLE g_thread_cur_block_done_events[MAX_THREADS];
#else
// Mutex/condvar trio implementing the same block synchronization on POSIX
// systems.
static pthread_mutex_t g_thread_sync_mutex;
static pthread_cond_t g_thread_cur_block_done_condvar;
static pthread_cond_t g_thread_start_next_condvar;
// Number of workers still processing the current block.
uint32_t g_thread_active_ct;
10672 
THREAD_BLOCK_FINISH(uintptr_t tidx)10673 void THREAD_BLOCK_FINISH(uintptr_t tidx) {
10674   uintptr_t initial_spawn_ct = g_thread_spawn_ct;
10675   pthread_mutex_lock(&g_thread_sync_mutex);
10676   if (!(--g_thread_active_ct)) {
10677     pthread_cond_signal(&g_thread_cur_block_done_condvar);
10678   }
10679   while (g_thread_spawn_ct == initial_spawn_ct) {
10680     // spurious wakeup guard
10681     pthread_cond_wait(&g_thread_start_next_condvar, &g_thread_sync_mutex);
10682   }
10683   pthread_mutex_unlock(&g_thread_sync_mutex);
10684 }
10685 #endif
10686 static uint32_t g_thread_mutex_initialized = 0;
10687 
join_threads2(pthread_t * threads,uint32_t ctp1,uint32_t is_last_block)10688 void join_threads2(pthread_t* threads, uint32_t ctp1, uint32_t is_last_block) {
10689   uint32_t uii;
10690   if (!(--ctp1)) {
10691     if (is_last_block) {
10692       // allow another multithreaded function to be called later
10693       g_thread_mutex_initialized = 0;
10694     }
10695     return;
10696   }
10697 #ifdef _WIN32
10698   if (!is_last_block) {
10699     WaitForMultipleObjects(ctp1, g_thread_cur_block_done_events, 1, INFINITE);
10700   } else {
10701     WaitForMultipleObjects(ctp1, threads, 1, INFINITE);
10702     for (uii = 0; uii < ctp1; uii++) {
10703       CloseHandle(threads[uii]);
10704       CloseHandle(g_thread_start_next_event[uii]);
10705       CloseHandle(g_thread_cur_block_done_events[uii]);
10706     }
10707     g_thread_mutex_initialized = 0;
10708   }
10709 #else
10710   if (!is_last_block) {
10711     pthread_mutex_lock(&g_thread_sync_mutex);
10712     while (g_thread_active_ct) {
10713       pthread_cond_wait(&g_thread_cur_block_done_condvar, &g_thread_sync_mutex);
10714     }
10715     // keep mutex until next block loaded
10716   } else {
10717     for (uii = 0; uii < ctp1; uii++) {
10718       pthread_join(threads[uii], nullptr);
10719     }
10720     // slightly inefficient if there are multiple multithreaded commands being
10721     // run, but if different commands require different numbers of threads,
10722     // optimizing this sort of thing away could introduce bugs...
10723     pthread_mutex_destroy(&g_thread_sync_mutex);
10724     pthread_cond_destroy(&g_thread_cur_block_done_condvar);
10725     pthread_cond_destroy(&g_thread_start_next_condvar);
10726     g_thread_mutex_initialized = 0;
10727   }
10728 #endif
10729 }
10730 
#ifdef _WIN32
int32_t spawn_threads2(pthread_t* threads, unsigned (__stdcall *start_routine)(void*), uintptr_t ct, uint32_t is_last_block)
#else
int32_t spawn_threads2(pthread_t* threads, void* (*start_routine)(void*), uintptr_t ct, uint32_t is_last_block)
#endif
{
  // Launches (first call) or reactivates (subsequent calls) ct - 1 worker
  // threads for the next block, per the persistent-thread-pool design
  // described above.  Returns 0 on success, -1 on thread/sync-object
  // creation failure (after cleaning up whatever was created).
  uintptr_t ulii;
  // this needs to go before the ct == 1 check since start_routine() might need
  // it
  if (g_is_last_thread_block != is_last_block) {
    // might save us an unnecessary memory write that confuses the cache
    // coherency logic?
    g_is_last_thread_block = is_last_block;
  }
#ifdef _WIN32
  if (!g_thread_mutex_initialized) {
    g_thread_spawn_ct = 0;
    g_thread_mutex_initialized = 1;
    if (ct == 1) {
      return 0;
    }
    // auto-reset events, initially unsignaled
    for (ulii = 1; ulii < ct; ulii++) {
      g_thread_start_next_event[ulii - 1] = CreateEvent(nullptr, FALSE, FALSE, nullptr);
      g_thread_cur_block_done_events[ulii - 1] = CreateEvent(nullptr, FALSE, FALSE, nullptr);
    }
    for (ulii = 1; ulii < ct; ulii++) {
      threads[ulii - 1] = (HANDLE)_beginthreadex(nullptr, 4096, start_routine, (void*)ulii, 0, nullptr);
      if (!threads[ulii - 1]) {
	// creation failed partway: wait out already-started workers, then
	// release every handle created above before reporting failure
	if (ulii > 1) {
	  join_threads2(threads, ulii, is_last_block);
	  if (!is_last_block) {
	    for (uintptr_t uljj = 0; uljj < ulii - 1; ++uljj) {
	      CloseHandle(threads[uljj]);
	    }
	  }
	}
	if ((!is_last_block) || (ulii == 1)) {
	  for (uint32_t uii = 0; uii < ct - 1; ++uii) {
	    CloseHandle(g_thread_start_next_event[uii]);
	    CloseHandle(g_thread_cur_block_done_events[uii]);
	  }
	  g_thread_mutex_initialized = 0;
	}
	return -1;
      }
    }
  } else {
    // pool already live: signal each worker to start the next block
    g_thread_spawn_ct++;
    for (ulii = 1; ulii < ct; ulii++) {
      SetEvent(g_thread_start_next_event[ulii - 1]);
    }
  }
#else
  if (!is_last_block) {
    g_thread_active_ct = ct - 1;
  }
  if (!g_thread_mutex_initialized) {
    g_thread_spawn_ct = 0; // tidx 0 may need to know modulus
    g_thread_mutex_initialized = 1;
    if (ct == 1) {
      return 0;
    }
    if (pthread_mutex_init(&g_thread_sync_mutex, nullptr) ||
        pthread_cond_init(&g_thread_cur_block_done_condvar, nullptr) ||
        pthread_cond_init(&g_thread_start_next_condvar, nullptr)) {
      return -1;
    }
    for (ulii = 1; ulii < ct; ulii++) {
      if (pthread_create(&(threads[ulii - 1]), nullptr, start_routine, (void*)ulii)) {
	// creation failed partway: join/cancel already-started workers, then
	// tear down the sync objects before reporting failure
	if (ulii > 1) {
	  join_threads2(threads, ulii, is_last_block);
	  if (!is_last_block) {
	    for (uintptr_t uljj = 0; uljj < ulii - 1; ++uljj) {
	      pthread_cancel(threads[uljj]);
	    }
	  }
	}
	if ((!is_last_block) || (ulii == 1)) {
	  pthread_mutex_destroy(&g_thread_sync_mutex);
	  pthread_cond_destroy(&g_thread_cur_block_done_condvar);
	  pthread_cond_destroy(&g_thread_start_next_condvar);
	  g_thread_mutex_initialized = 0;
	}
	return -1;
      }
    }
  } else {
    g_thread_spawn_ct++;
    if (ct == 1) {
      return 0;
    }
    // still holding mutex
    pthread_mutex_unlock(&g_thread_sync_mutex);
    pthread_cond_broadcast(&g_thread_start_next_condvar);
  }
#endif
  return 0;
}
10829 
10830 sfmt_t** g_sfmtp_arr;
10831 
bigstack_init_sfmtp(uint32_t thread_ct)10832 uint32_t bigstack_init_sfmtp(uint32_t thread_ct) {
10833   uint32_t uibuf[4];
10834   uint32_t tidx;
10835   uint32_t uii;
10836   g_sfmtp_arr = (sfmt_t**)bigstack_alloc(thread_ct * sizeof(intptr_t));
10837   if (!g_sfmtp_arr) {
10838     return 1;
10839   }
10840   g_sfmtp_arr[0] = &g_sfmt;
10841   if (thread_ct > 1) {
10842     for (tidx = 1; tidx < thread_ct; tidx++) {
10843       g_sfmtp_arr[tidx] = (sfmt_t*)bigstack_alloc(sizeof(sfmt_t));
10844       if (!g_sfmtp_arr[tidx]) {
10845 	return 1;
10846       }
10847       for (uii = 0; uii < 4; uii++) {
10848 	uibuf[uii] = sfmt_genrand_uint32(&g_sfmt);
10849       }
10850       sfmt_init_by_array(g_sfmtp_arr[tidx], uibuf, 4);
10851     }
10852   }
10853   return 0;
10854 }
10855