1 // This file is part of PLINK 1.90, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation, either version 3 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink_common.h"
19
20 // #include "pigz.h"
21
// no leading \n since this is used in LOGPRINTFWW expressions
const char g_errstr_fopen[] = "Error: Failed to open %s.\n";

// command-line usage synopsis template (consumers not visible in this file)
const char g_cmdline_format_str[] = "\n " PROG_NAME_STR " <input flag(s)...> [command flag(s)...] [other flag(s)...]\n " PROG_NAME_STR " --help [flag name(s)...]\n\n";

// general-purpose scratch buffer, e.g. for line reading (see get_next_noncomment())
char g_textbuf[TEXTBUF_SIZE];
28
// note that \xxx character constants are interpreted in octal.
// technically no need to represent 0-31, but 64 extra bytes of data is
// probably cheaper than the code to subtract 32 everywhere.
// This is a table of 256 null-terminated single-character strings; the
// string for character code k starts at &(g_one_char_strs[2 * k]).
const char g_one_char_strs[] = "\0\0\1\0\2\0\3\0\4\0\5\0\6\0\7\0\10\0\11\0\12\0\13\0\14\0\15\0\16\0\17\0\20\0\21\0\22\0\23\0\24\0\25\0\26\0\27\0\30\0\31\0\32\0\33\0\34\0\35\0\36\0\37\0\40\0\41\0\42\0\43\0\44\0\45\0\46\0\47\0\50\0\51\0\52\0\53\0\54\0\55\0\56\0\57\0\60\0\61\0\62\0\63\0\64\0\65\0\66\0\67\0\70\0\71\0\72\0\73\0\74\0\75\0\76\0\77\0\100\0\101\0\102\0\103\0\104\0\105\0\106\0\107\0\110\0\111\0\112\0\113\0\114\0\115\0\116\0\117\0\120\0\121\0\122\0\123\0\124\0\125\0\126\0\127\0\130\0\131\0\132\0\133\0\134\0\135\0\136\0\137\0\140\0\141\0\142\0\143\0\144\0\145\0\146\0\147\0\150\0\151\0\152\0\153\0\154\0\155\0\156\0\157\0\160\0\161\0\162\0\163\0\164\0\165\0\166\0\167\0\170\0\171\0\172\0\173\0\174\0\175\0\176\0\177\0\200\0\201\0\202\0\203\0\204\0\205\0\206\0\207\0\210\0\211\0\212\0\213\0\214\0\215\0\216\0\217\0\220\0\221\0\222\0\223\0\224\0\225\0\226\0\227\0\230\0\231\0\232\0\233\0\234\0\235\0\236\0\237\0\240\0\241\0\242\0\243\0\244\0\245\0\246\0\247\0\250\0\251\0\252\0\253\0\254\0\255\0\256\0\257\0\260\0\261\0\262\0\263\0\264\0\265\0\266\0\267\0\270\0\271\0\272\0\273\0\274\0\275\0\276\0\277\0\300\0\301\0\302\0\303\0\304\0\305\0\306\0\307\0\310\0\311\0\312\0\313\0\314\0\315\0\316\0\317\0\320\0\321\0\322\0\323\0\324\0\325\0\326\0\327\0\330\0\331\0\332\0\333\0\334\0\335\0\336\0\337\0\340\0\341\0\342\0\343\0\344\0\345\0\346\0\347\0\350\0\351\0\352\0\353\0\354\0\355\0\356\0\357\0\360\0\361\0\362\0\363\0\364\0\365\0\366\0\367\0\370\0\371\0\372\0\373\0\374\0\375\0\376\0\377";
// index 96 = 2 * 48, i.e. both default missing-genotype codes are the string "0"
const char* g_missing_geno_ptr = &(g_one_char_strs[96]);
const char* g_output_missing_geno_ptr = &(g_one_char_strs[96]);
35
// size of the most recent failed allocation attempt, for error reporting
uintptr_t g_failed_alloc_attempt_size = 0;

// random number generator state (sfmt_t is presumably the bundled SFMT
// library's state type, pulled in via plink_common.h — confirm there)
sfmt_t g_sfmt;

// main log file handle; nullptr until logging is initialized elsewhere
FILE* g_logfile = nullptr;

// shared buffer for pre-formatted log messages (see logprintb()/logerrprintb())
char g_logbuf[MAXLINELEN * 2];

// nonzero = debug mode: logstr() flushes after every successful write
uint32_t g_debug_on = 0;
// set once a log write fails, so failures can be handled/reported differently
uint32_t g_log_failed = 0;
// global thread count (initialized outside this file)
uint32_t g_thread_ct;
47
aligned_malloc(uintptr_t size,uintptr_t ** aligned_pp)48 uint32_t aligned_malloc(uintptr_t size, uintptr_t** aligned_pp) {
49 #if defined __LP64__ && !defined __APPLE__
50 // Avoid random segfaults on 64-bit machines which have 8-byte- instead of
51 // 16-byte-aligned malloc(). (Slightly different code is needed if malloc()
52 // does not even guarantee 8-byte alignment.)
53 uintptr_t* malloc_ptr = (uintptr_t*)malloc(size + VEC_BYTES);
54 if (!malloc_ptr) {
55 g_failed_alloc_attempt_size = size + VEC_BYTES;
56 return 1;
57 }
58 *aligned_pp = (uintptr_t*)((((uintptr_t)malloc_ptr) + VEC_BYTES) & (~(VEC_BYTES_M1 * ONELU)));
59 (*aligned_pp)[-1] = (uintptr_t)malloc_ptr;
60 #else
61 // no SSE2 concerns here
62 *aligned_pp = (uintptr_t*)malloc(size);
63 if (!(*aligned_pp)) {
64 g_failed_alloc_attempt_size = size;
65 return 1;
66 }
67 #endif
68 return 0;
69 }
70
void aligned_free(uintptr_t* aligned_pp) {
  // Releases a block obtained from aligned_malloc().
#if defined __LP64__ && !defined __APPLE__
  // the original malloc() pointer is stored immediately before the block
  free((uintptr_t*)(aligned_pp[-1]));
#else
  free(aligned_pp);
#endif
}
78
uint32_t push_ll_str(const char* ss, Ll_str** ll_stack_ptr) {
  // Pushes a copy of ss onto the given linked-list string stack.  Returns 1
  // on allocation failure (recording the attempt size), 0 on success.
  const uintptr_t str_bytes = strlen(ss) + 1;
  const uintptr_t alloc_bytes = sizeof(Ll_str) + str_bytes;
  Ll_str* new_entry = (Ll_str*)malloc(alloc_bytes);
  if (!new_entry) {
    g_failed_alloc_attempt_size = alloc_bytes;
    return 1;
  }
  memcpy(new_entry->ss, ss, str_bytes);
  new_entry->next = *ll_stack_ptr;
  *ll_stack_ptr = new_entry;
  return 0;
}
91
void logstr(const char* ss) {
  // Appends ss to the log file.  In normal mode, a write failure produces a
  // one-time stderr warning and sets g_log_failed (though later calls may
  // warn again, since ferror() remains set).  In debug mode, the log is
  // flushed after every write, and once a write fails all further log text
  // is mirrored to stderr instead.
  if (!g_debug_on) {
    fputs(ss, g_logfile);
    if (ferror(g_logfile)) {
      putc_unlocked('\n', stdout);
      fflush(stdout);
      fprintf(stderr, "Warning: Logging failure on:\n%s\nFurther logging will not be attempted in this run.\n", ss);
      g_log_failed = 1;
    }
  } else {
    if (g_log_failed) {
      // an earlier debug-mode write failed; dump to stderr from now on
      fflush(stdout);
      fputs(ss, stderr);
    } else {
      fputs(ss, g_logfile);
      if (ferror(g_logfile)) {
        putc_unlocked('\n', stdout);
        fflush(stdout);
        fprintf(stderr, "Error: Debug logging failure. Dumping to stderr:\n%s", ss);
        g_log_failed = 1;
      } else {
        // flush eagerly so the log is intact even after a crash
        fflush(g_logfile);
      }
    }
  }
}
118
void logprint(const char* ss) {
  // Writes ss to both the log file and stdout.
  logstr(ss);
  fputs(ss, stdout);
}
123
void logerrprint(const char* ss) {
  // Writes ss to the log file and stderr, flushing stdout first so terminal
  // output ordering is preserved.
  logstr(ss);
  fflush(stdout);
  fputs(ss, stderr);
}
129
void logprintb() {
  // Writes the pre-formatted contents of g_logbuf to the log and stdout.
  logstr(g_logbuf);
  fputs(g_logbuf, stdout);
}
134
void logerrprintb() {
  // Writes the pre-formatted contents of g_logbuf to the log and stderr.
  logstr(g_logbuf);
  fflush(stdout);
  fputs(g_logbuf, stderr);
}
140
void wordwrap(uint32_t suffix_len, char* ss) {
  // Input: A null-terminated string with no intermediate newlines. If
  //   suffix_len is zero, there should be a terminating \n; otherwise,
  //   the last character should be a space.
  // Effect: Spaces are replaced with newlines in a manner that plays well
  //   with 80 column terminal windows. (Multi-space blocks are never
  //   collapsed.)
  char* token_start = ss;
  char* line_end = &(ss[79]);
  char* token_end;
  while (1) {
    // skip spaces preceding the next token
    while (*token_start == ' ') {
      token_start++;
    }
    if (token_start > line_end) {
      // spaces run past the 80-column boundary: insert a newline at each
      // boundary until the next token's line is reached
      do {
        *line_end = '\n';
        line_end = &(line_end[80]);
      } while (token_start > line_end);
    }
    token_end = strchr(token_start, ' ');
    if (!token_end) {
      // no more spaces: this is the final token
      if (&(token_start[79]) == line_end) {
        // token already starts its own line; nothing more can be done
        return;
      }
      token_end = strchr(token_start, '\0');
      if (!suffix_len) {
        if (token_end <= &(line_end[1])) {
          // okay if end-of-string is one past the end, because function
          // assumes last character is \n in suffix_len == 0 case
          assert(token_end[-1] == '\n');
          return;
        }
      } else {
        if (&(token_end[suffix_len]) <= line_end) {
          return;
        }
        // because of terminal space assumption, token_start actually points
        // to the end of the string
        assert(token_start[-1] == ' ');
      }
      // final token (plus suffix) doesn't fit: break before it
      token_start[-1] = '\n';
      return;
    }
    if (token_end > line_end) {
      // current token crosses the line boundary
      if (&(token_start[79]) != line_end) {
        // it may fit on a fresh line; break before it
        token_start[-1] = '\n';
        line_end = &(token_start[79]);
        if (token_end > line_end) {
          // single really long token, can't do anything beyond putting it on
          // its own line
          *token_end = '\n';
          line_end = &(token_end[80]);
        }
      } else {
        // single really long token, *and* previous token was either
        // nonexistent or long
        *token_end = '\n';
        line_end = &(token_end[80]);
      }
    }
    token_start = &(token_end[1]);
  }
}
205
void wordwrapb(uint32_t suffix_len) {
  // wordwrap() applied to the shared log buffer g_logbuf.
  wordwrap(suffix_len, g_logbuf);
}
209
fopen_checked(const char * fname,const char * mode,FILE ** target_ptr)210 int32_t fopen_checked(const char* fname, const char* mode, FILE** target_ptr) {
211 *target_ptr = fopen(fname, mode);
212 if (!(*target_ptr)) {
213 LOGERRPRINTFWW(g_errstr_fopen, fname);
214 return -1;
215 }
216 return 0;
217 }
218
int32_t fwrite_checked(const void* buf, size_t len, FILE* outfile) {
  // Writes len bytes from buf, splitting huge requests into sub-2GB chunks:
  // OS X can't perform 2GB+ writes, and since a typical disk block is 4kb,
  // 0x7ffff000 is the largest sensible chunk size below that limit.
  // Returns nonzero iff the stream's error indicator is set afterwards.
  const unsigned char* write_ptr = (const unsigned char*)buf;
  while (len > 0x7ffff000) {
    fwrite(write_ptr, 1, 0x7ffff000, outfile);
    write_ptr = &(write_ptr[0x7ffff000]);
    len -= 0x7ffff000;
  }
  fwrite(write_ptr, 1, len, outfile);
  return ferror(outfile);
}
231
int32_t gzopen_read_checked(const char* fname, gzFile* gzf_ptr) {
  // Opens fname for (possibly gzip-compressed) reading, logging an error on
  // failure.  Returns 0 on success, RET_OPEN_FAIL or RET_NOMEM otherwise.
  *gzf_ptr = gzopen(fname, FOPEN_RB);
  if (!(*gzf_ptr)) {
    LOGERRPRINTFWW(g_errstr_fopen, fname);
    return RET_OPEN_FAIL;
  }
  // enlarge zlib's internal buffer (default 8K per the zlib manual) to
  // reduce per-read overhead
  if (gzbuffer(*gzf_ptr, 131072)) {
    return RET_NOMEM;
  }
  return 0;
}
243
// manually managed, very large stack
// allocations grow upward from g_bigstack_base (bigstack_alloc) and
// downward from g_bigstack_end (bigstack_end_alloc_presized)
unsigned char* g_bigstack_base;
unsigned char* g_bigstack_end;
247
bigstack_alloc(uintptr_t size)248 unsigned char* bigstack_alloc(uintptr_t size) {
249 unsigned char* alloc_ptr;
250 size = round_up_pow2(size, CACHELINE);
251 if (bigstack_left() < size) {
252 g_failed_alloc_attempt_size = size;
253 return nullptr;
254 }
255 alloc_ptr = g_bigstack_base;
256 g_bigstack_base += size;
257 return alloc_ptr;
258 }
259
bigstack_shrink_top(const void * rebase,uintptr_t new_size)260 void bigstack_shrink_top(const void* rebase, uintptr_t new_size) {
261 uintptr_t freed_bytes = ((uintptr_t)(g_bigstack_base - ((unsigned char*)rebase))) - round_up_pow2(new_size, CACHELINE);
262 g_bigstack_base -= freed_bytes;
263 }
264
bigstack_end_alloc_presized(uintptr_t size)265 unsigned char* bigstack_end_alloc_presized(uintptr_t size) {
266 assert(!(size & END_ALLOC_CHUNK_M1));
267 uintptr_t cur_bigstack_left = bigstack_left();
268 if (size > cur_bigstack_left) {
269 g_failed_alloc_attempt_size = size;
270 return nullptr;
271 } else {
272 g_bigstack_end -= size;
273 return g_bigstack_end;
274 }
275 }
276
uint32_t match_upper(const char* ss, const char* fixed_str) {
  // Case-insensitive comparison of ss against the nonempty all-caps string
  // fixed_str; ss must end exactly where fixed_str does.  Returns 1 on
  // match, 0 otherwise.  (& 0xdf maps lowercase ascii letters to uppercase.)
  const char* read_iter = ss;
  const char* fixed_iter = fixed_str;
  char cur_fixed = *fixed_iter++;
  do {
    const unsigned char cur_read = (unsigned char)(*read_iter++);
    if ((cur_read & 0xdf) != ((unsigned char)cur_fixed)) {
      return 0;
    }
    cur_fixed = *fixed_iter++;
  } while (cur_fixed);
  return !(*read_iter);
}
287
uint32_t match_upper_counted(const char* ss, const char* fixed_str, uint32_t ct) {
  // Case-insensitive comparison of the first ct bytes of ss against the
  // all-caps fixed_str (ct must be nonzero).  Returns 1 on match.
  const char* read_iter = ss;
  const char* fixed_iter = fixed_str;
  do {
    const unsigned char cur_read = (unsigned char)(*read_iter++);
    if ((cur_read & 0xdf) != ((unsigned char)(*fixed_iter++))) {
      return 0;
    }
  } while (--ct);
  return 1;
}
296
297 #ifdef __LP64__
static inline uint32_t scan_uint_capped_finish(const char* ss, uint64_t cap, uint32_t* valp) {
  // Continues a decimal parse started by a scan_*_capped() entry point:
  // *valp holds the digits read so far, ss points at the next character.
  // Digits are consumed two at a time (a little bit of unrolling seems to
  // help).  Returns 1 if the value exceeds cap, else 0 with *valp updated.
  uint64_t val = *valp;
  while (1) {
    const uint64_t d1 = (uint64_t)((unsigned char)(*ss++)) - 48;
    if (d1 >= 10) {
      // non-digit: parse complete
      break;
    }
    const uint64_t d2 = (uint64_t)((unsigned char)(*ss++)) - 48;
    if (d2 >= 10) {
      // odd trailing digit
      val = val * 10 + d1;
      if (val > cap) {
        return 1;
      }
      break;
    }
    val = val * 100 + d1 * 10 + d2;
    if (val > cap) {
      return 1;
    }
  }
  *valp = val;
  return 0;
}
323
uint32_t scan_posint_capped(const char* ss, uint64_t cap, uint32_t* valp) {
  // Parses a positive decimal integer in [1, cap]; a single leading '+'
  // (ascii 43, so 43 - 48 wraps to 0xfffffffbU) and leading zeroes are
  // permitted.  Assumes the first character is nonspace.  Returns 1 on
  // failure (*valp contents are then unspecified).
  uint32_t cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
  if (cur_val >= 10) {
    // permit leading '+', but not '++' or '+-'
    if (cur_val != 0xfffffffbU) {
      return 1;
    }
    cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (cur_val >= 10) {
      return 1;
    }
  }
  // skip leading zeroes; at least one nonzero digit is required
  while (!cur_val) {
    cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (cur_val >= 10) {
      return 1;
    }
  }
  *valp = cur_val;
  return scan_uint_capped_finish(ss, cap, valp);
}
345
uint32_t scan_uint_capped(const char* ss, uint64_t cap, uint32_t* valp) {
  // Parses a nonnegative decimal integer in [0, cap].  Accepts a single
  // leading '+', and "-0" with any number of trailing zeroes.  Assumes the
  // first character is nonspace.  Returns 1 on failure.
  uint32_t cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
  if (cur_val >= 10) {
    if (cur_val == 0xfffffffdU) {
      // '-' has ascii code 45, so unsigned 45 - 48 = 0xfffffffdU; only
      // "-0", "-00", ... are acceptable
      if (*ss != '0') {
        return 1;
      }
      while (*(++ss) == '0');
      *valp = 0;
      // fail iff yet another digit follows the zeroes
      return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
    }
    if (cur_val != 0xfffffffbU) {
      // not a leading '+' either
      return 1;
    }
    cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (cur_val >= 10) {
      return 1;
    }
  }
  *valp = cur_val;
  return scan_uint_capped_finish(ss, cap, valp);
}
369
uint32_t scan_int_abs_bounded(const char* ss, uint64_t bound, int32_t* valp) {
  // Parses a decimal integer in [-bound, bound].  Accepts a single leading
  // '+' or '-'.  Assumes the first character is nonspace.  Returns 1 on
  // failure.
  uint32_t first_digit = (uint32_t)((unsigned char)(*ss++)) - 48;
  int32_t sign = 1;
  if (first_digit >= 10) {
    if (first_digit == 0xfffffffdU) {
      // '-' (45 - 48 wraps to 0xfffffffdU)
      sign = -1;
    } else if (first_digit != 0xfffffffbU) {
      // not '+' either
      return 1;
    }
    first_digit = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (first_digit >= 10) {
      return 1;
    }
  }
  *valp = (int32_t)first_digit;
  if (scan_uint_capped_finish(ss, bound, (uint32_t*)valp)) {
    return 1;
  }
  *valp *= sign;
  return 0;
}
391 #else // not __LP64__
uint32_t scan_posint_capped32(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
  // 32-bit parse of a positive decimal integer in [1, cap], where the
  // caller passes cap / 10 and cap % 10 so no 64-bit arithmetic is needed.
  // A single leading '+' ('+' - '0' wraps to 0xfffffffbU) and leading
  // zeroes are accepted.  Returns 1 on failure.
  uint32_t cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
  if (cur_val >= 10) {
    if (cur_val != 0xfffffffbU) {
      return 1;
    }
    cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (cur_val >= 10) {
      return 1;
    }
  }
  // skip leading zeroes; at least one nonzero digit is required
  while (!cur_val) {
    cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (cur_val >= 10) {
      return 1;
    }
  }
  for (;;) {
    const uint32_t next_digit = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (next_digit >= 10) {
      *valp = cur_val;
      return 0;
    }
    // overflow-safe check for (cur_val * 10 + next_digit) > cap
    if ((cur_val > cap_div_10) || ((cur_val == cap_div_10) && (next_digit > cap_mod_10))) {
      return 1;
    }
    cur_val = cur_val * 10 + next_digit;
  }
}
423
uint32_t scan_uint_capped32(const char* ss, uint32_t cap_div_10, uint32_t cap_mod_10, uint32_t* valp) {
  // 32-bit parse of a nonnegative decimal integer in [0, cap], where the
  // caller passes cap / 10 and cap % 10.  Accepts a single leading '+', and
  // "-0" with any number of trailing zeroes.  Assumes the first character
  // is nonspace.  Returns 1 on failure.
  uint32_t cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
  if (cur_val >= 10) {
    if (cur_val == 0xfffffffdU) {
      // '-' (45 - 48 wraps to 0xfffffffdU): only -0, -00, ... are valid
      if (*ss != '0') {
        return 1;
      }
      while (*(++ss) == '0');
      *valp = 0;
      // fail iff another digit follows the zeroes
      return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
    }
    if (cur_val != 0xfffffffbU) {
      // not '+' either
      return 1;
    }
    cur_val = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (cur_val >= 10) {
      return 1;
    }
  }
  for (;;) {
    const uint32_t next_digit = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (next_digit >= 10) {
      *valp = cur_val;
      return 0;
    }
    // overflow-safe check for (cur_val * 10 + next_digit) > cap
    if ((cur_val > cap_div_10) || ((cur_val == cap_div_10) && (next_digit > cap_mod_10))) {
      return 1;
    }
    cur_val = cur_val * 10 + next_digit;
  }
}
453
uint32_t scan_int_abs_bounded32(const char* ss, uint32_t bound_div_10, uint32_t bound_mod_10, int32_t* valp) {
  // 32-bit parse of a decimal integer in [-bound, bound], where the caller
  // passes bound / 10 and bound % 10.  A single leading '+' or '-' is
  // accepted.  Assumes the first character is nonspace.  Returns 1 on
  // failure.
  uint32_t magnitude = (uint32_t)((unsigned char)(*ss++)) - 48;
  int32_t sign = 1;
  if (magnitude >= 10) {
    if (magnitude == 0xfffffffdU) {
      // '-'
      sign = -1;
    } else if (magnitude != 0xfffffffbU) {
      // not '+' either
      return 1;
    }
    magnitude = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (magnitude >= 10) {
      return 1;
    }
  }
  for (;;) {
    const uint32_t next_digit = (uint32_t)((unsigned char)(*ss++)) - 48;
    if (next_digit >= 10) {
      *valp = sign * ((int32_t)magnitude);
      return 0;
    }
    // overflow-safe check for (magnitude * 10 + next_digit) > bound
    if ((magnitude > bound_div_10) || ((magnitude == bound_div_10) && (next_digit > bound_mod_10))) {
      return 1;
    }
    magnitude = magnitude * 10 + next_digit;
  }
}
481 #endif
482
uint32_t scan_posintptr(const char* ss, uintptr_t* valp) {
  // Reads an integer in [1, 2^BITCT - 1].  Assumes first character is
  // nonspace.  A single leading '+' and leading zeroes are accepted;
  // returns 1 on failure, 0 (with *valp set) on success.
  uintptr_t val = (uintptr_t)((unsigned char)(*ss++)) - 48;
  if (val >= 10) {
#ifdef __LP64__
    // '+' has ascii code 43, so unsigned 43 - 48 wraps to 2^64 - 5
    if (val != 0xfffffffffffffffbLLU) {
      return 1;
    }
#else
    if (val != 0xfffffffbU) {
      return 1;
    }
#endif
    val = (uintptr_t)((unsigned char)(*ss++)) - 48;
    if (val >= 10) {
      return 1;
    }
  }
  // skip leading zeroes; at least one nonzero digit is required
  while (!val) {
    val = (uintptr_t)((unsigned char)(*ss++)) - 48;
    if (val >= 10) {
      return 1;
    }
  }
  // limit is 20 digits, we've already read one
#ifdef __LP64__
  const char* ss_limit = &(ss[20]);
#else
  const char* ss_limit = &(ss[10]);
#endif
  while (1) {
    // consume digits two at a time; overflow only needs checking once the
    // maximum digit count is reached
    const uintptr_t cur_digit = (uintptr_t)((unsigned char)(*ss++)) - 48;
    if (cur_digit >= 10) {
      *valp = val;
      return 0;
    }
    const uintptr_t cur_digit2 = (uintptr_t)((unsigned char)(*ss++)) - 48;
    if (ss == ss_limit) {
      // at the digit-count limit: fail if yet another digit follows, or if
      // appending cur_digit would overflow the word
      if ((cur_digit2 < 10) || ((val >= (~ZEROLU) / 10) && ((val > (~ZEROLU) / 10) || (cur_digit > (~ZEROLU) % 10)))) {
        return 1;
      }
      *valp = val * 10 + cur_digit;
      return 0;
    }
    if (cur_digit2 >= 10) {
      *valp = val * 10 + cur_digit;
      return 0;
    }
    val = val * 100 + cur_digit * 10 + cur_digit2;
  }
}
535
536 /*
537 uint32_t scan_uintptr(char* ss, uintptr_t* valp) {
538 // [0, 2^BITCT - 1].
539 uintptr_t val = (uint32_t)((unsigned char)*ss) - 48;
540 uintptr_t cur_digit;
541 if (val < 10) {
542 while (1) {
543 scan_uintptr_main_loop:
544 cur_digit = (uint32_t)((unsigned char)(*(++ss))) - 48;
545 if (cur_digit >= 10) {
546 *valp = val;
547 return 0;
548 }
549 if ((val >= (~ZEROLU) / 10) && ((val > (~ZEROLU) / 10) || (cur_digit > (~ZEROLU) % 10))) {
550 return 1;
551 }
552 val = val * 10 + cur_digit;
553 }
554 }
555 ss++;
556 if (val != 0xfffffffdU) {
557 if (val == 0xfffffffbU) {
558 val = (uint32_t)((unsigned char)(*ss)) - 48;
559 if (val < 10) {
560 goto scan_uintptr_main_loop;
561 }
562 }
563 return 1;
564 }
565 if (*ss != '0') {
566 return 1;
567 }
568 while (*(++ss) == '0');
569 *valp = 0;
570 return ((uint32_t)((unsigned char)(*ss)) - 48) < 10;
571 }
572 */
573
uint32_t scan_posint_cappedx(const char* ss, uint64_t cap, uint32_t* valp) {
  // scan_posint_capped() variant which tolerates any numeric format
  // scan_doublex() accepts, as long as the value is an exact integer in
  // [1, cap].  Returns 1 on failure.
  double dxx;
  if (scan_doublex(ss, &dxx)) {
    return 1;
  }
  if ((dxx < 1.0) || (dxx > ((double)cap))) {
    return 1;
  }
  const uint32_t converted = (uint32_t)dxx;
  *valp = converted;
  return (dxx != ((double)converted));
}
582
uint32_t scan_uint_cappedx(const char* ss, uint64_t cap, uint32_t* valp) {
  // scan_uint_capped() variant accepting any scan_doublex() numeric format;
  // the value must be an exact integer in [0, cap].  Returns 1 on failure.
  double dxx;
  if (scan_doublex(ss, &dxx)) {
    return 1;
  }
  if ((dxx < 0.0) || (dxx > ((double)cap))) {
    return 1;
  }
  const uint32_t converted = (uint32_t)dxx;
  *valp = converted;
  return (dxx != ((double)converted));
}
591
uint32_t scan_int_abs_boundedx(const char* ss, uint64_t bound, int32_t* valp) {
  // scan_int_abs_bounded() variant accepting any scan_doublex() numeric
  // format; the value must be an exact integer in [-bound, bound].
  // Returns 1 on failure.
  const double bound_d = (double)bound;
  double dxx;
  if (scan_doublex(ss, &dxx)) {
    return 1;
  }
  if ((dxx < -bound_d) || (dxx > bound_d)) {
    return 1;
  }
  const int32_t converted = (int32_t)dxx;
  *valp = converted;
  return (dxx != ((double)converted));
}
601
scan_posintptrx(const char * ss,uintptr_t * valp)602 uint32_t scan_posintptrx(const char* ss, uintptr_t* valp) {
603 double val;
604 if (scan_doublex(ss, &val) || (val < 1.0) || (val > ((double)(~ZEROLU)))) {
605 return 1;
606 }
607 *valp = (uintptr_t)val;
608 return (val != ((double)(*valp)));
609 }
610
611
uint32_t scan_two_doubles(char* ss, double* __restrict val1p, double* __restrict val2p) {
  // Parses two consecutive whitespace-separated doubles from ss.  Returns 1
  // if either parse fails, 0 on success.
  char* parse_end;
  *val1p = strtod(ss, &parse_end);
  if (parse_end == ss) {
    return 1;
  }
  char* second_start = skip_initial_spaces(parse_end);
  *val2p = strtod(second_start, &parse_end);
  return (parse_end == second_start);
}
622
int32_t scan_token_ct_len(uintptr_t half_bufsize, FILE* infile, char* buf, uintptr_t* __restrict token_ct_ptr, uintptr_t* __restrict max_token_len_ptr) {
  // Scans infile in half_bufsize-byte chunks, adding the number of
  // whitespace-delimited tokens to *token_ct_ptr and raising
  // *max_token_len_ptr when a longer token is encountered.
  // buf must be of size >= (2 * half_bufsize + 2)
  // max_token_len includes trailing null
  uintptr_t full_bufsize = half_bufsize * 2;
  uintptr_t curtoklen = 0;
  uintptr_t token_ct = *token_ct_ptr;
  uintptr_t max_token_len = *max_token_len_ptr;
  char* midbuf = &(buf[half_bufsize]);
  char* bufptr;
  char* bufptr2;
  char* buf_end;
  uintptr_t bufsize;
  while (1) {
    if (fread_checked(midbuf, half_bufsize, infile, &bufsize)) {
      return RET_READ_FAIL;
    }
    if (!bufsize) {
      if (curtoklen) {
        // corner case: final token ran to EOF with no trailing whitespace
        if (curtoklen >= max_token_len) {
          max_token_len = curtoklen + 1;
        }
        token_ct++;
      }
      break;
    }
    buf_end = &(midbuf[bufsize]);
    // sentinels: the space terminates the last token's scan, and the
    // nonspace '0' guarantees the space-skipping loop stops at buf_end
    *buf_end = ' ';
    buf_end[1] = '0';
    // position bufptr so that (bufptr2 - bufptr) includes the length of any
    // token tail carried over from the previous chunk
    bufptr = &(buf[half_bufsize - curtoklen]);
    bufptr2 = midbuf;
    if (curtoklen) {
      goto scan_token_ct_len_tok_start;
    }
    while (1) {
      while (*bufptr <= ' ') {
        bufptr++;
      }
      if (bufptr >= buf_end) {
        curtoklen = 0;
        break;
      }
      bufptr2 = &(bufptr[1]);
    scan_token_ct_len_tok_start:
      while (*bufptr2 > ' ') {
        bufptr2++;
      }
      curtoklen = (uintptr_t)(bufptr2 - bufptr);
      if ((bufptr2 == buf_end) && (buf_end == &(buf[full_bufsize]))) {
        // token may continue into the next chunk; defer counting it
        if (curtoklen >= half_bufsize) {
          return RET_INVALID_FORMAT;
        }
        break;
      }
      if (curtoklen >= max_token_len) {
        if (curtoklen >= half_bufsize) {
          return RET_INVALID_FORMAT;
        }
        max_token_len = curtoklen + 1;
      }
      token_ct++;
      bufptr = &(bufptr2[1]);
    }
  }
  if (!feof(infile)) {
    return RET_READ_FAIL;
  }
  *max_token_len_ptr = max_token_len;
  *token_ct_ptr = token_ct;
  return 0;
}
694
int32_t read_tokens(uintptr_t half_bufsize, uintptr_t token_ct, uintptr_t max_token_len, FILE* infile, char* __restrict buf, char* __restrict token_name_buf) {
  // Reads token_ct whitespace-delimited tokens from infile into
  // token_name_buf, one null-terminated token per max_token_len-byte slot.
  // buf must be of size >= (2 * half_bufsize + 2).
  // max_token_len includes trailing null
  uintptr_t full_bufsize = half_bufsize * 2;
  uintptr_t curtoklen = 0;
  uintptr_t token_idx = 0;
  char* midbuf = &(buf[half_bufsize]);
  char* bufptr = midbuf;
  char* bufptr2;
  char* bufptr3;
  char* buf_end;
  uintptr_t bufsize;
  while (1) {
    if (fread_checked(midbuf, half_bufsize, infile, &bufsize)) {
      return RET_READ_FAIL;
    }
    if (!bufsize) {
      if (curtoklen) {
        // EOF with an unterminated final token: accept it iff it is the
        // last expected token
        if (token_idx + 1 == token_ct) {
          memcpyx(&(token_name_buf[token_idx * max_token_len]), bufptr, curtoklen, '\0');
          return 0;
        }
      }
      // something very strange has to happen to get here
      return RET_READ_FAIL;
    }
    buf_end = &(midbuf[bufsize]);
    // sentinels: the space terminates the last token's scan, and the
    // nonspace '0' stops the space-skipping loop at buf_end
    *buf_end = ' ';
    buf_end[1] = '0';
    bufptr2 = midbuf;
    if (curtoklen) {
      goto read_tokens_tok_start;
    }
    while (1) {
      while (*bufptr <= ' ') {
        bufptr++;
      }
      if (bufptr >= buf_end) {
        curtoklen = 0;
        bufptr = midbuf;
        break;
      }
      bufptr2 = &(bufptr[1]);
    read_tokens_tok_start:
      while (*bufptr2 > ' ') {
        bufptr2++;
      }
      curtoklen = (uintptr_t)(bufptr2 - bufptr);
      if ((bufptr2 == buf_end) && (buf_end == &(buf[full_bufsize]))) {
        // token may continue into the next chunk; move its prefix to just
        // before midbuf so the whole token stays contiguous
        bufptr3 = &(buf[half_bufsize - curtoklen]);
        memcpy(bufptr3, bufptr, curtoklen);
        bufptr = bufptr3;
        break;
      }
      memcpyx(&(token_name_buf[token_idx * max_token_len]), bufptr, curtoklen, '\0');
      if (++token_idx == token_ct) {
        return 0;
      }
      bufptr = &(bufptr2[1]);
    }
  }
}
757
int32_t gzputs_w4(gzFile gz_outfile, const char* ss) {
  // Writes ss right-justified in a 4-character field (strings of length
  // >= 4 are written unpadded).  Returns -1 on write failure.
  // Fix: the 1- and 2-character cases previously emitted a single padding
  // space, which broke the width-4 alignment this helper exists for; pad
  // with 3 and 2 spaces respectively.
  if (!ss[1]) {
    // single character: three spaces of padding
    if (gzputs(gz_outfile, "   ") == -1) {
      return -1;
    }
    return gzputc(gz_outfile, ss[0]);
  }
  if (!ss[2]) {
    // two characters: two spaces of padding
    if (gzputs(gz_outfile, "  ") == -1) {
      return -1;
    }
  } else if (!ss[3]) {
    // three characters: one space of padding
    if (gzputc(gz_outfile, ' ') == -1) {
      return -1;
    }
  }
  return gzputs(gz_outfile, ss);
}
776
get_next_noncomment(FILE * fptr,char ** lptr_ptr,uintptr_t * line_idx_ptr)777 int32_t get_next_noncomment(FILE* fptr, char** lptr_ptr, uintptr_t* line_idx_ptr) {
778 char* lptr;
779 do {
780 if (!fgets(g_textbuf, MAXLINELEN, fptr)) {
781 return -1;
782 }
783 *line_idx_ptr += 1;
784 lptr = skip_initial_spaces(g_textbuf);
785 } while (is_eoln_or_comment_kns(*lptr));
786 *lptr_ptr = lptr;
787 return 0;
788 }
789
int32_t get_next_noncomment_excl(const uintptr_t* __restrict marker_exclude, FILE* fptr, char** lptr_ptr, uintptr_t* __restrict line_idx_ptr, uintptr_t* __restrict marker_uidx_ptr) {
  // Like get_next_noncomment(), but also skips lines whose marker index is
  // set in the marker_exclude bitfield, incrementing *marker_uidx_ptr for
  // each skipped line.  Returns 0 on success, -1 at EOF/read failure.
  for (; !get_next_noncomment(fptr, lptr_ptr, line_idx_ptr); *marker_uidx_ptr += 1) {
    if (!is_set_ul(marker_exclude, *marker_uidx_ptr)) {
      return 0;
    }
  }
  return -1;
}
799
void get_top_two_ui(const uint32_t* __restrict uint_arr, uintptr_t uia_size, uintptr_t* __restrict top_idx_ptr, uintptr_t* __restrict second_idx_ptr) {
  // Finds the indices of the largest and second-largest values in uint_arr
  // (which must contain at least two entries).  Ties are broken in favor of
  // the earlier index.
  assert(uia_size > 1);
  uintptr_t best_idx = 0;
  uintptr_t runnerup_idx = 1;
  if (uint_arr[1] > uint_arr[0]) {
    best_idx = 1;
    runnerup_idx = 0;
  }
  uint32_t best_val = uint_arr[best_idx];
  uint32_t runnerup_val = uint_arr[runnerup_idx];
  for (uintptr_t scan_idx = 2; scan_idx < uia_size; scan_idx++) {
    const uint32_t scan_val = uint_arr[scan_idx];
    if (scan_val <= runnerup_val) {
      continue;
    }
    if (scan_val > best_val) {
      // new maximum; old maximum becomes the runner-up
      runnerup_val = best_val;
      runnerup_idx = best_idx;
      best_val = scan_val;
      best_idx = scan_idx;
    } else {
      runnerup_val = scan_val;
      runnerup_idx = scan_idx;
    }
  }
  *top_idx_ptr = best_idx;
  *second_idx_ptr = runnerup_idx;
}
825
uint32_t intlen(int32_t num) {
  // Returns the number of characters needed to print num in base 10,
  // including the '-' sign for negative values.
  // Fix: the previous `absnum = -num;` negated in signed arithmetic, which
  // is undefined behavior for INT32_MIN; negate the unsigned representation
  // instead (0U - (uint32_t)num), which is well-defined and yields the
  // correct magnitude 2147483648.
  uint32_t retval = 1;
  uint32_t absnum;
  if (num < 0) {
    absnum = 0U - ((uint32_t)num);
    retval++;  // leading '-'
  } else {
    absnum = num;
  }
  while (absnum > 99) {
    // division by a constant is faster for unsigned ints
    absnum /= 100;
    retval += 2;
  }
  if (absnum > 9) {
    retval++;
  }
  return retval;
}
845
int32_t strcmp_se(const char* s_read, const char* s_const, uint32_t s_const_len) {
  // Returns 0 iff s_read begins with the s_const_len bytes of s_const,
  // immediately followed by a space/EOLN terminator; nonzero otherwise.
  if (memcmp(s_read, s_const, s_const_len)) {
    return 1;
  }
  return !is_space_or_eoln(s_read[s_const_len]);
}
849
char* next_token(char* sptr) {
  // Advances past the current token and the space/tab run after it.
  // Returns a pointer to the next token, or NULL if the line ends first
  // (any control character other than space/tab, including \n and \0,
  // terminates the line).
  if (!sptr) {
    return NULL;
  }
  // skip the remainder of the current token
  while (((unsigned char)(*sptr)) > 32) {
    sptr++;
  }
  // skip the delimiting spaces/tabs
  while ((*sptr == ' ') || (*sptr == '\t')) {
    sptr++;
  }
  return (((unsigned char)(*sptr)) > 32)? sptr : NULL;
}
863
char* next_token_mult(char* sptr, uint32_t ct) {
  // next_token() applied ct times (ct must be nonzero).  Returns NULL if
  // the line ends before ct more tokens are found.
  assert(ct);
  if (!sptr) {
    return NULL;
  }
  while (ct--) {
    // skip the current token
    while (((unsigned char)(*sptr)) > 32) {
      sptr++;
    }
    // skip the delimiting spaces/tabs
    while ((*sptr == ' ') || (*sptr == '\t')) {
      sptr++;
    }
    if (((unsigned char)(*sptr)) <= 32) {
      // hit end-of-line before finding another token
      return NULL;
    }
  }
  return sptr;
}
883
uint32_t count_tokens(const char* bufptr) {
  // Returns the number of whitespace-delimited tokens before end-of-line
  // (per is_eoln_kns()).
  uint32_t token_ct = 0;
  const char* read_iter = bufptr;
  // skip leading spaces/tabs
  while ((*read_iter == ' ') || (*read_iter == '\t')) {
    ++read_iter;
  }
  while (!is_eoln_kns(*read_iter)) {
    ++token_ct;
    // skip the token body...
    do {
      ++read_iter;
    } while (!is_space_or_eoln(*read_iter));
    // ...and the whitespace after it
    while ((*read_iter == ' ') || (*read_iter == '\t')) {
      ++read_iter;
    }
  }
  return token_ct;
}
898
uint32_t count_and_measure_multistr(const char* multistr, uintptr_t* max_slen_ptr) {
  // Counts the strings in a multistr (a concatenation of null-terminated
  // strings, ended by an empty string), raising *max_slen_ptr if a longer
  // string is seen.  max_slen includes the null terminator; multistr is
  // assumed to be nonempty.
  uintptr_t longest_blen = *max_slen_ptr;
  uint32_t string_ct = 0;
  const char* read_iter = multistr;
  do {
    const uintptr_t cur_blen = strlen(read_iter) + 1;
    if (cur_blen > longest_blen) {
      longest_blen = cur_blen;
    }
    read_iter = &(read_iter[cur_blen]);
    string_ct++;
  } while (*read_iter);
  *max_slen_ptr = longest_blen;
  return string_ct;
}
916
// number-to-string encoders

// ascii representations of "00".."99", for emitting two decimal digits at a
// time: the pair for value k starts at digit2_table[2 * k]
static const char digit2_table[200] = {
  '0', '0', '0', '1', '0', '2', '0', '3', '0', '4',
  '0', '5', '0', '6', '0', '7', '0', '8', '0', '9',
  '1', '0', '1', '1', '1', '2', '1', '3', '1', '4',
  '1', '5', '1', '6', '1', '7', '1', '8', '1', '9',
  '2', '0', '2', '1', '2', '2', '2', '3', '2', '4',
  '2', '5', '2', '6', '2', '7', '2', '8', '2', '9',
  '3', '0', '3', '1', '3', '2', '3', '3', '3', '4',
  '3', '5', '3', '6', '3', '7', '3', '8', '3', '9',
  '4', '0', '4', '1', '4', '2', '4', '3', '4', '4',
  '4', '5', '4', '6', '4', '7', '4', '8', '4', '9',
  '5', '0', '5', '1', '5', '2', '5', '3', '5', '4',
  '5', '5', '5', '6', '5', '7', '5', '8', '5', '9',
  '6', '0', '6', '1', '6', '2', '6', '3', '6', '4',
  '6', '5', '6', '6', '6', '7', '6', '8', '6', '9',
  '7', '0', '7', '1', '7', '2', '7', '3', '7', '4',
  '7', '5', '7', '6', '7', '7', '7', '8', '7', '9',
  '8', '0', '8', '1', '8', '2', '8', '3', '8', '4',
  '8', '5', '8', '6', '8', '7', '8', '8', '8', '9',
  '9', '0', '9', '1', '9', '2', '9', '3', '9', '4',
  '9', '5', '9', '6', '9', '7', '9', '8', '9', '9'};
940
char* uint32toa(uint32_t uii, char* start) {
  // Memory-efficient fast integer writer. (You can do a bit better sometimes
  // by using a larger lookup table, but on average I doubt that pays off.)
  // Returns a pointer to the end of the integer (not null-terminated).
  // Structure: branch on digit count, write any odd leading digit, then fall
  // through the uint32toa_{6b,6,4,2} labels writing two digits at a time
  // from digit2_table.
  uint32_t quotient;
  if (uii < 1000) {
    // 1-3 digits
    if (uii < 10) {
      *start++ = '0' + uii;
      return start;
    }
    if (uii < 100) {
      goto uint32toa_2;
    }
    quotient = uii / 100;
    *start++ = '0' + quotient;
  } else {
    if (uii < 10000000) {
      // 4-7 digits
      if (uii >= 100000) {
        if (uii < 1000000) {
          goto uint32toa_6;
        }
        quotient = uii / 1000000;
        *start++ = '0' + quotient;
        goto uint32toa_6b;
      }
      if (uii < 10000) {
        goto uint32toa_4;
      }
      quotient = uii / 10000;
      *start++ = '0' + quotient;
    } else {
      // 8-10 digits
      if (uii >= 100000000) {
        quotient = uii / 100000000;
        if (uii >= 1000000000) {
          start = memcpya(start, &(digit2_table[quotient * 2]), 2);
        } else {
          *start++ = '0' + quotient;
        }
        uii -= 100000000 * quotient;
      }
      quotient = uii / 1000000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uint32toa_6b:
      uii -= 1000000 * quotient;
    uint32toa_6:
      quotient = uii / 10000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    }
    uii -= 10000 * quotient;
  uint32toa_4:
    // could make a uitoa_z4() call here, but that's slightly slower
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
  }
  uii -= 100 * quotient;
 uint32toa_2:
  return memcpya(start, &(digit2_table[uii * 2]), 2);
}
999
// Writes the decimal form of ii (with a leading '-' when negative) at start;
// returns one past the last character written. No null terminator is added.
char* int32toa(int32_t ii, char* start) {
  uint32_t magnitude = (uint32_t)ii;
  if (ii >= 0) {
    return uint32toa(magnitude, start);
  }
  // negating the unsigned copy is well-defined even for INT_MIN, whereas
  // -ii itself would overflow
  *start = '-';
  magnitude = -magnitude;
  return uint32toa(magnitude, &(start[1]));
}
1009
uitoa_z4(uint32_t uii,char * start)1010 char* uitoa_z4(uint32_t uii, char* start) {
1011 uint32_t quotient = uii / 100;
1012 assert(quotient < 100);
1013 uii -= 100 * quotient;
1014 start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1015 return memcpya(start, &(digit2_table[uii * 2]), 2);
1016 }
1017
uitoa_z6(uint32_t uii,char * start)1018 char* uitoa_z6(uint32_t uii, char* start) {
1019 uint32_t quotient = uii / 10000;
1020 start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1021 return uitoa_z4(uii - 10000 * quotient, start);
1022 }
1023
uitoa_z8(uint32_t uii,char * start)1024 char* uitoa_z8(uint32_t uii, char* start) {
1025 uint32_t quotient = uii / 1000000;
1026 start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1027 return uitoa_z6(uii - 1000000 * quotient, start);
1028 }
1029
// Writes the decimal form of llii (with leading '-' when negative) at start;
// returns one past the last character. Splits the magnitude into 8-digit
// chunks so the fast 32-bit writers can do all the work.
char* int64toa(int64_t llii, char* start) {
  uint64_t magnitude = (uint64_t)llii;
  if (llii < 0) {
    *start++ = '-';
    // unsigned negation sidesteps the -INT64_MIN overflow trap
    magnitude = -magnitude;
  }
  if (magnitude <= 0xffffffffLLU) {
    return uint32toa((uint32_t)magnitude, start);
  }
  const uint64_t top = magnitude / 100000000;
  const uint32_t bottom8 = (uint32_t)(magnitude % 100000000);
  if (top <= 0xffffffffLLU) {
    // 9-16 digits: head + one zero-padded 8-digit chunk
    return uitoa_z8(bottom8, uint32toa((uint32_t)top, start));
  }
  // 17+ digits: head + two zero-padded 8-digit chunks
  const uint32_t middle8 = (uint32_t)(top % 100000000);
  start = uint32toa((uint32_t)(top / 100000000), start);
  return uitoa_z8(bottom8, uitoa_z8(middle8, start));
}
1054
uint32toa_w4(uint32_t uii,char * start)1055 char* uint32toa_w4(uint32_t uii, char* start) {
1056 uint32_t quotient;
1057 if (uii < 1000) {
1058 if (uii < 10) {
1059 // assumes little-endian
1060 *((uint32_t*)start) = 0x30202020 + (uii << 24);
1061 return &(start[4]);
1062 }
1063 if (uii < 100) {
1064 memset(start, 32, 2);
1065 } else {
1066 quotient = uii / 100;
1067 *start++ = ' ';
1068 *start++ = '0' + quotient;
1069 uii -= quotient * 100;
1070 }
1071 return memcpya(start, &(digit2_table[uii * 2]), 2);
1072 } else {
1073 // presumably the field width is 4 for a reason; don't bother optimizing
1074 // this
1075 return uint32toa(uii, start);
1076 }
1077 }
1078
// Writes uii right-justified in a minimum 6-character field, left-padded
// with spaces (values needing more than 6 digits are written in full).
// Returns one past the last byte written; output is not null-terminated.
char* uint32toa_w6(uint32_t uii, char* start) {
  uint32_t quotient;
  if (uii < 1000) {
    if (uii < 10) {
      // 1 digit: 5 spaces + digit
      start = memseta(start, 32, 5);
      *start++ = '0' + uii;
      return start;
    }
    if (uii < 100) {
      // 2 digits: 4 spaces + shared digit-pair tail
      start = memseta(start, 32, 4);
      goto uint32toa_w6_2;
    }
    // 3 digits
    quotient = uii / 100;
    // the little-endian trick doesn't seem to help here. possibly relevant
    // differences from uint32toa_w4() and _w8(): sequential dependence on
    // quotient, need to interpret pointer as a char* again
    start = memseta(start, 32, 3);
    *start++ = '0' + quotient;
  } else {
    if (uii < 10000000) {
      if (uii >= 100000) {
        if (uii < 1000000) {
          // exactly 6 digits, no padding
          goto uint32toa_w6_6;
        }
        // 7 digits
        quotient = uii / 1000000;
        *start++ = '0' + quotient;
        goto uint32toa_w6_6b;
      } else if (uii >= 10000) {
        // 5 digits: 1 space + leading digit, then the 4-digit tail
        *start++ = ' ';
        quotient = uii / 10000;
        *start++ = '0' + quotient;
      } else {
        // 4 digits: 2 spaces
        start = memseta(start, 32, 2);
        goto uint32toa_w6_4;
      }
    } else {
      // 8+ digits: emit leading digits, then shared pair-writing tails
      if (uii >= 100000000) {
        quotient = uii / 100000000;
        if (uii >= 1000000000) {
          start = memcpya(start, &(digit2_table[quotient * 2]), 2);
        } else {
          *start++ = '0' + quotient;
        }
        uii -= 100000000 * quotient;
      }
      quotient = uii / 1000000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uint32toa_w6_6b:
      uii -= 1000000 * quotient;
    uint32toa_w6_6:
      quotient = uii / 10000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    }
    uii -= 10000 * quotient;
  uint32toa_w6_4:
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
  }
  uii -= 100 * quotient;
 uint32toa_w6_2:
  return memcpya(start, &(digit2_table[uii * 2]), 2);
}
1141
// Writes uii right-justified in a minimum 7-character field, left-padded
// with spaces (8+ digit values are written in full). Returns one past the
// last byte written; output is not null-terminated.
char* uint32toa_w7(uint32_t uii, char* start) {
  uint32_t quotient;
  if (uii < 1000) {
    if (uii < 10) {
      // 1 digit: 6 spaces + digit
      start = memseta(start, 32, 6);
      *start++ = '0' + uii;
      return start;
    }
    if (uii < 100) {
      // 2 digits: 5 spaces + shared pair tail
      start = memseta(start, 32, 5);
      goto uint32toa_w7_2;
    }
    // 3 digits
    quotient = uii / 100;
    start = memseta(start, 32, 4);
    *start++ = '0' + quotient;
  } else {
    if (uii < 10000000) {
      if (uii >= 100000) {
        if (uii >= 1000000) {
          // exactly 7 digits, no padding
          quotient = uii / 1000000;
          *start++ = '0' + quotient;
          goto uint32toa_w7_6b;
        }
        // 6 digits: 1 space
        *start++ = ' ';
        goto uint32toa_w7_6;
      } else if (uii >= 10000) {
        // 5 digits: 2 spaces + leading digit
        start = memseta(start, 32, 2);
        quotient = uii / 10000;
        *start++ = '0' + quotient;
      } else {
        // 4 digits: 3 spaces
        start = memseta(start, 32, 3);
        goto uint32toa_w7_4;
      }
    } else {
      // 8+ digits: leading digits, then shared pair-writing tails
      if (uii >= 100000000) {
        quotient = uii / 100000000;
        if (uii >= 1000000000) {
          start = memcpya(start, &(digit2_table[quotient * 2]), 2);
        } else {
          *start++ = '0' + quotient;
        }
        uii -= 100000000 * quotient;
      }
      quotient = uii / 1000000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uint32toa_w7_6b:
      uii -= 1000000 * quotient;
    uint32toa_w7_6:
      quotient = uii / 10000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    }
    uii -= 10000 * quotient;
  uint32toa_w7_4:
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
  }
  uii -= 100 * quotient;
 uint32toa_w7_2:
  return memcpya(start, &(digit2_table[uii * 2]), 2);
}
1202
uint32toa_w8(uint32_t uii,char * start)1203 char* uint32toa_w8(uint32_t uii, char* start) {
1204 uint32_t quotient;
1205 if (uii < 1000) {
1206 if (uii < 10) {
1207 #ifdef __LP64__
1208 *((uintptr_t*)start) = 0x3020202020202020LLU + (((uintptr_t)uii) << 56);
1209 return &(start[8]);
1210 #else
1211 start = memseta(start, 32, 7);
1212 *start++ = '0' + uii;
1213 return start;
1214 #endif
1215 }
1216 if (uii < 100) {
1217 start = memseta(start, 32, 6);
1218 goto uint32toa_w8_2;
1219 }
1220 quotient = uii / 100;
1221 start = memseta(start, 32, 5);
1222 *start++ = '0' + quotient;
1223 } else {
1224 if (uii < 10000000) {
1225 if (uii >= 100000) {
1226 if (uii < 1000000) {
1227 start = memseta(start, 32, 2);
1228 goto uint32toa_w8_6;
1229 }
1230 quotient = uii / 1000000;
1231 *start = ' ';
1232 start[1] = '0' + quotient;
1233 start += 2;
1234 goto uint32toa_w8_6b;
1235 } else if (uii < 10000) {
1236 start = memseta(start, 32, 4);
1237 goto uint32toa_w8_4;
1238 }
1239 memset(start, 32, 3);
1240 quotient = uii / 10000;
1241 start[3] = '0' + quotient;
1242 start += 4;
1243 } else {
1244 if (uii >= 100000000) {
1245 quotient = uii / 100000000;
1246 if (uii >= 1000000000) {
1247 start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1248 } else {
1249 *start++ = '0' + quotient;
1250 }
1251 uii -= 100000000 * quotient;
1252 }
1253 quotient = uii / 1000000;
1254 start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1255 uint32toa_w8_6b:
1256 uii -= 1000000 * quotient;
1257 uint32toa_w8_6:
1258 quotient = uii / 10000;
1259 start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1260 }
1261 uii -= 10000 * quotient;
1262 uint32toa_w8_4:
1263 quotient = uii / 100;
1264 start = memcpya(start, &(digit2_table[quotient * 2]), 2);
1265 }
1266 uii -= 100 * quotient;
1267 uint32toa_w8_2:
1268 return memcpya(start, &(digit2_table[uii * 2]), 2);
1269 }
1270
// Writes uii right-justified in a 10-character field (the maximum digit
// count of a uint32), left-padded with spaces. Returns one past the last
// byte written; output is not null-terminated.
char* uint32toa_w10(uint32_t uii, char* start) {
  // if we decide to reduce code size and optimize only one field width, this
  // should be it
  uint32_t quotient;
  if (uii < 1000) {
    if (uii < 10) {
      // 1 digit: 9 spaces + digit
      start = memseta(start, 32, 9);
      *start++ = '0' + uii;
      return start;
    }
    if (uii < 100) {
      // 2 digits: 8 spaces + shared pair tail
      start = memseta(start, 32, 8);
      goto uint32toa_w10_2;
    }
    // 3 digits
    quotient = uii / 100;
    start = memseta(start, 32, 7);
    *start++ = '0' + quotient;
  } else {
    if (uii < 10000000) {
      if (uii >= 100000) {
        if (uii < 1000000) {
          // 6 digits: 4 spaces
          start = memseta(start, 32, 4);
          goto uint32toa_w10_6;
        }
        // 7 digits: 3 spaces + leading digit
        quotient = uii / 1000000;
        memset(start, 32, 3);
        start[3] = '0' + quotient;
        start += 4;
        goto uint32toa_w10_6b;
      } else if (uii < 10000) {
        // 4 digits: 6 spaces
        start = memseta(start, 32, 6);
        goto uint32toa_w10_4;
      }
      // 5 digits: 5 spaces + leading digit
      memset(start, 32, 5);
      quotient = uii / 10000;
      start[5] = '0' + quotient;
      start += 6;
    } else {
      if (uii >= 100000000) {
        // 9-10 digits: leading pair (or space + digit)
        quotient = uii / 100000000;
        if (uii >= 1000000000) {
          memcpy(start, &(digit2_table[quotient * 2]), 2);
        } else {
          *start = ' ';
          start[1] = '0' + quotient;
        }
        uii -= 100000000 * quotient;
      } else {
        // 8 digits: 2 spaces
        memset(start, 32, 2);
      }
      quotient = uii / 1000000;
      memcpy(&(start[2]), &(digit2_table[quotient * 2]), 2);
      start += 4;
    uint32toa_w10_6b:
      uii -= 1000000 * quotient;
    uint32toa_w10_6:
      quotient = uii / 10000;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    }
    uii -= 10000 * quotient;
  uint32toa_w10_4:
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
  }
  uii -= 100 * quotient;
 uint32toa_w10_2:
  return memcpya(start, &(digit2_table[uii * 2]), 2);
}
1339
uitoa_trunc2(uint32_t uii,char * start)1340 static inline char* uitoa_trunc2(uint32_t uii, char* start) {
1341 // Given 0 < uii < 100, writes uii without *trailing* zeroes. (I.e. this is
1342 // for floating-point encoder use.)
1343 memcpy(start, &(digit2_table[uii * 2]), 2);
1344 if (start[1] != '0') {
1345 return &(start[2]);
1346 }
1347 return &(start[1]);
1348 }
1349
uitoa_trunc3(uint32_t uii,char * start)1350 static inline char* uitoa_trunc3(uint32_t uii, char* start) {
1351 *start++ = '0' + (uii / 100);
1352 uii %= 100;
1353 if (!uii) {
1354 return start;
1355 }
1356 memcpy(start, &(digit2_table[uii * 2]), 2);
1357 if (start[1] != '0') {
1358 return &(start[2]);
1359 }
1360 return &(start[1]);
1361 }
1362
uitoa_trunc4(uint32_t uii,char * start)1363 static inline char* uitoa_trunc4(uint32_t uii, char* start) {
1364 uint32_t quotient = uii / 100;
1365 memcpy(start, &(digit2_table[quotient * 2]), 2);
1366 uii -= 100 * quotient;
1367 if (uii) {
1368 start += 2;
1369 memcpy(start, &(digit2_table[uii * 2]), 2);
1370 }
1371 if (start[1] != '0') {
1372 return &(start[2]);
1373 }
1374 return &(start[1]);
1375 }
1376
// Writes the 6-digit zero-padded decimal form of uii (0 < uii < 1000000)
// with trailing zero digits removed, pair by pair; floating-point encoder
// helper. Returns the write end.
static inline char* uitoa_trunc6(uint32_t uii, char* start) {
  uint32_t quotient = uii / 10000;
  memcpy(start, &(digit2_table[quotient * 2]), 2);
  uii -= 10000 * quotient;
  if (uii) {
    quotient = uii / 100;
    start += 2;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    uii -= 100 * quotient;
    if (uii) {
      start += 2;
      memcpy(start, &(digit2_table[uii * 2]), 2);
    }
  }
  // drop one trailing zero from the last pair written, if present
  if (start[1] != '0') {
    return &(start[2]);
  }
  return &(start[1]);
}
1396
// Writes the 8-digit zero-padded decimal form of uii (0 < uii < 100000000)
// with trailing zero digits removed, pair by pair; floating-point encoder
// helper. Returns the write end.
static inline char* uitoa_trunc8(uint32_t uii, char* start) {
  uint32_t quotient = uii / 1000000;
  memcpy(start, &(digit2_table[quotient * 2]), 2);
  uii -= 1000000 * quotient;
  if (uii) {
    quotient = uii / 10000;
    start += 2;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    uii -= 10000 * quotient;
    if (uii) {
      quotient = uii / 100;
      start += 2;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      uii -= 100 * quotient;
      if (uii) {
        start += 2;
        memcpy(start, &(digit2_table[uii * 2]), 2);
      }
    }
  }
  // drop one trailing zero from the last pair written, if present
  if (start[1] != '0') {
    return &(start[2]);
  }
  return &(start[1]);
}
1422
// Writes "q" when remainder is 0, otherwise "q.r" (both single digits);
// returns one past the last character written.
static inline char* qrtoa_1p1(uint32_t quotient, uint32_t remainder, char* start) {
  *start++ = '0' + quotient;
  if (remainder) {
    *start++ = '.';
    *start++ = '0' + remainder;
  }
  return start;
}
1432
qrtoa_1p2(uint32_t quotient,uint32_t remainder,char * start)1433 static inline char* qrtoa_1p2(uint32_t quotient, uint32_t remainder, char* start) {
1434 *start++ = '0' + quotient;
1435 if (!remainder) {
1436 return start;
1437 }
1438 *start++ = '.';
1439 memcpy(start, &(digit2_table[remainder * 2]), 2);
1440 if (start[1] != '0') {
1441 return &(start[2]);
1442 }
1443 return &(start[1]);
1444 }
1445
qrtoa_1p3(uint32_t quotient,uint32_t remainder,char * start)1446 static inline char* qrtoa_1p3(uint32_t quotient, uint32_t remainder, char* start) {
1447 // quotient = (int32_t)dxx;
1448 // remainder = ((int32_t)(dxx * 1000)) - (quotient * 1000);
1449 *start++ = '0' + quotient;
1450 if (!remainder) {
1451 return start;
1452 }
1453 *start++ = '.';
1454 quotient = remainder / 10;
1455 memcpy(start, &(digit2_table[quotient * 2]), 2);
1456 remainder -= 10 * quotient;
1457 if (remainder) {
1458 start[2] = '0' + remainder;
1459 return &(start[3]);
1460 }
1461 if (start[1] != '0') {
1462 return &(start[2]);
1463 }
1464 return &(start[1]);
1465 }
1466
// Writes quotient (one digit), then '.' and up to five digits of remainder
// (< 100000, zero-padded) with trailing zeroes stripped; the point is
// omitted when remainder is 0. Returns the write end.
static inline char* qrtoa_1p5(uint32_t quotient, uint32_t remainder, char* start) {
  *start++ = '0' + quotient;
  if (!remainder) {
    return start;
  }
  *start++ = '.';
  quotient = remainder / 1000;
  memcpy(start, &(digit2_table[quotient * 2]), 2);
  remainder -= 1000 * quotient;
  if (remainder) {
    quotient = remainder / 10;
    start += 2;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    remainder -= 10 * quotient;
    if (remainder) {
      start[2] = '0' + remainder;
      return &(start[3]);
    }
  }
  // drop one trailing zero from the last pair written, if present
  if (start[1] != '0') {
    return &(start[2]);
  }
  return &(start[1]);
}
1491
// Writes quotient (one digit), then '.' and up to seven digits of remainder
// (< 10000000, zero-padded) with trailing zeroes stripped; the point is
// omitted when remainder is 0. Returns the write end.
static inline char* qrtoa_1p7(uint32_t quotient, uint32_t remainder, char* start) {
  *start++ = '0' + quotient;
  if (!remainder) {
    return start;
  }
  *start++ = '.';
  quotient = remainder / 100000;
  memcpy(start, &(digit2_table[quotient * 2]), 2);
  remainder -= 100000 * quotient;
  if (remainder) {
    quotient = remainder / 1000;
    start += 2;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    remainder -= 1000 * quotient;
    if (remainder) {
      quotient = remainder / 10;
      start += 2;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= 10 * quotient;
      if (remainder) {
        start[2] = '0' + remainder;
        return &(start[3]);
      }
    }
  }
  // drop one trailing zero from the last pair written, if present
  if (start[1] != '0') {
    return &(start[2]);
  }
  return &(start[1]);
}
1522
// Okay, time to do banker's rounding when printing doubles. 14 digits of
// precision are used in judging equality to 0.5 (actual precision of doubles
// is 15-17 digits); the intention is to capture all directly loaded or exactly
// computed edge cases (so enough tolerance is needed to survive the internal
// multiplications by powers of 10, etc.), while rounding a negligible number
// of honest-to-god 0.4999999s up and 0.5000001s down.
// To avoid inadvertent printing of an extra digit, there's a deliberate gap
// between the 99.9994999...-type bounds and the largest numbers that would
// actually round down.
//
// Indexed as banker_roundN[x & 1] where x is the truncated integer part (see
// double_bround* below): even values get the just-under-0.5 offset so ties
// round down, odd values the just-over offset so ties round up -- i.e.
// round-half-to-even.
static const double banker_round5[] = {0.499995, 0.500005};
static const double banker_round6[] = {0.4999995, 0.5000005};
static const double banker_round7[] = {0.49999995, 0.50000005};
static const double banker_round8[] = {0.499999995, 0.500000005};
static const double banker_round9[] = {0.4999999995, 0.5000000005};
static const double banker_round10[] = {0.49999999995, 0.50000000005};
static const double banker_round11[] = {0.499999999995, 0.500000000005};
static const double banker_round12[] = {0.4999999999995, 0.5000000000005};
1540
// Banker's rounding of nonnegative dxx to an integer: truncate, then add the
// parity-selected threshold to the fractional part and truncate again.
static inline uint32_t double_bround(double dxx, const double* banker_round) {
  const uint32_t truncated = (int32_t)dxx;
  const double frac = dxx - ((int32_t)truncated);
  return truncated + (int32_t)(frac + banker_round[truncated & 1]);
}
1545
1546 // These are separate functions so the compiler can optimize the integer
1547 // divisions.
// Banker-round dxx * 10, then split the result into its integer part
// (*quotientp) and tenths digit (*remainderp).
static inline void double_bround1(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  const double scaled = dxx * 10;
  uint32_t total = (int32_t)scaled;
  total += (int32_t)((scaled - ((int32_t)total)) + banker_round[total & 1]);
  *quotientp = total / 10;
  *remainderp = total % 10;
}
1555
// Banker-round dxx * 100, then split off the last two decimal digits.
static inline void double_bround2(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  const double scaled = dxx * 100;
  uint32_t total = (int32_t)scaled;
  total += (int32_t)((scaled - ((int32_t)total)) + banker_round[total & 1]);
  *quotientp = total / 100;
  *remainderp = total % 100;
}
1563
// Banker-round dxx * 1000, then split off the last three decimal digits.
static inline void double_bround3(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  const double scaled = dxx * 1000;
  uint32_t total = (int32_t)scaled;
  total += (int32_t)((scaled - ((int32_t)total)) + banker_round[total & 1]);
  *quotientp = total / 1000;
  *remainderp = total % 1000;
}
1571
// Banker-round dxx * 10000, then split off the last four decimal digits.
static inline void double_bround4(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  const double scaled = dxx * 10000;
  uint32_t total = (int32_t)scaled;
  total += (int32_t)((scaled - ((int32_t)total)) + banker_round[total & 1]);
  *quotientp = total / 10000;
  *remainderp = total % 10000;
}
1579
// Banker-round dxx * 100000, then split off the last five decimal digits.
static inline void double_bround5(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  const double scaled = dxx * 100000;
  uint32_t total = (int32_t)scaled;
  total += (int32_t)((scaled - ((int32_t)total)) + banker_round[total & 1]);
  *quotientp = total / 100000;
  *remainderp = total % 100000;
}
1587
// Banker-round dxx * 1000000, then split off the last six decimal digits.
static inline void double_bround6(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  const double scaled = dxx * 1000000;
  uint32_t total = (int32_t)scaled;
  total += (int32_t)((scaled - ((int32_t)total)) + banker_round[total & 1]);
  *quotientp = total / 1000000;
  *remainderp = total % 1000000;
}
1595
// Banker-round dxx * 10000000, then split off the last seven decimal digits.
static inline void double_bround7(double dxx, const double* banker_round, uint32_t* quotientp, uint32_t* remainderp) {
  const double scaled = dxx * 10000000;
  uint32_t total = (int32_t)scaled;
  total += (int32_t)((scaled - ((int32_t)total)) + banker_round[total & 1]);
  *quotientp = total / 10000000;
  *remainderp = total % 10000000;
}
1603
// Writes dxx with 6 significant figures, no exponent; caller guarantees
// 0.999995 <= dxx < 999999.5. Trailing fractional zeroes (and a bare '.')
// are never emitted. Returns one past the last byte written.
char* dtoa_so6(double dxx, char* start) {
  // 6 sig fig number, 0.999995 <= dxx < 999999.5
  // 'so' = "significand only"
  // Just hardcoding all six cases, in the absence of a better approach...
  // The xx.xxx949999999-style thresholds sit slightly below the exact decimal
  // boundaries so borderline doubles cannot round up into a 7th digit.
  uint32_t uii;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx < 99.999949999999) {
    if (dxx < 9.9999949999999) {
      // d.ddddd
      double_bround5(dxx, banker_round8, &quotient, &remainder);
      return qrtoa_1p5(quotient, remainder, start);
    }
    // dd.dddd
    double_bround4(dxx, banker_round8, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    quotient = remainder / 100;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    remainder -= 100 * quotient;
    if (remainder) {
      start += 2;
    dtoa_so6_pretail:
      memcpy(start, &(digit2_table[remainder * 2]), 2);
    }
    // shared tail: strip one trailing zero from the final digit pair
  dtoa_so6_tail:
    if (start[1] != '0') {
      return &(start[2]);
    }
    return &(start[1]);
  } else if (dxx < 9999.9949999999) {
    if (dxx < 999.99949999999) {
      // ddd.ddd
      double_bround3(dxx, banker_round8, &uii, &remainder);
      quotient = uii / 100;
      *start++ = '0' + quotient;
      quotient = uii - 100 * quotient;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
      if (!remainder) {
        return start;
      }
      *start++ = '.';
      quotient = remainder / 10;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= quotient * 10;
      if (!remainder) {
        goto dtoa_so6_tail;
      }
      start[2] = '0' + remainder;
      return &(start[3]);
    }
    // dddd.dd
    double_bround2(dxx, banker_round8, &uii, &remainder);
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    quotient = uii - (100 * quotient);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    goto dtoa_so6_pretail;
  } else if (dxx < 99999.949999999) {
    // ddddd.d
    double_bround1(dxx, banker_round8, &uii, &remainder);
    quotient = uii / 10000;
    *start = '0' + quotient;
    uii -= 10000 * quotient;
    quotient = uii / 100;
    start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
    uii = uii - 100 * quotient;
    start = memcpya(start, &(digit2_table[uii * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    *start = '0' + remainder;
    return &(start[1]);
  } else {
    // dddddd (all-integer)
    return uitoa_z6(double_bround(dxx, banker_round8), start);
  }
}
1684
1685 // Briefly had banker's rounding for floats, but then I realized that the only
1686 // float-printing function calls are --make-grm related, they all request 6-7
1687 // digits of precision, and at that point it's impossible to distinguish exact
1688 // 0.5-matches in the remainder. So we just have generic rounding functions
1689 // here, with similar interfaces to the double-rounding functions to minimize
1690 // the need for separate reasoning about this code.
// Rounds nonnegative fxx to the nearest integer, ties upward (plain
// add-0.5-and-truncate; see the comment above for why no banker's rounding).
static inline uint32_t float_round(float fxx) {
  const int32_t rounded = (int32_t)(fxx + 0.5);
  return (uint32_t)rounded;
}
1694
// Rounds fxx * 10 to the nearest integer (half-up; float_round() inlined),
// then splits off the final digit into *remainderp.
static inline void float_round1(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
  const uint32_t total = (uint32_t)((int32_t)(fxx * 10 + 0.5));
  *quotientp = total / 10;
  *remainderp = total % 10;
}
1700
// Rounds fxx * 100 half-up (float_round() inlined), then splits off the
// final two digits into *remainderp.
static inline void float_round2(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
  const uint32_t total = (uint32_t)((int32_t)(fxx * 100 + 0.5));
  *quotientp = total / 100;
  *remainderp = total % 100;
}
1706
// Rounds fxx * 1000 half-up (float_round() inlined), then splits off the
// final three digits into *remainderp.
static inline void float_round3(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
  const uint32_t total = (uint32_t)((int32_t)(fxx * 1000 + 0.5));
  *quotientp = total / 1000;
  *remainderp = total % 1000;
}
1712
// Rounds fxx * 10000 half-up (float_round() inlined), then splits off the
// final four digits into *remainderp.
static inline void float_round4(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
  const uint32_t total = (uint32_t)((int32_t)(fxx * 10000 + 0.5));
  *quotientp = total / 10000;
  *remainderp = total % 10000;
}
1718
// Rounds fxx * 100000 half-up (float_round() inlined), then splits off the
// final five digits into *remainderp.
static inline void float_round5(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
  const uint32_t total = (uint32_t)((int32_t)(fxx * 100000 + 0.5));
  *quotientp = total / 100000;
  *remainderp = total % 100000;
}
1724
// Rounds fxx * 1000000 half-up (float_round() inlined), then splits off the
// final six digits into *remainderp.
static inline void float_round6(float fxx, uint32_t* quotientp, uint32_t* remainderp) {
  const uint32_t total = (uint32_t)((int32_t)(fxx * 1000000 + 0.5));
  *quotientp = total / 1000000;
  *remainderp = total % 1000000;
}
1730
// Writes fxx with 6 significant figures (float counterpart of dtoa_so6);
// trailing fractional zeroes are stripped and no bare '.' is emitted.
// Returns one past the last byte written; output is not null-terminated.
char* ftoa_so6(float fxx, char* start) {
  uint32_t uii;
  uint32_t quotient;
  uint32_t remainder;
  // difference between consecutive floats near 10 can be as large as
  // 10 * 2^{-23}, which is just under 1.2e-6. So, to avoid printing an extra
  // digit, we have to set this bound to be robust to an addition error of size
  // 6e-7.
  // (possible todo: just brute-force test this on all <2^32 possible floats
  // and look for a better threshold)
  if (fxx < 99.999944) {
    if (fxx < 9.9999944) {
      // d.ddddd
      float_round5(fxx, &quotient, &remainder);
      return qrtoa_1p5(quotient, remainder, start);
    }
    // dd.dddd
    float_round4(fxx, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    quotient = remainder / 100;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    remainder -= 100 * quotient;
    if (remainder) {
      start += 2;
    ftoa_so6_pretail:
      memcpy(start, &(digit2_table[remainder * 2]), 2);
    }
    // shared tail: strip one trailing zero from the final digit pair
  ftoa_so6_tail:
    if (start[1] != '0') {
      return &(start[2]);
    }
    return &(start[1]);
  } else if (fxx < 9999.9944) {
    if (fxx < 999.99944) {
      // ddd.ddd
      float_round3(fxx, &uii, &remainder);
      quotient = uii / 100;
      *start = '0' + quotient;
      quotient = uii - 100 * quotient;
      start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
      if (!remainder) {
        return start;
      }
      *start++ = '.';
      quotient = remainder / 10;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= quotient * 10;
      if (!remainder) {
        goto ftoa_so6_tail;
      }
      start[2] = '0' + remainder;
      return &(start[3]);
    }
    // dddd.dd
    float_round2(fxx, &uii, &remainder);
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    quotient = uii - (100 * quotient);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    goto ftoa_so6_pretail;
  } else if (fxx < 99999.944) {
    // ddddd.d
    float_round1(fxx, &uii, &remainder);
    quotient = uii / 10000;
    *start = '0' + quotient;
    uii -= 10000 * quotient;
    quotient = uii / 100;
    start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
    uii = uii - 100 * quotient;
    start = memcpya(start, &(digit2_table[uii * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start = '.';
    start[1] = '0' + remainder;
    return &(start[2]);
  } else {
    // dddddd (all-integer)
    return uitoa_z6(float_round(fxx), start);
  }
}
1814
dtoa_so2(double dxx,char * start)1815 char* dtoa_so2(double dxx, char* start) {
1816 // 2 sig fig number, 0.95 <= dxx < 99.5
1817 uint32_t quotient;
1818 uint32_t remainder;
1819 if (dxx < 9.9499999999999) {
1820 double_bround1(dxx, banker_round12, "ient, &remainder);
1821 return qrtoa_1p1(quotient, remainder, start);
1822 }
1823 return memcpya(start, &(digit2_table[(double_bround(dxx, banker_round12)) * 2]), 2);
1824 }
1825
// Writes dxx with 3 significant figures; caller guarantees
// 0.995 <= dxx < 999.5. Trailing fractional zeroes are stripped.
// Returns one past the last byte written.
char* dtoa_so3(double dxx, char* start) {
  // 3 sig fig number, 0.995 <= dxx < 999.5
  uint32_t quotient;
  uint32_t remainder;
  if (dxx < 99.949999999999) {
    if (dxx < 9.9949999999999) {
      // d.dd
      double_bround2(dxx, banker_round11, &quotient, &remainder);
      return qrtoa_1p2(quotient, remainder, start);
    }
    // dd.d
    double_bround1(dxx, banker_round11, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
  } else {
    // ddd (all-integer): split into leading pair + final digit
    quotient = double_bround(dxx, banker_round11);
    start = memcpya(start, &(digit2_table[(quotient / 10) * 2]), 2);
    remainder = quotient % 10;
  }
  // shared final-digit write for the dd.d and ddd paths
  *start = '0' + remainder;
  return &(start[1]);
}
1849
// Writes dxx with 4 significant figures; caller guarantees
// 0.9995 <= dxx < 9999.5. Trailing fractional zeroes are stripped.
// Returns one past the last byte written.
char* dtoa_so4(double dxx, char* start) {
  // 4 sig fig number, 0.9995 <= dxx < 9999.5
  uint32_t uii;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx < 99.994999999999) {
    if (dxx < 9.9994999999999) {
      // d.ddd
      double_bround3(dxx, banker_round10, &quotient, &remainder);
      return qrtoa_1p3(quotient, remainder, start);
    }
    // dd.dd
    double_bround2(dxx, banker_round10, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    memcpy(start, &(digit2_table[remainder * 2]), 2);
    // strip one trailing zero from the fractional pair
    if (start[1] != '0') {
      return &(start[2]);
    }
    return &(start[1]);
  } else if (dxx < 999.94999999999) {
    // ddd.d
    double_bround1(dxx, banker_round10, &uii, &remainder);
    quotient = uii / 100;
    *start = '0' + quotient;
    quotient = uii - 100 * quotient;
    start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start = '.';
    start[1] = '0' + remainder;
    return &(start[2]);
  } else {
    // dddd (all-integer)
    uitoa_z4(double_bround(dxx, banker_round10), start);
    return &(start[4]);
  }
}
1888
// Writes dxx with 8 significant figures, dropping trailing fractional
// zeroes, and returns one past the last written character.  No null
// terminator is written.  Caller must guarantee
// 0.99999995 <= dxx < 99999999.5 (see comment below).
char* dtoa_so8(double dxx, char* start) {
  // 8 sig fig number, 0.99999995 <= dxx < 99999999.5
  uint32_t uii;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx < 99.999999499999) {
    if (dxx < 9.9999999499999) {
      // 1 integer digit, up to 7 fractional digits.
      double_bround7(dxx, banker_round6, &quotient, &remainder);
      return qrtoa_1p7(quotient, remainder, start);
    }
    // 2 integer digits, up to 6 fractional digits.
    double_bround6(dxx, banker_round6, &quotient, &remainder);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    // Emit the fraction two digits at a time, stopping early once the
    // remaining digits are all zero.  The pretail/tail labels below are
    // jump targets shared with the wider-integer branches further down.
    quotient = remainder / 10000;
    memcpy(start, &(digit2_table[quotient * 2]), 2);
    remainder -= 10000 * quotient;
    if (remainder) {
      start += 2;
    dtoa_so8_pretail4:
      // Exactly 4 fractional digits remain to be written.
      quotient = remainder / 100;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= 100 * quotient;
      if (remainder) {
        start += 2;
      dtoa_so8_pretail2:
        // Exactly 2 fractional digits remain to be written.
        memcpy(start, &(digit2_table[remainder * 2]), 2);
      }
    }
  dtoa_so8_tail:
    // start points at the final two-digit pair; drop a trailing '0'.
    if (start[1] != '0') {
      return &(start[2]);
    }
    return &(start[1]);
  } else if (dxx < 9999.9999499999) {
    if (dxx < 999.99999499999) {
      // 3 integer digits, up to 5 fractional digits.
      double_bround5(dxx, banker_round6, &uii, &remainder);
      quotient = uii / 100;
      *start++ = '0' + quotient;
      quotient = uii - 100 * quotient;
      start = memcpya(start, &(digit2_table[quotient * 2]), 2);
      if (!remainder) {
        return start;
      }
      *start++ = '.';
      quotient = remainder / 1000;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= quotient * 1000;
      if (!remainder) {
        goto dtoa_so8_tail;
      }
      start += 2;
    dtoa_so8_pretail3:
      // Exactly 3 fractional digits remain to be written.
      quotient = remainder / 10;
      memcpy(start, &(digit2_table[quotient * 2]), 2);
      remainder -= quotient * 10;
      if (!remainder) {
        goto dtoa_so8_tail;
      }
      start[2] = '0' + remainder;
      return &(start[3]);
    }
    // 4 integer digits, up to 4 fractional digits.
    double_bround4(dxx, banker_round6, &uii, &remainder);
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    quotient = uii - (100 * quotient);
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    goto dtoa_so8_pretail4;
  } else if (dxx < 999999.99499999) {
    if (dxx < 99999.999499999) {
      // 5 integer digits, up to 3 fractional digits.
      double_bround3(dxx, banker_round6, &uii, &remainder);
      quotient = uii / 10000;
      *start = '0' + quotient;
      uii -= 10000 * quotient;
      quotient = uii / 100;
      start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
      uii -= 100 * quotient;
      start = memcpya(start, &(digit2_table[uii * 2]), 2);
      if (!remainder) {
        return start;
      }
      *start++ = '.';
      goto dtoa_so8_pretail3;
    }
    // 6 integer digits, up to 2 fractional digits.
    double_bround2(dxx, banker_round6, &uii, &remainder);
    quotient = uii / 10000;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uii -= 10000 * quotient;
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uii -= 100 * quotient;
    start = memcpya(start, &(digit2_table[uii * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start++ = '.';
    goto dtoa_so8_pretail2;
  } else if (dxx < 9999999.9499999) {
    // 7 integer digits, up to 1 fractional digit.
    double_bround1(dxx, banker_round6, &uii, &remainder);
    quotient = uii / 1000000;
    *start = '0' + quotient;
    uii -= 1000000 * quotient;
    quotient = uii / 10000;
    start = memcpya(&(start[1]), &(digit2_table[quotient * 2]), 2);
    uii -= 10000 * quotient;
    quotient = uii / 100;
    start = memcpya(start, &(digit2_table[quotient * 2]), 2);
    uii -= 100 * quotient;
    start = memcpya(start, &(digit2_table[uii * 2]), 2);
    if (!remainder) {
      return start;
    }
    *start = '.';
    start[1] = '0' + remainder;
    return &(start[2]);
  } else {
    // All 8 significant figures land in the integer part.
    uitoa_z4(double_bround(dxx, banker_round6), start);
    return &(start[4]);
  }
}
2014
dtoa_e(double dxx,char * start)2015 char* dtoa_e(double dxx, char* start) {
2016 uint32_t xp10 = 0;
2017 uint32_t quotient;
2018 uint32_t remainder;
2019 char sign;
2020 if (dxx != dxx) {
2021 // do this first to avoid generating exception
2022 return memcpyl3a(start, "nan");
2023 } else if (dxx < 0) {
2024 *start++ = '-';
2025 dxx = -dxx;
2026 }
2027 if (dxx >= 9.9999994999999e-1) {
2028 if (dxx >= 9.9999994999999e7) {
2029 if (dxx >= 9.9999994999999e127) {
2030 if (dxx == INFINITY) {
2031 return memcpyl3a(start, "inf");
2032 } else if (dxx >= 9.9999994999999e255) {
2033 dxx *= 1.0e-256;
2034 xp10 |= 256;
2035 } else {
2036 dxx *= 1.0e-128;
2037 xp10 |= 128;
2038 }
2039 }
2040 if (dxx >= 9.9999994999999e63) {
2041 dxx *= 1.0e-64;
2042 xp10 |= 64;
2043 }
2044 if (dxx >= 9.9999994999999e31) {
2045 dxx *= 1.0e-32;
2046 xp10 |= 32;
2047 }
2048 if (dxx >= 9.9999994999999e15) {
2049 dxx *= 1.0e-16;
2050 xp10 |= 16;
2051 }
2052 if (dxx >= 9.9999994999999e7) {
2053 dxx *= 1.0e-8;
2054 xp10 |= 8;
2055 }
2056 }
2057 if (dxx >= 9.9999994999999e3) {
2058 dxx *= 1.0e-4;
2059 xp10 |= 4;
2060 }
2061 if (dxx >= 9.9999994999999e1) {
2062 dxx *= 1.0e-2;
2063 xp10 |= 2;
2064 }
2065 if (dxx >= 9.9999994999999) {
2066 dxx *= 1.0e-1;
2067 xp10++;
2068 }
2069 sign = '+';
2070 } else {
2071 if (dxx < 9.9999994999999e-8) {
2072 // general case
2073 if (dxx < 9.9999994999999e-128) {
2074 if (dxx == 0.0) {
2075 return memcpya(start, "0.000000e+00", 12);
2076 }
2077 if (dxx < 9.9999994999999e-256) {
2078 dxx *= 1.0e256;
2079 xp10 |= 256;
2080 } else {
2081 dxx *= 1.0e128;
2082 xp10 |= 128;
2083 }
2084 }
2085 if (dxx < 9.9999994999999e-64) {
2086 dxx *= 1.0e64;
2087 xp10 |= 64;
2088 }
2089 if (dxx < 9.9999994999999e-32) {
2090 dxx *= 1.0e32;
2091 xp10 |= 32;
2092 }
2093 if (dxx < 9.9999994999999e-16) {
2094 dxx *= 1.0e16;
2095 xp10 |= 16;
2096 }
2097 if (dxx < 9.9999994999999e-8) {
2098 dxx *= 100000000;
2099 xp10 |= 8;
2100 }
2101 }
2102 if (dxx < 9.999994999999e-4) {
2103 dxx *= 10000;
2104 xp10 |= 4;
2105 }
2106 if (dxx < 9.9999994999999e-2) {
2107 dxx *= 100;
2108 xp10 |= 2;
2109 }
2110 if (dxx < 9.9999994999999e-1) {
2111 dxx *= 10;
2112 xp10++;
2113 }
2114 sign = '-';
2115 }
2116 double_bround6(dxx, banker_round7, "ient, &remainder);
2117 *start++ = '0' + quotient;
2118 *start++ = '.';
2119 start = uitoa_z6(remainder, start);
2120 *start++ = 'e';
2121 *start++ = sign;
2122 if (xp10 >= 100) {
2123 quotient = xp10 / 100;
2124 *start++ = '0' + quotient;
2125 xp10 -= quotient * 100;
2126 }
2127 return memcpya(start, &(digit2_table[xp10 * 2]), 2);
2128 }
2129
ftoa_e(float fxx,char * start)2130 char* ftoa_e(float fxx, char* start) {
2131 uint32_t xp10 = 0;
2132 uint32_t quotient;
2133 uint32_t remainder;
2134 char sign;
2135 if (fxx != fxx) {
2136 // do this first to avoid generating exception
2137 return memcpyl3a(start, "nan");
2138 } else if (fxx < 0) {
2139 *start++ = '-';
2140 fxx = -fxx;
2141 }
2142 if (fxx >= 9.9999995e-1) {
2143 if (fxx >= 9.9999995e15) {
2144 if (fxx == INFINITY) {
2145 return memcpyl3a(start, "inf");
2146 } else if (fxx >= 9.9999995e31) {
2147 fxx *= 1.0e-32;
2148 xp10 |= 32;
2149 } else {
2150 fxx *= 1.0e-16;
2151 xp10 |= 16;
2152 }
2153 }
2154 if (fxx >= 9.9999995e7) {
2155 fxx *= 1.0e-8;
2156 xp10 |= 8;
2157 }
2158 if (fxx >= 9.9999995e3) {
2159 fxx *= 1.0e-4;
2160 xp10 |= 4;
2161 }
2162 if (fxx >= 9.9999995e1) {
2163 fxx *= 1.0e-2;
2164 xp10 |= 2;
2165 }
2166 if (fxx >= 9.9999995) {
2167 fxx *= 1.0e-1;
2168 xp10++;
2169 }
2170 sign = '+';
2171 } else {
2172 if (fxx < 9.9999995e-16) {
2173 if (fxx == 0.0) {
2174 return memcpya(start, "0.000000e+00", 12);
2175 } else if (fxx < 9.9999995e-32) {
2176 fxx *= 1.0e32;
2177 xp10 |= 32;
2178 } else {
2179 fxx *= 1.0e16;
2180 xp10 |= 16;
2181 }
2182 }
2183 if (fxx < 9.9999995e-8) {
2184 fxx *= 100000000;
2185 xp10 |= 8;
2186 }
2187 if (fxx < 9.9999995e-4) {
2188 fxx *= 10000;
2189 xp10 |= 4;
2190 }
2191 if (fxx < 9.9999995e-2) {
2192 fxx *= 100;
2193 xp10 |= 2;
2194 }
2195 if (fxx < 9.9999995e-1) {
2196 fxx *= 10;
2197 xp10++;
2198 }
2199 sign = '-';
2200 }
2201 float_round6(fxx, "ient, &remainder);
2202 *start++ = '0' + quotient;
2203 *start++ = '.';
2204 start = uitoa_z6(remainder, start);
2205 *start++ = 'e';
2206 *start++ = sign;
2207 return memcpya(start, &(digit2_table[xp10 * 2]), 2);
2208 }
2209
// Writes dxx in fixed-point notation with exactly 2 digits after the
// decimal point (like "%.2f"), returning one past the last written
// character.  No null terminator is written.
char* dtoa_f_p2(double dxx, char* start) {
  const double* br_ptr;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN check first, to avoid generating an exception in the comparisons.
    return memcpyl3a(start, "nan");
  } else if (dxx < 9.9949999999999) {
    if (dxx < 0) {
      *start++ = '-';
      dxx = -dxx;
      if (dxx >= 9.9949999999999) {
	// |dxx| needs 2+ integer digits; jump to the wide path with the
	// '-' already written.
	goto dtoa_f_p2_10;
      }
    }
    // Single integer digit.
    double_bround2(dxx, banker_round11, &quotient, &remainder);
    *start++ = '0' + quotient;
  dtoa_f_p2_dec:
    // Shared epilogue: '.' plus exactly two fractional digits.
    *start++ = '.';
    return memcpya(start, &(digit2_table[remainder * 2]), 2);
  }
 dtoa_f_p2_10:
  if (dxx < 9999999.9949999) {
    // Pick the banker-rounding table matching the integer-digit count, so
    // the total requested precision stays within a double's significand.
    if (dxx < 999.99499999999) {
      if (dxx < 99.994999999999) {
	br_ptr = banker_round10;
      } else {
	br_ptr = banker_round9;
      }
    } else if (dxx < 99999.994999999) {
      if (dxx < 9999.9949999999) {
	br_ptr = banker_round8;
      } else {
	br_ptr = banker_round7;
      }
    } else if (dxx < 999999.99499999) {
      br_ptr = banker_round6;
    } else {
      br_ptr = banker_round5;
    }
    double_bround2(dxx, br_ptr, &quotient, &remainder);
    start = uint32toa(quotient, start);
    goto dtoa_f_p2_dec;
  }
  if (dxx == INFINITY) {
    return memcpyl3a(start, "inf");
  }
  // just punt larger numbers to glibc for now, this isn't a bottleneck
  start += sprintf(start, "%.2f", dxx);
  return start;
}
2260
// Writes dxx in fixed-point notation with exactly 3 digits after the
// decimal point (like "%.3f"), returning one past the last written
// character.  No null terminator is written.
char* dtoa_f_p3(double dxx, char* start) {
  const double* br_ptr;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN check first, to avoid generating an exception in the comparisons.
    return memcpyl3a(start, "nan");
  } else if (dxx < 9.9994999999999) {
    if (dxx < 0) {
      *start++ = '-';
      dxx = -dxx;
      if (dxx >= 9.9994999999999) {
	// |dxx| needs 2+ integer digits; jump to the wide path with the
	// '-' already written.
	goto dtoa_f_p3_10;
      }
    }
    // Single integer digit.
    double_bround3(dxx, banker_round10, &quotient, &remainder);
    *start++ = '0' + quotient;
  dtoa_f_p3_dec:
    // Shared epilogue: '.' plus exactly three fractional digits.
    *start++ = '.';
    quotient = remainder / 100;
    remainder -= 100 * quotient;
    *start++ = '0' + quotient;
    return memcpya(start, &(digit2_table[remainder * 2]), 2);
  }
 dtoa_f_p3_10:
  if (dxx < 999999.99949999) {
    // Pick the banker-rounding table matching the integer-digit count, so
    // the total requested precision stays within a double's significand.
    if (dxx < 999.99949999999) {
      if (dxx < 99.999499999999) {
	br_ptr = banker_round9;
      } else {
	br_ptr = banker_round8;
      }
    } else if (dxx < 99999.999499999) {
      if (dxx < 9999.9994999999) {
	br_ptr = banker_round7;
      } else {
	br_ptr = banker_round6;
      }
    } else {
      br_ptr = banker_round5;
    }
    double_bround3(dxx, br_ptr, &quotient, &remainder);
    start = uint32toa(quotient, start);
    goto dtoa_f_p3_dec;
  }
  if (dxx == INFINITY) {
    return memcpyl3a(start, "inf");
  }
  // Punt larger numbers to the libc printf; this isn't a bottleneck.
  start += sprintf(start, "%.3f", dxx);
  return start;
}
2311
// Writes dxx right-aligned in (at least) 9 columns with exactly 6 digits
// after the decimal point (like "%9.6f"); returns one past the last
// written character.  Values needing more than 3 integer digits exceed the
// 9-column minimum.  No null terminator is written.
char* dtoa_f_w9p6(double dxx, char* start) {
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN check first, to avoid generating an exception in the comparisons.
    return memcpya(start, "      nan", 9);
  } else if (dxx < 9.9999994999999) {
    if (dxx < 0) {
      *start++ = '-';
      dxx = -dxx;
      if (dxx >= 9.9999994999999) {
	// |dxx| needs 2+ integer digits; the '-' occupies the pad column.
	goto dtoa_f_w9p6_10;
      }
    } else {
      // Leading pad space for single-digit nonnegative values.
      *start++ = ' ';
    }
    double_bround6(dxx, banker_round7, &quotient, &remainder);
    *start++ = '0' + quotient;
  dtoa_f_w9p6_dec:
    // Shared epilogue: '.' plus exactly six (zero-padded) fractional digits.
    *start++ = '.';
    return uitoa_z6(remainder, start);
  }
 dtoa_f_w9p6_10:
  if (dxx < 999.99999949999) {
    // 2-3 integer digits; rounding table chosen to keep total precision
    // within a double's significand.
    double_bround6(dxx, (dxx < 99.999999499999)? banker_round6 : banker_round5, &quotient, &remainder);
    start = uint32toa(quotient, start);
    goto dtoa_f_w9p6_dec;
  }
  if (dxx == INFINITY) {
    return memcpya(start, "      inf", 9);
  }
  // Punt larger numbers to the libc printf; this isn't a bottleneck.
  start += sprintf(start, "%.6f", dxx);
  return start;
}
2345
// Writes dxx right-aligned in (at least) 7 columns with exactly 4 digits
// after the decimal point (like "%7.4f"); returns one past the last
// written character.  Values needing more than 1 integer digit may exceed
// the 7-column minimum.  No null terminator is written.
char* dtoa_f_w7p4(double dxx, char* start) {
  const double* br_ptr;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN check first, to avoid generating an exception in the comparisons.
    return memcpya(start, "    nan", 7);
  } else if (dxx < 9.9999499999999) {
    if (dxx < 0) {
      *start++ = '-';
      dxx = -dxx;
      if (dxx >= 9.9999499999999) {
	// |dxx| needs 2+ integer digits; the '-' occupies the pad column.
	goto dtoa_f_w7p4_10;
      }
    } else {
      // Leading pad space for single-digit nonnegative values.
      *start++ = ' ';
    }
    double_bround4(dxx, banker_round9, &quotient, &remainder);
    *start++ = '0' + quotient;
  dtoa_f_w7p4_dec:
    // Shared epilogue: '.' plus exactly four fractional digits.
    *start++ = '.';
    quotient = remainder / 100;
    remainder -= 100 * quotient;
    return memcpya(memcpya(start, &(digit2_table[quotient * 2]), 2), &(digit2_table[remainder * 2]), 2);
  }
 dtoa_f_w7p4_10:
  if (dxx < 99999.999949999) {
    // Pick the banker-rounding table matching the integer-digit count, so
    // the total requested precision stays within a double's significand.
    if (dxx < 999.99994999999) {
      if (dxx < 99.999949999999) {
	br_ptr = banker_round8;
      } else {
	br_ptr = banker_round7;
      }
    } else if (dxx < 9999.9999499999) {
      br_ptr = banker_round6;
    } else {
      br_ptr = banker_round5;
    }
    double_bround4(dxx, br_ptr, &quotient, &remainder);
    start = uint32toa(quotient, start);
    goto dtoa_f_w7p4_dec;
  }
  if (dxx == INFINITY) {
    return memcpya(start, "    inf", 7);
  }
  // Punt larger numbers to the libc printf; this isn't a bottleneck.
  start += sprintf(start, "%.4f", dxx);
  return start;
}
2393
// Prettier fixed-width decimal: replaces trailing zero(es) with spaces if
// and only if the value appears to be an exact multiple of 10^{-5}.
// Does not detect exact matches when abs(dxx) > 2^31 / 10^5.
char* dtoa_f_w9p6_spaced(double dxx, char* start) {
  const double scaled = dxx * 100000 + 0.00000005;
  char* write_end = dtoa_f_w9p6(dxx, start);
  // Blank the trailing zeroes only when scaled is within 1e-7 of an
  // integer, i.e. the sixth decimal digit is zero and the match looks exact.
  if (scaled - ((double)((int32_t)scaled)) < 0.0000001) {
    trailing_zeroes_to_spaces(write_end);
  }
  return write_end;
}
2406
// Like dtoa_f_w9p6_spaced(), but removes the trailing zeroes entirely
// (adjusting the returned end pointer) instead of blanking them.
// Same exactness-detection conditions as _spaced().
char* dtoa_f_w9p6_clipped(double dxx, char* start) {
  const double scaled = dxx * 100000 + 0.00000005;
  char* write_end = dtoa_f_w9p6(dxx, start);
  // Clip only when scaled is within 1e-7 of an integer (match looks exact).
  if (scaled - ((double)((int32_t)scaled)) < 0.0000001) {
    write_end = clip_trailing_zeroes(write_end);
  }
  return write_end;
}
2416
// Writes dxx with 6 significant figures in "%g"-style shortest form:
// exponential notation below ~1e-4 and at/above ~1e6, plain decimal in
// between, bare "0" for zero.  Returns one past the last written
// character; no null terminator is written.
char* dtoa_g(double dxx, char* start) {
  uint32_t xp10 = 0;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN check first, to avoid generating an exception in the comparisons.
    return memcpyl3a(start, "nan");
  } else if (dxx < 0) {
    *start++ = '-';
    dxx = -dxx;
  }
  if (dxx < 9.9999949999999e-5) {
    // 6 sig fig exponential notation, small
    // Scale dxx into [1, 10), accumulating the (negated) decimal exponent
    // in xp10 one power-of-two chunk at a time.
    if (dxx < 9.9999949999999e-16) {
      if (dxx < 9.9999949999999e-128) {
	if (dxx == 0.0) {
	  *start = '0';
	  return &(start[1]);
	} else if (dxx < 9.9999949999999e-256) {
	  dxx *= 1.0e256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e128;
	  xp10 |= 128;
	}
      }
      if (dxx < 9.9999949999999e-64) {
	dxx *= 1.0e64;
	xp10 |= 64;
      }
      if (dxx < 9.9999949999999e-32) {
	dxx *= 1.0e32;
	xp10 |= 32;
      }
      if (dxx < 9.9999949999999e-16) {
	dxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (dxx < 9.9999949999999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9999949999999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9999949999999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9999949999999e-1) {
      dxx *= 10;
      xp10++;
    }
    // Mantissa (1 digit + up to 5 decimals), then "e-" and the exponent.
    double_bround5(dxx, banker_round8, &quotient, &remainder);
    start = memcpya(qrtoa_1p5(quotient, remainder, start), "e-", 2);
    if (xp10 >= 100) {
      quotient = xp10 / 100;
      *start++ = '0' + quotient;
      xp10 -= 100 * quotient;
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 999999.49999999) {
    // 6 sig fig exponential notation, large
    if (dxx >= 9.9999949999999e15) {
      if (dxx >= 9.9999949999999e127) {
	if (dxx == INFINITY) {
	  return memcpyl3a(start, "inf");
	} else if (dxx >= 9.9999949999999e255) {
	  dxx *= 1.0e-256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e-128;
	  xp10 |= 128;
	}
      }
      if (dxx >= 9.9999949999999e63) {
	dxx *= 1.0e-64;
	xp10 |= 64;
      }
      if (dxx >= 9.9999949999999e31) {
	dxx *= 1.0e-32;
	xp10 |= 32;
      }
      if (dxx >= 9.9999949999999e15) {
	dxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (dxx >= 9.9999949999999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9999949999999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9999949999999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9999949999999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    // Mantissa (1 digit + up to 5 decimals), then "e+" and the exponent.
    double_bround5(dxx, banker_round8, &quotient, &remainder);
    start = memcpya(qrtoa_1p5(quotient, remainder, start), "e+", 2);
    if (xp10 >= 100) {
      quotient = xp10 / 100;
      *start++ = '0' + quotient;
      xp10 -= 100 * quotient;
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 0.99999949999999) {
    // Plain decimal with the point inside/after the significant digits.
    return dtoa_so6(dxx, start);
  } else {
    // 6 sig fig decimal, no less than ~0.0001
    // Emit "0." plus up to three leading fractional zeroes, then shift the
    // remaining significant digits into integer range and print them.
    start = memcpya(start, "0.", 2);
    if (dxx < 9.9999949999999e-3) {
      dxx *= 100;
      start = memcpya(start, "00", 2);
    }
    if (dxx < 9.9999949999999e-2) {
      dxx *= 10;
      *start++ = '0';
    }
    return uitoa_trunc6(double_bround(dxx * 1000000, banker_round8), start);
  }
}
2546
// Float analogue of dtoa_g(): writes fxx with 6 significant figures in
// "%g"-style shortest form (exponential outside ~[1e-4, 1e6), plain
// decimal inside, bare "0" for zero).  Returns one past the last written
// character; no null terminator is written.
char* ftoa_g(float fxx, char* start) {
  uint32_t xp10 = 0;
  uint32_t quotient;
  uint32_t remainder;
  if (fxx != fxx) {
    // NaN check first, to avoid generating an exception in the comparisons.
    return memcpyl3a(start, "nan");
  } else if (fxx < 0) {
    *start++ = '-';
    fxx = -fxx;
  }
  if (fxx < 9.9999944e-5) {
    // 6 sig fig exponential notation, small: scale fxx into [1, 10),
    // accumulating the (negated) decimal exponent in xp10.
    if (fxx < 9.9999944e-16) {
      if (fxx == 0.0) {
	*start = '0';
	return &(start[1]);
      } else if (fxx < 9.9999944e-32) {
	fxx *= 1.0e32;
	xp10 |= 32;
      } else {
	fxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (fxx < 9.9999944e-8) {
      fxx *= 100000000;
      xp10 |= 8;
    }
    if (fxx < 9.9999944e-4) {
      fxx *= 10000;
      xp10 |= 4;
    }
    if (fxx < 9.9999944e-2) {
      fxx *= 100;
      xp10 |= 2;
    }
    if (fxx < 9.9999944e-1) {
      fxx *= 10;
      xp10++;
    }
    // Mantissa (1 digit + up to 5 decimals), then "e-" and 2-digit exponent.
    float_round5(fxx, &quotient, &remainder);
    return memcpya(memcpya(qrtoa_1p5(quotient, remainder, start), "e-", 2), &(digit2_table[xp10 * 2]), 2);
  } else if (fxx >= 999999.44) {
    // 6 sig fig exponential notation, large.
    if (fxx >= 9.9999944e15) {
      if (fxx == INFINITY) {
	return memcpyl3a(start, "inf");
      } else if (fxx >= 9.9999944e31) {
	fxx *= 1.0e-32;
	xp10 |= 32;
      } else {
	fxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (fxx >= 9.9999944e7) {
      fxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (fxx >= 9.9999944e3) {
      fxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (fxx >= 9.9999944e1) {
      fxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (fxx >= 9.9999944e0) {
      fxx *= 1.0e-1;
      xp10++;
    }
    // Mantissa (1 digit + up to 5 decimals), then "e+" and 2-digit exponent.
    float_round5(fxx, &quotient, &remainder);
    return memcpya(memcpya(qrtoa_1p5(quotient, remainder, start), "e+", 2), &(digit2_table[xp10 * 2]), 2);
  } else if (fxx >= 0.99999944) {
    // Plain decimal with the point inside/after the significant digits.
    return ftoa_so6(fxx, start);
  } else {
    // 6 sig fig decimal, no less than ~0.0001
    // Emit "0." plus up to three leading fractional zeroes, then shift the
    // remaining significant digits into integer range and print them.
    start = memcpya(start, "0.", 2);
    if (fxx < 9.9999944e-3) {
      fxx *= 100;
      start = memcpya(start, "00", 2);
    }
    if (fxx < 9.9999944e-2) {
      fxx *= 10;
      *start++ = '0';
    }
    return uitoa_trunc6(float_round(fxx * 1000000), start);
  }
}
2634
// Writes dxx with 2 significant figures in "%g"-style shortest form,
// right-justified (space-padded) to at least min_width columns.  Returns
// one past the last written character; output may exceed min_width when
// the rendered number is longer.  min_width must be >= 5 (asserted) so
// the unsigned "min_width - 4/5" arithmetic below cannot underflow.
char* dtoa_g_wxp2(double dxx, uint32_t min_width, char* start) {
  assert(min_width >= 5);
  uint32_t xp10 = 0;
  // The number is first rendered into wbuf, then copied into place with
  // left padding once its width is known.
  char wbuf[16];
  char* wpos = wbuf;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN check first, to avoid generating an exception in the comparisons.
    memcpy(memseta(start, 32, min_width - 4), " nan", 4);
    return &(start[min_width]);
  } else if (dxx < 0) {
    *wpos++ = '-';
    dxx = -dxx;
  }
  if (dxx < 9.9499999999999e-5) {
    // 2 sig fig exponential notation, small
    // Scale dxx into [1, 10), accumulating the (negated) decimal exponent
    // in xp10 one power-of-two chunk at a time.
    if (dxx < 9.9499999999999e-16) {
      if (dxx < 9.9499999999999e-128) {
	if (dxx == 0.0) {
	  memset(start, 32, min_width - 1);
	  start[min_width - 1] = '0';
	  return &(start[min_width]);
	} else if (dxx < 9.9499999999999e-256) {
	  dxx *= 1.0e256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e128;
	  xp10 |= 128;
	}
      }
      if (dxx < 9.9499999999999e-64) {
	dxx *= 1.0e64;
	xp10 |= 64;
      }
      if (dxx < 9.9499999999999e-32) {
	dxx *= 1.0e32;
	xp10 |= 32;
      }
      if (dxx < 9.9499999999999e-16) {
	dxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (dxx < 9.9499999999999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9499999999999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9499999999999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9499999999999e-1) {
      dxx *= 10;
      xp10++;
    }
    double_bround1(dxx, banker_round12, &quotient, &remainder);
    wpos = qrtoa_1p1(quotient, remainder, wpos);
    // remainder is reused here as the mantissa's rendered width.
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      // 5 exponent-suffix chars ("e-" + 3 digits).
      if (remainder < min_width - 5) {
	memcpy(memseta(start, 32, min_width - 5 - remainder), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e-", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      // 4 exponent-suffix chars ("e-" + 2 digits).
      if (remainder < min_width - 4) {
	memcpy(memseta(start, 32, min_width - 4 - remainder), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e-", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 99.499999999999) {
    // 2 sig fig exponential notation, large
    if (dxx >= 9.9499999999999e15) {
      if (dxx >= 9.9499999999999e127) {
	if (dxx == INFINITY) {
	  start = memseta(start, 32, min_width - 4);
	  // wpos advanced past wbuf iff a '-' sign was recorded above.
	  if (wpos == wbuf) {
	    return memcpya(start, " inf", 4);
	  } else {
	    return memcpya(start, "-inf", 4);
	  }
	} else if (dxx >= 9.9499999999999e255) {
	  dxx *= 1.0e-256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e-128;
	  xp10 |= 128;
	}
      }
      if (dxx >= 9.9499999999999e63) {
	dxx *= 1.0e-64;
	xp10 |= 64;
      }
      if (dxx >= 9.9499999999999e31) {
	dxx *= 1.0e-32;
	xp10 |= 32;
      }
      if (dxx >= 9.9499999999999e15) {
	dxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (dxx >= 9.9499999999999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9499999999999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9499999999999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9499999999999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    double_bround1(dxx, banker_round12, &quotient, &remainder);
    wpos = qrtoa_1p1(quotient, remainder, wpos);
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      if (remainder < min_width - 5) {
	memcpy(memseta(start, 32, min_width - 5 - remainder), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e+", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      if (remainder < min_width - 4) {
	memcpy(memseta(start, 32, min_width - 4 - remainder), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e+", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else {
    if (dxx >= 0.99499999999999) {
      wpos = dtoa_so2(dxx, wpos);
    } else {
      // 2 sig fig decimal, no less than ~0.0001
      wpos = memcpya(wpos, "0.", 2);
      if (dxx < 9.9499999999999e-3) {
	dxx *= 100;
	wpos = memcpya(wpos, "00", 2);
      }
      if (dxx < 9.9499999999999e-2) {
	dxx *= 10;
	*wpos++ = '0';
      }
      wpos = uitoa_trunc2(double_bround(dxx * 100, banker_round12), wpos);
    }
    // Pad on the left to min_width, then copy the buffered number.
    remainder = wpos - wbuf;
    if (remainder < min_width) {
      memcpy(memseta(start, 32, min_width - remainder), wbuf, remainder);
      return &(start[min_width]);
    } else {
      return memcpya(start, wbuf, remainder);
    }
  }
}
2813
// Writes dxx with 3 significant figures in "%g"-style shortest form,
// right-justified (space-padded) to at least min_width columns.  Returns
// one past the last written character; output may exceed min_width when
// the rendered number is longer.  min_width must be >= 5 (asserted) so
// the unsigned "min_width - 4/5" arithmetic below cannot underflow.
char* dtoa_g_wxp3(double dxx, uint32_t min_width, char* start) {
  assert(min_width >= 5);
  uint32_t xp10 = 0;
  // The number is first rendered into wbuf, then copied into place with
  // left padding once its width is known.
  char wbuf[16];
  char* wpos = wbuf;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN check first, to avoid generating an exception in the comparisons.
    memcpy(memseta(start, 32, min_width - 4), " nan", 4);
    return &(start[min_width]);
  } else if (dxx < 0) {
    *wpos++ = '-';
    dxx = -dxx;
  }
  if (dxx < 9.9949999999999e-5) {
    // 3 sig fig exponential notation, small
    // Scale dxx into [1, 10), accumulating the (negated) decimal exponent
    // in xp10 one power-of-two chunk at a time.
    if (dxx < 9.9949999999999e-16) {
      if (dxx < 9.9949999999999e-128) {
	if (dxx == 0.0) {
	  memset(start, 32, min_width - 1);
	  start[min_width - 1] = '0';
	  return &(start[min_width]);
	} else if (dxx < 9.9949999999999e-256) {
	  dxx *= 1.0e256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e128;
	  xp10 |= 128;
	}
      }
      if (dxx < 9.9949999999999e-64) {
	dxx *= 1.0e64;
	xp10 |= 64;
      }
      if (dxx < 9.9949999999999e-32) {
	dxx *= 1.0e32;
	xp10 |= 32;
      }
      if (dxx < 9.9949999999999e-16) {
	dxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (dxx < 9.9949999999999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9949999999999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9949999999999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9949999999999e-1) {
      dxx *= 10;
      xp10++;
    }
    double_bround2(dxx, banker_round11, &quotient, &remainder);
    wpos = qrtoa_1p2(quotient, remainder, wpos);
    // remainder is reused here as the mantissa's rendered width.
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      // 5 exponent-suffix chars ("e-" + 3 digits).
      if (remainder < min_width - 5) {
	memcpy(memseta(start, 32, min_width - 5 - remainder), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e-", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      // 4 exponent-suffix chars ("e-" + 2 digits).
      if (remainder < min_width - 4) {
	memcpy(memseta(start, 32, min_width - 4 - remainder), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e-", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 999.49999999999) {
    // 3 sig fig exponential notation, large
    if (dxx >= 9.9949999999999e15) {
      if (dxx >= 9.9949999999999e127) {
	if (dxx == INFINITY) {
	  start = memseta(start, 32, min_width - 4);
	  // wpos advanced past wbuf iff a '-' sign was recorded above.
	  if (wpos == wbuf) {
	    return memcpya(start, " inf", 4);
	  } else {
	    return memcpya(start, "-inf", 4);
	  }
	} else if (dxx >= 9.9949999999999e255) {
	  dxx *= 1.0e-256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e-128;
	  xp10 |= 128;
	}
      }
      if (dxx >= 9.9949999999999e63) {
	dxx *= 1.0e-64;
	xp10 |= 64;
      }
      if (dxx >= 9.9949999999999e31) {
	dxx *= 1.0e-32;
	xp10 |= 32;
      }
      if (dxx >= 9.9949999999999e15) {
	dxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (dxx >= 9.9949999999999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9949999999999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9949999999999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9949999999999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    double_bround2(dxx, banker_round11, &quotient, &remainder);
    wpos = qrtoa_1p2(quotient, remainder, wpos);
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      if (remainder < min_width - 5) {
	memcpy(memseta(start, 32, min_width - 5 - remainder), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e+", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      if (remainder < min_width - 4) {
	memcpy(memseta(start, 32, min_width - 4 - remainder), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e+", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else {
    if (dxx >= 0.99949999999999) {
      wpos = dtoa_so3(dxx, wpos);
    } else {
      // 3 sig fig decimal, no less than ~0.001
      wpos = memcpya(wpos, "0.", 2);
      if (dxx < 9.9949999999999e-3) {
	dxx *= 100;
	wpos = memcpya(wpos, "00", 2);
      }
      if (dxx < 9.9949999999999e-2) {
	dxx *= 10;
	*wpos++ = '0';
      }
      wpos = uitoa_trunc3(double_bround(dxx * 1000, banker_round11), wpos);
    }
    // Pad on the left to min_width, then copy the buffered number.
    remainder = wpos - wbuf;
    if (remainder < min_width) {
      memcpy(memseta(start, 32, min_width - remainder), wbuf, remainder);
      return &(start[min_width]);
    } else {
      return memcpya(start, wbuf, remainder);
    }
  }
}
2992
// Writes dxx with 4 significant figures in "%g"-style shortest form,
// right-justified (space-padded) to at least min_width columns.  Returns
// one past the last written character; output may exceed min_width when
// the rendered number is longer.
// NOTE(review): unlike the wxp2/wxp3 variants there is no
// assert(min_width >= 5) here; the zero branch computes min_width - 1 with
// unsigned arithmetic, so callers presumably always pass min_width >= 1 --
// confirm against call sites.
char* dtoa_g_wxp4(double dxx, uint32_t min_width, char* start) {
  uint32_t xp10 = 0;
  // The number is first rendered into wbuf, then copied into place with
  // left padding once its width is known.
  char wbuf[16];
  char* wpos = wbuf;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN check first, to avoid generating an exception in the comparisons.
    if (min_width > 3) {
      start = memseta(start, 32, min_width - 3);
    }
    return memcpyl3a(start, "nan");
  } else if (dxx < 0) {
    *wpos++ = '-';
    dxx = -dxx;
  }
  if (dxx < 9.9994999999999e-5) {
    // 4 sig fig exponential notation, small
    // Scale dxx into [1, 10), accumulating the (negated) decimal exponent
    // in xp10 one power-of-two chunk at a time.
    if (dxx < 9.9994999999999e-16) {
      if (dxx < 9.9994999999999e-128) {
	if (dxx == 0.0) {
	  memset(start, 32, min_width - 1);
	  start[min_width - 1] = '0';
	  return &(start[min_width]);
	} else if (dxx < 9.9994999999999e-256) {
	  dxx *= 1.0e256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e128;
	  xp10 |= 128;
	}
      }
      if (dxx < 9.9994999999999e-64) {
	dxx *= 1.0e64;
	xp10 |= 64;
      }
      if (dxx < 9.9994999999999e-32) {
	dxx *= 1.0e32;
	xp10 |= 32;
      }
      if (dxx < 9.9994999999999e-16) {
	dxx *= 1.0e16;
	xp10 |= 16;
      }
    }
    if (dxx < 9.9994999999999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9994999999999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9994999999999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9994999999999e-1) {
      dxx *= 10;
      xp10++;
    }
    double_bround3(dxx, banker_round10, &quotient, &remainder);
    wpos = qrtoa_1p3(quotient, remainder, wpos);
    // remainder is reused here as the mantissa's rendered width.
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      // 5 exponent-suffix chars ("e-" + 3 digits).
      if (remainder + 5 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 5)), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e-", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      // 4 exponent-suffix chars ("e-" + 2 digits).
      if (remainder + 4 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 4)), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e-", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 9999.4999999999) {
    // 4 sig fig exponential notation, large
    if (dxx >= 9.9994999999999e15) {
      if (dxx >= 9.9994999999999e127) {
	if (dxx == INFINITY) {
	  if (min_width > 4) {
	    start = memseta(start, 32, min_width - 4);
	  }
	  // wpos advanced past wbuf iff a '-' sign was recorded above.
	  if (wpos == wbuf) {
	    return memcpya(start, " inf", 4);
	  } else {
	    return memcpya(start, "-inf", 4);
	  }
	} else if (dxx >= 9.9994999999999e255) {
	  dxx *= 1.0e-256;
	  xp10 |= 256;
	} else {
	  dxx *= 1.0e-128;
	  xp10 |= 128;
	}
      }
      if (dxx >= 9.9994999999999e63) {
	dxx *= 1.0e-64;
	xp10 |= 64;
      }
      if (dxx >= 9.9994999999999e31) {
	dxx *= 1.0e-32;
	xp10 |= 32;
      }
      if (dxx >= 9.9994999999999e15) {
	dxx *= 1.0e-16;
	xp10 |= 16;
      }
    }
    if (dxx >= 9.9994999999999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9994999999999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9994999999999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9994999999999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    double_bround3(dxx, banker_round10, &quotient, &remainder);
    wpos = qrtoa_1p3(quotient, remainder, wpos);
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      if (remainder + 5 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 5)), wbuf, remainder);
	start = &(start[min_width - 5]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e+", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      if (remainder + 4 < min_width) {
	memcpy(memseta(start, 32, min_width - (remainder + 4)), wbuf, remainder);
	start = &(start[min_width - 4]);
      } else {
	start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e+", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else {
    if (dxx >= 0.99994999999999) {
      wpos = dtoa_so4(dxx, wpos);
    } else {
      // 4 sig fig decimal, no less than ~0.0001
      wpos = memcpya(wpos, "0.", 2);
      if (dxx < 9.9994999999999e-3) {
	dxx *= 100;
	wpos = memcpya(wpos, "00", 2);
      }
      if (dxx < 9.9994999999999e-2) {
	dxx *= 10;
	*wpos++ = '0';
      }
      wpos = uitoa_trunc4(double_bround(dxx * 10000, banker_round10), wpos);
    }
    // Pad on the left to min_width, then copy the buffered number.
    remainder = wpos - wbuf;
    if (remainder < min_width) {
      memcpy(memseta(start, 32, min_width - remainder), wbuf, remainder);
      return &(start[min_width]);
    } else {
      return memcpya(start, wbuf, remainder);
    }
  }
}
3174
// Writes dxx to start[] with 8 significant figures, left-padded with spaces
// to at least min_width characters; returns a pointer just past the last
// character written.  Handles nan/inf/0, and switches to exponential
// notation below ~1e-4 and at or above ~1e8.  Digits are staged in wbuf[]
// so the final width is known before padding is emitted.
char* dtoa_g_wxp8(double dxx, uint32_t min_width, char* start) {
  uint32_t xp10 = 0;
  char wbuf[16];
  char* wpos = wbuf;
  uint32_t quotient;
  uint32_t remainder;
  if (dxx != dxx) {
    // NaN
    if (min_width > 3) {
      start = memseta(start, 32, min_width - 3);
    }
    return memcpyl3a(start, "nan");
  } else if (dxx < 0) {
    *wpos++ = '-';
    dxx = -dxx;
  }
  if (dxx < 9.9999999499999e-5) {
    // 8 sig fig exponential notation, small.  The nested comparisons below
    // accumulate the base-10 exponent into xp10 one binary digit at a time
    // (256/128/64/.../1), normalizing dxx into [1, 10).
    if (dxx < 9.9999999499999e-16) {
      if (dxx < 9.9999999499999e-128) {
        if (dxx == 0.0) {
          memset(start, 32, min_width - 1);
          start[min_width - 1] = '0';
          return &(start[min_width]);
        } else if (dxx < 9.9999999499999e-256) {
          dxx *= 1.0e256;
          xp10 |= 256;
        } else {
          dxx *= 1.0e128;
          xp10 |= 128;
        }
      }
      if (dxx < 9.9999999499999e-64) {
        dxx *= 1.0e64;
        xp10 |= 64;
      }
      if (dxx < 9.9999999499999e-32) {
        dxx *= 1.0e32;
        xp10 |= 32;
      }
      if (dxx < 9.9999999499999e-16) {
        dxx *= 1.0e16;
        xp10 |= 16;
      }
    }
    if (dxx < 9.9999999499999e-8) {
      dxx *= 100000000;
      xp10 |= 8;
    }
    if (dxx < 9.9999999499999e-4) {
      dxx *= 10000;
      xp10 |= 4;
    }
    if (dxx < 9.9999999499999e-2) {
      dxx *= 100;
      xp10 |= 2;
    }
    if (dxx < 9.9999999499999e-1) {
      dxx *= 10;
      xp10++;
    }
    // bugfix: restore "&quotient" (the argument had been corrupted into a
    // stray string literal by HTML-entity mangling of "&quot")
    double_bround7(dxx, banker_round6, &quotient, &remainder);
    wpos = qrtoa_1p7(quotient, remainder, wpos);
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      // 3-digit exponent: "e-NNN"
      if (remainder + 5 < min_width) {
        memcpy(memseta(start, 32, min_width - (remainder + 5)), wbuf, remainder);
        start = &(start[min_width - 5]);
      } else {
        start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e-", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      // 2-digit exponent: "e-NN"
      if (remainder + 4 < min_width) {
        memcpy(memseta(start, 32, min_width - (remainder + 4)), wbuf, remainder);
        start = &(start[min_width - 4]);
      } else {
        start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e-", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else if (dxx >= 99999999.499999) {
    // 8 sig fig exponential notation, large
    if (dxx >= 9.9999999499999e15) {
      if (dxx >= 9.9999999499999e127) {
        if (dxx == INFINITY) {
          if (min_width > 4) {
            start = memseta(start, 32, min_width - 4);
          }
          // wpos != wbuf iff a '-' sign was staged earlier
          if (wpos == wbuf) {
            return memcpya(start, " inf", 4);
          } else {
            return memcpya(start, "-inf", 4);
          }
        } else if (dxx >= 9.9999999499999e255) {
          dxx *= 1.0e-256;
          xp10 |= 256;
        } else {
          dxx *= 1.0e-128;
          xp10 |= 128;
        }
      }
      if (dxx >= 9.9999999499999e63) {
        dxx *= 1.0e-64;
        xp10 |= 64;
      }
      if (dxx >= 9.9999999499999e31) {
        dxx *= 1.0e-32;
        xp10 |= 32;
      }
      if (dxx >= 9.9999999499999e15) {
        dxx *= 1.0e-16;
        xp10 |= 16;
      }
    }
    if (dxx >= 9.9999999499999e7) {
      dxx *= 1.0e-8;
      xp10 |= 8;
    }
    if (dxx >= 9.9999999499999e3) {
      dxx *= 1.0e-4;
      xp10 |= 4;
    }
    if (dxx >= 9.9999999499999e1) {
      dxx *= 1.0e-2;
      xp10 |= 2;
    }
    if (dxx >= 9.9999999499999e0) {
      dxx *= 1.0e-1;
      xp10++;
    }
    // bugfix: restore "&quotient" here as well
    double_bround7(dxx, banker_round6, &quotient, &remainder);
    wpos = qrtoa_1p7(quotient, remainder, wpos);
    remainder = wpos - wbuf;
    if (xp10 >= 100) {
      if (remainder + 5 < min_width) {
        memcpy(memseta(start, 32, min_width - (remainder + 5)), wbuf, remainder);
        start = &(start[min_width - 5]);
      } else {
        start = memcpya(start, wbuf, remainder);
      }
      quotient = xp10 / 100;
      start = memcpyax(start, "e+", 2, '0' + quotient);
      xp10 -= 100 * quotient;
    } else {
      if (remainder + 4 < min_width) {
        memcpy(memseta(start, 32, min_width - (remainder + 4)), wbuf, remainder);
        start = &(start[min_width - 4]);
      } else {
        start = memcpya(start, wbuf, remainder);
      }
      start = memcpya(start, "e+", 2);
    }
    return memcpya(start, &(digit2_table[xp10 * 2]), 2);
  } else {
    if (dxx >= 0.99999999499999) {
      wpos = dtoa_so8(dxx, wpos);
    } else {
      // 8 sig fig decimal, no less than ~0.0001
      wpos = memcpya(wpos, "0.", 2);
      if (dxx < 9.9999999499999e-3) {
        dxx *= 100;
        wpos = memcpya(wpos, "00", 2);
      }
      if (dxx < 9.9999999499999e-2) {
        dxx *= 10;
        *wpos++ = '0';
      }
      wpos = uitoa_trunc8(double_bround(dxx * 100000000, banker_round6), wpos);
    }
    remainder = wpos - wbuf;
    if (remainder < min_width) {
      // left-pad with spaces, then copy the staged digits
      memcpy(memseta(start, 32, min_width - remainder), wbuf, remainder);
      return &(start[min_width]);
    } else {
      return memcpya(start, wbuf, remainder);
    }
  }
}
3356
// Writes the human-readable chromosome code for numeric code num to buf:
// 0-22 as decimal digits, 23 -> 'X', 24 -> 'Y', 25 -> "XY", 26 -> "MT",
// and anything above 26 as '0' (--allow-extra-chr 0).  Returns a pointer
// just past the written characters.
char* chrom_print_human(uint32_t num, char* buf) {
  if (num < 10) {
    // single decimal digit
    buf[0] = '0' + num;
    return &(buf[1]);
  }
  if (num < 23) {
    // two-digit autosome code
    const uint32_t tens = num / 10;
    buf[0] = '0' + tens;
    buf[1] = '0' + (num - tens * 10);
    return &(buf[2]);
  }
  if (num < 25) {
    // X is 24th letter of alphabet, and 23rd chromosome; Y follows
    buf[0] = 'A' + num;
    return &(buf[1]);
  }
  if (num > 26) {
    // --allow-extra-chr 0
    buf[0] = '0';
    return &(buf[1]);
  }
  if (num == 25) {
    buf[0] = 'X';
    buf[1] = 'Y';
  } else {
    buf[0] = 'M';
    buf[1] = 'T';
  }
  return &(buf[2]);
}
3383
void magic_num(uint32_t divisor, uint64_t* multp, uint32_t* __restrict pre_shiftp, uint32_t* __restrict post_shiftp, uint32_t* __restrict incrp) {
  // Enables fast integer division by a constant not known until runtime. See
  // http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html .
  // Assumes divisor is not zero, of course.
  // May want to populate a struct instead.
  //
  // Outputs a multiplier, pre-shift, post-shift, and increment; the exact
  // consuming formula lives in the caller (see the blog post above for the
  // round-up/round-down magic-number derivation this follows).
  uint32_t down_multiplier = 0;
  uint32_t down_exponent = 0;
  uint32_t has_magic_down = 0;
  uint32_t quotient;
  uint32_t remainder;
  uint32_t ceil_log_2_d;
  uint32_t exponent;
  uint32_t uii;
  if (divisor & (divisor - 1)) {
    // Not a power of 2: search increasing exponents for a valid round-up
    // multiplier, remembering the first valid round-down multiplier as a
    // fallback.
    quotient = 0x80000000U / divisor;
    remainder = 0x80000000U - (quotient * divisor);
    ceil_log_2_d = 32 - __builtin_clz(divisor);
    for (exponent = 0; ; exponent++) {
      // invariant: quotient/remainder track 2^(31+exponent+1) / divisor
      if (remainder >= divisor - remainder) {
        quotient = quotient * 2 + 1;
        remainder = remainder * 2 - divisor;
      } else {
        quotient = quotient * 2;
        remainder = remainder * 2;
      }
      // stop once the round-up error bound is satisfied (or the exponent
      // search space is exhausted)
      if ((exponent >= ceil_log_2_d) || (divisor - remainder) <= (1U << exponent)) {
        break;
      }
      if ((!has_magic_down) && (remainder <= (1U << exponent))) {
        has_magic_down = 1;
        down_multiplier = quotient;
        down_exponent = exponent;
      }
    }
    if (exponent < ceil_log_2_d) {
      // round-up multiplier found
      *multp = quotient + 1;
      *pre_shiftp = 0;
      *post_shiftp = 32 + exponent;
      *incrp = 0;
      return;
    } else if (divisor & 1) {
      // odd divisor: use the saved round-down multiplier with increment 1
      *multp = down_multiplier;
      *pre_shiftp = 0;
      *post_shiftp = 32 + down_exponent;
      *incrp = 1;
      return;
    } else {
      // even divisor: strip trailing zero bits into the pre-shift, then
      // recurse on the odd part
      *pre_shiftp = __builtin_ctz(divisor);
      magic_num(divisor >> (*pre_shiftp), multp, &uii, post_shiftp, incrp);
      return;
    }
  } else {
    // power of 2: plain shift, no multiply needed
    *multp = 1;
    *pre_shiftp = 0;
    *post_shiftp = __builtin_ctz(divisor);
    *incrp = 0;
  }
}
3443
void fill_bits(uintptr_t loc_start, uintptr_t len, uintptr_t* bitarr) {
  // Sets bits [loc_start, loc_start + len) of bitarr.  len must be nonzero.
  assert(len);
  uintptr_t maj_start = loc_start / BITCT;
  uintptr_t maj_end = (loc_start + len) / BITCT;
  uintptr_t minor;
  if (maj_start == maj_end) {
    // entire range falls within a single word
    bitarr[maj_start] |= (ONELU << ((loc_start + len) % BITCT)) - (ONELU << (loc_start % BITCT));
  } else {
    // leading partial word
    bitarr[maj_start] |= ~((ONELU << (loc_start % BITCT)) - ONELU);
    // full middle words
    fill_ulong_one(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
    minor = (loc_start + len) % BITCT;
    if (minor) {
      // trailing partial word
      bitarr[maj_end] |= (ONELU << minor) - ONELU;
    }
  }
}
3460
void clear_bits(uintptr_t loc_start, uintptr_t len, uintptr_t* bitarr) {
  // Clears bits [loc_start, loc_start + len) of bitarr.  len must be
  // nonzero.  Mirror image of fill_bits().
  assert(len);
  uintptr_t maj_start = loc_start / BITCT;
  uintptr_t maj_end = (loc_start + len) / BITCT;
  uintptr_t minor;
  if (maj_start == maj_end) {
    // entire range falls within a single word
    bitarr[maj_start] &= ~((ONELU << ((loc_start + len) % BITCT)) - (ONELU << (loc_start % BITCT)));
  } else {
    // leading partial word: keep only the bits below loc_start
    bitarr[maj_start] &= ((ONELU << (loc_start % BITCT)) - ONELU);
    // full middle words
    fill_ulong_zero(maj_end - maj_start - 1, &(bitarr[maj_start + 1]));
    minor = (loc_start + len) % BITCT;
    if (minor) {
      // trailing partial word
      bitarr[maj_end] &= ~((ONELU << minor) - ONELU);
    }
  }
}
3477
uint32_t next_unset_unsafe(const uintptr_t* bitarr, uint32_t loc) {
  // Returns the index of the first clear bit at position >= loc.
  // "Unsafe": assumes such a bit exists before the end of the array.
  const uintptr_t* word_ptr = &(bitarr[loc / BITCT]);
  uintptr_t inv_bits = (~(*word_ptr)) >> (loc % BITCT);
  if (inv_bits) {
    return loc + CTZLU(inv_bits);
  }
  while (1) {
    inv_bits = ~(*(++word_ptr));
    if (inv_bits) {
      return ((uintptr_t)(word_ptr - bitarr)) * BITCT + CTZLU(inv_bits);
    }
  }
}
3489
3490 #ifdef __LP64__
uintptr_t next_unset_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
  // uintptr_t-index variant of next_unset_unsafe(): first clear bit at or
  // after loc.  Assumes one exists.
  const uintptr_t* word_ptr = &(bitarr[loc / BITCT]);
  uintptr_t inv_bits = (~(*word_ptr)) >> (loc % BITCT);
  if (inv_bits) {
    return loc + CTZLU(inv_bits);
  }
  while (1) {
    inv_bits = ~(*(++word_ptr));
    if (inv_bits) {
      return ((uintptr_t)(word_ptr - bitarr)) * BITCT + CTZLU(inv_bits);
    }
  }
}
3502 #endif
3503
uint32_t next_unset(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) {
  // safe version.
  // Returns the first clear bit at or after loc, capped at ceil; never
  // reads past the word containing bit (ceil - 1).
  assert(ceil >= 1);
  const uintptr_t* word_ptr = &(bitarr[loc / BITCT]);
  uintptr_t inv_bits = (~(*word_ptr)) >> (loc % BITCT);
  if (inv_bits) {
    const uint32_t candidate = loc + CTZLU(inv_bits);
    return MINV(candidate, ceil);
  }
  const uintptr_t* last_word_ptr = &(bitarr[(ceil - 1) / BITCT]);
  while (word_ptr < last_word_ptr) {
    inv_bits = ~(*(++word_ptr));
    if (inv_bits) {
      const uint32_t candidate = ((uintptr_t)(word_ptr - bitarr)) * BITCT + CTZLU(inv_bits);
      return MINV(candidate, ceil);
    }
  }
  return ceil;
}
3524
3525 #ifdef __LP64__
uintptr_t next_unset_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) {
  // Bounds-checked uintptr_t variant: first clear bit at or after loc,
  // capped at ceil.
  const uintptr_t* word_ptr = &(bitarr[loc / BITCT]);
  uintptr_t inv_bits = (~(*word_ptr)) >> (loc % BITCT);
  if (inv_bits) {
    const uintptr_t candidate = loc + CTZLU(inv_bits);
    return MINV(candidate, ceil);
  }
  const uintptr_t* last_word_ptr = &(bitarr[(ceil - 1) / BITCT]);
  while (word_ptr < last_word_ptr) {
    inv_bits = ~(*(++word_ptr));
    if (inv_bits) {
      const uintptr_t candidate = ((uintptr_t)(word_ptr - bitarr)) * BITCT + CTZLU(inv_bits);
      return MINV(candidate, ceil);
    }
  }
  return ceil;
}
3544 #endif
3545
uint32_t next_set_unsafe(const uintptr_t* bitarr, uint32_t loc) {
  // Returns the index of the first set bit at position >= loc.
  // "Unsafe": assumes one exists before the end of the array.
  const uintptr_t* word_ptr = &(bitarr[loc / BITCT]);
  uintptr_t cur_bits = (*word_ptr) >> (loc % BITCT);
  if (cur_bits) {
    return loc + CTZLU(cur_bits);
  }
  while (1) {
    cur_bits = *(++word_ptr);
    if (cur_bits) {
      return ((uintptr_t)(word_ptr - bitarr)) * BITCT + CTZLU(cur_bits);
    }
  }
}
3557
3558 #ifdef __LP64__
uintptr_t next_set_ul_unsafe(const uintptr_t* bitarr, uintptr_t loc) {
  // uintptr_t-index variant of next_set_unsafe(): first set bit at or after
  // loc.  Assumes one exists.
  const uintptr_t* word_ptr = &(bitarr[loc / BITCT]);
  uintptr_t cur_bits = (*word_ptr) >> (loc % BITCT);
  if (cur_bits) {
    return loc + CTZLU(cur_bits);
  }
  while (1) {
    cur_bits = *(++word_ptr);
    if (cur_bits) {
      return ((uintptr_t)(word_ptr - bitarr)) * BITCT + CTZLU(cur_bits);
    }
  }
}
3570 #endif
3571
uint32_t next_set(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) {
  // Bounds-checked: first set bit at or after loc, capped at ceil; never
  // reads past the word containing bit (ceil - 1).
  const uintptr_t* word_ptr = &(bitarr[loc / BITCT]);
  uintptr_t cur_bits = (*word_ptr) >> (loc % BITCT);
  if (cur_bits) {
    const uint32_t candidate = loc + CTZLU(cur_bits);
    return MINV(candidate, ceil);
  }
  const uintptr_t* last_word_ptr = &(bitarr[(ceil - 1) / BITCT]);
  while (word_ptr < last_word_ptr) {
    cur_bits = *(++word_ptr);
    if (cur_bits) {
      const uint32_t candidate = ((uintptr_t)(word_ptr - bitarr)) * BITCT + CTZLU(cur_bits);
      return MINV(candidate, ceil);
    }
  }
  return ceil;
}
3591
3592 #ifdef __LP64__
uintptr_t next_set_ul(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) {
  // Bounds-checked uintptr_t variant: first set bit at or after loc, capped
  // at ceil.
  const uintptr_t* word_ptr = &(bitarr[loc / BITCT]);
  uintptr_t cur_bits = (*word_ptr) >> (loc % BITCT);
  if (cur_bits) {
    const uintptr_t candidate = loc + CTZLU(cur_bits);
    return MINV(candidate, ceil);
  }
  const uintptr_t* last_word_ptr = &(bitarr[(ceil - 1) / BITCT]);
  while (word_ptr < last_word_ptr) {
    cur_bits = *(++word_ptr);
    if (cur_bits) {
      const uintptr_t candidate = ((uintptr_t)(word_ptr - bitarr)) * BITCT + CTZLU(cur_bits);
      return MINV(candidate, ceil);
    }
  }
  return ceil;
}
3611 #endif
3612
last_set_bit(const uintptr_t * bitarr,uint32_t word_ct)3613 int32_t last_set_bit(const uintptr_t* bitarr, uint32_t word_ct) {
3614 const uintptr_t* bitarr_ptr = &(bitarr[word_ct]);
3615 uintptr_t ulii;
3616 do {
3617 ulii = *(--bitarr_ptr);
3618 if (ulii) {
3619 return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
3620 }
3621 } while (bitarr_ptr > bitarr);
3622 return -1;
3623 }
3624
int32_t last_clear_bit(const uintptr_t* bitarr, uint32_t ceil) {
  // can return ceil or any lower number
  // Scans downward from bit (ceil - 1) for the highest clear bit; returns
  // -1 if every examined bit is set.
  const uintptr_t* bitarr_ptr = &(bitarr[ceil / BITCT]);
  uint32_t remainder = ceil % BITCT;
  uintptr_t ulii;
  if (remainder) {
    // check the bits below ceil within ceil's own word first
    ulii = (~(*bitarr_ptr)) & ((ONELU << remainder) - ONELU);
    if (ulii) {
      return (ceil | (BITCT - 1)) - CLZLU(ulii);
    }
  }
  // walk whole words backward until a clear bit appears
  while (bitarr_ptr > bitarr) {
    ulii = ~(*(--bitarr_ptr));
    if (ulii) {
      return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
    }
  }
  return -1;
}
3644
uint32_t prev_unset_unsafe(const uintptr_t* bitarr, uint32_t loc) {
  // unlike the next_{un}set family, this always returns a STRICTLY earlier
  // position
  // "Unsafe": assumes a clear bit exists somewhere below loc.
  const uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
  uint32_t remainder = loc % BITCT;
  uintptr_t ulii;
  if (remainder) {
    // inspect the bits below loc within loc's own word
    ulii = (~(*bitarr_ptr)) & ((ONELU << remainder) - ONELU);
    if (ulii) {
      return (loc | (BITCT - 1)) - CLZLU(ulii);
    }
  }
  // walk whole words backward until a clear bit appears
  do {
    ulii = ~(*(--bitarr_ptr));
  } while (!ulii);
  return ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
}
3662
3663 /*
3664 uint32_t prev_unset(uintptr_t* bitarr, uint32_t loc, uint32_t floor) {
3665 uintptr_t* bitarr_ptr = &(bitarr[loc / BITCT]);
3666 uint32_t remainder = loc % BITCT;
3667 uintptr_t* bitarr_first;
3668 uintptr_t ulii;
3669 if (remainder) {
3670 ulii = (~(*bitarr_ptr)) & ((ONELU << remainder) - ONELU);
3671 if (ulii) {
3672 loc = (loc | (BITCT - 1)) - CLZLU(ulii);
3673 return MAXV(loc, floor);
3674 }
3675 }
3676 bitarr_first = &(bitarr[floor / BITCT]);
3677 do {
3678 if (bitarr_ptr == bitarr_first) {
3679 return floor;
3680 }
3681 ulii = ~(*(--bitarr_ptr));
3682 } while (!ulii);
3683 loc = ((uintptr_t)(bitarr_ptr - bitarr)) * BITCT + BITCT - 1 - CLZLU(ulii);
3684 return MAXV(loc, floor);
3685 }
3686 */
3687
3688
bigstack_calloc_uc(uintptr_t ct,unsigned char ** ucp_ptr)3689 int32_t bigstack_calloc_uc(uintptr_t ct, unsigned char** ucp_ptr) {
3690 *ucp_ptr = (unsigned char*)bigstack_alloc(ct);
3691 if (!(*ucp_ptr)) {
3692 return 1;
3693 }
3694 memset(*ucp_ptr, 0, ct);
3695 return 0;
3696 }
3697
bigstack_calloc_d(uintptr_t ct,double ** dp_ptr)3698 int32_t bigstack_calloc_d(uintptr_t ct, double** dp_ptr) {
3699 *dp_ptr = (double*)bigstack_alloc(ct * sizeof(double));
3700 if (!(*dp_ptr)) {
3701 return 1;
3702 }
3703 fill_double_zero(ct, *dp_ptr);
3704 return 0;
3705 }
3706
bigstack_calloc_f(uintptr_t ct,float ** fp_ptr)3707 int32_t bigstack_calloc_f(uintptr_t ct, float** fp_ptr) {
3708 *fp_ptr = (float*)bigstack_alloc(ct * sizeof(float));
3709 if (!(*fp_ptr)) {
3710 return 1;
3711 }
3712 fill_float_zero(ct, *fp_ptr);
3713 return 0;
3714 }
3715
bigstack_calloc_ui(uintptr_t ct,uint32_t ** uip_ptr)3716 int32_t bigstack_calloc_ui(uintptr_t ct, uint32_t** uip_ptr) {
3717 *uip_ptr = (uint32_t*)bigstack_alloc(ct * sizeof(int32_t));
3718 if (!(*uip_ptr)) {
3719 return 1;
3720 }
3721 fill_uint_zero(ct, *uip_ptr);
3722 return 0;
3723 }
3724
bigstack_calloc_ul(uintptr_t ct,uintptr_t ** ulp_ptr)3725 int32_t bigstack_calloc_ul(uintptr_t ct, uintptr_t** ulp_ptr) {
3726 *ulp_ptr = (uintptr_t*)bigstack_alloc(ct * sizeof(intptr_t));
3727 if (!(*ulp_ptr)) {
3728 return 1;
3729 }
3730 fill_ulong_zero(ct, *ulp_ptr);
3731 return 0;
3732 }
3733
bigstack_calloc_ull(uintptr_t ct,uint64_t ** ullp_ptr)3734 int32_t bigstack_calloc_ull(uintptr_t ct, uint64_t** ullp_ptr) {
3735 *ullp_ptr = (uint64_t*)bigstack_alloc(ct * sizeof(int64_t));
3736 if (!(*ullp_ptr)) {
3737 return 1;
3738 }
3739 fill_ull_zero(ct, *ullp_ptr);
3740 return 0;
3741 }
3742
bigstack_end_calloc_uc(uintptr_t ct,unsigned char ** ucp_ptr)3743 int32_t bigstack_end_calloc_uc(uintptr_t ct, unsigned char** ucp_ptr) {
3744 *ucp_ptr = (unsigned char*)bigstack_end_alloc(ct);
3745 if (!(*ucp_ptr)) {
3746 return 1;
3747 }
3748 memset(*ucp_ptr, 0, ct);
3749 return 0;
3750 }
3751
bigstack_end_calloc_d(uintptr_t ct,double ** dp_ptr)3752 int32_t bigstack_end_calloc_d(uintptr_t ct, double** dp_ptr) {
3753 *dp_ptr = (double*)bigstack_end_alloc(ct * sizeof(double));
3754 if (!(*dp_ptr)) {
3755 return 1;
3756 }
3757 fill_double_zero(ct, *dp_ptr);
3758 return 0;
3759 }
3760
bigstack_end_calloc_f(uintptr_t ct,float ** fp_ptr)3761 int32_t bigstack_end_calloc_f(uintptr_t ct, float** fp_ptr) {
3762 *fp_ptr = (float*)bigstack_end_alloc(ct * sizeof(float));
3763 if (!(*fp_ptr)) {
3764 return 1;
3765 }
3766 fill_float_zero(ct, *fp_ptr);
3767 return 0;
3768 }
3769
bigstack_end_calloc_ui(uintptr_t ct,uint32_t ** uip_ptr)3770 int32_t bigstack_end_calloc_ui(uintptr_t ct, uint32_t** uip_ptr) {
3771 *uip_ptr = (uint32_t*)bigstack_end_alloc(ct * sizeof(int32_t));
3772 if (!(*uip_ptr)) {
3773 return 1;
3774 }
3775 fill_uint_zero(ct, *uip_ptr);
3776 return 0;
3777 }
3778
bigstack_end_calloc_ul(uintptr_t ct,uintptr_t ** ulp_ptr)3779 int32_t bigstack_end_calloc_ul(uintptr_t ct, uintptr_t** ulp_ptr) {
3780 *ulp_ptr = (uintptr_t*)bigstack_end_alloc(ct * sizeof(intptr_t));
3781 if (!(*ulp_ptr)) {
3782 return 1;
3783 }
3784 fill_ulong_zero(ct, *ulp_ptr);
3785 return 0;
3786 }
3787
bigstack_end_calloc_ull(uintptr_t ct,uint64_t ** ullp_ptr)3788 int32_t bigstack_end_calloc_ull(uintptr_t ct, uint64_t** ullp_ptr) {
3789 *ullp_ptr = (uint64_t*)bigstack_end_alloc(ct * sizeof(int64_t));
3790 if (!(*ullp_ptr)) {
3791 return 1;
3792 }
3793 fill_ull_zero(ct, *ullp_ptr);
3794 return 0;
3795 }
3796
3797
3798 // MurmurHash3, from
3799 // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
3800
static inline uint32_t rotl32(uint32_t x, int8_t r) {
  // Rotate x left by r bits.  Masking both shift counts with 31 avoids the
  // undefined behavior of a 32-bit shift by 32 when r == 0 (or r == 32);
  // for the rotation amounts actually used by murmurhash3_32 (13 and 15)
  // the result is unchanged.
  return (x << (r & 31)) | (x >> ((32 - r) & 31));
}
3804
static inline uint32_t getblock32(const uint32_t* p, int i) {
  // Fetch the i-th 32-bit block relative to p (i may be negative;
  // murmurhash3_32 indexes backward from the end of the block region).
  return *(p + i);
}
3808
3809 //-----------------------------------------------------------------------------
3810 // Finalization mix - force all bits of a hash block to avalanche
3811
static inline uint32_t fmix32(uint32_t h) {
  // MurmurHash3 finalization mix: alternating xor-shifts and multiplies so
  // every input bit influences every output bit.
  h = (h ^ (h >> 16)) * 0x85ebca6b;
  h = (h ^ (h >> 13)) * 0xc2b2ae35;
  return h ^ (h >> 16);
}
3821
uint32_t murmurhash3_32(const void* key, uint32_t len) {
  // 32-bit MurmurHash3 with a hardcoded seed of 0; used by the ID hash
  // tables below.  Returns a well-mixed hash of the first len bytes of key.
  const uint8_t* data = (const uint8_t*)key;
  const int32_t nblocks = len / 4;

  uint32_t h1 = 0;
  // uint32_t h1 = seed;

  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  //----------
  // body

  // NOTE(review): 4-byte blocks are read through a uint32_t* into byte
  // data; this presumes unaligned-load-tolerant little-endian hardware
  // (x86) -- confirm before porting to stricter architectures.
  const uint32_t* blocks = (const uint32_t*)(data + nblocks*4);

  int32_t i;
  uint32_t k1;
  // process full 4-byte blocks, indexing backward from the end of the
  // block region
  for(i = -nblocks; i; i++) {
    k1 = getblock32(blocks,i);

    k1 *= c1;
    k1 = rotl32(k1,15);
    k1 *= c2;

    h1 ^= k1;
    h1 = rotl32(h1,13);
    h1 = h1*5+0xe6546b64;
  }

  //----------
  // tail

  // fold in the 0-3 leftover bytes
  const uint8_t* tail = (const uint8_t*)(data + nblocks*4);

  k1 = 0;

  switch(len & 3) {
  case 3:
    k1 ^= tail[2] << 16;
    // fall through
  case 2:
    k1 ^= tail[1] << 8;
    // fall through
  case 1:
    k1 ^= tail[0];
    k1 *= c1;
    k1 = rotl32(k1,15);
    k1 *= c2;
    h1 ^= k1;
  }

  //----------
  // finalization

  h1 ^= len;

  return fmix32(h1);
}
3880
is_composite6(uintptr_t num)3881 uint32_t is_composite6(uintptr_t num) {
3882 // assumes num is congruent to 1 or 5 mod 6.
3883 // can speed this up by ~50% by hardcoding avoidance of multiples of 5/7,
3884 // but this isn't currently a bottleneck so I'll keep this simple
3885 uintptr_t divisor = 5;
3886 while (divisor * divisor <= num) {
3887 if (!(num % divisor)) {
3888 return 1;
3889 }
3890 divisor += 2;
3891 if (!(num % divisor)) {
3892 return 1;
3893 }
3894 divisor += 4;
3895 }
3896 return 0;
3897 }
3898
uintptr_t geqprime(uintptr_t floor) {
  // assumes floor is odd and greater than 1. Returns 5 if floor = 3,
  // otherwise returns the first prime >= floor.
  //
  // Candidates are scanned in 5-mod-6, 1-mod-6 order (all primes > 3 have
  // one of these forms); the goto enters the loop at the 1-mod-6 phase when
  // floor starts there.
  uintptr_t ulii = floor % 3;
  if (!ulii) {
    // floor divisible by 3: step to the next 5-mod-6 candidate
    floor += 2;
  } else if (ulii == 1) {
    goto geqprime_1mod6;
  }
  while (is_composite6(floor)) {
    floor += 2;
  geqprime_1mod6:
    if (!is_composite6(floor)) {
      return floor;
    }
    floor += 4;
  }
  return floor;
}
3918
int32_t populate_id_htable(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t item_ct, const char* item_ids, uintptr_t max_id_len, uint32_t store_dups, uint32_t id_htable_size, uint32_t* id_htable) {
  // Builds an open-addressed, quadratically-probed hash table mapping item
  // IDs to their unfiltered indexes.  Returns 0 on success,
  // RET_INVALID_FORMAT on a duplicate ID when store_dups is 0, and
  // RET_NOMEM if duplicate-list tracking exhausts arena space.
  // (unfiltered_ct is currently unreferenced in this function body.)
  //
  // While unique IDs are normally assumed (and enforced) here, --extract and
  // --exclude are an exception, since we want to be able to e.g. exclude all
  // variants named '.'. Since there could be millions of them, ordinary
  // O(n^2) hash table duplicate resolution is unacceptably slow; instead, we
  // allocate additional linked lists past the end of id_htable to track all
  // unfiltered indexes of duplicate names. (This requires the
  // alloc_and_populate_id_htable interface; bigstack_end_alloc doesn't work
  // there.)
  uintptr_t item_uidx = 0;
  uint32_t extra_alloc = 0;
  uint32_t prev_llidx = 0;
  // needs to be synced with extract_exclude_flag_norange()
  uint32_t* extra_alloc_base = (uint32_t*)g_bigstack_base;
  uint32_t item_idx = 0;
  const char* sptr;
  uintptr_t prev_uidx;
  uintptr_t cur_bigstack_left;
  uint32_t max_extra_alloc;
  uint32_t slen;
  uint32_t hashval;
  uint32_t next_incr;
  uint32_t top_diff;
  uint32_t hash_result;
  uint32_t cur_dup;
  // 0xffffffffU marks an empty hash slot
  fill_uint_one(id_htable_size, id_htable);
  if (!store_dups) {
    for (; item_idx < item_ct; item_uidx++, item_idx++) {
      next_unset_ul_unsafe_ck(exclude_arr, &item_uidx);
      sptr = &(item_ids[item_uidx * max_id_len]);
      slen = strlen(sptr);
      hashval = murmurhash3_32(sptr, slen) % id_htable_size;
      next_incr = 1;
      while (1) {
        hash_result = id_htable[hashval];
        if (hash_result == 0xffffffffU) {
          id_htable[hashval] = item_uidx;
          break;
        } else if (!memcmp(sptr, &(item_ids[hash_result * max_id_len]), slen + 1)) {
          // could add an allow_dups parameter which controls whether this is
          // an error
          LOGERRPRINTFWW("Error: Duplicate ID '%s'.\n", sptr);
          return RET_INVALID_FORMAT;
        }
        // defend against overflow
        top_diff = id_htable_size - hashval;
        if (top_diff > next_incr) {
          hashval += next_incr;
        } else {
          hashval = next_incr - top_diff;
        }
        next_incr += 2; // quadratic probing
      }
    }
  } else {
    // duplicate-tracking mode: each duplicated ID's table entry has its
    // high bit set, with the low 31 bits holding half the offset of the
    // head of a (uidx, next) linked list in extra_alloc_base[]
    cur_bigstack_left = bigstack_left();
#ifdef __LP64__
    if (cur_bigstack_left >= 0x400000000LLU) {
      max_extra_alloc = 0xfffffffeU;
    } else {
      max_extra_alloc = cur_bigstack_left / sizeof(int32_t);
    }
#else
    max_extra_alloc = cur_bigstack_left / sizeof(int32_t);
#endif
    for (; item_idx < item_ct; item_uidx++, item_idx++) {
      next_unset_ul_unsafe_ck(exclude_arr, &item_uidx);
      sptr = &(item_ids[item_uidx * max_id_len]);
      slen = strlen(sptr);
      hashval = murmurhash3_32(sptr, slen) % id_htable_size;
      next_incr = 1;
      while (1) {
        hash_result = id_htable[hashval];
        if (hash_result == 0xffffffffU) {
          id_htable[hashval] = item_uidx;
          break;
        } else {
          cur_dup = hash_result >> 31;
          if (cur_dup) {
            // slot already points to a linked list; compare against the
            // first uidx stored there
            prev_llidx = hash_result << 1;
            prev_uidx = extra_alloc_base[prev_llidx];
          } else {
            prev_uidx = hash_result;
          }
          if (!memcmp(sptr, &(item_ids[prev_uidx * max_id_len]), slen + 1)) {
            if (extra_alloc + 4 > max_extra_alloc) {
              return RET_NOMEM;
            }
            // point to linked list entry instead
            if (!cur_dup) {
              extra_alloc_base[extra_alloc] = hash_result;
              extra_alloc_base[extra_alloc + 1] = 0xffffffffU; // list end
              prev_llidx = extra_alloc;
              extra_alloc += 2;
            }
            // prepend the new uidx to the list and re-point the table slot
            extra_alloc_base[extra_alloc] = item_uidx;
            extra_alloc_base[extra_alloc + 1] = prev_llidx;
            id_htable[hashval] = 0x80000000U | (extra_alloc >> 1);
            extra_alloc += 2;
            break; // bugfix
          }
        }
        top_diff = id_htable_size - hashval;
        if (top_diff > next_incr) {
          hashval += next_incr;
        } else {
          hashval = next_incr - top_diff;
        }
        next_incr += 2;
      }
    }
    if (extra_alloc) {
      // commit the linked-list scratch region so later arena allocations
      // don't clobber it
      bigstack_alloc(extra_alloc * sizeof(int32_t));
    }
  }
  return 0;
}
4036
uint32_t id_htable_find(const char* id_buf, uintptr_t cur_id_len, const uint32_t* id_htable, uint32_t id_htable_size, const char* item_ids, uintptr_t max_id_len) {
  // Looks up the (not necessarily null-terminated) cur_id_len-character ID
  // at id_buf.  Assumes no duplicate entries and nonzero id_htable_size.
  // Returns the stored item index, or 0xffffffffU if absent.
  if (cur_id_len >= max_id_len) {
    return 0xffffffffU;
  }
  uint32_t hashval = murmurhash3_32(id_buf, cur_id_len) % id_htable_size;
  uint32_t probe_incr = 1;
  while (1) {
    const uint32_t cur_entry = id_htable[hashval];
    if (cur_entry == 0xffffffffU) {
      // empty slot: not present
      return 0xffffffffU;
    }
    const char* cur_id = &(item_ids[cur_entry * max_id_len]);
    if ((!memcmp(id_buf, cur_id, cur_id_len)) && (!cur_id[cur_id_len])) {
      return cur_entry;
    }
    // quadratic probing with explicit wraparound to avoid overflow
    const uint32_t slots_to_top = id_htable_size - hashval;
    if (slots_to_top > probe_incr) {
      hashval += probe_incr;
    } else {
      hashval = probe_incr - slots_to_top;
    }
    probe_incr += 2;
  }
}
4066
void fill_idx_to_uidx(const uintptr_t* exclude_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx) {
  // Populates idx_to_uidx[0..item_ct-1] with the unfiltered indexes of the
  // item_ct items NOT flagged in exclude_arr, walking runs of clear bits.
  uint32_t* write_ptr = idx_to_uidx;
  uint32_t* write_end = &(idx_to_uidx[item_ct]);
  uint32_t cur_uidx = 0;
  while (write_ptr < write_end) {
    cur_uidx = next_unset_unsafe(exclude_arr, cur_uidx);
    const uint32_t run_end = next_set(exclude_arr, cur_uidx, unfiltered_item_ct);
    do {
      *write_ptr++ = cur_uidx++;
    } while (cur_uidx < run_end);
  }
}
4079
void fill_idx_to_uidx_incl(const uintptr_t* include_arr, uintptr_t unfiltered_item_ct, uintptr_t item_ct, uint32_t* idx_to_uidx) {
  // Inclusion-mask twin of fill_idx_to_uidx(): writes the unfiltered
  // indexes of the item_ct items whose bits ARE set in include_arr.
  uint32_t* write_ptr = idx_to_uidx;
  uint32_t* write_end = &(idx_to_uidx[item_ct]);
  uint32_t cur_uidx = 0;
  while (write_ptr < write_end) {
    cur_uidx = next_set_unsafe(include_arr, cur_uidx);
    const uint32_t run_end = next_unset(include_arr, cur_uidx, unfiltered_item_ct);
    do {
      *write_ptr++ = cur_uidx++;
    } while (cur_uidx < run_end);
  }
}
4092
void fill_uidx_to_idx(const uintptr_t* exclude_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
  // Inverse of fill_idx_to_uidx(): for every non-excluded unfiltered index,
  // stores its filtered index.  Entries for excluded indexes are left
  // untouched.
  uint32_t cur_uidx = 0;
  uint32_t cur_idx = 0;
  while (cur_idx < item_ct) {
    cur_uidx = next_unset_unsafe(exclude_arr, cur_uidx);
    uint32_t* fill_ptr = &(uidx_to_idx[cur_uidx]);
    cur_uidx = next_set(exclude_arr, cur_uidx, unfiltered_item_ct);
    uint32_t* fill_stop = &(uidx_to_idx[cur_uidx]);
    do {
      *fill_ptr++ = cur_idx++;
    } while (fill_ptr < fill_stop);
  }
}
4108
void fill_uidx_to_idx_incl(const uintptr_t* include_arr, uint32_t unfiltered_item_ct, uint32_t item_ct, uint32_t* uidx_to_idx) {
  // Inclusion-mask twin of fill_uidx_to_idx(): maps each included
  // unfiltered index to its filtered index; other entries are left
  // untouched.
  uint32_t cur_uidx = 0;
  uint32_t cur_idx = 0;
  while (cur_idx < item_ct) {
    cur_uidx = next_set_unsafe(include_arr, cur_uidx);
    uint32_t* fill_ptr = &(uidx_to_idx[cur_uidx]);
    cur_uidx = next_unset(include_arr, cur_uidx, unfiltered_item_ct);
    uint32_t* fill_stop = &(uidx_to_idx[cur_uidx]);
    do {
      *fill_ptr++ = cur_idx++;
    } while (fill_ptr < fill_stop);
  }
}
4124
void fill_midx_to_idx(const uintptr_t* exclude_arr_orig, const uintptr_t* exclude_arr, uint32_t item_ct, uint32_t* midx_to_idx) {
  // Builds a "marker index" (position among items surviving exclude_arr_orig)
  // to "filtered index" (position among items surviving exclude_arr as well)
  // lookup table.  Slots whose midx is excluded by exclude_arr are left
  // unwritten.
  // Assumes item_ct is nonzero.

  // May want to switch to alternate behavior: when current midx is excluded,
  // fill midx_to_idx[] with the next item_idx.
  uint32_t item_uidx = next_unset_unsafe(exclude_arr_orig, 0);
  uint32_t item_idx = 0;
  uint32_t item_midx;
  for (item_midx = 0; item_idx < item_ct; item_uidx++, item_midx++) {
    // advance item_uidx to the next item surviving the original filter
    next_unset_unsafe_ck(exclude_arr_orig, &item_uidx);
    if (!IS_SET(exclude_arr, item_uidx)) {
      // item also survives the second filter: record its filtered index
      midx_to_idx[item_midx] = item_idx++;
    }
  }
}
4140
void fill_quatervec_55(uint32_t ct, uintptr_t* quatervec) {
  // Sets the low ct 2-bit entries of quatervec to 01 and zeroes out the rest
  // of the trailing 16-byte vector.  (FIVEMASK = 0x5555...55, i.e. every
  // 2-bit slot = 01.)
  // rem = number of entries beyond the last fully-filled 2-word group
  uint32_t rem = ct & (BITCT - 1);
#ifdef __LP64__
  // 64-bit: fill whole 128-bit vectors, then patch the final two words.
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i* vecp = (__m128i*)quatervec;
  __m128i* vec_end = (__m128i*)(&(quatervec[2 * (ct / BITCT)]));
  uintptr_t* second_to_last;
  while (vecp < vec_end) {
    *vecp++ = m1;
  }
  if (rem) {
    second_to_last = (uintptr_t*)vecp;
    if (rem > BITCT2) {
      // remainder spills into the second word of the trailing vector
      second_to_last[0] = FIVEMASK;
      second_to_last[1] = FIVEMASK >> ((BITCT - rem) * 2);
    } else {
      // remainder fits in the first word; zero the second
      second_to_last[0] = FIVEMASK >> ((BITCT2 - rem) * 2);
      second_to_last[1] = 0;
    }
  }
#else
  // 32-bit: same logic with plain word stores
  uintptr_t* vec_end = &(quatervec[2 * (ct / BITCT)]);
  while (quatervec < vec_end) {
    *quatervec++ = FIVEMASK;
  }
  if (rem) {
    if (rem > BITCT2) {
      quatervec[0] = FIVEMASK;
      quatervec[1] = FIVEMASK >> ((BITCT - rem) * 2);
    } else {
      quatervec[0] = FIVEMASK >> ((BITCT2 - rem) * 2);
      quatervec[1] = 0;
    }
  }
#endif
}
4177
void quaterarr_collapse_init(const uintptr_t* __restrict unfiltered_bitarr, uint32_t unfiltered_ct, const uintptr_t* __restrict filter_bitarr, uint32_t filtered_ct, uintptr_t* __restrict output_quaterarr) {
  // Used to unpack e.g. unfiltered sex_male to a filtered quaterarr usable as
  // a raw input bitmask.
  // For each set bit of filter_bitarr (in increasing order), copies the
  // corresponding bit of unfiltered_bitarr into the next 2-bit slot of
  // output_quaterarr; the high bit of each slot stays 0.
  // Assumes output_quaterarr is sized to a multiple of 16 bytes.
  uintptr_t cur_write = 0;  // 2-bit-packed word currently being assembled
  uint32_t item_uidx = 0;   // position in the unfiltered bit space
  uint32_t write_bit = 0;   // next 2-bit slot within cur_write
  uint32_t item_idx = 0;    // filtered items processed so far
  uint32_t item_uidx_stop;
  while (item_idx < filtered_ct) {
    // process one contiguous run of set filter bits at a time
    item_uidx = next_set_unsafe(filter_bitarr, item_uidx);
    item_uidx_stop = next_unset(filter_bitarr, item_uidx, unfiltered_ct);
    item_idx += item_uidx_stop - item_uidx;
    do {
      cur_write |= ((unfiltered_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << (write_bit * 2);
      if (++write_bit == BITCT2) {
        // word full: flush and start a new one
        *output_quaterarr++ = cur_write;
        cur_write = 0;
        write_bit = 0;
      }
    } while (++item_uidx < item_uidx_stop);
  }
  if (write_bit) {
    // flush the final partial word
    *output_quaterarr++ = cur_write;
  }
  if ((filtered_ct + (BITCT2 - 1)) & BITCT2) {
    // ceil(filtered_ct / BITCT2) is odd, i.e. an odd number of words was
    // written; zero one more word to complete the 16-byte vector
    *output_quaterarr = 0;
  }
}
4207
void quaterarr_collapse_init_exclude(const uintptr_t* __restrict unfiltered_bitarr, uint32_t unfiltered_ct, const uintptr_t* __restrict filter_exclude_bitarr, uint32_t filtered_ct, uintptr_t* __restrict output_quaterarr) {
  // Same as quaterarr_collapse_init(), except the filter is an EXCLUSION
  // mask: items with unset bits in filter_exclude_bitarr are kept.
  // Assumes output_quaterarr is sized to a multiple of 16 bytes.
  uintptr_t cur_write = 0;  // 2-bit-packed word currently being assembled
  uint32_t item_uidx = 0;   // position in the unfiltered bit space
  uint32_t write_bit = 0;   // next 2-bit slot within cur_write
  uint32_t item_idx = 0;    // filtered items processed so far
  uint32_t item_uidx_stop;
  while (item_idx < filtered_ct) {
    // process one contiguous run of non-excluded items at a time
    item_uidx = next_unset_unsafe(filter_exclude_bitarr, item_uidx);
    item_uidx_stop = next_set(filter_exclude_bitarr, item_uidx, unfiltered_ct);
    item_idx += item_uidx_stop - item_uidx;
    do {
      cur_write |= ((unfiltered_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << (write_bit * 2);
      if (++write_bit == BITCT2) {
        // word full: flush and start a new one
        *output_quaterarr++ = cur_write;
        cur_write = 0;
        write_bit = 0;
      }
    } while (++item_uidx < item_uidx_stop);
  }
  if (write_bit) {
    // flush the final partial word
    *output_quaterarr++ = cur_write;
  }
  if ((filtered_ct + (BITCT2 - 1)) & BITCT2) {
    // odd number of words written; zero one more to complete the 16-byte
    // vector
    *output_quaterarr = 0;
  }
}
4234
uint32_t alloc_collapsed_haploid_filters(const uintptr_t* __restrict sample_bitarr, const uintptr_t* __restrict sex_male, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t** sample_include_quatervec_ptr, uintptr_t** sample_male_include_quatervec_ptr) {
  // Allocates (on the bigstack) and initializes the collapsed all-sample and
  // male-sample quatervec filters needed for heterozygous-haploid handling,
  // depending on which hh_exists flags are set.
  // is_include: nonzero if sample_bitarr is an inclusion mask, zero if it's
  // an exclusion mask.
  // Returns 1 on out-of-memory, 0 otherwise.
  uintptr_t sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(sample_ct);
  if (hh_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
    // if already allocated, we assume this is fully initialized
    if (!(*sample_include_quatervec_ptr)) {
      if (bigstack_alloc_ul(sample_ctv2, sample_include_quatervec_ptr)) {
        return 1;
      }
      // all-samples filter: every 2-bit slot = 01
      fill_quatervec_55(sample_ct, *sample_include_quatervec_ptr);
    }
  }
  if (hh_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
    // if already allocated, we assume it's been bigstack_end_alloc'd but not
    // initialized
    if (!(*sample_male_include_quatervec_ptr)) {
      if (bigstack_alloc_ul(sample_ctv2, sample_male_include_quatervec_ptr)) {
        return 1;
      }
    }
    // collapse sex_male down to the filtered sample space
    if (is_include) {
      quaterarr_collapse_init(sex_male, unfiltered_sample_ct, sample_bitarr, sample_ct, *sample_male_include_quatervec_ptr);
    } else {
      quaterarr_collapse_init_exclude(sex_male, unfiltered_sample_ct, sample_bitarr, sample_ct, *sample_male_include_quatervec_ptr);
    }
  }
  return 0;
}
4262
void sample_delim_convert(uintptr_t unfiltered_sample_ct, const uintptr_t* sample_exclude, uint32_t sample_ct, uintptr_t max_sample_id_len, char oldc, char newc, char* sample_ids) {
  // Replaces the delimiter character oldc with newc in each non-excluded
  // sample ID.
  // The caller is expected to guarantee exactly one delimiter per name, but
  // memchr() can return NULL if that invariant is violated; previously this
  // dereferenced the null pointer (undefined behavior), now the ID is simply
  // skipped.
  uintptr_t sample_uidx = 0;
  uint32_t sample_idx;
  char* nptr;
  for (sample_idx = 0; sample_idx < sample_ct; sample_uidx++, sample_idx++) {
    next_unset_ul_unsafe_ck(sample_exclude, &sample_uidx);
    nptr = (char*)memchr(&(sample_ids[sample_uidx * max_sample_id_len]), (unsigned char)oldc, max_sample_id_len);
    if (nptr) {
      *nptr = newc;
    }
  }
}
4274
void get_set_wrange_align(const uintptr_t* __restrict bitarr, uintptr_t word_ct, uintptr_t* __restrict firstw_ptr, uintptr_t* __restrict wlen_ptr) {
  // Finds the smallest word range covering all set bits of bitarr:
  // *firstw_ptr = index of the first word in the range (even-aligned on
  // 64-bit, since scanning proceeds two words at a time), *wlen_ptr = number
  // of words.  If no bit is set, both outputs are zeroed.
  const uintptr_t* bitarr_ptr = bitarr;
  const uintptr_t* bitarr_end = &(bitarr[word_ct]);
#ifdef __LP64__
  // scan two words (one 16-byte vector) at a time
  const uintptr_t* bitarr_end2 = &(bitarr[word_ct & (~ONELU)]);
  while (bitarr_ptr < bitarr_end2) {
    if (bitarr_ptr[0] || bitarr_ptr[1]) {
      *firstw_ptr = (uintptr_t)(bitarr_ptr - bitarr);
      // a nonzero word is known to exist at or after bitarr_ptr, so this
      // backward scan terminates before underflowing
      while (!(*(--bitarr_end)));
      *wlen_ptr = 1 + (uintptr_t)(bitarr_end - bitarr_ptr);
      return;
    }
    bitarr_ptr = &(bitarr_ptr[2]);
  }
  // odd word_ct: check the leftover trailing word separately
  if ((bitarr_end2 != bitarr_end) && (*bitarr_end2)) {
    *firstw_ptr = word_ct - 1;
    *wlen_ptr = 1;
    return;
  }
#else
  while (bitarr_ptr < bitarr_end) {
    if (*bitarr_ptr) {
      *firstw_ptr = (uintptr_t)(bitarr_ptr - bitarr);
      // backward scan for the last nonzero word; cannot underflow since a
      // nonzero word exists at or after bitarr_ptr
      while (!(*(--bitarr_end)));
      *wlen_ptr = 1 + (uintptr_t)(bitarr_end - bitarr_ptr);
      return;
    }
    bitarr_ptr++;
  }
#endif
  // no set bits anywhere
  *firstw_ptr = 0;
  *wlen_ptr = 0;
}
4308
4309 // hashval computation left to caller since this is frequently used with
4310 // chromosome IDs, where the compiler can optimize the integer modulus
4311 // operation since the hash table size is preset
uint32_t unklen_id_htable_find(const char* cur_id, const char* const* item_ids, const uint32_t* id_htable, uint32_t hashval, uint32_t id_htable_size) {
  // Quadratic-probing hash table lookup for a null-terminated ID; the caller
  // supplies the initial hash value.  Returns the stored item index, or
  // 0xffffffffU if the ID is absent.
  for (uint32_t probe_incr = 1; ; probe_incr += 2) {
    const uint32_t cur_entry = id_htable[hashval];
    if (cur_entry == 0xffffffffU) {
      // empty slot: the ID cannot be in the table
      return 0xffffffffU;
    }
    if (!strcmp(cur_id, item_ids[cur_entry])) {
      return cur_entry;
    }
    // advance by probe_incr (1, 3, 5, ...), wrapping modulo id_htable_size
    const uint32_t distance_to_end = id_htable_size - hashval;
    hashval = (probe_incr < distance_to_end)? (hashval + probe_incr) : (probe_incr - distance_to_end);
  }
}
4333
nonstd_chrom_name_htable_find(const char * chrom_name,const char * const * nonstd_names,const uint32_t * nonstd_id_htable,uint32_t name_slen)4334 static inline uint32_t nonstd_chrom_name_htable_find(const char* chrom_name, const char* const* nonstd_names, const uint32_t* nonstd_id_htable, uint32_t name_slen) {
4335 const uint32_t hashval = murmurhash3_32(chrom_name, name_slen) % CHROM_NAME_HTABLE_SIZE;
4336 return unklen_id_htable_find(chrom_name, nonstd_names, nonstd_id_htable, hashval, CHROM_NAME_HTABLE_SIZE);
4337 }
4338
4339
// Global since species_str() may be called by functions which don't actually
// care about chrom_info. (chrom_info is really a global variable too, but I
// find it easier to maintain this code when chrom_info dependencies are made
// explicit in the function signatures; in contrast, g_species_singular and
// g_species_plural are just for pretty printing and lend no insight into what
// the functions which reference them are doing.)
// Both are set by init_species(); until then they remain nullptr.
const char* g_species_singular = nullptr;
const char* g_species_plural = nullptr;
4348
int32_t init_chrom_info(Chrom_info* chrom_info_ptr) {
  // "constructor". initializes with maximum capacity. doesn't use bigstack.
  // All sub-buffers are carved out of a single aligned_malloc() arena
  // anchored at chrom_mask, so cleanup only needs to free chrom_mask:
  // chrom_mask, haploid_mask: bits
  // chrom_file_order, chrom_idx_to_foidx: int32s
  // chrom_fo_vidx_start: int32s, with an extra trailing element
  // nonstd_names: intptr_ts
  // nonstd_id_htable: CHROM_NAME_HTABLE_SIZE int32s
  // Returns RET_NOMEM on allocation failure, 0 on success.

  assert(!(MAX_POSSIBLE_CHROM % VEC_BYTES));
  const uintptr_t vecs_required = 2 * BITCT_TO_VECCT(MAX_POSSIBLE_CHROM) + 3 * (MAX_POSSIBLE_CHROM / VEC_INT32) + 1 + (MAX_POSSIBLE_CHROM / VEC_WORDS) + (CHROM_NAME_HTABLE_SIZE + (VEC_INT32 - 1)) / VEC_INT32;

  // needed for proper cleanup
  chrom_info_ptr->name_ct = 0;
  chrom_info_ptr->incl_excl_name_stack = nullptr;
  if (aligned_malloc(vecs_required * VEC_BYTES, &(chrom_info_ptr->chrom_mask))) {
    return RET_NOMEM;
  }
  // carve the arena into the sub-buffers listed above, in order; each step
  // advances alloc_iter past the previous buffer's vector-aligned size
  uintptr_t* alloc_iter = &(chrom_info_ptr->chrom_mask[BITCT_TO_VECCT(MAX_POSSIBLE_CHROM) * VEC_WORDS]);
  chrom_info_ptr->haploid_mask = alloc_iter;
  alloc_iter = &(alloc_iter[BITCT_TO_VECCT(MAX_POSSIBLE_CHROM) * VEC_WORDS]);
  chrom_info_ptr->chrom_file_order = (uint32_t*)alloc_iter;
  alloc_iter = &(alloc_iter[(MAX_POSSIBLE_CHROM / VEC_INT32) * VEC_WORDS]);
  chrom_info_ptr->chrom_fo_vidx_start = (uint32_t*)alloc_iter;
  alloc_iter = &(alloc_iter[((MAX_POSSIBLE_CHROM / VEC_INT32) + 1) * VEC_WORDS]);
  chrom_info_ptr->chrom_idx_to_foidx = (uint32_t*)alloc_iter;
  alloc_iter = &(alloc_iter[(MAX_POSSIBLE_CHROM / VEC_INT32) * VEC_WORDS]);
  chrom_info_ptr->nonstd_names = (char**)alloc_iter;
  alloc_iter = &(alloc_iter[MAX_POSSIBLE_CHROM]);
  chrom_info_ptr->nonstd_id_htable = (uint32_t*)alloc_iter;
  // alloc_iter = &(alloc_iter[((CHROM_NAME_HTABLE_SIZE + (VEC_INT32 - 1)) / VEC_INT32) * VEC_WORDS]);
  // postpone nonstd_id_htable initialization until first nonstandard ID is
  // loaded
  // fill_uint_one(CHROM_NAME_HTABLE_SIZE, chrom_info_ptr->nonstd_id_htable);
  return 0;
}
4384
// if these are defined within init_species(), they may not persist after
// function exit
// indexed by species code (human, cow, dog, horse, mouse, plant, sheep,
// unknown)
static const char species_singular_constants[][7] = {"person", "cow", "dog", "horse", "mouse", "plant", "sheep", "sample"};
static const char species_plural_constants[][8] = {"people", "cattle", "dogs", "horses", "mice", "plants", "sheep", "samples"};
4389
void init_species(uint32_t species_code, Chrom_info* chrom_info_ptr) {
  // Resets chrom_info for the given species: autosome count, X/Y/XY/MT
  // codes, and haploid-chromosome mask.  Chromosome counts per species:
  // human: 22, X, Y, XY, MT
  // cow: 29, X, Y, MT
  // dog: 38, X, Y, XY, MT
  // horse: 31, X, Y
  // mouse: 19, X, Y
  // rice: 12
  // sheep: 26, X, Y
  // one row of {X, Y, XY, MT} codes per species; -2 = "not defined"
  const int32_t species_xymt_codes[] = {
    23, 24, 25, 26,
    30, 31, -2, 33,
    39, 40, 41, 42,
    32, 33, -2, -2,
    20, 21, -2, -2,
    -2, -2, -2, -2,
    27, 28, -2, -2};
  const uint32_t species_autosome_ct[] = {22, 29, 38, 31, 19, 12, 26};
  const uint32_t species_max_code[] = {26, 33, 42, 33, 21, 12, 28};
  fill_ulong_zero(CHROM_MASK_WORDS, chrom_info_ptr->chrom_mask);
  chrom_info_ptr->output_encoding = 0;
  chrom_info_ptr->zero_extra_chroms = 0;
  chrom_info_ptr->species = species_code;
  chrom_info_ptr->is_include_stack = 0;
  g_species_singular = species_singular_constants[species_code];
  g_species_plural = species_plural_constants[species_code];
  if (species_code != SPECIES_UNKNOWN) {
    // these are assumed to be already initialized in the SPECIES_UNKNOWN case

    // bugfix: haploid_mask was being cleared in --chr-set case
    fill_ulong_zero(CHROM_MASK_WORDS, chrom_info_ptr->haploid_mask);
    memcpy(chrom_info_ptr->xymt_codes, &(species_xymt_codes[species_code * XYMT_OFFSET_CT]), XYMT_OFFSET_CT * sizeof(int32_t));
    chrom_info_ptr->autosome_ct = species_autosome_ct[species_code];
    chrom_info_ptr->max_code = species_max_code[species_code];
    // haploid-mask literals below set the X and Y bits from the
    // species_xymt_codes row above (rice: all chromosomes haploid)
    switch (species_code) {
    case SPECIES_HUMAN:
      // bits 23 (X) and 24 (Y)
      chrom_info_ptr->haploid_mask[0] = 0x1800000;
      break;
    case SPECIES_COW:
      // bits 30 (X) and 31 (Y)
      chrom_info_ptr->haploid_mask[0] = 0xc0000000LU;
      break;
    case SPECIES_DOG:
      // bits 39 (X) and 40 (Y)
#ifdef __LP64__
      chrom_info_ptr->haploid_mask[0] = 0x18000000000LLU;
#else
      chrom_info_ptr->haploid_mask[1] = 0x180;
#endif
      break;
    case SPECIES_HORSE:
      // bits 32 (X) and 33 (Y)
#ifdef __LP64__
      chrom_info_ptr->haploid_mask[0] = 0x300000000LLU;
#else
      chrom_info_ptr->haploid_mask[1] = 3;
#endif
      break;
    case SPECIES_MOUSE:
      // bits 20 (X) and 21 (Y)
      chrom_info_ptr->haploid_mask[0] = 0x300000;
      break;
    case SPECIES_RICE:
      // bits 0..12: every chromosome haploid
      chrom_info_ptr->haploid_mask[0] = 0x1fff;
      break;
    case SPECIES_SHEEP:
      // bits 27 (X) and 28 (Y)
      chrom_info_ptr->haploid_mask[0] = 0x18000000;
      break;
    }
  }
  // mark all chromosome codes as "not loaded yet"
  fill_uint_one(chrom_info_ptr->max_code + 1, chrom_info_ptr->chrom_idx_to_foidx);
}
4457
void init_default_chrom_mask(Chrom_info* chrom_info_ptr) {
  // Selects every standard chromosome by default.
  if (chrom_info_ptr->species != SPECIES_UNKNOWN) {
    fill_all_bits(chrom_info_ptr->max_code + 1, chrom_info_ptr->chrom_mask);
    return;
  }
  // --chr-set support: all autosomes, plus whichever of X/Y/XY/MT are
  // defined (code == -2 marks an absent chromosome type)
  fill_all_bits(chrom_info_ptr->autosome_ct + 1, chrom_info_ptr->chrom_mask);
  for (uint32_t xymt_idx = 0; xymt_idx < XYMT_OFFSET_CT; ++xymt_idx) {
    const int32_t cur_code = chrom_info_ptr->xymt_codes[xymt_idx];
    if (cur_code != -2) {
      set_bit(cur_code, chrom_info_ptr->chrom_mask);
    }
  }
}
4472
void forget_extra_chrom_names(uint32_t reinitialize, Chrom_info* chrom_info_ptr) {
  // Frees all nonstandard chromosome-name strings; when reinitialize is set,
  // also resets the hash table and name count so new names can be
  // registered from scratch.
  const uint32_t name_ct = chrom_info_ptr->name_ct;
  if (!name_ct) {
    // guard against init_species() not being called yet
    return;
  }
  char** nonstd_names = chrom_info_ptr->nonstd_names;
  const uint32_t first_nonstd_idx = chrom_info_ptr->max_code + 1;
  for (uint32_t uii = 0; uii < name_ct; ++uii) {
    free(nonstd_names[first_nonstd_idx + uii]);
    nonstd_names[first_nonstd_idx + uii] = nullptr;
  }
  if (reinitialize) {
    fill_uint_one(CHROM_NAME_HTABLE_SIZE, chrom_info_ptr->nonstd_id_htable);
    chrom_info_ptr->name_ct = 0;
  }
}
4489
int32_t finalize_chrom_info(Chrom_info* chrom_info_ptr) {
  // Shrinks the maximum-capacity arena allocated by init_chrom_info() down to
  // what the final chromosome/name counts actually require: copies each
  // sub-buffer into a new aligned allocation (same layout order) and frees
  // the old arena.
  // Returns RET_NOMEM on allocation failure (old buffers stay valid), 0 on
  // success.
  const uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
  const uint32_t name_ct = chrom_info_ptr->name_ct;
  const uint32_t chrom_code_end = chrom_info_ptr->max_code + 1 + name_ct;
  // per-sub-buffer sizes, in 16-byte vectors
  const uint32_t chrom_code_bitvec_ct = BITCT_TO_VECCT(chrom_code_end);
  const uint32_t chrom_ct_int32vec_ct = (chrom_ct + (VEC_INT32 - 1)) / VEC_INT32;
  const uint32_t chrom_ct_p1_int32vec_ct = 1 + (chrom_ct / VEC_INT32);
  const uint32_t chrom_code_end_int32vec_ct = (chrom_code_end + (VEC_INT32 - 1)) / VEC_INT32;
  const uint32_t chrom_code_end_wordvec_ct = (chrom_code_end + (VEC_WORDS - 1)) / VEC_WORDS;
  uint32_t final_vecs_required = 2 * chrom_code_bitvec_ct + chrom_ct_int32vec_ct + chrom_ct_p1_int32vec_ct + chrom_code_end_int32vec_ct;
  if (name_ct) {
    // nonstd_names array and hash table are only retained when nonstandard
    // chromosome names exist
    final_vecs_required += chrom_code_end_wordvec_ct + (CHROM_NAME_HTABLE_SIZE + (VEC_INT32 - 1)) / VEC_INT32;
  }
  uintptr_t* new_alloc;
  if (aligned_malloc(final_vecs_required * VEC_BYTES, &new_alloc)) {
    return RET_NOMEM;
  }
  uintptr_t* old_alloc = chrom_info_ptr->chrom_mask;
  uintptr_t* new_alloc_iter = new_alloc;

  memcpy(new_alloc_iter, chrom_info_ptr->chrom_mask, chrom_code_bitvec_ct * VEC_BYTES);
  chrom_info_ptr->chrom_mask = new_alloc_iter;
  new_alloc_iter = &(new_alloc_iter[chrom_code_bitvec_ct * VEC_WORDS]);

  memcpy(new_alloc_iter, chrom_info_ptr->haploid_mask, chrom_code_bitvec_ct * VEC_BYTES);
  chrom_info_ptr->haploid_mask = new_alloc_iter;
  new_alloc_iter = &(new_alloc_iter[chrom_code_bitvec_ct * VEC_WORDS]);

  memcpy(new_alloc_iter, chrom_info_ptr->chrom_file_order, chrom_ct_int32vec_ct * VEC_BYTES);
  chrom_info_ptr->chrom_file_order = (uint32_t*)new_alloc_iter;
  new_alloc_iter = &(new_alloc_iter[chrom_ct_int32vec_ct * VEC_WORDS]);

  memcpy(new_alloc_iter, chrom_info_ptr->chrom_fo_vidx_start, chrom_ct_p1_int32vec_ct * VEC_BYTES);
  chrom_info_ptr->chrom_fo_vidx_start = (uint32_t*)new_alloc_iter;
  new_alloc_iter = &(new_alloc_iter[chrom_ct_p1_int32vec_ct * VEC_WORDS]);

  memcpy(new_alloc_iter, chrom_info_ptr->chrom_idx_to_foidx, chrom_code_end_int32vec_ct * VEC_BYTES);
  chrom_info_ptr->chrom_idx_to_foidx = (uint32_t*)new_alloc_iter;

  if (!name_ct) {
    chrom_info_ptr->nonstd_names = nullptr;
    chrom_info_ptr->nonstd_id_htable = nullptr;
  } else {
    new_alloc_iter = &(new_alloc_iter[chrom_code_end_int32vec_ct * VEC_WORDS]);

    memcpy(new_alloc_iter, chrom_info_ptr->nonstd_names, chrom_code_end_wordvec_ct * VEC_BYTES);
    chrom_info_ptr->nonstd_names = (char**)new_alloc_iter;
    new_alloc_iter = &(new_alloc_iter[chrom_code_end_wordvec_ct * VEC_WORDS]);

    memcpy(new_alloc_iter, chrom_info_ptr->nonstd_id_htable, CHROM_NAME_HTABLE_SIZE * sizeof(int32_t));
    chrom_info_ptr->nonstd_id_htable = (uint32_t*)new_alloc_iter;
  }
  aligned_free(old_alloc);
  return 0;
}
4545
void cleanup_chrom_info(Chrom_info* chrom_info_ptr) {
  // Releases all memory owned by a Chrom_info, nulling the pointers so a
  // second call is a no-op.
  if (chrom_info_ptr->chrom_mask) {
    // bugfix: the nonstandard name strings must be freed before the
    // aligned_free() call, since the nonstd_names pointer array lives inside
    // the chrom_mask arena
    forget_extra_chrom_names(0, chrom_info_ptr);

    aligned_free(chrom_info_ptr->chrom_mask);
    chrom_info_ptr->chrom_mask = nullptr;
  }
  // unwind the chromosome include/exclude name stack
  for (Ll_str* ll_str_ptr = chrom_info_ptr->incl_excl_name_stack; ll_str_ptr; ) {
    Ll_str* next_ptr = ll_str_ptr->next;
    free(ll_str_ptr);
    ll_str_ptr = next_ptr;
  }
  chrom_info_ptr->incl_excl_name_stack = nullptr;
}
4562
char* chrom_name_std(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, char* buf) {
  // Writes the display form of a standard chromosome code to buf, honoring
  // the --output-chr encoding flags ("chr" prefix, M vs. MT, two-character
  // "0M" style).  Returns a pointer just past the written text; no null
  // terminator is appended.
  const uint32_t output_encoding = chrom_info_ptr->output_encoding;
  if (output_encoding & (CHR_OUTPUT_PREFIX | CHR_OUTPUT_0M)) {
    if (output_encoding == CHR_OUTPUT_0M) {
      // force two chars
      if (chrom_idx <= chrom_info_ptr->autosome_ct) {
        // zero-padded two-digit autosome number
        buf = (char*)memcpya(buf, &(digit2_table[chrom_idx * 2]), 2);
      } else if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[XY_OFFSET]) {
        buf = (char*)memcpya(buf, "XY", 2);
      } else {
        // X/Y/MT get a leading '0' to keep the width at two
        *buf++ = '0';
        if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[X_OFFSET]) {
          *buf++ = 'X';
        } else {
          // assumes only X/Y/XY/MT defined
          *buf++ = ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[Y_OFFSET])? 'Y' : 'M';
        }
      }
      return buf;
    }
    buf = memcpyl3a(buf, "chr");
  }
  if ((!(output_encoding & (CHR_OUTPUT_M | CHR_OUTPUT_MT))) || (chrom_idx <= chrom_info_ptr->autosome_ct)) {
    // numeric rendering (autosomes always; X/Y/XY/MT too unless a letter
    // encoding was requested)
    return uint32toa(chrom_idx, buf);
  } else if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[X_OFFSET]) {
    *buf++ = 'X';
  } else if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[Y_OFFSET]) {
    *buf++ = 'Y';
  } else if ((int32_t)chrom_idx == chrom_info_ptr->xymt_codes[XY_OFFSET]) {
    buf = (char*)memcpya(buf, "XY", 2);
  } else {
    // mitochondrial: "M" or "MT" depending on the flag
    *buf++ = 'M';
    if (output_encoding & CHR_OUTPUT_MT) {
      *buf++ = 'T';
    }
  }
  return buf;
}
4601
chrom_name_write(const Chrom_info * chrom_info_ptr,uint32_t chrom_idx,char * buf)4602 char* chrom_name_write(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, char* buf) {
4603 // assumes chrom_idx is valid
4604 if (!chrom_idx) {
4605 *buf++ = '0';
4606 return buf;
4607 } else if (chrom_idx <= chrom_info_ptr->max_code) {
4608 return chrom_name_std(chrom_info_ptr, chrom_idx, buf);
4609 } else if (chrom_info_ptr->zero_extra_chroms) {
4610 *buf++ = '0';
4611 return buf;
4612 } else {
4613 return strcpya(buf, chrom_info_ptr->nonstd_names[chrom_idx]);
4614 }
4615 }
4616
char* chrom_name_buf5w4write(const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t* chrom_name_len_ptr, char* buf5) {
  // Variant of chrom_name_write() which right-justifies the name to a
  // minimum width of 4 characters (for aligned tabular output).  Returns a
  // pointer to the rendered text -- usually buf5, but a long nonstandard
  // name is returned in place -- and stores its length in
  // *chrom_name_len_ptr.
  uint32_t slen;
  *chrom_name_len_ptr = 4;
  if (!chrom_idx) {
    memcpy(buf5, "   0", 4);
  } else if (chrom_idx <= chrom_info_ptr->max_code) {
    if (chrom_info_ptr->output_encoding & CHR_OUTPUT_PREFIX) {
      // "chr"-prefixed names are already >= 4 chars; report actual length
      *chrom_name_len_ptr = (uintptr_t)(chrom_name_std(chrom_info_ptr, chrom_idx, buf5) - buf5);
    } else {
      // pad the rendered name out to exactly 4 characters
      width_force(4, buf5, chrom_name_std(chrom_info_ptr, chrom_idx, buf5));
    }
  } else if (chrom_info_ptr->zero_extra_chroms) {
    memcpy(buf5, "   0", 4);
  } else {
    slen = strlen(chrom_info_ptr->nonstd_names[chrom_idx]);
    if (slen < 4) {
      fw_strcpyn(4, slen, chrom_info_ptr->nonstd_names[chrom_idx], buf5);
    } else {
      // name doesn't fit in the width-4 buffer; hand back the stored string
      *chrom_name_len_ptr = slen;
      return chrom_info_ptr->nonstd_names[chrom_idx];
    }
  }
  return buf5;
}
4641
get_max_chrom_slen(const Chrom_info * chrom_info_ptr)4642 uint32_t get_max_chrom_slen(const Chrom_info* chrom_info_ptr) {
4643 // does not include trailing null
4644 // can be overestimate
4645 // if more functions start calling this, it should just be built into
4646 // load_bim() instead
4647 if (chrom_info_ptr->zero_extra_chroms) {
4648 return 3 + MAX_CHROM_TEXTNUM_SLEN;
4649 }
4650 const uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
4651 const uint32_t max_code = chrom_info_ptr->max_code;
4652 uint32_t max_chrom_slen = 3 + MAX_CHROM_TEXTNUM_SLEN;
4653 for (uint32_t chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
4654 const uint32_t chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
4655 if (!is_set(chrom_info_ptr->chrom_mask, chrom_idx)) {
4656 continue;
4657 }
4658 if (chrom_idx > max_code) {
4659 const uint32_t name_slen = strlen(chrom_info_ptr->nonstd_names[chrom_idx]);
4660 if (name_slen > max_chrom_slen) {
4661 max_chrom_slen = name_slen;
4662 }
4663 }
4664 }
4665 return max_chrom_slen;
4666 }
4667
haploid_chrom_present(const Chrom_info * chrom_info_ptr)4668 uint32_t haploid_chrom_present(const Chrom_info* chrom_info_ptr) {
4669 const uintptr_t* chrom_mask = chrom_info_ptr->chrom_mask;
4670 const uintptr_t* haploid_mask = chrom_info_ptr->haploid_mask;
4671 for (uint32_t widx = 0; widx < CHROM_MASK_INITIAL_WORDS; widx++) {
4672 if (chrom_mask[widx] & haploid_mask[widx]) {
4673 return 1;
4674 }
4675 }
4676 return 0;
4677 }
4678
single_letter_chrom(uint32_t letter)4679 static inline int32_t single_letter_chrom(uint32_t letter) {
4680 letter &= 0xdf;
4681 if (letter == 'X') {
4682 return CHROM_X;
4683 } else if (letter == 'Y') {
4684 return CHROM_Y;
4685 } else if (letter == 'M') {
4686 return CHROM_MT;
4687 } else {
4688 return -1;
4689 }
4690 }
4691
int32_t get_chrom_code_raw(const char* sptr) {
  // Parses a chromosome token without reference to a Chrom_info: returns the
  // numeric value for 0..99, CHROM_X/CHROM_Y/CHROM_XY/CHROM_MT for
  // X/Y/XY/M(T) -- with an optional "chr" prefix, and accepting Oxford-style
  // "0X"/"0Y"/"0M" -- or -1 if unrecognized.
  // any character <= ' ' is considered a terminator
  // note that char arithmetic tends to be compiled to int32 operations, so we
  // mostly work with ints here
  // assumes MAX_CHROM_TEXTNUM_SLEN == 2
  uint32_t first_char_code = (unsigned char)sptr[0];
  uint32_t second_char_code = (unsigned char)sptr[1];
  if ((first_char_code & 0xdf) == 'C') {
    // strip a case-insensitive "chr" prefix; a lone 'C'/'c' start is invalid
    if (((second_char_code & 0xdf) == 'H') && ((((unsigned char)sptr[2]) & 0xdf) == 'R')) {
      sptr = &(sptr[3]);
      first_char_code = (unsigned char)sptr[0];
      second_char_code = (unsigned char)sptr[1];
    } else {
      return -1;
    }
  }
  if (second_char_code > ' ') {
    // two-character token
    if (sptr[2] > ' ') {
      // more than two characters: invalid
      return -1;
    }
    const uint32_t first_char_toi = first_char_code - '0';
    if (first_char_toi < 10) {
      const uint32_t second_char_toi = second_char_code - '0';
      if (second_char_toi < 10) {
        // two-digit numeric code
        return first_char_toi * 10 + second_char_toi;
      } else if (!first_char_toi) {
        // accept '0X', '0Y', '0M' emitted by Oxford software
        return single_letter_chrom(second_char_code);
      }
    } else {
      // "XY" and "MT" (case-insensitive)
      first_char_code &= 0xdf;
      if (first_char_code == 'X') {
        if ((second_char_code == 'Y') || (second_char_code == 'y')) {
          return CHROM_XY;
        }
      } else if (first_char_code == 'M') {
        if ((second_char_code == 'T') || (second_char_code == 't')) {
          return CHROM_MT;
        }
      }
    }
  } else {
    // single-character token: digit or X/Y/M
    const uint32_t first_char_toi = first_char_code - '0';
    if (first_char_toi < 10) {
      return first_char_toi;
    } else {
      return single_letter_chrom(first_char_code);
    }
  }
  return -1;
}
4743
int32_t get_chrom_code(const char* chrom_name, const Chrom_info* chrom_info_ptr, uint32_t name_slen) {
  // Resolves a chromosome name to this dataset's chromosome code (standard
  // code, species X/Y/XY/MT code, or registered nonstandard-name index).
  // requires chrom_name to be null-terminated
  // in practice, name_slen will usually already be known, may as well avoid
  // redundant strlen() calls even though this uglifies the interface
  // does not perform exhaustive error-checking
  // -1 = --allow-extra-chr ok, -2 = total fail
  const int32_t chrom_code_raw = get_chrom_code_raw(chrom_name);
  if (((const uint32_t)chrom_code_raw) <= chrom_info_ptr->max_code) {
    // standard in-range code (the unsigned cast also rejects -1/-2 here)
    return chrom_code_raw;
  }
  if (chrom_code_raw != -1) {
    if (chrom_code_raw >= MAX_POSSIBLE_CHROM) {
      // generic X/Y/XY/MT constant: translate to the species-specific code
      // (which may itself be -2 if undefined for this species)
      return chrom_info_ptr->xymt_codes[chrom_code_raw - MAX_POSSIBLE_CHROM];
    }
    // numeric code beyond this species' range
    return -2;
  }
  if (!chrom_info_ptr->name_ct) {
    // no nonstandard names registered yet
    return -1;
  }
  // 0xffffffffU gets casted to -1
  return (int32_t)nonstd_chrom_name_htable_find(chrom_name, (const char* const*)chrom_info_ptr->nonstd_names, chrom_info_ptr->nonstd_id_htable, name_slen);
}
4766
int32_t get_chrom_code_counted(const Chrom_info* chrom_info_ptr, uint32_t name_slen, char* chrom_name) {
  // get_chrom_code() wrapper for chromosome names which aren't
  // null-terminated: temporarily plants a terminator, then restores the
  // original byte.
  char* terminator_pos = &(chrom_name[name_slen]);
  const char orig_char = *terminator_pos;
  *terminator_pos = '\0';
  const int32_t retval = get_chrom_code(chrom_name, chrom_info_ptr, name_slen);
  *terminator_pos = orig_char;
  return retval;
}
4776
get_variant_chrom_fo_idx(const Chrom_info * chrom_info_ptr,uintptr_t variant_uidx)4777 uint32_t get_variant_chrom_fo_idx(const Chrom_info* chrom_info_ptr, uintptr_t variant_uidx) {
4778 const uint32_t* variant_binsearch = chrom_info_ptr->chrom_fo_vidx_start;
4779 uint32_t chrom_fo_min = 0;
4780 uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
4781 while (chrom_ct - chrom_fo_min > 1) {
4782 const uint32_t chrom_fo_cur = (chrom_ct + chrom_fo_min) / 2;
4783 if (variant_binsearch[chrom_fo_cur] > variant_uidx) {
4784 chrom_ct = chrom_fo_cur;
4785 } else {
4786 chrom_fo_min = chrom_fo_cur;
4787 }
4788 }
4789 return chrom_fo_min;
4790 }
4791
void chrom_error(const char* chrom_name, const char* file_descrip, const Chrom_info* chrom_info_ptr, uintptr_t line_idx, int32_t error_code) {
  // Prints an "invalid chromosome code" error, with extra guidance when the
  // code looks like a chromosome from a different species/chromosome set, or
  // when --allow-extra-chr would have accepted it (error_code == -1).
  // assumes chrom_name is null-terminated
  // line_idx == 0 suppresses the line-number portion of the message
  const int32_t raw_code = get_chrom_code_raw(chrom_name);
  logprint("\n");
  if (line_idx) {
    LOGERRPRINTFWW("Error: Invalid chromosome code '%s' on line %" PRIuPTR " of %s.\n", chrom_name, line_idx, file_descrip);
  } else {
    LOGERRPRINTFWW("Error: Invalid chromosome code '%s' in %s.\n", chrom_name, file_descrip);
  }
  if ((raw_code > ((int32_t)chrom_info_ptr->max_code)) && ((raw_code <= MAX_CHROM_TEXTNUM + XYMT_OFFSET_CT) || (raw_code >= MAX_POSSIBLE_CHROM))) {
    // the token parsed as a chromosome, just not one valid for the current
    // chromosome set: explain how to change the set
    if (chrom_info_ptr->species != SPECIES_UNKNOWN) {
      if (chrom_info_ptr->species == SPECIES_HUMAN) {
        logerrprint("(This is disallowed for humans.  Check if the problem is with your data, or if\nyou forgot to define a different chromosome set with e.g. --chr-set.)\n");
      } else {
        logerrprint("(This is disallowed by the PLINK 1.07 species flag you used.  You can\ntemporarily work around this restriction with --chr-set; contact the developers\nif you want the flag to be permanently redefined.)\n");
      }
    } else {
      logerrprint("(This is disallowed by your --chr-set/--autosome-num parameters.  Check if the\nproblem is with your data, or your command line.)\n");
    }
  } else if (error_code == -1) {
    logerrprint("(Use --allow-extra-chr to force it to be accepted.)\n");
  }
}
4815
// Registers a nonstandard chromosome/contig name, assigning it the next
// unused chromosome code and inserting it into the nonstandard-name hash
// table.  On success, returns 0 with the new code in *chrom_idx_ptr; returns
// RET_MALFORMED_INPUT on any validation failure (after logging an error), or
// RET_NOMEM if allocation fails.
int32_t try_to_add_chrom_name(const char* chrom_name, const char* file_descrip, uintptr_t line_idx, uint32_t name_slen, uint32_t allow_extra_chroms, int32_t* chrom_idx_ptr, Chrom_info* chrom_info_ptr) {
  // assumes chrom_name is nonstandard (i.e. not "2", "chr2", "chrX", etc.)
  // requires chrom_name to be null-terminated
  // assumes chrom_idx currently has the return value of get_chrom_code()
  if ((!allow_extra_chroms) || ((*chrom_idx_ptr) == -2)) {
    // code -2 is never acceptable (vs. -1, which just needs
    // --allow-extra-chr); either way, report and bail
    chrom_error(chrom_name, file_descrip, chrom_info_ptr, line_idx, *chrom_idx_ptr);
    return RET_MALFORMED_INPUT;
  }

  // quasi-bugfix: remove redundant hash table check

  if (chrom_name[0] == '#') {
    // redundant with some of the comment-skipping loaders, but this isn't
    // performance-critical
    logprint("\n");
    logerrprint("Error: Chromosome/contig names may not begin with '#'.\n");
    return RET_MALFORMED_INPUT;
  }
  if (name_slen > MAX_ID_SLEN) {
    logprint("\n");
    if (line_idx) {
      LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s has an excessively long chromosome/contig name. (The " PROG_NAME_CAPS " limit is " MAX_ID_SLEN_STR " characters.)\n", line_idx, file_descrip);
    } else {
      LOGERRPRINTFWW("Error: Excessively long chromosome/contig name in %s. (The " PROG_NAME_CAPS " limit is " MAX_ID_SLEN_STR " characters.)\n", file_descrip);
    }
    return RET_MALFORMED_INPUT;
  }
  // the new code is the first index past the standard codes plus all
  // previously registered nonstandard names
  const uint32_t max_code_p1 = chrom_info_ptr->max_code + 1;
  const uint32_t name_ct = chrom_info_ptr->name_ct;
  const uint32_t chrom_code_end = max_code_p1 + name_ct;
  if (chrom_code_end == MAX_POSSIBLE_CHROM) {
    logprint("\n");
    logerrprint("Error: Too many distinct nonstandard chromosome/contig names.\n");
    return RET_MALFORMED_INPUT;
  }
  if (!name_ct) {
    // lazy initialization
    fill_uint_one(CHROM_NAME_HTABLE_SIZE, chrom_info_ptr->nonstd_id_htable);
  }
  char** nonstd_names = chrom_info_ptr->nonstd_names;
  nonstd_names[chrom_code_end] = (char*)malloc(name_slen + 1);
  if (!nonstd_names[chrom_code_end]) {
    return RET_NOMEM;
  }
  // check whether this name was explicitly listed on the include/exclude
  // name stack (--chr/--not-chr style filters)
  Ll_str* name_stack_ptr = chrom_info_ptr->incl_excl_name_stack;
  uint32_t in_name_stack = 0;
  while (name_stack_ptr) {
    // there shouldn't be many of these, so sorting is unimportant
    if (!strcmp(chrom_name, name_stack_ptr->ss)) {
      in_name_stack = 1;
      break;
    }
    name_stack_ptr = name_stack_ptr->next;
  }
  // set the chrom_mask bit when the name is on an include-stack, or absent
  // from an exclude-stack (the no-stack case counts as the latter)
  if ((in_name_stack && chrom_info_ptr->is_include_stack) || ((!in_name_stack) && (!chrom_info_ptr->is_include_stack))) {
    SET_BIT(chrom_code_end, chrom_info_ptr->chrom_mask);
    // NOTE(review): bit 0 of haploid_mask set appears to mark an all-haploid
    // chromosome set, so new contigs inherit haploidy -- confirm
    if (chrom_info_ptr->haploid_mask[0] & 1) {
      SET_BIT(chrom_code_end, chrom_info_ptr->haploid_mask);
    }
  }
  memcpy(nonstd_names[chrom_code_end], chrom_name, name_slen + 1);
  *chrom_idx_ptr = (int32_t)chrom_code_end;
  chrom_info_ptr->name_ct = name_ct + 1;
  // insert into the open-addressed hash table; 0xffffffffU marks empty slots
  // (see the fill_uint_one() initialization above)
  uint32_t* id_htable = chrom_info_ptr->nonstd_id_htable;
  uint32_t hashval = murmurhash3_32(chrom_name, name_slen) % CHROM_NAME_HTABLE_SIZE;
  uint32_t next_incr = 1;
  while (1) {
    if (id_htable[hashval] == 0xffffffffU) {
      id_htable[hashval] = chrom_code_end;
      return 0;
    }
    // no overflow danger here
    hashval += next_incr;
    if (hashval >= CHROM_NAME_HTABLE_SIZE) {
      hashval -= CHROM_NAME_HTABLE_SIZE;
    }
    next_incr += 2; // quadratic probing
  }
}
4895
allele_set(const char * newval,uint32_t slen,char ** allele_ptr)4896 uint32_t allele_set(const char* newval, uint32_t slen, char** allele_ptr) {
4897 char* newptr;
4898 if (slen == 1) {
4899 newptr = (char*)(&(g_one_char_strs[((unsigned char)*newval) * 2]));
4900 } else {
4901 newptr = (char*)malloc(slen + 1);
4902 if (!newptr) {
4903 return 1;
4904 }
4905 memcpyx(newptr, newval, slen, '\0');
4906 }
4907 *allele_ptr = newptr;
4908 return 0;
4909 }
4910
allele_reset(const char * newval,uint32_t slen,char ** allele_ptr)4911 uint32_t allele_reset(const char* newval, uint32_t slen, char** allele_ptr) {
4912 char* newptr;
4913 if (slen == 1) {
4914 newptr = (char*)(&(g_one_char_strs[((uint8_t)*newval) * 2]));
4915 } else {
4916 newptr = (char*)malloc(slen + 1);
4917 if (!newptr) {
4918 return 1;
4919 }
4920 memcpyx(newptr, newval, slen, '\0');
4921 }
4922 if (allele_ptr[0][1]) {
4923 free(*allele_ptr);
4924 }
4925 *allele_ptr = newptr;
4926 return 0;
4927 }
4928
cleanup_allele_storage(uint32_t max_allele_slen,uintptr_t allele_storage_entry_ct,char ** allele_storage)4929 void cleanup_allele_storage(uint32_t max_allele_slen, uintptr_t allele_storage_entry_ct, char** allele_storage) {
4930 if (allele_storage && (max_allele_slen > 1)) {
4931 const uintptr_t one_char_strs_addr = (uintptr_t)g_one_char_strs;
4932 for (uintptr_t idx = 0; idx < allele_storage_entry_ct; ++idx) {
4933 char* cur_entry = allele_storage[idx];
4934 assert(cur_entry);
4935 // take advantage of unsigned wraparound
4936 if ((((uintptr_t)cur_entry) - one_char_strs_addr) >= 512) {
4937 free(cur_entry);
4938 }
4939 }
4940 }
4941 }
4942
cleanup_allele_storage2(uintptr_t allele_storage_entry_ct,char ** allele_storage)4943 void cleanup_allele_storage2(uintptr_t allele_storage_entry_ct, char** allele_storage) {
4944 if (allele_storage) {
4945 const uintptr_t one_char_strs_addr = (uintptr_t)g_one_char_strs;
4946 for (uintptr_t idx = 0; idx < allele_storage_entry_ct;) {
4947 char* cur_entry = allele_storage[idx];
4948 if (!cur_entry) {
4949 // --merge-equal-pos hacked entry
4950 idx += 2;
4951 continue;
4952 }
4953 // take advantage of unsigned wraparound
4954 if ((((uintptr_t)cur_entry) - one_char_strs_addr) >= 512) {
4955 free(cur_entry);
4956 }
4957 ++idx;
4958 }
4959 }
4960 }
4961
// Advances *chrom_fo_idx_ptr (file-order chromosome index) until marker_uidx
// falls before *chrom_end_ptr, then refreshes the X/Y/MT/haploid flags for
// the chromosome containing marker_uidx.
void refresh_chrom_info(const Chrom_info* chrom_info_ptr, uintptr_t marker_uidx, uint32_t* __restrict chrom_end_ptr, uint32_t* __restrict chrom_fo_idx_ptr, uint32_t* __restrict is_x_ptr, uint32_t* __restrict is_y_ptr, uint32_t* __restrict is_mt_ptr, uint32_t* __restrict is_haploid_ptr) {
  // assumes we are at the end of the chromosome denoted by chrom_fo_idx. Ok
  // for chrom_fo_idx == 0xffffffffU.
  // assumes marker_uidx < unfiltered_marker_ct
  // (the 0xffffffffU case works because the +1 wraps to index 0)
  *chrom_end_ptr = chrom_info_ptr->chrom_fo_vidx_start[(*chrom_fo_idx_ptr) + 1];
  while (marker_uidx >= (*chrom_end_ptr)) {
    *chrom_end_ptr = chrom_info_ptr->chrom_fo_vidx_start[(++(*chrom_fo_idx_ptr)) + 1];
  }
  const int32_t chrom_idx = chrom_info_ptr->chrom_file_order[*chrom_fo_idx_ptr];
  *is_x_ptr = (chrom_idx == chrom_info_ptr->xymt_codes[X_OFFSET]);
  *is_y_ptr = (chrom_idx == chrom_info_ptr->xymt_codes[Y_OFFSET]);
  *is_mt_ptr = (chrom_idx == chrom_info_ptr->xymt_codes[MT_OFFSET]);
  *is_haploid_ptr = is_set(chrom_info_ptr->haploid_mask, chrom_idx);
}
4976
// Assumes there is at least one marker, and there are no split chromosomes.
// Returns the first marker_uidx if every non-excluded marker lies on a
// single chromosome, or -1 if more than one chromosome is present.
int32_t single_chrom_start(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t unfiltered_marker_ct) {
  const uint32_t first_uidx = next_unset_unsafe(marker_exclude, 0);
  const uint32_t last_uidx = last_clear_bit(marker_exclude, unfiltered_marker_ct);
  // with no split chromosomes, first and last markers agreeing on their
  // chromosome implies everything in between agrees too
  if (get_variant_chrom(chrom_info_ptr, first_uidx) != get_variant_chrom(chrom_info_ptr, last_uidx)) {
    return -1;
  }
  return first_uidx;
}
4985
// Median of a sorted double array; returns 0.0 for an empty array.
double get_dmedian(const double* sorted_arr, uintptr_t len) {
  if (!len) {
    return 0.0;
  }
  const uintptr_t half = len / 2;
  // odd length: middle element; even length: mean of the two middle elements
  return (len % 2)? sorted_arr[half] : ((sorted_arr[half - 1] + sorted_arr[half]) * 0.5);
}
4997
4998 #ifdef __cplusplus
destructive_get_dmedian(uintptr_t len,double * unsorted_arr)4999 double destructive_get_dmedian(uintptr_t len, double* unsorted_arr) {
5000 if (!len) {
5001 return 0.0;
5002 }
5003 uintptr_t len_d2 = len / 2;
5004 std::nth_element(unsorted_arr, &(unsorted_arr[len_d2]), &(unsorted_arr[len]));
5005 if (!(len % 2)) {
5006 std::nth_element(unsorted_arr, &(unsorted_arr[len_d2 - 1]), &(unsorted_arr[len_d2]));
5007 return (unsorted_arr[len_d2 - 1] + unsorted_arr[len_d2]) * 0.5;
5008 } else {
5009 return unsorted_arr[len_d2];
5010 }
5011 }
5012 #else
destructive_get_dmedian(uintptr_t len,double * unsorted_arr)5013 double destructive_get_dmedian(uintptr_t len, double* unsorted_arr) {
5014 // no, I'm not gonna bother reimplementing introselect just for folks who
5015 // insist on using gcc over g++
5016 qsort(unsorted_arr, len, sizeof(double), double_cmp);
5017 return get_dmedian(unsorted_arr, len);
5018 }
5019 #endif
5020
// qsort-compatible wrapper around plain ASCII strcmp.
int32_t strcmp_casted(const void* s1, const void* s2) {
  return strcmp((const char*)s1, (const char*)s2);
}
5024
5025 // PLINK 2's natural sort uses the following logic:
5026 // - All alphabetic characters act as if they are capitalized, except for
5027 // tiebreaking purposes (where ASCII is used).
5028 // - Numbers are compared by magnitude, with the exception of...
5029 // - Numbers with leading zero(es). If you're putting extraneous zeroes in
5030 // front of IDs, we assume they're there to force particular items to be sorted
5031 // earlier, rather than just appearing at random. So, unlike many natural sort
5032 // implementations, we sort 00200 < 021 < 20: all numbers with n leading zeroes
5033 // are sorted before all numbers with (n-1) leading zeroes; magnitude only
5034 // applies if the leading zero counts match. This handles e.g. subbasement
5035 // room numbering properly.
5036 //
5037 // This won't always do what you want if your IDs have variable-length decimals
5038 // in them (e.g. it yields 0.99 < 0.101); if you don't want to fall back on
5039 // ASCII sort, enforce a fixed number of digits after the decimal point. Also
5040 // note that ASCII sort is outright better for e.g. numbers represented in
5041 // hexadecimal or base 36. In principle, it's possible to reliably autodetect
5042 // some of these cases (especially hexadecimal numbers beginning with "0x"),
5043 // but that'll never be perfect so we just let the user toggle the sort method.
// Resolves a natural-sort comparison after two numbers' digits have already
// diverged with the s1 digit smaller: whichever number's digit run ends
// first has the smaller magnitude; on a simultaneous end, the earlier
// divergence (s1 < s2) decides.
int32_t strcmp_natural_scan_forward(const unsigned char* s1, const unsigned char* s2) {
  // assumes s1 and s2 currently point to the middle of a mismatching number,
  // where s1 < s2.
  unsigned char c1;
  unsigned char c2;
  do {
    c1 = *(++s1);
    c2 = *(++s2);
    if (is_not_digit(c1)) {
      // s1's number ended no later than s2's: s1's number is smaller
      return -1;
    }
  } while (is_digit(c2));
  // s2's number ended while s1 still has digits: s1's number is larger
  return 1;
}
5058
5059 // We have the following major states:
5060 // 0 (initial): strings perfectly match so far, last char (if any) is
5061 // nonnumeric.
5062 // 1: strings perfectly match so far, last char is numeric.
5063 // 2: strings match except for capitalization, last char is nonnumeric.
5064 // 3: strings match except for capitalization, last char is numeric.
5065 // strcmp_natural_tiebroken() expresses the logic for states 2 and 3, while
5066 // strcmp_natural_uncasted() handles states 0 and 1.
// Natural-sort comparison for strings that have already matched except for
// letter case (major states 2 and 3 in the comment above).  Ties are broken
// in favor of s2 -- i.e. -1 is returned when the remainders match exactly --
// because the caller has already observed a casing difference.
int32_t strcmp_natural_tiebroken(const unsigned char* s1, const unsigned char* s2) {
  // assumes ties should be broken in favor of s2.
  // (the characters at the entry position were already handled by the
  // caller, hence the immediate pre-increments)
  unsigned char c1 = *(++s1);
  unsigned char c2 = *(++s2);
  while (is_not_nzdigit(c1) && is_not_nzdigit(c2)) {
    // state 2
  strcmp_natural_tiebroken_state_2:
    if (c1 != c2) {
      // compare caselessly: uppercase both ASCII letters before deciding
      if ((c1 >= 'a') && (c1 <= 'z')) {
	c1 -= 32;
      }
      if ((c2 >= 'a') && (c2 <= 'z')) {
	c2 -= 32;
      }
      if (c1 < c2) {
	return -1;
      } else if (c1 > c2) {
	return 1;
      }
      // caseless match: keep scanning
    } else if (!c1) {
      // both strings ended together: perfect caseless tie, s2 wins
      return -1;
    }
    c1 = *(++s1);
    c2 = *(++s2);
  }
  if (is_not_nzdigit(c1) || is_not_nzdigit(c2)) {
    // exactly one side starts a (nonzero-leading) number here; plain
    // character order decides
    return (c1 < c2)? -1 : 1;
  }
  do {
    // state 3: inside matching numbers
    if (c1 != c2) {
      if (is_digit(c2)) {
	// digits diverged: magnitude decides, via forward scan
	if (c1 < c2) {
	  return strcmp_natural_scan_forward(s1, s2);
	} else {
	  return -strcmp_natural_scan_forward(s2, s1);
	}
      }
      // c1 is a digit but c2 is not: s1's number has more digits, so it is
      // larger
      return 1;
    }
    c1 = *(++s1);
    c2 = *(++s2);
  } while (is_digit(c1));
  if (is_digit(c2)) {
    // s2's number has more digits: s1's number is smaller
    return -1;
  }
  // both numbers ended identically; resume nonnumeric comparison.
  // skip the while (is_not_digit...) check
  goto strcmp_natural_tiebroken_state_2;
}
5116
// Core natural-sort comparison (major states 0 and 1 above): the strings
// have matched exactly so far.  When the only difference seen is letter
// case, control transfers to strcmp_natural_tiebroken() so that ASCII case
// order is used strictly as a tiebreaker.
static inline int32_t strcmp_natural_uncasted(const unsigned char* s1, const unsigned char* s2) {
  unsigned char c1 = *s1;
  unsigned char c2 = *s2;
  while (is_not_nzdigit(c1) && is_not_nzdigit(c2)) {
    // state 0
  strcmp_natural_uncasted_state_0:
    if (c1 != c2) {
      if ((c1 >= 'a') && (c1 <= 'z')) {
	// c2 is the uppercase form of c1: a case-only mismatch, so the rest
	// of the strings decide (lowercase s1 loses only on a full tie)
	if (c2 + 32 == c1) {
	  return -strcmp_natural_tiebroken(s2, s1);
	} else if ((c2 < 'a') || (c2 > 'z')) {
	  // uppercase c1 for the caseless comparison below
	  c1 -= 32;
	}
      } else if ((c2 >= 'a') && (c2 <= 'z')) {
	c2 -= 32;
	if (c1 == c2) {
	  // mirror image: c1 was the uppercase form of c2
	  return strcmp_natural_tiebroken(s1, s2);
	}
      }
      return (c1 < c2)? -1 : 1;
    } else if (!c1) {
      // simultaneous end: strings are identical
      return 0;
    }
    c1 = *(++s1);
    c2 = *(++s2);
  }
  if (is_not_nzdigit(c1) || is_not_nzdigit(c2)) {
    // only one side starts a (nonzero-leading) number; character order wins
    return (c1 < c2)? -1 : 1;
  }
  do {
    // state 1: inside matching numbers
    if (c1 != c2) {
      if (is_digit(c2)) {
	// digits diverged: number magnitude decides, via forward scan
	if (c1 < c2) {
	  return strcmp_natural_scan_forward(s1, s2);
	} else {
	  return -strcmp_natural_scan_forward(s2, s1);
	}
      }
      // c1 is a digit, c2 is not: s1's number is longer, hence larger
      return 1;
    }
    c1 = *(++s1);
    c2 = *(++s2);
  } while (is_digit(c1));
  if (is_digit(c2)) {
    // s2's number is longer, hence larger
    return -1;
  }
  // numbers matched exactly; resume nonnumeric comparison
  goto strcmp_natural_uncasted_state_0;
}
5166
// qsort-compatible natural-sort comparator for char arrays.
int32_t strcmp_natural(const void* s1, const void* s2) {
  return strcmp_natural_uncasted((const unsigned char*)s1, (const unsigned char*)s2);
}
5170
// qsort comparator for arrays of char* (dereferences before strcmp).
int32_t strcmp_deref(const void* s1, const void* s2) {
  const char* str1 = *(const char* const*)s1;
  const char* str2 = *(const char* const*)s2;
  return strcmp(str1, str2);
}
5174
// qsort comparator for arrays of char*, using natural-sort order.
int32_t strcmp_natural_deref(const void* s1, const void* s2) {
  const unsigned char* str1 = *(const unsigned char* const*)s1;
  const unsigned char* str2 = *(const unsigned char* const*)s2;
  return strcmp_natural_uncasted(str1, str2);
}
5178
// Linear search for idstr among the id_ct non-excluded entries of the
// unsorted, fixed-stride ID array.  Returns the raw (unfiltered) index of
// the first match, or -1 if absent.
int32_t get_uidx_from_unsorted(const char* idstr, const uintptr_t* exclude_arr, uint32_t id_ct, const char* unsorted_ids, uintptr_t max_id_len) {
  const uintptr_t slen_p1 = strlen(idstr) + 1;
  if (slen_p1 > max_id_len) {
    // cannot fit in a slot, so it cannot match anything
    return -1;
  }
  uintptr_t id_uidx = 0;
  for (uint32_t id_idx = 0; id_idx < id_ct; ++id_uidx, ++id_idx) {
    id_uidx = next_unset_ul_unsafe(exclude_arr, id_uidx);
    // comparing through the terminating null guarantees an exact match
    if (!memcmp(idstr, &(unsorted_ids[id_uidx * max_id_len]), slen_p1)) {
      return (int32_t)((uint32_t)id_uidx);
    }
  }
  return -1;
}
5194
// Returns a pointer to the first of two adjacent equal IDs in a sorted,
// fixed-stride ID array, or NULL if all entries are distinct.
// Fix: the previous implementation decremented id_ct before the loop, so an
// id_ct of 0 wrapped around (uintptr_t underflow) and caused an
// out-of-bounds scan; iterating from index 1 avoids the decrement entirely
// and makes id_ct == 0 safe.
char* scan_for_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len) {
  for (uintptr_t id_idx = 1; id_idx < id_ct; id_idx++) {
    char* prev_id = &(sorted_ids[(id_idx - 1) * max_id_len]);
    if (!strcmp(prev_id, &(sorted_ids[id_idx * max_id_len]))) {
      return prev_id;
    }
  }
  return NULL;
}
5205
// Extended scan_for_duplicate_ids() which also verifies that no entry in
// sorted_ids matches any entry in sorted_nonoverlap_ids.  Returns a pointer
// to the offending sorted_ids entry, or NULL when there is neither a
// duplicate nor an overlap.
// nonoverlap_id_ct == 0 with sorted_nonoverlap_ids == NULL is fine; id_ct
// cannot be zero, though.
char* scan_for_duplicate_or_overlap_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, const char* sorted_nonoverlap_ids, uintptr_t nonoverlap_id_ct, uintptr_t max_nonoverlap_id_len) {
  uintptr_t nonoverlap_id_idx = 0;
  uintptr_t id_idx = 0;
  char* cur_id_ptr = sorted_ids;
  for (;;) {
    if (nonoverlap_id_idx == nonoverlap_id_ct) {
      // no nonoverlap entries left to collide with; only duplicates within
      // the remainder of sorted_ids are possible
      return scan_for_duplicate_ids(cur_id_ptr, id_ct - id_idx, max_id_len);
    }
    const char* nonoverlap_id_ptr = &(sorted_nonoverlap_ids[nonoverlap_id_idx * max_nonoverlap_id_len]);
    const int32_t cmp_result = strcmp(cur_id_ptr, nonoverlap_id_ptr);
    if (!cmp_result) {
      // overlap with the forbidden list
      return cur_id_ptr;
    }
    if (cmp_result > 0) {
      nonoverlap_id_idx++;
      continue;
    }
    // cur_id_ptr sorts before the next nonoverlap entry: advance it,
    // checking for an adjacent duplicate along the way
    if (++id_idx == id_ct) {
      return NULL;
    }
    char* next_id_ptr = &(cur_id_ptr[max_id_len]);
    if (!strcmp(cur_id_ptr, next_id_ptr)) {
      return cur_id_ptr;
    }
    cur_id_ptr = next_id_ptr;
  }
}
5239
// Returns 1 if phenotype string bufptr is consistent with case/control
// coding: unparseable as a number, equal to the missing-phenotype code, or a
// bare 0/1/2 token.
int32_t eval_affection(const char* bufptr, double missing_phenod) {
  // turns out --1 had the side-effect of *forcing* case/control
  // interpretation in 1.07. We replicate that for backward compatibility, and
  // no longer call this function in that context.
  // this used to be an integer read, but that could do the wrong thing if
  // e.g. all phenotypes were -9.xxx...
  char* parse_end;
  const double dxx = strtod(bufptr, &parse_end);
  if ((parse_end == bufptr) || (dxx == missing_phenod)) {
    return 1;
  }
  const char first_char = bufptr[0];
  return ((first_char == '0') || (first_char == '1') || (first_char == '2')) && is_space_or_eoln(bufptr[1]);
}
5254
triangle_divide(int64_t cur_prod,int32_t modif)5255 uint32_t triangle_divide(int64_t cur_prod, int32_t modif) {
5256 // return smallest integer vv for which (vv * (vv + modif)) is no smaller
5257 // than cur_prod, and neither term in the product is negative. (Note the
5258 // lack of a divide by two; cur_prod should also be double its "true" value
5259 // as a result.)
5260 int64_t vv;
5261 if (cur_prod == 0) {
5262 if (modif < 0) {
5263 return -modif;
5264 } else {
5265 return 0;
5266 }
5267 }
5268 vv = (int64_t)sqrt((double)cur_prod);
5269 while ((vv - 1) * (vv + modif - 1) >= cur_prod) {
5270 vv--;
5271 }
5272 while (vv * (vv + modif) < cur_prod) {
5273 vv++;
5274 }
5275 return vv;
5276 }
5277
// Computes the [start, end) row bounds for piece parallel_idx of
// parallel_tot, splitting the triangular workload over rows [start, ct) into
// pieces of roughly equal weight.
void parallel_bounds(uint32_t ct, int32_t start, uint32_t parallel_idx, uint32_t parallel_tot, int32_t* __restrict bound_start_ptr, int32_t* __restrict bound_end_ptr) {
  const int32_t weight_modif = 1 - start * 2;
  // doubled total triangular weight (matches triangle_divide()'s convention)
  const int64_t total_weight = ((int64_t)ct) * (ct + weight_modif);
  *bound_start_ptr = triangle_divide((total_weight * parallel_idx) / parallel_tot, weight_modif);
  *bound_end_ptr = triangle_divide((total_weight * (parallel_idx + 1)) / parallel_tot, weight_modif);
}
5284
5285 // this might belong in plink_calc instead, not being used anywhere else
5286 // set align to 1 for no alignment
// Splits this parallel job's triangle-workload row range into `pieces`
// chunks of roughly equal triangular weight, writing the pieces+1 chunk
// boundaries into target_arr[0..pieces].  Boundaries are rounded so chunk
// edges respect the requested alignment (set align to 1 for no alignment).
void triangle_fill(uint32_t ct, uint32_t pieces, uint32_t parallel_idx, uint32_t parallel_tot, uint32_t start, uint32_t align, uint32_t* target_arr) {
  int32_t modif = 1 - start * 2;
  uint32_t cur_piece = 1;
  int64_t ct_tr;       // target (doubled) triangular weight per piece
  int64_t cur_prod;    // running (doubled) triangular weight at the boundary
  int32_t lbound;
  int32_t ubound;
  uint32_t uii;
  uint32_t align_m1;
  parallel_bounds(ct, start, parallel_idx, parallel_tot, &lbound, &ubound);
  // x(x+1)/2 is divisible by y iff (x % (2y)) is 0 or (2y - 1).
  align *= 2;
  align_m1 = align - 1;
  target_arr[0] = lbound;
  target_arr[pieces] = ubound;
  cur_prod = ((int64_t)lbound) * (lbound + modif);
  ct_tr = (((int64_t)ubound) * (ubound + modif) - cur_prod) / pieces;
  while (cur_piece < pieces) {
    cur_prod += ct_tr;
    lbound = triangle_divide(cur_prod, modif);
    // round the boundary up to the next offset that is 0 or align_m1
    // (mod 2*align) relative to start, per the divisibility note above --
    // unless it already is one
    uii = (lbound - ((int32_t)start)) & align_m1;
    if ((uii) && (uii != align_m1)) {
      lbound = start + ((lbound - ((int32_t)start)) | align_m1);
    }
    // lack of this check caused a nasty bug earlier
    if (((uint32_t)lbound) > ct) {
      lbound = ct;
    }
    target_arr[cur_piece++] = lbound;
  }
}
5318
relationship_req(uint64_t calculation_type)5319 int32_t relationship_req(uint64_t calculation_type) {
5320 return (calculation_type & (CALC_RELATIONSHIP | CALC_UNRELATED_HERITABILITY | CALC_REL_CUTOFF | CALC_REGRESS_REL | CALC_PCA))? 1 : 0;
5321 }
5322
distance_req(const char * read_dists_fname,uint64_t calculation_type)5323 int32_t distance_req(const char* read_dists_fname, uint64_t calculation_type) {
5324 return ((calculation_type & CALC_DISTANCE) || ((calculation_type & (CALC_PLINK1_DISTANCE_MATRIX | CALC_PLINK1_IBS_MATRIX)) && (!(calculation_type & CALC_GENOME))) || ((!read_dists_fname) && (calculation_type & (CALC_IBS_TEST | CALC_GROUPDIST | CALC_REGRESS_DISTANCE))));
5325 }
5326
// qsort comparator: ascending doubles.  (Direct comparisons instead of a
// subtraction; either form yields 0 when NaN is involved.)
int32_t double_cmp(const void* aa, const void* bb) {
  const double val1 = *((const double*)aa);
  const double val2 = *((const double*)bb);
  if (val1 > val2) {
    return 1;
  }
  if (val1 < val2) {
    return -1;
  }
  return 0;
}
5337
// qsort comparator: descending doubles.  (Direct comparisons instead of a
// subtraction; either form yields 0 when NaN is involved.)
int32_t double_cmp_decr(const void* aa, const void* bb) {
  const double val1 = *((const double*)aa);
  const double val2 = *((const double*)bb);
  if (val1 < val2) {
    return 1;
  }
  if (val1 > val2) {
    return -1;
  }
  return 0;
}
5348
// qsort comparator: ascending doubles addressed through an extra pointer
// level (entries are double*).
int32_t double_cmp_deref(const void* aa, const void* bb) {
  const double val1 = **((const double* const*)aa);
  const double val2 = **((const double* const*)bb);
  if (val1 > val2) {
    return 1;
  }
  if (val1 < val2) {
    return -1;
  }
  return 0;
}
5359
// Comparator for arrays of char*: compares only the FIRST character of each
// string, returning the signed character difference.
int32_t char_cmp_deref(const void* aa, const void* bb) {
  const char first1 = **((const char* const*)aa);
  const char first2 = **((const char* const*)bb);
  return (int32_t)(first1 - first2);
}
5363
// qsort comparator: ascending int32_t.
// Fix: the previous (*aa - *bb) form overflows -- undefined behavior -- for
// operands of opposite sign and large magnitude (e.g. INT32_MIN vs. 1),
// which could invert the sign of the result.  qsort only requires the sign,
// so a comparison-based form is used instead.
int32_t intcmp(const void* aa, const void* bb) {
  const int32_t val1 = *((const int32_t*)aa);
  const int32_t val2 = *((const int32_t*)bb);
  return (val1 > val2) - (val1 < val2);
}
5367
// qsort comparator: ascending uint32_t (subtraction would be meaningless
// for unsigned operands).
int32_t uintcmp(const void* aa, const void* bb) {
  const uint32_t val1 = *((const uint32_t*)aa);
  const uint32_t val2 = *((const uint32_t*)bb);
  if (val1 == val2) {
    return 0;
  }
  return (val1 < val2)? -1 : 1;
}
5375
// qsort comparator: ascending int32_t, overflow-safe comparison form.
int32_t intcmp2(const void* aa, const void* bb) {
  const int32_t val1 = *((const int32_t*)aa);
  const int32_t val2 = *((const int32_t*)bb);
  if (val1 == val2) {
    return 0;
  }
  return (val1 < val2)? -1 : 1;
}
5383
// Descending-order comparator for triples of int32_t: compares up to three
// consecutive fields, first difference wins.
// Fix: the previous raw-subtraction form overflows (undefined behavior) for
// fields of opposite sign and large magnitude, which could invert the sign
// of the result.  Callers only need the sign, so overflow-safe comparisons
// are used instead.
int32_t intcmp3_decr(const void* aa, const void* bb) {
  const int32_t* triple1 = (const int32_t*)aa;
  const int32_t* triple2 = (const int32_t*)bb;
  for (uint32_t field_idx = 0; field_idx < 3; ++field_idx) {
    const int32_t field1 = triple1[field_idx];
    const int32_t field2 = triple2[field_idx];
    if (field1 != field2) {
      // descending: larger value sorts first
      return (field2 > field1) - (field2 < field1);
    }
  }
  return 0;
}
5395
#ifndef __cplusplus
// qsort comparator: ascending int64_t (only compiled for the C build; see
// the #ifndef).
// Fix: the previous (*aa - *bb) form overflows -- undefined behavior -- when
// the operands' true difference exceeds the int64_t range (e.g. INT64_MAX
// vs. a negative value).  Only the sign matters to qsort, so direct
// comparisons are used instead.
int32_t llcmp(const void* aa, const void* bb) {
  const int64_t val1 = *((const int64_t*)aa);
  const int64_t val2 = *((const int64_t*)bb);
  if (val1 > val2) {
    return 1;
  }
  if (val1 < val2) {
    return -1;
  }
  return 0;
}
#endif
5408
5409 // alas, qsort_r not available on some Linux distributions
5410
5411 // Normally use qsort_ext(), but this version is necessary before g_bigstack
5412 // has been allocated.
// Sorts main_arr while applying the identical permutation to secondary_arr,
// without requiring qsort_r (not available on some Linux distributions).
// proxy_arr is caller-supplied scratch space holding arr_length records of
// proxy_len bytes; each record temporarily stores {pointer into main_arr,
// copy of secondary item} so comparator_deref (which dereferences the
// leading pointer) can order both arrays together.  proxy_len must be large
// enough for both that layout and a bare main_arr item (see qsort_ext()).
void qsort_ext2(char* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, uintptr_t secondary_item_len, char* proxy_arr, uintptr_t proxy_len) {
  uintptr_t ulii;
  // pack: leading pointer to the main item, followed by the secondary item
  for (ulii = 0; ulii < arr_length; ulii++) {
    *(char**)(&(proxy_arr[ulii * proxy_len])) = &(main_arr[ulii * item_length]);
    memcpy(&(proxy_arr[ulii * proxy_len + sizeof(void*)]), &(secondary_arr[ulii * secondary_item_len]), secondary_item_len);
  }
  qsort(proxy_arr, arr_length, proxy_len, comparator_deref);
  // unpack pass 1: write the reordered secondary items back, then overwrite
  // each proxy record in place with a copy of the main item it points to
  // (safe because the pointer is consumed before the overwrite)
  for (ulii = 0; ulii < arr_length; ulii++) {
    memcpy(&(secondary_arr[ulii * secondary_item_len]), &(proxy_arr[ulii * proxy_len + sizeof(void*)]), secondary_item_len);
    memcpy(&(proxy_arr[ulii * proxy_len]), *(char**)(&(proxy_arr[ulii * proxy_len])), item_length);
  }
  // unpack pass 2: copy the reordered main items back (done as a separate
  // pass so pass 1 never clobbers a main_arr item it still needs to read)
  for (ulii = 0; ulii < arr_length; ulii++) {
    memcpy(&(main_arr[ulii * item_length]), &(proxy_arr[ulii * proxy_len]), item_length);
  }
}
5428
5429 // This actually tends to be faster than just sorting an array of indices,
5430 // because of memory locality issues.
qsort_ext(char * main_arr,uintptr_t arr_length,uintptr_t item_length,int (* comparator_deref)(const void *,const void *),char * secondary_arr,intptr_t secondary_item_len)5431 int32_t qsort_ext(char* main_arr, uintptr_t arr_length, uintptr_t item_length, int(* comparator_deref)(const void*, const void*), char* secondary_arr, intptr_t secondary_item_len) {
5432 // main_arr = packed array of equal-length items to sort
5433 // arr_length = number of items
5434 // item_length = byte count of each main_arr item
5435 // comparator_deref = returns positive if *first > *second, 0 if equal,
5436 // negative if *first < *second. Note the extra
5437 // dereference.
5438 // secondary_arr = packed array of fixed-length records associated with the
5439 // main_arr items, to be resorted in the same way. (e.g.
5440 // if one is building an index, this could start as a sorted
5441 // 0..(n-1) sequence of integers; then, post-sort, this would
5442 // be a lookup table for the original position of each
5443 // main_arr item.)
5444 // secondary_item_len = byte count of each secondary_arr item
5445 uintptr_t proxy_len = secondary_item_len + sizeof(void*);
5446 unsigned char* bigstack_mark = g_bigstack_base;
5447 char* proxy_arr;
5448 if (!arr_length) {
5449 return 0;
5450 }
5451 if (proxy_len < item_length) {
5452 proxy_len = item_length;
5453 }
5454 if (bigstack_alloc_c(arr_length * proxy_len, &proxy_arr)) {
5455 return -1;
5456 }
5457 qsort_ext2(main_arr, arr_length, item_length, comparator_deref, secondary_arr, secondary_item_len, proxy_arr, proxy_len);
5458 bigstack_reset(bigstack_mark);
5459 return 0;
5460 }
5461
sort_item_ids_noalloc(uintptr_t unfiltered_ct,const uintptr_t * exclude_arr,uintptr_t item_ct,const char * __restrict item_ids,uintptr_t max_id_len,uint32_t allow_dups,uint32_t collapse_idxs,int (* comparator_deref)(const void *,const void *),char * __restrict sorted_ids,uint32_t * id_map)5462 int32_t sort_item_ids_noalloc(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t item_ct, const char* __restrict item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*), char* __restrict sorted_ids, uint32_t* id_map) {
5463 // Stores a lexicographically sorted list of IDs in sorted_ids and the raw
5464 // positions of the corresponding markers/samples in *id_map_ptr. Does not
5465 // include excluded markers/samples in the list.
5466 // Assumes sorted_ids and id_map have been allocated; use the sort_item_ids()
5467 // wrapper if they haven't been.
5468 // Note that this DOES still perform a "stack" allocation (in the qsort_ext()
5469 // call).
5470 uint32_t uii = 0;
5471 char* dup_id;
5472 char* tptr;
5473 uint32_t ujj;
5474 if (!item_ct) {
5475 return 0;
5476 }
5477 if (!collapse_idxs) {
5478 for (ujj = 0; ujj < item_ct; uii++, ujj++) {
5479 next_unset_unsafe_ck(exclude_arr, &uii);
5480 memcpy(&(sorted_ids[ujj * max_id_len]), &(item_ids[uii * max_id_len]), max_id_len);
5481 id_map[ujj] = uii;
5482 }
5483 } else {
5484 for (ujj = 0; ujj < item_ct; uii++, ujj++) {
5485 next_unset_unsafe_ck(exclude_arr, &uii);
5486 memcpy(&(sorted_ids[ujj * max_id_len]), &(item_ids[uii * max_id_len]), max_id_len);
5487 id_map[ujj] = ujj;
5488 }
5489 }
5490 if (qsort_ext(sorted_ids, item_ct, max_id_len, comparator_deref, (char*)id_map, sizeof(int32_t))) {
5491 return RET_NOMEM;
5492 }
5493 if (!allow_dups) {
5494 dup_id = scan_for_duplicate_ids(sorted_ids, item_ct, max_id_len);
5495 if (dup_id) {
5496 tptr = strchr(dup_id, '\t');
5497 if (tptr) {
5498 *tptr = ' ';
5499 }
5500 LOGERRPRINTFWW("Error: Duplicate ID '%s'.\n", dup_id);
5501 return RET_INVALID_FORMAT;
5502 }
5503 }
5504 return 0;
5505 }
5506
sort_item_ids(uintptr_t unfiltered_ct,const uintptr_t * exclude_arr,uintptr_t exclude_ct,const char * __restrict item_ids,uintptr_t max_id_len,uint32_t allow_dups,uint32_t collapse_idxs,int (* comparator_deref)(const void *,const void *),char ** sorted_ids_ptr,uint32_t ** id_map_ptr)5507 int32_t sort_item_ids(uintptr_t unfiltered_ct, const uintptr_t* exclude_arr, uintptr_t exclude_ct, const char* __restrict item_ids, uintptr_t max_id_len, uint32_t allow_dups, uint32_t collapse_idxs, int(* comparator_deref)(const void*, const void*), char** sorted_ids_ptr, uint32_t** id_map_ptr) {
5508 uintptr_t item_ct = unfiltered_ct - exclude_ct;
5509 // id_map on bottom because --indiv-sort frees *sorted_ids_ptr
5510 if (bigstack_alloc_ui(item_ct, id_map_ptr) ||
5511 bigstack_alloc_c(item_ct * max_id_len, sorted_ids_ptr)) {
5512 return RET_NOMEM;
5513 }
5514 return sort_item_ids_noalloc(unfiltered_ct, exclude_arr, item_ct, item_ids, max_id_len, allow_dups, collapse_idxs, comparator_deref, *sorted_ids_ptr, *id_map_ptr);
5515 }
5516
// Binary search: returns the number of elements in sorted_uint32_arr that
// are strictly less than uii.  Assumes arr_length is nonzero and the array
// is in nondecreasing order.  (Useful for searching marker_pos.)
uint32_t uint32arr_greater_than(const uint32_t* sorted_uint32_arr, uint32_t arr_length, uint32_t uii) {
  // invariants: uii exceeds sorted_uint32_arr[lo - 1] when that exists, and
  // is no greater than sorted_uint32_arr[hi + 1] when that exists.  hi can
  // transiently reach -1, hence the signed type.
  int32_t lo = 0;
  int32_t hi = arr_length - 1;
  while (lo < hi) {
    const uint32_t mid = (((uint32_t)lo) + ((uint32_t)hi)) / 2;
    if (uii > sorted_uint32_arr[mid]) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  return (uii > sorted_uint32_arr[(uint32_t)lo])? (lo + 1) : lo;
}
5542
// Binary search: returns the number of elements in sorted_int32_arr that are
// strictly less than ii.  Assumes a nonempty, nondecreasing array.
uint32_t int32arr_greater_than(const int32_t* sorted_int32_arr, uint32_t arr_length, int32_t ii) {
  int32_t lo = 0;
  int32_t hi = arr_length - 1;  // can transiently reach -1
  while (lo < hi) {
    const uint32_t mid = (((uint32_t)lo) + ((uint32_t)hi)) / 2;
    if (ii > sorted_int32_arr[mid]) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  return (ii > sorted_int32_arr[(uint32_t)lo])? (lo + 1) : lo;
}
5561
// Binary search: returns the number of elements in sorted_uint64_arr that
// are strictly less than ullii.  Assumes a nonempty, nondecreasing array.
uintptr_t uint64arr_greater_than(const uint64_t* sorted_uint64_arr, uintptr_t arr_length, uint64_t ullii) {
  intptr_t lo = 0;
  intptr_t hi = arr_length - 1;  // can transiently reach -1
  while (lo < hi) {
    const uintptr_t mid = (((uintptr_t)lo) + ((uintptr_t)hi)) / 2;
    if (ullii > sorted_uint64_arr[mid]) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  return (ullii > sorted_uint64_arr[(uintptr_t)lo])? (lo + 1) : lo;
}
5580
uintptr_t doublearr_greater_than(const double* sorted_dbl_arr, uintptr_t arr_length, double dxx) {
  // returns number of items in sorted_dbl_arr which dxx is greater than.
  // assumes array is nonempty and sorted in nondecreasing order
  intptr_t lo = 0;
  // signed, since the upper bound can transiently drop to -1
  intptr_t hi = arr_length - 1;
  while (lo < hi) {
    const uintptr_t mid = (((uintptr_t)lo) + ((uintptr_t)hi)) / 2;
    if (sorted_dbl_arr[mid] < dxx) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  // lo may still point at an element dxx exceeds; adjust for that case
  return (sorted_dbl_arr[((uintptr_t)lo)] < dxx)? (lo + 1) : lo;
}
5601
uintptr_t nonincr_doublearr_leq_stride(const double* nonincr_dbl_arr, uintptr_t arr_length, uintptr_t stride, double dxx) {
  // Counts the leading elements which dxx is less than or equal to.  The
  // relevant elements are assumed to be sorted in NONINCREASING order
  // instead, spaced stride doubles apart; array assumed nonempty.
  intptr_t lo = 0;
  // signed, since the upper bound can transiently drop to -1
  intptr_t hi = arr_length - 1;
  while (lo < hi) {
    const uintptr_t mid = (((uintptr_t)lo) + ((uintptr_t)hi)) / 2;
    if (dxx <= nonincr_dbl_arr[mid * stride]) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  // lo may still point at an element satisfying the bound; adjust for that
  return (dxx <= nonincr_dbl_arr[((uintptr_t)lo) * stride])? (lo + 1) : lo;
}
5622
int32_t bsearch_str(const char* id_buf, uintptr_t cur_id_len, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
  // Binary-searches the packed, sorted string array lptr[] (entries spaced
  // max_id_len bytes apart, null-terminated) for id_buf; returns the matching
  // index, or -1 if absent.
  // does not assume null-terminated id_buf, or nonempty array.
  // N.B. max_id_len includes null terminator as usual, while cur_id_len does
  // NOT.
  uintptr_t lo = 0;
  if (cur_id_len >= max_id_len) {
    // query is longer than anything the table can hold
    return -1;
  }
  while (lo < end_idx) {
    const uintptr_t mid = (lo + end_idx) / 2;
    const char* entry = &(lptr[mid * max_id_len]);
    const int32_t cmp = memcmp(id_buf, entry, cur_id_len);
    if (cmp > 0) {
      lo = mid + 1;
    } else if ((cmp < 0) || entry[cur_id_len]) {
      // either entry > query, or query is a strict prefix of a longer entry
      end_idx = mid;
    } else {
      return ((uint32_t)mid);
    }
  }
  return -1;
}
5646
int32_t bsearch_str_natural(const char* id_buf, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
  // Binary search over the packed string array lptr[] using "natural" string
  // ordering (strcmp_natural); returns the matching index, or -1 if absent.
  // unlike bsearch_str(), caller is responsible for slen > max_id_len check
  // if appropriate here
  uintptr_t lo = 0;
  while (lo < end_idx) {
    const uintptr_t mid = (lo + end_idx) / 2;
    const int32_t cmp = strcmp_natural(id_buf, &(lptr[mid * max_id_len]));
    if (cmp > 0) {
      lo = mid + 1;
    } else if (cmp < 0) {
      end_idx = mid;
    } else {
      return ((uint32_t)mid);
    }
  }
  return -1;
}
5666
uintptr_t bsearch_str_lb(const char* id_buf, uintptr_t cur_id_len, const char* lptr, uintptr_t max_id_len, uintptr_t end_idx) {
  // returns number of elements in lptr[] less than id_buf.
  // (lower-bound search: only the first cur_id_len bytes of id_buf are
  // compared)
  uintptr_t lo = 0;
  if (cur_id_len > max_id_len) {
    // never read past an entry's allotted space
    cur_id_len = max_id_len;
  }
  while (lo < end_idx) {
    const uintptr_t mid = (lo + end_idx) / 2;
    if (memcmp(id_buf, &(lptr[mid * max_id_len]), cur_id_len) > 0) {
      lo = mid + 1;
    } else {
      end_idx = mid;
    }
  }
  return lo;
}
5684
uint32_t bsearch_read_fam_indiv(char* __restrict read_ptr, const char* __restrict lptr, uintptr_t max_id_len, uintptr_t filter_line_ct, char** read_pp_new, int32_t* retval_ptr, char* __restrict id_buf) {
  // Parses "FID IID" starting at read_ptr, assembles "FID\tIID" in the id_buf
  // workspace, and looks it up in lptr (packed, sorted list of ID strings).
  // FID is terminated by any space/eoln character; IID is assumed to follow
  // and is also terminated by any space/eoln.
  // Returns nonzero iff no IID token is present on the line; otherwise
  // returns 0 with *retval_ptr set to the match index (or -1 for no match),
  // and *read_pp_new (if non-null) set to the position after the IID.
  const uintptr_t slen_fid = strlen_se(read_ptr);
  char* iid_start = skip_initial_spaces(&(read_ptr[slen_fid]));
  if (is_eoln_kns(*iid_start)) {
    // line ended before an IID token appeared
    return 1;
  }
  const uintptr_t slen_iid = strlen_se(iid_start);
  if (read_pp_new) {
    *read_pp_new = skip_initial_spaces(&(iid_start[slen_iid]));
  }
  const uintptr_t slen_final = slen_fid + slen_iid + 1;
  if (slen_final >= max_id_len) {
    // too long to match anything in the table; also avoids buffer overflow
    *retval_ptr = -1;
    return 0;
  }
  char* write_iter = memcpyax(id_buf, read_ptr, slen_fid, '\t');
  // error message bugfix: null-terminate this string
  memcpyx(write_iter, iid_start, slen_iid, '\0');
  *retval_ptr = bsearch_str(id_buf, slen_final, lptr, max_id_len, filter_line_ct);
  return 0;
}
5716
void bsearch_fam(const char* __restrict fam_id, const char* __restrict lptr, uintptr_t max_id_len, uint32_t filter_line_ct, uint32_t* __restrict first_idx_ptr, uint32_t* __restrict last_idx_ptr, char* __restrict id_buf) {
  // Locates the half-open index range [*first_idx_ptr, *last_idx_ptr) of
  // entries in the packed, sorted array lptr whose FID component equals
  // fam_id (entries are "FID\t..." strings).  Both outputs are set to zero
  // when there is no match.  id_buf is scratch space.
  *first_idx_ptr = 0;
  *last_idx_ptr = 0;
  if (!filter_line_ct) {
    return;
  }
  const uint32_t slen = strlen_se(fam_id);
  if (slen + 3 > max_id_len) {
    // "FID\tX\0" cannot fit; no entry can match
    return;
  }
  memcpy(id_buf, fam_id, slen);
  // '\t' sorts before any visible IID character, so this probe finds the
  // first entry with the target FID
  id_buf[slen] = '\t';
  const uint32_t range_start = bsearch_str_lb(id_buf, slen + 1, lptr, max_id_len, filter_line_ct);
  if (range_start == filter_line_ct) {
    return;
  }
  // ' ' (0x20) sorts after '\t' but before visible characters, so this probe
  // counts exactly the entries sharing the target FID
  id_buf[slen] = ' ';
  const uint32_t range_len = bsearch_str_lb(id_buf, slen + 1, &(lptr[range_start * max_id_len]), max_id_len, filter_line_ct - range_start);
  if (range_len) {
    *first_idx_ptr = range_start;
    *last_idx_ptr = range_start + range_len;
  }
}
5746
bitarr_invert(uintptr_t bit_ct,uintptr_t * bitarr)5747 void bitarr_invert(uintptr_t bit_ct, uintptr_t* bitarr) {
5748 uintptr_t* bitarr_stop = &(bitarr[bit_ct / BITCT]);
5749 while (bitarr < bitarr_stop) {
5750 *bitarr = ~(*bitarr);
5751 bitarr++;
5752 }
5753 if (bit_ct % BITCT) {
5754 *bitarr = (~(*bitarr)) & ((ONELU << (bit_ct % BITCT)) - ONELU);
5755 }
5756 }
5757
bitarr_invert_copy(const uintptr_t * input_bitarr,uintptr_t bit_ct,uintptr_t * output_bitarr)5758 void bitarr_invert_copy(const uintptr_t* input_bitarr, uintptr_t bit_ct, uintptr_t* output_bitarr) {
5759 const uintptr_t* input_stop = &(input_bitarr[bit_ct / BITCT]);
5760 while (input_bitarr < input_stop) {
5761 *output_bitarr++ = ~(*input_bitarr++);
5762 }
5763 if (bit_ct % BITCT) {
5764 *output_bitarr = (~(*input_bitarr)) & ((ONELU << (bit_ct % BITCT)) - ONELU);
5765 }
5766 }
5767
void bitvec_and(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := main_bitvec AND arg_bitvec
#ifdef __LP64__
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* arg_vec = (const __m128i*)arg_bitvec;
  const __m128i* main_vec_end = &(main_vec[word_ct / 2]);
  for (; main_vec < main_vec_end; main_vec++) {
    *main_vec = _mm_and_si128(*arg_vec++, *main_vec);
  }
  if (word_ct & 1) {
    // odd trailing word, handled with scalar ops
    const uintptr_t last_widx = word_ct - 1;
    main_bitvec[last_widx] &= arg_bitvec[last_widx];
  }
#else
  const uintptr_t* main_end = &(main_bitvec[word_ct]);
  while (main_bitvec < main_end) {
    *main_bitvec++ &= *arg_bitvec++;
  }
#endif
}
5789
void bitvec_andnot(const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := main_bitvec ANDNOT exclude_bitvec
  // note that this is the reverse of the _mm_andnot() operand order
#ifdef __LP64__
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* excl_vec = (const __m128i*)exclude_bitvec;
  const __m128i* main_vec_end = &(main_vec[word_ct / 2]);
  for (; main_vec < main_vec_end; main_vec++) {
    // _mm_andnot_si128(a, b) computes (~a) & b
    *main_vec = _mm_andnot_si128(*excl_vec++, *main_vec);
  }
  if (word_ct & 1) {
    // odd trailing word, handled with scalar ops
    const uintptr_t last_widx = word_ct - 1;
    main_bitvec[last_widx] &= ~(exclude_bitvec[last_widx]);
  }
#else
  const uintptr_t* main_end = &(main_bitvec[word_ct]);
  while (main_bitvec < main_end) {
    *main_bitvec++ &= ~(*exclude_bitvec++);
  }
#endif
}
5812
void bitvec_andnot_reversed_args(const uintptr_t* __restrict include_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := (~main_bitvec) AND include_bitvec
#ifdef __LP64__
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* incl_vec = (const __m128i*)include_bitvec;
  const __m128i* main_vec_end = &(main_vec[word_ct / 2]);
  for (; main_vec < main_vec_end; main_vec++) {
    // _mm_andnot_si128(a, b) computes (~a) & b, which is exactly the order
    // we want here
    *main_vec = _mm_andnot_si128(*main_vec, *incl_vec++);
  }
  if (word_ct & 1) {
    // odd trailing word, handled with scalar ops
    const uintptr_t last_widx = word_ct - 1;
    main_bitvec[last_widx] = (~main_bitvec[last_widx]) & include_bitvec[last_widx];
  }
#else
  const uintptr_t* main_end = &(main_bitvec[word_ct]);
  for (; main_bitvec < main_end; main_bitvec++) {
    *main_bitvec = (~(*main_bitvec)) & (*include_bitvec++);
  }
#endif
}
5835
void bitvec_or(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec) {
  // main_bitvec := main_bitvec OR arg_bitvec
#ifdef __LP64__
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* arg_vec = (const __m128i*)arg_bitvec;
  const __m128i* main_vec_end = &(main_vec[word_ct / 2]);
  for (; main_vec < main_vec_end; main_vec++) {
    *main_vec = _mm_or_si128(*arg_vec++, *main_vec);
  }
  if (word_ct & 1) {
    // odd trailing word, handled with scalar ops
    const uintptr_t last_widx = word_ct - 1;
    main_bitvec[last_widx] |= arg_bitvec[last_widx];
  }
#else
  const uintptr_t* main_end = &(main_bitvec[word_ct]);
  while (main_bitvec < main_end) {
    *main_bitvec++ |= *arg_bitvec++;
  }
#endif
}
5857
void bitvec_ornot(const uintptr_t* __restrict inverted_or_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := main_bitvec OR (~inverted_or_bitvec)
#ifdef __LP64__
#ifdef __APPLE__
  const __m128i all1 = {0xffffffffffffffffLLU, 0xffffffffffffffffLLU};
#else
  const __m128i all1 = {-1LL, -1LL};
#endif
  __m128i* main_vec = (__m128i*)main_bitvec;
  const __m128i* inv_vec = (const __m128i*)inverted_or_bitvec;
  const __m128i* main_vec_end = &(main_vec[word_ct / 2]);
  for (; main_vec < main_vec_end; main_vec++) {
    // x | ~y computed as x | (y XOR all-ones), since SSE2 has no vector NOT
    *main_vec = _mm_or_si128(_mm_xor_si128(*inv_vec++, all1), *main_vec);
  }
  if (word_ct & 1) {
    // odd trailing word, handled with scalar ops
    const uintptr_t last_widx = word_ct - 1;
    main_bitvec[last_widx] |= ~(inverted_or_bitvec[last_widx]);
  }
#else
  const uintptr_t* main_end = &(main_bitvec[word_ct]);
  while (main_bitvec < main_end) {
    *main_bitvec++ |= ~(*inverted_or_bitvec++);
  }
#endif
}
5884
void bitvec_xor(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec) {
  // main_bitvec := main_bitvec XOR arg_bitvec
#ifdef __LP64__
  __m128i* bitv128 = (__m128i*)main_bitvec;
  // const-correctness fix: the cast previously discarded the const qualifier
  // of arg_bitvec; sibling functions (bitvec_and() etc.) keep it.
  const __m128i* xorv128 = (const __m128i*)arg_bitvec;
  __m128i* bitv128_end = &(bitv128[word_ct / 2]);
  while (bitv128 < bitv128_end) {
    *bitv128 = _mm_xor_si128(*xorv128++, *bitv128);
    bitv128++;
  }
  if (word_ct & 1) {
    // odd trailing word, handled with scalar ops
    word_ct--;
    main_bitvec[word_ct] ^= arg_bitvec[word_ct];
  }
#else
  uintptr_t* main_bitvec_end = &(main_bitvec[word_ct]);
  while (main_bitvec < main_bitvec_end) {
    *main_bitvec++ ^= *arg_bitvec++;
  }
#endif
}
5906
is_monomorphic_a2(const uintptr_t * geno_arr,uint32_t sample_ct)5907 uint32_t is_monomorphic_a2(const uintptr_t* geno_arr, uint32_t sample_ct) {
5908 const uintptr_t* loop_end = &(geno_arr[sample_ct / BITCT2]);
5909 uint32_t sample_rem = sample_ct % BITCT2;
5910 for (; geno_arr < loop_end; geno_arr++) {
5911 if ((~(*geno_arr)) & FIVEMASK) {
5912 return 0;
5913 }
5914 }
5915 return (sample_rem && ((~(*geno_arr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
5916 }
5917
uint32_t is_monomorphic(const uintptr_t* geno_arr, uint32_t sample_ct) {
  // Returns 1 iff at most one distinct non-missing genotype class appears
  // among the first sample_ct 2-bit entries of geno_arr, 0 otherwise.
  // Scanning short-circuits as soon as polymorphism is proven.
  // (2-bit entries: 00 = hom minor, 10 = het, 11 = hom major/A2, with 01
  // presumably the missing code under the usual PLINK encoding -- the
  // branches below ignore 01 entries.)
  uint32_t sample_ctd2 = sample_ct / BITCT2; // full words to scan
  uint32_t sample_rem = sample_ct % BITCT2;  // genotypes in trailing partial word
  uintptr_t ulii;
  uintptr_t uljj;
  while (sample_ctd2) {
    ulii = *geno_arr++;
    uljj = (ulii >> 1) & FIVEMASK;
    ulii = ~ulii;
    // now ulii & FIVEMASK = low bit zero, uljj = high bit one
    if (uljj) {
      if (uljj & ulii) {
        // heterozygote observed
        return 0;
      }
      // homozyg A2 observed
      while (1) {
        // 00 and 10 now both demonstrate marker is polymorphic
        if (ulii & FIVEMASK) {
          return 0;
        }
        if (!(--sample_ctd2)) {
          // trailing partial word: mask restricts the check to the first
          // sample_rem genotype slots
          return (sample_rem && ((~(*geno_arr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
        }
        ulii = ~(*geno_arr++);
      }
    } else if (ulii & FIVEMASK) {
      // no high bits in this word, but a 00 entry is present; any later
      // high bit (AAAAMASK position) proves polymorphism
      do {
        if (!(--sample_ctd2)) {
          return (sample_rem && ((*geno_arr) & (AAAAMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
        }
        ulii = *geno_arr++;
      } while (!(ulii & AAAAMASK));
      return 0;
    }
    sample_ctd2--;
  }
  if (sample_rem) {
    // nothing but 01-entries seen so far; inspect the trailing partial word
    ulii = *geno_arr;
    uljj = (ulii >> 1) & FIVEMASK;
    ulii = ~ulii;
    // first clause: heterozygote present (unmasked -- presumably relies on
    // trailing padding bits being zero; verify against callers).
    // second clause: both a hom A2 (uljj) and a 00 entry within range.
    if ((uljj & ulii) || (uljj && (ulii & (~uljj) & (FIVEMASK >> (BITCT - sample_rem * 2))))) {
      return 0;
    }
  }
  return 1;
}
5965
uint32_t less_than_two_genotypes(const uintptr_t* geno_arr, uint32_t sample_ct) {
  // Returns 1 iff fewer than two distinct genotype classes (00, 10, 11; 01
  // entries are ignored -- presumably the missing code) appear among the
  // first sample_ct 2-bit entries of geno_arr, 0 otherwise.  Short-circuits
  // as soon as a second class is observed.
  uint32_t sample_ctd2 = sample_ct / BITCT2; // full words to scan
  uint32_t sample_rem = sample_ct % BITCT2;  // genotypes in trailing partial word
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  uint32_t distinct_genotype_ct;
  while (sample_ctd2) {
    ulii = *geno_arr++;
    uljj = (ulii >> 1) & FIVEMASK; // high bit of each pair
    ulkk = ~ulii;                  // ulkk & FIVEMASK = low bit zero
    if (uljj) {
      if (uljj & ulii) {
        // homozygote major observed; either 00 or 10 now demonstrate marker
        // is polymorphic
        while (1) {
          if (ulkk & FIVEMASK) {
            return 0;
          }
          if (!(--sample_ctd2)) {
            // trailing partial word, masked to the first sample_rem slots
            return (sample_rem && ((~(*geno_arr)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
          }
          ulkk = ~(*geno_arr++);
        }
      } else {
        // heterozygote observed; either 00 or 11 now means we have 2+
        // genotypes
        while (1) {
          ulii = ~(*geno_arr++);
          if (!(--sample_ctd2)) {
            return (sample_rem && (((~ulii) ^ (ulii >> 1)) & (FIVEMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
          }
          if (((~ulii) ^ (ulii >> 1)) & FIVEMASK) {
            return 0;
          }
        }
      }
    } else if (ulkk & FIVEMASK) {
      // homozygous minor observed; either 10 or 11 now demonstrate marker is
      // polymorphic
      do {
        if (!(--sample_ctd2)) {
          return (sample_rem && ((*geno_arr) & (AAAAMASK >> (BITCT - sample_rem * 2))))? 0 : 1;
        }
        ulii = *geno_arr++;
      } while (!(ulii & AAAAMASK));
      return 0;
    }
    sample_ctd2--;
  }
  if (sample_rem) {
    // nothing decisive in the full words; classify the trailing partial word
    // directly by counting which classes are present
    ulii = *geno_arr;
    uljj = (ulii >> 1) & FIVEMASK;
    ulkk = ~ulii;
    // homozygous minor present?
    distinct_genotype_ct = (ulkk & (~uljj) & (FIVEMASK >> (BITCT - sample_rem * 2)))? 1 : 0;
    // heterozygous present?  (unmasked -- presumably relies on trailing
    // padding bits being zero; verify against callers)
    distinct_genotype_ct += (uljj & ulkk)? 1 : 0;
    // homozygous major present?
    distinct_genotype_ct += (uljj & ulii)? 1 : 0;
    if (distinct_genotype_ct > 1) {
      return 0;
    }
  }
  return 1;
}
6032
6033 /*
6034 uint32_t has_three_genotypes(uintptr_t* lptr, uint32_t sample_ct) {
6035 uintptr_t* lptr_end = &(lptr[sample_ct / BITCT2]);
6036 uint32_t sample_rem = sample_ct % BITCT2;
6037 uintptr_t* cur_lptr;
6038 uintptr_t ulii;
6039 uintptr_t uljj;
6040 cur_lptr = lptr;
6041 while (1) {
6042 ulii = ~(*cur_lptr);
6043 uljj = ulii & (ulii >> 1) & FIVEMASK;
6044 if (cur_lptr == lptr_end) {
6045 if ((!sample_rem) || (!(uljj << (BITCT - sample_rem * 2)))) {
6046 return 0;
6047 }
6048 break;
6049 }
6050 if (uljj) {
6051 // found hom A1
6052 break;
6053 }
6054 cur_lptr++;
6055 }
6056 cur_lptr = lptr;
6057 // zero-padding is benign for het and hom A2 checks
6058 lptr_end = &(lptr[QUATERCT_TO_WORDCT(sample_ct)]);
6059 while (1) {
6060 ulii = *cur_lptr;
6061 uljj = (ulii >> 1) & FIVEMASK;
6062 if ((~ulii) & uljj) {
6063 break;
6064 }
6065 if (++cur_lptr == lptr_end) {
6066 return 0;
6067 }
6068 }
6069 cur_lptr = lptr;
6070 do {
6071 ulii = *cur_lptr;
6072 uljj = (ulii >> 1) & FIVEMASK;
6073 if (ulii & uljj) {
6074 return 1;
6075 }
6076 } while (++cur_lptr < lptr_end);
6077 return 0;
6078 }
6079 */
6080
6081 #ifdef __LP64__
6082 // Basic SSE2 implementation of Lauradoux/Walisch popcount.
static inline uintptr_t popcount_vecs(const __m128i* vptr, uintptr_t ct) {
  // popcounts vptr[0..(ct-1)]. Assumes ct is a multiple of 3 (0 ok).
  //
  // Lauradoux/Walisch-style popcount: triples of vectors are merged into
  // 2-bit partial sums, folded into 4-bit then 8-bit fields, and accumulated;
  // the 8-bit accumulator fields are flushed before they can overflow.
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend;
  __m128i count1;
  __m128i count2;
  __m128i half1;
  __m128i half2;
  __univec acc;

  while (ct >= 30) {
    // batches of 30 vectors: each inner iteration adds at most 24 (3 vectors
    // x 128 bits / 16 bytes) to each 8-bit accumulator field, so the ten
    // iterations stay below 256.
    ct -= 30;
    vend = &(vptr[30]);
  popcount_vecs_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      count1 = *vptr++;
      count2 = *vptr++;
      half1 = *vptr++;
      // split the third vector's odd/even bits between the other two
      half2 = _mm_and_si128(_mm_srli_epi64(half1, 1), m1);
      half1 = _mm_and_si128(half1, m1);
      // Two bits can represent values from 0-3, so make each pair in count1
      // count2 store a partial bitcount covering themselves AND another bit
      // from elsewhere.
      count1 = _mm_sub_epi64(count1, _mm_and_si128(_mm_srli_epi64(count1, 1), m1));
      count2 = _mm_sub_epi64(count2, _mm_and_si128(_mm_srli_epi64(count2, 1), m1));
      count1 = _mm_add_epi64(count1, half1);
      count2 = _mm_add_epi64(count2, half2);
      // Four bits represent 0-15, so we can safely add four 0-3 partial
      // bitcounts together.
      count1 = _mm_add_epi64(_mm_and_si128(count1, m2), _mm_and_si128(_mm_srli_epi64(count1, 2), m2));
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(count2, m2), _mm_and_si128(_mm_srli_epi64(count2, 2), m2)));
      // Accumulator stores sixteen 0-255 counts in parallel.
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
    } while (vptr < vend);
    // fold the byte counts to 16-bit counts, then sum them with the
    // multiply-and-shift trick
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (ct) {
    // final partial batch (a multiple of 3, less than 30): re-enter the main
    // loop body once with the shortened bound
    vend = &(vptr[ct]);
    ct = 0;
    goto popcount_vecs_main_loop;
  }
  return tot;
}
6132
static inline uintptr_t popcount2_vecs(const __m128i* vptr, uintptr_t ct) {
  // assumes ct is a multiple of 6.
  //
  // Analogous to popcount_vecs(), but treats the input as packed 2-bit
  // numbers and sums those fields instead of counting set bits.
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend;
  __m128i loader1;
  __m128i loader2;
  __m128i count1;
  __m128i count2;
  __univec acc;

  while (ct >= 30) {
    // batches of 30 vectors keep the 8-bit accumulator fields from
    // overflowing before the flush below
    ct -= 30;
    vend = &(vptr[30]);
  popcount2_vecs_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      // fold each pair of vectors' 2-bit fields into 4-bit partial sums
      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(_mm_and_si128(loader1, m2), _mm_and_si128(_mm_srli_epi64(loader1, 2), m2));
      count2 = _mm_add_epi64(_mm_and_si128(loader2, m2), _mm_and_si128(_mm_srli_epi64(loader2, 2), m2));

      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(loader1, m2), _mm_and_si128(_mm_srli_epi64(loader1, 2), m2)));
      count2 = _mm_add_epi64(count2, _mm_add_epi64(_mm_and_si128(loader2, m2), _mm_and_si128(_mm_srli_epi64(loader2, 2), m2)));

      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(loader1, m2), _mm_and_si128(_mm_srli_epi64(loader1, 2), m2)));
      count2 = _mm_add_epi64(count2, _mm_add_epi64(_mm_and_si128(loader2, m2), _mm_and_si128(_mm_srli_epi64(loader2, 2), m2)));

      // fold the 4-bit sums into the byte-wise accumulator
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count2, m4), _mm_and_si128(_mm_srli_epi64(count2, 4), m4)));
    } while (vptr < vend);
    // fold the byte counts to 16-bit counts, then sum them with the
    // multiply-and-shift trick
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (ct) {
    // final partial batch (a multiple of 6, less than 30)
    vend = &(vptr[ct]);
    ct = 0;
    goto popcount2_vecs_main_loop;
  }
  return tot;
}
6180
static inline uintptr_t popcount_vecs_exclude(const __m128i* __restrict vptr, const __m128i* __restrict exclude_ptr, uintptr_t ct) {
  // popcounts vptr ANDNOT exclude_ptr[0..(ct-1)]. ct is a multiple of 3.
  //
  // Identical accumulation scheme to popcount_vecs(); the only difference is
  // that each loaded vector is first masked with ~exclude_ptr[i].
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend;
  __m128i count1, count2, half1, half2;
  __univec acc;

  while (ct >= 30) {
    // batches of 30 vectors keep the 8-bit accumulator fields below 256
    ct -= 30;
    vend = &(vptr[30]);
  popcount_vecs_exclude_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      // nots the FIRST value
      count1 = _mm_andnot_si128(*exclude_ptr++, *vptr++);
      count2 = _mm_andnot_si128(*exclude_ptr++, *vptr++);
      half1 = _mm_andnot_si128(*exclude_ptr++, *vptr++);
      half2 = _mm_and_si128(_mm_srli_epi64(half1, 1), m1);
      half1 = _mm_and_si128(half1, m1);
      // 2-bit, then 4-bit, then 8-bit folds, as in popcount_vecs()
      count1 = _mm_sub_epi64(count1, _mm_and_si128(_mm_srli_epi64(count1, 1), m1));
      count2 = _mm_sub_epi64(count2, _mm_and_si128(_mm_srli_epi64(count2, 1), m1));
      count1 = _mm_add_epi64(count1, half1);
      count2 = _mm_add_epi64(count2, half2);
      count1 = _mm_add_epi64(_mm_and_si128(count1, m2), _mm_and_si128(_mm_srli_epi64(count1, 2), m2));
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(count2, m2), _mm_and_si128(_mm_srli_epi64(count2, 2), m2)));
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
    } while (vptr < vend);
    // fold byte counts to 16-bit counts, then sum via multiply-and-shift
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (ct) {
    // final partial batch (a multiple of 3, less than 30)
    vend = &(vptr[ct]);
    ct = 0;
    goto popcount_vecs_exclude_main_loop;
  }
  return tot;
}
6222
static inline uintptr_t popcount_vecs_intersect(const __m128i* __restrict vptr1, const __m128i* __restrict vptr2, uintptr_t ct) {
  // popcounts vptr1 AND vptr2[0..(ct-1)]. ct is a multiple of 3.
  //
  // Identical accumulation scheme to popcount_vecs(); each loaded vector is
  // the intersection vptr1[i] & vptr2[i].
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend1;
  __m128i count1, count2, half1, half2;
  __univec acc;

  while (ct >= 30) {
    // batches of 30 vectors keep the 8-bit accumulator fields below 256
    ct -= 30;
    vend1 = &(vptr1[30]);
  popcount_vecs_intersect_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      count1 = _mm_and_si128(*vptr2++, *vptr1++);
      count2 = _mm_and_si128(*vptr2++, *vptr1++);
      half1 = _mm_and_si128(*vptr2++, *vptr1++);
      half2 = _mm_and_si128(_mm_srli_epi64(half1, 1), m1);
      half1 = _mm_and_si128(half1, m1);
      // 2-bit, then 4-bit, then 8-bit folds, as in popcount_vecs()
      count1 = _mm_sub_epi64(count1, _mm_and_si128(_mm_srli_epi64(count1, 1), m1));
      count2 = _mm_sub_epi64(count2, _mm_and_si128(_mm_srli_epi64(count2, 1), m1));
      count1 = _mm_add_epi64(count1, half1);
      count2 = _mm_add_epi64(count2, half2);
      count1 = _mm_add_epi64(_mm_and_si128(count1, m2), _mm_and_si128(_mm_srli_epi64(count1, 2), m2));
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(count2, m2), _mm_and_si128(_mm_srli_epi64(count2, 2), m2)));
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
    } while (vptr1 < vend1);
    // fold byte counts to 16-bit counts, then sum via multiply-and-shift
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (ct) {
    // final partial batch (a multiple of 3, less than 30)
    vend1 = &(vptr1[ct]);
    ct = 0;
    goto popcount_vecs_intersect_main_loop;
  }
  return tot;
}
6263 #endif
6264
uintptr_t popcount_longs(const uintptr_t* lptr, uintptr_t word_ct) {
  // Efficiently popcounts lptr[0..(word_ct - 1)]. In the 64-bit case, lptr[]
  // must be 16-byte aligned.
  // The popcount_longs_nzbase() wrapper takes care of starting from a later
  // index.
  uintptr_t tot = 0;
  const uintptr_t* lptr_end = &(lptr[word_ct]);
#ifdef __LP64__
  // bulk of the work is done vectorized, in groups of 6 words (3 vectors);
  // the remaining 0-5 words are handled by the scalar loop at the bottom
  uintptr_t six_ct;
  const __m128i* vptr;
  vptr = (const __m128i*)lptr;
  six_ct = word_ct / 6;
  tot += popcount_vecs(vptr, six_ct * 3);
  lptr = &(lptr[six_ct * 6]);
#else
  // The humble 16-bit lookup table actually beats
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // on my development machine by a hair.
  // However, if we take the hint from Lauradoux/Walisch and postpone the
  // multiply and right shift, this is no longer true. Ah well.
  const uintptr_t* lptr_six_end;
  uintptr_t tmp_stor;
  uintptr_t loader;
  uintptr_t ulii;
  uintptr_t uljj;
  // scalar Lauradoux/Walisch: 6 words per iteration, with 2-bit, 4-bit, and
  // 8-bit folds mirroring the vector version
  lptr_six_end = &(lptr[word_ct - (word_ct % 6)]);
  while (lptr < lptr_six_end) {
    loader = *lptr++;
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = *lptr++;
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = *lptr++;
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor = (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    loader = *lptr++;
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = *lptr++;
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = *lptr++;
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor += (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    // Each 8-bit slot stores a number in 0..48. Multiplying by 0x01010101 is
    // equivalent to the left-shifts and adds we need to sum those four 8-bit
    // numbers in the high-order slot.
    tot += (tmp_stor * 0x01010101) >> 24;
  }
#endif
  // scalar popcount for the remaining words
  while (lptr < lptr_end) {
    tot += popcount_long(*lptr++);
  }
  return tot;
}
6325
uintptr_t popcount2_longs(const uintptr_t* lptr, uintptr_t word_ct) {
  // treats lptr[] as an array of two-bit instead of one-bit numbers
  // (i.e. returns the sum of all 2-bit fields)
  uintptr_t tot = 0;
  const uintptr_t* lptr_end = &(lptr[word_ct]);
#ifdef __LP64__
  // vectorized path processes groups of 12 words (6 vectors); remainder is
  // handled by the scalar loop at the bottom
  uintptr_t twelve_ct;
  const __m128i* vptr;
  vptr = (const __m128i*)lptr;
  twelve_ct = word_ct / 12;
  tot += popcount2_vecs(vptr, twelve_ct * 6);
  lptr = &(lptr[twelve_ct * 12]);
#else
  const uintptr_t* lptr_six_end;
  uintptr_t loader1;
  uintptr_t loader2;
  uintptr_t ulii;
  uintptr_t uljj;
  // scalar path: 6 words per iteration, folding 2-bit fields to 4-bit, then
  // 8-bit partial sums
  lptr_six_end = &(lptr[word_ct - (word_ct % 6)]);
  while (lptr < lptr_six_end) {
    loader1 = *lptr++;
    loader2 = *lptr++;
    ulii = (loader1 & 0x33333333) + ((loader1 >> 2) & 0x33333333);
    uljj = (loader2 & 0x33333333) + ((loader2 >> 2) & 0x33333333);
    loader1 = *lptr++;
    loader2 = *lptr++;
    ulii += (loader1 & 0x33333333) + ((loader1 >> 2) & 0x33333333);
    uljj += (loader2 & 0x33333333) + ((loader2 >> 2) & 0x33333333);
    loader1 = *lptr++;
    loader2 = *lptr++;
    ulii += (loader1 & 0x33333333) + ((loader1 >> 2) & 0x33333333);
    uljj += (loader2 & 0x33333333) + ((loader2 >> 2) & 0x33333333);
    ulii = (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);
    ulii += (uljj & 0x0f0f0f0f) + ((uljj >> 4) & 0x0f0f0f0f);

    // Each 8-bit slot stores a number in 0..48. Multiplying by 0x01010101 is
    // equivalent to the left-shifts and adds we need to sum those four 8-bit
    // numbers in the high-order slot.
    tot += (ulii * 0x01010101) >> 24;
  }
#endif
  // scalar sum for the remaining words
  while (lptr < lptr_end) {
    tot += popcount2_long(*lptr++);
  }
  return tot;
}
6371
popcount_bit_idx(const uintptr_t * lptr,uintptr_t start_idx,uintptr_t end_idx)6372 uintptr_t popcount_bit_idx(const uintptr_t* lptr, uintptr_t start_idx, uintptr_t end_idx) {
6373 uintptr_t start_idxl = start_idx / BITCT;
6374 uintptr_t start_idxlr = start_idx & (BITCT - 1);
6375 uintptr_t end_idxl = end_idx / BITCT;
6376 uintptr_t end_idxlr = end_idx & (BITCT - 1);
6377 uintptr_t ct = 0;
6378 if (start_idxl == end_idxl) {
6379 return popcount_long(lptr[start_idxl] & ((ONELU << end_idxlr) - (ONELU << start_idxlr)));
6380 }
6381 if (start_idxlr) {
6382 ct = popcount_long(lptr[start_idxl++] >> start_idxlr);
6383 }
6384 if (end_idxl > start_idxl) {
6385 ct += popcount_longs_nzbase(lptr, start_idxl, end_idxl);
6386 }
6387 if (end_idxlr) {
6388 ct += popcount_long(lptr[end_idxl] & ((ONELU << end_idxlr) - ONELU));
6389 }
6390 return ct;
6391 }
6392
uint32_t chrom_window_max(const uint32_t* marker_pos, const uintptr_t* marker_exclude, const Chrom_info* chrom_info_ptr, uint32_t chrom_idx, uint32_t ct_max, uint32_t bp_max, uint32_t cur_window_max) {
  // Returns the larger of cur_window_max and the maximum number of
  // non-excluded markers on chromosome chrom_idx that fit in any window
  // spanning at most bp_max base pairs, capped at ct_max.  Single two-pointer
  // sweep over the chromosome.
  //
  // okay, it's absurd to keep rewriting this from scratch, especially given
  // that makes it likely that some reimplementations suck (--indep{-pairwise}
  // version was O(n^2) instead of O(n); sure, it didn't really matter because
  // the main calculation was more expensive, but still, ugh).

  if (cur_window_max >= ct_max) {
    // already at the cap; nothing to compute
    return ct_max;
  }
  // assumes chrom_idx exists
  uint32_t chrom_fo_idx = chrom_info_ptr->chrom_idx_to_foidx[chrom_idx];
  uint32_t chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
  uint32_t marker_uidx = next_unset(marker_exclude, chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx], chrom_end);
  uint32_t marker_ct = chrom_end - marker_uidx - popcount_bit_idx(marker_exclude, marker_uidx, chrom_end);
  if (marker_ct <= cur_window_max) {
    // even the whole chromosome can't beat the current maximum
    return cur_window_max;
  }
  // trailing edge of the window (index among included markers, raw index,
  // and base-pair position)
  uint32_t window_idx_first = 0;
  uint32_t window_uidx_first = marker_uidx;
  uint32_t window_pos_first = marker_pos[marker_uidx];
  uint32_t marker_idx;
  uint32_t marker_pos_thresh;
  for (marker_idx = 0; marker_idx < marker_ct; marker_uidx++, marker_idx++) {
    next_unset_unsafe_ck(marker_exclude, &marker_uidx);
    // smallest trailing position still within bp_max of the current marker
    // (saturating at 0 to avoid unsigned underflow)
    marker_pos_thresh = marker_pos[marker_uidx];
    if (marker_pos_thresh < bp_max) {
      marker_pos_thresh = 0;
    } else {
      marker_pos_thresh -= bp_max;
    }
    if (marker_pos_thresh > window_pos_first) {
      // advance the trailing edge until the window fits in bp_max again
      do {
        window_uidx_first++;
        next_unset_unsafe_ck(marker_exclude, &window_uidx_first);
        window_pos_first = marker_pos[window_uidx_first];
        window_idx_first++;
      } while (marker_pos_thresh > window_pos_first);
    } else if (marker_idx - window_idx_first == cur_window_max) {
      // window grew past the previous record
      if (++cur_window_max == ct_max) {
        return cur_window_max;
      }
    }
  }
  return cur_window_max;
}
6438
uint32_t window_back(const uint32_t* __restrict marker_pos, const double* __restrict marker_cms, const uintptr_t* marker_exclude, uint32_t marker_uidx_min, uint32_t marker_uidx_start, uint32_t count_max, uint32_t bp_max, double cm_max, uint32_t* __restrict window_trail_ct_ptr) {
  // Finds the earliest location which is within count_max sites, bp_max bps,
  // and (if marker_cms != nullptr) cm_max centimorgans.
  // count_max must be positive.
  // Returns the uidx of the window start (never below marker_uidx_min), and
  // stores the number of non-excluded markers in [returned uidx,
  // marker_uidx_start) in *window_trail_ct_ptr.
  if (marker_uidx_min == marker_uidx_start) {
    // special-case this since it happens frequently
    *window_trail_ct_ptr = 0;
    return marker_uidx_min;
  }
  double min_cm = marker_cms? (marker_cms[marker_uidx_start] - cm_max) : 0.0;
  uint32_t min_pos = 0;
  uint32_t marker_uwidx_cur = marker_uidx_start / BITCT;
  uint32_t uii = marker_uidx_start % BITCT;
  uint32_t marker_uidx_last = marker_uidx_start;
  uint32_t remaining_count = count_max;
  const uintptr_t* marker_exclude_cur = &(marker_exclude[marker_uwidx_cur]);
  uintptr_t cur_word;
  uint32_t ujj;
  uint32_t ukk;
  // marker_uwidx_cur now holds the uidx of the first bit of the current word
  marker_uwidx_cur *= BITCT;
  if (bp_max <= marker_pos[marker_uidx_start]) {
    min_pos = marker_pos[marker_uidx_start] - bp_max;
  }
  if (!uii) {
    // marker_uidx_start is word-aligned; no partial leading word to mask
    goto window_back_zstart;
  }
  // non-excluded markers in the current word, below marker_uidx_start
  cur_word = (~(*marker_exclude_cur)) & ((ONELU << uii) - ONELU);
  while (1) {
    if (marker_uwidx_cur <= marker_uidx_min) {
      // reached the word containing marker_uidx_min: clamp the word to bits
      // at or above marker_uidx_min, then resolve via one of the three exits
      cur_word &= ~((ONELU << (marker_uidx_min % BITCT)) - ONELU);
      marker_uwidx_cur = marker_uidx_min;
      uii = popcount_long(cur_word);
      if (uii >= remaining_count) {
	goto window_back_count;
      } else if ((marker_pos[marker_uwidx_cur] < min_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] < min_cm))) {
	goto window_back_find_first_pos;
      } else {
	goto window_back_min;
      }
    }
    uii = popcount_long(cur_word);
    if (uii >= remaining_count) {
      // count_max limit is hit inside this word
    window_back_count:
      uii -= remaining_count; // now a count of number of bits to advance
      while (uii) {
	cur_word &= cur_word - 1;
	uii--;
      }
      // bugfix (7 May 2017): forgot to round marker_uwidx_cur down to word
      // boundary, before adding CTZLU(cur_word) offset
      marker_uwidx_cur = (marker_uwidx_cur & (~(BITCT - ONELU))) + CTZLU(cur_word);
      if ((marker_pos[marker_uwidx_cur] < min_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] < min_cm))) {
	// count-limited position violates the bp/cM bound; fall through to
	// the positional search instead
	goto window_back_find_first_pos;
      }
      *window_trail_ct_ptr = count_max;
      return marker_uwidx_cur;
    }
    if ((marker_pos[marker_uwidx_cur] < min_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] < min_cm))) {
      // bp/cM limit crossed somewhere in (marker_uwidx_cur, marker_uidx_last]
    window_back_find_first_pos:
      // binary-search for the first marker satisfying the bp (and, if
      // present, cM) constraints, then snap forward to a non-excluded marker
      ujj = uint32arr_greater_than(&(marker_pos[marker_uwidx_cur]), marker_uidx_last - marker_uwidx_cur, min_pos);
      if (marker_cms) {
	ukk = doublearr_greater_than(&(marker_cms[marker_uwidx_cur]), marker_uidx_last - marker_uwidx_cur, min_cm);
	if (ujj < ukk) {
	  ujj = ukk;
	}
      }
      marker_uwidx_cur += ujj;
      if (marker_uwidx_cur > marker_uidx_min) {
	next_unset_unsafe_ck(marker_exclude, &marker_uwidx_cur);
      }
    window_back_min:
      // trail count = non-excluded markers in [marker_uwidx_cur, start)
      *window_trail_ct_ptr = marker_uidx_start - marker_uwidx_cur - popcount_bit_idx(marker_exclude, marker_uwidx_cur, marker_uidx_start);
      return marker_uwidx_cur;
    }
    // consume this word and step back one full word
    remaining_count -= uii;
    marker_uidx_last = marker_uwidx_cur;
  window_back_zstart:
    cur_word = ~(*(--marker_exclude_cur));
    marker_uwidx_cur -= BITCT;
  }
}
6520
uint32_t window_forward(const uint32_t* __restrict marker_pos, const double* __restrict marker_cms, const uintptr_t* marker_exclude, uint32_t marker_uidx_start, uint32_t marker_uidx_last, uint32_t count_max, uint32_t bp_max, double cm_max, uint32_t* __restrict window_lead_ct_ptr) {
  // Mirror image of window_back(): finds the latest location (not past
  // marker_uidx_last) within count_max sites, bp_max bps, and (if
  // marker_cms != nullptr) cm_max centimorgans of marker_uidx_start, storing
  // the number of non-excluded markers between marker_uidx_start (exclusive)
  // and the returned uidx (inclusive) in *window_lead_ct_ptr.
  // window_lead_ct_ptr currently cannot be nullptr
  if (marker_uidx_start == marker_uidx_last) {
    *window_lead_ct_ptr = 0;
    return marker_uidx_start;
  }
  double max_cm = marker_cms? (cm_max + marker_cms[marker_uidx_start]) : 0.0;
  uint32_t marker_uwidx_prev = marker_uidx_start;
  uint32_t max_pos = bp_max + marker_pos[marker_uidx_start];
  uint32_t marker_uwidx_cur = (marker_uidx_start + 1) / BITCT;
  uint32_t uii = (marker_uidx_start + 1) % BITCT;
  uint32_t remaining_count = count_max;
  const uintptr_t* marker_exclude_cur = &(marker_exclude[marker_uwidx_cur]);
  uintptr_t cur_word;
  uint32_t ujj;
  uint32_t ukk;
  // marker_uwidx_cur now holds the uidx of the first bit of the current word
  marker_uwidx_cur *= BITCT;
  // non-excluded markers in the current word, above marker_uidx_start
  cur_word = ~((*marker_exclude_cur) | ((ONELU << uii) - ONELU));
  while (1) {
    uii = popcount_long(cur_word);
    if (uii >= remaining_count) {
      // count_max limit hit inside this word: clear all but the target bit
      while (--remaining_count) {
	cur_word &= cur_word - 1;
      }
      marker_uwidx_cur += CTZLU(cur_word);
      if (marker_uwidx_cur <= marker_uidx_last) {
	if ((marker_pos[marker_uwidx_cur] > max_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] > max_cm))) {
	  // count-limited position violates bp/cM bound; positional search
	  break;
	}
	*window_lead_ct_ptr = count_max;
	return marker_uwidx_cur;
      }
      // count limit lands past marker_uidx_last; clamp there if it satisfies
      // the bp/cM constraints
      if ((marker_pos[marker_uidx_last] <= max_pos) && ((!marker_cms) || (marker_cms[marker_uidx_last] <= max_cm))) {
	marker_uwidx_prev = marker_uidx_last;
	goto window_forward_return;
      }
      marker_uwidx_cur = marker_uidx_last;
      break;
    }
    marker_uwidx_cur += BITCT;
    if (marker_uwidx_cur >= marker_uidx_last) {
      // scanned up to marker_uidx_last without exhausting count_max
      if ((marker_pos[marker_uidx_last] <= max_pos) && ((!marker_cms) || (marker_cms[marker_uidx_last] <= max_cm))) {
	marker_uwidx_prev = marker_uidx_last;
	goto window_forward_return;
      }
      marker_uwidx_cur = marker_uidx_last;
      break;
    } else if ((marker_pos[marker_uwidx_cur] > max_pos) || (marker_cms && (marker_cms[marker_uwidx_cur] > max_cm))) {
      // bp/cM limit crossed inside (marker_uwidx_prev, marker_uwidx_cur]
      break;
    }
    // consume this word and advance one full word
    marker_uwidx_prev = marker_uwidx_cur;
    remaining_count -= uii;
    cur_word = ~(*(++marker_exclude_cur));
  }
  // binary-search the last marker satisfying the bp (and, if present, cM)
  // constraints, then snap back to a non-excluded marker
  ujj = uint32arr_greater_than(&(marker_pos[marker_uwidx_prev]), marker_uwidx_cur - marker_uwidx_prev, max_pos + 1);
  if (marker_cms) {
    ukk = doublearr_greater_than(&(marker_cms[marker_uwidx_prev]), marker_uwidx_cur - marker_uwidx_prev, max_cm * (1 + SMALL_EPSILON));
    if (ujj > ukk) {
      ujj = ukk;
    }
  }
  marker_uwidx_prev += ujj;
  prev_unset_unsafe_ck(marker_exclude, &marker_uwidx_prev);
 window_forward_return:
  // lead count = non-excluded markers in (marker_uidx_start, marker_uwidx_prev]
  *window_lead_ct_ptr = marker_uwidx_prev - marker_uidx_start - popcount_bit_idx(marker_exclude, marker_uidx_start, marker_uwidx_prev);
  return marker_uwidx_prev;
}
6588
uintptr_t jump_forward_unset_unsafe(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct) {
  // advances forward_ct unset bits; forward_ct must be positive. (stays put
  // if forward_ct == 1 and current bit is unset. may want to tweak this
  // interface, easy to introduce off-by-one bugs...)
  // In usual 64-bit case, also assumes bitvec is 16-byte aligned and the end
  // of the trailing 16-byte block can be safely read from.
  uintptr_t widx = cur_pos / BITCT;
  uintptr_t ulii = cur_pos % BITCT;
  const uintptr_t* bptr = &(bitvec[widx]);
  uintptr_t uljj;
  uintptr_t ulkk;
#ifdef __LP64__
  const __m128i* vptr;
#endif
  if (ulii) {
    // partial leading word: count unset bits at/above cur_pos
    uljj = (~(*bptr)) >> ulii;
    ulkk = popcount_long(uljj);
    if (ulkk >= forward_ct) {
      // target lies in this word; skip (forward_ct - 1) unset bits, then
      // CTZLU locates the final one
    jump_forward_unset_unsafe_finish:
      ulkk = CTZLU(uljj);
      while (--forward_ct) {
	uljj &= uljj - 1;
	ulkk = CTZLU(uljj);
      }
      // ulii is the bit offset the word was shifted by (0 except for the
      // initial partial word)
      return widx * BITCT + ulii + ulkk;
    }
    forward_ct -= ulkk;
    widx++;
    bptr++;
  }
  ulii = 0;
#ifdef __LP64__
  // consume one more word if needed so bptr is 16-byte aligned
  if (widx & 1) {
    uljj = ~(*bptr);
    ulkk = popcount_long(uljj);
    if (ulkk >= forward_ct) {
      goto jump_forward_unset_unsafe_finish;
    }
    forward_ct -= ulkk;
    bptr++;
  }
  vptr = (const __m128i*)bptr;
  // skip whole 3-vector groups; each group covers BITCT * 6 bits, and the
  // number of unset bits it contains is (bits covered - set bits)
  while (forward_ct > BITCT * 6) {
    uljj = ((forward_ct - 1) / (BITCT * 6)) * 3;
    ulkk = popcount_vecs(vptr, uljj);
    vptr = &(vptr[uljj]);
    forward_ct -= uljj * BITCT * 2 - ulkk;
  }
  bptr = (const uintptr_t*)vptr;
  while (forward_ct > BITCT) {
    forward_ct -= popcount_long(~(*bptr++));
  }
#else
  // 32-bit: skip whole words in bulk using set-bit popcount
  while (forward_ct > BITCT) {
    uljj = (forward_ct - 1) / BITCT;
    ulkk = popcount_longs(bptr, uljj);
    bptr = &(bptr[uljj]);
    forward_ct -= uljj * BITCT - ulkk;
  }
#endif
  // final word-by-word scan; forward_ct <= BITCT here so this terminates
  while (1) {
    uljj = ~(*bptr);
    ulkk = popcount_long(uljj);
    if (ulkk >= forward_ct) {
      widx = (uintptr_t)(bptr - bitvec);
      goto jump_forward_unset_unsafe_finish;
    }
    forward_ct -= ulkk;
    bptr++;
  }
}
6660
uintptr_t popcount_longs_exclude(const uintptr_t* __restrict lptr, const uintptr_t* __restrict exclude_arr, uintptr_t end_idx) {
  // popcounts lptr ANDNOT exclude_arr[0..(end_idx-1)].
  // N.B. on 64-bit systems, assumes lptr and exclude_arr are 16-byte aligned.
  uintptr_t tot = 0;
  const uintptr_t* lptr_end = &(lptr[end_idx]);
#ifdef __LP64__
  // vectorized main loop handles the largest multiple of 6 words
  uintptr_t six_ct = end_idx / 6;
  tot += popcount_vecs_exclude((const __m128i*)lptr, (const __m128i*)exclude_arr, six_ct * 3);
  lptr = &(lptr[six_ct * 6]);
  exclude_arr = &(exclude_arr[six_ct * 6]);
#else
  // 32-bit fallback: SWAR popcount, 6 words per pass
  const uintptr_t* lptr_six_end;
  uintptr_t tmp_stor;
  uintptr_t loader;
  uintptr_t ulii;
  uintptr_t uljj;
  lptr_six_end = &(lptr[end_idx - (end_idx % 6)]);
  while (lptr < lptr_six_end) {
    // words 1-3: fold three masked words into 2-bit partial counts
    // (ulii holds word1's pairs + word3's odd bits, uljj word2's + evens)
    loader = (*lptr++) & (~(*exclude_arr++));
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr++) & (~(*exclude_arr++));
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr++) & (~(*exclude_arr++));
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    // reduce to 4-bit, then 8-bit partial counts
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor = (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    // words 4-6: same reduction, added into tmp_stor
    loader = (*lptr++) & (~(*exclude_arr++));
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr++) & (~(*exclude_arr++));
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr++) & (~(*exclude_arr++));
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor += (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    // Each 8-bit slot stores a number in 0..48. Multiplying by 0x01010101 is
    // equivalent to the left-shifts and adds we need to sum those four 8-bit
    // numbers in the high-order slot.
    tot += (tmp_stor * 0x01010101) >> 24;
  }
#endif
  // remaining 0-5 words
  while (lptr < lptr_end) {
    tot += popcount_long((*lptr++) & (~(*exclude_arr++)));
  }
  return tot;
}
6712
uintptr_t popcount_longs_intersect(const uintptr_t* __restrict lptr1, const uintptr_t* __restrict lptr2, uintptr_t word_ct) {
  // popcounts lptr1 AND lptr2 over word_ct words.
  // On 64-bit systems, the __m128i casts imply lptr1/lptr2 are assumed to be
  // 16-byte aligned (same contract as popcount_longs_exclude).
  uintptr_t tot = 0;
  const uintptr_t* lptr1_end = &(lptr1[word_ct]);
#ifdef __LP64__
  // vectorized main loop handles the largest multiple of 6 words
  uintptr_t six_ct = word_ct / 6;
  tot += popcount_vecs_intersect((const __m128i*)lptr1, (const __m128i*)lptr2, six_ct * 3);
  lptr1 = &(lptr1[six_ct * 6]);
  lptr2 = &(lptr2[six_ct * 6]);
#else
  // 32-bit fallback: SWAR popcount, 6 words per pass
  const uintptr_t* lptr1_six_end;
  uintptr_t tmp_stor;
  uintptr_t loader;
  uintptr_t ulii;
  uintptr_t uljj;
  lptr1_six_end = &(lptr1[word_ct - (word_ct % 6)]);
  while (lptr1 < lptr1_six_end) {
    // words 1-3: fold three intersected words into 2-bit partial counts
    loader = (*lptr1++) & (*lptr2++);
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr1++) & (*lptr2++);
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr1++) & (*lptr2++);
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    // reduce to 4-bit, then 8-bit partial counts
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor = (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    // words 4-6: same reduction, added into tmp_stor
    loader = (*lptr1++) & (*lptr2++);
    ulii = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr1++) & (*lptr2++);
    uljj = loader - ((loader >> 1) & FIVEMASK);
    loader = (*lptr1++) & (*lptr2++);
    ulii += (loader >> 1) & FIVEMASK;
    uljj += loader & FIVEMASK;
    ulii = (ulii & 0x33333333) + ((ulii >> 2) & 0x33333333);
    ulii += (uljj & 0x33333333) + ((uljj >> 2) & 0x33333333);
    tmp_stor += (ulii & 0x0f0f0f0f) + ((ulii >> 4) & 0x0f0f0f0f);

    // Each 8-bit slot stores a number in 0..48. Multiplying by 0x01010101 is
    // equivalent to the left-shifts and adds we need to sum those four 8-bit
    // numbers in the high-order slot.
    tot += (tmp_stor * 0x01010101) >> 24;
  }
#endif
  // remaining 0-5 words
  while (lptr1 < lptr1_end) {
    tot += popcount_long((*lptr1++) & (*lptr2++));
  }
  return tot;
}
6762
void vertical_bitct_subtract(const uintptr_t* bitarr, uint32_t item_ct, uint32_t* sum_arr) {
  // For every set bit i in bitarr[0..(item_ct-1)], decrement sum_arr[i].
  // Assumes trailing bits (at index item_ct and above) are zeroed out.
  uint32_t word_base;
  for (word_base = 0; word_base < item_ct; word_base += BITCT) {
    uintptr_t remaining_bits = *bitarr++;
    while (remaining_bits) {
      const uint32_t bit_pos = CTZLU(remaining_bits);
      sum_arr[word_base + bit_pos] -= 1;
      // clear the lowest set bit
      remaining_bits &= remaining_bits - ONELU;
    }
  }
}
6777
6778 #ifdef __LP64__
void count_2freq_dbl_960b(const VECITYPE* geno_vvec, const VECITYPE* geno_vvec_end, const VECITYPE* __restrict mask1vp, const VECITYPE* __restrict mask2vp, uint32_t* __restrict ct1abp, uint32_t* __restrict ct1cp, uint32_t* __restrict ct2abp, uint32_t* __restrict ct2cp) {
  // Scans the 2-bit-entry vector [geno_vvec, geno_vvec_end) under two masks.
  // For each mask k in {1, 2}, among masked entries:
  //   *ct{k}abp += (# of set low bits) + (# of set high bits)
  //   *ct{k}cp  += (# of 2-bit fields equal to binary 01)
  // The loop consumes 3 vectors (48 bytes) per iteration, so the vector
  // count must be a multiple of 3; the "960b" in the name presumably refers
  // to the maximum bytes per call before the 8-bit accumulator lanes could
  // overflow -- TODO confirm against callers.
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  __m128i loader;
  __m128i loader2;
  __m128i loader3;
  __m128i to_ct1_ab;
  __m128i to_ct_abtmp;
  __m128i to_ct1_c;
  __m128i to_ct2_ab;
  __m128i to_ct2_c;
  __univec acc1_ab;
  __univec acc1_c;
  __univec acc2_ab;
  __univec acc2_c;

  acc1_ab.vi = _mm_setzero_si128();
  acc1_c.vi = _mm_setzero_si128();
  acc2_ab.vi = _mm_setzero_si128();
  acc2_c.vi = _mm_setzero_si128();
  do {
    // vector 1: loader3 = masked high bits (shifted down), loader2 = masked
    // low bits; ab = per-field (low + high), c = low set AND high clear (01)
    loader = *geno_vvec++;
    loader2 = *mask1vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct1_ab = _mm_add_epi64(loader3, loader2);
    to_ct1_c = _mm_andnot_si128(loader3, loader2);
    loader2 = *mask2vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct2_ab = _mm_add_epi64(loader3, loader2);
    to_ct2_c = _mm_andnot_si128(loader3, loader2);
    // reduce ab counts to 2-bit fields before adding the next vectors
    to_ct1_ab = _mm_add_epi64(_mm_and_si128(to_ct1_ab, m2), _mm_and_si128(_mm_srli_epi64(to_ct1_ab, 2), m2));
    to_ct2_ab = _mm_add_epi64(_mm_and_si128(to_ct2_ab, m2), _mm_and_si128(_mm_srli_epi64(to_ct2_ab, 2), m2));

    // vector 2
    loader = *geno_vvec++;
    loader2 = *mask1vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct_abtmp = _mm_add_epi64(loader3, loader2);
    to_ct1_c = _mm_add_epi64(to_ct1_c, _mm_andnot_si128(loader3, loader2));
    to_ct1_ab = _mm_add_epi64(to_ct1_ab, _mm_add_epi64(_mm_and_si128(to_ct_abtmp, m2), _mm_and_si128(_mm_srli_epi64(to_ct_abtmp, 2), m2)));
    loader2 = *mask2vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct_abtmp = _mm_add_epi64(loader3, loader2);
    to_ct2_c = _mm_add_epi64(to_ct2_c, _mm_andnot_si128(loader3, loader2));
    to_ct2_ab = _mm_add_epi64(to_ct2_ab, _mm_add_epi64(_mm_and_si128(to_ct_abtmp, m2), _mm_and_si128(_mm_srli_epi64(to_ct_abtmp, 2), m2)));

    // vector 3
    loader = *geno_vvec++;
    loader2 = *mask1vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct_abtmp = _mm_add_epi64(loader3, loader2);
    to_ct1_c = _mm_add_epi64(to_ct1_c, _mm_andnot_si128(loader3, loader2));
    to_ct1_ab = _mm_add_epi64(to_ct1_ab, _mm_add_epi64(_mm_and_si128(to_ct_abtmp, m2), _mm_and_si128(_mm_srli_epi64(to_ct_abtmp, 2), m2)));
    loader2 = *mask2vp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    loader2 = _mm_and_si128(loader2, loader);
    to_ct_abtmp = _mm_add_epi64(loader3, loader2);
    to_ct2_c = _mm_add_epi64(to_ct2_c, _mm_andnot_si128(loader3, loader2));
    to_ct2_ab = _mm_add_epi64(to_ct2_ab, _mm_add_epi64(_mm_and_si128(to_ct_abtmp, m2), _mm_and_si128(_mm_srli_epi64(to_ct_abtmp, 2), m2)));

    // c counts are single bits per field across 3 vectors (max 3), so one
    // 2-bit reduction suffices here
    to_ct1_c = _mm_add_epi64(_mm_and_si128(to_ct1_c, m2), _mm_and_si128(_mm_srli_epi64(to_ct1_c, 2), m2));
    to_ct2_c = _mm_add_epi64(_mm_and_si128(to_ct2_c, m2), _mm_and_si128(_mm_srli_epi64(to_ct2_c, 2), m2));

    // fold 2-bit partial counts into 8-bit accumulator lanes
    acc1_ab.vi = _mm_add_epi64(acc1_ab.vi, _mm_add_epi64(_mm_and_si128(to_ct1_ab, m4), _mm_and_si128(_mm_srli_epi64(to_ct1_ab, 4), m4)));
    acc1_c.vi = _mm_add_epi64(acc1_c.vi, _mm_add_epi64(_mm_and_si128(to_ct1_c, m4), _mm_and_si128(_mm_srli_epi64(to_ct1_c, 4), m4)));
    acc2_ab.vi = _mm_add_epi64(acc2_ab.vi, _mm_add_epi64(_mm_and_si128(to_ct2_ab, m4), _mm_and_si128(_mm_srli_epi64(to_ct2_ab, 4), m4)));
    acc2_c.vi = _mm_add_epi64(acc2_c.vi, _mm_add_epi64(_mm_and_si128(to_ct2_c, m4), _mm_and_si128(_mm_srli_epi64(to_ct2_c, 4), m4)));
  } while (geno_vvec < geno_vvec_end);
  // horizontal reduction: 8-bit lanes -> 16-bit lanes, then the multiply
  // trick sums the four 16-bit lanes into the high-order slot
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  acc1_ab.vi = _mm_add_epi64(_mm_and_si128(acc1_ab.vi, m8), _mm_and_si128(_mm_srli_epi64(acc1_ab.vi, 8), m8));
  acc1_c.vi = _mm_and_si128(_mm_add_epi64(acc1_c.vi, _mm_srli_epi64(acc1_c.vi, 8)), m8);
  acc2_ab.vi = _mm_add_epi64(_mm_and_si128(acc2_ab.vi, m8), _mm_and_si128(_mm_srli_epi64(acc2_ab.vi, 8), m8));
  acc2_c.vi = _mm_and_si128(_mm_add_epi64(acc2_c.vi, _mm_srli_epi64(acc2_c.vi, 8)), m8);
  *ct1abp += ((acc1_ab.u8[0] + acc1_ab.u8[1]) * 0x1000100010001LLU) >> 48;
  *ct1cp += ((acc1_c.u8[0] + acc1_c.u8[1]) * 0x1000100010001LLU) >> 48;
  *ct2abp += ((acc2_ab.u8[0] + acc2_ab.u8[1]) * 0x1000100010001LLU) >> 48;
  *ct2cp += ((acc2_c.u8[0] + acc2_c.u8[1]) * 0x1000100010001LLU) >> 48;
}
6860
void count_3freq_1920b(const VECITYPE* geno_vvec, const VECITYPE* geno_vvec_end, const VECITYPE* __restrict maskvp, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict homset_ctp) {
  // Scans the 2-bit-entry vector [geno_vvec, geno_vvec_end) under a mask,
  // accumulating among masked entries:
  //   *even_ctp   += # of set low (even-position) bits
  //   *odd_ctp    += # of set high (odd-position) bits
  //   *homset_ctp += # of 2-bit fields with both bits set (binary 11)
  // The loop consumes 6 vectors (96 bytes) per iteration, so the vector
  // count must be a multiple of 6; the "1920b" in the name presumably refers
  // to the maximum bytes per call before the 8-bit accumulator lanes could
  // overflow -- TODO confirm against callers.
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  __m128i loader;
  __m128i loader2;
  __m128i loader3;
  __m128i even1;
  __m128i odd1;
  __m128i homset1;
  __m128i even2;
  __m128i odd2;
  __m128i homset2;
  __univec acc_even;
  __univec acc_odd;
  __univec acc_homset;

  acc_even.vi = _mm_setzero_si128();
  acc_odd.vi = _mm_setzero_si128();
  acc_homset.vi = _mm_setzero_si128();
  do {
    // vectors 1-3: odd = masked high bits (shifted down), even = masked low
    // bits, homset = high AND low
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    odd1 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even1 = _mm_and_si128(loader2, loader);
    homset1 = _mm_and_si128(odd1, loader);
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even1 = _mm_add_epi64(even1, _mm_and_si128(loader2, loader));
    odd1 = _mm_add_epi64(odd1, loader3);
    homset1 = _mm_add_epi64(homset1, _mm_and_si128(loader3, loader));
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even1 = _mm_add_epi64(even1, _mm_and_si128(loader2, loader));
    odd1 = _mm_add_epi64(odd1, loader3);
    homset1 = _mm_add_epi64(homset1, _mm_and_si128(loader3, loader));

    // reduce first 3-vector group to 2-bit partial counts
    even1 = _mm_add_epi64(_mm_and_si128(even1, m2), _mm_and_si128(_mm_srli_epi64(even1, 2), m2));
    odd1 = _mm_add_epi64(_mm_and_si128(odd1, m2), _mm_and_si128(_mm_srli_epi64(odd1, 2), m2));
    homset1 = _mm_add_epi64(_mm_and_si128(homset1, m2), _mm_and_si128(_mm_srli_epi64(homset1, 2), m2));

    // vectors 4-6: same pattern into the *2 registers
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    odd2 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even2 = _mm_and_si128(loader2, loader);
    homset2 = _mm_and_si128(odd2, loader);
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even2 = _mm_add_epi64(even2, _mm_and_si128(loader2, loader));
    odd2 = _mm_add_epi64(odd2, loader3);
    homset2 = _mm_add_epi64(homset2, _mm_and_si128(loader3, loader));
    loader = *geno_vvec++;
    loader2 = *maskvp++;
    loader3 = _mm_and_si128(loader2, _mm_srli_epi64(loader, 1));
    even2 = _mm_add_epi64(even2, _mm_and_si128(loader2, loader));
    odd2 = _mm_add_epi64(odd2, loader3);
    homset2 = _mm_add_epi64(homset2, _mm_and_si128(loader3, loader));

    // merge second group's 2-bit partial counts into the first's
    even1 = _mm_add_epi64(even1, _mm_add_epi64(_mm_and_si128(even2, m2), _mm_and_si128(_mm_srli_epi64(even2, 2), m2)));
    odd1 = _mm_add_epi64(odd1, _mm_add_epi64(_mm_and_si128(odd2, m2), _mm_and_si128(_mm_srli_epi64(odd2, 2), m2)));
    homset1 = _mm_add_epi64(homset1, _mm_add_epi64(_mm_and_si128(homset2, m2), _mm_and_si128(_mm_srli_epi64(homset2, 2), m2)));

    // fold into 8-bit accumulator lanes
    acc_even.vi = _mm_add_epi64(acc_even.vi, _mm_add_epi64(_mm_and_si128(even1, m4), _mm_and_si128(_mm_srli_epi64(even1, 4), m4)));
    acc_odd.vi = _mm_add_epi64(acc_odd.vi, _mm_add_epi64(_mm_and_si128(odd1, m4), _mm_and_si128(_mm_srli_epi64(odd1, 4), m4)));
    acc_homset.vi = _mm_add_epi64(acc_homset.vi, _mm_add_epi64(_mm_and_si128(homset1, m4), _mm_and_si128(_mm_srli_epi64(homset1, 4), m4)));
  } while (geno_vvec < geno_vvec_end);
  // horizontal reduction: 8-bit lanes -> 16-bit lanes, then the multiply
  // trick sums the four 16-bit lanes into the high-order slot
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  acc_even.vi = _mm_add_epi64(_mm_and_si128(acc_even.vi, m8), _mm_and_si128(_mm_srli_epi64(acc_even.vi, 8), m8));
  acc_odd.vi = _mm_add_epi64(_mm_and_si128(acc_odd.vi, m8), _mm_and_si128(_mm_srli_epi64(acc_odd.vi, 8), m8));
  acc_homset.vi = _mm_add_epi64(_mm_and_si128(acc_homset.vi, m8), _mm_and_si128(_mm_srli_epi64(acc_homset.vi, 8), m8));
  *even_ctp += ((acc_even.u8[0] + acc_even.u8[1]) * 0x1000100010001LLU) >> 48;
  *odd_ctp += ((acc_odd.u8[0] + acc_odd.u8[1]) * 0x1000100010001LLU) >> 48;
  *homset_ctp += ((acc_homset.u8[0] + acc_homset.u8[1]) * 0x1000100010001LLU) >> 48;
}
6937 #else
void count_2freq_dbl_24b(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict mask1p, const uintptr_t* __restrict mask2p, uint32_t* __restrict ct1abp, uint32_t* __restrict ct1cp, uint32_t* __restrict ct2abp, uint32_t* __restrict ct2cp) {
  // 32-bit scalar counterpart of count_2freq_dbl_960b: processes exactly 24
  // bytes (six 32-bit words) of 2-bit genotype data under two masks.  For
  // each mask k in {1, 2}, among masked entries:
  //   *ct{k}abp += (# of set low bits) + (# of set high bits)
  //   *ct{k}cp  += (# of 2-bit fields equal to binary 01)
  // Throughout, loader3 = masked high bits shifted down, loader2 = masked
  // low bits; so loader2 + loader3 gives per-field (low + high), and
  // loader2 & ~loader3 marks fields equal to 01.
  uintptr_t loader = *geno_vec++;
  uintptr_t loader2 = *mask1p++;
  uintptr_t loader3 = (loader >> 1) & loader2;
  uintptr_t to_ct1_ab;
  uintptr_t to_ct1_c;
  uintptr_t to_ct2_ab;
  uintptr_t to_ct2_c;
  uintptr_t to_ct_abtmp;
  uintptr_t partial1_ab;
  uintptr_t partial1_c;
  uintptr_t partial2_ab;
  uintptr_t partial2_c;
  // word 1
  loader2 &= loader;
  to_ct1_ab = loader2 + loader3;
  to_ct1_c = loader2 & (~loader3);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct2_ab = loader2 + loader3;
  to_ct2_c = loader2 & (~loader3);

  // reduce ab counts to 2-bit fields before adding more words
  to_ct1_ab = (to_ct1_ab & 0x33333333) + ((to_ct1_ab >> 2) & 0x33333333);
  to_ct2_ab = (to_ct2_ab & 0x33333333) + ((to_ct2_ab >> 2) & 0x33333333);

  // word 2
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct1_c += loader2 & (~loader3);
  to_ct1_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct2_c += loader2 & (~loader3);
  to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);

  // word 3
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct1_c += loader2 & (~loader3);
  to_ct1_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct2_c += loader2 & (~loader3);
  to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);

  // fold first 3-word group into 4-bit (ab) / 2-bit (c) partial sums
  partial1_ab = (to_ct1_ab & 0x0f0f0f0f) + ((to_ct1_ab >> 4) & 0x0f0f0f0f);
  partial1_c = (to_ct1_c & 0x33333333) + ((to_ct1_c >> 2) & 0x33333333);
  partial2_ab = (to_ct2_ab & 0x0f0f0f0f) + ((to_ct2_ab >> 4) & 0x0f0f0f0f);
  partial2_c = (to_ct2_c & 0x33333333) + ((to_ct2_c >> 2) & 0x33333333);

  // word 4
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct1_ab = loader2 + loader3;
  to_ct1_c = loader2 & (~loader3);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct2_ab = loader2 + loader3;
  to_ct2_c = loader2 & (~loader3);

  to_ct1_ab = (to_ct1_ab & 0x33333333) + ((to_ct1_ab >> 2) & 0x33333333);
  to_ct2_ab = (to_ct2_ab & 0x33333333) + ((to_ct2_ab >> 2) & 0x33333333);

  // word 5
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct1_c += loader2 & (~loader3);
  to_ct1_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct2_c += loader2 & (~loader3);
  to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);

  // word 6
  loader = *geno_vec++;
  loader2 = *mask1p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct1_c += loader2 & (~loader3);
  to_ct1_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);
  loader2 = *mask2p++;
  loader3 = (loader >> 1) & loader2;
  loader2 &= loader;
  to_ct_abtmp = loader2 + loader3;
  to_ct2_c += loader2 & (~loader3);
  to_ct2_ab += (to_ct_abtmp & 0x33333333) + ((to_ct_abtmp >> 2) & 0x33333333);

  // fold second 3-word group into the partial sums
  partial1_ab += (to_ct1_ab & 0x0f0f0f0f) + ((to_ct1_ab >> 4) & 0x0f0f0f0f);
  partial1_c += (to_ct1_c & 0x33333333) + ((to_ct1_c >> 2) & 0x33333333);
  partial2_ab += (to_ct2_ab & 0x0f0f0f0f) + ((to_ct2_ab >> 4) & 0x0f0f0f0f);
  partial2_c += (to_ct2_c & 0x33333333) + ((to_ct2_c >> 2) & 0x33333333);

  partial1_c = (partial1_c & 0x0f0f0f0f) + ((partial1_c >> 4) & 0x0f0f0f0f);
  partial2_c = (partial2_c & 0x0f0f0f0f) + ((partial2_c >> 4) & 0x0f0f0f0f);

  // multiply trick sums the four 8-bit partial counts in the high byte
  *ct1abp += (partial1_ab * 0x01010101) >> 24;
  *ct1cp += (partial1_c * 0x01010101) >> 24;
  *ct2abp += (partial2_ab * 0x01010101) >> 24;
  *ct2cp += (partial2_c * 0x01010101) >> 24;
}
7052
void count_3freq_48b(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict maskp, uint32_t* __restrict ctap, uint32_t* __restrict ctbp, uint32_t* __restrict ctcp) {
  // Vertical-popcount tally over the next 12 words of geno_vec, restricted
  // to bit positions selected by maskp:
  //   a: bits of (geno & mask)
  //   b: bits of ((geno >> 1) & mask)
  //   c: bits set in both of the above (low bit AND shifted-down high bit)
  // Results are accumulated into *ctap / *ctbp / *ctcp.
  //
  // Structure: raw 1-bit tallies are summed over runs of 3 words, folded
  // into 2-bit fields, pairs of runs folded into 4-bit fields, and the two
  // 6-word halves accumulated as byte lanes before the final
  // multiply-and-shift horizontal sum.
  uintptr_t byte_acc_a = 0;
  uintptr_t byte_acc_b = 0;
  uintptr_t byte_acc_c = 0;
  uint32_t half_idx;
  for (half_idx = 0; half_idx < 2; half_idx++) {
    uint32_t nyb_a = 0;
    uint32_t nyb_b = 0;
    uint32_t nyb_c = 0;
    uint32_t run_idx;
    for (run_idx = 0; run_idx < 2; run_idx++) {
      uint32_t raw_a = 0;
      uint32_t raw_b = 0;
      uint32_t raw_c = 0;
      uint32_t word_idx;
      for (word_idx = 0; word_idx < 3; word_idx++) {
        uintptr_t geno_word = *geno_vec++;
        uintptr_t mask_word = *maskp++;
        uintptr_t shifted_masked = (geno_word >> 1) & mask_word;
        raw_a += geno_word & mask_word;
        raw_b += shifted_masked;
        raw_c += geno_word & shifted_masked;
      }
      // fold the 3-word tallies into 2-bit partial sums
      nyb_a += (raw_a & 0x33333333) + ((raw_a >> 2) & 0x33333333);
      nyb_b += (raw_b & 0x33333333) + ((raw_b >> 2) & 0x33333333);
      nyb_c += (raw_c & 0x33333333) + ((raw_c >> 2) & 0x33333333);
    }
    // fold the 2-bit partial sums into byte-lane accumulators
    byte_acc_a += (nyb_a & 0x0f0f0f0f) + ((nyb_a >> 4) & 0x0f0f0f0f);
    byte_acc_b += (nyb_b & 0x0f0f0f0f) + ((nyb_b >> 4) & 0x0f0f0f0f);
    byte_acc_c += (nyb_c & 0x0f0f0f0f) + ((nyb_c >> 4) & 0x0f0f0f0f);
  }
  // horizontal sum of the four byte lanes via multiply-accumulate
  *ctap += (byte_acc_a * 0x01010101) >> 24;
  *ctbp += (byte_acc_b * 0x01010101) >> 24;
  *ctcp += (byte_acc_c * 0x01010101) >> 24;
}
7157 #endif
7158
7159 #ifdef __LP64__
// Vertical popcount over the vectors in [vptr, vend), 3 vectors per loop
// iteration.  For bit positions selected by include_vec, accumulates into
// *set_ctp the count of shifted-down high bits ("odds") plus the count of
// positions where both the low bit and the shifted-down high bit are set
// ("evens"), and into *missing_ctp the count of 01 fields (low bit set,
// high bit clear) -- the PLINK missing-genotype pattern (see the comment
// in count_01_vecs).  The "_60v" suffix presumably bounds the number of
// vectors per call so the byte-lane accumulators cannot overflow -- TODO
// confirm against callers.
void count_set_freq_60v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // 2-, 4-, and 8-bit field masks for the SWAR reduction stages
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  __m128i loader;
  __m128i loader2;  // loader >> 1: high genotype bits moved down
  __m128i loader3;  // current include-mask vector
  __m128i odds;
  __m128i evens;
  __m128i missings;
  __univec acc;   // set-count accumulator (byte lanes)
  __univec accm;  // missing-count accumulator (byte lanes)
  acc.vi = _mm_setzero_si128();
  accm.vi = _mm_setzero_si128();
  do {
    // vector 1 of 3: initialize raw per-position tallies
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    odds = _mm_and_si128(loader2, loader3);
    evens = _mm_and_si128(odds, loader);
    // _mm_andnot_si128(a, b) computes (~a) & b: low bit set, high bit clear
    missings = _mm_and_si128(loader, _mm_andnot_si128(loader2, loader3));

    // vectors 2-3: add raw tallies before any field reduction
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    odds = _mm_add_epi64(odds, _mm_and_si128(loader2, loader3));
    loader3 = _mm_and_si128(loader, loader3);
    evens = _mm_add_epi64(evens, _mm_and_si128(loader2, loader3));
    missings = _mm_add_epi64(missings, _mm_andnot_si128(loader2, loader3));

    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    odds = _mm_add_epi64(odds, _mm_and_si128(loader2, loader3));
    loader3 = _mm_and_si128(loader, loader3);
    evens = _mm_add_epi64(evens, _mm_and_si128(loader2, loader3));
    missings = _mm_add_epi64(missings, _mm_andnot_si128(loader2, loader3));

    // fold tallies into 2-bit fields; merge evens into the odds total
    odds = _mm_add_epi64(_mm_and_si128(odds, m2), _mm_and_si128(_mm_srli_epi64(odds, 2), m2));
    missings = _mm_add_epi64(_mm_and_si128(missings, m2), _mm_and_si128(_mm_srli_epi64(missings, 2), m2));
    odds = _mm_add_epi64(odds, _mm_add_epi64(_mm_and_si128(evens, m2), _mm_and_si128(_mm_srli_epi64(evens, 2), m2)));

    // each 4-bit value here <= 6, so safe to add before m4 mask
    accm.vi = _mm_add_epi64(accm.vi, _mm_and_si128(_mm_add_epi64(missings, _mm_srli_epi64(missings, 4)), m4));

    acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(odds, m4), _mm_and_si128(_mm_srli_epi64(odds, 4), m4)));
  } while (vptr < vend);
  // and each 8-bit value here <= 120
  accm.vi = _mm_and_si128(_mm_add_epi64(accm.vi, _mm_srli_epi64(accm.vi, 8)), m8);

  acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
  // horizontal sum of 16-bit lanes via multiply-and-shift
  *set_ctp += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
}
7214
count_set_freq_hap_120v(const __m128i * vptr,const __m128i * vend,const __m128i * __restrict include_vec,uint32_t * __restrict set_ctp,uint32_t * __restrict missing_ctp)7215 void count_set_freq_hap_120v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
7216 const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
7217 const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
7218 const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
7219 __univec acc;
7220 __univec accm;
7221 __m128i loader;
7222 __m128i loader2;
7223 __m128i loader3;
7224 __m128i partial;
7225 __m128i partialm;
7226 __m128i partial2;
7227 __m128i partial2m;
7228 acc.vi = _mm_setzero_si128();
7229 accm.vi = _mm_setzero_si128();
7230 do {
7231 loader = *vptr++;
7232 loader2 = _mm_srli_epi64(loader, 1);
7233 loader3 = *include_vec++;
7234 partial = _mm_and_si128(loader3, _mm_and_si128(loader, loader2));
7235 partialm = _mm_and_si128(loader3, _mm_xor_si128(loader, loader2));
7236 loader = *vptr++;
7237 loader2 = _mm_srli_epi64(loader, 1);
7238 loader3 = *include_vec++;
7239 partial = _mm_add_epi64(partial, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7240 partialm = _mm_add_epi64(partialm, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7241 loader = *vptr++;
7242 loader2 = _mm_srli_epi64(loader, 1);
7243 loader3 = *include_vec++;
7244 partial = _mm_add_epi64(partial, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7245 partialm = _mm_add_epi64(partialm, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7246 partial2 = _mm_add_epi64(_mm_and_si128(partial, m2), _mm_and_si128(_mm_srli_epi64(partial, 2), m2));
7247 partial2m = _mm_add_epi64(_mm_and_si128(partialm, m2), _mm_and_si128(_mm_srli_epi64(partialm, 2), m2));
7248
7249 loader = *vptr++;
7250 loader2 = _mm_srli_epi64(loader, 1);
7251 loader3 = *include_vec++;
7252 partial = _mm_and_si128(loader3, _mm_and_si128(loader, loader2));
7253 partialm = _mm_and_si128(loader3, _mm_xor_si128(loader, loader2));
7254 loader = *vptr++;
7255 loader2 = _mm_srli_epi64(loader, 1);
7256 loader3 = *include_vec++;
7257 partial = _mm_add_epi64(partial, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7258 partialm = _mm_add_epi64(partialm, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7259 loader = *vptr++;
7260 loader2 = _mm_srli_epi64(loader, 1);
7261 loader3 = *include_vec++;
7262 partial = _mm_add_epi64(partial, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7263 partialm = _mm_add_epi64(partialm, _mm_and_si128(loader3, _mm_and_si128(loader, loader2)));
7264 partial2 = _mm_add_epi64(partial2, _mm_add_epi64(_mm_and_si128(partial, m2), _mm_and_si128(_mm_srli_epi64(partial, 2), m2)));
7265 partial2m = _mm_add_epi64(partial2m, _mm_add_epi64(_mm_and_si128(partialm, m2), _mm_and_si128(_mm_srli_epi64(partialm, 2), m2)));
7266 acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(partial2, m4), _mm_and_si128(_mm_srli_epi64(partial2, 4), m4)));
7267 accm.vi = _mm_add_epi64(accm.vi, _mm_add_epi64(_mm_and_si128(partial2m, m4), _mm_and_si128(_mm_srli_epi64(partial2m, 4), m4)));
7268 } while (vptr < vend);
7269 acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
7270 accm.vi = _mm_add_epi64(_mm_and_si128(accm.vi, m8), _mm_and_si128(_mm_srli_epi64(accm.vi, 8), m8));
7271 *set_ctp += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
7272 *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
7273 }
7274
// X-chromosome set/missing counter over [vptr, vend), 3 vectors per loop
// iteration.  Positions selected by include_vec are split by male_vec:
// non-male positions are tallied diploid-style (set_odds + set_evens,
// with low bits of missing 01 fields backed out via missings_nm), while
// male positions count as set only when both bits are set, and as missing
// when the two bits differ (missings_m).  The final missing accumulator
// combines 2*missings_nm + missings_m + males -- a caller-specific
// weighting (non-male missing counted double, plus one per included male
// position); NOTE(review): presumably matches the downstream X-chromosome
// allele arithmetic -- confirm against callers.  The "_60v" suffix
// presumably bounds the vectors per call against accumulator overflow.
void count_set_freq_x_60v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, const __m128i* __restrict male_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // 2-, 4-, and 8-bit field masks for the SWAR reduction stages
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  __m128i loader;
  __m128i loader2;  // loader >> 1
  __m128i loader3;  // current include-mask vector
  __m128i loader4;  // include & ~male ("non-male included"), later reused
  __m128i set_odds;
  __m128i set_evens;
  __m128i missings_nm;  // non-male missing tallies
  __m128i missings_m;   // male missing tallies (bits differ)
  __m128i males;        // included-male position tallies
  __univec acc;   // set-count accumulator (byte lanes)
  __univec accm;  // missing-count accumulator (byte lanes)
  acc.vi = _mm_setzero_si128();
  accm.vi = _mm_setzero_si128();
  do {
    // vector 1 of 3: initialize raw tallies
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    loader4 = _mm_andnot_si128(*male_vec, loader3);
    set_evens = _mm_and_si128(loader, loader4); // subtract missings_nm later
    set_odds = _mm_and_si128(loader2, loader4);
    missings_nm = _mm_andnot_si128(loader2, set_evens);
    males = _mm_and_si128(loader3, *male_vec++);
    // male fields count as set only when both bits are set (OR is safe:
    // male and non-male contributions occupy disjoint bit positions)
    set_evens = _mm_or_si128(set_evens, _mm_and_si128(_mm_and_si128(loader, loader2), males));
    missings_m = _mm_and_si128(_mm_xor_si128(loader, loader2), males);

    // vectors 2-3: add raw tallies
    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    loader4 = _mm_andnot_si128(*male_vec, loader3);
    set_odds = _mm_add_epi64(set_odds, _mm_and_si128(loader2, loader4));
    loader4 = _mm_and_si128(loader, loader4);
    set_evens = _mm_add_epi64(set_evens, loader4);
    missings_nm = _mm_add_epi64(missings_nm, _mm_andnot_si128(loader2, loader4));
    loader4 = _mm_and_si128(loader3, *male_vec++);
    set_evens = _mm_add_epi64(set_evens, _mm_and_si128(_mm_and_si128(loader, loader2), loader4));
    missings_m = _mm_add_epi64(missings_m, _mm_and_si128(_mm_xor_si128(loader, loader2), loader4));
    males = _mm_add_epi64(males, loader4);

    loader = *vptr++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader3 = *include_vec++;
    loader4 = _mm_andnot_si128(*male_vec, loader3);
    set_odds = _mm_add_epi64(set_odds, _mm_and_si128(loader2, loader4));
    loader4 = _mm_and_si128(loader, loader4);
    set_evens = _mm_add_epi64(set_evens, loader4);
    missings_nm = _mm_add_epi64(missings_nm, _mm_andnot_si128(loader2, loader4));
    loader4 = _mm_and_si128(loader3, *male_vec++);
    set_evens = _mm_add_epi64(set_evens, _mm_and_si128(_mm_and_si128(loader, loader2), loader4));
    missings_m = _mm_add_epi64(missings_m, _mm_and_si128(_mm_xor_si128(loader, loader2), loader4));
    males = _mm_add_epi64(males, loader4);

    // back out the low bits that belonged to non-male missing fields
    set_evens = _mm_sub_epi64(set_evens, missings_nm);
    // weight non-male missings x2 (the left shift), then fold in male
    // missings and the male position count
    missings_nm = _mm_slli_epi64(_mm_add_epi64(_mm_and_si128(missings_nm, m2), _mm_and_si128(_mm_srli_epi64(missings_nm, 2), m2)), 1);
    set_odds = _mm_add_epi64(_mm_and_si128(set_odds, m2), _mm_and_si128(_mm_srli_epi64(set_odds, 2), m2));
    missings_nm = _mm_add_epi64(missings_nm, _mm_add_epi64(_mm_and_si128(missings_m, m2), _mm_and_si128(_mm_srli_epi64(missings_m, 2), m2)));
    set_odds = _mm_add_epi64(set_odds, _mm_add_epi64(_mm_and_si128(set_evens, m2), _mm_and_si128(_mm_srli_epi64(set_evens, 2), m2)));
    missings_nm = _mm_add_epi64(missings_nm, _mm_add_epi64(_mm_and_si128(males, m2), _mm_and_si128(_mm_srli_epi64(males, 2), m2)));
    acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(set_odds, m4), _mm_and_si128(_mm_srli_epi64(set_odds, 4), m4)));
    accm.vi = _mm_add_epi64(accm.vi, _mm_add_epi64(_mm_and_si128(missings_nm, m4), _mm_and_si128(_mm_srli_epi64(missings_nm, 4), m4)));
  } while (vptr < vend);
  acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
  accm.vi = _mm_add_epi64(_mm_and_si128(accm.vi, m8), _mm_and_si128(_mm_srli_epi64(accm.vi, 8), m8));
  // horizontal sum of 16-bit lanes via multiply-and-shift
  *set_ctp += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
}
7344
// Y-chromosome set/missing counter over [vptr, vend), 6 vectors per loop
// iteration.  For positions selected by include_vec:
//   set:     both bits set AND the position is NOT in nonmale_vec
//            (only males can carry Y genotypes)
//   missing: the position is in nonmale_vec, OR its two bits differ
// The "_120v" suffix presumably bounds the vectors per call against
// byte-lane accumulator overflow -- TODO confirm against callers.
void count_set_freq_y_120v(const __m128i* vptr, const __m128i* vend, const __m128i* __restrict include_vec, const __m128i* __restrict nonmale_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // 2-, 4-, and 8-bit field masks for the SWAR reduction stages
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  __m128i loader;
  __m128i loader2;  // loader >> 1
  __m128i loader3;  // current include-mask vector
  __m128i loader4;  // current nonmale-mask vector
  __m128i sets1;
  __m128i missings1;
  __m128i sets2;
  __m128i missings2;
  __univec acc;   // set-count accumulator (byte lanes)
  __univec accm;  // missing-count accumulator (byte lanes)
  acc.vi = _mm_setzero_si128();
  accm.vi = _mm_setzero_si128();
  do {
    // group 1 (vectors 1-3): raw per-position tallies
    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets1 = _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2));
    missings1 = _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2)));

    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets1 = _mm_add_epi64(sets1, _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2)));
    missings1 = _mm_add_epi64(missings1, _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2))));

    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets1 = _mm_add_epi64(sets1, _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2)));
    missings1 = _mm_add_epi64(missings1, _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2))));
    // fold group 1 into 2-bit fields
    sets1 = _mm_add_epi64(_mm_and_si128(sets1, m2), _mm_and_si128(_mm_srli_epi64(sets1, 2), m2));
    missings1 = _mm_add_epi64(_mm_and_si128(missings1, m2), _mm_and_si128(_mm_srli_epi64(missings1, 2), m2));

    // group 2 (vectors 4-6)
    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets2 = _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2));
    missings2 = _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2)));

    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets2 = _mm_add_epi64(sets2, _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2)));
    missings2 = _mm_add_epi64(missings2, _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2))));

    loader = *vptr++;
    loader3 = *include_vec++;
    loader2 = _mm_srli_epi64(loader, 1);
    loader4 = *nonmale_vec++;
    sets2 = _mm_add_epi64(sets2, _mm_and_si128(_mm_andnot_si128(loader4, loader3), _mm_and_si128(loader, loader2)));
    missings2 = _mm_add_epi64(missings2, _mm_and_si128(loader3, _mm_or_si128(loader4, _mm_xor_si128(loader, loader2))));
    // fold group 2 and accumulate both groups into byte-lane totals
    sets1 = _mm_add_epi64(sets1, _mm_add_epi64(_mm_and_si128(sets2, m2), _mm_and_si128(_mm_srli_epi64(sets2, 2), m2)));
    missings1 = _mm_add_epi64(missings1, _mm_add_epi64(_mm_and_si128(missings2, m2), _mm_and_si128(_mm_srli_epi64(missings2, 2), m2)));
    acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(sets1, m4), _mm_and_si128(_mm_srli_epi64(sets1, 4), m4)));
    accm.vi = _mm_add_epi64(accm.vi, _mm_add_epi64(_mm_and_si128(missings1, m4), _mm_and_si128(_mm_srli_epi64(missings1, 4), m4)));
  } while (vptr < vend);
  acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
  accm.vi = _mm_add_epi64(_mm_and_si128(accm.vi, m8), _mm_and_si128(_mm_srli_epi64(accm.vi, 8), m8));
  // horizontal sum of 16-bit lanes via multiply-and-shift
  *set_ctp += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  *missing_ctp += ((accm.u8[0] + accm.u8[1]) * 0x1000100010001LLU) >> 48;
}
7415
uintptr_t count_01_vecs(const __m128i* vptr, uintptr_t vct) {
  // counts number of aligned 01s (i.e. PLINK missing genotypes) in
  // [vptr, vend). Assumes number of words in interval is a multiple of 12.
  //
  // Implementation: processes at most 60 vectors per accumulator flush so
  // the byte-lane partial sums cannot overflow, then reuses the same inner
  // loop (via the goto) for the <60-vector remainder.
  const __m128i m1 = {FIVEMASK, FIVEMASK};  // 0101... pattern: low bit of each 2-bit field
  const __m128i m2 = {0x3333333333333333LLU, 0x3333333333333333LLU};
  const __m128i m4 = {0x0f0f0f0f0f0f0f0fLLU, 0x0f0f0f0f0f0f0f0fLLU};
  const __m128i m8 = {0x00ff00ff00ff00ffLLU, 0x00ff00ff00ff00ffLLU};
  uintptr_t tot = 0;
  const __m128i* vend;
  __m128i loader1;
  __m128i loader2;
  __m128i count1;
  __m128i count2;
  __univec acc;

  while (vct >= 60) {
    vct -= 60;
    vend = &(vptr[60]);
  count_01_vecs_main_loop:
    acc.vi = _mm_setzero_si128();
    do {
      // andnot(v >> 1, v) & m1 isolates fields equal to binary 01
      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader1, 1), loader1), m1);
      count2 = _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2), m1);
      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(count1, _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader1, 1), loader1), m1));
      count2 = _mm_add_epi64(count2, _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2), m1));
      loader1 = *vptr++;
      loader2 = *vptr++;
      count1 = _mm_add_epi64(count1, _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader1, 1), loader1), m1));
      count2 = _mm_add_epi64(count2, _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader2, 1), loader2), m1));
      // fold the six per-vector tallies: 2-bit fields, then 4-bit, then
      // accumulate byte lanes
      count1 = _mm_add_epi64(_mm_and_si128(count1, m2), _mm_and_si128(_mm_srli_epi64(count1, 2), m2));
      count1 = _mm_add_epi64(count1, _mm_add_epi64(_mm_and_si128(count2, m2), _mm_and_si128(_mm_srli_epi64(count2, 2), m2)));
      acc.vi = _mm_add_epi64(acc.vi, _mm_add_epi64(_mm_and_si128(count1, m4), _mm_and_si128(_mm_srli_epi64(count1, 4), m4)));
    } while (vptr < vend);
    acc.vi = _mm_add_epi64(_mm_and_si128(acc.vi, m8), _mm_and_si128(_mm_srli_epi64(acc.vi, 8), m8));
    // horizontal sum of 16-bit lanes via multiply-and-shift
    tot += ((acc.u8[0] + acc.u8[1]) * 0x1000100010001LLU) >> 48;
  }
  if (vct) {
    // remainder pass: reuse the main loop body once for the final vectors
    vend = &(vptr[vct]);
    vct = 0;
    goto count_01_vecs_main_loop;
  }
  return tot;
}
7463
7464 #else
void count_set_freq_6(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Scalar counterpart of count_set_freq_60v, consuming exactly 6 words.
  // For bit positions selected by include_vec, accumulates into *set_ctp
  // the count of shifted-down high bits plus positions with both bits set,
  // and into *missing_ctp the count of 01 fields (low bit set, high bit
  // clear).  Processes the 6 words as two 3-word halves so the 2-bit SWAR
  // fields never overflow.
  uintptr_t acc = 0;
  uintptr_t accm = 0;
  uint32_t half_idx;
  for (half_idx = 0; half_idx < 2; half_idx++) {
    uintptr_t odd_bits = 0;
    uintptr_t even_bits = 0;
    uintptr_t miss_bits = 0;
    uint32_t word_idx;
    for (word_idx = 0; word_idx < 3; word_idx++) {
      uintptr_t cur_word = *lptr++;
      uintptr_t cur_shifted = cur_word >> 1;
      uintptr_t incl = *include_vec++;
      odd_bits += cur_shifted & incl;
      incl &= cur_word;
      even_bits += cur_shifted & incl;
      miss_bits += (~cur_shifted) & incl;
    }
    // fold this half into 2-bit fields, merge evens into the set total
    odd_bits = (odd_bits & 0x33333333) + ((odd_bits >> 2) & 0x33333333);
    odd_bits += (even_bits & 0x33333333) + ((even_bits >> 2) & 0x33333333);
    accm += (miss_bits & 0x33333333) + ((miss_bits >> 2) & 0x33333333);
    acc += (odd_bits & 0x0f0f0f0f) + ((odd_bits >> 4) & 0x0f0f0f0f);
  }
  accm = (accm & 0x0f0f0f0f) + ((accm >> 4) & 0x0f0f0f0f);
  // horizontal byte-lane sum via multiply-accumulate
  *set_ctp += (acc * 0x01010101) >> 24;
  *missing_ctp += (accm * 0x01010101) >> 24;
}
7527
void count_set_freq_hap_12(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Scalar counterpart of count_set_freq_hap_120v, consuming exactly 12
  // words.  For bit-field positions selected by include_vec, accumulates
  // into *set_ctp the count of positions where both the low bit and the
  // shifted-down high bit are set, and into *missing_ctp the count of
  // positions where the two bits differ.
  //
  // Structure: 3-word runs are folded into 2-bit fields, pairs of runs
  // into 4-bit fields, and the two 6-word halves accumulated as byte
  // lanes before the final multiply-and-shift horizontal sum.
  uintptr_t acc = 0;
  uintptr_t accm = 0;
  uint32_t half_idx;
  for (half_idx = 0; half_idx < 2; half_idx++) {
    uintptr_t fold2 = 0;
    uintptr_t fold2m = 0;
    uint32_t run_idx;
    for (run_idx = 0; run_idx < 2; run_idx++) {
      uintptr_t raw = 0;
      uintptr_t rawm = 0;
      uint32_t word_idx;
      for (word_idx = 0; word_idx < 3; word_idx++) {
        uintptr_t cur_word = *lptr++;
        uintptr_t cur_shifted = cur_word >> 1;
        uintptr_t incl = *include_vec++;
        raw += cur_word & cur_shifted & incl;
        rawm += (cur_word ^ cur_shifted) & incl;
      }
      fold2 += (raw & 0x33333333) + ((raw >> 2) & 0x33333333);
      fold2m += (rawm & 0x33333333) + ((rawm >> 2) & 0x33333333);
    }
    acc += (fold2 & 0x0f0f0f0f) + ((fold2 >> 4) & 0x0f0f0f0f);
    accm += (fold2m & 0x0f0f0f0f) + ((fold2m >> 4) & 0x0f0f0f0f);
  }
  // horizontal byte-lane sum via multiply-accumulate
  *set_ctp += (acc * 0x01010101) >> 24;
  *missing_ctp += (accm * 0x01010101) >> 24;
}
7618
void count_set_freq_x_6(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, const uintptr_t* __restrict male_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Scalar counterpart of count_set_freq_x_60v, consuming exactly 6 words.
  // Included positions are split by male_vec: non-male positions are
  // tallied diploid-style (odds + evens, with low bits of missing 01
  // fields backed out), male positions count as set only when both bits
  // are set and as missing when the bits differ.  The missing total is
  // 2 * (non-male missings) + (male missings) + (male positions) -- the
  // same caller-specific weighting as the vector version.
  //
  // Non-male and male contributions occupy disjoint bit positions (they
  // derive from complementary masks), so plain addition reproduces the
  // original's OR on the first word of each half.
  uintptr_t acc = 0;
  uintptr_t accm = 0;
  uint32_t half_idx;
  for (half_idx = 0; half_idx < 2; half_idx++) {
    uintptr_t set_odd_bits = 0;
    uintptr_t set_even_bits = 0;
    uintptr_t miss_nonmale = 0;
    uintptr_t miss_male = 0;
    uintptr_t male_tally = 0;
    uint32_t word_idx;
    for (word_idx = 0; word_idx < 3; word_idx++) {
      uintptr_t cur_word = *lptr++;
      uintptr_t cur_shifted = cur_word >> 1;
      uintptr_t incl = *include_vec++;
      uintptr_t male_bits = *male_vec++;
      uintptr_t nonmale_incl = incl & (~male_bits);
      uintptr_t male_incl = incl & male_bits;
      set_odd_bits += cur_shifted & nonmale_incl;
      nonmale_incl &= cur_word;
      set_even_bits += nonmale_incl;
      miss_nonmale += nonmale_incl & (~cur_shifted);
      set_even_bits += cur_word & cur_shifted & male_incl;
      miss_male += (cur_word ^ cur_shifted) & male_incl;
      male_tally += male_incl;
    }
    // back out low bits that belonged to non-male missing fields
    set_even_bits -= miss_nonmale;
    set_odd_bits = (set_odd_bits & 0x33333333) + ((set_odd_bits >> 2) & 0x33333333);
    set_odd_bits += (set_even_bits & 0x33333333) + ((set_even_bits >> 2) & 0x33333333);
    // non-male missings weighted x2, plus male missings and male count
    miss_nonmale = ((miss_nonmale & 0x33333333) + ((miss_nonmale >> 2) & 0x33333333)) * 2;
    miss_nonmale += (miss_male & 0x33333333) + ((miss_male >> 2) & 0x33333333);
    miss_nonmale += (male_tally & 0x33333333) + ((male_tally >> 2) & 0x33333333);
    acc += (set_odd_bits & 0x0f0f0f0f) + ((set_odd_bits >> 4) & 0x0f0f0f0f);
    accm += (miss_nonmale & 0x0f0f0f0f) + ((miss_nonmale >> 4) & 0x0f0f0f0f);
  }
  // horizontal byte-lane sum via multiply-accumulate
  *set_ctp += (acc * 0x01010101) >> 24;
  *missing_ctp += (accm * 0x01010101) >> 24;
}
7718
// Tallies chrY set-allele and missing-genotype counts over 12 consecutive
// data words (scalar path for non-64-bit builds).
// Per 2-bit genotype field with bits (low, high):
//   set:     include & (genotype == 11) & ~nonmale
//   missing: include & (nonmale | (genotype == 01 or 10))
// i.e. only homozygous-set males add a set allele; every included nonmale,
// plus every male 01/10 field, adds to the missing count.  (Per the encoding
// note in genovec_set_freq, 01 = missing.)
// Accumulation is a standard vertical popcount: raw 1-bit partials from 3
// words are folded to 2-bit fields, pairs of those to 4-bit fields, and the
// byte-wide totals are summed with the 0x01010101 multiply trick.
void count_set_freq_y_12(const uintptr_t* __restrict lptr, const uintptr_t* __restrict include_vec, const uintptr_t* __restrict nonmale_vec, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // word 1: 1-bit partials
  uintptr_t loader = *lptr++;
  uintptr_t loader2 = loader >> 1;
  uintptr_t loader3 = *include_vec++;
  uintptr_t loader4 = *nonmale_vec++;
  uintptr_t sets1 = loader3 & loader & loader2 & (~loader4);
  uintptr_t missings1 = loader3 & (loader4 | (loader ^ loader2));
  uintptr_t sets2;
  uintptr_t missings2;
  uintptr_t acc;
  uintptr_t accm;

  // word 2
  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets1 += loader3 & loader & loader2 & (~loader4);
  missings1 += loader3 & (loader4 | (loader ^ loader2));

  // word 3; then fold sets1/missings1 down to 2-bit fields
  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets1 += loader3 & loader & loader2 & (~loader4);
  missings1 += loader3 & (loader4 | (loader ^ loader2));
  sets1 = (sets1 & 0x33333333) + ((sets1 >> 2) & 0x33333333);
  missings1 = (missings1 & 0x33333333) + ((missings1 >> 2) & 0x33333333);

  // words 4-6 accumulate into sets2/missings2 the same way
  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets2 = loader3 & loader & loader2 & (~loader4);
  missings2 = loader3 & (loader4 | (loader ^ loader2));

  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets2 += loader3 & loader & loader2 & (~loader4);
  missings2 += loader3 & (loader4 | (loader ^ loader2));

  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets2 += loader3 & loader & loader2 & (~loader4);
  missings2 += loader3 & (loader4 | (loader ^ loader2));
  // merge the two 2-bit partial sums, then widen to 4-bit fields in acc/accm
  sets1 += (sets2 & 0x33333333) + ((sets2 >> 2) & 0x33333333);
  missings1 += (missings2 & 0x33333333) + ((missings2 >> 2) & 0x33333333);
  acc = (sets1 & 0x0f0f0f0f) + ((sets1 >> 4) & 0x0f0f0f0f);
  accm = (missings1 & 0x0f0f0f0f) + ((missings1 >> 4) & 0x0f0f0f0f);

  // words 7-9: second half, sets1/missings1 reused from scratch
  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets1 = loader3 & loader & loader2 & (~loader4);
  missings1 = loader3 & (loader4 | (loader ^ loader2));

  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets1 += loader3 & loader & loader2 & (~loader4);
  missings1 += loader3 & (loader4 | (loader ^ loader2));

  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets1 += loader3 & loader & loader2 & (~loader4);
  missings1 += loader3 & (loader4 | (loader ^ loader2));
  sets1 = (sets1 & 0x33333333) + ((sets1 >> 2) & 0x33333333);
  missings1 = (missings1 & 0x33333333) + ((missings1 >> 2) & 0x33333333);

  // words 10-12
  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets2 = loader3 & loader & loader2 & (~loader4);
  missings2 = loader3 & (loader4 | (loader ^ loader2));

  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets2 += loader3 & loader & loader2 & (~loader4);
  missings2 += loader3 & (loader4 | (loader ^ loader2));

  loader = *lptr++;
  loader2 = loader >> 1;
  loader3 = *include_vec++;
  loader4 = *nonmale_vec++;
  sets2 += loader3 & loader & loader2 & (~loader4);
  missings2 += loader3 & (loader4 | (loader ^ loader2));
  sets1 += (sets2 & 0x33333333) + ((sets2 >> 2) & 0x33333333);
  missings1 += (missings2 & 0x33333333) + ((missings2 >> 2) & 0x33333333);
  acc += (sets1 & 0x0f0f0f0f) + ((sets1 >> 4) & 0x0f0f0f0f);
  accm += (missings1 & 0x0f0f0f0f) + ((missings1 >> 4) & 0x0f0f0f0f);
  // horizontal byte sum: multiply spreads each byte into the top byte
  *set_ctp += (acc * 0x01010101) >> 24;
  *missing_ctp += (accm * 0x01010101) >> 24;
}
7822
count_01_12(const uintptr_t * lptr)7823 uintptr_t count_01_12(const uintptr_t* lptr) {
7824 uintptr_t loader1 = *lptr++;
7825 uintptr_t loader2 = *lptr++;
7826 uintptr_t count1 = loader1 & (~(loader1 >> 1)) & FIVEMASK;
7827 uintptr_t count2 = loader2 & (~(loader2 >> 1)) & FIVEMASK;
7828 uintptr_t partial1;
7829 uintptr_t partial2;
7830 loader1 = *lptr++;
7831 loader2 = *lptr++;
7832 count1 += loader1 & (~(loader1 >> 1)) & FIVEMASK;
7833 count2 += loader2 & (~(loader2 >> 1)) & FIVEMASK;
7834 loader1 = *lptr++;
7835 loader2 = *lptr++;
7836 count1 += loader1 & (~(loader1 >> 1)) & FIVEMASK;
7837 count2 += loader2 & (~(loader2 >> 1)) & FIVEMASK;
7838 partial1 = (count1 & 0x33333333) + ((count1 >> 2) & 0x33333333);
7839 partial2 = (count2 & 0x33333333) + ((count2 >> 2) & 0x33333333);
7840
7841 loader1 = *lptr++;
7842 loader2 = *lptr++;
7843 count1 = loader1 & (~(loader1 >> 1)) & FIVEMASK;
7844 count2 = loader2 & (~(loader2 >> 1)) & FIVEMASK;
7845 loader1 = *lptr++;
7846 loader2 = *lptr++;
7847 count1 += loader1 & (~(loader1 >> 1)) & FIVEMASK;
7848 count2 += loader2 & (~(loader2 >> 1)) & FIVEMASK;
7849 loader1 = *lptr++;
7850 loader2 = *lptr++;
7851 count1 += loader1 & (~(loader1 >> 1)) & FIVEMASK;
7852 count2 += loader2 & (~(loader2 >> 1)) & FIVEMASK;
7853 partial1 += (count1 & 0x33333333) + ((count1 >> 2) & 0x33333333);
7854 partial2 += (count2 & 0x33333333) + ((count2 >> 2) & 0x33333333);
7855
7856 partial1 = (partial1 & 0x0f0f0f0f) + ((partial1 >> 4) & 0x0f0f0f0f);
7857 partial1 += (partial2 & 0x0f0f0f0f) + ((partial2 >> 4) & 0x0f0f0f0f);
7858 return (partial1 * 0x01010101) >> 24;
7859 }
7860 #endif
7861
// Counts set alleles and missing genotypes among the samples selected by
// include_quatervec, over sample_ctl2 genotype words.  Results are *stored*
// (not accumulated) into *set_ctp / *missing_ctp.
void genovec_set_freq(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // Assuming include_quatervec describes e.g. cases, and an autosomal marker,
  // this counts the number of case set alleles loaded in geno_vec[], as well
  // as the number of cases with missing genotype info.
  // See single_marker_freqs_and_hwe() for discussion.
  // missing count: popcount2(genotype & (~(genotype >> 1)) & 0x5555...)
  // set allele count: popcount(genotype) - missing count
  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t missing_incr;
  uint32_t acc = 0;
  uint32_t accm = 0;
#ifdef __LP64__
  // Vector path: process full 60-word chunks, then re-enter the loop body
  // exactly once (via goto) for the final sub-60 multiple-of-6 chunk.
  // 60 presumably bounds the helper's packed accumulators against
  // overflow -- confirm against count_set_freq_60v.
  uintptr_t cur_decr = 60;
  const uintptr_t* geno_vec_6x_end;
  sample_ctl2 -= sample_ctl2 % 6;
  while (sample_ctl2 >= 60) {
  genovec_set_freq_loop:
    geno_vec_6x_end = &(geno_vec[cur_decr]);
    count_set_freq_60v((const __m128i*)geno_vec, (const __m128i*)geno_vec_6x_end, (const __m128i*)include_quatervec, &acc, &accm);
    geno_vec = geno_vec_6x_end;
    include_quatervec = &(include_quatervec[cur_decr]);
    sample_ctl2 -= cur_decr;
  }
  if (sample_ctl2) {
    cur_decr = sample_ctl2;
    goto genovec_set_freq_loop;
  }
#else
  // Scalar path: 6 words at a time.
  const uintptr_t* geno_vec_six_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 6)]);
  while (geno_vec < geno_vec_six_end) {
    count_set_freq_6(geno_vec, include_quatervec, &acc, &accm);
    geno_vec = &(geno_vec[6]);
    include_quatervec = &(include_quatervec[6]);
  }
#endif
  // Leftover words (fewer than 6): direct popcounts.
  while (geno_vec < geno_vec_end) {
    loader = *geno_vec++;
    loader2 = *include_quatervec++;
    missing_incr = popcount2_long(loader & (~(loader >> 1)) & loader2);
    accm += missing_incr;
    // loader2 * 3 expands each 01 include field to an 11 mask
    acc += popcount_long(loader & (loader2 * 3)) - missing_incr;
  }
  *set_ctp = acc;
  *missing_ctp = accm;
}
7909
// chrX variant of genovec_set_freq: nonmales are counted as diploid, males
// as haploid.  Results are stored (not accumulated) into the out-params.
void genovec_set_freq_x(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, const uintptr_t* __restrict male_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // diploid counting for nonmales, haploid counting for males
  // missing_ct := male_obs + male_missing + 2 * female_missing
  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t loader3;
  uintptr_t loader4;
  uintptr_t missing_incr;
  uint32_t acc = 0;
  uint32_t accm = 0;
#ifdef __LP64__
  // Vector path: full 60-word chunks, then one goto re-entry for the final
  // sub-60 multiple-of-6 chunk (same pattern as genovec_set_freq).
  uintptr_t cur_decr = 60;
  const uintptr_t* geno_vec_6x_end;
  sample_ctl2 -= sample_ctl2 % 6;
  while (sample_ctl2 >= 60) {
  genovec_set_freq_x_loop:
    geno_vec_6x_end = &(geno_vec[cur_decr]);
    count_set_freq_x_60v((const __m128i*)geno_vec, (const __m128i*)geno_vec_6x_end, (const __m128i*)include_quatervec, (const __m128i*)male_quatervec, &acc, &accm);
    geno_vec = geno_vec_6x_end;
    include_quatervec = &(include_quatervec[cur_decr]);
    male_quatervec = &(male_quatervec[cur_decr]);
    sample_ctl2 -= cur_decr;
  }
  if (sample_ctl2) {
    cur_decr = sample_ctl2;
    goto genovec_set_freq_x_loop;
  }
#else
  const uintptr_t* geno_vec_six_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 6)]);
  while (geno_vec < geno_vec_six_end) {
    count_set_freq_x_6(geno_vec, include_quatervec, male_quatervec, &acc, &accm);
    geno_vec = &(geno_vec[6]);
    include_quatervec = &(include_quatervec[6]);
    male_quatervec = &(male_quatervec[6]);
  }
#endif
  // Leftover words: handle the nonmale (diploid) and male (haploid) subsets
  // of each word separately.
  while (geno_vec < geno_vec_end) {
    loader = *geno_vec++;
    loader2 = loader >> 1;
    loader3 = *include_quatervec++;
    // included nonmales
    loader4 = loader3 & (~(*male_quatervec));
    missing_incr = popcount2_long(loader & (~loader2) & loader4);
    // each missing nonmale (01 field) costs 2 toward missing_ct
    accm += 2 * missing_incr;
    acc += popcount_long(loader & (loader4 * 3)) - missing_incr;

    // included males: a 11 field adds one (haploid) set allele
    loader4 = loader3 & (*male_quatervec++);
    acc += popcount2_long(loader & loader2 & loader4);
    // +1 per male 01/10 field, plus (via loader4 << 1) +1 per included male
    // observation, matching the missing_ct formula above
    accm += popcount_long(((loader ^ loader2) & loader4) | (loader4 << 1));
  }
  *set_ctp = acc;
  *missing_ctp = accm;
}
7963
// chrY variant: only males can contribute set alleles (11 fields outside
// nonmale_quatervec); every included nonmale, and every male 01/10 field,
// contributes to missing_ct.  Results are stored into the out-params.
void genovec_set_freq_y(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, const uintptr_t* __restrict nonmale_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict set_ctp, uint32_t* __restrict missing_ctp) {
  // all nonmales contribute to missing_ct here
  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t loader3;
  uintptr_t loader4;
  uint32_t acc = 0;
  uint32_t accm = 0;
#ifdef __LP64__
  // Vector path: full 120-word chunks, then one goto re-entry for the final
  // sub-120 multiple-of-12 chunk.
  uintptr_t cur_decr = 120;
  const uintptr_t* geno_vec_12x_end;
  sample_ctl2 -= sample_ctl2 % 12;
  while (sample_ctl2 >= 120) {
  genovec_set_freq_y_loop:
    geno_vec_12x_end = &(geno_vec[cur_decr]);
    count_set_freq_y_120v((__m128i*)geno_vec, (__m128i*)geno_vec_12x_end, (__m128i*)include_quatervec, (__m128i*)nonmale_quatervec, &acc, &accm);
    geno_vec = geno_vec_12x_end;
    include_quatervec = &(include_quatervec[cur_decr]);
    nonmale_quatervec = &(nonmale_quatervec[cur_decr]);
    sample_ctl2 -= cur_decr;
  }
  if (sample_ctl2) {
    cur_decr = sample_ctl2;
    goto genovec_set_freq_y_loop;
  }
#else
  const uintptr_t* geno_vec_twelve_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 12)]);
  while (geno_vec < geno_vec_twelve_end) {
    count_set_freq_y_12(geno_vec, include_quatervec, nonmale_quatervec, &acc, &accm);
    geno_vec = &(geno_vec[12]);
    include_quatervec = &(include_quatervec[12]);
    nonmale_quatervec = &(nonmale_quatervec[12]);
  }
#endif
  // Leftover words: same per-field logic as count_set_freq_y_12.
  while (geno_vec < geno_vec_end) {
    loader = *geno_vec++;
    loader2 = loader >> 1;
    loader3 = *include_quatervec++;
    loader4 = *nonmale_quatervec++;
    // set: included male 11 fields
    acc += popcount2_long(loader & loader2 & loader3 & (~loader4));
    // missing: included & (nonmale, or genotype bits differ i.e. 01/10)
    accm += popcount2_long(loader3 & ((loader ^ loader2) | loader4));
  }
  *set_ctp = acc;
  *missing_ctp = accm;
}
8010
// Computes missing / heterozygous / homozygous-set counts among the samples
// selected by include_quatervec.  Strategy: count fields with the low bit
// set (acc_even: 01 + 11), fields with the high bit set (acc_odd: 10 + 11),
// and fields with both bits set (acc_and: 11), then difference them out.
void genovec_3freq(const uintptr_t* __restrict geno_vec, const uintptr_t* __restrict include_quatervec, uintptr_t sample_ctl2, uint32_t* __restrict missing_ctp, uint32_t* __restrict het_ctp, uint32_t* __restrict homset_ctp) {
  // generic routine for getting all counts.
  const uintptr_t* geno_vec_end = &(geno_vec[sample_ctl2]);
  uintptr_t loader;
  uintptr_t loader2;
  uintptr_t loader3;
  uint32_t acc_even = 0;
  uint32_t acc_odd = 0;
  uint32_t acc_and = 0;
#ifdef __LP64__
  // Vector path: full 120-word (1920-byte) chunks, then one goto re-entry
  // for the final sub-120 multiple-of-12 chunk.
  uintptr_t cur_decr = 120;
  const uintptr_t* geno_vec_12x_end;
  sample_ctl2 -= sample_ctl2 % 12;
  while (sample_ctl2 >= 120) {
  genovec_3freq_loop:
    geno_vec_12x_end = &(geno_vec[cur_decr]);
    count_3freq_1920b((const __m128i*)geno_vec, (const __m128i*)geno_vec_12x_end, (const __m128i*)include_quatervec, &acc_even, &acc_odd, &acc_and);
    geno_vec = geno_vec_12x_end;
    include_quatervec = &(include_quatervec[cur_decr]);
    sample_ctl2 -= cur_decr;
  }
  if (sample_ctl2) {
    cur_decr = sample_ctl2;
    goto genovec_3freq_loop;
  }
#else
  const uintptr_t* geno_vec_twelve_end = &(geno_vec[sample_ctl2 - (sample_ctl2 % 12)]);
  while (geno_vec < geno_vec_twelve_end) {
    count_3freq_48b(geno_vec, include_quatervec, &acc_even, &acc_odd, &acc_and);
    geno_vec = &(geno_vec[12]);
    include_quatervec = &(include_quatervec[12]);
  }
#endif
  // Leftover words.
  while (geno_vec < geno_vec_end) {
    loader = *geno_vec++;
    loader2 = *include_quatervec++;
    loader3 = loader2 & (loader >> 1);  // included fields with high bit set
    acc_even += popcount2_long(loader & loader2);
    acc_odd += popcount2_long(loader3);
    acc_and += popcount2_long(loader & loader3);
  }
  // 01 = even-only, 10 = odd-only, 11 = both
  *missing_ctp = acc_even - acc_and;
  *het_ctp = acc_odd - acc_and;
  *homset_ctp = acc_and;
}
8056
// Returns the number of 2-bit fields equal to 01 in quatervec[0..word_ct).
uintptr_t count_01(const uintptr_t* quatervec, uintptr_t word_ct) {
  // really just for getting a missing count
  // unlike popcount01_longs, this does not assume quatervec[] has no 11s
  const uintptr_t* quatervec_end = &(quatervec[word_ct]);
  uintptr_t loader;
#ifdef __LP64__
  uintptr_t acc;
  // round down to a multiple of 12 words; one __m128i holds 2 words, hence
  // the word_ct / 2 vector count passed to the helper
  word_ct -= word_ct % 12;
  acc = count_01_vecs((__m128i*)quatervec, word_ct / 2);
  quatervec = &(quatervec[word_ct]);
#else
  const uintptr_t* quatervec_twelve_end = &(quatervec[word_ct - (word_ct % 12)]);
  uintptr_t acc = 0;
  while (quatervec < quatervec_twelve_end) {
    acc += count_01_12(quatervec);
    quatervec = &(quatervec[12]);
  }
#endif
  // leftover words: a field is 01 iff low bit set and high bit clear
  while (quatervec < quatervec_end) {
    loader = *quatervec++;
    acc += popcount2_long(loader & (~(loader >> 1)) & FIVEMASK);
  }
  return acc;
}
8081
fill_all_bits(uintptr_t ct,uintptr_t * bitarr)8082 void fill_all_bits(uintptr_t ct, uintptr_t* bitarr) {
8083 // leaves bits beyond the end unset
8084 // ok for ct == 0
8085 uintptr_t quotient = ct / BITCT;
8086 uintptr_t remainder = ct % BITCT;
8087 fill_ulong_one(quotient, bitarr);
8088 if (remainder) {
8089 bitarr[quotient] = (ONELU << remainder) - ONELU;
8090 }
8091 }
8092
numeric_range_list_to_bitarr(const Range_list * range_list_ptr,uint32_t item_ct,uint32_t offset,uint32_t ignore_overflow,uintptr_t * bitarr)8093 uint32_t numeric_range_list_to_bitarr(const Range_list* range_list_ptr, uint32_t item_ct, uint32_t offset, uint32_t ignore_overflow, uintptr_t* bitarr) {
8094 // bitarr assumed to be initialized
8095 const char* names = range_list_ptr->names;
8096 const unsigned char* starts_range = range_list_ptr->starts_range;
8097 uint32_t name_ct = range_list_ptr->name_ct;
8098 uint32_t name_max_len = range_list_ptr->name_max_len;
8099 uint32_t idx_max = item_ct + offset;
8100 uint32_t name_idx;
8101 uint32_t idx1;
8102 uint32_t idx2;
8103 for (name_idx = 0; name_idx < name_ct; name_idx++) {
8104 if (scan_uint_capped(&(names[name_idx * name_max_len]), idx_max, &idx1)) {
8105 if (ignore_overflow) {
8106 continue;
8107 }
8108 return 1;
8109 }
8110 if (starts_range[name_idx]) {
8111 name_idx++;
8112 if (scan_uint_capped(&(names[name_idx * name_max_len]), idx_max, &idx2)) {
8113 if (!ignore_overflow) {
8114 return 1;
8115 }
8116 idx2 = idx_max - 1;
8117 }
8118 fill_bits(idx1 - offset, (idx2 - idx1) + 1, bitarr);
8119 } else {
8120 set_bit(idx1 - offset, bitarr);
8121 }
8122 }
8123 return 0;
8124 }
8125
// Walks the tokens of header_line (fixed-width fields when fixed_len != 0,
// otherwise space-delimited), matches them against the sorted command-line
// names in sorted_ids, and sets the corresponding item-index bits in bitarr
// (filling whole spans for name ranges).  seen_idxs must be initialized to
// -1 by the caller; it records the item index at which each command-line
// name was matched.  Returns 0 on success, RET_INVALID_FORMAT on a
// duplicate token, RET_INVALID_CMDLINE on a missing/misordered token.
int32_t string_range_list_to_bitarr(char* header_line, uint32_t item_ct, uint32_t fixed_len, const Range_list* range_list_ptr, const char* __restrict sorted_ids, const uint32_t* __restrict id_map, const char* __restrict range_list_flag, const char* __restrict file_descrip, uintptr_t* bitarr, int32_t* __restrict seen_idxs) {
  // bitarr assumed to be initialized
  // if fixed_len is zero, header_line is assumed to be a list of
  // space-delimited unequal-length names
  uintptr_t max_id_len = range_list_ptr->name_max_len;
  uintptr_t name_ct = range_list_ptr->name_ct;
  uint32_t item_idx = 0;
  int32_t retval = 0;
  char* bufptr;
  uint32_t cmdline_pos;
  int32_t ii;
  while (1) {
    bufptr = token_endnn(header_line);
    // does the current header token match one of the command-line names?
    ii = bsearch_str(header_line, (uintptr_t)(bufptr - header_line), sorted_ids, max_id_len, name_ct);
    if (ii != -1) {
      cmdline_pos = id_map[(uint32_t)ii];
      if (seen_idxs[cmdline_pos] != -1) {
	sprintf(g_logbuf, "Error: Duplicate --%s token in %s.\n", range_list_flag, file_descrip);
	goto string_range_list_to_bitarr_ret_INVALID_FORMAT_2;
      }
      seen_idxs[cmdline_pos] = item_idx;
      if (cmdline_pos && range_list_ptr->starts_range[cmdline_pos - 1]) {
	// this name is the second endpoint of a range; the first endpoint
	// must already have been seen at a smaller item index
	if (seen_idxs[cmdline_pos - 1] == -1) {
	  LOGPREPRINTFWW("Error: Second element of --%s range appears before first element in %s.\n", range_list_flag, file_descrip);
	  goto string_range_list_to_bitarr_ret_INVALID_CMDLINE_2;
	}
	fill_bits(seen_idxs[cmdline_pos - 1], (item_idx - seen_idxs[cmdline_pos - 1]) + 1, bitarr);
      } else if (!(range_list_ptr->starts_range[cmdline_pos])) {
	// standalone name (range starts are deferred until their end is seen)
	SET_BIT(item_idx, bitarr);
      }
    }
    if (++item_idx == item_ct) {
      break;
    }
    // advance to the next header token
    if (fixed_len) {
      header_line = &(header_line[fixed_len]);
    } else {
      header_line = skip_initial_spaces(&(bufptr[1]));
    }
  }
  // every command-line name must have been matched
  for (cmdline_pos = 0; cmdline_pos < name_ct; cmdline_pos++) {
    if (seen_idxs[cmdline_pos] == -1) {
      goto string_range_list_to_bitarr_ret_INVALID_CMDLINE_3;
    }
  }
  while (0) {
  string_range_list_to_bitarr_ret_INVALID_CMDLINE_3:
    sprintf(g_logbuf, "Error: Missing --%s token in %s.\n", range_list_flag, file_descrip);
  string_range_list_to_bitarr_ret_INVALID_CMDLINE_2:
    logerrprintb();
    retval = RET_INVALID_CMDLINE;
    break;
  string_range_list_to_bitarr_ret_INVALID_FORMAT_2:
    logerrprintb();
    retval = RET_INVALID_FORMAT;
    break;
  }
  return retval;
}
8185
int32_t string_range_list_to_bitarr_alloc(char* header_line, uint32_t item_ct, uint32_t fixed_len, const Range_list* range_list_ptr, const char* __restrict range_list_flag, const char* __restrict file_descrip, uintptr_t** bitarr_ptr) {
  // wrapper for string_range_list_to_bitarr which allocates the bitfield and
  // temporary buffers on the heap
  uintptr_t item_ctl = BITCT_TO_WORDCT(item_ct);
  uintptr_t name_ct = range_list_ptr->name_ct;
  int32_t retval = 0;
  int32_t* seen_idxs;
  char* sorted_ids;
  uint32_t* id_map;
  // *bitarr_ptr is allocated first so it survives the bigstack_reset below
  if (bigstack_calloc_ul(item_ctl, bitarr_ptr) ||
      bigstack_alloc_i(name_ct, &seen_idxs)) {
    return RET_NOMEM;
  }
  // kludge to use sort_item_ids(): seen_idxs doubles as an all-zero
  // exclusion bitfield here (it is wide enough, since it holds name_ct
  // 32-bit ints but only BITCT_TO_WORDCT(name_ct) words are needed)
  fill_ulong_zero(BITCT_TO_WORDCT(name_ct), (uintptr_t*)seen_idxs);
  if (sort_item_ids(name_ct, (uintptr_t*)seen_idxs, 0, range_list_ptr->names, range_list_ptr->name_max_len, 0, 0, strcmp_deref, &sorted_ids, &id_map)) {
    return RET_NOMEM;
  }
  // now reuse seen_idxs for its real purpose: per-name "seen at item index"
  // markers, initialized to -1
  fill_int_one(name_ct, seen_idxs);
  retval = string_range_list_to_bitarr(header_line, item_ct, fixed_len, range_list_ptr, sorted_ids, id_map, range_list_flag, file_descrip, *bitarr_ptr, seen_idxs);
  // frees seen_idxs, sorted_ids, and id_map (allocated after *bitarr_ptr)
  bigstack_reset(seen_idxs);
  return retval;
}
8209
string_range_list_to_bitarr2(const char * __restrict sorted_ids,const uint32_t * id_map,uintptr_t item_ct,uintptr_t max_id_len,const Range_list * __restrict range_list_ptr,const char * __restrict range_list_flag,uintptr_t * bitfield_excl)8210 int32_t string_range_list_to_bitarr2(const char* __restrict sorted_ids, const uint32_t* id_map, uintptr_t item_ct, uintptr_t max_id_len, const Range_list* __restrict range_list_ptr, const char* __restrict range_list_flag, uintptr_t* bitfield_excl) {
8211 // sorted_ids/id_map is for e.g. marker IDs instead of command line
8212 // parameters. bitfield_excl is assumed to be initialized (since its length
8213 // is not known by this function).
8214 char* names = range_list_ptr->names;
8215 const unsigned char* starts_range = range_list_ptr->starts_range;
8216 uintptr_t name_max_len = range_list_ptr->name_max_len;
8217 uint32_t name_ct = range_list_ptr->name_ct;
8218 int32_t retval = 0;
8219 uint32_t param_idx;
8220 char* bufptr;
8221 uint32_t item_uidx;
8222 uint32_t item_uidx2;
8223 int32_t ii;
8224 for (param_idx = 0; param_idx < name_ct; param_idx++) {
8225 bufptr = &(names[param_idx * name_max_len]);
8226 ii = bsearch_str_nl(bufptr, sorted_ids, max_id_len, item_ct);
8227 if (ii == -1) {
8228 goto string_range_list_to_bitarr2_ret_INVALID_CMDLINE_3;
8229 }
8230 item_uidx = id_map[(uint32_t)ii];
8231 if (starts_range[param_idx]) {
8232 param_idx++;
8233 bufptr = &(names[param_idx * name_max_len]);
8234 ii = bsearch_str_nl(bufptr, sorted_ids, max_id_len, item_ct);
8235 if (ii == -1) {
8236 goto string_range_list_to_bitarr2_ret_INVALID_CMDLINE_3;
8237 }
8238 item_uidx2 = id_map[(uint32_t)ii];
8239 if (item_uidx2 < item_uidx) {
8240 sprintf(g_logbuf, "Error: Second element of --%s range appears before first.\n", range_list_flag);
8241 goto string_range_list_to_bitarr2_ret_INVALID_CMDLINE_2;
8242 }
8243 clear_bits(item_uidx, item_uidx2 - item_uidx + 1, bitfield_excl);
8244 } else {
8245 clear_bit(item_uidx, bitfield_excl);
8246 }
8247 }
8248 while (0) {
8249 string_range_list_to_bitarr2_ret_INVALID_CMDLINE_3:
8250 sprintf(g_logbuf, "Error: --%s ID not found.\n", range_list_flag);
8251 string_range_list_to_bitarr2_ret_INVALID_CMDLINE_2:
8252 logerrprintb();
8253 retval = RET_INVALID_CMDLINE;
8254 break;
8255 }
8256 return retval;
8257 }
8258
count_non_autosomal_markers(const Chrom_info * chrom_info_ptr,const uintptr_t * marker_exclude,uint32_t count_x,uint32_t count_mt)8259 uint32_t count_non_autosomal_markers(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t count_x, uint32_t count_mt) {
8260 // for backward compatibility, unplaced markers are considered to be
8261 // autosomal here
8262 const int32_t x_code = chrom_info_ptr->xymt_codes[X_OFFSET];
8263 const int32_t y_code = chrom_info_ptr->xymt_codes[Y_OFFSET];
8264 const int32_t mt_code = chrom_info_ptr->xymt_codes[MT_OFFSET];
8265 uint32_t ct = 0;
8266 if (count_x && (x_code != -2)) {
8267 ct += count_chrom_markers(chrom_info_ptr, marker_exclude, x_code);
8268 }
8269 if (y_code != -2) {
8270 ct += count_chrom_markers(chrom_info_ptr, marker_exclude, y_code);
8271 }
8272 if (count_mt && (mt_code != -2)) {
8273 ct += count_chrom_markers(chrom_info_ptr, marker_exclude, mt_code);
8274 }
8275 return ct;
8276 }
8277
// If all markers are autosomal (or pseudoautosomal) diploid, this is a
// no-op (returns 0, *marker_exclude_ptr untouched).  Otherwise it allocates
// a marker_exclude copy with the non-{autosomal diploid} markers excluded
// and reports how many were newly excluded.  Returns RET_INVALID_CMDLINE if
// nothing would remain, RET_NOMEM on allocation failure.
int32_t conditional_allocate_non_autosomal_markers(const Chrom_info* chrom_info_ptr, uintptr_t unfiltered_marker_ct, const uintptr_t* marker_exclude_orig, uint32_t marker_ct, uint32_t count_x, uint32_t count_mt, const char* calc_descrip, uintptr_t** marker_exclude_ptr, uint32_t* newly_excluded_ct_ptr) {
  // if all markers are autosomal (or pseudoautosomal) diploid, nothing
  // happens. otherwise, this creates a marker_exclude copy with
  // non-{autosomal diploid} markers excluded for the caller to use.
  const uintptr_t unfiltered_marker_ctl = BITCT_TO_WORDCT(unfiltered_marker_ct);
  const int32_t* xymt_codes = chrom_info_ptr->xymt_codes;
  uint32_t xymt_cts[XYMT_OFFSET_CT];
  fill_uint_zero(XYMT_OFFSET_CT, xymt_cts);
  if (is_set(chrom_info_ptr->haploid_mask, 0)) {
    // haploid-genome dataset: every remaining marker is affected, which
    // triggers the "No variants remaining" error below
    *newly_excluded_ct_ptr = marker_ct;
  } else {
    // count chrX (optional), chrY (always), chrMT (optional); -2 = absent
    if (count_x && (xymt_codes[X_OFFSET] != -2)) {
      xymt_cts[X_OFFSET] = count_chrom_markers(chrom_info_ptr, marker_exclude_orig, xymt_codes[X_OFFSET]);
    }
    if (xymt_codes[Y_OFFSET] != -2) {
      xymt_cts[Y_OFFSET] = count_chrom_markers(chrom_info_ptr, marker_exclude_orig, xymt_codes[Y_OFFSET]);
    }
    if (count_mt && (xymt_codes[MT_OFFSET] != -2)) {
      xymt_cts[MT_OFFSET] = count_chrom_markers(chrom_info_ptr, marker_exclude_orig, xymt_codes[MT_OFFSET]);
    }
    *newly_excluded_ct_ptr = xymt_cts[X_OFFSET] + xymt_cts[Y_OFFSET] + xymt_cts[MT_OFFSET];
  }
  if (*newly_excluded_ct_ptr) {
    LOGPRINTF("Excluding %u variant%s on non-autosomes from %s.\n", *newly_excluded_ct_ptr, (*newly_excluded_ct_ptr == 1)? "" : "s", calc_descrip);
  }
  if (*newly_excluded_ct_ptr == marker_ct) {
    logerrprint("Error: No variants remaining.\n");
    return RET_INVALID_CMDLINE;
  }
  if (!(*newly_excluded_ct_ptr)) {
    // nothing to exclude; caller keeps using marker_exclude_orig
    return 0;
  }
  if (bigstack_alloc_ul(unfiltered_marker_ctl, marker_exclude_ptr)) {
    return RET_NOMEM;
  }
  memcpy(*marker_exclude_ptr, marker_exclude_orig, unfiltered_marker_ctl * sizeof(intptr_t));
  // mark every variant in each affected chromosome's file-order span
  for (uint32_t xymt_idx = 0; xymt_idx < XYMT_OFFSET_CT; ++xymt_idx) {
    if (xymt_cts[xymt_idx]) {
      const uint32_t chrom_fo_idx = chrom_info_ptr->chrom_idx_to_foidx[xymt_codes[xymt_idx]];
      fill_bits(chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx], chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1] - chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx], *marker_exclude_ptr);
    }
  }
  return 0;
}
8322
get_max_chrom_size(const Chrom_info * chrom_info_ptr,const uintptr_t * marker_exclude,uint32_t * last_chrom_fo_idx_ptr)8323 uint32_t get_max_chrom_size(const Chrom_info* chrom_info_ptr, const uintptr_t* marker_exclude, uint32_t* last_chrom_fo_idx_ptr) {
8324 const uint32_t chrom_ct = chrom_info_ptr->chrom_ct;
8325 uint32_t max_chrom_size = 0;
8326 uint32_t last_chrom_fo_idx = 0;
8327 for (uint32_t chrom_fo_idx = 0; chrom_fo_idx < chrom_ct; chrom_fo_idx++) {
8328 const uint32_t cur_chrom_size = count_chrom_markers(chrom_info_ptr, marker_exclude, chrom_info_ptr->chrom_file_order[chrom_fo_idx]);
8329 if (cur_chrom_size) {
8330 last_chrom_fo_idx = chrom_fo_idx;
8331 if (cur_chrom_size > max_chrom_size) {
8332 max_chrom_size = cur_chrom_size;
8333 }
8334 }
8335 }
8336 if (last_chrom_fo_idx_ptr) {
8337 *last_chrom_fo_idx_ptr = last_chrom_fo_idx;
8338 }
8339 return max_chrom_size;
8340 }
8341
count_genders(const uintptr_t * __restrict sex_nm,const uintptr_t * __restrict sex_male,const uintptr_t * __restrict sample_exclude,uintptr_t unfiltered_sample_ct,uint32_t * __restrict male_ct_ptr,uint32_t * __restrict female_ct_ptr,uint32_t * __restrict unk_ct_ptr)8342 void count_genders(const uintptr_t* __restrict sex_nm, const uintptr_t* __restrict sex_male, const uintptr_t* __restrict sample_exclude, uintptr_t unfiltered_sample_ct, uint32_t* __restrict male_ct_ptr, uint32_t* __restrict female_ct_ptr, uint32_t* __restrict unk_ct_ptr) {
8343 // unfiltered_sample_ct can be zero
8344 uint32_t male_ct = 0;
8345 uint32_t female_ct = 0;
8346 uint32_t unk_ct = 0;
8347 uint32_t unfiltered_sample_ctld = unfiltered_sample_ct / BITCT;
8348 uint32_t unfiltered_sample_ct_rem = unfiltered_sample_ct & (BITCT - 1);
8349 uintptr_t ulii;
8350 uintptr_t uljj;
8351 uintptr_t sample_bidx;
8352 for (sample_bidx = 0; sample_bidx < unfiltered_sample_ctld; sample_bidx++) {
8353 ulii = ~(*sample_exclude++);
8354 count_genders_last_loop:
8355 uljj = *sex_nm++;
8356 unk_ct += popcount_long(ulii & (~uljj));
8357 ulii &= uljj;
8358 uljj = *sex_male++;
8359 male_ct += popcount_long(ulii & uljj);
8360 female_ct += popcount_long(ulii & (~uljj));
8361 }
8362 if (unfiltered_sample_ct_rem) {
8363 ulii = (~(*sample_exclude)) & ((ONELU << unfiltered_sample_ct_rem) - ONELU);
8364 unfiltered_sample_ct_rem = 0;
8365 goto count_genders_last_loop;
8366 }
8367 *male_ct_ptr = male_ct;
8368 *female_ct_ptr = female_ct;
8369 *unk_ct_ptr = unk_ct;
8370 }
8371
// Flips every 2-bit genotype field in loadbuf between 00 and 11, leaving
// 01/10 untouched (i.e. swaps major/minor homozygote encodings in place).
// Dispatches on buffer alignment: 16-byte-aligned -> SSE2, 4-byte-aligned
// -> 32-bit words, otherwise byte-at-a-time; then zeroes the pad bits past
// the final sample.  unfiltered_sample_ct can be zero.
void reverse_loadbuf(uintptr_t unfiltered_sample_ct, unsigned char* loadbuf) {
  // unfiltered_sample_ct can be zero
  uintptr_t sample_bidx = 0;
  unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
  unsigned char ucc;
  unsigned char ucc2;
  uintptr_t unfiltered_sample_ctd;
  uint32_t* loadbuf_alias32;
  uint32_t uii;
  uint32_t ujj;
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i* loadbuf_alias;
  __m128i vii;
  __m128i vjj;
  // todo: use this vector loop even when loadbuf is unaligned, so stuff like
  // recode_load_to() is faster
  if (!(((uintptr_t)loadbuf) & 15)) {
    loadbuf_alias = (__m128i*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / 64;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      vii = *loadbuf_alias;
      // we want to exchange 00 and 11, and leave 01/10 untouched. So make
      // vjj := 11 iff vii is 00/11, and vjj := 00 otherwise; then xor.
      vjj = _mm_andnot_si128(_mm_xor_si128(vii, _mm_srli_epi64(vii, 1)), m1);
      vjj = _mm_or_si128(vjj, _mm_slli_epi64(vjj, 1));
      *loadbuf_alias++ = _mm_xor_si128(vii, vjj);
    }
    loadbuf = (unsigned char*)loadbuf_alias;
  } else if (!(((uintptr_t)loadbuf) & 3)) {
    // 32-bit-word version of the same trick: ujj gets a 1 in the low bit of
    // each 00/11 field, then * 3 spreads it to both bits before the xor
    loadbuf_alias32 = (uint32_t*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      ujj = 0x55555555 & (~(uii ^ (uii >> 1)));
      ujj *= 3;
      *loadbuf_alias32++ = uii ^ ujj;
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
  }
#else
  if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      ujj = 0x55555555 & (~(uii ^ (uii >> 1)));
      ujj *= 3;
      *loadbuf_alias32++ = uii ^ ujj;
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
  }
#endif
  // leftover (and unaligned-buffer) bytes
  for (; loadbuf < loadbuf_end;) {
    ucc = *loadbuf;
    ucc2 = 0x55 & (~(ucc ^ (ucc >> 1)));
    ucc2 *= 3;
    *loadbuf++ = ucc ^ ucc2;
  }
  // clear the 2-bit fields past the last sample in the final byte (the 00
  // fields there would otherwise have been flipped to 11)
  uii = unfiltered_sample_ct & 3;
  if (uii) {
    loadbuf[-1] &= (0xff >> (8 - 2 * uii));
  }
}
8436
8437 // deprecated, try to just use copy_quaterarr_nonempty_subset()
copy_quaterarr_nonempty_subset_excl(const uintptr_t * __restrict raw_quaterarr,const uintptr_t * __restrict subset_excl,uint32_t raw_quaterarr_size,uint32_t subset_size,uintptr_t * __restrict output_quaterarr)8438 void copy_quaterarr_nonempty_subset_excl(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_excl, uint32_t raw_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict output_quaterarr) {
8439 assert(subset_size);
8440 assert(raw_quaterarr_size >= subset_size);
8441 uintptr_t cur_output_word = 0;
8442 uintptr_t* output_quaterarr_last = &(output_quaterarr[subset_size / BITCT2]);
8443 const uint32_t word_write_halfshift_end = subset_size % BITCT2;
8444 uint32_t word_write_halfshift = 0;
8445 // if < 2/3-filled, use sparse copy algorithm
8446 if (subset_size * (3 * ONELU) < raw_quaterarr_size * (2 * ONELU)) {
8447 const uint32_t subset_excl_widx_last = raw_quaterarr_size / BITCT;
8448 uint32_t subset_excl_widx = 0;
8449 while (1) {
8450 uintptr_t cur_include_word = ~subset_excl[subset_excl_widx];
8451
8452 // this, kiddies, is why exclude masks were a mistake.
8453 if (subset_excl_widx == subset_excl_widx_last) {
8454 cur_include_word &= (ONELU << (raw_quaterarr_size % BITCT)) - ONELU;
8455 }
8456
8457 if (cur_include_word) {
8458 uint32_t wordhalf_idx = 0;
8459 #ifdef __LP64__
8460 uint32_t cur_include_halfword = (uint32_t)cur_include_word;
8461 #else
8462 uint32_t cur_include_halfword = (uint16_t)cur_include_word;
8463 #endif
8464 while (1) {
8465 if (cur_include_halfword) {
8466 uintptr_t raw_quaterarr_word = raw_quaterarr[subset_excl_widx * 2 + wordhalf_idx];
8467 do {
8468 uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
8469 cur_output_word |= ((raw_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
8470 if (++word_write_halfshift == BITCT2) {
8471 *output_quaterarr++ = cur_output_word;
8472 word_write_halfshift = 0;
8473 cur_output_word = 0;
8474 }
8475 cur_include_halfword &= cur_include_halfword - 1;
8476 } while (cur_include_halfword);
8477 }
8478 if (wordhalf_idx) {
8479 break;
8480 }
8481 wordhalf_idx++;
8482 #ifdef __LP64__
8483 cur_include_halfword = cur_include_word >> 32;
8484 #else
8485 cur_include_halfword = cur_include_word >> 16;
8486 #endif
8487 }
8488 if (output_quaterarr == output_quaterarr_last) {
8489 if (word_write_halfshift == word_write_halfshift_end) {
8490 if (word_write_halfshift_end) {
8491 *output_quaterarr_last = cur_output_word;
8492 }
8493 return;
8494 }
8495 }
8496 }
8497 subset_excl_widx++;
8498 }
8499 }
8500 // blocked copy
8501 const uintptr_t* subset_excl_last = &(subset_excl[raw_quaterarr_size / BITCT]);
8502 while (1) {
8503 uintptr_t cur_include_word = ~(*subset_excl);
8504 if (subset_excl == subset_excl_last) {
8505 cur_include_word &= (ONELU << (raw_quaterarr_size % BITCT)) - ONELU;
8506 }
8507 subset_excl++;
8508 uint32_t wordhalf_idx = 0;
8509 #ifdef __LP64__
8510 uintptr_t cur_include_halfword = (uint32_t)cur_include_word;
8511 #else
8512 uint32_t cur_include_halfword = (uint16_t)cur_include_word;
8513 #endif
8514 while (1) {
8515 uintptr_t raw_quaterarr_word = *raw_quaterarr++;
8516 while (cur_include_halfword) {
8517 uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
8518 uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
8519 uintptr_t raw_quaterarr_curblock_unmasked = raw_quaterarr_word >> (rqa_idx_lowbits * 2);
8520 uint32_t rqa_block_len = CTZLU(halfword_invshifted);
8521 uint32_t block_len_limit = BITCT2 - word_write_halfshift;
8522 cur_output_word |= raw_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
8523 if (rqa_block_len < block_len_limit) {
8524 word_write_halfshift += rqa_block_len;
8525 cur_output_word &= (ONELU << (word_write_halfshift * 2)) - ONELU;
8526 } else {
8527 // no need to mask, extra bits vanish off the high end
8528 *output_quaterarr++ = cur_output_word;
8529 word_write_halfshift = rqa_block_len - block_len_limit;
8530 cur_output_word = (raw_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((ONELU << (2 * word_write_halfshift)) - ONELU);
8531 }
8532 cur_include_halfword &= (~(ONELU << (rqa_block_len + rqa_idx_lowbits))) + ONELU;
8533 }
8534 if (wordhalf_idx) {
8535 break;
8536 }
8537 wordhalf_idx++;
8538 #ifdef __LP64__
8539 cur_include_halfword = cur_include_word >> 32;
8540 #else
8541 cur_include_halfword = cur_include_word >> 16;
8542 #endif
8543 }
8544 if (output_quaterarr == output_quaterarr_last) {
8545 if (word_write_halfshift == word_write_halfshift_end) {
8546 if (word_write_halfshift_end) {
8547 *output_quaterarr_last = cur_output_word;
8548 }
8549 return;
8550 }
8551 }
8552 }
8553 }
8554
load_and_collapse(uint32_t unfiltered_sample_ct,uint32_t sample_ct,const uintptr_t * __restrict sample_exclude,uintptr_t final_mask,uint32_t do_reverse,FILE * bedfile,uintptr_t * __restrict rawbuf,uintptr_t * __restrict mainbuf)8555 uint32_t load_and_collapse(uint32_t unfiltered_sample_ct, uint32_t sample_ct, const uintptr_t* __restrict sample_exclude, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict mainbuf) {
8556 assert(unfiltered_sample_ct);
8557 uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
8558 if (unfiltered_sample_ct == sample_ct) {
8559 rawbuf = mainbuf;
8560 }
8561 if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
8562 return RET_READ_FAIL;
8563 }
8564 if (unfiltered_sample_ct != sample_ct) {
8565 copy_quaterarr_nonempty_subset_excl(rawbuf, sample_exclude, unfiltered_sample_ct, sample_ct, mainbuf);
8566 } else {
8567 rawbuf[(unfiltered_sample_ct - 1) / BITCT2] &= final_mask;
8568 }
8569 if (do_reverse) {
8570 reverse_loadbuf(sample_ct, (unsigned char*)mainbuf);
8571 }
8572 return 0;
8573 }
8574
void copy_quaterarr_nonempty_subset(const uintptr_t* __restrict raw_quaterarr, const uintptr_t* __restrict subset_mask, uint32_t raw_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict output_quaterarr) {
  // Copies the 2-bit entries of raw_quaterarr whose indices have SET bits in
  // subset_mask into output_quaterarr, packed contiguously.  subset_size must
  // be nonzero and equal to the number of set subset_mask bits below
  // raw_quaterarr_size.
  //
  // in plink 2.0, we probably want (0-based) bit raw_quaterarr_size of
  // subset_mask to be always allocated and unset.  This removes a few special
  // cases re: iterating past the end of arrays.
  assert(subset_size);
  assert(raw_quaterarr_size >= subset_size);
  uintptr_t cur_output_word = 0;
  // position of the final (possibly partial) output word
  uintptr_t* output_quaterarr_last = &(output_quaterarr[subset_size / BITCT2]);
  // number of entries belonging in the final output word (0 = exactly full)
  const uint32_t word_write_halfshift_end = subset_size % BITCT2;
  uint32_t word_write_halfshift = 0;
  // if < 2/3-filled, use sparse copy algorithm
  if (subset_size * (3 * ONELU) < raw_quaterarr_size * (2 * ONELU)) {
    uint32_t subset_mask_widx = 0;
    while (1) {
      const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
      if (cur_include_word) {
	uint32_t wordhalf_idx = 0;
	// each mask word covers BITCT entries = 2 quaterarr words; process it
	// as two halfwords so (bit index * 2) stays inside one quaterarr word
#ifdef __LP64__
	uint32_t cur_include_halfword = (uint32_t)cur_include_word;
#else
	uint32_t cur_include_halfword = (uint16_t)cur_include_word;
#endif
	while (1) {
	  if (cur_include_halfword) {
	    uintptr_t raw_quaterarr_word = raw_quaterarr[subset_mask_widx * 2 + wordhalf_idx];
	    do {
	      // copy one 2-bit entry per set mask bit
	      uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
	      cur_output_word |= ((raw_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
	      if (++word_write_halfshift == BITCT2) {
		// output word full; flush it
		*output_quaterarr++ = cur_output_word;
		word_write_halfshift = 0;
		cur_output_word = 0;
	      }
	      cur_include_halfword &= cur_include_halfword - 1;
	    } while (cur_include_halfword);
	  }
	  if (wordhalf_idx) {
	    break;
	  }
	  wordhalf_idx++;
#ifdef __LP64__
	  cur_include_halfword = cur_include_word >> 32;
#else
	  cur_include_halfword = cur_include_word >> 16;
#endif
	}
	if (output_quaterarr == output_quaterarr_last) {
	  if (word_write_halfshift == word_write_halfshift_end) {
	    // all subset_size entries written; flush partial final word
	    if (word_write_halfshift_end) {
	      *output_quaterarr_last = cur_output_word;
	    }
	    return;
	  }
	}
      }
      subset_mask_widx++;
    }
  }
  // blocked copy: move runs of consecutive included entries at once
  while (1) {
    const uintptr_t cur_include_word = *subset_mask++;
    uint32_t wordhalf_idx = 0;
#ifdef __LP64__
    uintptr_t cur_include_halfword = (uint32_t)cur_include_word;
#else
    uint32_t cur_include_halfword = (uint16_t)cur_include_word;
#endif
    while (1) {
      uintptr_t raw_quaterarr_word = *raw_quaterarr++;
      while (cur_include_halfword) {
	uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
	// rqa_block_len = length of current run of set mask bits
	uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
	uintptr_t raw_quaterarr_curblock_unmasked = raw_quaterarr_word >> (rqa_idx_lowbits * 2);
	uint32_t rqa_block_len = CTZLU(halfword_invshifted);
	// room remaining in the current output word, in entries
	uint32_t block_len_limit = BITCT2 - word_write_halfshift;
	cur_output_word |= raw_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
	if (rqa_block_len < block_len_limit) {
	  word_write_halfshift += rqa_block_len;
	  // mask off garbage bits copied above the block
	  cur_output_word &= (ONELU << (word_write_halfshift * 2)) - ONELU;
	} else {
	  // no need to mask, extra bits vanish off the high end
	  *output_quaterarr++ = cur_output_word;
	  word_write_halfshift = rqa_block_len - block_len_limit;
	  if (word_write_halfshift) {
	    cur_output_word = (raw_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((ONELU << (2 * word_write_halfshift)) - ONELU);
	  } else {
	    // avoid potential right-shift-64
	    cur_output_word = 0;
	  }
	}
	// clear the mask bits belonging to the run just copied
	cur_include_halfword &= (~(ONELU << (rqa_block_len + rqa_idx_lowbits))) + ONELU;
      }
      if (wordhalf_idx) {
	break;
      }
      wordhalf_idx++;
#ifdef __LP64__
      cur_include_halfword = cur_include_word >> 32;
#else
      cur_include_halfword = cur_include_word >> 16;
#endif
    }
    if (output_quaterarr == output_quaterarr_last) {
      if (word_write_halfshift == word_write_halfshift_end) {
	// all subset_size entries written; flush partial final word
	if (word_write_halfshift_end) {
	  *output_quaterarr_last = cur_output_word;
	}
	return;
      }
    }
  }
}
8687
8688 /*
8689 void inplace_quaterarr_proper_subset(const uintptr_t* __restrict subset_mask, uint32_t orig_quaterarr_size, uint32_t subset_size, uintptr_t* __restrict main_quaterarr) {
8690 assert(orig_quaterarr_size > subset_size);
8691 // worthwhile to special-case this since we get to entirely skip
8692 // reading/writing these words
8693 if (!(~subset_mask[0])) {
8694 const uintptr_t* subset_mask_initial = subset_mask;
8695 // guaranteed to terminate since orig_quaterarr_size > subset_size.
8696 do {
8697 subset_mask++;
8698 } while (!(~subset_mask[0]));
8699 const uint32_t quaterarr_word_skip_ct = 2 * ((uintptr_t)(subset_mask - subset_mask_initial));
8700 main_quaterarr = &(main_quaterarr[quaterarr_word_skip_ct]);
8701 const uint32_t item_skip_ct = quaterarr_word_skip_ct * BITCT2;
8702 orig_quaterarr_size -= item_skip_ct;
8703 subset_size -= item_skip_ct;
8704 }
8705 uintptr_t cur_output_word = 0;
8706 uintptr_t* main_quaterarr_writer = main_quaterarr;
8707 uintptr_t* main_quaterarr_write_last = &(main_quaterarr[subset_size / BITCT2]);
8708 const uint32_t word_write_halfshift_end = subset_size % BITCT2;
8709 uint32_t word_write_halfshift = 0;
8710 // if <= 2/3-filled, use sparse copy algorithm
8711 if (subset_size * (3 * ONELU) <= orig_quaterarr_size * (2 * ONELU)) {
8712 uint32_t subset_mask_widx = 0;
8713 while (1) {
8714 const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
8715 if (cur_include_word) {
8716 uint32_t wordhalf_idx = 0;
8717 #ifdef __LP64__
8718 uint32_t cur_include_halfword = (uint32_t)cur_include_word;
8719 #else
8720 uint32_t cur_include_halfword = (uint16_t)cur_include_word;
8721 #endif
8722 while (1) {
8723 if (cur_include_halfword) {
8724 uintptr_t orig_quaterarr_word = main_quaterarr[subset_mask_widx * 2 + wordhalf_idx];
8725 do {
8726 uint32_t rqa_idx_lowbits = __builtin_ctz(cur_include_halfword);
8727 cur_output_word |= ((orig_quaterarr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
8728 if (++word_write_halfshift == BITCT2) {
8729 *main_quaterarr_writer++ = cur_output_word;
8730 word_write_halfshift = 0;
8731 cur_output_word = 0;
8732 }
8733 cur_include_halfword &= cur_include_halfword - 1;
8734 } while (cur_include_halfword);
8735 }
8736 if (wordhalf_idx) {
8737 break;
8738 }
8739 wordhalf_idx++;
8740 #ifdef __LP64__
8741 cur_include_halfword = cur_include_word >> 32;
8742 #else
8743 cur_include_halfword = cur_include_word >> 16;
8744 #endif
8745 }
8746 if (main_quaterarr_writer == main_quaterarr_write_last) {
8747 if (word_write_halfshift == word_write_halfshift_end) {
8748 if (word_write_halfshift_end) {
8749 *main_quaterarr_writer = cur_output_word;
8750 }
8751 return;
8752 }
8753 }
8754 }
8755 subset_mask_widx++;
8756 }
8757 }
8758 // blocked copy
8759 while (1) {
8760 const uintptr_t cur_include_word = *subset_mask++;
8761 uint32_t wordhalf_idx = 0;
8762 #ifdef __LP64__
8763 uintptr_t cur_include_halfword = (uint32_t)cur_include_word;
8764 #else
8765 uint32_t cur_include_halfword = (uint16_t)cur_include_word;
8766 #endif
8767 while (1) {
8768 uintptr_t orig_quaterarr_word = *main_quaterarr++;
8769 while (cur_include_halfword) {
8770 uint32_t rqa_idx_lowbits = CTZLU(cur_include_halfword);
8771 uintptr_t halfword_invshifted = (~cur_include_halfword) >> rqa_idx_lowbits;
8772 uintptr_t orig_quaterarr_curblock_unmasked = orig_quaterarr_word >> (rqa_idx_lowbits * 2);
8773 uint32_t rqa_block_len = CTZLU(halfword_invshifted);
8774 uint32_t block_len_limit = BITCT2 - word_write_halfshift;
8775 cur_output_word |= orig_quaterarr_curblock_unmasked << (2 * word_write_halfshift);
8776 if (rqa_block_len < block_len_limit) {
8777 word_write_halfshift += rqa_block_len;
8778 cur_output_word &= (ONELU << (word_write_halfshift * 2)) - ONELU;
8779 } else {
8780 // no need to mask, extra bits vanish off the high end
8781
8782 *main_quaterarr_writer++ = cur_output_word;
8783 word_write_halfshift = rqa_block_len - block_len_limit;
8784 if (word_write_halfshift) {
8785 cur_output_word = (orig_quaterarr_curblock_unmasked >> (2 * block_len_limit)) & ((ONELU << (2 * word_write_halfshift)) - ONELU);
8786 } else {
8787 cur_output_word = 0;
8788 }
8789 }
8790 cur_include_halfword &= (~(ONELU << (rqa_block_len + rqa_idx_lowbits))) + ONELU;
8791 }
8792 if (wordhalf_idx) {
8793 break;
8794 }
8795 wordhalf_idx++;
8796 #ifdef __LP64__
8797 cur_include_halfword = cur_include_word >> 32;
8798 #else
8799 cur_include_halfword = cur_include_word >> 16;
8800 #endif
8801 }
8802 if (main_quaterarr_writer == main_quaterarr_write_last) {
8803 if (word_write_halfshift == word_write_halfshift_end) {
8804 if (word_write_halfshift_end) {
8805 *main_quaterarr_writer = cur_output_word;
8806 }
8807 return;
8808 }
8809 }
8810 }
8811 }
8812 */
8813
load_and_collapse_incl(uint32_t unfiltered_sample_ct,uint32_t sample_ct,const uintptr_t * __restrict sample_include,uintptr_t final_mask,uint32_t do_reverse,FILE * bedfile,uintptr_t * __restrict rawbuf,uintptr_t * __restrict mainbuf)8814 uint32_t load_and_collapse_incl(uint32_t unfiltered_sample_ct, uint32_t sample_ct, const uintptr_t* __restrict sample_include, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict mainbuf) {
8815 assert(unfiltered_sample_ct);
8816 uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
8817 if (unfiltered_sample_ct == sample_ct) {
8818 rawbuf = mainbuf;
8819 }
8820 if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
8821 return RET_READ_FAIL;
8822 }
8823 if (unfiltered_sample_ct != sample_ct) {
8824 copy_quaterarr_nonempty_subset(rawbuf, sample_include, unfiltered_sample_ct, sample_ct, mainbuf);
8825 } else {
8826 mainbuf[(unfiltered_sample_ct - 1) / BITCT2] &= final_mask;
8827 }
8828 if (do_reverse) {
8829 reverse_loadbuf(sample_ct, (unsigned char*)mainbuf);
8830 }
8831 return 0;
8832 }
8833
8834 /*
8835 uint32_t load_and_collapse_incl_inplace(const uintptr_t* __restrict sample_include, uint32_t unfiltered_sample_ct, uint32_t sample_ct, uintptr_t final_mask, uint32_t do_reverse, FILE* bedfile, uintptr_t* __restrict mainbuf) {
8836 // mainbuf must be large enough to store unfiltered data
8837 uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
8838 if (load_raw(unfiltered_sample_ct4, bedfile, mainbuf)) {
8839 return RET_READ_FAIL;
8840 }
8841 if (unfiltered_sample_ct == sample_ct) {
8842 mainbuf[(unfiltered_sample_ct - 1) / BITCT2] &= final_mask;
8843 } else {
8844 inplace_quaterarr_proper_subset(sample_include, unfiltered_sample_ct, sample_ct, mainbuf);
8845 }
8846 if (do_reverse) {
8847 reverse_loadbuf(sample_ct, (unsigned char*)mainbuf);
8848 }
8849 return 0;
8850 }
8851 */
8852
uint32_t load_and_split(uint32_t unfiltered_sample_ct, const uintptr_t* __restrict pheno_nm, const uintptr_t* __restrict pheno_c, FILE* bedfile, uintptr_t* __restrict rawbuf, uintptr_t* __restrict casebuf, uintptr_t* __restrict ctrlbuf) {
  // Reads one variant record into rawbuf, then splits its 2-bit genotype
  // entries by phenotype: samples with their pheno_nm bit set are appended
  // (packed) to casebuf when their pheno_c bit is also set, to ctrlbuf
  // otherwise; samples with pheno_nm clear are skipped.
  // Returns 0 on success, RET_READ_FAIL on read failure.
  // add do_reverse later if needed
  // rawbuf_end initially covers only the full words; a trailing partial word
  // is handled by the outer loop's second pass (see bottom)
  uintptr_t* rawbuf_end = &(rawbuf[unfiltered_sample_ct / BITCT2]);
  uintptr_t case_word = 0;
  uintptr_t ctrl_word = 0;
  uint32_t unfiltered_sample_ct4 = (unfiltered_sample_ct + 3) / 4;
  uint32_t case_shift2 = 0;  // bit offset of the next write into case_word
  uint32_t ctrl_shift2 = 0;  // bit offset of the next write into ctrl_word
  uint32_t read_shift_max = BITCT2;
  uint32_t sample_uidx = 0;
  uint32_t read_shift;
  uintptr_t read_word;
  uintptr_t ulii;
  if (load_raw(unfiltered_sample_ct4, bedfile, rawbuf)) {
    return RET_READ_FAIL;
  }
  while (1) {
    while (rawbuf < rawbuf_end) {
      read_word = *rawbuf++;
      for (read_shift = 0; read_shift < read_shift_max; sample_uidx++, read_shift++) {
	if (is_set(pheno_nm, sample_uidx)) {
	  ulii = read_word & 3;  // current sample's 2-bit genotype
	  if (is_set(pheno_c, sample_uidx)) {
	    case_word |= ulii << case_shift2;
	    case_shift2 += 2;
	    if (case_shift2 == BITCT) {
	      // case output word full; flush it
	      *casebuf++ = case_word;
	      case_word = 0;
	      case_shift2 = 0;
	    }
	  } else {
	    ctrl_word |= ulii << ctrl_shift2;
	    ctrl_shift2 += 2;
	    if (ctrl_shift2 == BITCT) {
	      // control output word full; flush it
	      *ctrlbuf++ = ctrl_word;
	      ctrl_word = 0;
	      ctrl_shift2 = 0;
	    }
	  }
	}
	read_word >>= 2;
      }
    }
    if (sample_uidx == unfiltered_sample_ct) {
      // done; flush any partially-filled output words
      if (case_shift2) {
	*casebuf = case_word;
      }
      if (ctrl_shift2) {
	*ctrlbuf = ctrl_word;
      }
      return 0;
    }
    // extend the range by one word and process the remaining partial word
    rawbuf_end++;
    read_shift_max = unfiltered_sample_ct % BITCT2;
  }
}
8909
void init_quaterarr_from_bitarr(const uintptr_t* __restrict bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict new_quaterarr) {
  // Expands bitarr into a quaterarr: 2-bit entry i of new_quaterarr becomes
  // 01 when bit i of bitarr is set, 00 when clear.  Writes two quaterarr
  // words per bitarr word, then zeroes the entries past
  // unfiltered_sample_ct in the final word pair.
  // allows unfiltered_sample_ct == 0
  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  uintptr_t ulmm;
  uint32_t bit_idx;
  while (unfiltered_sample_ctl) {
    // invert so set bits mark the entries to clear from the all-01 start
    ulii = ~(*bitarr++);
    ulkk = FIVEMASK;
    ulmm = FIVEMASK;
    if (ulii) {
      // uljj covers the high halfword (second output word), ulii the low
      uljj = ulii >> BITCT2;
#ifdef __LP64__
      ulii &= 0xffffffffLLU;
#else
      ulii &= 0xffffLU;
#endif
      if (ulii) {
	do {
	  bit_idx = CTZLU(ulii);
	  ulkk &= ~(ONELU << (bit_idx * 2));
	  ulii &= ulii - 1;
	} while (ulii);
      }
      if (uljj) {
	do {
	  bit_idx = CTZLU(uljj);
	  ulmm &= ~(ONELU << (bit_idx * 2));
	  uljj &= uljj - 1;
	} while (uljj);
      }
    }
    *new_quaterarr++ = ulkk;
    *new_quaterarr++ = ulmm;
    --unfiltered_sample_ctl;
  }
  // zero out the entries past unfiltered_sample_ct in the last word pair
  ulii = unfiltered_sample_ct & (BITCT - 1);
  if (ulii) {
    new_quaterarr--;
    if (ulii < BITCT2) {
      // the entire second word of the pair is past the end
      *new_quaterarr-- = 0;
    } else {
      ulii -= BITCT2;
    }
    *new_quaterarr &= (ONELU << (ulii * 2)) - ONELU;
  }
}
8959
void init_quaterarr_from_inverted_bitarr(const uintptr_t* __restrict inverted_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict new_quaterarr) {
  // Expands an inverted bit array into a quaterarr: 2-bit entry i of
  // new_quaterarr becomes 01 when bit i of inverted_bitarr is CLEAR, 00 when
  // set.  Writes two quaterarr words per input word, then zeroes the entries
  // past unfiltered_sample_ct in the final word pair.
  // allows unfiltered_sample_ct == 0
  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  uintptr_t ulmm;
  uint32_t bit_idx;
  while (unfiltered_sample_ctl) {
    // set bits here mark the entries to clear from the all-01 start
    ulii = *inverted_bitarr++;
    ulkk = FIVEMASK;
    ulmm = FIVEMASK;
    if (ulii) {
      // uljj covers the high halfword (second output word), ulii the low
      uljj = ulii >> BITCT2;
#ifdef __LP64__
      ulii &= 0xffffffffLLU;
#else
      ulii &= 0xffffLU;
#endif
      if (ulii) {
	do {
	  bit_idx = CTZLU(ulii);
	  ulkk &= ~(ONELU << (bit_idx * 2));
	  ulii &= ulii - 1;
	} while (ulii);
      }
      if (uljj) {
	do {
	  bit_idx = CTZLU(uljj);
	  ulmm &= ~(ONELU << (bit_idx * 2));
	  uljj &= uljj - 1;
	} while (uljj);
      }
    }
    *new_quaterarr++ = ulkk;
    *new_quaterarr++ = ulmm;
    --unfiltered_sample_ctl;
  }
  // zero out the entries past unfiltered_sample_ct in the last word pair
  ulii = unfiltered_sample_ct & (BITCT - 1);
  if (ulii) {
    new_quaterarr--;
    if (ulii < BITCT2) {
      // the entire second word of the pair is past the end
      *new_quaterarr-- = 0;
    } else {
      ulii -= BITCT2;
    }
    *new_quaterarr &= (ONELU << (ulii * 2)) - ONELU;
  }
}
9009
void quatervec_01_init_invert(const uintptr_t* __restrict source_quatervec, uintptr_t entry_ct, uintptr_t* __restrict target_quatervec) {
  // Initializes a quatervec as the inverse of another: each target entry is
  // (~source) & 01, so for a standard 00/01 source vector this is the
  // entrywise logical NOT.  Entries past entry_ct in the final word pair are
  // masked to zero.
  // Some modifications needed for AVX2.
  uint32_t vec_wsize = QUATERCT_TO_ALIGNED_WORDCT(entry_ct);
  uint32_t rem = entry_ct & (BITCT - 1);
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i* tptr = (__m128i*)target_quatervec;
  __m128i* sptr = (__m128i*)source_quatervec;
  __m128i* tptr_end = (__m128i*)(&(target_quatervec[vec_wsize]));
  uintptr_t* second_to_last;
  while (tptr < tptr_end) {
    // _mm_andnot_si128(a, b) == (~a) & b
    *tptr++ = _mm_andnot_si128(*sptr++, m1);
  }
  if (rem) {
    // mask trailing entries in the final two words
    second_to_last = &(((uintptr_t*)tptr_end)[-2]);
    if (rem > BITCT2) {
      second_to_last[1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
    } else {
      *second_to_last &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
      second_to_last[1] = 0;
    }
  }
#else
  uintptr_t* tptr_end = &(target_quatervec[vec_wsize]);
  while (target_quatervec < tptr_end) {
    *target_quatervec++ = FIVEMASK & (~(*source_quatervec++));
  }
  if (rem) {
    // mask trailing entries in the final two words
    if (rem > BITCT2) {
      target_quatervec[-1] &= (~ZEROLU) >> ((BITCT - rem) * 2);
    } else {
      target_quatervec[-2] &= (~ZEROLU) >> ((BITCT2 - rem) * 2);
      target_quatervec[-1] = 0;
    }
  }

#endif
}
9049
bitvec_andnot_copy(const uintptr_t * __restrict source_vec,const uintptr_t * __restrict exclude_vec,uintptr_t word_ct,uintptr_t * __restrict target_vec)9050 void bitvec_andnot_copy(const uintptr_t* __restrict source_vec, const uintptr_t* __restrict exclude_vec, uintptr_t word_ct, uintptr_t* __restrict target_vec) {
9051 // target_vec := source_vec ANDNOT exclude_vec
9052 // may write an extra word
9053 assert(word_ct);
9054 #ifdef __LP64__
9055 __m128i* tptr = (__m128i*)target_vec;
9056 __m128i* sptr = (__m128i*)source_vec;
9057 __m128i* xptr = (__m128i*)exclude_vec;
9058 __m128i* tptr_end = (__m128i*)(&(target_vec[round_up_pow2(word_ct, VEC_WORDS)]));
9059 do {
9060 *tptr++ = _mm_andnot_si128(*xptr++, *sptr++);
9061 } while (tptr < tptr_end);
9062 #else
9063 uintptr_t* tptr_end = &(target_vec[word_ct]);
9064 do {
9065 *target_vec++ = (*source_vec++) & (~(*exclude_vec++));
9066 } while (target_vec < tptr_end);
9067 #endif
9068 }
9069
void apply_bitarr_mask_to_quaterarr_01(const uintptr_t* __restrict mask_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* main_quaterarr) {
  // Clears the low bit of each 2-bit main_quaterarr entry whose corresponding
  // mask_bitarr bit is 0 (so 01 entries become 00); entries with the mask bit
  // set are untouched.  Processes two quaterarr words per mask word.
  // allows unfiltered_sample_ct == 0
  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  uintptr_t ulmm;
  uint32_t bit_idx;
  while (unfiltered_sample_ctl) {
    // invert so set bits mark the entries to clear
    ulii = ~(*mask_bitarr++);
    ulkk = *main_quaterarr;
    ulmm = main_quaterarr[1];
    if (ulii) {
      // uljj covers the high halfword (second quaterarr word), ulii the low
      uljj = ulii >> BITCT2;
#ifdef __LP64__
      ulii &= 0xffffffffLLU;
#else
      ulii &= 0xffffLU;
#endif
      if (ulii) {
	do {
	  bit_idx = CTZLU(ulii);
	  ulkk &= ~(ONELU << (bit_idx * 2));
	  ulii &= ulii - 1;
	} while (ulii);
      }
      if (uljj) {
	do {
	  bit_idx = CTZLU(uljj);
	  ulmm &= ~(ONELU << (bit_idx * 2));
	  uljj &= uljj - 1;
	} while (uljj);
      }
    }
    *main_quaterarr++ = ulkk;
    *main_quaterarr++ = ulmm;
    --unfiltered_sample_ctl;
  }
}
9109
void apply_bitarr_excl_to_quaterarr_01(const uintptr_t* __restrict excl_bitarr, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict main_quaterarr) {
  // Clears the low bit of each 2-bit main_quaterarr entry whose corresponding
  // excl_bitarr bit is SET (so 01 entries become 00); other entries are
  // untouched.  Processes two quaterarr words per exclusion word.
  assert(unfiltered_sample_ct);
  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  uintptr_t ulmm;
  uint32_t bit_idx;
  do {
    ulii = *excl_bitarr++;
    ulkk = *main_quaterarr;
    ulmm = main_quaterarr[1];
    if (ulii) {
      // uljj covers the high halfword (second quaterarr word), ulii the low
      uljj = ulii >> BITCT2;
#ifdef __LP64__
      ulii &= 0xffffffffLLU;
#else
      ulii &= 0xffffLU;
#endif
      if (ulii) {
	do {
	  bit_idx = CTZLU(ulii);
	  ulkk &= ~(ONELU << (bit_idx * 2));
	  ulii &= ulii - 1;
	} while (ulii);
      }
      if (uljj) {
	do {
	  bit_idx = CTZLU(uljj);
	  ulmm &= ~(ONELU << (bit_idx * 2));
	  uljj &= uljj - 1;
	} while (uljj);
      }
    }
    *main_quaterarr++ = ulkk;
    *main_quaterarr++ = ulmm;
  } while (--unfiltered_sample_ctl);
}
9148
void apply_excl_intersect_to_quaterarr_01(const uintptr_t* __restrict excl_bitarr_1, const uintptr_t* __restrict excl_bitarr_2, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict main_quaterarr) {
  // Clears the low bit of each 2-bit main_quaterarr entry whose bit is set in
  // BOTH exclusion arrays (so 01 entries become 00); other entries are
  // untouched.  Processes two quaterarr words per exclusion word.
  assert(unfiltered_sample_ct);
  uint32_t unfiltered_sample_ctl = BITCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t ulii;
  uintptr_t uljj;
  uintptr_t ulkk;
  uintptr_t ulmm;
  uint32_t bit_idx;
  do {
    // intersection of the two exclusion masks
    ulii = (*excl_bitarr_1++) & (*excl_bitarr_2++);
    ulkk = *main_quaterarr;
    ulmm = main_quaterarr[1];
    if (ulii) {
      // uljj covers the high halfword (second quaterarr word), ulii the low
      uljj = ulii >> BITCT2;
#ifdef __LP64__
      ulii &= 0xffffffffLLU;
#else
      ulii &= 0xffffLU;
#endif
      if (ulii) {
	do {
	  bit_idx = CTZLU(ulii);
	  ulkk &= ~(ONELU << (bit_idx * 2));
	  ulii &= ulii - 1;
	} while (ulii);
      }
      if (uljj) {
	do {
	  bit_idx = CTZLU(uljj);
	  ulmm &= ~(ONELU << (bit_idx * 2));
	  uljj &= uljj - 1;
	} while (uljj);
      }
    }
    *main_quaterarr++ = ulkk;
    *main_quaterarr++ = ulmm;
  } while (--unfiltered_sample_ctl);
}
9187
void quatervec_copy_only_01(const uintptr_t* __restrict input_quatervec, uintptr_t unfiltered_sample_ct, uintptr_t* __restrict output_quatervec) {
  // Sets each 2-bit entry of output_quatervec to 01 iff the corresponding
  // input_quatervec entry is exactly 01 (low bit set, high bit clear), and to
  // 00 otherwise.
  assert(unfiltered_sample_ct);
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i* vec2_read = (__m128i*)input_quatervec;
  __m128i* read_end = &(vec2_read[QUATERCT_TO_VECCT(unfiltered_sample_ct)]);
  __m128i* vec2_write = (__m128i*)output_quatervec;
  __m128i loader;
  do {
    loader = *vec2_read++;
    // loader & ~(loader >> 1) & 01...01: keeps only "low bit set, high bit
    // clear" entries
    *vec2_write++ = _mm_and_si128(_mm_andnot_si128(_mm_srli_epi64(loader, 1), loader), m1);
  } while (vec2_read < read_end);
#else
  const uintptr_t* read_end = &(input_quatervec[QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct)]);
  uintptr_t loader;
  do {
    loader = *input_quatervec++;
    *output_quatervec++ = loader & (~(loader >> 1)) & FIVEMASK;
  } while (input_quatervec < read_end);
#endif
}
9210
void quatervec_01_invert(uintptr_t unfiltered_sample_ct, uintptr_t* main_quatervec) {
  // Flips the low bit of every 2-bit entry (XOR with 01) for the first
  // unfiltered_sample_ct entries of main_quatervec, in place.  Entries past
  // that point are left untouched.
  uintptr_t* vec2_last = &(main_quatervec[unfiltered_sample_ct / BITCT2]);
  uint32_t remainder = unfiltered_sample_ct & (BITCT2 - 1);
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  __m128i* vec2_128 = (__m128i*)main_quatervec;
  __m128i* vec2_last128 = &(vec2_128[unfiltered_sample_ct / BITCT]);
  // full 128-bit blocks
  while (vec2_128 < vec2_last128) {
    *vec2_128 = _mm_xor_si128(*vec2_128, m1);
    vec2_128++;
  }
  main_quatervec = (uintptr_t*)vec2_128;
  // odd full word, if any
  if (main_quatervec != vec2_last) {
    *main_quatervec = (*main_quatervec) ^ FIVEMASK;
    main_quatervec++;
  }
#else
  while (main_quatervec != vec2_last) {
    *main_quatervec = (*main_quatervec) ^ FIVEMASK;
    main_quatervec++;
  }
#endif
  if (remainder) {
    // partial final word: flip only the low `remainder` entries
    *vec2_last = *vec2_last ^ (FIVEMASK >> (2 * (BITCT2 - remainder)));
  }
}
9237
void vec_datamask(uintptr_t unfiltered_sample_ct, uint32_t matchval, uintptr_t* data_ptr, uintptr_t* mask_ptr, uintptr_t* result_ptr) {
  // mask_ptr assumed to be a standard 00/01 bit vector.
  // sets each result_ptr 2-bit entry to 01 iff the data_ptr entry equals
  // matchval and the mask entry is set, 00 otherwise.
  // currently assumes matchval is not 1.
  assert(unfiltered_sample_ct);
#ifdef __LP64__
  __m128i* data_read = (__m128i*)data_ptr;
  __m128i* mask_read = (__m128i*)mask_ptr;
  __m128i* data_read_end = &(data_read[QUATERCT_TO_VECCT(unfiltered_sample_ct)]);
  __m128i* writer = (__m128i*)result_ptr;
  __m128i loader;
#else
  uintptr_t* data_read_end = &(data_ptr[QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct)]);
  uintptr_t loader;
#endif
  if (matchval) {
    if (matchval == 2) {
      // entry == 2 (binary 10): low bit clear, high bit set
#ifdef __LP64__
      do {
	loader = *data_read++;
	*writer++ = _mm_and_si128(_mm_andnot_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
      } while (data_read < data_read_end);
#else
      do {
	loader = *data_ptr++;
	*result_ptr++ = (~loader) & (loader >> 1) & (*mask_ptr++);
      } while (data_ptr < data_read_end);
#endif
    } else {
      // entry == 3 (binary 11): both bits set
#ifdef __LP64__
      do {
	loader = *data_read++;
	*writer++ = _mm_and_si128(_mm_and_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
      } while (data_read < data_read_end);
#else
      do {
	loader = *data_ptr++;
	*result_ptr++ = loader & (loader >> 1) & (*mask_ptr++);
      } while (data_ptr < data_read_end);
#endif
    }
  } else {
    // entry == 0 (binary 00): neither bit set
#ifdef __LP64__
    do {
      loader = *data_read++;
      *writer++ = _mm_andnot_si128(_mm_or_si128(loader, _mm_srli_epi64(loader, 1)), *mask_read++);
    } while (data_read < data_read_end);
#else
    do {
      loader = *data_ptr++;
      *result_ptr++ = (~(loader | (loader >> 1))) & (*mask_ptr++);
    } while (data_ptr < data_read_end);
#endif
  }
}
9294
9295 /*
9296 void vec_rotate_plink1_to_plink2(uintptr_t* lptr, uint32_t word_ct) {
9297 #ifdef __LP64__
9298 const __m128i m1 = {FIVEMASK, FIVEMASK};
9299 __m128i* vptr = (__m128i*)lptr;
9300 __m128i* vend = (__m128i*)(&(lptr[word_ct]));
9301 __m128i vii;
9302 __m128i vjj;
9303 do {
9304 // new high bit set iff old low bit was set
9305 // new low bit set iff old bits differed
9306 vii = *vptr;
9307 vjj = _mm_and_si128(vii, m1); // old low bit
9308 vii = _mm_and_si128(_mm_srli_epi64(vii, 1), m1); // old high bit, shifted
9309 *vptr = _mm_or_si128(_mm_slli_epi64(vjj, 1), _mm_xor_si128(vii, vjj));
9310 } while (++vptr != vend);
9311 #else
9312 uintptr_t* lend = &(lptr[word_ct]);
9313 uintptr_t ulii;
9314 uintptr_t uljj;
9315 do {
9316 ulii = *lptr;
9317 uljj = ulii & FIVEMASK;
9318 ulii = (ulii >> 1) & FIVEMASK;
9319 *lptr = ulii ^ (uljj * 3);
9320 } while (++lptr != lend);
9321 #endif
9322 }
9323 */
9324
9325 // this was "rotate_plink1_to_plink2_...", until I noticed that the plink2
9326 // format should store alt allele counts instead of ref allele counts.
void rotate_plink1_to_a2ct_and_copy(uintptr_t* loadbuf, uintptr_t* writebuf, uintptr_t word_ct) {
  // Converts a row of 2-bit PLINK 1 genotype codes into A2-allele-count
  // encoding, writing the result to writebuf.  Per 2-bit group the mapping
  // is: 00->00, 01->11, 10->01, 11->10.  word_ct must be positive.
  const uintptr_t low_bits = (~(uintptr_t)0) / 3;  // 0x5555...55 (== FIVEMASK)
  uintptr_t widx;
  for (widx = 0; widx < word_ct; widx++) {
    const uintptr_t cur_word = loadbuf[widx];
    const uintptr_t low_set = cur_word & low_bits;
    const uintptr_t high_shifted = (cur_word >> 1) & low_bits;
    writebuf[widx] = high_shifted ^ (low_set * 3);
  }
}
9339
void extract_collapsed_missing_bitfield(uintptr_t* lptr, uintptr_t unfiltered_sample_ct, uintptr_t* sample_include_quaterarr, uintptr_t sample_ct, uintptr_t* missing_bitfield) {
  // Builds a 1-bit-per-sample bitfield from a 2-bit-per-sample genotype row:
  // a bit is set for every included sample whose 2-bit code is 01 (low bit
  // set, high bit clear -- the missing genotype in this encoding).  Output
  // bits are indexed by collapsed (post-filter) sample position.
  uint32_t word_ct = QUATERCT_TO_WORDCT(unfiltered_sample_ct);
  uintptr_t sample_idx;
  uintptr_t cur_word;
  uintptr_t cur_mask;
  uintptr_t cur_write;
  uint32_t woffset;
  uint32_t widx;
  uint32_t uii;
  if (unfiltered_sample_ct == sample_ct) {
    // Fast path: no filtering, so raw and collapsed indices coincide.  Each
    // source word (BITCT2 samples) yields half an output word; woffset
    // alternates between 0 and BITCT2 to pack two halves into cur_write
    // before flushing.
    cur_write = 0;
    woffset = 0;
    for (widx = 0; widx < word_ct; widx++) {
      cur_word = *lptr++;
      // isolate groups equal to 01, restricted to included samples
      cur_word = cur_word & ((~cur_word) >> 1) & (*sample_include_quaterarr++);
      while (cur_word) {
        uii = CTZLU(cur_word) / 2; // 2 bits per sample -> within-word index
        cur_write |= ONELU << (woffset + uii);
        cur_word &= cur_word - 1; // clear lowest set bit
      }
      if (woffset) {
        *missing_bitfield++ = cur_write;
        cur_write = 0;
        woffset = 0;
      } else {
        woffset = BITCT2;
      }
    }
    if (woffset) {
      // flush trailing half-word
      *missing_bitfield++ = cur_write;
    }
  } else {
    // General path: map raw sample indices to collapsed indices on the fly.
    fill_ulong_zero(BITCT_TO_WORDCT(sample_ct), missing_bitfield);
    sample_idx = 0;
    for (widx = 0; sample_idx < sample_ct; widx++, lptr++) {
      cur_mask = *sample_include_quaterarr++;
      if (cur_mask) {
        cur_word = *lptr;
        cur_word = cur_word & ((~cur_word) >> 1) & cur_mask;
        if (cur_mask == FIVEMASK) {
          // every sample in this word is included, so collapsed indices
          // advance uniformly and bits can be set directly
          if (cur_word) {
            uii = sample_idx;
            do {
              set_bit((CTZLU(cur_word) / 2) + uii, missing_bitfield);
              cur_word &= cur_word - 1;
            } while (cur_word);
          }
          sample_idx += BITCT2;
        } else {
          if (cur_word) {
            // mixed word: advance one included sample at a time
            do {
              uii = CTZLU(cur_mask);
              if ((cur_word >> uii) & 1) {
                set_bit_ul(sample_idx, missing_bitfield);
              }
              sample_idx++;
              cur_mask &= cur_mask - 1;
            } while (cur_mask);
          } else {
            // no missing genotypes here; just count the included samples
            sample_idx += popcount2_long(cur_mask);
          }
        }
      }
    }
  }
}
9406
void hh_reset(unsigned char* loadbuf, uintptr_t* sample_include_quaterarr, uintptr_t unfiltered_sample_ct) {
  // For every sample flagged in sample_include_quaterarr, converts 2-bit
  // genotype code 10 to 01 by subtracting 1 from the group; all other codes
  // are untouched.  ((g >> 1) & ~g) is 1 exactly at the low bit of each 10
  // group, so the masked subtraction performs the conversion.
  // (Used to clear impossible heterozygous calls on haploid chromosomes.)
  // The buffer is processed 16 bytes (SSE2) or 4 bytes at a time when its
  // alignment allows; the trailing byte loop finishes whatever remains.
  uintptr_t sample_bidx = 0;
  unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
  unsigned char* iicp;
  unsigned char ucc;
  unsigned char ucc2;
  uintptr_t unfiltered_sample_ctd;
  uint32_t* loadbuf_alias32;
  uint32_t uii;
  uint32_t ujj;
#ifdef __LP64__
  uint32_t* sample_include_quaterarr_alias32;
  __m128i* loadbuf_alias;
  __m128i* iivp;
  __m128i vii;
  __m128i vjj;
  if (!(((uintptr_t)loadbuf) & 15)) {
    // 16-byte-aligned buffer: 64 samples (128 bits) per iteration
    loadbuf_alias = (__m128i*)loadbuf;
    iivp = (__m128i*)sample_include_quaterarr;
    unfiltered_sample_ctd = unfiltered_sample_ct / 64;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      vii = *loadbuf_alias;
      // (~g) & (g >> 1), masked by the sample filter
      vjj = _mm_and_si128(_mm_andnot_si128(vii, _mm_srli_epi64(vii, 1)), *iivp++);
      *loadbuf_alias++ = _mm_sub_epi64(vii, vjj);
    }
    loadbuf = (unsigned char*)loadbuf_alias;
    iicp = (unsigned char*)iivp;
  } else if (!(((uintptr_t)loadbuf) & 3)) {
    // 4-byte-aligned fallback
    loadbuf_alias32 = (uint32_t*)loadbuf;
    sample_include_quaterarr_alias32 = (uint32_t*)sample_include_quaterarr;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      ujj = ((uii >> 1) & (~uii)) & (*sample_include_quaterarr_alias32++);
      *loadbuf_alias32++ = uii - ujj;
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
    iicp = (unsigned char*)sample_include_quaterarr_alias32;
  } else {
    iicp = (unsigned char*)sample_include_quaterarr;
  }
#else
  if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      ujj = ((uii >> 1) & (~uii)) & (*sample_include_quaterarr++);
      *loadbuf_alias32++ = uii - ujj;
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
  }
  iicp = (unsigned char*)sample_include_quaterarr;
#endif
  // scalar cleanup: loadbuf and iicp were advanced in lockstep above
  for (; loadbuf < loadbuf_end;) {
    ucc = *loadbuf;
    ucc2 = ((ucc >> 1) & (~ucc)) & (*iicp++);
    *loadbuf++ = ucc - ucc2;
  }
}
9467
void hh_reset_y(unsigned char* loadbuf, uintptr_t* sample_include_quaterarr, uintptr_t* sample_male_include_quaterarr, uintptr_t unfiltered_sample_ct) {
  // chrY cleanup.  For each 2-bit genotype group:
  // * included but non-male samples are forced to code 01;
  // * male samples have code 10 converted to 01, everything else kept;
  // * samples excluded from both masks are zeroed out of the result.
  // Same alignment-dispatched structure as hh_reset(): SSE2 path for
  // 16-byte-aligned buffers, 32-bit path for 4-byte alignment, byte loop
  // for the remainder.
  uintptr_t sample_bidx = 0;
  unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
  unsigned char* iicp;
  unsigned char* imicp;
  unsigned char ucc;
  unsigned char ucc2;
  unsigned char ucc3;
  uintptr_t unfiltered_sample_ctd;
  uint32_t* loadbuf_alias32;
  uint32_t uii;
  uint32_t ujj;
  uint32_t ukk;
#ifdef __LP64__
  const __m128i m1 = {FIVEMASK, FIVEMASK};
  uint32_t* sample_include_quaterarr_alias32;
  uint32_t* sample_male_include_quaterarr_alias32;
  __m128i* loadbuf_alias;
  __m128i* iivp;
  __m128i* imivp;
  __m128i vii;
  __m128i vjj;
  __m128i vkk;
  if (!(((uintptr_t)loadbuf) & 15)) {
    loadbuf_alias = (__m128i*)loadbuf;
    iivp = (__m128i*)sample_include_quaterarr;
    imivp = (__m128i*)sample_male_include_quaterarr;
    unfiltered_sample_ctd = unfiltered_sample_ct / 64;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      // sample_include_quaterarr & ~sample_male_include_quaterarr: force to 01
      // sample_male_include_quaterarr: convert 10 to 01, keep everything else
      vii = *imivp++;
      vjj = *iivp++;
      // vkk: genotypes restricted to male samples (mask expanded to both bits)
      vkk = _mm_and_si128(*loadbuf_alias, _mm_or_si128(vii, _mm_slli_epi64(vii, 1)));
      // OR the forced-01 nonmales with the het-cleared male genotypes
      *loadbuf_alias++ = _mm_or_si128(_mm_andnot_si128(vii, vjj), _mm_sub_epi64(vkk, _mm_and_si128(_mm_andnot_si128(vkk, _mm_srli_epi64(vkk, 1)), m1)));
    }
    loadbuf = (unsigned char*)loadbuf_alias;
    iicp = (unsigned char*)iivp;
    imicp = (unsigned char*)imivp;
  } else if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    sample_include_quaterarr_alias32 = (uint32_t*)sample_include_quaterarr;
    sample_male_include_quaterarr_alias32 = (uint32_t*)sample_male_include_quaterarr;
    unfiltered_sample_ctd = unfiltered_sample_ct / 16;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *sample_male_include_quaterarr_alias32++;
      ujj = *sample_include_quaterarr_alias32++;
      ukk = (*loadbuf_alias32) & (uii * 3); // uii * 3 expands 01 mask to 11
      *loadbuf_alias32++ = ((~uii) & ujj) | (ukk - ((~ukk) & (ukk >> 1) & 0x55555555));
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
    iicp = (unsigned char*)sample_include_quaterarr_alias32;
    imicp = (unsigned char*)sample_male_include_quaterarr_alias32;
  } else {
    iicp = (unsigned char*)sample_include_quaterarr;
    imicp = (unsigned char*)sample_male_include_quaterarr;
  }
#else
  if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / 16;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *sample_male_include_quaterarr++;
      ujj = *sample_include_quaterarr++;
      ukk = (*loadbuf_alias32) & (uii * 3);
      *loadbuf_alias32++ = ((~uii) & ujj) | (ukk - ((~ukk) & (ukk >> 1) & 0x55555555));
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
  }
  iicp = (unsigned char*)sample_include_quaterarr;
  imicp = (unsigned char*)sample_male_include_quaterarr;
#endif
  // scalar cleanup; all three cursors were advanced in lockstep above
  for (; loadbuf < loadbuf_end;) {
    ucc = *imicp++;
    ucc2 = *iicp++;
    ucc3 = (*loadbuf) & (ucc * 3);
    *loadbuf++ = ((~ucc) & ucc2) | (ucc3 - ((~ucc3) & (ucc3 >> 1) & 0x55));
  }
}
9547
uint32_t alloc_raw_haploid_filters(uint32_t unfiltered_sample_ct, uint32_t hh_exists, uint32_t is_include, uintptr_t* sample_bitarr, uintptr_t* sex_male, uintptr_t** sample_raw_include_quatervec_ptr, uintptr_t** sample_raw_male_include_quatervec_ptr) {
  // Allocates (on the bigstack) and initializes the 2-bit-per-sample filter
  // vectors consumed by hh_reset()/hh_reset_y(), depending on which fixups
  // hh_exists requests:
  // * *sample_raw_include_quatervec_ptr: all filtered-in samples.
  //   sample_bitarr is interpreted as an include-mask when is_include is
  //   nonzero, and as an exclude-mask otherwise.
  // * *sample_raw_male_include_quatervec_ptr: same, further masked by
  //   sex_male.
  // Returns 1 on out-of-memory, 0 on success.
  uintptr_t unfiltered_sample_ctv2 = QUATERCT_TO_ALIGNED_WORDCT(unfiltered_sample_ct);
  uintptr_t* sample_raw_male_include_quatervec;
  if (hh_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
    if (bigstack_alloc_ul(unfiltered_sample_ctv2, sample_raw_include_quatervec_ptr)) {
      return 1;
    }
    if (is_include) {
      init_quaterarr_from_bitarr(sample_bitarr, unfiltered_sample_ct, *sample_raw_include_quatervec_ptr);
    } else {
      init_quaterarr_from_inverted_bitarr(sample_bitarr, unfiltered_sample_ct, *sample_raw_include_quatervec_ptr);
    }
  }
  if (hh_exists & (XMHH_EXISTS | Y_FIX_NEEDED)) {
    if (bigstack_alloc_ul(unfiltered_sample_ctv2, sample_raw_male_include_quatervec_ptr)) {
      return 1;
    }
    sample_raw_male_include_quatervec = *sample_raw_male_include_quatervec_ptr;
    if (hh_exists & (Y_FIX_NEEDED | NXMHH_EXISTS)) {
      // the all-sample vector was just built above; reuse it as the base
      memcpy(sample_raw_male_include_quatervec, *sample_raw_include_quatervec_ptr, unfiltered_sample_ctv2 * sizeof(intptr_t));
    } else {
      if (is_include) {
        init_quaterarr_from_bitarr(sample_bitarr, unfiltered_sample_ct, sample_raw_male_include_quatervec);
      } else {
        init_quaterarr_from_inverted_bitarr(sample_bitarr, unfiltered_sample_ct, sample_raw_male_include_quatervec);
      }
    }
    // restrict to males
    apply_bitarr_mask_to_quaterarr_01(sex_male, unfiltered_sample_ct, sample_raw_male_include_quatervec);
  }
  return 0;
}
9579
void haploid_fix_multiple(uintptr_t* marker_exclude, uintptr_t marker_uidx_start, uintptr_t marker_ct, Chrom_info* chrom_info_ptr, uint32_t hh_exists, uint32_t set_hh_missing, uint32_t set_mixed_mt_missing, uintptr_t* sample_raw_include2, uintptr_t* sample_raw_male_include2, uintptr_t unfiltered_sample_ct, uintptr_t byte_ct_per_marker, unsigned char* loadbuf) {
  // Applies haploid/MT genotype cleanup to a batch of marker_ct consecutive
  // post-filter variants in loadbuf (byte_ct_per_marker bytes per variant),
  // one chromosome segment at a time:
  // * chrX + set_hh_missing: hh_reset() restricted to males.
  // * chrY + set_hh_missing: hh_reset_y() (male 10-codes cleared to 01,
  //   included nonmales forced to 01).
  // * other haploid chromosomes + set_hh_missing: hh_reset() on all
  //   included samples.
  // * chrMT + set_mixed_mt_missing: hh_reset() on all included samples.
  uintptr_t marker_idx = 0;
  uintptr_t marker_uidx = next_unset_ul_unsafe(marker_exclude, marker_uidx_start);
  uint32_t chrom_fo_idx = get_variant_chrom_fo_idx(chrom_info_ptr, marker_uidx);
  uint32_t chrom_idx;
  uint32_t is_x;
  uint32_t is_y;
  uint32_t is_mt;
  uint32_t is_haploid;
  uintptr_t chrom_end;
  uintptr_t marker_idx_chrom_end;

  while (marker_idx < marker_ct) {
    chrom_idx = chrom_info_ptr->chrom_file_order[chrom_fo_idx];
    chrom_end = chrom_info_ptr->chrom_fo_vidx_start[chrom_fo_idx + 1];
    is_x = (chrom_info_ptr->xymt_codes[X_OFFSET] == (int32_t)chrom_idx);
    is_y = (chrom_info_ptr->xymt_codes[Y_OFFSET] == (int32_t)chrom_idx);
    is_mt = (chrom_info_ptr->xymt_codes[MT_OFFSET] == (int32_t)chrom_idx);
    is_haploid = IS_SET(chrom_info_ptr->haploid_mask, chrom_idx);
    // filtered index just past this chromosome's last marker in the batch
    marker_idx_chrom_end = marker_idx + chrom_end - marker_uidx - popcount_bit_idx(marker_exclude, marker_uidx, chrom_end);
    if (marker_idx_chrom_end > marker_ct) {
      marker_idx_chrom_end = marker_ct;
    }
    if (is_haploid && set_hh_missing) {
      if (is_x) {
        if (hh_exists & XMHH_EXISTS) {
          for (; marker_idx < marker_idx_chrom_end; marker_idx++) {
            hh_reset(&(loadbuf[marker_idx * byte_ct_per_marker]), sample_raw_male_include2, unfiltered_sample_ct);
          }
        }
      } else if (is_y) {
        if (hh_exists & Y_FIX_NEEDED) {
          for (; marker_idx < marker_idx_chrom_end; marker_idx++) {
            hh_reset_y(&(loadbuf[marker_idx * byte_ct_per_marker]), sample_raw_include2, sample_raw_male_include2, unfiltered_sample_ct);
          }
        }
      } else if (hh_exists & NXMHH_EXISTS) {
        for (; marker_idx < marker_idx_chrom_end; marker_idx++) {
          hh_reset(&(loadbuf[marker_idx * byte_ct_per_marker]), sample_raw_include2, unfiltered_sample_ct);
        }
      }
    } else if (is_mt && set_mixed_mt_missing) {
      for (; marker_idx < marker_idx_chrom_end; marker_idx++) {
        hh_reset(&(loadbuf[marker_idx * byte_ct_per_marker]), sample_raw_include2, unfiltered_sample_ct);
      }
    }
    marker_idx = marker_idx_chrom_end;
    // Advance the raw-index cursor to the chromosome boundary.  Without
    // this, the marker_idx_chrom_end computation above re-counts every
    // earlier chromosome's filtered markers on each iteration, so a batch
    // spanning three or more chromosomes would apply the wrong fixup
    // ranges past the second chromosome.
    marker_uidx = chrom_end;
    chrom_fo_idx++;
  }
}
9630
void force_missing(unsigned char* loadbuf, uintptr_t* force_missing_include2, uintptr_t unfiltered_sample_ct) {
  // Overwrites the genotype of every sample flagged in force_missing_include2
  // with the 2-bit code 01: OR-ing in the 01-per-group mask sets the low
  // bit, and AND-ing with ~(mask << 1) clears the high bit.  Unflagged
  // samples are untouched.
  // Same alignment-dispatched loop structure as hh_reset().
  uintptr_t sample_bidx = 0;
  unsigned char* loadbuf_end = &(loadbuf[(unfiltered_sample_ct + 3) / 4]);
  unsigned char* fmicp;
  unsigned char ucc;
  unsigned char ucc2;
  uintptr_t unfiltered_sample_ctd;
  uint32_t* loadbuf_alias32;
  uint32_t uii;
  uint32_t ujj;
#ifdef __LP64__
  uint32_t* force_missing_include2_alias32;
  __m128i* loadbuf_alias;
  __m128i* fmivp;
  __m128i vii;
  __m128i vjj;
  if (!(((uintptr_t)loadbuf) & 15)) {
    // 16-byte-aligned buffer: 64 samples per iteration
    loadbuf_alias = (__m128i*)loadbuf;
    fmivp = (__m128i*)force_missing_include2;
    unfiltered_sample_ctd = unfiltered_sample_ct / 64;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      vii = *loadbuf_alias;
      vjj = *fmivp++;
      vii = _mm_or_si128(vii, vjj); // set low bit of flagged groups
      vjj = _mm_slli_epi64(vjj, 1);
      *loadbuf_alias++ = _mm_andnot_si128(vjj, vii); // clear high bit
    }
    loadbuf = (unsigned char*)loadbuf_alias;
    fmicp = (unsigned char*)fmivp;
  } else if (!(((uintptr_t)loadbuf) & 3)) {
    // 4-byte-aligned fallback
    loadbuf_alias32 = (uint32_t*)loadbuf;
    force_missing_include2_alias32 = (uint32_t*)force_missing_include2;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      ujj = *force_missing_include2_alias32++;
      uii |= ujj;
      ujj <<= 1;
      *loadbuf_alias32++ = uii & (~ujj);
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
    fmicp = (unsigned char*)force_missing_include2_alias32;
  } else {
    fmicp = (unsigned char*)force_missing_include2;
  }
#else
  if (!(((uintptr_t)loadbuf) & 3)) {
    loadbuf_alias32 = (uint32_t*)loadbuf;
    unfiltered_sample_ctd = unfiltered_sample_ct / BITCT2;
    for (; sample_bidx < unfiltered_sample_ctd; sample_bidx++) {
      uii = *loadbuf_alias32;
      ujj = *force_missing_include2++;
      uii |= ujj;
      ujj <<= 1;
      *loadbuf_alias32++ = uii & (~ujj);
    }
    loadbuf = (unsigned char*)loadbuf_alias32;
  }
  fmicp = (unsigned char*)force_missing_include2;
#endif
  // scalar cleanup; loadbuf and fmicp advanced in lockstep above
  for (; loadbuf < loadbuf_end;) {
    ucc = *loadbuf;
    ucc2 = *fmicp++;
    ucc |= ucc2;
    ucc2 <<= 1;
    *loadbuf++ = ucc & (~ucc2);
  }
}
9699
open_and_size_string_list(char * fname,FILE ** infile_ptr,uintptr_t * list_len_ptr,uintptr_t * max_str_len_ptr)9700 int32_t open_and_size_string_list(char* fname, FILE** infile_ptr, uintptr_t* list_len_ptr, uintptr_t* max_str_len_ptr) {
9701 // assumes file is not open yet, and g_textbuf is safe to clobber
9702 uint32_t max_len = 0;
9703 uintptr_t line_idx = 0;
9704 uintptr_t list_len = 0;
9705 int32_t retval = 0;
9706 char* bufptr;
9707 uint32_t cur_len;
9708 if (fopen_checked(fname, "r", infile_ptr)) {
9709 goto open_and_size_string_list_ret_OPEN_FAIL;
9710 }
9711 g_textbuf[MAXLINELEN - 1] = ' ';
9712 while (fgets(g_textbuf, MAXLINELEN, *infile_ptr)) {
9713 line_idx++;
9714 if (!g_textbuf[MAXLINELEN - 1]) {
9715 LOGERRPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
9716 goto open_and_size_string_list_ret_INVALID_FORMAT;
9717 }
9718 bufptr = skip_initial_spaces(g_textbuf);
9719 if (is_eoln_kns(*bufptr)) {
9720 continue;
9721 }
9722 // don't complain about more than one entry on a line for now
9723 list_len++;
9724 cur_len = strlen_se(bufptr);
9725 if (cur_len >= max_len) {
9726 max_len = cur_len + 1;
9727 }
9728 }
9729 if (!feof(*infile_ptr)) {
9730 goto open_and_size_string_list_ret_READ_FAIL;
9731 }
9732 *list_len_ptr = list_len;
9733 *max_str_len_ptr = max_len;
9734 while (0) {
9735 open_and_size_string_list_ret_OPEN_FAIL:
9736 retval = RET_OPEN_FAIL;
9737 break;
9738 open_and_size_string_list_ret_READ_FAIL:
9739 retval = RET_READ_FAIL;
9740 break;
9741 open_and_size_string_list_ret_INVALID_FORMAT:
9742 retval = RET_INVALID_FORMAT;
9743 break;
9744 }
9745 return retval;
9746 }
9747
load_string_list(FILE ** infile_ptr,uintptr_t max_str_len,char * str_list)9748 int32_t load_string_list(FILE** infile_ptr, uintptr_t max_str_len, char* str_list) {
9749 // assumes file is open (probably by open_and_size_string_list), and
9750 // g_textbuf is safe to clobber
9751 int32_t retval = 0;
9752 char* bufptr;
9753 uint32_t cur_len;
9754 rewind(*infile_ptr);
9755 while (fgets(g_textbuf, MAXLINELEN, *infile_ptr)) {
9756 bufptr = skip_initial_spaces(g_textbuf);
9757 if (is_eoln_kns(*bufptr)) {
9758 continue;
9759 }
9760 cur_len = strlen_se(bufptr);
9761 memcpy(str_list, bufptr, cur_len);
9762 str_list[cur_len] = '\0';
9763 str_list = &(str_list[max_str_len]);
9764 }
9765 if (!feof(*infile_ptr)) {
9766 goto load_string_list_ret_READ_FAIL;
9767 }
9768 while (0) {
9769 load_string_list_ret_READ_FAIL:
9770 retval = RET_READ_FAIL;
9771 break;
9772 }
9773 return retval;
9774 }
9775
open_and_skip_first_lines(FILE ** infile_ptr,char * fname,char * loadbuf,uintptr_t loadbuf_size,uint32_t lines_to_skip)9776 int32_t open_and_skip_first_lines(FILE** infile_ptr, char* fname, char* loadbuf, uintptr_t loadbuf_size, uint32_t lines_to_skip) {
9777 uint32_t line_idx;
9778 loadbuf[loadbuf_size - 1] = ' ';
9779 if (fopen_checked(fname, "r", infile_ptr)) {
9780 return RET_OPEN_FAIL;
9781 }
9782 for (line_idx = 1; line_idx <= lines_to_skip; line_idx++) {
9783 if (!fgets(loadbuf, loadbuf_size, *infile_ptr)) {
9784 if (feof(*infile_ptr)) {
9785 LOGERRPRINTFWW("Error: Fewer lines than expected in %s.\n", fname);
9786 return RET_INVALID_FORMAT;
9787 } else {
9788 return RET_READ_FAIL;
9789 }
9790 }
9791 if (!(loadbuf[loadbuf_size - 1])) {
9792 if ((loadbuf_size == MAXLINELEN) || (loadbuf_size == MAXLINEBUFLEN)) {
9793 LOGERRPRINTFWW("Error: Line %u of %s is pathologically long.\n", line_idx, fname);
9794 return RET_INVALID_FORMAT;
9795 } else {
9796 return RET_NOMEM;
9797 }
9798 }
9799 }
9800 return 0;
9801 }
9802
load_to_first_token(FILE * infile,uintptr_t loadbuf_size,char comment_char,const char * file_descrip,char * loadbuf,char ** bufptr_ptr,uintptr_t * line_idx_ptr)9803 int32_t load_to_first_token(FILE* infile, uintptr_t loadbuf_size, char comment_char, const char* file_descrip, char* loadbuf, char** bufptr_ptr, uintptr_t* line_idx_ptr) {
9804 uintptr_t line_idx = 0;
9805 while (fgets(loadbuf, loadbuf_size, infile)) {
9806 line_idx++;
9807 if (!(loadbuf[loadbuf_size - 1])) {
9808 // PLINK 1.9 has two text line loading modes: "regular" and "long".
9809 // * "Regular" mode limits lines to about MAXLINELEN (about 128k as of
9810 // this writing) characters.
9811 // * "Long" mode theoretically accepts lines up to about MAXLINEBUFLEN
9812 // (~2 GB) characters but degrades gracefully if less memory is
9813 // available (in that case, an out-of-memory instead of an
9814 // invalid-format error is reported on fgets overflow). Any long
9815 // buffer size larger than MAXLINELEN should work properly with
9816 // plink_common.
9817 if ((loadbuf_size == MAXLINELEN) || (loadbuf_size == MAXLINEBUFLEN)) {
9818 LOGERRPRINTF("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, file_descrip);
9819 return RET_INVALID_FORMAT;
9820 } else {
9821 return RET_NOMEM;
9822 }
9823 }
9824 *bufptr_ptr = skip_initial_spaces(loadbuf);
9825 if (!is_eoln_kns(**bufptr_ptr)) {
9826 if ((**bufptr_ptr) != comment_char) {
9827 *line_idx_ptr = line_idx;
9828 return 0;
9829 }
9830 }
9831 }
9832 if (!feof(infile)) {
9833 return RET_READ_FAIL;
9834 }
9835 LOGERRPRINTF("Error: Empty %s.\n", file_descrip);
9836 return RET_INVALID_FORMAT;
9837 }
9838
int32_t open_and_load_to_first_token(FILE** infile_ptr, char* fname, uintptr_t loadbuf_size, char comment_char, const char* file_descrip, char* loadbuf, char** bufptr_ptr, uintptr_t* line_idx_ptr) {
  // Convenience wrapper: opens fname and scans to the first line containing
  // a real (nonempty, non-comment) token.  See load_to_first_token() for
  // the output-parameter contract and error codes.
  loadbuf[loadbuf_size - 1] = ' '; // overflow sentinel checked by load_to_first_token()
  if (fopen_checked(fname, "r", infile_ptr)) {
    return RET_OPEN_FAIL;
  }
  return load_to_first_token(*infile_ptr, loadbuf_size, comment_char, file_descrip, loadbuf, bufptr_ptr, line_idx_ptr);
}
9846
int32_t scan_max_strlen(char* fname, uint32_t colnum, uint32_t colnum2, uint32_t headerskip, char skipchar, uintptr_t* max_str_len_ptr, uintptr_t* max_str2_len_ptr) {
  // colnum and colnum2 are 1-based indices. If colnum2 is zero, only colnum
  // is scanned.
  // Includes terminating null in lengths.
  //
  // Scans every data line of fname (skipping headerskip leading lines,
  // blank lines, and lines starting with skipchar) and raises
  // *max_str_len_ptr / *max_str2_len_ptr to fit the longest token observed
  // in the respective columns; incoming values act as lower bounds.  The
  // bigstack is borrowed as a line buffer but not permanently allocated.
  FILE* infile = nullptr;
  uintptr_t loadbuf_size = bigstack_left();
  uintptr_t max_str_len = *max_str_len_ptr;
  uintptr_t max_str2_len = 0;
  char* loadbuf = (char*)g_bigstack_base;
  uint32_t colmin;
  uint32_t coldiff;
  char* str1_ptr;
  char* str2_ptr;
  char cc;
  uintptr_t cur_str_len;
  uintptr_t line_idx;
  int32_t retval;
  if (loadbuf_size > MAXLINEBUFLEN) {
    loadbuf_size = MAXLINEBUFLEN;
  } else if (loadbuf_size <= MAXLINELEN) {
    goto scan_max_strlen_ret_NOMEM;
  }
  retval = open_and_skip_first_lines(&infile, fname, loadbuf, loadbuf_size, headerskip);
  if (retval) {
    goto scan_max_strlen_ret_1;
  }
  if (colnum < colnum2) {
    // columns already in file order: colmin = first column of interest,
    // coldiff = gap to the second
    max_str2_len = *max_str2_len_ptr;
    colmin = colnum - 1;
    coldiff = colnum2 - colnum;
  } else if (colnum2) {
    // colnum2 comes first in the file; swap the running maxima so "str1"
    // always refers to the leftmost scanned column (swapped back on exit)
    max_str2_len = max_str_len;
    max_str_len = *max_str2_len_ptr;
    colmin = colnum2 - 1;
    coldiff = colnum - colnum2;
  } else {
    // single-column mode; force the final (colnum < colnum2) branch below
    colmin = colnum - 1;
    coldiff = 0;
    colnum2 = 0xffffffffU;
  }
  line_idx = headerskip;
  while (fgets(loadbuf, loadbuf_size, infile)) {
    line_idx++;
    if (!(loadbuf[loadbuf_size - 1])) {
      // fgets overflow: fatal at the maximum buffer size, otherwise NOMEM
      if (loadbuf_size == MAXLINEBUFLEN) {
        LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
        goto scan_max_strlen_ret_INVALID_FORMAT_2;
      } else {
        goto scan_max_strlen_ret_NOMEM;
      }
    }
    str1_ptr = skip_initial_spaces(loadbuf);
    cc = *str1_ptr;
    if (is_eoln_kns(cc) || (cc == skipchar)) {
      continue;
    }
    str1_ptr = next_token_multz(str1_ptr, colmin);
    str2_ptr = next_token_multz(str1_ptr, coldiff);
    if (no_more_tokens_kns(str2_ptr)) {
      // probably want option for letting this slide in the future
      LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, fname);
      goto scan_max_strlen_ret_INVALID_FORMAT_2;
    }
    cur_str_len = strlen_se(str1_ptr);
    if (cur_str_len >= max_str_len) {
      max_str_len = cur_str_len + 1; // +1 for terminating null
    }
    if (coldiff) {
      cur_str_len = strlen_se(str2_ptr);
      if (cur_str_len >= max_str2_len) {
        max_str2_len = cur_str_len + 1;
      }
    }
  }
  if (!feof(infile)) {
    goto scan_max_strlen_ret_READ_FAIL;
  }
  if (colnum < colnum2) {
    *max_str_len_ptr = max_str_len;
    if (coldiff) {
      *max_str2_len_ptr = max_str2_len;
    }
  } else {
    // columns were swapped above; undo the swap when writing back
    *max_str_len_ptr = max_str2_len;
    *max_str2_len_ptr = max_str_len;
  }
  while (0) {
  scan_max_strlen_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  scan_max_strlen_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  scan_max_strlen_ret_INVALID_FORMAT_2:
    logerrprintb();
    retval = RET_INVALID_FORMAT;
    break;
  }
 scan_max_strlen_ret_1:
  fclose_cond(infile);
  return retval;
}
9949
int32_t scan_max_fam_indiv_strlen(char* fname, uint32_t colnum, uintptr_t* max_sample_id_len_ptr) {
  // colnum is a 1-based index with the FID column number; IID column is
  // assumed to follow.
  // Includes terminating null in lengths.
  //
  // Raises *max_sample_id_len_ptr to fit the longest FID+IID pair in fname
  // (FID length + IID length + 2, for the separator and terminating null);
  // the incoming value acts as a lower bound.  The bigstack is borrowed as
  // a line buffer but not permanently allocated.
  FILE* infile = nullptr;
  uintptr_t loadbuf_size = bigstack_left();
  uintptr_t max_sample_id_len = *max_sample_id_len_ptr;
  uintptr_t line_idx = 0;
  char* loadbuf = (char*)g_bigstack_base;
  char* bufptr;
  char* bufptr2;
  uintptr_t cur_sample_id_len;
  int32_t retval;
  colnum--; // convert to 0-based token-skip count
  if (loadbuf_size > MAXLINEBUFLEN) {
    loadbuf_size = MAXLINEBUFLEN;
  } else if (loadbuf_size <= MAXLINELEN) {
    goto scan_max_fam_indiv_strlen_ret_NOMEM;
  }
  retval = open_and_skip_first_lines(&infile, fname, loadbuf, loadbuf_size, 0);
  if (retval) {
    goto scan_max_fam_indiv_strlen_ret_1;
  }
  while (fgets(loadbuf, loadbuf_size, infile)) {
    line_idx++;
    if (!(loadbuf[loadbuf_size - 1])) {
      // fgets overflow: fatal at the maximum buffer size, otherwise NOMEM
      if (loadbuf_size == MAXLINEBUFLEN) {
        LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s is pathologically long.\n", line_idx, fname);
        goto scan_max_fam_indiv_strlen_ret_INVALID_FORMAT_2;
      } else {
        goto scan_max_fam_indiv_strlen_ret_NOMEM;
      }
    }
    bufptr = skip_initial_spaces(loadbuf);
    if (is_eoln_kns(*bufptr)) {
      continue;
    }
    bufptr = next_token_multz(bufptr, colnum); // FID
    bufptr2 = next_token(bufptr); // IID
    if (no_more_tokens_kns(bufptr2)) {
      LOGPREPRINTFWW("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, fname);
      goto scan_max_fam_indiv_strlen_ret_INVALID_FORMAT_2;
    }
    cur_sample_id_len = strlen_se(bufptr) + strlen_se(bufptr2) + 2;
    if (cur_sample_id_len > max_sample_id_len) {
      max_sample_id_len = cur_sample_id_len;
    }
  }
  if (!feof(infile)) {
    goto scan_max_fam_indiv_strlen_ret_READ_FAIL;
  }
  *max_sample_id_len_ptr = max_sample_id_len;
  while (0) {
  scan_max_fam_indiv_strlen_ret_NOMEM:
    retval = RET_NOMEM;
    break;
  scan_max_fam_indiv_strlen_ret_READ_FAIL:
    retval = RET_READ_FAIL;
    break;
  scan_max_fam_indiv_strlen_ret_INVALID_FORMAT_2:
    logerrprintb();
    retval = RET_INVALID_FORMAT;
    break;
  }
 scan_max_fam_indiv_strlen_ret_1:
  fclose_cond(infile);
  return retval;
}
10018
10019 /*
10020 void inplace_collapse_uint32(uint32_t* item_arr, uint32_t unfiltered_ct, uintptr_t* exclude_arr, uint32_t filtered_ct) {
10021 if (unfiltered_ct == filtered_ct) {
10022 return;
10023 }
10024 uint32_t item_uidx = next_set_unsafe(exclude_arr, 0);
10025 uint32_t item_idx = item_uidx;
10026 for (; item_idx < filtered_ct; item_idx++, item_uidx++) {
10027 next_unset_unsafe_ck(exclude_arr, &item_uidx);
10028 item_arr[item_idx] = item_arr[item_uidx];
10029 }
10030 }
10031 */
10032
void inplace_collapse_uint32_incl(uint32_t* item_arr, uint32_t unfiltered_ct, uintptr_t* incl_arr, uint32_t filtered_ct) {
  // Compacts item_arr in place, keeping only the entries whose bits are set
  // in incl_arr.  No-op when nothing is filtered out.
  if (unfiltered_ct == filtered_ct) {
    return;
  }
  // entries before the first excluded index are already in place
  uint32_t read_uidx = next_unset_unsafe(incl_arr, 0);
  uint32_t write_idx = read_uidx;
  while (write_idx < filtered_ct) {
    next_set_unsafe_ck(incl_arr, &read_uidx);
    item_arr[write_idx++] = item_arr[read_uidx++];
  }
}
10044
alloc_and_init_collapsed_arr(char * item_arr,uintptr_t item_len,uintptr_t unfiltered_ct,uintptr_t * exclude_arr,uintptr_t filtered_ct,uint32_t read_only)10045 char* alloc_and_init_collapsed_arr(char* item_arr, uintptr_t item_len, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t filtered_ct, uint32_t read_only) {
10046 uint32_t item_uidx = 0;
10047 char* new_arr;
10048 char* wptr;
10049 char* wptr_end;
10050 uintptr_t item_uidx_stop;
10051 uintptr_t delta;
10052 if (read_only && (unfiltered_ct == filtered_ct)) {
10053 return item_arr;
10054 }
10055 if (bigstack_alloc_c(filtered_ct * item_len, &new_arr)) {
10056 return nullptr;
10057 }
10058 wptr = new_arr;
10059 wptr_end = &(new_arr[filtered_ct * item_len]);
10060 while (wptr < wptr_end) {
10061 item_uidx = next_unset_ul_unsafe(exclude_arr, item_uidx);
10062 item_uidx_stop = next_set_ul(exclude_arr, item_uidx, unfiltered_ct);
10063 delta = item_uidx_stop - item_uidx;
10064 memcpy(wptr, &(item_arr[item_uidx * item_len]), delta * item_len);
10065 wptr = &(wptr[delta * item_len]);
10066 item_uidx = item_uidx_stop;
10067 }
10068 return new_arr;
10069 }
10070
alloc_and_init_collapsed_arr_incl(char * item_arr,uintptr_t item_len,uintptr_t unfiltered_ct,uintptr_t * include_arr,uintptr_t filtered_ct,uint32_t read_only)10071 char* alloc_and_init_collapsed_arr_incl(char* item_arr, uintptr_t item_len, uintptr_t unfiltered_ct, uintptr_t* include_arr, uintptr_t filtered_ct, uint32_t read_only) {
10072 uint32_t item_uidx = 0;
10073 char* new_arr;
10074 char* wptr;
10075 char* wptr_end;
10076 uintptr_t item_uidx_stop;
10077 uintptr_t delta;
10078 if (read_only && (unfiltered_ct == filtered_ct)) {
10079 return item_arr;
10080 }
10081 if (bigstack_alloc_c(filtered_ct * item_len, &new_arr)) {
10082 return nullptr;
10083 }
10084 wptr = new_arr;
10085 wptr_end = &(new_arr[filtered_ct * item_len]);
10086 do {
10087 item_uidx = next_set_ul_unsafe(include_arr, item_uidx);
10088 item_uidx_stop = next_unset_ul(include_arr, item_uidx, unfiltered_ct);
10089 delta = item_uidx_stop - item_uidx;
10090 memcpy(wptr, &(item_arr[item_uidx * item_len]), delta * item_len);
10091 wptr = &(wptr[delta * item_len]);
10092 item_uidx = item_uidx_stop;
10093 } while (wptr < wptr_end);
10094 return new_arr;
10095 }
10096
void inplace_delta_collapse_arr(char* item_arr, uintptr_t item_len, uintptr_t filtered_ct_orig, uintptr_t filtered_ct_new, uintptr_t* exclude_orig, uintptr_t* exclude_new) {
  // Re-collapses item_arr (item_len bytes per entry, currently filtered by
  // exclude_orig) in place down to the stricter filter exclude_new.
  // NOTE(review): exclude_new appears to be assumed (not checked) to mark a
  // superset of exclude_orig's exclusions -- confirm against callers.
  // if this sort of collapse function is ever in an important loop, check
  // whether specialized 4-byte and 8-byte versions are much faster
  uintptr_t* exclude_orig_start = exclude_orig;
  char* write_end = &(item_arr[filtered_ct_new * item_len]);
  uintptr_t read_idx = 1;
  uint32_t uii = 0;
  char* write_ptr;
  uintptr_t ulii;
  uintptr_t uljj;
  uint32_t read_uidx;
  uint32_t ujj;
  if (filtered_ct_new == filtered_ct_orig) {
    return;
  }
  // find location of first newly excluded item; uii accumulates the
  // original exclusion count over the identical leading words
  while (1) {
    ulii = *exclude_orig;
    uljj = *exclude_new;
    if (ulii != uljj) {
      break;
    }
    uii += popcount_long(ulii);
    exclude_orig++;
    exclude_new++;
  }
  // rewind exclude_new to its start so IS_SET() below can take raw indices
  exclude_new -= ((uintptr_t)(exclude_orig - exclude_orig_start));
  read_uidx = BITCT * ((uintptr_t)(exclude_orig - exclude_orig_start));
  ujj = CTZLU(ulii ^ uljj); // bit offset of the first difference
  read_uidx += ujj;
  uii += popcount_long(ulii & ((ONELU << ujj) - ONELU));
  uii = read_uidx - uii; // now equal to # initial filtered indices skipped
  filtered_ct_new -= uii;
  // entries before the first newly excluded item are already in place;
  // rebase item_arr just past them
  item_arr = &(item_arr[uii * item_len]);
  write_ptr = item_arr;
  read_uidx++;
  for (; write_ptr < write_end; read_uidx++, read_idx++) {
    // read_uidx walks raw indices surviving exclude_orig; read_idx is the
    // corresponding (rebased) original-filtered index
    next_unset_unsafe_ck(exclude_orig_start, &read_uidx);
    if (IS_SET(exclude_new, read_uidx)) {
      continue;
    }
    memcpy(write_ptr, &(item_arr[read_idx * item_len]), item_len);
    write_ptr = &(write_ptr[item_len]);
  }
}
10142
void inplace_delta_collapse_bitfield(uintptr_t* read_ptr, uint32_t filtered_ct_new, uintptr_t* exclude_orig, uintptr_t* exclude_new) {
  // Bitfield analogue of inplace_delta_collapse_arr(): compacts the bits of
  // read_ptr[] (one source bit per position surviving exclude_orig) down to
  // the positions also surviving exclude_new, in place.
  // only guaranteed to zero out trailing bits up to the nearest 16-byte
  // boundary on 64-bit systems
  uintptr_t* write_ptr = read_ptr;
  uintptr_t readw = *read_ptr++; // current source word
  uintptr_t writew = 0; // bits accumulated for the next destination word
  uint32_t item_uidx = 0; // unfiltered index
  uint32_t item_mwidx = 0; // bit position within readw
  uint32_t item_idx = 0; // destination (new filtered) index
  for (; item_idx < filtered_ct_new; item_uidx++) {
    // each iteration consumes exactly one exclude_orig-surviving position,
    // and hence exactly one source bit
    next_unset_unsafe_ck(exclude_orig, &item_uidx);
    if (!is_set(exclude_new, item_uidx)) {
      if ((readw >> item_mwidx) & 1) {
        writew |= ONELU << (item_idx % BITCT);
      }
      if (!((++item_idx) % BITCT)) {
        // destination word full; flush it
        *write_ptr++ = writew;
        writew = 0;
      }
    }
    if (++item_mwidx == BITCT) {
      item_mwidx = 0;
      readw = *read_ptr++;
    }
  }
  if (write_ptr < read_ptr) {
    // flush the final partial word, then zero one more word if room remains
    *write_ptr++ = writew;
    if (write_ptr < read_ptr) {
      *write_ptr = 0;
    }
  }
}
10175
void copy_bitarr_subset_excl(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_excl, uint32_t raw_bitarr_size, uint32_t subset_size, uintptr_t* __restrict output_bitarr) {
  // Copies the bits of raw_bitarr at positions NOT set in subset_excl into
  // output_bitarr, packed contiguously.  subset_size is the number of bits
  // to write; raw_bitarr_size bounds the scan of the input.
  uintptr_t cur_write = 0;
  uint32_t item_uidx = 0;
  uint32_t write_bit = 0; // next bit position to fill in cur_write
  uint32_t item_idx = 0;
  uint32_t item_uidx_stop;
  if (!subset_excl[0]) {
    // fast path: no exclusions in the leading word(s), so whole words can be
    // block-copied up to the last fully-included word boundary
    item_uidx = next_set(subset_excl, 0, raw_bitarr_size & (~(BITCT - 1))) & (~(BITCT - 1));
    memcpy(output_bitarr, raw_bitarr, item_uidx / 8);
    item_idx = item_uidx;
    output_bitarr = &(output_bitarr[item_uidx / BITCT]);
  }
  while (item_idx < subset_size) {
    // [item_uidx, item_uidx_stop) is the next run of non-excluded positions
    item_uidx = next_unset_unsafe(subset_excl, item_uidx);
    item_uidx_stop = next_set(subset_excl, item_uidx, raw_bitarr_size);
    item_idx += item_uidx_stop - item_uidx;
    do {
      cur_write |= ((raw_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << write_bit;
      if (++write_bit == BITCT) {
        // output word full; flush it
        *output_bitarr++ = cur_write;
        cur_write = 0;
        write_bit = 0;
      }
    } while (++item_uidx < item_uidx_stop);
  }
  if (write_bit) {
    // flush trailing partial word
    *output_bitarr = cur_write;
  }
}
10205
void copy_bitarr_subset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_mask, uint32_t raw_bitarr_size, uint32_t subset_size, uintptr_t* __restrict output_bitarr) {
  // Inclusion-mask counterpart of copy_bitarr_subset_excl(): copies the bits
  // of raw_bitarr at positions SET in subset_mask into output_bitarr, packed
  // contiguously.  subset_size is the number of bits written.
  // full-blown blocked copy not worth it due to undefined CTZLU(0), >> 64,
  // << 64
  uintptr_t cur_output_word = 0;
  uint32_t item_uidx = 0;
  uint32_t word_write_shift = 0; // next bit position to fill in cur_output_word
  uint32_t item_idx = 0;
  uint32_t item_uidx_stop;
  if (!(~subset_mask[0])) {
    // fast path: leading word(s) are all-ones, so whole words can be
    // block-copied up to the last fully-included word boundary
    item_uidx = next_unset(subset_mask, 0, raw_bitarr_size & (~(BITCT - 1))) & (~(BITCT - 1));
    memcpy(output_bitarr, raw_bitarr, item_uidx / 8);
    item_idx = item_uidx;
    output_bitarr = &(output_bitarr[item_uidx / BITCT]);
  }
  while (item_idx < subset_size) {
    // [item_uidx, item_uidx_stop) is the next run of included positions
    item_uidx = next_set_unsafe(subset_mask, item_uidx);

    // can speed this up a bit once we have a guaranteed unset bit at the end
    item_uidx_stop = next_unset(subset_mask, item_uidx, raw_bitarr_size);

    item_idx += item_uidx_stop - item_uidx;
    do {
      cur_output_word |= ((raw_bitarr[item_uidx / BITCT] >> (item_uidx % BITCT)) & 1) << word_write_shift;
      if (++word_write_shift == BITCT) {
        // output word full; flush it
        *output_bitarr++ = cur_output_word;
        cur_output_word = 0;
        word_write_shift = 0;
      }
    } while (++item_uidx < item_uidx_stop);
  }
  if (word_write_shift) {
    // flush trailing partial word
    *output_bitarr = cur_output_word;
  }
}
10240
void uncollapse_copy_flip_include_arr(uintptr_t* collapsed_include_arr, uintptr_t unfiltered_ct, uintptr_t* exclude_arr, uintptr_t* output_exclude_arr) {
  // Expands a collapsed inclusion bitmask back to unfiltered coordinates,
  // writing the *complement* (an exclusion mask) to output_exclude_arr.
  // Positions already set in exclude_arr stay excluded; the remaining
  // positions consume successive (complemented) bits of
  // collapsed_include_arr.
  uintptr_t unfiltered_ctl = BITCT_TO_WORDCT(unfiltered_ct);
  uintptr_t* output_exclude_true_end = &(output_exclude_arr[unfiltered_ctl]);
  uintptr_t* output_exclude_end = &(output_exclude_arr[unfiltered_ct / BITCT]); // end of full words
  uintptr_t cea_read = 0; // buffered complemented collapsed-include bits
  uint32_t read_bit = BITCT; // bits consumed from cea_read; == BITCT -> empty
  uint32_t write_bit;
  uintptr_t cur_write;
  uintptr_t cur_read = 0;
  if (!exclude_arr[0]) {
    // copy-with-possible-offset is substantially slower, so treat initial lack
    // of offset as a special case
    for (cur_read = 0; cur_read < unfiltered_ctl; cur_read++) {
      *output_exclude_arr++ = ~(*collapsed_include_arr++);
      if (*(++exclude_arr)) {
        break;
      }
    }
  }
  while (output_exclude_arr < output_exclude_end) {
    cur_write = *exclude_arr++;
    // want efficient handling of all-zeroes and all-ones here
    if (cur_write) {
      // cur_read holds the bit positions still to be filled from the
      // collapsed mask
      cur_read = ~cur_write;
    uncollapse_copy_flip_include_arr_loop:
      while (cur_read) {
        write_bit = CTZLU(cur_read);
        if (read_bit == BITCT) {
          // refill the read buffer (complemented on the way in)
          cea_read = ~(*collapsed_include_arr++);
          read_bit = 0;
        }
        cur_write |= (cea_read & ONELU) << write_bit;
        cea_read >>= 1;
        read_bit++;
        cur_read &= cur_read - ONELU; // clear lowest remaining position
      }
      *output_exclude_arr = cur_write;
    } else {
      // no pre-excluded bits in this word: take BITCT collapsed bits at once,
      // stitching together the buffered remainder and a fresh word
      if (read_bit == BITCT) {
        *output_exclude_arr = ~(*collapsed_include_arr++);
      } else {
        cur_write = cea_read;
        cea_read = ~(*collapsed_include_arr++);
        *output_exclude_arr = cur_write | (cea_read << (BITCT - read_bit));
        cea_read >>= read_bit;
      }
    }
    output_exclude_arr++;
  }
  if (output_exclude_arr < output_exclude_true_end) {
    // trailing partial word: mask off bits past unfiltered_ct and re-enter
    // the bit-filling loop above
    cur_write = *exclude_arr++;
    cur_read = (~cur_write) & ((ONELU << (unfiltered_ct % BITCT)) - ONELU);
    goto uncollapse_copy_flip_include_arr_loop;
  }
}
10296
void copy_when_nonmissing(uintptr_t* loadbuf, char* source, uintptr_t elem_size, uintptr_t unfiltered_sample_ct, uintptr_t missing_ct, char* dest) {
  // Copies the elem_size-byte entries of source[] to dest[], omitting those
  // whose 2-bit entry in loadbuf equals 01 (low bit set, high bit clear --
  // presumably the missing-genotype code; verify against callers).
  // missing_ct must equal the number of such entries; when zero, this
  // degenerates to a single memcpy.
  uintptr_t* loadbuf_end = &(loadbuf[QUATERCT_TO_WORDCT(unfiltered_sample_ct)]);
  uintptr_t last_missing_p1 = 0; // one past the previous missing sample index
  uintptr_t sample_idx_offset = 0; // sample index of loadbuf word base
  uintptr_t cur_word;
  uintptr_t new_missing_idx;
  uintptr_t diff;
  if (!missing_ct) {
    memcpy(dest, source, unfiltered_sample_ct * elem_size);
    return;
  }
  do {
    cur_word = *loadbuf++;
    // isolate 2-bit fields equal to 01
    cur_word = cur_word & (~(cur_word >> 1)) & FIVEMASK;
    while (cur_word) {
      new_missing_idx = sample_idx_offset + (CTZLU(cur_word) / 2);
      diff = new_missing_idx - last_missing_p1;
      if (diff) {
        // copy the run of nonmissing entries preceding this missing one
        dest = memcpya(dest, &(source[last_missing_p1 * elem_size]), diff * elem_size);
      }
      last_missing_p1 = new_missing_idx + 1;
      cur_word &= cur_word - 1; // clear lowest set bit
    }
    sample_idx_offset += BITCT2;
  } while (loadbuf < loadbuf_end);
  // copy the final run after the last missing entry
  diff = unfiltered_sample_ct - last_missing_p1;
  if (diff) {
    memcpy(dest, &(source[last_missing_p1 * elem_size]), diff * elem_size);
  }
}
10327
uint32_t collapse_duplicate_ids(char* sorted_ids, uintptr_t id_ct, uintptr_t max_id_len, uint32_t* id_starts) {
  // Collapses a lexicographically sorted array of fixed-width ID strings so
  // each distinct ID appears once.  When id_starts is non-null, id_starts[k]
  // receives the pre-collapse index of the first occurrence of surviving ID k
  // (so e.g. the duplication count of any ID can be recovered by
  // subtraction).  Returns the id_ct of the collapsed array.
  uintptr_t read_idx;
  if (!id_ct) {
    return 0;
  }
  if (id_starts) {
    id_starts[0] = 0;
  }
  // scan the prefix that is already duplicate-free
  for (read_idx = 1; read_idx < id_ct; read_idx++) {
    if (!strcmp(&(sorted_ids[(read_idx - 1) * max_id_len]), &(sorted_ids[read_idx * max_id_len]))) {
      break;
    }
    if (id_starts) {
      id_starts[read_idx] = read_idx;
    }
  }
  uintptr_t write_idx = read_idx;
  // compact the remainder: keep each ID that differs from the last survivor
  while (++read_idx < id_ct) {
    if (strcmp(&(sorted_ids[(write_idx - 1) * max_id_len]), &(sorted_ids[read_idx * max_id_len]))) {
      strcpy(&(sorted_ids[write_idx * max_id_len]), &(sorted_ids[read_idx * max_id_len]));
      if (id_starts) {
        id_starts[write_idx] = read_idx;
      }
      write_idx++;
    }
  }
  return write_idx;
}
10369
void range_list_init(Range_list* range_list_ptr) {
  // Puts a Range_list into a well-defined empty state (no names allocated).
  range_list_ptr->name_ct = 0;
  range_list_ptr->name_max_len = 0;
  range_list_ptr->names = nullptr;
  range_list_ptr->starts_range = nullptr;
}
10376
void free_range_list(Range_list* range_list_ptr) {
  // Releases the heap-allocated members of a Range_list; free_cond() treats
  // null pointers as a no-op, so an empty-initialized list is safe here.
  free_cond(range_list_ptr->starts_range);
  free_cond(range_list_ptr->names);
}
10381
10382 // implementation used in PLINK 1.07 stats.cpp
10383 // probably want to remove this function and use erf() calls in the future
normdist(double zz)10384 double normdist(double zz) {
10385 double sqrt2pi = 2.50662827463;
10386 double t0;
10387 double z1;
10388 double p0;
10389 t0 = 1 / (1 + 0.2316419 * fabs(zz));
10390 z1 = exp(-0.5 * zz * zz) / sqrt2pi;
10391 p0 = z1 * t0 * (0.31938153 + t0 * (-0.356563782 + t0 * (1.781477937 + t0 * (-1.821255978 + 1.330274429 * t0))));
10392 return zz >= 0 ? 1 - p0 : p0;
10393 }
10394
double rand_normal(double* secondval_ptr) {
  // N(0, 1)
  // Box-Muller transform: produces two independent standard normal draws,
  // returning one and storing the other in *secondval_ptr.
  // NOTE(review): if rand_unif() can return exactly 0, log(0) yields -inf
  // here -- confirm rand_unif()'s range excludes 0.
  double dxx = sqrt(-2 * log(rand_unif()));
  double dyy = 2 * PI * rand_unif();
  *secondval_ptr = dxx * cos(dyy);
  return dxx * sin(dyy);
}
10402
void init_sfmt64_from_sfmt32(sfmt_t* sfmt32, sfmt_t* sfmt64) {
  // sfmt_genrand_uint64() must not be called on a generator which has
  // already produced 32-bit values.  So when 64-bit draws are needed, we
  // seed a dedicated 64-bit generator here from four 32-bit draws, and keep
  // using genrand_uint32() on the main generator.
  uint32_t seed_arr[4];
  for (uint32_t uii = 0; uii < 4; uii++) {
    seed_arr[uii] = sfmt_genrand_uint32(sfmt32);
  }
  sfmt_init_by_array(sfmt64, seed_arr, 4);
}
10415
void generate_perm1_interleaved(uint32_t tot_ct, uint32_t set_ct, uintptr_t perm_idx, uintptr_t perm_ct, uintptr_t* perm_buf) {
  // Fills permutations [perm_idx, perm_ct) of perm_buf with independent
  // random bit vectors containing exactly set_ct of tot_ct set bits.
  // Storage is interleaved: word w of permutation p is perm_buf[p + w *
  // perm_ct].
  uintptr_t tot_ctl = BITCT_TO_WORDCT(tot_ct);
  uintptr_t tot_rem = tot_ct & (BITCT - 1);
  // rejection-sampling setup: 32-bit draws in [0, upper_bound] map uniformly
  // onto [0, tot_ct) via division by tot_quotient; larger draws are redrawn
  uint32_t tot_quotient = (uint32_t)(0x100000000LLU / tot_ct);
  uint32_t upper_bound = tot_ct * tot_quotient - 1;
  uintptr_t uljj = perm_ct - perm_idx; // # of permutations to fill
  uint32_t totq_preshift;
  uint64_t totq_magic;
  uint32_t totq_postshift;
  uint32_t totq_incr;
  uintptr_t* pbptr;
  uint32_t num_set;
  uint32_t urand;
  uintptr_t ulii;
  // seeing as how we're gonna divide by the same number a billion times or so,
  // it just might be worth optimizing that division...
  magic_num(tot_quotient, &totq_magic, &totq_preshift, &totq_postshift, &totq_incr);
  if (set_ct * 2 < tot_ct) {
    // sparse case: start all-zero, set set_ct distinct random bits
    for (ulii = 0; ulii < tot_ctl; ulii++) {
      fill_ulong_zero(uljj, &(perm_buf[perm_idx + (ulii * perm_ct)]));
    }
    for (; perm_idx < perm_ct; perm_idx++) {
      pbptr = &(perm_buf[perm_idx]);
      for (num_set = 0; num_set < set_ct; num_set++) {
        do {
          do {
            urand = sfmt_genrand_uint32(&g_sfmt);
          } while (urand > upper_bound);
          // this is identical to ulii = urand / tot_quotient
          ulii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
          uljj = ulii / BITCT; // word index
          ulii &= (BITCT - 1); // bit index within word
        } while ((pbptr[uljj * perm_ct] >> ulii) & 1); // already set: redraw
        pbptr[uljj * perm_ct] |= (ONELU << ulii);
      }
    }
  } else {
    // dense case: start all-one, clear (tot_ct - set_ct) distinct bits
    for (ulii = 0; ulii < tot_ctl; ulii++) {
      fill_ulong_one(uljj, &(perm_buf[perm_idx + (ulii * perm_ct)]));
    }
    // "set" has reversed meaning here
    set_ct = tot_ct - set_ct;
    for (; perm_idx < perm_ct; perm_idx++) {
      pbptr = &(perm_buf[perm_idx]);
      for (num_set = 0; num_set < set_ct; num_set++) {
        do {
          do {
            urand = sfmt_genrand_uint32(&g_sfmt);
          } while (urand > upper_bound);
          ulii = (totq_magic * ((urand >> totq_preshift) + totq_incr)) >> totq_postshift;
          uljj = ulii / BITCT;
          ulii &= (BITCT - 1);
        } while (!((pbptr[uljj * perm_ct] >> ulii) & 1)); // already clear: redraw
        pbptr[uljj * perm_ct] &= ~(ONELU << ulii);
      }
    }
    if (tot_rem) {
      // zero the trailing bits past tot_ct in the last word of each
      // permutation.
      // NOTE(review): perm_idx has already advanced to perm_ct by this point,
      // so the loop below appears to never execute and the trailing one-bits
      // from fill_ulong_one() would remain -- verify whether callers mask
      // trailing bits themselves, or whether the pre-loop perm_idx should
      // have been saved.
      uljj = (~ZEROLU) >> (BITCT - tot_rem);
      pbptr = &(perm_buf[(tot_ctl - 1) * perm_ct + perm_idx]);
      for (ulii = perm_idx; ulii < perm_ct; ulii++) {
        *pbptr &= uljj;
        pbptr++;
      }
    }
  }
}
10482
uint32_t cubic_real_roots(double coef_a, double coef_b, double coef_c, double* solutions) {
  // Analytically finds all real roots of x^3 + ax^2 + bx + c, saving them in
  // solutions[] (sorted from smallest to largest), and returning the count.
  // Multiple roots are only returned/counted once.
  // Additional research into numerical stability may be in order here.
  //
  // Discriminant split on the depressed cubic: three real roots when
  // r^2 < q^3 (handled trigonometrically), otherwise one or two.
  double a2 = coef_a * coef_a;
  double qq = (a2 - 3 * coef_b) * (1.0 / 9.0);
  double rr = (2 * a2 * coef_a - 9 * coef_a * coef_b + 27 * coef_c) * (1.0 / 54.0);
  double r2 = rr * rr;
  double q3 = qq * qq * qq;
  double adiv3 = coef_a * (1.0 / 3.0); // shift back from depressed cubic
  double sq;
  double dxx;
  if (r2 < q3) {
    // three real roots
    sq = sqrt(qq);
    dxx = acos(rr / (qq * sq)) * (1.0 / 3.0);
    sq *= -2;
    solutions[0] = sq * cos(dxx) - adiv3;
    solutions[1] = sq * cos(dxx + (2.0 * PI / 3.0)) - adiv3;
    solutions[2] = sq * cos(dxx - (2.0 * PI / 3.0)) - adiv3;
    // now sort and check for within-epsilon equality
    if (solutions[0] > solutions[1]) {
      dxx = solutions[0];
      solutions[0] = solutions[1];
      if (dxx > solutions[2]) {
        // old solutions[0] was the largest of the three
        solutions[1] = solutions[2];
        solutions[2] = dxx;
      } else {
        solutions[1] = dxx;
      }
      if (solutions[0] > solutions[1]) {
        dxx = solutions[0];
        solutions[0] = solutions[1];
        solutions[1] = dxx;
      }
    } else if (solutions[1] > solutions[2]) {
      dxx = solutions[1];
      solutions[1] = solutions[2];
      solutions[2] = dxx;
    }
    // collapse near-equal neighbors so multiple roots are counted once
    if (solutions[1] - solutions[0] < EPSILON) {
      solutions[1] = solutions[2];
      return (solutions[1] - solutions[0] < EPSILON)? 1 : 2;
    }
    return (solutions[2] - solutions[1] < EPSILON)? 2 : 3;
  }
  // single-real-root (or double-root) branch
  dxx = -pow(fabs(rr) + sqrt(r2 - q3), 1.0 / 3.0);
  if (dxx == 0.0) {
    solutions[0] = -adiv3;
    return 1;
  }
  if (rr < 0.0) {
    dxx = -dxx;
  }
  sq = qq / dxx;
  solutions[0] = dxx + sq - adiv3;
  // use of regular epsilon here has actually burned us
  if (fabs(dxx - sq) >= (EPSILON * 8)) {
    return 1;
  }
  // borderline case: treat as a double root
  if (dxx >= 0.0) {
    solutions[1] = solutions[0];
    solutions[0] = -dxx - adiv3;
  } else {
    solutions[1] = -dxx - adiv3;
  }
  return 2;
}
10552
void join_threads(pthread_t* threads, uint32_t ctp1) {
  // Waits for the (ctp1 - 1) worker threads in threads[] to terminate and
  // releases their handles.  ctp1 is the total thread count including the
  // main thread, which is not joined.
  if (!(--ctp1)) {
    return; // only the main thread exists; nothing to join
  }
#ifdef _WIN32
  WaitForMultipleObjects(ctp1, threads, 1, INFINITE);
  for (uint32_t uii = 0; uii < ctp1; ++uii) {
    CloseHandle(threads[uii]);
  }
#else
  for (uint32_t uii = 0; uii < ctp1; uii++) {
    pthread_join(threads[uii], nullptr);
  }
#endif
}
10568
#ifdef _WIN32
int32_t spawn_threads(pthread_t* threads, unsigned (__stdcall *start_routine)(void*), uintptr_t ct)
#else
int32_t spawn_threads(pthread_t* threads, void* (*start_routine)(void*), uintptr_t ct)
#endif
{
  // Launches (ct - 1) worker threads running start_routine with arguments
  // (void*)1 .. (void*)(ct - 1); the caller is expected to handle index 0
  // itself.  Returns 0 on success.  On launch failure, the already-spawned
  // workers are joined and -1 is returned.
  uintptr_t ulii;
  if (ct == 1) {
    return 0; // single-threaded: nothing to spawn
  }
  for (ulii = 1; ulii < ct; ulii++) {
#ifdef _WIN32
    // 4096 = requested stack size passed to _beginthreadex
    threads[ulii - 1] = (HANDLE)_beginthreadex(nullptr, 4096, start_routine, (void*)ulii, 0, nullptr);
    if (!threads[ulii - 1]) {
      join_threads(threads, ulii);
      return -1;
    }
#else
    if (pthread_create(&(threads[ulii - 1]), nullptr, start_routine, (void*)ulii)) {
      join_threads(threads, ulii);
      return -1;
    }
#endif
  }
  return 0;
}
10595
10596 // Okay, it's time to bite the bullet and stop creating and destroying threads
10597 // like crazy, at least in the small-block-size GRM calculation; Intel
10598 // MKL-powered GCTA 1.24 blew away our code on the NIH 512-core test machine
10599 // when the maximum number of threads was used. Mostly because threads were
10600 // actually costing much more in creation/destruction time than they saved;
10601 // much better wall-clock times would have resulted from manually setting
10602 // --threads to a low number. That's not cool.
10603 //
10604 // New framework:
10605 // * On all operating systems, g_is_last_thread_block indicates whether all
10606 // threads should terminate upon completion of the current block. (Initially
10607 // had this volatile, then realized that the presence of the sync-wait should
10608 // be enough to force the global variable to be reread.)
10609 // * On Linux and OS X, if we aren't dealing with the final block,
10610 // spawn_threads2() also reinitializes g_thread_active_ct.
10611 // * On Linux and OS X, spawn_threads2() checks if g_thread_mutex_initialized
//   is set. If it is not, it is set, and g_thread_sync_mutex,
10613 // g_thread_cur_block_done_condvar and g_thread_start_next_condvar are
10614 // initialized, then threads are launched.
10615 // If it has, pthread_cond_broadcast() acts on g_thread_start_next_condvar.
10616 // * On Windows, spawn_threads2() checks if g_thread_mutex_initialized is set.
10617 // If it has not, it, along with g_thread_start_next_event[] and
10618 // g_thread_cur_block_done_events[], are initialized, then the threads are
10619 // launched. If it has, SetEvent() acts on g_thread_start_next_event[].
10620 // (It used to act on only one event; then I realized that safely dealing
10621 // with a manual-reset event could be a pain if the first thread finishes
10622 // before the last one wakes up...)
10623 // * Thread functions are expected to be of the form
10624 // THREAD_RET_TYPE function_name(void* arg) {
10625 // uintptr_t tidx = (uintptr_t)arg;
10626 // ...
10627 // while (1) {
10628 // ... // process current block
10629 // if ((!tidx) || g_is_last_thread_block) {
10630 // THREAD_RETURN;
10631 // }
10632 // THREAD_BLOCK_FINISH(tidx);
10633 // }
10634 // }
10635 // * On Linux and OS X, THREAD_BLOCK_FINISH() acquires a mutex, decrements
10636 // g_thread_active_ct, calls pthread_cond_signal() on
10637 // g_thread_cur_block_done_condvar iff g_thread_active_ct is now zero, then
10638 // unconditionally calls pthread_cond_wait on g_thread_start_next_condvar and
10639 // the mutex.
10640 // * On Windows, THREAD_BLOCK_FINISH() calls SetEvent() on
10641 // g_thread_cur_block_done_events[tidx - 1], then waits on
10642 // g_thread_start_next_event[tidx - 1].
10643 // * If the termination variable is set, join_threads2() waits for all threads
10644 // to complete, then cleans up all multithreading objects. Otherwise, on
10645 // Linux and OS X, it acquires the mutex and calls pthread_cond_wait() on
10646 // g_thread_cur_block_done_condvar and the mutex; and on Windows, it calls
10647 // WaitForMultipleObjects() on g_thread_cur_block_done_events[].
10648 // WaitForMultipleObjects has a 64 object limit, and for now it doesn't seem
10649 // too important to use a for loop to handle more objects?... well, we can
10650 // add that if anyone wants it, but for now the Windows thread limit is 65
10651 // (the main thread isn't part of the wait).
10652 //
10653 // This is only very slightly better than the original approach on my old
10654 // MacBook Pro (since threading overhead was never high to begin with, there
10655 // being only 2 cores...), but the impact should be more noticeable on heavily
10656 // multicore machines.
10657 //
10658 // The next performance improvement to make is double-buffering; tricky to
10659 // estimate how much (if any) "consumption" the main I/O thread should be
10660 // doing, though, so it may want a job queue to go with it.
10661
10662 uintptr_t g_thread_spawn_ct;
10663 uint32_t g_is_last_thread_block = 0;
10664 #ifdef _WIN32
10665 HANDLE g_thread_start_next_event[MAX_THREADS];
10666 HANDLE g_thread_cur_block_done_events[MAX_THREADS];
10667 #else
10668 static pthread_mutex_t g_thread_sync_mutex;
10669 static pthread_cond_t g_thread_cur_block_done_condvar;
10670 static pthread_cond_t g_thread_start_next_condvar;
10671 uint32_t g_thread_active_ct;
10672
void THREAD_BLOCK_FINISH(uintptr_t tidx) {
  // Worker-side synchronization point for the persistent-thread framework
  // described above: marks this worker done with the current block (waking
  // join_threads2() when it is the last one), then sleeps until
  // spawn_threads2() starts the next block.  tidx is unused in this
  // (non-Windows) implementation; it mirrors the Windows variant's
  // per-thread-event signature.
  uintptr_t initial_spawn_ct = g_thread_spawn_ct;
  pthread_mutex_lock(&g_thread_sync_mutex);
  if (!(--g_thread_active_ct)) {
    // last worker to finish the block wakes the main thread
    pthread_cond_signal(&g_thread_cur_block_done_condvar);
  }
  while (g_thread_spawn_ct == initial_spawn_ct) {
    // spurious wakeup guard
    pthread_cond_wait(&g_thread_start_next_condvar, &g_thread_sync_mutex);
  }
  pthread_mutex_unlock(&g_thread_sync_mutex);
}
10685 #endif
10686 static uint32_t g_thread_mutex_initialized = 0;
10687
void join_threads2(pthread_t* threads, uint32_t ctp1, uint32_t is_last_block) {
  // Companion to spawn_threads2(); ctp1 = worker count + 1 (including the
  // main thread).  Waits for all workers to finish the current block.  When
  // is_last_block is set, the workers are fully joined and the sync objects
  // destroyed; otherwise the workers remain parked in THREAD_BLOCK_FINISH()
  // and, on non-Windows builds, g_thread_sync_mutex is still held on return.
  uint32_t uii;
  if (!(--ctp1)) {
    if (is_last_block) {
      // allow another multithreaded function to be called later
      g_thread_mutex_initialized = 0;
    }
    return;
  }
#ifdef _WIN32
  if (!is_last_block) {
    WaitForMultipleObjects(ctp1, g_thread_cur_block_done_events, 1, INFINITE);
  } else {
    WaitForMultipleObjects(ctp1, threads, 1, INFINITE);
    for (uii = 0; uii < ctp1; uii++) {
      CloseHandle(threads[uii]);
      CloseHandle(g_thread_start_next_event[uii]);
      CloseHandle(g_thread_cur_block_done_events[uii]);
    }
    g_thread_mutex_initialized = 0;
  }
#else
  if (!is_last_block) {
    pthread_mutex_lock(&g_thread_sync_mutex);
    while (g_thread_active_ct) {
      // spurious-wakeup-safe wait for the last worker's signal
      pthread_cond_wait(&g_thread_cur_block_done_condvar, &g_thread_sync_mutex);
    }
    // keep mutex until next block loaded
  } else {
    for (uii = 0; uii < ctp1; uii++) {
      pthread_join(threads[uii], nullptr);
    }
    // slightly inefficient if there are multiple multithreaded commands being
    // run, but if different commands require different numbers of threads,
    // optimizing this sort of thing away could introduce bugs...
    pthread_mutex_destroy(&g_thread_sync_mutex);
    pthread_cond_destroy(&g_thread_cur_block_done_condvar);
    pthread_cond_destroy(&g_thread_start_next_condvar);
    g_thread_mutex_initialized = 0;
  }
#endif
}
10730
#ifdef _WIN32
int32_t spawn_threads2(pthread_t* threads, unsigned (__stdcall *start_routine)(void*), uintptr_t ct, uint32_t is_last_block)
#else
int32_t spawn_threads2(pthread_t* threads, void* (*start_routine)(void*), uintptr_t ct, uint32_t is_last_block)
#endif
{
  // Persistent-thread-pool version of spawn_threads(); see the long comment
  // block above for the full protocol.  On the first call (or the first
  // call after a last-block round), the sync objects are created and
  // (ct - 1) workers are launched; on subsequent calls the parked workers
  // are woken for the next block instead.  Returns 0 on success, -1 on
  // launch failure (with partial state cleaned up).
  uintptr_t ulii;
  // this needs to go before the ct == 1 check since start_routine() might need
  // it
  if (g_is_last_thread_block != is_last_block) {
    // might save us an unnecessary memory write that confuses the cache
    // coherency logic?
    g_is_last_thread_block = is_last_block;
  }
#ifdef _WIN32
  if (!g_thread_mutex_initialized) {
    g_thread_spawn_ct = 0;
    g_thread_mutex_initialized = 1;
    if (ct == 1) {
      return 0;
    }
    // one auto-reset event pair per worker (see protocol comment above)
    for (ulii = 1; ulii < ct; ulii++) {
      g_thread_start_next_event[ulii - 1] = CreateEvent(nullptr, FALSE, FALSE, nullptr);
      g_thread_cur_block_done_events[ulii - 1] = CreateEvent(nullptr, FALSE, FALSE, nullptr);
    }
    for (ulii = 1; ulii < ct; ulii++) {
      threads[ulii - 1] = (HANDLE)_beginthreadex(nullptr, 4096, start_routine, (void*)ulii, 0, nullptr);
      if (!threads[ulii - 1]) {
        // launch failure: tear down whatever was created so far
        if (ulii > 1) {
          join_threads2(threads, ulii, is_last_block);
          if (!is_last_block) {
            for (uintptr_t uljj = 0; uljj < ulii - 1; ++uljj) {
              CloseHandle(threads[uljj]);
            }
          }
        }
        if ((!is_last_block) || (ulii == 1)) {
          for (uint32_t uii = 0; uii < ct - 1; ++uii) {
            CloseHandle(g_thread_start_next_event[uii]);
            CloseHandle(g_thread_cur_block_done_events[uii]);
          }
          g_thread_mutex_initialized = 0;
        }
        return -1;
      }
    }
  } else {
    g_thread_spawn_ct++;
    // wake each parked worker for the next block
    for (ulii = 1; ulii < ct; ulii++) {
      SetEvent(g_thread_start_next_event[ulii - 1]);
    }
  }
#else
  if (!is_last_block) {
    g_thread_active_ct = ct - 1;
  }
  if (!g_thread_mutex_initialized) {
    g_thread_spawn_ct = 0; // tidx 0 may need to know modulus
    g_thread_mutex_initialized = 1;
    if (ct == 1) {
      return 0;
    }
    if (pthread_mutex_init(&g_thread_sync_mutex, nullptr) ||
        pthread_cond_init(&g_thread_cur_block_done_condvar, nullptr) ||
        pthread_cond_init(&g_thread_start_next_condvar, nullptr)) {
      return -1;
    }
    for (ulii = 1; ulii < ct; ulii++) {
      if (pthread_create(&(threads[ulii - 1]), nullptr, start_routine, (void*)ulii)) {
        // launch failure: tear down whatever was created so far
        if (ulii > 1) {
          join_threads2(threads, ulii, is_last_block);
          if (!is_last_block) {
            for (uintptr_t uljj = 0; uljj < ulii - 1; ++uljj) {
              pthread_cancel(threads[uljj]);
            }
          }
        }
        if ((!is_last_block) || (ulii == 1)) {
          pthread_mutex_destroy(&g_thread_sync_mutex);
          pthread_cond_destroy(&g_thread_cur_block_done_condvar);
          pthread_cond_destroy(&g_thread_start_next_condvar);
          g_thread_mutex_initialized = 0;
        }
        return -1;
      }
    }
  } else {
    g_thread_spawn_ct++;
    if (ct == 1) {
      return 0;
    }
    // still holding mutex
    pthread_mutex_unlock(&g_thread_sync_mutex);
    pthread_cond_broadcast(&g_thread_start_next_condvar);
  }
#endif
  return 0;
}
10829
10830 sfmt_t** g_sfmtp_arr;
10831
bigstack_init_sfmtp(uint32_t thread_ct)10832 uint32_t bigstack_init_sfmtp(uint32_t thread_ct) {
10833 uint32_t uibuf[4];
10834 uint32_t tidx;
10835 uint32_t uii;
10836 g_sfmtp_arr = (sfmt_t**)bigstack_alloc(thread_ct * sizeof(intptr_t));
10837 if (!g_sfmtp_arr) {
10838 return 1;
10839 }
10840 g_sfmtp_arr[0] = &g_sfmt;
10841 if (thread_ct > 1) {
10842 for (tidx = 1; tidx < thread_ct; tidx++) {
10843 g_sfmtp_arr[tidx] = (sfmt_t*)bigstack_alloc(sizeof(sfmt_t));
10844 if (!g_sfmtp_arr[tidx]) {
10845 return 1;
10846 }
10847 for (uii = 0; uii < 4; uii++) {
10848 uibuf[uii] = sfmt_genrand_uint32(&g_sfmt);
10849 }
10850 sfmt_init_by_array(g_sfmtp_arr[tidx], uibuf, 4);
10851 }
10852 }
10853 return 0;
10854 }
10855