// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 3 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library.  If not, see <http://www.gnu.org/licenses/>.


#include "pgenlib_misc.h"

#ifdef __cplusplus
namespace plink2 {
#endif

#ifdef USE_AVX2
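// Worked example of the BMI2-based nyp extraction below: each set bit of the
// subset halfword becomes an aligned 2-bit mask via
// 3 * UnpackHalfwordToWord().  E.g. halfword 0b0101 unpacks to 0b00010001,
// and tripling gives pext mask 0b00110011, so _pext_u64() gathers entries 0
// and 2 of the raw nyparr word into the low 4 bits of the result.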
void CopyNyparrNonemptySubset(const uintptr_t* __restrict raw_nyparr, const uintptr_t* __restrict subset_mask, uint32_t raw_nyparr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_nyparr) {
  if (subset_entry_ct == raw_nyparr_entry_ct) {
    memcpy(output_nyparr, raw_nyparr, DivUp(subset_entry_ct, kBitsPerWordD2) * sizeof(intptr_t));
    ZeroTrailingNyps(subset_entry_ct, output_nyparr);
    return;
  }
  assert(subset_entry_ct);
  uintptr_t cur_output_word = 0;

  uintptr_t* output_nyparr_iter = output_nyparr;

  uintptr_t* output_nyparr_last = &(output_nyparr[subset_entry_ct / kBitsPerWordD2]);
  const uint32_t word_write_shift_end = 2 * (subset_entry_ct % kBitsPerWordD2);
  uint32_t word_write_shift = 0;
  for (uint32_t subset_mask_widx = 0; ; ++subset_mask_widx) {
    const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
    if (cur_include_word) {
      uint32_t cur_include_halfword = S_CAST(Halfword, cur_include_word);
      for (uint32_t wordhalf_idx = 0; ; ++wordhalf_idx) {
        if (cur_include_halfword) {
          uintptr_t extracted_bits = raw_nyparr[subset_mask_widx * 2 + wordhalf_idx];
          uint32_t set_bit_ct = kBitsPerWord;
          if (cur_include_halfword != UINT32_MAX) {
            const uintptr_t pext_mask = 3 * UnpackHalfwordToWord(cur_include_halfword);
            extracted_bits = _pext_u64(extracted_bits, pext_mask);
            set_bit_ct = PopcountWord(pext_mask);
          }
          cur_output_word |= extracted_bits << word_write_shift;
          word_write_shift += set_bit_ct;
          if (word_write_shift >= kBitsPerWord) {
            *output_nyparr_iter++ = cur_output_word;
            word_write_shift -= kBitsPerWord;
            cur_output_word = 0;
            if (word_write_shift) {
              cur_output_word = extracted_bits >> (set_bit_ct - word_write_shift);
            }
          }
        }
        if (wordhalf_idx) {
          break;
        }
        cur_include_halfword = cur_include_word >> kBitsPerWordD2;
      }
      if (output_nyparr_iter == output_nyparr_last) {
        if (word_write_shift == word_write_shift_end) {
          if (word_write_shift_end) {
            *output_nyparr_last = cur_output_word;
          }
          return;
        }
      }
    }
  }
}

// write_bit_idx_start assumed to be < kBitsPerWord
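// Worked example of the match_word mechanic: XORing genoarr against
// match_word zeroes exactly the matching 2-bit entries, and
// (~(x | (x >> 1))) & kMask5555 then sets the low bit of each such entry.
// With match_word = kMaskAAAA (target 0b10) and geno entries
// {0b10, 0b11, 0b10, 0b00}, the resulting mask flags entries 0 and 2.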
void CopyGenomatchSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t write_bit_idx_start, uint32_t bit_ct, uintptr_t* __restrict output_bitarr) {
  const uint32_t bit_idx_end = bit_ct + write_bit_idx_start;
  const uint32_t bit_idx_end_lowbits = bit_idx_end % kBitsPerWord;
  const Halfword* raw_bitarr_alias = R_CAST(const Halfword*, raw_bitarr);
  uintptr_t* output_bitarr_iter = output_bitarr;
  uintptr_t* output_bitarr_last = &(output_bitarr[bit_idx_end / kBitsPerWord]);
  uintptr_t cur_output_word = 0;
  uint32_t read_widx = UINT32_MAX;  // deliberate overflow
  uint32_t write_idx_lowbits = write_bit_idx_start;
  while ((output_bitarr_iter != output_bitarr_last) || (write_idx_lowbits != bit_idx_end_lowbits)) {
    uintptr_t cur_mask_word;
    // sparse genoarr optimization
    // guaranteed to terminate since there's at least one more set bit
    do {
      // todo: try reading two genoarr words at a time.  would need to be very
      // careful with the possible trailing word, though.
      // more important to optimize this function now that regular phased-call
      // handling code is using it.
      cur_mask_word = genoarr[++read_widx] ^ match_word;
      cur_mask_word = (~(cur_mask_word | (cur_mask_word >> 1))) & kMask5555;
    } while (!cur_mask_word);
    uintptr_t extracted_bits = raw_bitarr_alias[read_widx];
    uint32_t set_bit_ct = kBitsPerWordD2;
    if (cur_mask_word != kMask5555) {
      const uintptr_t cur_mask_hw = PackWordToHalfword(cur_mask_word);
      set_bit_ct = PopcountWord(cur_mask_word);
      extracted_bits = _pext_u64(extracted_bits, cur_mask_hw);
    }
    cur_output_word |= extracted_bits << write_idx_lowbits;
    const uint32_t new_write_idx_lowbits = write_idx_lowbits + set_bit_ct;
    if (new_write_idx_lowbits >= kBitsPerWord) {
      *output_bitarr_iter++ = cur_output_word;
      // ...and these are the bits that fell off
      // impossible for write_idx_lowbits to be zero here
      cur_output_word = extracted_bits >> (kBitsPerWord - write_idx_lowbits);
    }
    write_idx_lowbits = new_write_idx_lowbits % kBitsPerWord;
  }
  if (write_idx_lowbits) {
    *output_bitarr_iter = cur_output_word;
  }
}

// Variant of ExpandBytearr() which is based off a target 2-bit value instead
// of single expand_mask bits.  expand_size must be the number of instances of
// the target value in genoarr.
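// Worked example of the _pdep_u64() scatter used below:
// _pdep_u64(0b101, 0b01000101) deposits the three low source bits at the
// three set mask positions, yielding 0b01000001.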
void ExpandBytearrFromGenoarr(const void* __restrict compact_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t genoword_ct, uint32_t expand_size, uint32_t read_start_bit, uintptr_t* __restrict target) {
  const uint32_t expand_sizex_m1 = expand_size + read_start_bit - 1;
  const uint32_t leading_byte_ct = 1 + (expand_sizex_m1 % kBitsPerWord) / CHAR_BIT;
  const uint32_t genoword_ct_m1 = genoword_ct - 1;
  uintptr_t compact_word = SubwordLoad(compact_bitarr, leading_byte_ct) >> read_start_bit;
  const uintptr_t* compact_bitarr_iter = R_CAST(const uintptr_t*, &(S_CAST(const unsigned char*, compact_bitarr)[leading_byte_ct]));
  uint32_t compact_idx_lowbits = read_start_bit + CHAR_BIT * (sizeof(intptr_t) - leading_byte_ct);
  for (uint32_t widx = 0; ; widx += 2) {
    uintptr_t mask_word;
    if (widx >= genoword_ct_m1) {
      if (widx > genoword_ct_m1) {
        return;
      }
      mask_word = 0;
    } else {
      const uintptr_t geno_word1 = genoarr[widx + 1] ^ match_word;
      mask_word = PackWordToHalfwordMask5555(~(geno_word1 | (geno_word1 >> 1)));
      mask_word = mask_word << 32;
    }
    const uintptr_t geno_word0 = genoarr[widx] ^ match_word;
    mask_word |= PackWordToHalfwordMask5555(~(geno_word0 | (geno_word0 >> 1)));
    uintptr_t write_word = 0;
    if (mask_word) {
      const uint32_t mask_set_ct = PopcountWord(mask_word);
      uint32_t next_compact_idx_lowbits = compact_idx_lowbits + mask_set_ct;
      if (next_compact_idx_lowbits <= kBitsPerWord) {
        write_word = _pdep_u64(compact_word, mask_word);
        if (mask_set_ct != kBitsPerWord) {
          compact_word = compact_word >> mask_set_ct;
        } else {
          // avoid nasal demons
          compact_word = 0;
        }
      } else {
#  ifdef __arm__
#    error "Unaligned accesses in ExpandBytearrFromGenoarr()."
#  endif
        uintptr_t next_compact_word = *compact_bitarr_iter++;
        next_compact_idx_lowbits -= kBitsPerWord;
        compact_word |= next_compact_word << (kBitsPerWord - compact_idx_lowbits);
        write_word = _pdep_u64(compact_word, mask_word);
        if (next_compact_idx_lowbits != kBitsPerWord) {
          compact_word = next_compact_word >> next_compact_idx_lowbits;
        } else {
          compact_word = 0;
        }
      }
      compact_idx_lowbits = next_compact_idx_lowbits;
    }
    target[widx / 2] = write_word;
  }
}
#else  // !USE_AVX2
void CopyNyparrNonemptySubset(const uintptr_t* __restrict raw_nyparr, const uintptr_t* __restrict subset_mask, uint32_t raw_nyparr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_nyparr) {
  if (subset_entry_ct == raw_nyparr_entry_ct) {
    // subset_mask may be nullptr in this case
    memcpy(output_nyparr, raw_nyparr, DivUp(subset_entry_ct, kBitsPerWordD2) * sizeof(intptr_t));
    ZeroTrailingNyps(subset_entry_ct, output_nyparr);
    return;
  }
  assert(subset_entry_ct);
  assert(raw_nyparr_entry_ct >= subset_entry_ct);
  uintptr_t cur_output_word = 0;

  uintptr_t* output_nyparr_iter = output_nyparr;

  uintptr_t* output_nyparr_last = &(output_nyparr[subset_entry_ct / kBitsPerWordD2]);
  const uint32_t word_write_halfshift_end = subset_entry_ct % kBitsPerWordD2;
  uint32_t word_write_halfshift = 0;
  // if <= 2/3-filled, use sparse copy algorithm
  // (tried CopyBitarrSubset() approach, that actually worsened things)
  if (subset_entry_ct * (3 * k1LU) <= raw_nyparr_entry_ct * (2 * k1LU)) {
    for (uint32_t subset_mask_widx = 0; ; ++subset_mask_widx) {
      const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
      if (cur_include_word) {
        uint32_t cur_include_halfword = S_CAST(Halfword, cur_include_word);
        for (uint32_t wordhalf_idx = 0; ; ++wordhalf_idx) {
          if (cur_include_halfword) {
            uintptr_t raw_nyparr_word = raw_nyparr[subset_mask_widx * 2 + wordhalf_idx];
            do {
              uint32_t rqa_idx_lowbits = ctzu32(cur_include_halfword);
              cur_output_word |= ((raw_nyparr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
              if (++word_write_halfshift == kBitsPerWordD2) {
                *output_nyparr_iter++ = cur_output_word;
                word_write_halfshift = 0;
                cur_output_word = 0;
              }
              cur_include_halfword &= cur_include_halfword - 1;
            } while (cur_include_halfword);
          }
          if (wordhalf_idx) {
            break;
          }
          cur_include_halfword = cur_include_word >> kBitsPerWordD2;
        }
        if (output_nyparr_iter == output_nyparr_last) {
          if (word_write_halfshift == word_write_halfshift_end) {
            if (word_write_halfshift_end) {
              *output_nyparr_last = cur_output_word;
            }
            return;
          }
        }
      }
    }
  }
  // blocked copy
  const uintptr_t* raw_nyparr_iter = raw_nyparr;
  for (; ; ++subset_mask) {
    const uintptr_t cur_include_word = *subset_mask;
    uintptr_t cur_include_halfword = S_CAST(Halfword, cur_include_word);
    for (uint32_t wordhalf_idx = 0; ; ++wordhalf_idx) {
      uintptr_t raw_nyparr_word = *raw_nyparr_iter++;
      while (cur_include_halfword) {
        const uint32_t rqa_idx_lowbits = ctzw(cur_include_halfword);

        // TAOCP, 7.1.3, (43).
        const uintptr_t bottom_block_remover = (cur_include_halfword | (cur_include_halfword - 1)) + 1;
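        // Worked example: with cur_include_halfword = 0b10011100,
        // (x | (x - 1)) + 1 = 0b10100000, so masking with
        // bottom_block_remover clears the lowest run of set bits
        // (entries 2..4) in one step.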

        const uintptr_t raw_nyparr_curblock_unmasked = raw_nyparr_word >> (rqa_idx_lowbits * 2);
        const uint32_t rqa_block_len = ctzw(bottom_block_remover) - rqa_idx_lowbits;
        const uint32_t block_len_limit = kBitsPerWordD2 - word_write_halfshift;
        cur_output_word |= raw_nyparr_curblock_unmasked << (2 * word_write_halfshift);
        if (rqa_block_len < block_len_limit) {
          word_write_halfshift += rqa_block_len;
          cur_output_word = bzhi(cur_output_word, word_write_halfshift * 2);
        } else {
          // no need to mask, extra bits vanish off the high end
          *output_nyparr_iter++ = cur_output_word;
          word_write_halfshift = rqa_block_len - block_len_limit;
          if (word_write_halfshift) {
            cur_output_word = bzhi(raw_nyparr_curblock_unmasked >> (2 * block_len_limit), 2 * word_write_halfshift);
          } else {
            // avoid potential right-shift-[word length]
            cur_output_word = 0;
          }
        }
        cur_include_halfword &= bottom_block_remover;
      }
      if (wordhalf_idx) {
        break;
      }
      cur_include_halfword = cur_include_word >> kBitsPerWordD2;
    }
    if (output_nyparr_iter == output_nyparr_last) {
      if (word_write_halfshift == word_write_halfshift_end) {
        if (word_write_halfshift_end) {
          *output_nyparr_last = cur_output_word;
        }
        return;
      }
    }
  }
}

void CopyGenomatchSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict genovec, uintptr_t match_word, uint32_t write_bit_idx_start, uint32_t bit_ct, uintptr_t* __restrict output_bitarr) {
  const uint32_t bit_idx_end = write_bit_idx_start + bit_ct;
  const uint32_t bit_idx_end_lowbits = bit_idx_end % kBitsPerWord;
  const Halfword* raw_bitarr_alias = R_CAST(const Halfword*, raw_bitarr);
  uintptr_t* output_bitarr_iter = output_bitarr;
  uintptr_t* output_bitarr_last = &(output_bitarr[bit_idx_end / kBitsPerWord]);
  uintptr_t cur_output_word = 0;
  uint32_t read_widx = UINT32_MAX;  // deliberate overflow
  uint32_t write_idx_lowbits = write_bit_idx_start;
  while ((output_bitarr_iter != output_bitarr_last) || (write_idx_lowbits != bit_idx_end_lowbits)) {
    uintptr_t geno_word;
    // sparse genovec optimization
    // guaranteed to terminate since there's at least one more set bit
    do {
      geno_word = genovec[++read_widx] ^ match_word;
      geno_word = (~(geno_word | (geno_word >> 1))) & kMask5555;
    } while (!geno_word);
    // screw it, just iterate over set bits
    const uint32_t cur_halfword = raw_bitarr_alias[read_widx];
    do {
      const uint32_t sample_idx_lowbits = ctzw(geno_word) / 2;
      cur_output_word |= S_CAST(uintptr_t, (cur_halfword >> sample_idx_lowbits) & k1LU) << write_idx_lowbits;
      if (++write_idx_lowbits == kBitsPerWord) {
        *output_bitarr_iter++ = cur_output_word;
        cur_output_word = 0;
        write_idx_lowbits = 0;
      }
      geno_word &= geno_word - 1;
    } while (geno_word);
  }
  if (write_idx_lowbits) {
    *output_bitarr_iter = cur_output_word;
  }
}

void ExpandBytearrFromGenoarr(const void* __restrict compact_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t genoword_ct, uint32_t expand_size, uint32_t read_start_bit, uintptr_t* __restrict target) {
  Halfword* target_alias = R_CAST(Halfword*, target);
  ZeroHwArr(RoundUpPow2(genoword_ct, 2), target_alias);
  const uintptr_t* compact_bitarr_alias = S_CAST(const uintptr_t*, compact_bitarr);
  const uint32_t expand_sizex_m1 = expand_size + read_start_bit - 1;
  const uint32_t compact_widx_last = expand_sizex_m1 / kBitsPerWord;
  uint32_t compact_idx_lowbits = read_start_bit;
  uint32_t loop_len = kBitsPerWord;
  uintptr_t write_hwidx = 0;
  uintptr_t genomatch_bits = genoarr[0] ^ match_word;
  genomatch_bits = (~(genomatch_bits | (genomatch_bits >> 1))) & kMask5555;
  for (uint32_t compact_widx = 0; ; ++compact_widx) {
    uintptr_t compact_word;
    if (compact_widx >= compact_widx_last) {
      if (compact_widx > compact_widx_last) {
        return;
      }
      loop_len = 1 + (expand_sizex_m1 % kBitsPerWord);
      // avoid possible segfault
      compact_word = SubwordLoad(&(compact_bitarr_alias[compact_widx]), DivUp(loop_len, CHAR_BIT));
    } else {
#ifdef __arm__
#  error "Unaligned accesses in ExpandBytearrFromGenoarr()."
#endif
      compact_word = compact_bitarr_alias[compact_widx];
    }
    for (; compact_idx_lowbits != loop_len; ++compact_idx_lowbits) {
      while (!genomatch_bits) {
        genomatch_bits = genoarr[++write_hwidx] ^ match_word;
        genomatch_bits = (~(genomatch_bits | (genomatch_bits >> 1))) & kMask5555;
      }
      if (compact_word & (k1LU << compact_idx_lowbits)) {
        const uint32_t lowbit_idx = ctzw(genomatch_bits);
        target_alias[write_hwidx] |= 1U << (lowbit_idx / 2);
      }
      genomatch_bits &= genomatch_bits - 1;
    }
    compact_idx_lowbits = 0;
  }
}
#endif

// Harley-Seal algorithm only works for bitarrays, not nyparrays, so don't
// add an AVX2 specialization here.
// ...unless something like the interleaved_vec strategy is used?  hmm.  should
// test this on basic frequency counter.
/*
void Count2FreqVec3(const VecW* geno_vvec, uint32_t vec_ct, uint32_t* __restrict alt1_plus_bothset_ctp, uint32_t* __restrict bothset_ctp) {
  assert(!(vec_ct % 3));
  // Increments bothset_ct by the number of 0b11 in the current block, and
  // alt1_ct by twice the number of 0b10 plus the number of 0b01.
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  const VecW* geno_vvec_iter = geno_vvec;
  uint32_t alt1_plus_bothset_ct = 0;
  uint32_t bothset_ct = 0;

  while (1) {
    UniVec acc_alt1_plus_bothset;
    UniVec acc_bothset;
    acc_alt1_plus_bothset.vw = vecw_setzero();
    acc_bothset.vw = vecw_setzero();
    const VecW* geno_vvec_stop;
    if (vec_ct < 30) {
      if (!vec_ct) {
        *alt1_plus_bothset_ctp = alt1_plus_bothset_ct;
        *bothset_ctp = bothset_ct;
        return;
      }
      geno_vvec_stop = &(geno_vvec_iter[vec_ct]);
      vec_ct = 0;
    } else {
      geno_vvec_stop = &(geno_vvec_iter[30]);
      vec_ct -= 30;
    }
    do {
      VecW cur_geno_vword1 = *geno_vvec_iter++;
      // process first two vwords simultaneously to minimize linear dependence
      VecW cur_geno_vword2 = *geno_vvec_iter++;
      VecW cur_geno_vword_low_lshifted1 = vecw_slli(cur_geno_vword1 & m1, 1);
      VecW cur_geno_vword_low_lshifted2 = vecw_slli(cur_geno_vword2 & m1, 1);

      // 00 -> 00; 01 -> 01; 10 -> 10; 11 -> 01
      // note that _mm_andnot_si128 flips the *first* argument before the AND
      // operation.
      VecW alt1_plus_bothset1 = vecw_and_notfirst(cur_geno_vword_low_lshifted1, cur_geno_vword1);
      VecW alt1_plus_bothset2 = vecw_and_notfirst(cur_geno_vword_low_lshifted2, cur_geno_vword2);

      VecW bothset1 = vecw_srli(cur_geno_vword_low_lshifted1 & cur_geno_vword1, 1);
      VecW bothset2 = vecw_srli(cur_geno_vword_low_lshifted2 & cur_geno_vword2, 1);

      cur_geno_vword1 = *geno_vvec_iter++;
      alt1_plus_bothset1 = (alt1_plus_bothset1 & m2) + (vecw_srli(alt1_plus_bothset1, 2) & m2);
      bothset2 = bothset1 + bothset2;
      alt1_plus_bothset2 = (alt1_plus_bothset2 & m2) + (vecw_srli(alt1_plus_bothset2, 2) & m2);
      cur_geno_vword_low_lshifted1 = vecw_slli(cur_geno_vword1 & m1, 1);

      alt1_plus_bothset2 = alt1_plus_bothset1 + alt1_plus_bothset2;
      // alt1_plus_bothset2 now contains 4-bit values from 0-8, while bothset2
      // contains 2-bit values from 0-2
      // (todo: check whether this is faster if we use double_bothsetx
      // variables instead of bothset1/bothset2)
      bothset1 = vecw_srli(cur_geno_vword_low_lshifted1 & cur_geno_vword1, 1);
      alt1_plus_bothset1 = vecw_and_notfirst(cur_geno_vword_low_lshifted1, cur_geno_vword1);
      bothset2 = bothset1 + bothset2;
      alt1_plus_bothset1 = (alt1_plus_bothset1 & m2) + (vecw_srli(alt1_plus_bothset1, 2) & m2);

      bothset2 = (bothset2 & m2) + (vecw_srli(bothset2, 2) & m2);
      alt1_plus_bothset2 = alt1_plus_bothset1 + alt1_plus_bothset2;
      // alt1_plus_bothset2 now contains 4-bit values from 0-12, while bothset2
      // contains 4-bit values from 0-6.  aggregate both into 8-bit values.
      bothset2 = (bothset2 & m4) + (vecw_srli(bothset2, 4) & m4);
      alt1_plus_bothset2 = (alt1_plus_bothset2 & m4) + (vecw_srli(alt1_plus_bothset2, 4) & m4);

      acc_bothset.vw = acc_bothset.vw + bothset2;
      acc_alt1_plus_bothset.vw = acc_alt1_plus_bothset.vw + alt1_plus_bothset2;
    } while (geno_vvec_iter < geno_vvec_stop);
    const VecW m8 = VCONST_W(kMask00FF);
    acc_bothset.vw = (acc_bothset.vw + vecw_srli(acc_bothset.vw, 8)) & m8;
    acc_alt1_plus_bothset.vw = (acc_alt1_plus_bothset.vw & m8) + (vecw_srli(acc_alt1_plus_bothset.vw, 8) & m8);
    bothset_ct += UniVecHsum16(acc_bothset);
    alt1_plus_bothset_ct += UniVecHsum16(acc_alt1_plus_bothset);
  }
}
*/

// todo: benchmark against general-purpose counter
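// The 60-vector block size below bounds the byte-wide inner accumulators:
// each 6-vector pass adds at most 24 to a byte lane (two nibble sums of <= 12
// apiece), and 60/6 = 10 passes yield at most 240 < 256.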
void Count12Vec6(const VecW* geno_vvec, uint32_t vec_ct, uint32_t* __restrict raw_01_ctp, uint32_t* __restrict raw_both_ctp) {
  assert(!(vec_ct % 6));
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  const VecW* geno_vvec_iter = geno_vvec;
  VecW acc_01 = vecw_setzero();
  VecW acc_both = vecw_setzero();
  uintptr_t cur_incr = 60;
  for (; ; vec_ct -= cur_incr) {
    if (vec_ct < 60) {
      if (!vec_ct) {
        *raw_01_ctp = HsumW(acc_01);
        *raw_both_ctp = HsumW(acc_both);
        return;
      }
      cur_incr = vec_ct;
    }
    VecW inner_acc_01 = vecw_setzero();
    VecW inner_acc_both = vecw_setzero();
    const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]);
    do {
      VecW cur_vvec = *geno_vvec_iter++;
      VecW vvec_rshift = vecw_srli(cur_vvec, 1);
      VecW nyp_both = m1 & (cur_vvec ^ vvec_rshift);
      VecW nyp_01 = nyp_both & cur_vvec;

      cur_vvec = *geno_vvec_iter++;
      vvec_rshift = vecw_srli(cur_vvec, 1);
      VecW vvec_both = m1 & (cur_vvec ^ vvec_rshift);
      nyp_01 = nyp_01 + (vvec_both & cur_vvec);
      nyp_both = nyp_both + vvec_both;

      cur_vvec = *geno_vvec_iter++;
      vvec_rshift = vecw_srli(cur_vvec, 1);
      vvec_both = m1 & (cur_vvec ^ vvec_rshift);
      nyp_01 = nyp_01 + (vvec_both & cur_vvec);
      nyp_both = nyp_both + vvec_both;

      VecW nybble_01 = (nyp_01 & m2) + (vecw_srli(nyp_01, 2) & m2);
      VecW nybble_both = (nyp_both & m2) + (vecw_srli(nyp_both, 2) & m2);

      cur_vvec = *geno_vvec_iter++;
      vvec_rshift = vecw_srli(cur_vvec, 1);
      nyp_both = m1 & (cur_vvec ^ vvec_rshift);
      nyp_01 = nyp_both & cur_vvec;

      cur_vvec = *geno_vvec_iter++;
      vvec_rshift = vecw_srli(cur_vvec, 1);
      vvec_both = m1 & (cur_vvec ^ vvec_rshift);
      nyp_01 = nyp_01 + (vvec_both & cur_vvec);
      nyp_both = nyp_both + vvec_both;

      cur_vvec = *geno_vvec_iter++;
      vvec_rshift = vecw_srli(cur_vvec, 1);
      vvec_both = m1 & (cur_vvec ^ vvec_rshift);
      nyp_01 = nyp_01 + (vvec_both & cur_vvec);
      nyp_both = nyp_both + vvec_both;

      nybble_01 = nybble_01 + (nyp_01 & m2) + (vecw_srli(nyp_01, 2) & m2);
      nybble_both = nybble_both + (nyp_both & m2) + (vecw_srli(nyp_both, 2) & m2);

      inner_acc_01 = inner_acc_01 + (nybble_01 & m4) + (vecw_srli(nybble_01, 4) & m4);
      inner_acc_both = inner_acc_both + (nybble_both & m4) + (vecw_srli(nybble_both, 4) & m4);
    } while (geno_vvec_iter < geno_vvec_stop);
    const VecW m0 = vecw_setzero();
    acc_01 = acc_01 + vecw_bytesum(inner_acc_01, m0);
    acc_both = acc_both + vecw_bytesum(inner_acc_both, m0);
  }
}

void GenovecCount12Unsafe(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* __restrict raw_01_ctp, uint32_t* __restrict raw_10_ctp) {
  // assumes trailing bits of last genovec word are zeroed out.
  // sample_ct == 0 ok.
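  // The vectorized pass handles the largest prefix containing a multiple of
  // 6 vectors; the scalar loop below then finishes the at most
  // (6 * kWordsPerVec - 1) leftover words.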
  const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
  uint32_t raw_01_ct;
  uint32_t raw_both_ct;
  uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (6 * kWordsPerVec));
  assert(VecIsAligned(genovec));
  Count12Vec6(R_CAST(const VecW*, genovec), word_idx / kWordsPerVec, &raw_01_ct, &raw_both_ct);
  for (; word_idx != sample_ctl2; ++word_idx) {
    const uintptr_t cur_geno_word = genovec[word_idx];
    const uintptr_t cur_rshift = cur_geno_word >> 1;
    const uintptr_t cur_both = kMask5555 & (cur_geno_word ^ cur_rshift);
    raw_01_ct += Popcount01Word(cur_both & cur_geno_word);
    raw_both_ct += Popcount01Word(cur_both);
  }
  *raw_01_ctp = raw_01_ct;
  *raw_10_ctp = raw_both_ct - raw_01_ct;
}

void Count3FreqVec6(const VecW* geno_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp) {
  assert(!(vec_ct % 6));
  // Sets even_ct to the number of set low bits in the current block, odd_ct
  // to the number of set high bits, and bothset_ct to the number of 0b11s.
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  const VecW* geno_vvec_iter = geno_vvec;
  VecW acc_even = vecw_setzero();
  VecW acc_odd = vecw_setzero();
  VecW acc_bothset = vecw_setzero();
  uintptr_t cur_incr = 60;
  for (; ; vec_ct -= cur_incr) {
    if (vec_ct < 60) {
      if (!vec_ct) {
        *even_ctp = HsumW(acc_even);
        *odd_ctp = HsumW(acc_odd);
        *bothset_ctp = HsumW(acc_bothset);
        return;
      }
      cur_incr = vec_ct;
    }
    VecW inner_acc_even = vecw_setzero();
    VecW inner_acc_odd = vecw_setzero();
    VecW inner_acc_bothset = vecw_setzero();
    const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]);
    do {
      // hmm, this seems to have more linear dependence than I'd want, but the
      // reorderings I tried just made the code harder to read without helping,
      // so I'll leave this alone
      VecW cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      VecW odd1 = m1 & vecw_srli(cur_geno_vword, 1);
      VecW even1 = m1 & cur_geno_vword;
      VecW bothset1 = odd1 & cur_geno_vword;

      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      VecW cur_geno_vword_high = m1 & vecw_srli(cur_geno_vword, 1);
      even1 = even1 + (m1 & cur_geno_vword);
      odd1 = odd1 + cur_geno_vword_high;
      bothset1 = bothset1 + (cur_geno_vword_high & cur_geno_vword);

      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      cur_geno_vword_high = m1 & vecw_srli(cur_geno_vword, 1);
      even1 = even1 + (m1 & cur_geno_vword);
      odd1 = odd1 + cur_geno_vword_high;
      bothset1 = bothset1 + (cur_geno_vword_high & cur_geno_vword);

      even1 = (even1 & m2) + (vecw_srli(even1, 2) & m2);
      odd1 = (odd1 & m2) + (vecw_srli(odd1, 2) & m2);
      bothset1 = (bothset1 & m2) + (vecw_srli(bothset1, 2) & m2);

      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      VecW odd2 = m1 & vecw_srli(cur_geno_vword, 1);
      VecW even2 = m1 & cur_geno_vword;
      VecW bothset2 = odd2 & cur_geno_vword;

      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      cur_geno_vword_high = m1 & vecw_srli(cur_geno_vword, 1);
      even2 = even2 + (m1 & cur_geno_vword);
      odd2 = odd2 + cur_geno_vword_high;
      bothset2 = bothset2 + (cur_geno_vword_high & cur_geno_vword);

      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      cur_geno_vword_high = m1 & vecw_srli(cur_geno_vword, 1);
      even2 = even2 + (m1 & cur_geno_vword);
      odd2 = odd2 + cur_geno_vword_high;
      bothset2 = bothset2 + (cur_geno_vword_high & cur_geno_vword);

      even1 = even1 + (even2 & m2) + (vecw_srli(even2, 2) & m2);
      odd1 = odd1 + (odd2 & m2) + (vecw_srli(odd2, 2) & m2);
      bothset1 = bothset1 + (bothset2 & m2) + (vecw_srli(bothset2, 2) & m2);
      // these now contain 4-bit values from 0-12

      inner_acc_even = inner_acc_even + (even1 & m4) + (vecw_srli(even1, 4) & m4);
      inner_acc_odd = inner_acc_odd + (odd1 & m4) + (vecw_srli(odd1, 4) & m4);
      inner_acc_bothset = inner_acc_bothset + (bothset1 & m4) + (vecw_srli(bothset1, 4) & m4);
    } while (geno_vvec_iter < geno_vvec_stop);
    const VecW m0 = vecw_setzero();
    acc_even = acc_even + vecw_bytesum(inner_acc_even, m0);
    acc_odd = acc_odd + vecw_bytesum(inner_acc_odd, m0);
    acc_bothset = acc_bothset + vecw_bytesum(inner_acc_bothset, m0);
  }
}

void FillInterleavedMaskVec(const uintptr_t* __restrict subset_mask, uint32_t base_vec_ct, uintptr_t* interleaved_mask_vec) {
#ifdef __LP64__
  // This is a cousin of github.com/KWillets/simd_interleave , which was
  // written in response to
  //   https://lemire.me/blog/2018/01/09/how-fast-can-you-bit-interleave-32-bit-integers-simd-edition/
  // AVX2 implementation takes ~40% less time than before, and SSE4.2 takes
  // ~65% less.  This also avoids the Ryzen-screwing _pdep_u64()/_pext_u64()
  // operations.
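  // Output layout (see the 32-bit fallback at the bottom of this function):
  // bit k of each mask word's low halfword lands at even position 2k, and
  // bit k of the high halfword at odd position 2k + 1.  E.g. halfword pair
  // {0x0003, 0x0001} interleaves to 0b111.  Consumers mask with kMask5555 for
  // one genovector's worth of samples and with (>> 1) & kMask5555 for the
  // next.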
  const VecW m4 = VCONST_W(kMask0F0F);
#  ifdef USE_SSE42
  const VecW lookup0 = vecw_setr8(
      0, 1, 4, 5, 16, 17, 20, 21,
      64, 65, 68, 69, 80, 81, 84, 85);
  const VecW lookup1 = vecw_slli(lookup0, 1);
#  else
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
#  endif
  const VecW* subset_mask_valias = R_CAST(const VecW*, subset_mask);
  VecW* interleaved_mask_valias = R_CAST(VecW*, interleaved_mask_vec);

  for (uint32_t vidx = 0; vidx != base_vec_ct; ++vidx) {
    // I'll assume the compiler can handle this register allocation job.
    VecW cur_vec = subset_mask_valias[vidx];

    cur_vec = vecw_permute0xd8_if_avx2(cur_vec);
    const VecW vec_even = cur_vec & m4;
    const VecW vec_odd = vecw_srli(cur_vec, 4) & m4;
    VecW vec_lo = vecw_unpacklo8(vec_even, vec_odd);
    VecW vec_hi = vecw_unpackhi8(vec_even, vec_odd);
#  ifdef USE_SSE42
    vec_lo = vecw_shuffle8(lookup0, vec_lo);
    vec_hi = vecw_shuffle8(lookup1, vec_hi);
#  else
    vec_lo = (vec_lo | vecw_slli(vec_lo, 2)) & m2;
    vec_hi = (vec_hi | vecw_slli(vec_hi, 2)) & m2;
    vec_lo = (vec_lo | vecw_slli(vec_lo, 1)) & m1;
    vec_hi = (vec_hi | vecw_slli(vec_hi, 1)) & m1;
    vec_hi = vecw_slli(vec_hi, 1);
#  endif
    interleaved_mask_valias[vidx] = vec_lo | vec_hi;
  }
#else  // !__LP64__
  for (uint32_t widx = 0; widx != base_vec_ct; ++widx) {
    const uintptr_t orig_word = subset_mask[widx];
    uintptr_t ww_even = S_CAST(uint16_t, orig_word);
    uintptr_t ww_odd = orig_word >> 16;
    ww_even = UnpackHalfwordToWord(ww_even);
    ww_odd = UnpackHalfwordToWord(ww_odd);
    interleaved_mask_vec[widx] = ww_even | (ww_odd << 1);
  }
#endif
}

/*
void GenovecAlleleCtsUnsafe(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* __restrict allele_cts, uint32_t* __restrict bothset_ctp) {
  // assumes trailing bits of last genovec word are zeroed out.
  // sets allele_cts[0] to the number of observed ref alleles, and
  // allele_cts[1] to the number of observed alt1s.
  const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
  uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (3 * kWordsPerVec));
  uint32_t alt1_plus_bothset_ct;
  uint32_t bothset_ct;
  assert(VecIsAligned(genovec));
  Count2FreqVec3(R_CAST(const VecW*, genovec), word_idx / kWordsPerVec, &alt1_plus_bothset_ct, &bothset_ct);
  for (; word_idx != sample_ctl2; ++word_idx) {
    const uintptr_t cur_geno_word = genovec[word_idx];
    const uintptr_t cur_geno_word_low_lshifted = (cur_geno_word & kMask5555) << 1;
    alt1_plus_bothset_ct += NypsumWord((~cur_geno_word_low_lshifted) & cur_geno_word);
    bothset_ct += NypsumWord(cur_geno_word_low_lshifted & cur_geno_word);
  }
  const uint32_t alt1_ct = alt1_plus_bothset_ct - bothset_ct;
  allele_cts[0] = (sample_ct - bothset_ct) * 2 - alt1_ct;
  allele_cts[1] = alt1_ct;
  *bothset_ctp = bothset_ct;
}
*/

void GenoarrCountFreqsUnsafe(const uintptr_t* genoarr, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // fills genocounts[0] with the number of 00s, genocounts[1] with the number
  // of 01s, etc.
  // assumes trailing bits of last genoarr word are zeroed out.
  // sample_ct == 0 ok.
  // no longer requires vector-alignment.
  const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
  uint32_t even_ct;
  uint32_t odd_ct;
  uint32_t bothset_ct;
  uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (6 * kWordsPerVec));
  Count3FreqVec6(R_CAST(const VecW*, genoarr), word_idx / kWordsPerVec, &even_ct, &odd_ct, &bothset_ct);
  for (; word_idx != sample_ctl2; ++word_idx) {
    const uintptr_t cur_geno_word = genoarr[word_idx];
    const uintptr_t cur_geno_word_high = kMask5555 & (cur_geno_word >> 1);
    even_ct += Popcount01Word(cur_geno_word & kMask5555);
    odd_ct += Popcount01Word(cur_geno_word_high);
    bothset_ct += Popcount01Word(cur_geno_word & cur_geno_word_high);
  }
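  // With per-genotype counts c0..c3: even_ct = c1 + c3, odd_ct = c2 + c3, and
  // bothset_ct = c3.  Hence c1 = even_ct - bothset_ct,
  // c2 = odd_ct - bothset_ct, and
  // c0 = sample_ct - c1 - c2 - c3 = sample_ct + bothset_ct - even_ct - odd_ct.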
  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
  genocounts[1] = even_ct - bothset_ct;
  genocounts[2] = odd_ct - bothset_ct;
  genocounts[3] = bothset_ct;
}

// geno_vvec now allowed to be unaligned.
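// Each interleaved_mask vector below covers two genotype vectors: the
// even-indexed one is masked with (mask & m1), the odd-indexed one with
// (vecw_srli(mask, 1) & m1), matching FillInterleavedMaskVec()'s layout.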
void CountSubset3FreqVec6(const VecW* __restrict geno_vvec, const VecW* __restrict interleaved_mask_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp) {
  assert(!(vec_ct % 6));
  // Sets even_ct to the number of set low bits in the current block after
  // subsetting, odd_ct to the number of set high bits, and bothset_ct to the
  // number of 0b11s.
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  const VecW* geno_vvec_iter = geno_vvec;
  const VecW* interleaved_mask_vvec_iter = interleaved_mask_vvec;
  VecW acc_even = vecw_setzero();
  VecW acc_odd = vecw_setzero();
  VecW acc_bothset = vecw_setzero();
  uintptr_t cur_incr = 60;
  for (; ; vec_ct -= cur_incr) {
    if (vec_ct < 60) {
      if (!vec_ct) {
        *even_ctp = HsumW(acc_even);
        *odd_ctp = HsumW(acc_odd);
        *bothset_ctp = HsumW(acc_bothset);
        return;
      }
      cur_incr = vec_ct;
    }
    VecW inner_acc_even = vecw_setzero();
    VecW inner_acc_odd = vecw_setzero();
    VecW inner_acc_bothset = vecw_setzero();
    const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]);
    do {
      VecW interleaved_mask_vword = *interleaved_mask_vvec_iter++;
      VecW cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      VecW cur_mask = interleaved_mask_vword & m1;
      VecW odd1 = cur_mask & vecw_srli(cur_geno_vword, 1);
      VecW even1 = cur_mask & cur_geno_vword;
      VecW bothset1 = odd1 & cur_geno_vword;

      cur_mask = vecw_srli(interleaved_mask_vword, 1) & m1;
      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      VecW cur_geno_vword_high_masked = cur_mask & vecw_srli(cur_geno_vword, 1);
      even1 = even1 + (cur_mask & cur_geno_vword);
      odd1 = odd1 + cur_geno_vword_high_masked;
      bothset1 = bothset1 + (cur_geno_vword_high_masked & cur_geno_vword);

      interleaved_mask_vword = *interleaved_mask_vvec_iter++;
      cur_mask = interleaved_mask_vword & m1;
      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      cur_geno_vword_high_masked = cur_mask & vecw_srli(cur_geno_vword, 1);
      even1 = even1 + (cur_mask & cur_geno_vword);
      odd1 = odd1 + cur_geno_vword_high_masked;
      bothset1 = bothset1 + (cur_geno_vword_high_masked & cur_geno_vword);

      even1 = (even1 & m2) + (vecw_srli(even1, 2) & m2);
      odd1 = (odd1 & m2) + (vecw_srli(odd1, 2) & m2);
      bothset1 = (bothset1 & m2) + (vecw_srli(bothset1, 2) & m2);

      cur_mask = vecw_srli(interleaved_mask_vword, 1) & m1;
      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      VecW odd2 = cur_mask & vecw_srli(cur_geno_vword, 1);
      VecW even2 = cur_mask & cur_geno_vword;
      VecW bothset2 = odd2 & cur_geno_vword;

      interleaved_mask_vword = *interleaved_mask_vvec_iter++;
      cur_mask = interleaved_mask_vword & m1;
      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      cur_geno_vword_high_masked = cur_mask & vecw_srli(cur_geno_vword, 1);
      even2 = even2 + (cur_mask & cur_geno_vword);
      odd2 = odd2 + cur_geno_vword_high_masked;
      bothset2 = bothset2 + (cur_geno_vword_high_masked & cur_geno_vword);

      cur_mask = vecw_srli(interleaved_mask_vword, 1) & m1;
      cur_geno_vword = vecw_loadu(geno_vvec_iter);
      ++geno_vvec_iter;
      cur_geno_vword_high_masked = cur_mask & vecw_srli(cur_geno_vword, 1);
      even2 = even2 + (cur_mask & cur_geno_vword);
      odd2 = odd2 + cur_geno_vword_high_masked;
      bothset2 = bothset2 + (cur_geno_vword_high_masked & cur_geno_vword);

      even1 = even1 + (even2 & m2) + (vecw_srli(even2, 2) & m2);
      odd1 = odd1 + (odd2 & m2) + (vecw_srli(odd2, 2) & m2);
      bothset1 = bothset1 + (bothset2 & m2) + (vecw_srli(bothset2, 2) & m2);
      // these now contain 4-bit values from 0-12

      inner_acc_even = inner_acc_even + (even1 & m4) + (vecw_srli(even1, 4) & m4);
      inner_acc_odd = inner_acc_odd + (odd1 & m4) + (vecw_srli(odd1, 4) & m4);
      inner_acc_bothset = inner_acc_bothset + (bothset1 & m4) + (vecw_srli(bothset1, 4) & m4);
    } while (geno_vvec_iter < geno_vvec_stop);
    const VecW m0 = vecw_setzero();
    acc_even = acc_even + vecw_bytesum(inner_acc_even, m0);
    acc_odd = acc_odd + vecw_bytesum(inner_acc_odd, m0);
    acc_bothset = acc_bothset + vecw_bytesum(inner_acc_bothset, m0);
  }
}

void GenoarrCountSubsetFreqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // fills genocounts[0] with the number of 00s, genocounts[1] with the number
  // of 01s, etc.
  // {raw_}sample_ct == 0 ok.
  const uint32_t raw_sample_ctv2 = NypCtToVecCt(raw_sample_ct);
  uint32_t even_ct;
  uint32_t odd_ct;
  uint32_t bothset_ct;
#ifdef __LP64__
  uint32_t vec_idx = raw_sample_ctv2 - (raw_sample_ctv2 % 6);
  CountSubset3FreqVec6(R_CAST(const VecW*, genoarr), R_CAST(const VecW*, sample_include_interleaved_vec), vec_idx, &even_ct, &odd_ct, &bothset_ct);
  const uintptr_t* genoarr_iter = &(genoarr[kWordsPerVec * vec_idx]);
  const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[(kWordsPerVec / 2) * vec_idx]);
#  ifdef USE_AVX2
  uintptr_t mask_base1 = 0;
  uintptr_t mask_base2 = 0;
  uintptr_t mask_base3 = 0;
  uintptr_t mask_base4 = 0;
  for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
    uintptr_t mask_word1;
    uintptr_t mask_word2;
    uintptr_t mask_word3;
    uintptr_t mask_word4;
    if (!(vec_idx % 2)) {
      mask_base1 = *interleaved_mask_iter++;
      mask_base2 = *interleaved_mask_iter++;
      mask_base3 = *interleaved_mask_iter++;
      mask_base4 = *interleaved_mask_iter++;
      mask_word1 = mask_base1 & kMask5555;
      mask_word2 = mask_base2 & kMask5555;
      mask_word3 = mask_base3 & kMask5555;
      mask_word4 = mask_base4 & kMask5555;
    } else {
      mask_word1 = (mask_base1 >> 1) & kMask5555;
      mask_word2 = (mask_base2 >> 1) & kMask5555;
      mask_word3 = (mask_base3 >> 1) & kMask5555;
      mask_word4 = (mask_base4 >> 1) & kMask5555;
    }
    for (uint32_t vechalf_idx = 0; ; ++vechalf_idx) {
      const uintptr_t cur_geno_word1 = *genoarr_iter++;
      const uintptr_t cur_geno_word2 = *genoarr_iter++;
      const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
      const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
      even_ct += PopcountWord(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
      odd_ct += PopcountWord((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
      bothset_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
      if (vechalf_idx) {
        break;
      }
      mask_word1 = mask_word3;
      mask_word2 = mask_word4;
    }
  }
#  else  // not USE_AVX2
  uintptr_t mask_base1 = 0;
  uintptr_t mask_base2 = 0;
  for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
    uintptr_t mask_word1;
    uintptr_t mask_word2;
    if (!(vec_idx % 2)) {
      mask_base1 = *interleaved_mask_iter++;
      mask_base2 = *interleaved_mask_iter++;
      mask_word1 = mask_base1 & kMask5555;
      mask_word2 = mask_base2 & kMask5555;
    } else {
      mask_word1 = (mask_base1 >> 1) & kMask5555;
      mask_word2 = (mask_base2 >> 1) & kMask5555;
    }
    const uintptr_t cur_geno_word1 = *genoarr_iter++;
    const uintptr_t cur_geno_word2 = *genoarr_iter++;
    const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
    const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
#    ifdef USE_SSE42
    even_ct += PopcountWord(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
    odd_ct += PopcountWord((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
    bothset_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
#    else
    even_ct += NypsumWord((cur_geno_word1 & mask_word1) + (cur_geno_word2 & mask_word2));
    odd_ct += NypsumWord(cur_geno_word1_high_masked + cur_geno_word2_high_masked);
    bothset_ct += NypsumWord((cur_geno_word1 & cur_geno_word1_high_masked) + (cur_geno_word2 & cur_geno_word2_high_masked));
#    endif
  }
#  endif  // not USE_AVX2
#else  // not __LP64__
  uint32_t word_idx = raw_sample_ctv2 - (raw_sample_ctv2 % 6);
  CountSubset3FreqVec6(R_CAST(const VecW*, genoarr), R_CAST(const VecW*, sample_include_interleaved_vec), word_idx, &even_ct, &odd_ct, &bothset_ct);
  const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[word_idx / 2]);
  uintptr_t mask_base = 0;
  for (; word_idx != raw_sample_ctv2; ++word_idx) {
    uintptr_t mask_word;
    if (!(word_idx % 2)) {
      mask_base = *interleaved_mask_iter++;
      mask_word = mask_base & kMask5555;
    } else {
      mask_word = (mask_base >> 1) & kMask5555;
    }
    const uintptr_t cur_geno_word = genoarr[word_idx];
    const uintptr_t cur_geno_word_high_masked = mask_word & (cur_geno_word >> 1);
    even_ct += Popcount01Word(cur_geno_word & mask_word);
    odd_ct += Popcount01Word(cur_geno_word_high_masked);
    bothset_ct += Popcount01Word(cur_geno_word & cur_geno_word_high_masked);
  }
#endif
  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
  genocounts[1] = even_ct - bothset_ct;
  genocounts[2] = odd_ct - bothset_ct;
  genocounts[3] = bothset_ct;
}

void GenoarrCountSubsetFreqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // slower GenoarrCountSubsetFreqs() which does not require
  // sample_include_interleaved_vec to be precomputed.
  // {raw_}sample_ct == 0 ok.
  const uint32_t raw_sample_ctl2 = NypCtToWordCt(raw_sample_ct);
  const uint32_t fullword_ct = raw_sample_ctl2 / 2;
  uint32_t even_ct = 0;
  uint32_t odd_ct = 0;
  uint32_t bothset_ct = 0;
  for (uint32_t widx = 0; widx != fullword_ct; ++widx) {
    // possible todo: try vectorizing this loop in SSE4.2+ high-sample-ct case
    // with shuffle-based dynamic unpacking of sample_include?
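    // PackWordToHalfwordMask5555()/...MaskAAAA() pack the low/high bit of
    // each 2-bit genotype entry into a halfword, so the two geno words
    // spanned by this mask word collapse into even/odd bit-vectors aligned
    // with sample_include, and each PopcountWord() covers a full word's
    // worth of samples.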
    const uintptr_t mask_word = sample_include[widx];
    if (mask_word) {
      uintptr_t geno_word = genoarr[2 * widx];
      uintptr_t geno_even = PackWordToHalfwordMask5555(geno_word);
      uintptr_t geno_odd = PackWordToHalfwordMaskAAAA(geno_word);
      geno_word = genoarr[2 * widx + 1];
      geno_even |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(geno_word)) << kBitsPerWordD2;
      geno_odd |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(geno_word)) << kBitsPerWordD2;
      const uintptr_t geno_even_masked = geno_even & mask_word;
      even_ct += PopcountWord(geno_even_masked);
      odd_ct += PopcountWord(geno_odd & mask_word);
      bothset_ct += PopcountWord(geno_odd & geno_even_masked);
    }
  }
  if (raw_sample_ctl2 % 2) {
    const uintptr_t mask_hw = sample_include[fullword_ct];
    if (mask_hw) {
      const uintptr_t geno_word = genoarr[2 * fullword_ct];
      // todo: benchmark main loop unpack vs. pack
      const uintptr_t mask_word = UnpackHalfwordToWord(mask_hw);
      const uintptr_t geno_word_shifted = geno_word >> 1;
      const uintptr_t geno_word_masked = geno_word & mask_word;
      even_ct += Popcount01Word(geno_word_masked);
      odd_ct += Popcount01Word(geno_word_shifted & mask_word);
      bothset_ct += Popcount01Word(geno_word_masked & geno_word_shifted);
    }
  }
  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
  genocounts[1] = even_ct - bothset_ct;
  genocounts[2] = odd_ct - bothset_ct;
  genocounts[3] = bothset_ct;
}

void GenoarrCountInvsubsetFreqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_exclude, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // {raw_}sample_ct == 0 ok.
  // ugh, 'fullword' is overloaded.  probable todo: keep halfword/fullword,
  // switch more common use case to bodyword/trailword.
  const uint32_t bodyword_ct = raw_sample_ct / kBitsPerWord;
  uint32_t even_ct = 0;
  uint32_t odd_ct = 0;
  uint32_t bothset_ct = 0;
  for (uint32_t widx = 0; widx != bodyword_ct; ++widx) {
    // possible todo: try vectorizing this loop in SSE4.2+ high-sample-ct case
    // with shuffle-based dynamic unpacking of sample_exclude?
    const uintptr_t mask_word = ~sample_exclude[widx];
    if (mask_word) {
      uintptr_t geno_word = genoarr[2 * widx];
      uintptr_t geno_even = PackWordToHalfwordMask5555(geno_word);
      uintptr_t geno_odd = PackWordToHalfwordMaskAAAA(geno_word);
      geno_word = genoarr[2 * widx + 1];
      geno_even |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(geno_word)) << kBitsPerWordD2;
      geno_odd |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(geno_word)) << kBitsPerWordD2;
      const uintptr_t geno_even_masked = geno_even & mask_word;
      even_ct += PopcountWord(geno_even_masked);
      odd_ct += PopcountWord(geno_odd & mask_word);
      bothset_ct += PopcountWord(geno_odd & geno_even_masked);
    }
  }
  const uint32_t remainder = raw_sample_ct % kBitsPerWord;
  if (remainder) {
    const uintptr_t mask_word = bzhi(~sample_exclude[bodyword_ct], remainder);
    if (mask_word) {
      uintptr_t geno_word = genoarr[2 * bodyword_ct];
      uintptr_t geno_even = PackWordToHalfwordMask5555(geno_word);
      uintptr_t geno_odd = PackWordToHalfwordMaskAAAA(geno_word);
      if (remainder > kBitsPerWordD2) {
        geno_word = genoarr[2 * bodyword_ct + 1];
        geno_even |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(geno_word)) << kBitsPerWordD2;
        geno_odd |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(geno_word)) << kBitsPerWordD2;
      }
      const uintptr_t geno_even_masked = geno_even & mask_word;
      even_ct += PopcountWord(geno_even_masked);
      odd_ct += PopcountWord(geno_odd & mask_word);
      bothset_ct += PopcountWord(geno_odd & geno_even_masked);
    }
  }
  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
  genocounts[1] = even_ct - bothset_ct;
  genocounts[2] = odd_ct - bothset_ct;
  genocounts[3] = bothset_ct;
}

void GenoarrCountSubsetIntersectFreqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict subset1, const uintptr_t* __restrict subset2, uint32_t raw_sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // {raw_}sample_ct == 0 ok.
  const uint32_t raw_sample_ctl2 = NypCtToWordCt(raw_sample_ct);
  const uint32_t fullword_ct = raw_sample_ctl2 / 2;
  uint32_t subset_intersect_ct = 0;
  uint32_t even_ct = 0;
  uint32_t odd_ct = 0;
  uint32_t bothset_ct = 0;
  for (uint32_t widx = 0; widx != fullword_ct; ++widx) {
    // hmm, there may be little point to vectorizing this
    const uintptr_t mask_word = subset1[widx] & subset2[widx];
    if (mask_word) {
      uintptr_t geno_word = genoarr[2 * widx];
      uintptr_t geno_even = PackWordToHalfwordMask5555(geno_word);
      uintptr_t geno_odd = PackWordToHalfwordMaskAAAA(geno_word);
      geno_word = genoarr[2 * widx + 1];
      geno_even |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(geno_word)) << kBitsPerWordD2;
      geno_odd |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(geno_word)) << kBitsPerWordD2;
      const uintptr_t geno_even_masked = geno_even & mask_word;
      subset_intersect_ct += PopcountWord(mask_word);
      even_ct += PopcountWord(geno_even_masked);
      odd_ct += PopcountWord(geno_odd & mask_word);
      bothset_ct += PopcountWord(geno_odd & geno_even_masked);
    }
  }
  if (raw_sample_ctl2 % 2) {
    const uintptr_t mask_hw = subset1[fullword_ct] & subset2[fullword_ct];
    if (mask_hw) {
      const uintptr_t geno_word = genoarr[fullword_ct * 2];
      const uintptr_t mask_word = UnpackHalfwordToWord(mask_hw);
      const uintptr_t geno_word_shifted = geno_word >> 1;
      const uintptr_t geno_word_masked = geno_word & mask_word;
      subset_intersect_ct += Popcount01Word(mask_word);
      even_ct += Popcount01Word(geno_word_masked);
      odd_ct += Popcount01Word(geno_word_shifted & mask_word);
      bothset_ct += Popcount01Word(geno_word_masked & geno_word_shifted);
    }
  }
  genocounts[0] = subset_intersect_ct + bothset_ct - even_ct - odd_ct;
  genocounts[1] = even_ct - bothset_ct;
  genocounts[2] = odd_ct - bothset_ct;
  genocounts[3] = bothset_ct;
}

void GenovecInvertUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
  // flips 0 to 2 and vice versa.
  // "unsafe" because trailing bits are not zeroed out.
  const uint32_t vec_ct = NypCtToVecCt(sample_ct);
  assert(VecIsAligned(genovec));
  const VecW not_m1 = VCONST_W(kMaskAAAA);
  VecW* vptr = R_CAST(VecW*, genovec);
  for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
    VecW cur_vec = vptr[vidx];
    // flip high bit iff low bit is unset
    vptr[vidx] = cur_vec ^ vecw_and_notfirst(vecw_slli(cur_vec, 1), not_m1);
  }
}

void DifflistCountSubsetFreqs(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t common_geno, uint32_t difflist_len, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  STD_ARRAY_REF_FILL0(4, genocounts);
  for (uint32_t difflist_idx = 0; difflist_idx != difflist_len; ++difflist_idx) {
    const uint32_t raw_sample_idx = difflist_sample_ids[difflist_idx];
    if (IsSet(sample_include, raw_sample_idx)) {
      genocounts[GetNyparrEntry(raregeno, difflist_idx)] += 1;
    }
  }
  genocounts[common_geno] = sample_ct - genocounts[0] - genocounts[1] - genocounts[2] - genocounts[3];
}
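// Note: the genocounts[common_geno] assignment above presumably relies on the
// difflist invariant that raregeno entries never equal common_geno, so
// genocounts[common_geno] is still zero when the remainder is computed.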


static_assert(kPglNypTransposeBatch == S_CAST(uint32_t, kNypsPerCacheline), "TransposeNypblock64() needs to be updated.");
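// Sketch of the overall strategy: rows are first gathered into buf0, then the
// 64 -> 16 shuffle below narrows the transpose granularity with buf1 holding
// the intermediate; later stages presumably continue narrowing until
// single-nyp columns can be written out.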
#ifdef __LP64__
void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1) {
  // buf0 and buf1 must each be vector-aligned and have size 16k
  // Tried using previous AVX2 small-buffer approach, but that was a bit
  // worse... though maybe it should be revisited?
  const uint32_t buf0_row_ct = DivUp(write_batch_size, 32);
  {
    // Fold the first 3 shuffles into the ingestion loop.
    // Can fold 2 additional shuffles here by ingesting uint16s instead.  That
    // removes the need for the second loop, halving the workspace requirement.
    // Benchmark results of that approach are slightly but consistently worse,
    // though.
    const uintptr_t* initial_read_iter = R_CAST(const uintptr_t*, read_iter);
    const uintptr_t* initial_read_end = &(initial_read_iter[buf0_row_ct]);
    uintptr_t* initial_target_iter = R_CAST(uintptr_t*, buf0);
    const uint32_t read_batch_rem = kNypsPerCacheline - read_batch_size;
    // Tried fancier vector-based ingestion, didn't help.
    for (; initial_read_iter != initial_read_end; ++initial_read_iter) {
      const uintptr_t* read_iter_tmp = initial_read_iter;
      for (uint32_t ujj = 0; ujj != read_batch_size; ++ujj) {
        *initial_target_iter++ = *read_iter_tmp;
        read_iter_tmp = &(read_iter_tmp[read_ul_stride]);
      }
      ZeroWArr(read_batch_rem, initial_target_iter);
      initial_target_iter = &(initial_target_iter[read_batch_rem]);
    }
  }

  // shuffle from 64 -> 16
  const uint32_t buf1_row_ct = DivUp(write_batch_size, 8);
  {
    // full buf0 row is 256 * 8 bytes
    // full buf1 row is 512 bytes
    const VecW* buf0_read_iter = R_CAST(const VecW*, buf0);
    __m128i* write_iter0 = R_CAST(__m128i*, buf1);
    const uint32_t buf0_row_clwidth = DivUp(read_batch_size, 8);
#  ifdef USE_SSE42
    const VecW gather_u32s = vecw_setr8(0, 1, 8, 9, 2, 3, 10, 11,
                                        4, 5, 12, 13, 6, 7, 14, 15);
#  else
    const VecW m16 = VCONST_W(kMask0000FFFF);
#  endif
    for (uint32_t bidx = 0; bidx != buf0_row_ct; ++bidx) {
      __m128i* write_iter1 = &(write_iter0[32]);
      __m128i* write_iter2 = &(write_iter1[32]);
      __m128i* write_iter3 = &(write_iter2[32]);
      for (uint32_t clidx = 0; clidx != buf0_row_clwidth; ++clidx) {
#  ifdef USE_AVX2
        VecW loader0 = buf0_read_iter[clidx * 2];
        VecW loader1 = buf0_read_iter[clidx * 2 + 1];
        loader0 = vecw_shuffle8(loader0, gather_u32s);
        loader1 = vecw_shuffle8(loader1, gather_u32s);
        //    (0,0) (0,1) (1,0) ... (7,1) (0,2) ... (7,3) (8,0) ... (31,3)
        //      (0,4) ... (31,7)
        // -> (0,0) ... (7,1) (0,2) ... (7,3) (8,0) ... (15,3) (0,4) ... (15,7)
        //      (16,0) ... (31,3) (16,4) ...
        loader0 = vecw_permute0xd8_if_avx2(loader0);
        loader1 = vecw_permute0xd8_if_avx2(loader1);
        // -> (0,0) ... (7,1) (0,2) ... (7,3) (0,4) ... (7,7) (8,0) ... (15,7)
        //      (16,0) ... (23,7) (24,0) ... (31,7)
        const __m256i vec_lo = _mm256_shuffle_epi32(R_CAST(__m256i, loader0), 0xd8);
        const __m256i vec_hi = _mm256_shuffle_epi32(R_CAST(__m256i, loader1), 0xd8);
        const __m256i final0145 = _mm256_unpacklo_epi64(vec_lo, vec_hi);
        const __m256i final2367 = _mm256_unpackhi_epi64(vec_lo, vec_hi);
1166         // GCC doesn't support _mm256_storeu2_m128i as of this writing.
1167         write_iter0[clidx] = _mm256_castsi256_si128(final0145);
1168         write_iter1[clidx] = _mm256_castsi256_si128(final2367);
1169         write_iter2[clidx] = _mm256_extracti128_si256(final0145, 1);
1170         write_iter3[clidx] = _mm256_extracti128_si256(final2367, 1);
1171 #  else
1172         VecW loader0 = buf0_read_iter[clidx * 4];
1173         VecW loader1 = buf0_read_iter[clidx * 4 + 1];
1174         VecW loader2 = buf0_read_iter[clidx * 4 + 2];
1175         VecW loader3 = buf0_read_iter[clidx * 4 + 3];
1176 #    ifdef USE_SSE42
1177         loader0 = vecw_shuffle8(loader0, gather_u32s);
1178         loader1 = vecw_shuffle8(loader1, gather_u32s);
1179         loader2 = vecw_shuffle8(loader2, gather_u32s);
1180         loader3 = vecw_shuffle8(loader3, gather_u32s);
1181 #    else
1182         VecW tmp_lo = vecw_unpacklo16(loader0, loader1);
1183         VecW tmp_hi = vecw_unpackhi16(loader0, loader1);
1184         loader0 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16);
1185         loader1 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 16), m16);
1186         tmp_lo = vecw_unpacklo16(loader2, loader3);
1187         tmp_hi = vecw_unpackhi16(loader2, loader3);
1188         loader2 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16);
1189         loader3 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 16), m16);
1190 #    endif
1191         //    (0,0) (0,1) (1,0) ... (7,1) (0,2) ... (7,3) (8,0) ... (31,3)
1192         //  + (0,4) ... (31,7)
1193         // -> (0,0) ... (7,3) (0,4) ... (7,7) (8,0) ... (15,7)
1194         const VecW lo_0_15 = vecw_unpacklo32(loader0, loader1);
1195         const VecW lo_16_31 = vecw_unpackhi32(loader0, loader1);
1196         const VecW hi_0_15 = vecw_unpacklo32(loader2, loader3);
1197         const VecW hi_16_31 = vecw_unpackhi32(loader2, loader3);
1198         write_iter0[clidx] = R_CAST(__m128i, vecw_unpacklo64(lo_0_15, hi_0_15));
1199         write_iter1[clidx] = R_CAST(__m128i, vecw_unpackhi64(lo_0_15, hi_0_15));
1200         write_iter2[clidx] = R_CAST(__m128i, vecw_unpacklo64(lo_16_31, hi_16_31));
1201         write_iter3[clidx] = R_CAST(__m128i, vecw_unpackhi64(lo_16_31, hi_16_31));
1202 #  endif
1203       }
1204       buf0_read_iter = &(buf0_read_iter[2048 / kBytesPerVec]);
1205       write_iter0 = &(write_iter3[32]);
1206     }
1207   }
1208 
1209   // movemask from 16 -> 2
1210   const VecW* source_iter = R_CAST(VecW*, buf1);
1211   const VecW m8 = VCONST_W(kMask00FF);
1212 
1213   // Take advantage of current function contract.
1214   const uint32_t buf1_fullrow_ct = (write_batch_size + 3) / 8;
1215 
1216   const uint32_t write_v8ui_stride = kVec8thUintPerWord * write_ul_stride;
1217   const uint32_t vec_ct = DivUp(read_batch_size, (kBytesPerVec / 2));
1218   Vec8thUint* target_iter0 = R_CAST(Vec8thUint*, write_iter);
1219   for (uint32_t uii = 0; uii != buf1_fullrow_ct; ++uii) {
1220     Vec8thUint* target_iter1 = &(target_iter0[write_v8ui_stride]);
1221     Vec8thUint* target_iter2 = &(target_iter1[write_v8ui_stride]);
1222     Vec8thUint* target_iter3 = &(target_iter2[write_v8ui_stride]);
1223     Vec8thUint* target_iter4 = &(target_iter3[write_v8ui_stride]);
1224     Vec8thUint* target_iter5 = &(target_iter4[write_v8ui_stride]);
1225     Vec8thUint* target_iter6 = &(target_iter5[write_v8ui_stride]);
1226     Vec8thUint* target_iter7 = &(target_iter6[write_v8ui_stride]);
1227     for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
1228       const VecW loader = source_iter[vidx];
1229       // Using goal bit-coordinates, where '_' indicates irrelevant content, we
1230       // set target_0123 to
1231       //   _ (0, 0) _ (1, 0) _ (2, 0) _ (3, 0) _ (0, 1) _ (1, 1) _ (2, 1) ...
1232       // and target_4567 to
1233       //   _ (4, 0) _ (5, 0) _ (6, 0) _ (7, 0) _ (4, 1) _ (5, 1) _ (6, 1) ...
1234       // This is perfectly arranged for movemask.
1235       VecW target_4567 = vecw_blendv(loader, vecw_srli(loader, 7), m8);
1236       target_iter7[vidx] = vecw_movemask(target_4567);
1237       target_4567 = vecw_slli(target_4567, 2);
1238       target_iter6[vidx] = vecw_movemask(target_4567);
1239       target_4567 = vecw_slli(target_4567, 2);
1240       target_iter5[vidx] = vecw_movemask(target_4567);
1241       target_4567 = vecw_slli(target_4567, 2);
1242       target_iter4[vidx] = vecw_movemask(target_4567);
1243       VecW target_0123 = vecw_blendv(vecw_slli(loader, 8), vecw_slli(loader, 1), m8);
1244       target_iter3[vidx] = vecw_movemask(target_0123);
1245       target_0123 = vecw_slli(target_0123, 2);
1246       target_iter2[vidx] = vecw_movemask(target_0123);
1247       target_0123 = vecw_slli(target_0123, 2);
1248       target_iter1[vidx] = vecw_movemask(target_0123);
1249       target_0123 = vecw_slli(target_0123, 2);
1250       target_iter0[vidx] = vecw_movemask(target_0123);
1251     }
1252     source_iter = &(source_iter[(2 * kPglNypTransposeBatch) / kBytesPerVec]);
1253     target_iter0 = &(target_iter7[write_v8ui_stride]);
1254   }
1255   if (buf1_fullrow_ct == buf1_row_ct) {
1256     return;
1257   }
1258   Vec8thUint* target_iter1 = &(target_iter0[write_v8ui_stride]);
1259   Vec8thUint* target_iter2 = &(target_iter1[write_v8ui_stride]);
1260   Vec8thUint* target_iter3 = &(target_iter2[write_v8ui_stride]);
1261   for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
1262     const VecW loader = source_iter[vidx];
1263     VecW target_0123 = vecw_blendv(vecw_slli(loader, 8), vecw_slli(loader, 1), m8);
1264     target_iter3[vidx] = vecw_movemask(target_0123);
1265     target_0123 = vecw_slli(target_0123, 2);
1266     target_iter2[vidx] = vecw_movemask(target_0123);
1267     target_0123 = vecw_slli(target_0123, 2);
1268     target_iter1[vidx] = vecw_movemask(target_0123);
1269     target_0123 = vecw_slli(target_0123, 2);
1270     target_iter0[vidx] = vecw_movemask(target_0123);
1271   }
1272 }
1273 #else  // !__LP64__
1274 static_assert(kWordsPerVec == 1, "TransposeNypblock32() needs to be updated.");
1275 void TransposeNypblock32(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1) {
1276   // buf0 and buf1 must each be vector-aligned and have size 16k
1277   // defining them as unsigned char* might prevent a strict-aliasing issue?
1278   // (might need to go through greater contortions to actually be safe?)
1279   const uint32_t buf_row_ct = NypCtToByteCt(write_batch_size);
1280   // fold the first 6 shuffles into the initial ingestion loop
1281   const unsigned char* initial_read_iter = R_CAST(const unsigned char*, read_iter);
1282   const unsigned char* initial_read_end = &(initial_read_iter[buf_row_ct]);
1283   unsigned char* initial_target_iter = buf0;
1284   const uint32_t read_byte_stride = read_ul_stride * kBytesPerWord;
1285   const uint32_t read_batch_rem = kNypsPerCacheline - read_batch_size;
1286   for (; initial_read_iter != initial_read_end; ++initial_read_iter) {
1287     const unsigned char* read_iter_tmp = initial_read_iter;
1288     for (uint32_t ujj = 0; ujj != read_batch_size; ++ujj) {
1289       *initial_target_iter++ = *read_iter_tmp;
1290       read_iter_tmp = &(read_iter_tmp[read_byte_stride]);
1291     }
1292     initial_target_iter = memsetua(initial_target_iter, 0, read_batch_rem);
1293   }
1294 
1295   // second-to-last shuffle, 8 bit spacing -> 4
1296   const VecW* source_iter = R_CAST(VecW*, buf0);
1297   uintptr_t* target_iter0 = R_CAST(uintptr_t*, buf1);
1298   const uint32_t write_word_ct = NypCtToWordCt(read_batch_size);
1299   const uint32_t penult_inner_loop_iter_ct = 2 * write_word_ct;
1300   const uint32_t cur_write_skip = 2 * kWordsPerCacheline - penult_inner_loop_iter_ct;
1301   for (uint32_t uii = 0; uii != buf_row_ct; ++uii) {
1302     uintptr_t* target_iter1 = &(target_iter0[kWordsPerCacheline * 2]);
1303     for (uint32_t ujj = 0; ujj != penult_inner_loop_iter_ct; ++ujj) {
1304       const uintptr_t source_word_lo = *source_iter++;
1305       const uintptr_t source_word_hi = *source_iter++;
1306       uintptr_t target_word0_lo = source_word_lo & kMask0F0F;
1307       uintptr_t target_word1_lo = (source_word_lo >> 4) & kMask0F0F;
1308       uintptr_t target_word0_hi = source_word_hi & kMask0F0F;
1309       uintptr_t target_word1_hi = (source_word_hi >> 4) & kMask0F0F;
1310       target_word0_lo = (target_word0_lo | (target_word0_lo >> 4)) & kMask00FF;
1311       target_word1_lo = (target_word1_lo | (target_word1_lo >> 4)) & kMask00FF;
1312       target_word0_hi = (target_word0_hi | (target_word0_hi >> 4)) & kMask00FF;
1313       target_word1_hi = (target_word1_hi | (target_word1_hi >> 4)) & kMask00FF;
1314       target_word0_lo = target_word0_lo | (target_word0_lo >> kBitsPerWordD4);
1315       target_word1_lo = target_word1_lo | (target_word1_lo >> kBitsPerWordD4);
1316       target_word0_hi = target_word0_hi | (target_word0_hi >> kBitsPerWordD4);
1317       target_word1_hi = target_word1_hi | (target_word1_hi >> kBitsPerWordD4);
1318       *target_iter0++ = S_CAST(Halfword, target_word0_lo) | (target_word0_hi << kBitsPerWordD2);
1319       *target_iter1++ = S_CAST(Halfword, target_word1_lo) | (target_word1_hi << kBitsPerWordD2);
1320     }
1321     source_iter = &(source_iter[2 * cur_write_skip]);
1322     target_iter0 = &(target_iter1[cur_write_skip]);
1323   }
1324 
1325   // last shuffle, 4 bit spacing -> 2
1326   source_iter = R_CAST(VecW*, buf1);
1327   target_iter0 = write_iter;
1328   const uint32_t last_loop_iter_ct = DivUp(write_batch_size, 2);
1329   for (uint32_t uii = 0; uii != last_loop_iter_ct; ++uii) {
1330     uintptr_t* target_iter1 = &(target_iter0[write_ul_stride]);
1331     for (uint32_t ujj = 0; ujj != write_word_ct; ++ujj) {
1332       const uintptr_t source_word_lo = *source_iter++;
1333       const uintptr_t source_word_hi = *source_iter++;
1334       uintptr_t target_word0_lo = source_word_lo & kMask3333;
1335       uintptr_t target_word1_lo = (source_word_lo >> 2) & kMask3333;
1336       uintptr_t target_word0_hi = source_word_hi & kMask3333;
1337       uintptr_t target_word1_hi = (source_word_hi >> 2) & kMask3333;
1338       target_word0_lo = (target_word0_lo | (target_word0_lo >> 2)) & kMask0F0F;
1339       target_word1_lo = (target_word1_lo | (target_word1_lo >> 2)) & kMask0F0F;
1340       target_word0_hi = (target_word0_hi | (target_word0_hi >> 2)) & kMask0F0F;
1341       target_word1_hi = (target_word1_hi | (target_word1_hi >> 2)) & kMask0F0F;
1342       target_word0_lo = (target_word0_lo | (target_word0_lo >> 4)) & kMask00FF;
1343       target_word1_lo = (target_word1_lo | (target_word1_lo >> 4)) & kMask00FF;
1344       target_word0_hi = (target_word0_hi | (target_word0_hi >> 4)) & kMask00FF;
1345       target_word1_hi = (target_word1_hi | (target_word1_hi >> 4)) & kMask00FF;
1346       target_word0_lo = target_word0_lo | (target_word0_lo >> kBitsPerWordD4);
1347       target_word1_lo = target_word1_lo | (target_word1_lo >> kBitsPerWordD4);
1348       target_word0_hi = target_word0_hi | (target_word0_hi >> kBitsPerWordD4);
1349       target_word1_hi = target_word1_hi | (target_word1_hi >> kBitsPerWordD4);
1350       target_iter0[ujj] = S_CAST(Halfword, target_word0_lo) | (target_word0_hi << kBitsPerWordD2);
1351       target_iter1[ujj] = S_CAST(Halfword, target_word1_lo) | (target_word1_hi << kBitsPerWordD2);
1352     }
1353     source_iter = &(source_iter[2 * (kWordsPerCacheline - write_word_ct)]);
1354     target_iter0 = &(target_iter1[write_ul_stride]);
1355   }
1356 }
1357 #endif  // !__LP64__
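
// Reference semantics sketch (hypothetical helper, not part of the library):
// what either TransposeNypblock variant computes, one nyp at a time.  Entry
// (i, j) of the read matrix lands at (j, i) of the write matrix; strides are
// in words, and the destination is assumed to be zero-initialized.
static inline void TransposeNypblockNaive(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter) {
  for (uint32_t i = 0; i != read_batch_size; ++i) {
    for (uint32_t j = 0; j != write_batch_size; ++j) {
      const uintptr_t nyp = (read_iter[i * read_ul_stride + (j / kBitsPerWordD2)] >> (2 * (j % kBitsPerWordD2))) & 3;
      write_iter[j * write_ul_stride + (i / kBitsPerWordD2)] |= nyp << (2 * (i % kBitsPerWordD2));
    }
  }
}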
1358 
1359 void BiallelicDosage16Invert(uint32_t dosage_ct, uint16_t* dosage_main) {
1360   // replace each x with (32768 - x); since dosages are stored in units of
1361   // 1/16384 of an allele count, this maps dosage d to (2.0 - d).
   // compiler is smart enough to vectorize this.
1362   for (uint32_t uii = 0; uii != dosage_ct; ++uii) {
1363     dosage_main[uii] = 32768 - dosage_main[uii];
1364   }
1365 }
1366 
1367 void BiallelicDphase16Invert(uint32_t dphase_ct, int16_t* dphase_delta) {
1368   for (uint32_t uii = 0; uii != dphase_ct; ++uii) {
1369     dphase_delta[uii] = -dphase_delta[uii];
1370   }
1371 }
1372 
1373 void GenoarrToMissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict missingness) {
1374   const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
1375   Halfword* missingness_alias = R_CAST(Halfword*, missingness);
1376   for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
1377     const uintptr_t cur_geno_word = genoarr[widx];
1378     missingness_alias[widx] = PackWordToHalfwordMask5555(cur_geno_word & (cur_geno_word >> 1));
1379   }
1380   if (sample_ctl2 % 2) {
1381     missingness_alias[sample_ctl2] = 0;
1382   }
1383 }
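
// Scalar illustration (hypothetical helper, not part of the library):
// genotype 3 (binary 11) marks a missing call, so geno_word & (geno_word >> 1)
// leaves the low bit of each missing nyp set, and PackWordToHalfwordMask5555
// compresses those even-position bits into one bit per sample.
static inline Halfword MissingnessHwDemo(uintptr_t geno_word) {
  return PackWordToHalfwordMask5555(geno_word & (geno_word >> 1));
}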
1384 
1385 void GenoarrToNonmissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict nonmissingness) {
1386   const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
1387   Halfword* nonmissingness_alias = R_CAST(Halfword*, nonmissingness);
1388   for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
1389     const uintptr_t cur_geno_word = genoarr[widx];
1390     nonmissingness_alias[widx] = PackWordToHalfwordMask5555(~(cur_geno_word & (cur_geno_word >> 1)));
1391   }
1392 }
1393 
1394 void SplitHomRef2hetUnsafeW(const uintptr_t* genoarr, uint32_t inword_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf) {
1395   Halfword* hom_alias = R_CAST(Halfword*, hom_buf);
1396   Halfword* ref2het_alias = R_CAST(Halfword*, ref2het_buf);
1397   for (uint32_t widx = 0; widx != inword_ct; ++widx) {
1398     const uintptr_t geno_word = genoarr[widx];
1399     hom_alias[widx] = PackWordToHalfwordMask5555(~geno_word);
1400     ref2het_alias[widx] = PackWordToHalfwordMaskAAAA(~geno_word);
1401   }
1402 }
1403 
1404 void SplitHomRef2het(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf) {
1405   const uint32_t full_outword_ct = sample_ct / kBitsPerWord;
1406   SplitHomRef2hetUnsafeW(genoarr, full_outword_ct * 2, hom_buf, ref2het_buf);
1407   const uint32_t remainder = sample_ct % kBitsPerWord;
1408   if (remainder) {
1409     uintptr_t geno_word = genoarr[full_outword_ct * 2];
1410     uintptr_t hom_word = PackWordToHalfwordMask5555(~geno_word);
1411     uintptr_t ref2het_word = PackWordToHalfwordMaskAAAA(~geno_word);
1412     if (remainder > kBitsPerWordD2) {
1413       geno_word = genoarr[full_outword_ct * 2 + 1];
1414       hom_word |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(~geno_word)) << kBitsPerWordD2;
1415       ref2het_word |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(~geno_word)) << kBitsPerWordD2;
1416     }
1417     const uintptr_t cur_mask = (k1LU << remainder) - 1;
1418     hom_buf[full_outword_ct] = hom_word & cur_mask;
1419     ref2het_buf[full_outword_ct] = ref2het_word & cur_mask;
1420   }
1421 }
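
// Worked example of the encoding above: for nyp values {0, 1, 2, 3},
// ~geno_word has per-nyp (high, low) bit pairs {11, 10, 01, 00}, so
//   hom bit (low bit of complement):      {1, 0, 1, 0}  (genotype 0 or 2)
//   ref2het bit (high bit of complement): {1, 1, 0, 0}  (genotype 0 or 1)
// i.e. (hom, ref2het) = (1,1) hom-ref, (0,1) het, (1,0) hom-alt,
// (0,0) missing.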
1422 
1423 // Note that once the result elements are only 4 bits wide, shuffle8 should
1424 // beat a 256x4 lookup table when SSE4 is available.  (possible todo: see if
1425 // shuffle8-based loop also wins here.)
1426 void GenoarrLookup256x1bx4(const uintptr_t* genoarr, const void* table256x1bx4, uint32_t sample_ct, void* __restrict result) {
1427   const uint32_t* table_alias = S_CAST(const uint32_t*, table256x1bx4);
1428   const unsigned char* genoarr_alias = R_CAST(const unsigned char*, genoarr);
1429   uint32_t* result_alias = S_CAST(uint32_t*, result);
1430   const uint32_t full_byte_ct = sample_ct / 4;
1431   for (uint32_t byte_idx = 0; byte_idx != full_byte_ct; ++byte_idx) {
1432     result_alias[byte_idx] = table_alias[genoarr_alias[byte_idx]];
1433   }
1434   const uint32_t remainder = sample_ct % 4;
1435   if (remainder) {
1436     unsigned char* result_last = R_CAST(unsigned char*, &(result_alias[full_byte_ct]));
1437     uintptr_t geno_byte = genoarr_alias[full_byte_ct];
1438     for (uint32_t uii = 0; uii != remainder; ++uii) {
1439       result_last[uii] = table_alias[geno_byte & 3];
1440       geno_byte >>= 2;
1441     }
1442   }
1443 }
1444 
1445 void GenoarrLookup16x4bx2(const uintptr_t* genoarr, const void* table16x4bx2, uint32_t sample_ct, void* __restrict result) {
1446   const uint64_t* table_alias = S_CAST(const uint64_t*, table16x4bx2);
1447   uint64_t* result_iter = S_CAST(uint64_t*, result);
1448   const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1449   uint32_t loop_len = kBitsPerWordD4;
1450   uintptr_t geno_word = 0;
1451   for (uint32_t widx = 0; ; ++widx) {
1452     if (widx >= sample_ctl2m1) {
1453       if (widx > sample_ctl2m1) {
1454         if (sample_ct % 2) {
1455           memcpy(result_iter, &(table_alias[geno_word & 3]), 4);
1456         }
1457         return;
1458       }
1459       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1460     }
1461     geno_word = genoarr[widx];
1462     for (uint32_t uii = 0; uii != loop_len; ++uii) {
1463       const uintptr_t cur_2geno = geno_word & 15;
1464       *result_iter++ = table_alias[cur_2geno];
1465       geno_word >>= 4;
1466     }
1467   }
1468 }
1469 
1470 // this might be important for genovec -> AlleleCode expansion
1471 void GenoarrLookup256x2bx4(const uintptr_t* genoarr, const void* table256x2bx4, uint32_t sample_ct, void* __restrict result) {
1472   const uint64_t* table_alias = S_CAST(const uint64_t*, table256x2bx4);
1473   const unsigned char* genoarr_alias = R_CAST(const unsigned char*, genoarr);
1474   uint64_t* result_alias = S_CAST(uint64_t*, result);
1475   const uint32_t full_byte_ct = sample_ct / 4;
1476   for (uint32_t byte_idx = 0; byte_idx != full_byte_ct; ++byte_idx) {
1477     result_alias[byte_idx] = table_alias[genoarr_alias[byte_idx]];
1478   }
1479   const uint32_t remainder = sample_ct % 4;
1480   if (remainder) {
1481     uint16_t* result_last = R_CAST(uint16_t*, &(result_alias[full_byte_ct]));
1482     uintptr_t geno_byte = genoarr_alias[full_byte_ct];
1483     for (uint32_t uii = 0; uii != remainder; ++uii) {
1484       memcpy_k(&(result_last[uii]), &(table_alias[geno_byte & 3]), 2);
1485       geno_byte >>= 2;
1486     }
1487   }
1488 }
1489 
1490 #ifdef __LP64__
1491 void GenoarrLookup4x16b(const uintptr_t* genoarr, const void* table4x16b, uint32_t sample_ct, void* result) {
1492   const __m128i* table_alias = S_CAST(const __m128i*, table4x16b);
1493   __m128i* result_iter = S_CAST(__m128i*, result);
1494   const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1495   uint32_t loop_len = kBitsPerWordD2;
1496   uintptr_t geno_word = 0;
1497   for (uint32_t widx = 0; ; ++widx) {
1498     if (widx >= sample_ctl2m1) {
1499       if (widx > sample_ctl2m1) {
1500         return;
1501       }
1502       loop_len = ModNz(sample_ct, kBitsPerWordD2);
1503     }
1504     geno_word = genoarr[widx];
1505     for (uint32_t uii = 0; uii != loop_len; ++uii) {
1506       _mm_storeu_si128(result_iter, table_alias[geno_word & 3]);
1507       ++result_iter;
1508       geno_word >>= 2;
1509     }
1510   }
1511 }
1512 
1513 void GenoarrLookup16x8bx2(const uintptr_t* genoarr, const void* table16x8bx2, uint32_t sample_ct, void* __restrict result) {
1514   const __m128i* table_alias = S_CAST(const __m128i*, table16x8bx2);
1515   __m128i* result_iter = S_CAST(__m128i*, result);
1516   const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1517   uint32_t loop_len = kBitsPerWordD4;
1518   uintptr_t geno_word = 0;
1519   for (uint32_t widx = 0; ; ++widx) {
1520     if (widx >= sample_ctl2m1) {
1521       if (widx > sample_ctl2m1) {
1522         if (sample_ct % 2) {
1523           memcpy(result_iter, &(table_alias[geno_word & 3]), 8);
1524         }
1525         return;
1526       }
1527       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1528     }
1529     geno_word = genoarr[widx];
1530     for (uint32_t uii = 0; uii != loop_len; ++uii) {
1531       const uintptr_t cur_2geno = geno_word & 15;
1532       _mm_storeu_si128(result_iter, table_alias[cur_2geno]);
1533       ++result_iter;
1534       geno_word >>= 4;
1535     }
1536   }
1537 }
1538 
1539 void GenoarrLookup256x4bx4(const uintptr_t* genoarr, const void* table256x4bx4, uint32_t sample_ct, void* __restrict result) {
1540   const __m128i* table_alias = S_CAST(const __m128i*, table256x4bx4);
1541   const unsigned char* genoarr_alias = R_CAST(const unsigned char*, genoarr);
1542   __m128i* result_alias = S_CAST(__m128i*, result);
1543   const uint32_t full_byte_ct = sample_ct / 4;
1544   for (uint32_t byte_idx = 0; byte_idx != full_byte_ct; ++byte_idx) {
1545     _mm_storeu_si128(&(result_alias[byte_idx]), table_alias[genoarr_alias[byte_idx]]);
1546   }
1547   const uint32_t remainder = sample_ct % 4;
1548   if (remainder) {
1549     uint32_t* result_last = R_CAST(uint32_t*, &(result_alias[full_byte_ct]));
1550     uintptr_t geno_byte = genoarr_alias[full_byte_ct];
1551     for (uint32_t uii = 0; uii != remainder; ++uii) {
1552       memcpy(&(result_last[uii]), &(table_alias[geno_byte & 3]), 4);
1553       geno_byte >>= 2;
1554     }
1555   }
1556 }
1557 #else
1558 void GenoarrLookup4x16b(const uintptr_t* genoarr, const void* table4x16b, uint32_t sample_ct, void* result) {
1559   const uint64_t* table_alias = S_CAST(const uint64_t*, table4x16b);
1560   uint64_t* result_iter = S_CAST(uint64_t*, result);
1561   const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1562   uint32_t loop_len = kBitsPerWordD2;
1563   uintptr_t geno_word = 0;
1564   for (uint32_t widx = 0; ; ++widx) {
1565     if (widx >= sample_ctl2m1) {
1566       if (widx > sample_ctl2m1) {
1567         return;
1568       }
1569       loop_len = ModNz(sample_ct, kBitsPerWordD2);
1570     }
1571     geno_word = genoarr[widx];
1572     for (uint32_t uii = 0; uii != loop_len; ++uii) {
1573       memcpy(result_iter, &(table_alias[(geno_word & 3) * 2]), 16);
1574       result_iter = &(result_iter[2]);
1575       geno_word >>= 2;
1576     }
1577   }
1578 }
1579 
1580 void GenoarrLookup16x8bx2(const uintptr_t* genoarr, const void* table16x8bx2, uint32_t sample_ct, void* __restrict result) {
1581   const uint64_t* table_alias = S_CAST(const uint64_t*, table16x8bx2);
1582   uint64_t* result_iter = S_CAST(uint64_t*, result);
1583   const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1584   uint32_t loop_len = kBitsPerWordD4;
1585   uintptr_t geno_word = 0;
1586   for (uint32_t widx = 0; ; ++widx) {
1587     if (widx >= sample_ctl2m1) {
1588       if (widx > sample_ctl2m1) {
1589         if (sample_ct % 2) {
1590           memcpy(result_iter, &(table_alias[(geno_word & 3) * 2]), 8);
1591         }
1592         return;
1593       }
1594       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1595     }
1596     geno_word = genoarr[widx];
1597     for (uint32_t uii = 0; uii != loop_len; ++uii) {
1598       const uintptr_t cur_2geno = geno_word & 15;
1599       memcpy(result_iter, &(table_alias[cur_2geno * 2]), 16);
1600       result_iter = &(result_iter[2]);
1601       geno_word >>= 4;
1602     }
1603   }
1604 }
1605 
1606 void GenoarrLookup256x4bx4(const uintptr_t* genoarr, const void* table256x4bx4, uint32_t sample_ct, void* __restrict result) {
1607   const uint32_t* table_alias = S_CAST(const uint32_t*, table256x4bx4);
1608   const unsigned char* genoarr_alias = R_CAST(const unsigned char*, genoarr);
1609   uint32_t* result_alias = S_CAST(uint32_t*, result);
1610   const uint32_t full_byte_ct = sample_ct / 4;
1611   for (uint32_t byte_idx = 0; byte_idx != full_byte_ct; ++byte_idx) {
1612     memcpy(&(result_alias[byte_idx * 4]), &(table_alias[genoarr_alias[byte_idx] * 4]), 16);
1613   }
1614   const uint32_t remainder = sample_ct % 4;
1615   if (remainder) {
1616     uint32_t* result_last = &(result_alias[full_byte_ct * 4]);
1617     uintptr_t geno_byte = genoarr_alias[full_byte_ct];
1618     for (uint32_t uii = 0; uii != remainder; ++uii) {
1619       memcpy(&(result_last[uii]), &(table_alias[(geno_byte & 3) * 4]), 4);
1620       geno_byte >>= 2;
1621     }
1622   }
1623 }
1624 #endif
1625 
1626 void InitLookup16x4bx2(void* table16x4bx2) {
1627   uint32_t* table_iter = S_CAST(uint32_t*, table16x4bx2);
1628   uint32_t vals[4];
1629   vals[0] = table_iter[0];
1630   table_iter[1] = vals[0];
1631   vals[1] = table_iter[2];
1632   table_iter[3] = vals[0];
1633   vals[2] = table_iter[4];
1634   table_iter[5] = vals[0];
1635   vals[3] = table_iter[6];
1636   table_iter[7] = vals[0];
1637   table_iter = &(table_iter[8]);
1638   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1639     const uint32_t cur_high = vals[high_idx];
1640     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1641       *table_iter++ = vals[low_idx];
1642       *table_iter++ = cur_high;
1643     }
1644   }
1645 }
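
// Usage sketch (hypothetical values, not part of the library): seed entries
// [0][0], [1][0], [2][0], [3][0] (4-byte positions 0, 2, 4, 6), expand with
// the function above, then translate two genotypes at a time.  dst should be
// 8-byte aligned with room for sample_ct floats.
static inline void Lookup16x4bx2Demo(const uintptr_t* genoarr, uint32_t sample_ct, float* dst) {
  uint64_t table[16];  // 16 entries x two 4-byte payloads
  float* table_f = R_CAST(float*, table);
  table_f[0] = 0.0f;   // genotype 0
  table_f[2] = 1.0f;   // genotype 1
  table_f[4] = 2.0f;   // genotype 2
  table_f[6] = -9.0f;  // genotype 3 (missing) -> sentinel
  InitLookup16x4bx2(table);
  GenoarrLookup16x4bx2(genoarr, table, sample_ct, dst);
}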
1646 
1647 void InitLookup16x8bx2(void* table16x8bx2) {
1648   uint64_t* table_iter = S_CAST(uint64_t*, table16x8bx2);
1649   uint64_t vals[4];
1650   vals[0] = table_iter[0];
1651   table_iter[1] = vals[0];
1652   vals[1] = table_iter[2];
1653   table_iter[3] = vals[0];
1654   vals[2] = table_iter[4];
1655   table_iter[5] = vals[0];
1656   vals[3] = table_iter[6];
1657   table_iter[7] = vals[0];
1658   table_iter = &(table_iter[8]);
1659   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1660     // bugfix (20 Jun 2018): cur_high needs to be a uint64_t, not a uint32_t
1661     const uint64_t cur_high = vals[high_idx];
1662     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1663       *table_iter++ = vals[low_idx];
1664       *table_iter++ = cur_high;
1665     }
1666   }
1667 }
1668 
1669 void InitLookup256x1bx4(void* table256x1bx4) {
1670   unsigned char* table_iter = S_CAST(unsigned char*, table256x1bx4);
1671   unsigned char vals[4];
1672   vals[0] = table_iter[0];
1673   vals[1] = table_iter[4];
1674   vals[2] = table_iter[8];
1675   vals[3] = table_iter[12];
1676   for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
1677     const uint32_t cur_high = vals[high_idx];
1678     for (uint32_t second_idx = 0; second_idx != 4; ++second_idx) {
1679       const uint32_t cur_second = vals[second_idx];
1680       for (uint32_t third_idx = 0; third_idx != 4; ++third_idx) {
1681         const uint32_t cur_third = vals[third_idx];
1682         for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1683           *table_iter++ = vals[low_idx];
1684           *table_iter++ = cur_third;
1685           *table_iter++ = cur_second;
1686           *table_iter++ = cur_high;
1687         }
1688       }
1689     }
1690   }
1691 }
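
// Usage sketch (hypothetical values, not part of the library): seed byte
// entries [0], [4], [8], [12] with the four per-genotype output bytes,
// expand with the function above, then translate four genotypes per byte.
// dst should be 4-byte aligned with room for sample_ct bytes.
static inline void Lookup256x1bx4Demo(const uintptr_t* genoarr, uint32_t sample_ct, unsigned char* dst) {
  uint32_t table[256];  // 256 entries x four 1-byte payloads
  unsigned char* seed = R_CAST(unsigned char*, table);
  seed[0] = 0;     // genotype 0
  seed[4] = 1;     // genotype 1
  seed[8] = 2;     // genotype 2
  seed[12] = 255;  // genotype 3 (missing) -> sentinel
  InitLookup256x1bx4(table);
  GenoarrLookup256x1bx4(genoarr, table, sample_ct, dst);
}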
1692 
1693 void InitLookup256x2bx4(void* table256x2bx4) {
1694   uint16_t* table_iter = S_CAST(uint16_t*, table256x2bx4);
1695   uint16_t vals[4];
1696   vals[0] = table_iter[0];
1697   vals[1] = table_iter[4];
1698   vals[2] = table_iter[8];
1699   vals[3] = table_iter[12];
1700   for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
1701     const uint32_t cur_high = vals[high_idx];
1702     for (uint32_t second_idx = 0; second_idx != 4; ++second_idx) {
1703       const uint32_t cur_second = vals[second_idx];
1704       for (uint32_t third_idx = 0; third_idx != 4; ++third_idx) {
1705         const uint32_t cur_third = vals[third_idx];
1706         for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1707           *table_iter++ = vals[low_idx];
1708           *table_iter++ = cur_third;
1709           *table_iter++ = cur_second;
1710           *table_iter++ = cur_high;
1711         }
1712       }
1713     }
1714   }
1715 }
1716 
1717 void InitLookup256x4bx4(void* table256x4bx4) {
1718   uint32_t* table_iter = S_CAST(uint32_t*, table256x4bx4);
1719   uint32_t vals[4];
1720   vals[0] = table_iter[0];
1721   vals[1] = table_iter[4];
1722   vals[2] = table_iter[8];
1723   vals[3] = table_iter[12];
1724   for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
1725     const uint32_t cur_high = vals[high_idx];
1726     for (uint32_t second_idx = 0; second_idx != 4; ++second_idx) {
1727       const uint32_t cur_second = vals[second_idx];
1728       for (uint32_t third_idx = 0; third_idx != 4; ++third_idx) {
1729         const uint32_t cur_third = vals[third_idx];
1730         for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1731           *table_iter++ = vals[low_idx];
1732           *table_iter++ = cur_third;
1733           *table_iter++ = cur_second;
1734           *table_iter++ = cur_high;
1735         }
1736       }
1737     }
1738   }
1739 }
1740 
1741 void PhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x4bx2, uint32_t sample_ct, void* __restrict result) {
1742   const uint64_t* table_alias = S_CAST(const uint64_t*, table56x4bx2);
1743   const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
1744   const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
1745   const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
1746   uint64_t* result_iter = S_CAST(uint64_t*, result);
1747   uint32_t loop_len = kBitsPerWordD4;
1748   uintptr_t geno_word = 0;
1749   uintptr_t phasepresent_hw_shifted = 0;
1750   uintptr_t phaseinfo_hw_shifted = 0;
1751   for (uint32_t widx = 0; ; ++widx) {
1752     if (widx >= sample_ctl2_m1) {
1753       if (widx > sample_ctl2_m1) {
1754         if (sample_ct % 2) {
1755           uintptr_t cur_idx = (geno_word & 3);
1756           // assume trailing bits of phasepresent/phaseinfo clear
1757           // phaseinfo_hw_shifted not necessarily updated, so need if-statement
1758           // bugfix (25 Jun 2018): must only consider active bit of
1759           // phasepresent_hw_shifted, not the already-processed ones
1760           if (phasepresent_hw_shifted & 16) {
1761             cur_idx ^= 16 | (phaseinfo_hw_shifted & 2);
1762           }
1763           memcpy(result_iter, &(table_alias[cur_idx]), 4);
1764         }
1765         return;
1766       }
1767       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1768     }
1769     geno_word = genoarr[widx];
1770     phasepresent_hw_shifted = phasepresent_alias[widx];
1771     if (!phasepresent_hw_shifted) {
1772       for (uint32_t uii = 0; uii != loop_len; ++uii) {
1773         *result_iter++ = table_alias[geno_word & 15];
1774         geno_word >>= 4;
1775       }
1776     } else {
1777       phasepresent_hw_shifted = phasepresent_hw_shifted << 4;
1778       phaseinfo_hw_shifted = phaseinfo_alias[widx];
1779 
1780       // note that this must be on a separate line (or we have to static_cast)
1781       phaseinfo_hw_shifted = phaseinfo_hw_shifted << 1;
1782 
1783       for (uint32_t uii = 0; uii != loop_len; ++uii) {
1784         const uintptr_t cur_idx = ((geno_word & 15) | (phasepresent_hw_shifted & 48)) ^ (phaseinfo_hw_shifted & 6);
1785         *result_iter++ = table_alias[cur_idx];
1786         geno_word >>= 4;
1787         phasepresent_hw_shifted >>= 2;
1788         phaseinfo_hw_shifted >>= 2;
1789       }
1790     }
1791   }
1792 }
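
// Index construction sketch (hypothetical helper, not part of the library)
// for the 56-entry table above: bits 0..3 hold two genotypes, bits 4..5 the
// two phasepresent bits, and the two phaseinfo bits are xor'ed in at bits
// 1..2.  Since phaseinfo can only be set where phasepresent is set and a
// phased genotype must be het, only 56 of the 64 nominal indices are
// reachable.
static inline uintptr_t PhaseLookupIdxDemo(uintptr_t two_genos, uintptr_t two_phasepresent, uintptr_t two_phaseinfo) {
  return ((two_genos & 15) | ((two_phasepresent & 3) << 4)) ^ ((two_phaseinfo & 3) << 1);
}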
1793 
1794 void InitPhaseLookup4b(void* table56x4bx2) {
1795   uint32_t* table_iter = S_CAST(uint32_t*, table56x4bx2);
1796   uint32_t vals[4];
1797   vals[0] = table_iter[0];
1798   table_iter[1] = vals[0];
1799   vals[1] = table_iter[2];
1800   table_iter[3] = vals[0];
1801   vals[2] = table_iter[4];
1802   table_iter[5] = vals[0];
1803   vals[3] = table_iter[6];
1804   table_iter[7] = vals[0];
1805   table_iter = &(table_iter[8]);
1806   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1807     const uint32_t cur_high = vals[high_idx];
1808     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1809       *table_iter++ = vals[low_idx];
1810       *table_iter++ = cur_high;
1811     }
1812   }
1813   // [16][0]..[31][1]: bit 4 is set
1814   // low bits must be 01 or 11
1815   const uint32_t val_phaseinfo0 = table_iter[2];
1816   table_iter[3] = vals[0];
1817   const uint32_t val_phaseinfo1 = table_iter[6];
1818   table_iter[7] = vals[0];
1819   table_iter = &(table_iter[8]);
1820   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1821     const uint32_t cur_high = vals[high_idx];
1822     table_iter[2] = val_phaseinfo0;
1823     table_iter[3] = cur_high;
1824     table_iter[6] = val_phaseinfo1;
1825     table_iter[7] = cur_high;
1826     table_iter = &(table_iter[8]);
1827   }
1828   // [32][0]..[39][1]: bit 5 set, bit 4 unset
1829   // high bits must be 00 or 01
1830   for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
1831     const uint32_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
1832     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1833       *table_iter++ = vals[low_idx];
1834       *table_iter++ = cur_high;
1835     }
1836   }
1837   table_iter = &(table_iter[16]);
1838   // [48][0]..[55][1]: bits 4 and 5 set
1839   for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
1840     const uint32_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
1841     table_iter[2] = val_phaseinfo0;
1842     table_iter[3] = cur_high;
1843     table_iter[6] = val_phaseinfo1;
1844     table_iter[7] = cur_high;
1845     table_iter = &(table_iter[8]);
1846   }
1847 }
1848 
1849 #ifdef __LP64__
1850 void PhaseLookup8b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x8bx2, uint32_t sample_ct, void* __restrict result) {
1851   const __m128i* table_alias = S_CAST(const __m128i*, table56x8bx2);
1852   const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
1853   const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
1854   const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
1855   __m128i* result_iter = S_CAST(__m128i*, result);
1856   uint32_t loop_len = kBitsPerWordD4;
1857   uintptr_t geno_word = 0;
1858   uintptr_t phasepresent_hw_shifted = 0;
1859   uintptr_t phaseinfo_hw_shifted = 0;
1860   for (uint32_t widx = 0; ; ++widx) {
1861     if (widx >= sample_ctl2_m1) {
1862       if (widx > sample_ctl2_m1) {
1863         if (sample_ct % 2) {
1864           uintptr_t cur_idx = (geno_word & 3);
1865           if (phasepresent_hw_shifted & 16) {
1866             cur_idx ^= 16 | (phaseinfo_hw_shifted & 2);
1867           }
1868           memcpy(result_iter, &(table_alias[cur_idx]), 8);
1869         }
1870         return;
1871       }
1872       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1873     }
1874     geno_word = genoarr[widx];
1875     phasepresent_hw_shifted = phasepresent_alias[widx];
1876     if (!phasepresent_hw_shifted) {
1877       for (uint32_t uii = 0; uii != loop_len; ++uii) {
1878         _mm_storeu_si128(result_iter, table_alias[geno_word & 15]);
1879         ++result_iter;
1880         geno_word >>= 4;
1881       }
1882     } else {
1883       phasepresent_hw_shifted = phasepresent_hw_shifted << 4;
1884       phaseinfo_hw_shifted = phaseinfo_alias[widx];
1885       phaseinfo_hw_shifted = phaseinfo_hw_shifted << 1;
1886       for (uint32_t uii = 0; uii != loop_len; ++uii) {
1887         const uintptr_t cur_idx = ((geno_word & 15) | (phasepresent_hw_shifted & 48)) ^ (phaseinfo_hw_shifted & 6);
1888         _mm_storeu_si128(result_iter, table_alias[cur_idx]);
1889         ++result_iter;
1890         geno_word >>= 4;
1891         phasepresent_hw_shifted >>= 2;
1892         phaseinfo_hw_shifted >>= 2;
1893       }
1894     }
1895   }
1896 }
1897 #else
1898 void PhaseLookup8b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x8bx2, uint32_t sample_ct, void* __restrict result) {
1899   const uint64_t* table_alias = S_CAST(const uint64_t*, table56x8bx2);
1900   const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
1901   const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
1902   const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
1903   uint64_t* result_iter = S_CAST(uint64_t*, result);
1904   uint32_t loop_len = kBitsPerWordD4;
1905   uintptr_t geno_word = 0;
1906   uintptr_t phasepresent_hw_shifted = 0;
1907   uintptr_t phaseinfo_hw_shifted = 0;
1908   for (uint32_t widx = 0; ; ++widx) {
1909     if (widx >= sample_ctl2_m1) {
1910       if (widx > sample_ctl2_m1) {
1911         if (sample_ct % 2) {
1912           uintptr_t cur_idx = (geno_word & 3);
1913           if (phasepresent_hw_shifted & 16) {
1914             cur_idx ^= 16 | (phaseinfo_hw_shifted & 2);
1915           }
1916           memcpy(result_iter, &(table_alias[cur_idx * 2]), 8);
1917         }
1918         return;
1919       }
1920       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1921     }
1922     geno_word = genoarr[widx];
1923     phasepresent_hw_shifted = phasepresent_alias[widx];
1924     if (!phasepresent_hw_shifted) {
1925       for (uint32_t uii = 0; uii != loop_len; ++uii) {
1926         memcpy(result_iter, &(table_alias[(geno_word & 15) * 2]), 16);
1927         result_iter = &(result_iter[2]);
1928         geno_word >>= 4;
1929       }
1930     } else {
1931       phasepresent_hw_shifted = phasepresent_hw_shifted << 4;
1932       phaseinfo_hw_shifted = phaseinfo_alias[widx];
1933       phaseinfo_hw_shifted = phaseinfo_hw_shifted << 1;
1934       for (uint32_t uii = 0; uii != loop_len; ++uii) {
1935         const uintptr_t cur_idx = ((geno_word & 15) | (phasepresent_hw_shifted & 48)) ^ (phaseinfo_hw_shifted & 6);
1936         memcpy(result_iter, &(table_alias[cur_idx * 2]), 16);
        // bugfix: result_iter was not advanced in this branch, so every
        // iteration overwrote the same 16 bytes
        result_iter = &(result_iter[2]);
1937         geno_word >>= 4;
1938         phasepresent_hw_shifted >>= 2;
1939         phaseinfo_hw_shifted >>= 2;
1940       }
1941     }
1942   }
1943 }
1944 #endif
1945 
1946 void InitPhaseLookup8b(void* table56x8bx2) {
1947   uint64_t* table_iter = S_CAST(uint64_t*, table56x8bx2);
1948   uint64_t vals[4];
1949   vals[0] = table_iter[0];
1950   table_iter[1] = vals[0];
1951   vals[1] = table_iter[2];
1952   table_iter[3] = vals[0];
1953   vals[2] = table_iter[4];
1954   table_iter[5] = vals[0];
1955   vals[3] = table_iter[6];
1956   table_iter[7] = vals[0];
1957   table_iter = &(table_iter[8]);
1958   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1959     const uint64_t cur_high = vals[high_idx];
1960     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1961       *table_iter++ = vals[low_idx];
1962       *table_iter++ = cur_high;
1963     }
1964   }
1965   // [16][0]..[31][1]: bit 4 is set
1966   // low bits must be 01 or 11
1967   const uint64_t val_phaseinfo0 = table_iter[2];
1968   table_iter[3] = vals[0];
1969   const uint64_t val_phaseinfo1 = table_iter[6];
1970   table_iter[7] = vals[0];
1971   table_iter = &(table_iter[8]);
1972   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1973     const uint64_t cur_high = vals[high_idx];
1974     table_iter[2] = val_phaseinfo0;
1975     table_iter[3] = cur_high;
1976     table_iter[6] = val_phaseinfo1;
1977     table_iter[7] = cur_high;
1978     table_iter = &(table_iter[8]);
1979   }
1980   // [32][0]..[39][1]: bit 5 set, bit 4 unset
1981   // high bits must be 00 or 01
1982   for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
1983     const uint64_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
1984     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1985       *table_iter++ = vals[low_idx];
1986       *table_iter++ = cur_high;
1987     }
1988   }
1989   table_iter = &(table_iter[16]);
1990   // [48][0]..[55][1]: bits 4 and 5 set
1991   for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
1992     const uint64_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
1993     table_iter[2] = val_phaseinfo0;
1994     table_iter[3] = cur_high;
1995     table_iter[6] = val_phaseinfo1;
1996     table_iter[7] = cur_high;
1997     table_iter = &(table_iter[8]);
1998   }
1999 }
2000 
2001 // bits 0..3: two genotypes
2002 // bits 4..5: two (phasepresent | sex_male) bits
2003 // bits 1,3: unpacked phaseinfo xor
2004 void PhaseXNohhLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const uintptr_t* sex_male, const void* table64x4bx2, uint32_t sample_ct, void* result) {
2005   const uint64_t* table_alias = S_CAST(const uint64_t*, table64x4bx2);
2006   const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2007   const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
2008   const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
2009   const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
2010   uint64_t* result_iter = S_CAST(uint64_t*, result);
2011   uint32_t loop_len = kBitsPerWordD4;
2012   uintptr_t geno_word_xored = 0;
2013   uintptr_t male_or_phasepresent_hw_shifted = 0;
2014   for (uint32_t widx = 0; ; ++widx) {
2015     if (widx >= sample_ctl2_m1) {
2016       if (widx > sample_ctl2_m1) {
2017         if (sample_ct % 2) {
2018           uintptr_t cur_idx = (geno_word_xored & 3) | (male_or_phasepresent_hw_shifted & 16);
2019           memcpy(result_iter, &(table_alias[cur_idx]), 4);
2020         }
2021         return;
2022       }
2023       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2024     }
2025     geno_word_xored = genoarr[widx];
2026     male_or_phasepresent_hw_shifted = sex_male_alias[widx];
2027     const uintptr_t phasepresent_hw = phasepresent_alias[widx];
2028     male_or_phasepresent_hw_shifted |= phasepresent_hw;
2029     male_or_phasepresent_hw_shifted <<= 4;
2030     if (!phasepresent_hw) {
2031       for (uint32_t uii = 0; uii != loop_len; ++uii) {
2032         *result_iter++ = table_alias[(geno_word_xored & 15) | (male_or_phasepresent_hw_shifted & 48)];
2033         geno_word_xored >>= 4;
2034         male_or_phasepresent_hw_shifted >>= 2;
2035       }
2036     } else {
2037       geno_word_xored ^= UnpackHalfwordToWordShift1(phaseinfo_alias[widx]);
2038       for (uint32_t uii = 0; uii != loop_len; ++uii) {
2039         const uintptr_t cur_idx = (geno_word_xored & 15) | (male_or_phasepresent_hw_shifted & 48);
2040         *result_iter++ = table_alias[cur_idx];
2041         geno_word_xored >>= 4;
2042         male_or_phasepresent_hw_shifted >>= 2;
2043       }
2044     }
2045   }
2046 }
2047 
2048 void InitPhaseXNohhLookup4b(void* table64x4bx2) {
2049   uint32_t* table_iter = S_CAST(uint32_t*, table64x4bx2);
2050   uint32_t vals[4];
2051   vals[0] = table_iter[0];
2052   table_iter[1] = vals[0];
2053   vals[1] = table_iter[2];
2054   table_iter[3] = vals[0];
2055   vals[2] = table_iter[4];
2056   table_iter[5] = vals[0];
2057   vals[3] = table_iter[6];
2058   table_iter[7] = vals[0];
2059   table_iter = &(table_iter[8]);
2060   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2061     const uint32_t cur_high = vals[high_idx];
2062     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2063       *table_iter++ = vals[low_idx];
2064       *table_iter++ = cur_high;
2065     }
2066   }
2067   // [16][0]..[31][1]: bit 4 is set
2068   uint32_t male_or_phasepresent_vals[4];
2069   for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2070     male_or_phasepresent_vals[low_idx] = *table_iter++;
2071     *table_iter++ = vals[0];
2072   }
2073   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2074     const uint32_t cur_high = vals[high_idx];
2075     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2076       *table_iter++ = male_or_phasepresent_vals[low_idx];
2077       *table_iter++ = cur_high;
2078     }
2079   }
2080   // [32][0]..[47][1]: bit 5 set, bit 4 unset
2081   for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2082     const uint32_t cur_high = male_or_phasepresent_vals[high_idx];
2083     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2084       *table_iter++ = vals[low_idx];
2085       *table_iter++ = cur_high;
2086     }
2087   }
2088   // [48][0]..[63][1]: bits 4 and 5 set
2089   for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2090     const uint32_t cur_high = male_or_phasepresent_vals[high_idx];
2091     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2092       *table_iter++ = male_or_phasepresent_vals[low_idx];
2093       *table_iter++ = cur_high;
2094     }
2095   }
2096 }
2097 
2098 void GenoarrSexLookup4b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x4bx2, uint32_t sample_ct, void* result) {
2099   const uint64_t* table_alias = S_CAST(const uint64_t*, table64x4bx2);
2100   const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2101   const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
2102   uint64_t* result_iter = S_CAST(uint64_t*, result);
2103   uint32_t loop_len = kBitsPerWordD4;
2104   uintptr_t geno_word = 0;
2105   uintptr_t male_hw_shifted = 0;
2106   for (uint32_t widx = 0; ; ++widx) {
2107     if (widx >= sample_ctl2_m1) {
2108       if (widx > sample_ctl2_m1) {
2109         if (sample_ct % 2) {
2110           uintptr_t cur_idx = (geno_word & 3) | (male_hw_shifted & 16);
2111           memcpy(result_iter, &(table_alias[cur_idx]), 4);
2112         }
2113         return;
2114       }
2115       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2116     }
2117     geno_word = genoarr[widx];
2118     male_hw_shifted = sex_male_alias[widx];
2119     male_hw_shifted <<= 4;
2120     for (uint32_t uii = 0; uii != loop_len; ++uii) {
2121       *result_iter++ = table_alias[(geno_word & 15) | (male_hw_shifted & 48)];
2122       geno_word >>= 4;
2123       male_hw_shifted >>= 2;
2124     }
2125   }
2126 }
2127 
2128 void InitPhaseXNohhLookup8b(void* table64x8bx2) {
2129   uint64_t* table_iter = S_CAST(uint64_t*, table64x8bx2);
2130   uint64_t vals[4];
2131   vals[0] = table_iter[0];
2132   table_iter[1] = vals[0];
2133   vals[1] = table_iter[2];
2134   table_iter[3] = vals[0];
2135   vals[2] = table_iter[4];
2136   table_iter[5] = vals[0];
2137   vals[3] = table_iter[6];
2138   table_iter[7] = vals[0];
2139   table_iter = &(table_iter[8]);
2140   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2141     const uint64_t cur_high = vals[high_idx];
2142     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2143       *table_iter++ = vals[low_idx];
2144       *table_iter++ = cur_high;
2145     }
2146   }
2147   // [16][0]..[31][1]: bit 4 is set
2148   uint64_t male_or_phasepresent_vals[4];
2149   for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2150     male_or_phasepresent_vals[low_idx] = *table_iter++;
2151     *table_iter++ = vals[0];
2152   }
2153   for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2154     const uint64_t cur_high = vals[high_idx];
2155     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2156       *table_iter++ = male_or_phasepresent_vals[low_idx];
2157       *table_iter++ = cur_high;
2158     }
2159   }
2160   // [32][0]..[47][1]: bit 5 set, bit 4 unset
2161   for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2162     const uint64_t cur_high = male_or_phasepresent_vals[high_idx];
2163     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2164       *table_iter++ = vals[low_idx];
2165       *table_iter++ = cur_high;
2166     }
2167   }
2168   // [48][0]..[63][1]: bits 4 and 5 set
2169   for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2170     const uint64_t cur_high = male_or_phasepresent_vals[high_idx];
2171     for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2172       *table_iter++ = male_or_phasepresent_vals[low_idx];
2173       *table_iter++ = cur_high;
2174     }
2175   }
2176 }
2177 
2178 #ifdef __LP64__
2179 void GenoarrSexLookup8b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x8bx2, uint32_t sample_ct, void* result) {
2180   const __m128i* table_alias = S_CAST(const __m128i*, table64x8bx2);
2181   const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2182   const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
2183   __m128i* result_iter = S_CAST(__m128i*, result);
2184   uint32_t loop_len = kBitsPerWordD4;
2185   uintptr_t geno_word = 0;
2186   uintptr_t male_hw_shifted = 0;
2187   for (uint32_t widx = 0; ; ++widx) {
2188     if (widx >= sample_ctl2_m1) {
2189       if (widx > sample_ctl2_m1) {
2190         if (sample_ct % 2) {
2191           uintptr_t cur_idx = (geno_word & 3) | (male_hw_shifted & 16);
2192           memcpy(result_iter, &(table_alias[cur_idx]), 8);
2193         }
2194         return;
2195       }
2196       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2197     }
2198     geno_word = genoarr[widx];
2199     male_hw_shifted = sex_male_alias[widx];
2200     male_hw_shifted <<= 4;
2201     for (uint32_t uii = 0; uii != loop_len; ++uii) {
2202       _mm_storeu_si128(result_iter, table_alias[(geno_word & 15) | (male_hw_shifted & 48)]);
2203       ++result_iter;
2204       geno_word >>= 4;
2205       male_hw_shifted >>= 2;
2206     }
2207   }
2208 }
2209 #else
2210 void GenoarrSexLookup8b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x8bx2, uint32_t sample_ct, void* result) {
2211   const uint64_t* table_alias = S_CAST(const uint64_t*, table64x8bx2);
2212   const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2213   const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
2214   uint64_t* result_iter = S_CAST(uint64_t*, result);
2215   uint32_t loop_len = kBitsPerWordD4;
2216   uintptr_t geno_word = 0;
2217   uintptr_t male_hw_shifted = 0;
2218   for (uint32_t widx = 0; ; ++widx) {
2219     if (widx >= sample_ctl2_m1) {
2220       if (widx > sample_ctl2_m1) {
2221         if (sample_ct % 2) {
2222           uintptr_t cur_idx = (geno_word & 3) | (male_hw_shifted & 16);
2223           memcpy(result_iter, &(table_alias[cur_idx * 2]), 8);
2224         }
2225         return;
2226       }
2227       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2228     }
2229     geno_word = genoarr[widx];
2230     male_hw_shifted = sex_male_alias[widx];
2231     male_hw_shifted <<= 4;
2232     for (uint32_t uii = 0; uii != loop_len; ++uii) {
2233       memcpy(result_iter, &(table_alias[((geno_word & 15) | (male_hw_shifted & 48)) * 2]), 16);
2234       result_iter = &(result_iter[2]);
2235       geno_word >>= 4;
2236       male_hw_shifted >>= 2;
2237     }
2238   }
2239 }
2240 #endif
2241 
2242 void VcfPhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x4bx2, uint32_t sample_ct, void* __restrict result) {
2243   const uint64_t* table_alias = S_CAST(const uint64_t*, table246x4bx2);
2244   const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2245   const Halfword* cur_phased_alias = R_CAST(const Halfword*, cur_phased);
2246   const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
2247   uint64_t* result_iter = S_CAST(uint64_t*, result);
2248   uint32_t loop_len = kBitsPerWordD4;
2249   uintptr_t geno_word = 0;
2250   uintptr_t cur_phased_hw_shifted = 0;
2251   uintptr_t phaseinfo_hw_shifted = 0;
2252   for (uint32_t widx = 0; ; ++widx) {
2253     if (widx >= sample_ctl2_m1) {
2254       if (widx > sample_ctl2_m1) {
2255         if (sample_ct % 2) {
2256           uintptr_t cur_idx = (geno_word & 3) | (cur_phased_hw_shifted & 16) | (phaseinfo_hw_shifted & 64);
2257           memcpy(result_iter, &(table_alias[cur_idx]), 4);
2258         }
2259         return;
2260       }
2261       loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2262     }
2263     geno_word = genoarr[widx];
2264     cur_phased_hw_shifted = cur_phased_alias[widx];
2265     if (!cur_phased_hw_shifted) {
2266       for (uint32_t uii = 0; uii != loop_len; ++uii) {
2267         *result_iter++ = table_alias[geno_word & 15];
2268         geno_word >>= 4;
2269       }
2270     } else {
2271       cur_phased_hw_shifted = cur_phased_hw_shifted << 4;
2272       phaseinfo_hw_shifted = phaseinfo_alias[widx];
2273 
2274       // note that this must be on a separate line (or we have to static_cast)
2275       phaseinfo_hw_shifted = phaseinfo_hw_shifted << 6;
2276 
2277       for (uint32_t uii = 0; uii != loop_len; ++uii) {
2278         const uintptr_t cur_idx = (geno_word & 15) | (cur_phased_hw_shifted & 48) | (phaseinfo_hw_shifted & 192);
2279         *result_iter++ = table_alias[cur_idx];
2280         geno_word >>= 4;
2281         cur_phased_hw_shifted >>= 2;
2282         phaseinfo_hw_shifted >>= 2;
2283       }
2284     }
2285   }
2286 }

void InitVcfPhaseLookup4b(void* table246x4bx2) {
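  // Added commentary (derived from the reads below): the caller is expected
  // to have pre-filled the first 4-byte halves of entries [0]..[3] (the
  // unphased values), [16]..[19] (the phased-unflipped values), and [81]
  // (the phased-flipped het value); this function fills in everything else.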
  uint32_t* table_iter = S_CAST(uint32_t*, table246x4bx2);
  uint32_t unphased_vals[4];
  unphased_vals[0] = table_iter[0];
  table_iter[1] = unphased_vals[0];
  unphased_vals[1] = table_iter[2];
  table_iter[3] = unphased_vals[0];
  unphased_vals[2] = table_iter[4];
  table_iter[5] = unphased_vals[0];
  unphased_vals[3] = table_iter[6];
  table_iter[7] = unphased_vals[0];
  table_iter = &(table_iter[8]);
  for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
    const uint32_t cur_high = unphased_vals[high_idx];
    for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
      *table_iter++ = unphased_vals[low_idx];
      *table_iter++ = cur_high;
    }
  }
  // [16][0]..[31][1]: first entry is phased and unflipped, second is unphased
  uint32_t phased_unflipped_vals[4];
  phased_unflipped_vals[0] = table_iter[0];
  table_iter[1] = unphased_vals[0];
  phased_unflipped_vals[1] = table_iter[2];
  table_iter[3] = unphased_vals[0];
  phased_unflipped_vals[2] = table_iter[4];
  table_iter[5] = unphased_vals[0];
  phased_unflipped_vals[3] = table_iter[6];
  table_iter[7] = unphased_vals[0];
  table_iter = &(table_iter[8]);
  for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
    const uint32_t cur_high = unphased_vals[high_idx];
    for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
      *table_iter++ = phased_unflipped_vals[low_idx];
      *table_iter++ = cur_high;
    }
  }
  // [32][0]..[63][1]: second entry is phased and unflipped
  for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
    const uint32_t cur_high = phased_unflipped_vals[high_idx];
    for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
      *table_iter++ = unphased_vals[low_idx];
      *table_iter++ = cur_high;
    }
  }
  for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
    const uint32_t cur_high = phased_unflipped_vals[high_idx];
    for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
      *table_iter++ = phased_unflipped_vals[low_idx];
      *table_iter++ = cur_high;
    }
  }
  // [64][0]..[79][1] should be impossible
  table_iter = &(table_iter[32]);
  // [80][0]..[95][1]: first entry is phased and flipped, second is unphased
  // genotype must be 01
  const uint32_t phased_flipped_01 = table_iter[2];
  for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
    table_iter[2] = phased_flipped_01;
    table_iter[3] = unphased_vals[high_idx];
    table_iter = &(table_iter[8]);
  }
  // [96][0]..[111][1] should be impossible
  table_iter = &(table_iter[32]);
  // [112][0]..[127][1]: first entry phased-flipped, second phased-unflipped
  for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
    table_iter[2] = phased_flipped_01;
    table_iter[3] = phased_unflipped_vals[high_idx];
    table_iter = &(table_iter[8]);
  }
  // [128][0]..[163][1] should be impossible
  table_iter = &(table_iter[72]);
  // [164][0]..[167][1]: second entry phased-flipped, first entry unphased
  for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
    *table_iter++ = unphased_vals[low_idx];
    *table_iter++ = phased_flipped_01;
  }
  // [168][0]..[179][1] should be impossible
  table_iter = &(table_iter[24]);
  // [180][0]..[183][1]: second entry phased-flipped, first phased-unflipped
  for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
    *table_iter++ = phased_unflipped_vals[low_idx];
    *table_iter++ = phased_flipped_01;
  }
  // [184][0]..[244][1] should be impossible
  // [245][0]..[245][1]: both phased-flipped
  table_iter[122] = phased_flipped_01;
  table_iter[123] = phased_flipped_01;
}

void VcfPhaseLookup2b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x2bx2, uint32_t sample_ct, void* __restrict result) {
  const uint32_t* table_alias = S_CAST(const uint32_t*, table246x2bx2);
  const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
  const Halfword* cur_phased_alias = R_CAST(const Halfword*, cur_phased);
  const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
  uint32_t* result_iter = S_CAST(uint32_t*, result);
  uint32_t loop_len = kBitsPerWordD4;
  uintptr_t geno_word = 0;
  uintptr_t cur_phased_hw_shifted = 0;
  uintptr_t phaseinfo_hw_shifted = 0;
  for (uint32_t widx = 0; ; ++widx) {
    if (widx >= sample_ctl2_m1) {
      if (widx > sample_ctl2_m1) {
        if (sample_ct % 2) {
          uintptr_t cur_idx = (geno_word & 3) | (cur_phased_hw_shifted & 16) | (phaseinfo_hw_shifted & 64);
          memcpy(result_iter, &(table_alias[cur_idx]), 2);
        }
        return;
      }
      loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
    }
    geno_word = genoarr[widx];
    cur_phased_hw_shifted = cur_phased_alias[widx];
    if (!cur_phased_hw_shifted) {
      for (uint32_t uii = 0; uii != loop_len; ++uii) {
        *result_iter++ = table_alias[geno_word & 15];
        geno_word >>= 4;
      }
    } else {
      cur_phased_hw_shifted = cur_phased_hw_shifted << 4;
      phaseinfo_hw_shifted = phaseinfo_alias[widx];

      // note that this must be on a separate line (or we have to static_cast)
      phaseinfo_hw_shifted = phaseinfo_hw_shifted << 6;

      for (uint32_t uii = 0; uii != loop_len; ++uii) {
        const uintptr_t cur_idx = (geno_word & 15) | (cur_phased_hw_shifted & 48) | (phaseinfo_hw_shifted & 192);
        *result_iter++ = table_alias[cur_idx];
        geno_word >>= 4;
        cur_phased_hw_shifted >>= 2;
        phaseinfo_hw_shifted >>= 2;
      }
    }
  }
}

void InitVcfPhaseLookup2b(void* table246x2bx2) {
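  // Same pre-fill contract as InitVcfPhaseLookup4b above, with 2-byte
  // instead of 4-byte table entries.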
  uint16_t* table_iter = S_CAST(uint16_t*, table246x2bx2);
  uint16_t unphased_vals[4];
  unphased_vals[0] = table_iter[0];
  table_iter[1] = unphased_vals[0];
  unphased_vals[1] = table_iter[2];
  table_iter[3] = unphased_vals[0];
  unphased_vals[2] = table_iter[4];
  table_iter[5] = unphased_vals[0];
  unphased_vals[3] = table_iter[6];
  table_iter[7] = unphased_vals[0];
  table_iter = &(table_iter[8]);
  for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
    const uint32_t cur_high = unphased_vals[high_idx];
    for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
      *table_iter++ = unphased_vals[low_idx];
      *table_iter++ = cur_high;
    }
  }
  // [16][0]..[31][1]: first entry is phased and unflipped, second is unphased
  uint16_t phased_unflipped_vals[4];
  phased_unflipped_vals[0] = table_iter[0];
  table_iter[1] = unphased_vals[0];
  phased_unflipped_vals[1] = table_iter[2];
  table_iter[3] = unphased_vals[0];
  phased_unflipped_vals[2] = table_iter[4];
  table_iter[5] = unphased_vals[0];
  phased_unflipped_vals[3] = table_iter[6];
  table_iter[7] = unphased_vals[0];
  table_iter = &(table_iter[8]);
  for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
    const uint32_t cur_high = unphased_vals[high_idx];
    for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
      *table_iter++ = phased_unflipped_vals[low_idx];
      *table_iter++ = cur_high;
    }
  }
  // [32][0]..[63][1]: second entry is phased and unflipped
  for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
    const uint32_t cur_high = phased_unflipped_vals[high_idx];
    for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
      *table_iter++ = unphased_vals[low_idx];
      *table_iter++ = cur_high;
    }
  }
  for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
    const uint32_t cur_high = phased_unflipped_vals[high_idx];
    for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
      *table_iter++ = phased_unflipped_vals[low_idx];
      *table_iter++ = cur_high;
    }
  }
  // [64][0]..[79][1] should be impossible
  table_iter = &(table_iter[32]);
  // [80][0]..[95][1]: first entry is phased and flipped, second is unphased
  // genotype must be 01
  const uint32_t phased_flipped_01 = table_iter[2];
  for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
    table_iter[2] = phased_flipped_01;
    table_iter[3] = unphased_vals[high_idx];
    table_iter = &(table_iter[8]);
  }
  // [96][0]..[111][1] should be impossible
  table_iter = &(table_iter[32]);
  // [112][0]..[127][1]: first entry phased-flipped, second phased-unflipped
  for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
    table_iter[2] = phased_flipped_01;
    table_iter[3] = phased_unflipped_vals[high_idx];
    table_iter = &(table_iter[8]);
  }
  // [128][0]..[163][1] should be impossible
  table_iter = &(table_iter[72]);
  // [164][0]..[167][1]: second entry phased-flipped, first entry unphased
  for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
    *table_iter++ = unphased_vals[low_idx];
    *table_iter++ = phased_flipped_01;
  }
  // [168][0]..[179][1] should be impossible
  table_iter = &(table_iter[24]);
  // [180][0]..[183][1]: second entry phased-flipped, first phased-unflipped
  for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
    *table_iter++ = phased_unflipped_vals[low_idx];
    *table_iter++ = phased_flipped_01;
  }
  // [184][0]..[244][1] should be impossible
  // [245][0]..[245][1]: both phased-flipped
  table_iter[122] = phased_flipped_01;
  table_iter[123] = phased_flipped_01;
}


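// Added commentary (derived from the code below): this scans the sparse
// value list for entries whose genotype in genoarr is missing (0b11),
// compacts sparse_vals in place to drop them, clears the matching subset
// bits, and shrinks *subset_sizep accordingly.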
void ClearGenoarrMissing1bit8Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals) {
  const uint32_t orig_subset_size = *subset_sizep;
  Halfword* subset_alias = R_CAST(Halfword*, subset);
  uint32_t read_idx = 0;
  // deliberate overflow
  for (uint32_t read_widx = UINT32_MAX; ; ) {
    uint32_t subset_bits;
    do {
      subset_bits = subset_alias[++read_widx];
    } while (!subset_bits);
    uintptr_t detect_11 = genoarr[read_widx];
    detect_11 = detect_11 & (detect_11 >> 1) & kMask5555;
    if (detect_11) {
      uint32_t detect_11_hw = PackWordToHalfword(detect_11);
      const uint32_t joint_u32 = subset_bits & detect_11_hw;
      if (joint_u32) {
        uintptr_t lowbit = joint_u32 & (-joint_u32);
        uint32_t write_idx = read_idx + PopcountWord(subset_bits & (lowbit - 1));
        read_idx = write_idx + 1;
        uint32_t subset_bits_write = subset_bits ^ lowbit;
        unsigned char* sparse_vals_uc = S_CAST(unsigned char*, sparse_vals);
        subset_bits &= -(2 * lowbit);
        for (; read_idx != orig_subset_size; ++read_idx) {
#ifdef USE_AVX2
          if (!subset_bits) {
            subset_alias[read_widx] = subset_bits_write;
            do {
              subset_bits = subset_alias[++read_widx];
            } while (!subset_bits);
            subset_bits_write = subset_bits;
            detect_11 = genoarr[read_widx];
            detect_11 = detect_11 & (detect_11 >> 1);
            detect_11_hw = PackWordToHalfwordMask5555(detect_11);
          }
          lowbit = subset_bits & (-subset_bits);
          subset_bits ^= lowbit;
          if (lowbit & detect_11_hw) {
            subset_bits_write ^= lowbit;
            continue;
          }
#else
          if (!subset_bits) {
            subset_alias[read_widx] = subset_bits_write;
            do {
              subset_bits = subset_alias[++read_widx];
            } while (!subset_bits);
            subset_bits_write = subset_bits;
            detect_11 = genoarr[read_widx];
            detect_11 = detect_11 & (detect_11 >> 1);
          }
          lowbit = subset_bits & (-subset_bits);
          subset_bits ^= lowbit;
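          // lowbit is bit k of the halfword, so lowbit * lowbit is bit 2k,
          // which lines up with the corresponding 2-bit genotype slot in
          // detect_11.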
          if ((lowbit * lowbit) & detect_11) {
            subset_bits_write ^= lowbit;
            continue;
          }
#endif
          sparse_vals_uc[write_idx++] = sparse_vals_uc[read_idx];
        }
        subset_alias[read_widx] = subset_bits_write;
        *subset_sizep = write_idx;
        return;
      }
    }
    read_idx += PopcountWord(subset_bits);
    if (read_idx == orig_subset_size) {
      return;
    }
  }
}

void ClearGenoarrMissing1bit16Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals) {
  const uint32_t orig_subset_size = *subset_sizep;
  Halfword* subset_alias = R_CAST(Halfword*, subset);
  uint32_t read_idx = 0;
  // deliberate overflow
  for (uint32_t read_widx = UINT32_MAX; ; ) {
    uint32_t subset_bits;
    do {
      subset_bits = subset_alias[++read_widx];
    } while (!subset_bits);
    uintptr_t detect_11 = genoarr[read_widx];
    detect_11 = detect_11 & (detect_11 >> 1) & kMask5555;
    if (detect_11) {
      uint32_t detect_11_hw = PackWordToHalfword(detect_11);
      const uint32_t joint_u32 = subset_bits & detect_11_hw;
      if (joint_u32) {
        uintptr_t lowbit = joint_u32 & (-joint_u32);
        uint32_t write_idx = read_idx + PopcountWord(subset_bits & (lowbit - 1));
        read_idx = write_idx + 1;
        uint32_t subset_bits_write = subset_bits ^ lowbit;
        uint16_t* sparse_vals_u16 = S_CAST(uint16_t*, sparse_vals);
        subset_bits &= -(2 * lowbit);
        for (; read_idx != orig_subset_size; ++read_idx) {
#ifdef USE_AVX2
          if (!subset_bits) {
            subset_alias[read_widx] = subset_bits_write;
            do {
              subset_bits = subset_alias[++read_widx];
            } while (!subset_bits);
            subset_bits_write = subset_bits;
            detect_11 = genoarr[read_widx];
            detect_11 = detect_11 & (detect_11 >> 1);
            detect_11_hw = PackWordToHalfwordMask5555(detect_11);
          }
          lowbit = subset_bits & (-subset_bits);
          subset_bits ^= lowbit;
          if (lowbit & detect_11_hw) {
            subset_bits_write ^= lowbit;
            continue;
          }
#else
          if (!subset_bits) {
            subset_alias[read_widx] = subset_bits_write;
            do {
              subset_bits = subset_alias[++read_widx];
            } while (!subset_bits);
            subset_bits_write = subset_bits;
            detect_11 = genoarr[read_widx];
            detect_11 = detect_11 & (detect_11 >> 1);
          }
          lowbit = subset_bits & (-subset_bits);
          subset_bits ^= lowbit;
          if ((lowbit * lowbit) & detect_11) {
            subset_bits_write ^= lowbit;
            continue;
          }
#endif
          sparse_vals_u16[write_idx++] = sparse_vals_u16[read_idx];
        }
        subset_alias[read_widx] = subset_bits_write;
        *subset_sizep = write_idx;
        return;
      }
    }
    read_idx += PopcountWord(subset_bits);
    if (read_idx == orig_subset_size) {
      return;
    }
  }
}

double MultiallelicDiploidMinimac3R2(const uint64_t* __restrict sums, const uint64_t* __restrict hap_ssqs_x2, uint32_t nm_sample_ct, uint32_t allele_ct, uint32_t extra_phased_het_ct) {
  // sums[k] == sum_i [left_dosage_{ik} + right_dosage_{ik}]
  // hap_ssqs_x2[k] ==
  //   2 * sum_i [(left_dosage_{ik})^2 + (right_dosage_{ik})^2]
  //   This may be odd, since it's computed as
  //     (left + right)^2 + (left - right)^2
  //   and the parities of the two integers can be different.
  // For phased hardcalls, it is fine for the hap_ssqs_x2[k] values to
  // correspond to unphased hardcalls iff extra_phased_het_ct is the number of
  // phased-hets that weren't accounted for in hap_ssqs_x2[]; this makes it
  // straightforward for GetMultiallelicCountsAndDosage16s to stick to the
  // custom internal multiallelic-count functions.
  if (!nm_sample_ct) {
    return (0.0 / 0.0);
  }
  // Allelic phased-dosages are on a (k-1)-dimensional simplex; embed this in
  // R^k as the (1, 0, ..., 0), (0, 1, ..., 0), ..., (0, 0, ..., 1) polytope.
  // Define
  //   m_k := (1/2n) * sum_i [left_dosage_{ik} + right_dosage_{ik}]
  // Minimac3-r2 is defined as empirical phased-dosage variance divided by
  // expected-under-allele-frequencies variance.
  // Expected sum-of-squared-Euclidean-distance with perfect imputation is
  //   2n("1"^2 - sum_k ((m_k)^2))
  // and observed sum-of-squared-distance is
  //   sum_k (sum_i [(left_dosage_{ik})^2 + (right_dosage_{ik})^2] -
  //          2n((m_k)^2))

  // ssq_sum_x2 can be as large as 2^31 * nm_sample_ct, and meansq_sum can
  // cancel as little as (1 / allele_ct) of it; so the uint64 product
  // ssq_sum_x2 * nm_sample_ct below only avoids overflow when
  // nm_sample_ct^2 * 2^31 < 2^64, i.e. nm_sample_ct < sqrt(2^33) ~= 92682.
  if (nm_sample_ct < 92682) {
    uint64_t ssq_sum_x2 = extra_phased_het_ct * 0x20000000LLU;
    uint64_t meansq_sum = 0;
    for (uint32_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
      const uint64_t cur_allele_dosage = sums[allele_idx];
      ssq_sum_x2 += hap_ssqs_x2[allele_idx];
      // cur_allele_dosage == 2n * m_k
      // -> meansq_sum becomes 2n * sum_k [2n((m_k)^2)]
      meansq_sum += cur_allele_dosage * cur_allele_dosage;
    }
    const uint64_t observed_variance_times_2n = ssq_sum_x2 * nm_sample_ct - meansq_sum;
    // "1"^2 -> 16384^2 in our units.  So 2n * 2n * "1"^2 is equal to
    //   n * n * 16384^2 * 4.
    const uint64_t expected_variance_times_2n = nm_sample_ct * 0x40000000LLU * nm_sample_ct - meansq_sum;
    // mach_r2 == 1 test cases:
    // - AA, AB, BB: 1, 4, 4
    //   sums[0] = 6 * 2^14
    //   sums[1] = 12 * 2^14
    //   ssqs[0] = 8 * 2^28
    //   ssqs[1] = 20 * 2^28
    //   ssq_sum = (8 + 20) * 2^28
    //   meansq_sum = (6 * 6 + 12 * 12) * 2^28
    //   observed_variance = 28 * 9 * 2^28 - 180 * 2^28
    //   expected_variance = (9 * 9 * 4 * 2^28 - 180 * 2^28) / 2
    // - AA, AB, BB, AC, BC, CC: 1, 4, 4, 6, 12, 9
    //   sums[0] = 12 * 2^14
    //   sums[1] = 24 * 2^14
    //   sums[2] = 36 * 2^14
    //   ssqs[0] = 14 * 2^28
    //   ssqs[1] = 32 * 2^28
    //   ssqs[2] = 54 * 2^28
    //   ssq_sum = (14 + 32 + 54) * 2^28
    //   meansq_sum = (12 * 12 + 24 * 24 + 36 * 36) * 2^28
    //   observed_variance = 100 * 36 * 2^28 - 56 * 36 * 2^28
    //   expected_variance = (36 * 36 * 4 * 2^28 - 56 * 36 * 2^28) / 2
    return S_CAST(double, observed_variance_times_2n) / S_CAST(double, expected_variance_times_2n);
  }
  // Need to avoid catastrophic cancellation here.
  const uint64_t nm_sample_ct_x32768 = nm_sample_ct * 0x8000LLU;
  double expected_variance_times_2n = 0.0;
  double observed_variance_times_2n_hi = 0.0;
  int64_t observed_variance_times_2n_lo = 0;
  for (uint32_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
    const uint64_t cur_allele_dosage = sums[allele_idx];
    const uint64_t cur_ssq_x2 = hap_ssqs_x2[allele_idx];
    const double cur_allele_dosaged = u63tod(cur_allele_dosage);
    // Restructure expected_variance as sum/product of
    // guaranteed-nonnegative numbers.
    //
    // cur_allele_dosage == 2n * m_k
    // cur_allele_dosage[k] * ("1" * 2n - cur_allele_dosage[k])
    // = "1" * 4n^2 * m_k - 4n^2 * m_k^2
    //
    // sum_k (cur_allele_dosage[k] * ("1" * 2n - cur_allele_dosage[k]))
    // = "1" * 4n^2 * sum_k m_k - 4n^2 * sum_k m_k^2
    // = "1" * 4n^2 * "1" - 4n^2 * sum_k m_k^2
    // = 4n^2 * ("1"^2 - sum_k m_k^2)
    // This is 2n times the expected variance.
    expected_variance_times_2n += cur_allele_dosaged * u63tod(nm_sample_ct_x32768 - cur_allele_dosage);
    // Emulate 128-bit integers to make observed_variance computation work.
    // "left" corresponds to the ssq_sum_x2 * nm_sample_ct term, "right"
    // corresponds to meansq_sum.
    const uint64_t cur_ssq_x2_hi = cur_ssq_x2 >> 32;
    uint64_t left_lo = (cur_ssq_x2 & 0xffffffffLLU) * nm_sample_ct;
    const uint64_t left_hi = (left_lo >> 32) + cur_ssq_x2_hi * nm_sample_ct;
    left_lo &= 0xffffffffU;
    const uint64_t cur_allele_dosage_lo = cur_allele_dosage & 0xffffffffLLU;
    const uint64_t cur_allele_dosage_hi = cur_allele_dosage >> 32;
    uint64_t right_lo = cur_allele_dosage_lo * cur_allele_dosage_lo;
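    // The next line computes the bits-32-and-up portion of the 128-bit
    // square: (lo + dosage) * hi == 2 * lo * hi + ((hi * hi) << 32), and
    // (right_lo >> 32) carries in the overflow from the low 32 bits.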
    const uint64_t right_hi = (right_lo >> 32) + (cur_allele_dosage_lo + cur_allele_dosage) * cur_allele_dosage_hi;
    right_lo &= 0xffffffffU;
    observed_variance_times_2n_hi += u63tod(left_hi - right_hi);
    observed_variance_times_2n_lo += S_CAST(int64_t, left_lo) - S_CAST(int64_t, right_lo);
  }
  const double observed_variance_times_2n = (observed_variance_times_2n_hi * 4294967296.0) + (u31tod(extra_phased_het_ct) * 536870912.0) + observed_variance_times_2n_lo;
  return observed_variance_times_2n / expected_variance_times_2n;
}

void PgrDifflistToGenovecUnsafe(const uintptr_t* __restrict raregeno, const uint32_t* difflist_sample_ids, uintptr_t difflist_common_geno, uint32_t sample_ct, uint32_t difflist_len, uintptr_t* __restrict genovec) {
  // Ok for trailing bits of raregeno to be nonzero.  Does not zero out
  // trailing bits of genovec.
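  // Illustrative example (hypothetical values): with sample_ct == 5,
  // difflist_common_geno == 2, difflist_sample_ids == {1, 3}, and the low
  // nyps of raregeno == {0, 3}, the per-sample genotypes in genovec become
  // {2, 0, 2, 3, 2}.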
  const uint32_t vec_ct = NypCtToVecCt(sample_ct);
  // could just memset up to word boundary; this should be a bit more
  // vector-instruction-friendly, though
  vecset(genovec, difflist_common_geno * kMask5555, vec_ct);
  const uintptr_t* raregeno_incr = raregeno;
  uint32_t difflist_idx = 0;
  uint32_t difflist_idx_stop = kBitsPerWordD2;
  if (!difflist_common_geno) {
    // faster inner loop since there's no existing value to mask out
    // todo: check if this should just be deleted since the code bloat causes
    // too many more cache misses
    for (; ; difflist_idx_stop += kBitsPerWordD2) {
      if (difflist_idx_stop > difflist_len) {
        if (difflist_idx == difflist_len) {
          return;
        }
        difflist_idx_stop = difflist_len;
      }
      uintptr_t raregeno_word = *raregeno_incr++;
      for (; difflist_idx != difflist_idx_stop; ++difflist_idx) {
        const uint32_t cur_sample_idx = difflist_sample_ids[difflist_idx];
        genovec[cur_sample_idx / kBitsPerWordD2] |= (raregeno_word & 3) << (2 * (cur_sample_idx % kBitsPerWordD2));
        raregeno_word >>= 2;
      }
    }
  }
  for (; ; difflist_idx_stop += kBitsPerWordD2) {
    if (difflist_idx_stop > difflist_len) {
      if (difflist_idx == difflist_len) {
        return;
      }
      difflist_idx_stop = difflist_len;
    }
    uintptr_t raregeno_word = *raregeno_incr++;
    for (; difflist_idx != difflist_idx_stop; ++difflist_idx) {
      const uint32_t cur_sample_idx = difflist_sample_ids[difflist_idx];
      AssignNyparrEntry(cur_sample_idx, raregeno_word & 3, genovec);
      raregeno_word >>= 2;
    }
  }
}

const uint16_t kHcToAlleleCodes[1024] = QUAD_TABLE256(0, 0x100, 0x101, 0xffff);
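// Added commentary: each uint16 above packs two 1-byte allele codes in
// little-endian order, so hardcall 0 maps to {0, 0}, 1 to {0, 1}, 2 to
// {1, 1}, and 3 (missing) to {0xff, 0xff}.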

static_assert(sizeof(AlleleCode) == 1, "PglMultiallelicSparseToDenseMiss() needs to be updated.");
void PglMultiallelicSparseToDenseMiss(const PgenVariant* pgvp, uint32_t sample_ct, AlleleCode* __restrict wide_codes) {
  GenoarrLookup256x2bx4(pgvp->genovec, kHcToAlleleCodes, sample_ct, wide_codes);
  const uint32_t patch_01_ct = pgvp->patch_01_ct;
  if (patch_01_ct) {
    const uintptr_t* patch_01_set = pgvp->patch_01_set;
    uintptr_t sample_idx_base = 0;
    uintptr_t cur_bits = patch_01_set[0];
    const AlleleCode* patch_01_vals = pgvp->patch_01_vals;
    AlleleCode* wide_codes1 = &(wide_codes[1]);
    for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
      const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
      wide_codes1[2 * sample_idx] = patch_01_vals[uii];
    }
  }
  const uint32_t patch_10_ct = pgvp->patch_10_ct;
  if (patch_10_ct) {
    const uintptr_t* patch_10_set = pgvp->patch_10_set;
    uintptr_t sample_idx_base = 0;
    uintptr_t cur_bits = patch_10_set[0];
    const DoubleAlleleCode* patch_10_vals_alias = R_CAST(const DoubleAlleleCode*, pgvp->patch_10_vals);
    DoubleAlleleCode* wide_codes_alias = R_CAST(DoubleAlleleCode*, wide_codes);
    for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
      const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
      wide_codes_alias[sample_idx] = patch_10_vals_alias[uii];
    }
  }
}

#ifdef __cplusplus
}  // namespace plink2
#endif