1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "pgenlib_misc.h"
19
20 #ifdef __cplusplus
21 namespace plink2 {
22 #endif
23
24 #ifdef USE_AVX2
25 void CopyNyparrNonemptySubset(const uintptr_t* __restrict raw_nyparr, const uintptr_t* __restrict subset_mask, uint32_t raw_nyparr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_nyparr) {
26 if (subset_entry_ct == raw_nyparr_entry_ct) {
27 memcpy(output_nyparr, raw_nyparr, DivUp(subset_entry_ct, kBitsPerWordD2) * sizeof(intptr_t));
28 ZeroTrailingNyps(subset_entry_ct, output_nyparr);
29 return;
30 }
31 assert(subset_entry_ct);
32 uintptr_t cur_output_word = 0;
33
34 uintptr_t* output_nyparr_iter = output_nyparr;
35
36 uintptr_t* output_nyparr_last = &(output_nyparr[subset_entry_ct / kBitsPerWordD2]);
37 const uint32_t word_write_shift_end = 2 * (subset_entry_ct % kBitsPerWordD2);
38 uint32_t word_write_shift = 0;
39 for (uint32_t subset_mask_widx = 0; ; ++subset_mask_widx) {
40 const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
41 if (cur_include_word) {
42 uint32_t cur_include_halfword = S_CAST(Halfword, cur_include_word);
43 for (uint32_t wordhalf_idx = 0; ; ++wordhalf_idx) {
44 if (cur_include_halfword) {
45 uintptr_t extracted_bits = raw_nyparr[subset_mask_widx * 2 + wordhalf_idx];
46 uint32_t set_bit_ct = kBitsPerWord;
47 if (cur_include_halfword != UINT32_MAX) {
48 const uintptr_t pext_mask = 3 * UnpackHalfwordToWord(cur_include_halfword);
49 extracted_bits = _pext_u64(extracted_bits, pext_mask);
50 set_bit_ct = PopcountWord(pext_mask);
51 }
52 cur_output_word |= extracted_bits << word_write_shift;
53 word_write_shift += set_bit_ct;
54 if (word_write_shift >= kBitsPerWord) {
55 *output_nyparr_iter++ = cur_output_word;
56 word_write_shift -= kBitsPerWord;
57 cur_output_word = 0;
58 if (word_write_shift) {
59 cur_output_word = extracted_bits >> (set_bit_ct - word_write_shift);
60 }
61 }
62 }
63 if (wordhalf_idx) {
64 break;
65 }
66 cur_include_halfword = cur_include_word >> kBitsPerWordD2;
67 }
68 if (output_nyparr_iter == output_nyparr_last) {
69 if (word_write_shift == word_write_shift_end) {
70 if (word_write_shift_end) {
71 *output_nyparr_last = cur_output_word;
72 }
73 return;
74 }
75 }
76 }
77 }
78 }
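// Worked example of the _pext_u64()-based gather above (exposition only, not
// part of the library): if the low subset halfword is 0b1010 (entries 1 and 3
// selected), then UnpackHalfwordToWord(0b1010) == 0b01000100 and
// pext_mask == 3 * 0b01000100 == 0b11001100, so _pext_u64() packs entries 1
// and 3 (bits 2-3 and 6-7 of the raw nyparr word) into the low 4 bits of
// extracted_bits, lowest entry first.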
79
80 // write_bit_idx_start assumed to be < kBitsPerWord
81 void CopyGenomatchSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t write_bit_idx_start, uint32_t bit_ct, uintptr_t* __restrict output_bitarr) {
82 const uint32_t bit_idx_end = bit_ct + write_bit_idx_start;
83 const uint32_t bit_idx_end_lowbits = bit_idx_end % kBitsPerWord;
84 const Halfword* raw_bitarr_alias = R_CAST(const Halfword*, raw_bitarr);
85 uintptr_t* output_bitarr_iter = output_bitarr;
86 uintptr_t* output_bitarr_last = &(output_bitarr[bit_idx_end / kBitsPerWord]);
87 uintptr_t cur_output_word = 0;
88 uint32_t read_widx = UINT32_MAX; // deliberate overflow
89 uint32_t write_idx_lowbits = write_bit_idx_start;
90 while ((output_bitarr_iter != output_bitarr_last) || (write_idx_lowbits != bit_idx_end_lowbits)) {
91 uintptr_t cur_mask_word;
92 // sparse genoarr optimization
93 // guaranteed to terminate since there's at least one more set bit
94 do {
95 // todo: try reading two genoarr words at a time. would need to be very
96 // careful with the possible trailing word, though.
97 // more important to optimize this function now that regular phased-call
98 // handling code is using it.
99 cur_mask_word = genoarr[++read_widx] ^ match_word;
100 cur_mask_word = (~(cur_mask_word | (cur_mask_word >> 1))) & kMask5555;
101 } while (!cur_mask_word);
102 uintptr_t extracted_bits = raw_bitarr_alias[read_widx];
103 uint32_t set_bit_ct = kBitsPerWordD2;
104 if (cur_mask_word != kMask5555) {
105 const uintptr_t cur_mask_hw = PackWordToHalfword(cur_mask_word);
106 set_bit_ct = PopcountWord(cur_mask_word);
107 extracted_bits = _pext_u64(extracted_bits, cur_mask_hw);
108 }
109 cur_output_word |= extracted_bits << write_idx_lowbits;
110 const uint32_t new_write_idx_lowbits = write_idx_lowbits + set_bit_ct;
111 if (new_write_idx_lowbits >= kBitsPerWord) {
112 *output_bitarr_iter++ = cur_output_word;
113 // ...and these are the bits that fell off
114 // impossible for write_idx_lowbits to be zero here
115 cur_output_word = extracted_bits >> (kBitsPerWord - write_idx_lowbits);
116 }
117 write_idx_lowbits = new_write_idx_lowbits % kBitsPerWord;
118 }
119 if (write_idx_lowbits) {
120 *output_bitarr_iter = cur_output_word;
121 }
122 }
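// A worked example of the match-mask derivation above, which recurs in
// several functions below (exposition only): with match_word == kMaskAAAA
// (target genotype 0b10), XORing a genoarr word with match_word turns
// matching entries into 0b00, and (~(x | (x >> 1))) & kMask5555 then leaves a
// 1 in the low bit of exactly those entries.  Per entry: 0b10 ^ 0b10 == 0b00
// -> ~(0b00 | 0b00) & 0b01 == 0b01, while 0b11 ^ 0b10 == 0b01 ->
// ~(0b01 | 0b00) & 0b01 == 0b00.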
123
124 // Variant of ExpandBytearr() which is based off a target 2-bit value instead
125 // of single expand_mask bits. expand_size must be the number of instances
126 // of the target value in genoarr.
127 void ExpandBytearrFromGenoarr(const void* __restrict compact_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t genoword_ct, uint32_t expand_size, uint32_t read_start_bit, uintptr_t* __restrict target) {
128 const uint32_t expand_sizex_m1 = expand_size + read_start_bit - 1;
129 const uint32_t leading_byte_ct = 1 + (expand_sizex_m1 % kBitsPerWord) / CHAR_BIT;
130 const uint32_t genoword_ct_m1 = genoword_ct - 1;
131 uintptr_t compact_word = SubwordLoad(compact_bitarr, leading_byte_ct) >> read_start_bit;
132 const uintptr_t* compact_bitarr_iter = R_CAST(const uintptr_t*, &(S_CAST(const unsigned char*, compact_bitarr)[leading_byte_ct]));
133 uint32_t compact_idx_lowbits = read_start_bit + CHAR_BIT * (sizeof(intptr_t) - leading_byte_ct);
134 for (uint32_t widx = 0; ; widx += 2) {
135 uintptr_t mask_word;
136 if (widx >= genoword_ct_m1) {
137 if (widx > genoword_ct_m1) {
138 return;
139 }
140 mask_word = 0;
141 } else {
142 const uintptr_t geno_word1 = genoarr[widx + 1] ^ match_word;
143 mask_word = PackWordToHalfwordMask5555(~(geno_word1 | (geno_word1 >> 1)));
144 mask_word = mask_word << 32;
145 }
146 const uintptr_t geno_word0 = genoarr[widx] ^ match_word;
147 mask_word |= PackWordToHalfwordMask5555(~(geno_word0 | (geno_word0 >> 1)));
148 uintptr_t write_word = 0;
149 if (mask_word) {
150 const uint32_t mask_set_ct = PopcountWord(mask_word);
151 uint32_t next_compact_idx_lowbits = compact_idx_lowbits + mask_set_ct;
152 if (next_compact_idx_lowbits <= kBitsPerWord) {
153 write_word = _pdep_u64(compact_word, mask_word);
154 if (mask_set_ct != kBitsPerWord) {
155 compact_word = compact_word >> mask_set_ct;
156 } else {
157 // avoid nasal demons
158 compact_word = 0;
159 }
160 } else {
161 # ifdef __arm__
162 # error "Unaligned accesses in ExpandBytearrFromGenoarr()."
163 # endif
164 uintptr_t next_compact_word = *compact_bitarr_iter++;
165 next_compact_idx_lowbits -= kBitsPerWord;
166 compact_word |= next_compact_word << (kBitsPerWord - compact_idx_lowbits);
167 write_word = _pdep_u64(compact_word, mask_word);
168 if (next_compact_idx_lowbits != kBitsPerWord) {
169 compact_word = next_compact_word >> next_compact_idx_lowbits;
170 } else {
171 compact_word = 0;
172 }
173 }
174 compact_idx_lowbits = next_compact_idx_lowbits;
175 }
176 target[widx / 2] = write_word;
177 }
178 }
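// _pdep_u64() sketch for the expansion above (exposition only): with
// mask_word == 0b0101 (two selected low-bit positions) and
// compact_word == 0b10, _pdep_u64(compact_word, mask_word) scatters the two
// low bits of compact_word onto the set positions of mask_word, yielding
// 0b0100.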
179 #else // !USE_AVX2
180 void CopyNyparrNonemptySubset(const uintptr_t* __restrict raw_nyparr, const uintptr_t* __restrict subset_mask, uint32_t raw_nyparr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_nyparr) {
181 if (subset_entry_ct == raw_nyparr_entry_ct) {
182 // subset_mask may be nullptr in this case
183 memcpy(output_nyparr, raw_nyparr, DivUp(subset_entry_ct, kBitsPerWordD2) * sizeof(intptr_t));
184 ZeroTrailingNyps(subset_entry_ct, output_nyparr);
185 return;
186 }
187 assert(subset_entry_ct);
188 assert(raw_nyparr_entry_ct >= subset_entry_ct);
189 uintptr_t cur_output_word = 0;
190
191 uintptr_t* output_nyparr_iter = output_nyparr;
192
193 uintptr_t* output_nyparr_last = &(output_nyparr[subset_entry_ct / kBitsPerWordD2]);
194 const uint32_t word_write_halfshift_end = subset_entry_ct % kBitsPerWordD2;
195 uint32_t word_write_halfshift = 0;
196 // if <= 2/3-filled, use sparse copy algorithm
197 // (tried the CopyBitarrSubset() approach, which actually worsened things)
198 if (subset_entry_ct * (3 * k1LU) <= raw_nyparr_entry_ct * (2 * k1LU)) {
199 for (uint32_t subset_mask_widx = 0; ; ++subset_mask_widx) {
200 const uintptr_t cur_include_word = subset_mask[subset_mask_widx];
201 if (cur_include_word) {
202 uint32_t cur_include_halfword = S_CAST(Halfword, cur_include_word);
203 for (uint32_t wordhalf_idx = 0; ; ++wordhalf_idx) {
204 if (cur_include_halfword) {
205 uintptr_t raw_nyparr_word = raw_nyparr[subset_mask_widx * 2 + wordhalf_idx];
206 do {
207 uint32_t rqa_idx_lowbits = ctzu32(cur_include_halfword);
208 cur_output_word |= ((raw_nyparr_word >> (rqa_idx_lowbits * 2)) & 3) << (word_write_halfshift * 2);
209 if (++word_write_halfshift == kBitsPerWordD2) {
210 *output_nyparr_iter++ = cur_output_word;
211 word_write_halfshift = 0;
212 cur_output_word = 0;
213 }
214 cur_include_halfword &= cur_include_halfword - 1;
215 } while (cur_include_halfword);
216 }
217 if (wordhalf_idx) {
218 break;
219 }
220 cur_include_halfword = cur_include_word >> kBitsPerWordD2;
221 }
222 if (output_nyparr_iter == output_nyparr_last) {
223 if (word_write_halfshift == word_write_halfshift_end) {
224 if (word_write_halfshift_end) {
225 *output_nyparr_last = cur_output_word;
226 }
227 return;
228 }
229 }
230 }
231 }
232 }
233 // blocked copy
234 const uintptr_t* raw_nyparr_iter = raw_nyparr;
235 for (; ; ++subset_mask) {
236 const uintptr_t cur_include_word = *subset_mask;
237 uintptr_t cur_include_halfword = S_CAST(Halfword, cur_include_word);
238 for (uint32_t wordhalf_idx = 0; ; ++wordhalf_idx) {
239 uintptr_t raw_nyparr_word = *raw_nyparr_iter++;
240 while (cur_include_halfword) {
241 const uint32_t rqa_idx_lowbits = ctzw(cur_include_halfword);
242
243 // TAOCP, 7.1.3, (43).
244 const uintptr_t bottom_block_remover = (cur_include_halfword | (cur_include_halfword - 1)) + 1;
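// e.g. (exposition only) if cur_include_halfword == 0b01101100, then
// x | (x - 1) fills the zeros below the bottom block (-> 0b01101111) and
// adding 1 clears that block entirely (-> 0b01110000); the AND at the
// bottom of this loop then removes the block just copied.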
245
246 const uintptr_t raw_nyparr_curblock_unmasked = raw_nyparr_word >> (rqa_idx_lowbits * 2);
247 const uint32_t rqa_block_len = ctzw(bottom_block_remover) - rqa_idx_lowbits;
248 const uint32_t block_len_limit = kBitsPerWordD2 - word_write_halfshift;
249 cur_output_word |= raw_nyparr_curblock_unmasked << (2 * word_write_halfshift);
250 if (rqa_block_len < block_len_limit) {
251 word_write_halfshift += rqa_block_len;
252 cur_output_word = bzhi(cur_output_word, word_write_halfshift * 2);
253 } else {
254 // no need to mask, extra bits vanish off the high end
255 *output_nyparr_iter++ = cur_output_word;
256 word_write_halfshift = rqa_block_len - block_len_limit;
257 if (word_write_halfshift) {
258 cur_output_word = bzhi(raw_nyparr_curblock_unmasked >> (2 * block_len_limit), 2 * word_write_halfshift);
259 } else {
260 // avoid potential right-shift-[word length]
261 cur_output_word = 0;
262 }
263 }
264 cur_include_halfword &= bottom_block_remover;
265 }
266 if (wordhalf_idx) {
267 break;
268 }
269 cur_include_halfword = cur_include_word >> kBitsPerWordD2;
270 }
271 if (output_nyparr_iter == output_nyparr_last) {
272 if (word_write_halfshift == word_write_halfshift_end) {
273 if (word_write_halfshift_end) {
274 *output_nyparr_last = cur_output_word;
275 }
276 return;
277 }
278 }
279 }
280 }
281
282 void CopyGenomatchSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict genovec, uintptr_t match_word, uint32_t write_bit_idx_start, uint32_t bit_ct, uintptr_t* __restrict output_bitarr) {
283 const uint32_t bit_idx_end = write_bit_idx_start + bit_ct;
284 const uint32_t bit_idx_end_lowbits = bit_idx_end % kBitsPerWord;
285 const Halfword* raw_bitarr_alias = R_CAST(const Halfword*, raw_bitarr);
286 uintptr_t* output_bitarr_iter = output_bitarr;
287 uintptr_t* output_bitarr_last = &(output_bitarr[bit_idx_end / kBitsPerWord]);
288 uintptr_t cur_output_word = 0;
289 uint32_t read_widx = UINT32_MAX; // deliberate overflow
290 uint32_t write_idx_lowbits = write_bit_idx_start;
291 while ((output_bitarr_iter != output_bitarr_last) || (write_idx_lowbits != bit_idx_end_lowbits)) {
292 uintptr_t geno_word;
293 // sparse genovec optimization
294 // guaranteed to terminate since there's at least one more set bit
295 do {
296 geno_word = genovec[++read_widx] ^ match_word;
297 geno_word = (~(geno_word | (geno_word >> 1))) & kMask5555;
298 } while (!geno_word);
299 // screw it, just iterate over set bits
300 const uint32_t cur_halfword = raw_bitarr_alias[read_widx];
301 do {
302 const uint32_t sample_idx_lowbits = ctzw(geno_word) / 2;
303 cur_output_word |= S_CAST(uintptr_t, (cur_halfword >> sample_idx_lowbits) & k1LU) << write_idx_lowbits;
304 if (++write_idx_lowbits == kBitsPerWord) {
305 *output_bitarr_iter++ = cur_output_word;
306 cur_output_word = 0;
307 write_idx_lowbits = 0;
308 }
309 geno_word &= geno_word - 1;
310 } while (geno_word);
311 }
312 if (write_idx_lowbits) {
313 *output_bitarr_iter = cur_output_word;
314 }
315 }
316
317 void ExpandBytearrFromGenoarr(const void* __restrict compact_bitarr, const uintptr_t* __restrict genoarr, uintptr_t match_word, uint32_t genoword_ct, uint32_t expand_size, uint32_t read_start_bit, uintptr_t* __restrict target) {
318 Halfword* target_alias = R_CAST(Halfword*, target);
319 ZeroHwArr(RoundUpPow2(genoword_ct, 2), target_alias);
320 const uintptr_t* compact_bitarr_alias = S_CAST(const uintptr_t*, compact_bitarr);
321 const uint32_t expand_sizex_m1 = expand_size + read_start_bit - 1;
322 const uint32_t compact_widx_last = expand_sizex_m1 / kBitsPerWord;
323 uint32_t compact_idx_lowbits = read_start_bit;
324 uint32_t loop_len = kBitsPerWord;
325 uintptr_t write_hwidx = 0;
326 uintptr_t genomatch_bits = genoarr[0] ^ match_word;
327 genomatch_bits = (~(genomatch_bits | (genomatch_bits >> 1))) & kMask5555;
328 for (uint32_t compact_widx = 0; ; ++compact_widx) {
329 uintptr_t compact_word;
330 if (compact_widx >= compact_widx_last) {
331 if (compact_widx > compact_widx_last) {
332 return;
333 }
334 loop_len = 1 + (expand_sizex_m1 % kBitsPerWord);
335 // avoid possible segfault
336 compact_word = SubwordLoad(&(compact_bitarr_alias[compact_widx]), DivUp(loop_len, CHAR_BIT));
337 } else {
338 #ifdef __arm__
339 # error "Unaligned accesses in ExpandBytearrFromGenoarr()."
340 #endif
341 compact_word = compact_bitarr_alias[compact_widx];
342 }
343 for (; compact_idx_lowbits != loop_len; ++compact_idx_lowbits) {
344 while (!genomatch_bits) {
345 genomatch_bits = genoarr[++write_hwidx] ^ match_word;
346 genomatch_bits = (~(genomatch_bits | (genomatch_bits >> 1))) & kMask5555;
347 }
348 if (compact_word & (k1LU << compact_idx_lowbits)) {
349 const uint32_t lowbit_idx = ctzw(genomatch_bits);
350 target_alias[write_hwidx] |= 1U << (lowbit_idx / 2);
351 }
352 genomatch_bits &= genomatch_bits - 1;
353 }
354 compact_idx_lowbits = 0;
355 }
356 }
357 #endif
358
359 // Harley-Seal algorithm only works for bitarrays, not nyparrays, so don't
360 // add an AVX2 specialization here.
361 // ...unless something like the interleaved_vec strategy is used? hmm. should
362 // test this on basic frequency counter.
363 /*
364 void Count2FreqVec3(const VecW* geno_vvec, uint32_t vec_ct, uint32_t* __restrict alt1_plus_bothset_ctp, uint32_t* __restrict bothset_ctp) {
365 assert(!(vec_ct % 3));
366 // Increments bothset_ct by the number of 0b11 in the current block, and
367 // alt1_ct by twice the number of 0b10 plus the number of 0b01.
368 const VecW m1 = VCONST_W(kMask5555);
369 const VecW m2 = VCONST_W(kMask3333);
370 const VecW m4 = VCONST_W(kMask0F0F);
371 const VecW* geno_vvec_iter = geno_vvec;
372 uint32_t alt1_plus_bothset_ct = 0;
373 uint32_t bothset_ct = 0;
374
375 while (1) {
376 UniVec acc_alt1_plus_bothset;
377 UniVec acc_bothset;
378 acc_alt1_plus_bothset.vw = vecw_setzero();
379 acc_bothset.vw = vecw_setzero();
380 const VecW* geno_vvec_stop;
381 if (vec_ct < 30) {
382 if (!vec_ct) {
383 *alt1_plus_bothset_ctp = alt1_plus_bothset_ct;
384 *bothset_ctp = bothset_ct;
385 return;
386 }
387 geno_vvec_stop = &(geno_vvec_iter[vec_ct]);
388 vec_ct = 0;
389 } else {
390 geno_vvec_stop = &(geno_vvec_iter[30]);
391 vec_ct -= 30;
392 }
393 do {
394 VecW cur_geno_vword1 = *geno_vvec_iter++;
395 // process first two vwords simultaneously to minimize linear dependence
396 VecW cur_geno_vword2 = *geno_vvec_iter++;
397 VecW cur_geno_vword_low_lshifted1 = vecw_slli(cur_geno_vword1 & m1, 1);
398 VecW cur_geno_vword_low_lshifted2 = vecw_slli(cur_geno_vword2 & m1, 1);
399
400 // 00 -> 00; 01 -> 01; 10 -> 10; 11 -> 01
401 // note that _mm_andnot_si128 flips the *first* argument before the AND
402 // operation.
403 VecW alt1_plus_bothset1 = vecw_and_notfirst(cur_geno_vword_low_lshifted1, cur_geno_vword1);
404 VecW alt1_plus_bothset2 = vecw_and_notfirst(cur_geno_vword_low_lshifted2, cur_geno_vword2);
405
406 VecW bothset1 = vecw_srli(cur_geno_vword_low_lshifted1 & cur_geno_vword1, 1);
407 VecW bothset2 = vecw_srli(cur_geno_vword_low_lshifted2 & cur_geno_vword2, 1);
408
409 cur_geno_vword1 = *geno_vvec_iter++;
410 alt1_plus_bothset1 = (alt1_plus_bothset1 & m2) + (vecw_srli(alt1_plus_bothset1, 2) & m2);
411 bothset2 = bothset1 + bothset2;
412 alt1_plus_bothset2 = (alt1_plus_bothset2 & m2) + (vecw_srli(alt1_plus_bothset2, 2) & m2);
413 cur_geno_vword_low_lshifted1 = vecw_slli(cur_geno_vword1 & m1, 1);
414
415 alt1_plus_bothset2 = alt1_plus_bothset1 + alt1_plus_bothset2;
416 // alt1_plus_bothset2 now contains 4-bit values from 0-8, while bothset2
417 // contains 2-bit values from 0-2
418 // (todo: check whether this is faster if we use double_bothsetx
419 // variables instead of bothset1/bothset2)
420 bothset1 = vecw_srli(cur_geno_vword_low_lshifted1 & cur_geno_vword1, 1);
421 alt1_plus_bothset1 = vecw_and_notfirst(cur_geno_vword_low_lshifted1, cur_geno_vword1);
422 bothset2 = bothset1 + bothset2;
423 alt1_plus_bothset1 = (alt1_plus_bothset1 & m2) + (vecw_srli(alt1_plus_bothset1, 2) & m2);
424
425 bothset2 = (bothset2 & m2) + (vecw_srli(bothset2, 2) & m2);
426 alt1_plus_bothset2 = alt1_plus_bothset1 + alt1_plus_bothset2;
427 // alt1_plus_bothset2 now contains 4-bit values from 0-12, while bothset2
428 // contains 4-bit values from 0-6. aggregate both into 8-bit values.
429 bothset2 = (bothset2 & m4) + (vecw_srli(bothset2, 4) & m4);
430 alt1_plus_bothset2 = (alt1_plus_bothset2 & m4) + (vecw_srli(alt1_plus_bothset2, 4) & m4);
431
432 acc_bothset.vw = acc_bothset.vw + bothset2;
433 acc_alt1_plus_bothset.vw = acc_alt1_plus_bothset.vw + alt1_plus_bothset2;
434 } while (geno_vvec_iter < geno_vvec_stop);
435 const VecW m8 = VCONST_W(kMask00FF);
436 acc_bothset.vw = (acc_bothset.vw + vecw_srli(acc_bothset.vw, 8)) & m8;
437 acc_alt1_plus_bothset.vw = (acc_alt1_plus_bothset.vw & m8) + (vecw_srli(acc_alt1_plus_bothset.vw, 8) & m8);
438 bothset_ct += UniVecHsum16(acc_bothset);
439 alt1_plus_bothset_ct += UniVecHsum16(acc_alt1_plus_bothset);
440 }
441 }
442 */
443
444 // todo: benchmark against general-purpose counter
445 void Count12Vec6(const VecW* geno_vvec, uint32_t vec_ct, uint32_t* __restrict raw_01_ctp, uint32_t* __restrict raw_both_ctp) {
446 assert(!(vec_ct % 6));
447 const VecW m1 = VCONST_W(kMask5555);
448 const VecW m2 = VCONST_W(kMask3333);
449 const VecW m4 = VCONST_W(kMask0F0F);
450 const VecW* geno_vvec_iter = geno_vvec;
451 VecW acc_01 = vecw_setzero();
452 VecW acc_both = vecw_setzero();
453 uintptr_t cur_incr = 60;
454 for (; ; vec_ct -= cur_incr) {
455 if (vec_ct < 60) {
456 if (!vec_ct) {
457 *raw_01_ctp = HsumW(acc_01);
458 *raw_both_ctp = HsumW(acc_both);
459 return;
460 }
461 cur_incr = vec_ct;
462 }
463 VecW inner_acc_01 = vecw_setzero();
464 VecW inner_acc_both = vecw_setzero();
465 const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]);
466 do {
467 VecW cur_vvec = *geno_vvec_iter++;
468 VecW vvec_rshift = vecw_srli(cur_vvec, 1);
469 VecW nyp_both = m1 & (cur_vvec ^ vvec_rshift);
470 VecW nyp_01 = nyp_both & cur_vvec;
471
472 cur_vvec = *geno_vvec_iter++;
473 vvec_rshift = vecw_srli(cur_vvec, 1);
474 VecW vvec_both = m1 & (cur_vvec ^ vvec_rshift);
475 nyp_01 = nyp_01 + (vvec_both & cur_vvec);
476 nyp_both = nyp_both + vvec_both;
477
478 cur_vvec = *geno_vvec_iter++;
479 vvec_rshift = vecw_srli(cur_vvec, 1);
480 vvec_both = m1 & (cur_vvec ^ vvec_rshift);
481 nyp_01 = nyp_01 + (vvec_both & cur_vvec);
482 nyp_both = nyp_both + vvec_both;
483
484 VecW nybble_01 = (nyp_01 & m2) + (vecw_srli(nyp_01, 2) & m2);
485 VecW nybble_both = (nyp_both & m2) + (vecw_srli(nyp_both, 2) & m2);
486
487 cur_vvec = *geno_vvec_iter++;
488 vvec_rshift = vecw_srli(cur_vvec, 1);
489 nyp_both = m1 & (cur_vvec ^ vvec_rshift);
490 nyp_01 = nyp_both & cur_vvec;
491
492 cur_vvec = *geno_vvec_iter++;
493 vvec_rshift = vecw_srli(cur_vvec, 1);
494 vvec_both = m1 & (cur_vvec ^ vvec_rshift);
495 nyp_01 = nyp_01 + (vvec_both & cur_vvec);
496 nyp_both = nyp_both + vvec_both;
497
498 cur_vvec = *geno_vvec_iter++;
499 vvec_rshift = vecw_srli(cur_vvec, 1);
500 vvec_both = m1 & (cur_vvec ^ vvec_rshift);
501 nyp_01 = nyp_01 + (vvec_both & cur_vvec);
502 nyp_both = nyp_both + vvec_both;
503
504 nybble_01 = nybble_01 + (nyp_01 & m2) + (vecw_srli(nyp_01, 2) & m2);
505 nybble_both = nybble_both + (nyp_both & m2) + (vecw_srli(nyp_both, 2) & m2);
506
507 inner_acc_01 = inner_acc_01 + (nybble_01 & m4) + (vecw_srli(nybble_01, 4) & m4);
508 inner_acc_both = inner_acc_both + (nybble_both & m4) + (vecw_srli(nybble_both, 4) & m4);
509 } while (geno_vvec_iter < geno_vvec_stop);
510 const VecW m0 = vecw_setzero();
511 acc_01 = acc_01 + vecw_bytesum(inner_acc_01, m0);
512 acc_both = acc_both + vecw_bytesum(inner_acc_both, m0);
513 }
514 }
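// Why the 60-vector block size above is safe (exposition only): after six
// input vectors, each 4-bit field of nybble_01/nybble_both holds at most 12,
// so each byte of the inner accumulators grows by at most 24 per inner-loop
// iteration; ten iterations (60 vectors) keep the byte counters at <= 240,
// below overflow, before vecw_bytesum() widens them.  The same bound applies
// to Count3FreqVec6() and CountSubset3FreqVec6() below.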
515
516 void GenovecCount12Unsafe(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* __restrict raw_01_ctp, uint32_t* __restrict raw_10_ctp) {
517 // assumes trailing bits of last genovec word are zeroed out.
518 // sample_ct == 0 ok.
519 const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
520 uint32_t raw_01_ct;
521 uint32_t raw_both_ct;
522 uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (6 * kWordsPerVec));
523 assert(VecIsAligned(genovec));
524 Count12Vec6(R_CAST(const VecW*, genovec), word_idx / kWordsPerVec, &raw_01_ct, &raw_both_ct);
525 for (; word_idx != sample_ctl2; ++word_idx) {
526 const uintptr_t cur_geno_word = genovec[word_idx];
527 const uintptr_t cur_rshift = cur_geno_word >> 1;
528 const uintptr_t cur_both = kMask5555 & (cur_geno_word ^ cur_rshift);
529 raw_01_ct += Popcount01Word(cur_both & cur_geno_word);
530 raw_both_ct += Popcount01Word(cur_both);
531 }
532 *raw_01_ctp = raw_01_ct;
533 *raw_10_ctp = raw_both_ct - raw_01_ct;
534 }
535
536 void Count3FreqVec6(const VecW* geno_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp) {
537 assert(!(vec_ct % 6));
538 // Sets even_ct to the number of set low bits in the current block, odd_ct to
539 // the number of set high bits, and bothset_ct to the number of 0b11s.
540 const VecW m1 = VCONST_W(kMask5555);
541 const VecW m2 = VCONST_W(kMask3333);
542 const VecW m4 = VCONST_W(kMask0F0F);
543 const VecW* geno_vvec_iter = geno_vvec;
544 VecW acc_even = vecw_setzero();
545 VecW acc_odd = vecw_setzero();
546 VecW acc_bothset = vecw_setzero();
547 uintptr_t cur_incr = 60;
548 for (; ; vec_ct -= cur_incr) {
549 if (vec_ct < 60) {
550 if (!vec_ct) {
551 *even_ctp = HsumW(acc_even);
552 *odd_ctp = HsumW(acc_odd);
553 *bothset_ctp = HsumW(acc_bothset);
554 return;
555 }
556 cur_incr = vec_ct;
557 }
558 VecW inner_acc_even = vecw_setzero();
559 VecW inner_acc_odd = vecw_setzero();
560 VecW inner_acc_bothset = vecw_setzero();
561 const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]);
562 do {
563 // hmm, this seems to have more linear dependence than I'd want, but the
564 // reorderings I tried just made the code harder to read without helping,
565 // so I'll leave this alone
566 VecW cur_geno_vword = vecw_loadu(geno_vvec_iter);
567 ++geno_vvec_iter;
568 VecW odd1 = m1 & vecw_srli(cur_geno_vword, 1);
569 VecW even1 = m1 & cur_geno_vword;
570 VecW bothset1 = odd1 & cur_geno_vword;
571
572 cur_geno_vword = vecw_loadu(geno_vvec_iter);
573 ++geno_vvec_iter;
574 VecW cur_geno_vword_high = m1 & vecw_srli(cur_geno_vword, 1);
575 even1 = even1 + (m1 & cur_geno_vword);
576 odd1 = odd1 + cur_geno_vword_high;
577 bothset1 = bothset1 + (cur_geno_vword_high & cur_geno_vword);
578
579 cur_geno_vword = vecw_loadu(geno_vvec_iter);
580 ++geno_vvec_iter;
581 cur_geno_vword_high = m1 & vecw_srli(cur_geno_vword, 1);
582 even1 = even1 + (m1 & cur_geno_vword);
583 odd1 = odd1 + cur_geno_vword_high;
584 bothset1 = bothset1 + (cur_geno_vword_high & cur_geno_vword);
585
586 even1 = (even1 & m2) + (vecw_srli(even1, 2) & m2);
587 odd1 = (odd1 & m2) + (vecw_srli(odd1, 2) & m2);
588 bothset1 = (bothset1 & m2) + (vecw_srli(bothset1, 2) & m2);
589
590 cur_geno_vword = vecw_loadu(geno_vvec_iter);
591 ++geno_vvec_iter;
592 VecW odd2 = m1 & vecw_srli(cur_geno_vword, 1);
593 VecW even2 = m1 & cur_geno_vword;
594 VecW bothset2 = odd2 & cur_geno_vword;
595
596 cur_geno_vword = vecw_loadu(geno_vvec_iter);
597 ++geno_vvec_iter;
598 cur_geno_vword_high = m1 & vecw_srli(cur_geno_vword, 1);
599 even2 = even2 + (m1 & cur_geno_vword);
600 odd2 = odd2 + cur_geno_vword_high;
601 bothset2 = bothset2 + (cur_geno_vword_high & cur_geno_vword);
602
603 cur_geno_vword = vecw_loadu(geno_vvec_iter);
604 ++geno_vvec_iter;
605 cur_geno_vword_high = m1 & vecw_srli(cur_geno_vword, 1);
606 even2 = even2 + (m1 & cur_geno_vword);
607 odd2 = odd2 + cur_geno_vword_high;
608 bothset2 = bothset2 + (cur_geno_vword_high & cur_geno_vword);
609
610 even1 = even1 + (even2 & m2) + (vecw_srli(even2, 2) & m2);
611 odd1 = odd1 + (odd2 & m2) + (vecw_srli(odd2, 2) & m2);
612 bothset1 = bothset1 + (bothset2 & m2) + (vecw_srli(bothset2, 2) & m2);
613 // these now contain 4-bit values from 0-12
614
615 inner_acc_even = inner_acc_even + (even1 & m4) + (vecw_srli(even1, 4) & m4);
616 inner_acc_odd = inner_acc_odd + (odd1 & m4) + (vecw_srli(odd1, 4) & m4);
617 inner_acc_bothset = inner_acc_bothset + (bothset1 & m4) + (vecw_srli(bothset1, 4) & m4);
618 } while (geno_vvec_iter < geno_vvec_stop);
619 const VecW m0 = vecw_setzero();
620 acc_even = acc_even + vecw_bytesum(inner_acc_even, m0);
621 acc_odd = acc_odd + vecw_bytesum(inner_acc_odd, m0);
622 acc_bothset = acc_bothset + vecw_bytesum(inner_acc_bothset, m0);
623 }
624 }
625
626 void FillInterleavedMaskVec(const uintptr_t* __restrict subset_mask, uint32_t base_vec_ct, uintptr_t* interleaved_mask_vec) {
627 #ifdef __LP64__
628 // This is a cousin of github.com/KWillets/simd_interleave , which was
629 // written in response to
630 // https://lemire.me/blog/2018/01/09/how-fast-can-you-bit-interleave-32-bit-integers-simd-edition/
631 // AVX2 implementation takes ~40% less time than before, and SSE4.2 takes
632 // ~65% less. This also avoids the Ryzen-screwing _pdep_u64()/_pext_u64()
633 // operations.
634 const VecW m4 = VCONST_W(kMask0F0F);
635 # ifdef USE_SSE42
636 const VecW lookup0 = vecw_setr8(
637 0, 1, 4, 5, 16, 17, 20, 21,
638 64, 65, 68, 69, 80, 81, 84, 85);
639 const VecW lookup1 = vecw_slli(lookup0, 1);
640 # else
641 const VecW m1 = VCONST_W(kMask5555);
642 const VecW m2 = VCONST_W(kMask3333);
643 # endif
644 const VecW* subset_mask_valias = R_CAST(const VecW*, subset_mask);
645 VecW* interleaved_mask_valias = R_CAST(VecW*, interleaved_mask_vec);
646
647 for (uint32_t vidx = 0; vidx != base_vec_ct; ++vidx) {
648 // I'll assume the compiler can handle this register allocation job.
649 VecW cur_vec = subset_mask_valias[vidx];
650
651 cur_vec = vecw_permute0xd8_if_avx2(cur_vec);
652 const VecW vec_even = cur_vec & m4;
653 const VecW vec_odd = vecw_srli(cur_vec, 4) & m4;
654 VecW vec_lo = vecw_unpacklo8(vec_even, vec_odd);
655 VecW vec_hi = vecw_unpackhi8(vec_even, vec_odd);
656 # ifdef USE_SSE42
657 vec_lo = vecw_shuffle8(lookup0, vec_lo);
658 vec_hi = vecw_shuffle8(lookup1, vec_hi);
659 # else
660 vec_lo = (vec_lo | vecw_slli(vec_lo, 2)) & m2;
661 vec_hi = (vec_hi | vecw_slli(vec_hi, 2)) & m2;
662 vec_lo = (vec_lo | vecw_slli(vec_lo, 1)) & m1;
663 vec_hi = (vec_hi | vecw_slli(vec_hi, 1)) & m1;
664 vec_hi = vecw_slli(vec_hi, 1);
665 # endif
666 interleaved_mask_valias[vidx] = vec_lo | vec_hi;
667 }
668 #else // !__LP64__
669 for (uint32_t widx = 0; widx != base_vec_ct; ++widx) {
670 const uintptr_t orig_word = subset_mask[widx];
671 uintptr_t ww_even = S_CAST(uint16_t, orig_word);
672 uintptr_t ww_odd = orig_word >> 16;
673 ww_even = UnpackHalfwordToWord(ww_even);
674 ww_odd = UnpackHalfwordToWord(ww_odd);
675 interleaved_mask_vec[widx] = ww_even | (ww_odd << 1);
676 }
677 #endif
678 }
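// Layout sketch (exposition only): the low half of each subset_mask
// word/vector lands on the even bit positions of the corresponding output
// word/vector, and the high half on the odd positions.  In the 32-bit
// fallback above, e.g. orig_word == 0xFFFF0000 gives ww_even == 0 and
// ww_odd == 0x55555555, so interleaved_mask_vec[widx] == 0xAAAAAAAA.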
679
680 /*
681 void GenovecAlleleCtsUnsafe(const uintptr_t* genovec, uint32_t sample_ct, uint32_t* __restrict allele_cts, uint32_t* __restrict bothset_ctp) {
682 // assumes trailing bits of last genovec word are zeroed out.
683 // sets allele_cts[0] to the number of observed ref alleles, and
684 // allele_cts[1] to the number of observed alt1s.
685 const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
686 uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (3 * kWordsPerVec));
687 uint32_t alt1_plus_bothset_ct;
688 uint32_t bothset_ct;
689 assert(VecIsAligned(genovec));
690 Count2FreqVec3(R_CAST(const VecW*, genovec), word_idx / kWordsPerVec, &alt1_plus_bothset_ct, &bothset_ct);
691 for (; word_idx != sample_ctl2; ++word_idx) {
692 const uintptr_t cur_geno_word = genovec[word_idx];
693 const uintptr_t cur_geno_word_low_lshifted = (cur_geno_word & kMask5555) << 1;
694 alt1_plus_bothset_ct += NypsumWord((~cur_geno_word_low_lshifted) & cur_geno_word);
695 bothset_ct += NypsumWord(cur_geno_word_low_lshifted & cur_geno_word);
696 }
697 const uint32_t alt1_ct = alt1_plus_bothset_ct - bothset_ct;
698 allele_cts[0] = (sample_ct - bothset_ct) * 2 - alt1_ct;
699 allele_cts[1] = alt1_ct;
700 *bothset_ctp = bothset_ct;
701 }
702 */
703
704 void GenoarrCountFreqsUnsafe(const uintptr_t* genoarr, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
705 // fills genocounts[0] with the number of 00s, genocounts[1] with the number
706 // of 01s, etc.
707 // assumes trailing bits of last genoarr word are zeroed out.
708 // sample_ct == 0 ok.
709 // no longer requires vector-alignment.
710 const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
711 uint32_t even_ct;
712 uint32_t odd_ct;
713 uint32_t bothset_ct;
714 uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (6 * kWordsPerVec));
715 Count3FreqVec6(R_CAST(const VecW*, genoarr), word_idx / kWordsPerVec, &even_ct, &odd_ct, &bothset_ct);
716 for (; word_idx != sample_ctl2; ++word_idx) {
717 const uintptr_t cur_geno_word = genoarr[word_idx];
718 const uintptr_t cur_geno_word_high = kMask5555 & (cur_geno_word >> 1);
719 even_ct += Popcount01Word(cur_geno_word & kMask5555);
720 odd_ct += Popcount01Word(cur_geno_word_high);
721 bothset_ct += Popcount01Word(cur_geno_word & cur_geno_word_high);
722 }
723 genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
724 genocounts[1] = even_ct - bothset_ct;
725 genocounts[2] = odd_ct - bothset_ct;
726 genocounts[3] = bothset_ct;
727 }
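// Recovering the four counts from (even_ct, odd_ct, bothset_ct) above is
// simple inclusion-exclusion (exposition only): a 0b01 entry contributes to
// even_ct only, 0b10 to odd_ct only, 0b11 to all three, and 0b00 to none, so
//   genocounts[3] = bothset_ct
//   genocounts[1] = even_ct - bothset_ct
//   genocounts[2] = odd_ct - bothset_ct
//   genocounts[0] = sample_ct - genocounts[1] - genocounts[2] - genocounts[3]
//                 = sample_ct + bothset_ct - even_ct - odd_ct.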
728
729 // geno_vvec now allowed to be unaligned.
730 void CountSubset3FreqVec6(const VecW* __restrict geno_vvec, const VecW* __restrict interleaved_mask_vvec, uint32_t vec_ct, uint32_t* __restrict even_ctp, uint32_t* __restrict odd_ctp, uint32_t* __restrict bothset_ctp) {
731 assert(!(vec_ct % 6));
732 // Sets even_ct to the number of set low bits in the current block after
733 // subsetting, odd_ct to the number of set high bits, and bothset_ct to the
734 // number of 0b11s.
735 const VecW m1 = VCONST_W(kMask5555);
736 const VecW m2 = VCONST_W(kMask3333);
737 const VecW m4 = VCONST_W(kMask0F0F);
738 const VecW* geno_vvec_iter = geno_vvec;
739 const VecW* interleaved_mask_vvec_iter = interleaved_mask_vvec;
740 VecW acc_even = vecw_setzero();
741 VecW acc_odd = vecw_setzero();
742 VecW acc_bothset = vecw_setzero();
743 uintptr_t cur_incr = 60;
744 for (; ; vec_ct -= cur_incr) {
745 if (vec_ct < 60) {
746 if (!vec_ct) {
747 *even_ctp = HsumW(acc_even);
748 *odd_ctp = HsumW(acc_odd);
749 *bothset_ctp = HsumW(acc_bothset);
750 return;
751 }
752 cur_incr = vec_ct;
753 }
754 VecW inner_acc_even = vecw_setzero();
755 VecW inner_acc_odd = vecw_setzero();
756 VecW inner_acc_bothset = vecw_setzero();
757 const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]);
758 do {
759 VecW interleaved_mask_vword = *interleaved_mask_vvec_iter++;
760 VecW cur_geno_vword = vecw_loadu(geno_vvec_iter);
761 ++geno_vvec_iter;
762 VecW cur_mask = interleaved_mask_vword & m1;
763 VecW odd1 = cur_mask & vecw_srli(cur_geno_vword, 1);
764 VecW even1 = cur_mask & cur_geno_vword;
765 VecW bothset1 = odd1 & cur_geno_vword;
766
767 cur_mask = vecw_srli(interleaved_mask_vword, 1) & m1;
768 cur_geno_vword = vecw_loadu(geno_vvec_iter);
769 ++geno_vvec_iter;
770 VecW cur_geno_vword_high_masked = cur_mask & vecw_srli(cur_geno_vword, 1);
771 even1 = even1 + (cur_mask & cur_geno_vword);
772 odd1 = odd1 + cur_geno_vword_high_masked;
773 bothset1 = bothset1 + (cur_geno_vword_high_masked & cur_geno_vword);
774
775 interleaved_mask_vword = *interleaved_mask_vvec_iter++;
776 cur_mask = interleaved_mask_vword & m1;
777 cur_geno_vword = vecw_loadu(geno_vvec_iter);
778 ++geno_vvec_iter;
779 cur_geno_vword_high_masked = cur_mask & vecw_srli(cur_geno_vword, 1);
780 even1 = even1 + (cur_mask & cur_geno_vword);
781 odd1 = odd1 + cur_geno_vword_high_masked;
782 bothset1 = bothset1 + (cur_geno_vword_high_masked & cur_geno_vword);
783
784 even1 = (even1 & m2) + (vecw_srli(even1, 2) & m2);
785 odd1 = (odd1 & m2) + (vecw_srli(odd1, 2) & m2);
786 bothset1 = (bothset1 & m2) + (vecw_srli(bothset1, 2) & m2);
787
788 cur_mask = vecw_srli(interleaved_mask_vword, 1) & m1;
789 cur_geno_vword = vecw_loadu(geno_vvec_iter);
790 ++geno_vvec_iter;
791 VecW odd2 = cur_mask & vecw_srli(cur_geno_vword, 1);
792 VecW even2 = cur_mask & cur_geno_vword;
793 VecW bothset2 = odd2 & cur_geno_vword;
794
795 interleaved_mask_vword = *interleaved_mask_vvec_iter++;
796 cur_mask = interleaved_mask_vword & m1;
797 cur_geno_vword = vecw_loadu(geno_vvec_iter);
798 ++geno_vvec_iter;
799 cur_geno_vword_high_masked = cur_mask & vecw_srli(cur_geno_vword, 1);
800 even2 = even2 + (cur_mask & cur_geno_vword);
801 odd2 = odd2 + cur_geno_vword_high_masked;
802 bothset2 = bothset2 + (cur_geno_vword_high_masked & cur_geno_vword);
803
804 cur_mask = vecw_srli(interleaved_mask_vword, 1) & m1;
805 cur_geno_vword = vecw_loadu(geno_vvec_iter);
806 ++geno_vvec_iter;
807 cur_geno_vword_high_masked = cur_mask & vecw_srli(cur_geno_vword, 1);
808 even2 = even2 + (cur_mask & cur_geno_vword);
809 odd2 = odd2 + cur_geno_vword_high_masked;
810 bothset2 = bothset2 + (cur_geno_vword_high_masked & cur_geno_vword);
811
812 even1 = even1 + (even2 & m2) + (vecw_srli(even2, 2) & m2);
813 odd1 = odd1 + (odd2 & m2) + (vecw_srli(odd2, 2) & m2);
814 bothset1 = bothset1 + (bothset2 & m2) + (vecw_srli(bothset2, 2) & m2);
815 // these now contain 4-bit values from 0-12
816
817 inner_acc_even = inner_acc_even + (even1 & m4) + (vecw_srli(even1, 4) & m4);
818 inner_acc_odd = inner_acc_odd + (odd1 & m4) + (vecw_srli(odd1, 4) & m4);
819 inner_acc_bothset = inner_acc_bothset + (bothset1 & m4) + (vecw_srli(bothset1, 4) & m4);
820 } while (geno_vvec_iter < geno_vvec_stop);
821 const VecW m0 = vecw_setzero();
822 acc_even = acc_even + vecw_bytesum(inner_acc_even, m0);
823 acc_odd = acc_odd + vecw_bytesum(inner_acc_odd, m0);
824 acc_bothset = acc_bothset + vecw_bytesum(inner_acc_bothset, m0);
825 }
826 }
827
828 void GenoarrCountSubsetFreqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
829 // fills genocounts[0] with the number of 00s, genocounts[1] with the number
830 // of 01s, etc.
831 // {raw_}sample_ct == 0 ok.
832 const uint32_t raw_sample_ctv2 = NypCtToVecCt(raw_sample_ct);
833 uint32_t even_ct;
834 uint32_t odd_ct;
835 uint32_t bothset_ct;
836 #ifdef __LP64__
837 uint32_t vec_idx = raw_sample_ctv2 - (raw_sample_ctv2 % 6);
838 CountSubset3FreqVec6(R_CAST(const VecW*, genoarr), R_CAST(const VecW*, sample_include_interleaved_vec), vec_idx, &even_ct, &odd_ct, &bothset_ct);
839 const uintptr_t* genoarr_iter = &(genoarr[kWordsPerVec * vec_idx]);
840 const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[(kWordsPerVec / 2) * vec_idx]);
841 # ifdef USE_AVX2
842 uintptr_t mask_base1 = 0;
843 uintptr_t mask_base2 = 0;
844 uintptr_t mask_base3 = 0;
845 uintptr_t mask_base4 = 0;
846 for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
847 uintptr_t mask_word1;
848 uintptr_t mask_word2;
849 uintptr_t mask_word3;
850 uintptr_t mask_word4;
851 if (!(vec_idx % 2)) {
852 mask_base1 = *interleaved_mask_iter++;
853 mask_base2 = *interleaved_mask_iter++;
854 mask_base3 = *interleaved_mask_iter++;
855 mask_base4 = *interleaved_mask_iter++;
856 mask_word1 = mask_base1 & kMask5555;
857 mask_word2 = mask_base2 & kMask5555;
858 mask_word3 = mask_base3 & kMask5555;
859 mask_word4 = mask_base4 & kMask5555;
860 } else {
861 mask_word1 = (mask_base1 >> 1) & kMask5555;
862 mask_word2 = (mask_base2 >> 1) & kMask5555;
863 mask_word3 = (mask_base3 >> 1) & kMask5555;
864 mask_word4 = (mask_base4 >> 1) & kMask5555;
865 }
866 for (uint32_t vechalf_idx = 0; ; ++vechalf_idx) {
867 const uintptr_t cur_geno_word1 = *genoarr_iter++;
868 const uintptr_t cur_geno_word2 = *genoarr_iter++;
869 const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
870 const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
871 even_ct += PopcountWord(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
872 odd_ct += PopcountWord((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
873 bothset_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
874 if (vechalf_idx) {
875 break;
876 }
877 mask_word1 = mask_word3;
878 mask_word2 = mask_word4;
879 }
880 }
881 # else // not USE_AVX2
882 uintptr_t mask_base1 = 0;
883 uintptr_t mask_base2 = 0;
884 for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
885 uintptr_t mask_word1;
886 uintptr_t mask_word2;
887 if (!(vec_idx % 2)) {
888 mask_base1 = *interleaved_mask_iter++;
889 mask_base2 = *interleaved_mask_iter++;
890 mask_word1 = mask_base1 & kMask5555;
891 mask_word2 = mask_base2 & kMask5555;
892 } else {
893 mask_word1 = (mask_base1 >> 1) & kMask5555;
894 mask_word2 = (mask_base2 >> 1) & kMask5555;
895 }
896 const uintptr_t cur_geno_word1 = *genoarr_iter++;
897 const uintptr_t cur_geno_word2 = *genoarr_iter++;
898 const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
899 const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
900 # ifdef USE_SSE42
901 even_ct += PopcountWord(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
902 odd_ct += PopcountWord((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
903 bothset_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
904 # else
905 even_ct += NypsumWord((cur_geno_word1 & mask_word1) + (cur_geno_word2 & mask_word2));
906 odd_ct += NypsumWord(cur_geno_word1_high_masked + cur_geno_word2_high_masked);
907 bothset_ct += NypsumWord((cur_geno_word1 & cur_geno_word1_high_masked) + (cur_geno_word2 & cur_geno_word2_high_masked));
908 # endif
909 }
910 # endif // not USE_AVX2
911 #else // not __LP64__
912 uint32_t word_idx = raw_sample_ctv2 - (raw_sample_ctv2 % 6);
913 CountSubset3FreqVec6(R_CAST(const VecW*, genoarr), R_CAST(const VecW*, sample_include_interleaved_vec), word_idx, &even_ct, &odd_ct, &bothset_ct);
914 const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[word_idx / 2]);
915 uintptr_t mask_base = 0;
916 for (; word_idx != raw_sample_ctv2; ++word_idx) {
917 uintptr_t mask_word;
918 if (!(word_idx % 2)) {
919 mask_base = *interleaved_mask_iter++;
920 mask_word = mask_base & kMask5555;
921 } else {
922 mask_word = (mask_base >> 1) & kMask5555;
923 }
924 const uintptr_t cur_geno_word = genoarr[word_idx];
925 const uintptr_t cur_geno_word_high_masked = mask_word & (cur_geno_word >> 1);
926 even_ct += Popcount01Word(cur_geno_word & mask_word);
927 odd_ct += Popcount01Word(cur_geno_word_high_masked);
928 bothset_ct += Popcount01Word(cur_geno_word & cur_geno_word_high_masked);
929 }
930 #endif
931 genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
932 genocounts[1] = even_ct - bothset_ct;
933 genocounts[2] = odd_ct - bothset_ct;
934 genocounts[3] = bothset_ct;
935 }
936
937 void GenoarrCountSubsetFreqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
938 // slower GenoarrCountSubsetFreqs() which does not require
939 // sample_include_interleaved_vec to be precomputed.
940 // {raw_}sample_ct == 0 ok.
941 const uint32_t raw_sample_ctl2 = NypCtToWordCt(raw_sample_ct);
942 const uint32_t fullword_ct = raw_sample_ctl2 / 2;
943 uint32_t even_ct = 0;
944 uint32_t odd_ct = 0;
945 uint32_t bothset_ct = 0;
946 for (uint32_t widx = 0; widx != fullword_ct; ++widx) {
947 // possible todo: try vectorizing this loop in SSE4.2+ high-sample-ct case
948 // with shuffle-based dynamic unpacking of sample_include?
949 const uintptr_t mask_word = sample_include[widx];
950 if (mask_word) {
951 uintptr_t geno_word = genoarr[2 * widx];
952 uintptr_t geno_even = PackWordToHalfwordMask5555(geno_word);
953 uintptr_t geno_odd = PackWordToHalfwordMaskAAAA(geno_word);
954 geno_word = genoarr[2 * widx + 1];
955 geno_even |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(geno_word)) << kBitsPerWordD2;
956 geno_odd |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(geno_word)) << kBitsPerWordD2;
957 const uintptr_t geno_even_masked = geno_even & mask_word;
958 even_ct += PopcountWord(geno_even_masked);
959 odd_ct += PopcountWord(geno_odd & mask_word);
960 bothset_ct += PopcountWord(geno_odd & geno_even_masked);
961 }
962 }
963 if (raw_sample_ctl2 % 2) {
964 const uintptr_t mask_hw = sample_include[fullword_ct];
965 if (mask_hw) {
966 const uintptr_t geno_word = genoarr[2 * fullword_ct];
967 // todo: benchmark main loop unpack vs. pack
968 const uintptr_t mask_word = UnpackHalfwordToWord(mask_hw);
969 const uintptr_t geno_word_shifted = geno_word >> 1;
970 const uintptr_t geno_word_masked = geno_word & mask_word;
971 even_ct += Popcount01Word(geno_word_masked);
972 odd_ct += Popcount01Word(geno_word_shifted & mask_word);
973 bothset_ct += Popcount01Word(geno_word_masked & geno_word_shifted);
974 }
975 }
976 genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
977 genocounts[1] = even_ct - bothset_ct;
978 genocounts[2] = odd_ct - bothset_ct;
979 genocounts[3] = bothset_ct;
980 }
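// Sketch of the packing in the loop above (exposition only):
// PackWordToHalfwordMask5555() gathers the low bit of each 2-bit genotype
// entry into a halfword (one bit per sample) and PackWordToHalfwordMaskAAAA()
// gathers the high bits, so the masked popcounts operate on one bit per
// included sample.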
981
982 void GenoarrCountInvsubsetFreqs2(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict sample_exclude, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
983 // {raw_}sample_ct == 0 ok.
984 // ugh, 'fullword' is overloaded. probable todo: keep halfword/fullword,
985 // switch more common use case to bodyword/trailword.
986 const uint32_t bodyword_ct = raw_sample_ct / kBitsPerWord;
987 uint32_t even_ct = 0;
988 uint32_t odd_ct = 0;
989 uint32_t bothset_ct = 0;
990 for (uint32_t widx = 0; widx != bodyword_ct; ++widx) {
991 // possible todo: try vectorizing this loop in SSE4.2+ high-sample-ct case
992 // with shuffle-based dynamic unpacking of sample_exclude?
993 const uintptr_t mask_word = ~sample_exclude[widx];
994 if (mask_word) {
995 uintptr_t geno_word = genoarr[2 * widx];
996 uintptr_t geno_even = PackWordToHalfwordMask5555(geno_word);
997 uintptr_t geno_odd = PackWordToHalfwordMaskAAAA(geno_word);
998 geno_word = genoarr[2 * widx + 1];
999 geno_even |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(geno_word)) << kBitsPerWordD2;
1000 geno_odd |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(geno_word)) << kBitsPerWordD2;
1001 const uintptr_t geno_even_masked = geno_even & mask_word;
1002 even_ct += PopcountWord(geno_even_masked);
1003 odd_ct += PopcountWord(geno_odd & mask_word);
1004 bothset_ct += PopcountWord(geno_odd & geno_even_masked);
1005 }
1006 }
1007 const uint32_t remainder = raw_sample_ct % kBitsPerWord;
1008 if (remainder) {
1009 const uintptr_t mask_word = bzhi(~sample_exclude[bodyword_ct], remainder);
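// bzhi() clears bits [remainder, kBitsPerWord), so only the trailing
// samples' include bits survive.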
1010 if (mask_word) {
1011 uintptr_t geno_word = genoarr[2 * bodyword_ct];
1012 uintptr_t geno_even = PackWordToHalfwordMask5555(geno_word);
1013 uintptr_t geno_odd = PackWordToHalfwordMaskAAAA(geno_word);
1014 if (remainder > kBitsPerWordD2) {
1015 geno_word = genoarr[2 * bodyword_ct + 1];
1016 geno_even |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(geno_word)) << kBitsPerWordD2;
1017 geno_odd |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(geno_word)) << kBitsPerWordD2;
1018 }
1019 const uintptr_t geno_even_masked = geno_even & mask_word;
1020 even_ct += PopcountWord(geno_even_masked);
1021 odd_ct += PopcountWord(geno_odd & mask_word);
1022 bothset_ct += PopcountWord(geno_odd & geno_even_masked);
1023 }
1024 }
1025 genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
1026 genocounts[1] = even_ct - bothset_ct;
1027 genocounts[2] = odd_ct - bothset_ct;
1028 genocounts[3] = bothset_ct;
1029 }
1030
1031 void GenoarrCountSubsetIntersectFreqs(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict subset1, const uintptr_t* __restrict subset2, uint32_t raw_sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
1032 // {raw_}sample_ct == 0 ok.
1033 const uint32_t raw_sample_ctl2 = NypCtToWordCt(raw_sample_ct);
1034 const uint32_t fullword_ct = raw_sample_ctl2 / 2;
1035 uint32_t subset_intersect_ct = 0;
1036 uint32_t even_ct = 0;
1037 uint32_t odd_ct = 0;
1038 uint32_t bothset_ct = 0;
1039 for (uint32_t widx = 0; widx != fullword_ct; ++widx) {
1040 // hmm, there may be little point to vectorizing this
1041 const uintptr_t mask_word = subset1[widx] & subset2[widx];
1042 if (mask_word) {
1043 uintptr_t geno_word = genoarr[2 * widx];
1044 uintptr_t geno_even = PackWordToHalfwordMask5555(geno_word);
1045 uintptr_t geno_odd = PackWordToHalfwordMaskAAAA(geno_word);
1046 geno_word = genoarr[2 * widx + 1];
1047 geno_even |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(geno_word)) << kBitsPerWordD2;
1048 geno_odd |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(geno_word)) << kBitsPerWordD2;
1049 const uintptr_t geno_even_masked = geno_even & mask_word;
1050 subset_intersect_ct += PopcountWord(mask_word);
1051 even_ct += PopcountWord(geno_even_masked);
1052 odd_ct += PopcountWord(geno_odd & mask_word);
1053 bothset_ct += PopcountWord(geno_odd & geno_even_masked);
1054 }
1055 }
1056 if (raw_sample_ctl2 % 2) {
1057 const uintptr_t mask_hw = subset1[fullword_ct] & subset2[fullword_ct];
1058 if (mask_hw) {
1059 const uintptr_t geno_word = genoarr[fullword_ct * 2];
1060 const uintptr_t mask_word = UnpackHalfwordToWord(mask_hw);
1061 const uintptr_t geno_word_shifted = geno_word >> 1;
1062 const uintptr_t geno_word_masked = geno_word & mask_word;
1063 subset_intersect_ct += Popcount01Word(mask_word);
1064 even_ct += Popcount01Word(geno_word_masked);
1065 odd_ct += Popcount01Word(geno_word_shifted & mask_word);
1066 bothset_ct += Popcount01Word(geno_word_masked & geno_word_shifted);
1067 }
1068 }
1069 genocounts[0] = subset_intersect_ct + bothset_ct - even_ct - odd_ct;
1070 genocounts[1] = even_ct - bothset_ct;
1071 genocounts[2] = odd_ct - bothset_ct;
1072 genocounts[3] = bothset_ct;
1073 }
1074
1075 void GenovecInvertUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
1076 // flips 0 to 2 and vice versa.
1077 // "unsafe" because trailing bits are not zeroed out.
1078 const uint32_t vec_ct = NypCtToVecCt(sample_ct);
1079 assert(VecIsAligned(genovec));
1080 const VecW not_m1 = VCONST_W(kMaskAAAA);
1081 VecW* vptr = R_CAST(VecW*, genovec);
1082 for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
1083 VecW cur_vec = vptr[vidx];
1084 // flip high bit iff low bit is unset
1085 vptr[vidx] = cur_vec ^ vecw_and_notfirst(vecw_slli(cur_vec, 1), not_m1);
1086 }
1087 }
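// Worked check of the flip above (exposition only): vecw_slli(cur_vec, 1)
// moves each entry's low bit into its high position, so
// vecw_and_notfirst(vecw_slli(cur_vec, 1), not_m1) has an entry's high bit
// set exactly when its low bit is clear; the XOR therefore maps
// 0b00 <-> 0b10 while leaving 0b01 and 0b11 unchanged.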
1088
1089 void DifflistCountSubsetFreqs(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t common_geno, uint32_t difflist_len, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
1090 STD_ARRAY_REF_FILL0(4, genocounts);
1091 for (uint32_t difflist_idx = 0; difflist_idx != difflist_len; ++difflist_idx) {
1092 const uint32_t raw_sample_idx = difflist_sample_ids[difflist_idx];
1093 if (IsSet(sample_include, raw_sample_idx)) {
1094 genocounts[GetNyparrEntry(raregeno, difflist_idx)] += 1;
1095 }
1096 }
1097 genocounts[common_geno] = sample_ct - genocounts[0] - genocounts[1] - genocounts[2] - genocounts[3];
1098 }
1099
1100
1101 static_assert(kPglNypTransposeBatch == S_CAST(uint32_t, kNypsPerCacheline), "TransposeNypblock64() needs to be updated.");
1102 #ifdef __LP64__
1103 void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1) {
1104 // buf0 and buf1 must each be vector-aligned and have size 16k
1105 // Tried using the previous AVX2 small-buffer approach, but that was a bit
1106 // worse... though maybe it should be revisited?
1107 const uint32_t buf0_row_ct = DivUp(write_batch_size, 32);
1108 {
1109 // Fold the first 3 shuffles into the ingestion loop.
1110 // Can fold 2 additional shuffles here by ingesting uint16s instead. That
1111 // removes the need for the second loop, halving the workspace requirement.
1112 // Benchmark results of that approach are slightly but consistently worse,
1113 // though.
1114 const uintptr_t* initial_read_iter = R_CAST(const uintptr_t*, read_iter);
1115 const uintptr_t* initial_read_end = &(initial_read_iter[buf0_row_ct]);
1116 uintptr_t* initial_target_iter = R_CAST(uintptr_t*, buf0);
1117 const uint32_t read_batch_rem = kNypsPerCacheline - read_batch_size;
1118 // Tried fancier vector-based ingestion, didn't help.
1119 for (; initial_read_iter != initial_read_end; ++initial_read_iter) {
1120 const uintptr_t* read_iter_tmp = initial_read_iter;
1121 for (uint32_t ujj = 0; ujj != read_batch_size; ++ujj) {
1122 *initial_target_iter++ = *read_iter_tmp;
1123 read_iter_tmp = &(read_iter_tmp[read_ul_stride]);
1124 }
1125 ZeroWArr(read_batch_rem, initial_target_iter);
1126 initial_target_iter = &(initial_target_iter[read_batch_rem]);
1127 }
1128 }
1129
1130 // shuffle from 64 -> 16
1131 const uint32_t buf1_row_ct = DivUp(write_batch_size, 8);
1132 {
1133 // full buf0 row is 256 * 8 bytes
1134 // full buf1 row is 512 bytes
1135 const VecW* buf0_read_iter = R_CAST(const VecW*, buf0);
1136 __m128i* write_iter0 = R_CAST(__m128i*, buf1);
1137 const uint32_t buf0_row_clwidth = DivUp(read_batch_size, 8);
1138 # ifdef USE_SSE42
1139 const VecW gather_u32s = vecw_setr8(0, 1, 8, 9, 2, 3, 10, 11,
1140 4, 5, 12, 13, 6, 7, 14, 15);
1141 # else
1142 const VecW m16 = VCONST_W(kMask0000FFFF);
1143 # endif
1144 for (uint32_t bidx = 0; bidx != buf0_row_ct; ++bidx) {
1145 __m128i* write_iter1 = &(write_iter0[32]);
1146 __m128i* write_iter2 = &(write_iter1[32]);
1147 __m128i* write_iter3 = &(write_iter2[32]);
1148 for (uint32_t clidx = 0; clidx != buf0_row_clwidth; ++clidx) {
1149 # ifdef USE_AVX2
1150 VecW loader0 = buf0_read_iter[clidx * 2];
1151 VecW loader1 = buf0_read_iter[clidx * 2 + 1];
1152 loader0 = vecw_shuffle8(loader0, gather_u32s);
1153 loader1 = vecw_shuffle8(loader1, gather_u32s);
1154 // (0,0) (0,1) (1,0) ... (7,1) (0,2) ... (7,3) (8,0) ... (31,3)
1155 // (0,4) ... (31,7)
1156 // -> (0,0) ... (7,1) (0,2) ... (7,3) (8,0) ... (15,3) (0,4) ... (15,7)
1157 // (16,0) ... (31,3) (16,4) ...
1158 loader0 = vecw_permute0xd8_if_avx2(loader0);
1159 loader1 = vecw_permute0xd8_if_avx2(loader1);
1160 // -> (0,0) ... (7,1) (0,2) ... (7,3) (0,4) ... (7,7) (8,0) ... (15,7)
1161 // (16,0) ... (23,7) (24,0) ... (31,7)
1162 const __m256i vec_lo = _mm256_shuffle_epi32(R_CAST(__m256i, loader0), 0xd8);
1163 const __m256i vec_hi = _mm256_shuffle_epi32(R_CAST(__m256i, loader1), 0xd8);
1164 const __m256i final0145 = _mm256_unpacklo_epi64(vec_lo, vec_hi);
1165 const __m256i final2367 = _mm256_unpackhi_epi64(vec_lo, vec_hi);
1166 // GCC doesn't support _mm256_storeu2_m128i() as of this writing.
1167 write_iter0[clidx] = _mm256_castsi256_si128(final0145);
1168 write_iter1[clidx] = _mm256_castsi256_si128(final2367);
1169 write_iter2[clidx] = _mm256_extracti128_si256(final0145, 1);
1170 write_iter3[clidx] = _mm256_extracti128_si256(final2367, 1);
1171 # else
1172 VecW loader0 = buf0_read_iter[clidx * 4];
1173 VecW loader1 = buf0_read_iter[clidx * 4 + 1];
1174 VecW loader2 = buf0_read_iter[clidx * 4 + 2];
1175 VecW loader3 = buf0_read_iter[clidx * 4 + 3];
1176 # ifdef USE_SSE42
1177 loader0 = vecw_shuffle8(loader0, gather_u32s);
1178 loader1 = vecw_shuffle8(loader1, gather_u32s);
1179 loader2 = vecw_shuffle8(loader2, gather_u32s);
1180 loader3 = vecw_shuffle8(loader3, gather_u32s);
1181 # else
1182 VecW tmp_lo = vecw_unpacklo16(loader0, loader1);
1183 VecW tmp_hi = vecw_unpackhi16(loader0, loader1);
1184 loader0 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16);
1185 loader1 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 16), m16);
1186 tmp_lo = vecw_unpacklo16(loader2, loader3);
1187 tmp_hi = vecw_unpackhi16(loader2, loader3);
1188 loader2 = vecw_blendv(vecw_slli(tmp_hi, 16), tmp_lo, m16);
1189 loader3 = vecw_blendv(tmp_hi, vecw_srli(tmp_lo, 16), m16);
1190 # endif
1191 // (0,0) (0,1) (1,0) ... (7,1) (0,2) ... (7,3) (8,0) ... (31,3)
1192 // + (0,4) ... (31,7)
1193 // -> (0,0) ... (7,3) (0,4) ... (7,7) (8,0) ... (15,7)
1194 const VecW lo_0_15 = vecw_unpacklo32(loader0, loader1);
1195 const VecW lo_16_31 = vecw_unpackhi32(loader0, loader1);
1196 const VecW hi_0_15 = vecw_unpacklo32(loader2, loader3);
1197 const VecW hi_16_31 = vecw_unpackhi32(loader2, loader3);
1198 write_iter0[clidx] = R_CAST(__m128i, vecw_unpacklo64(lo_0_15, hi_0_15));
1199 write_iter1[clidx] = R_CAST(__m128i, vecw_unpackhi64(lo_0_15, hi_0_15));
1200 write_iter2[clidx] = R_CAST(__m128i, vecw_unpacklo64(lo_16_31, hi_16_31));
1201 write_iter3[clidx] = R_CAST(__m128i, vecw_unpackhi64(lo_16_31, hi_16_31));
1202 # endif
1203 }
1204 buf0_read_iter = &(buf0_read_iter[2048 / kBytesPerVec]);
1205 write_iter0 = &(write_iter3[32]);
1206 }
1207 }
1208
1209 // movemask from 16 -> 2
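  // (Each vecw_movemask() call in the loop below grabs the high bit of every
  // byte, i.e. one packed chunk of a single output row; the slli-by-2 between
  // calls moves the next output row's bits into the bytes' high positions.)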
1210 const VecW* source_iter = R_CAST(VecW*, buf1);
1211 const VecW m8 = VCONST_W(kMask00FF);
1212
1213 // Take advantage of current function contract.
1214 const uint32_t buf1_fullrow_ct = (write_batch_size + 3) / 8;
1215
1216 const uint32_t write_v8ui_stride = kVec8thUintPerWord * write_ul_stride;
1217 const uint32_t vec_ct = DivUp(read_batch_size, (kBytesPerVec / 2));
1218 Vec8thUint* target_iter0 = R_CAST(Vec8thUint*, write_iter);
1219 for (uint32_t uii = 0; uii != buf1_fullrow_ct; ++uii) {
1220 Vec8thUint* target_iter1 = &(target_iter0[write_v8ui_stride]);
1221 Vec8thUint* target_iter2 = &(target_iter1[write_v8ui_stride]);
1222 Vec8thUint* target_iter3 = &(target_iter2[write_v8ui_stride]);
1223 Vec8thUint* target_iter4 = &(target_iter3[write_v8ui_stride]);
1224 Vec8thUint* target_iter5 = &(target_iter4[write_v8ui_stride]);
1225 Vec8thUint* target_iter6 = &(target_iter5[write_v8ui_stride]);
1226 Vec8thUint* target_iter7 = &(target_iter6[write_v8ui_stride]);
1227 for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
1228 const VecW loader = source_iter[vidx];
1229 // Using goal bit-coordinates, where '_' indicates irrelevant content, we
1230 // set target_0123 to
1231 // _ (0, 0) _ (1, 0) _ (2, 0) _ (3, 0) _ (0, 1) _ (1, 1) _ (2, 1) ...
1232 // and target_4567 to
1233 // _ (4, 0) _ (5, 0) _ (6, 0) _ (7, 0) _ (4, 1) _ (5, 1) _ (6, 1) ...
1234 // This is perfectly arranged for movemask.
1235 VecW target_4567 = vecw_blendv(loader, vecw_srli(loader, 7), m8);
1236 target_iter7[vidx] = vecw_movemask(target_4567);
1237 target_4567 = vecw_slli(target_4567, 2);
1238 target_iter6[vidx] = vecw_movemask(target_4567);
1239 target_4567 = vecw_slli(target_4567, 2);
1240 target_iter5[vidx] = vecw_movemask(target_4567);
1241 target_4567 = vecw_slli(target_4567, 2);
1242 target_iter4[vidx] = vecw_movemask(target_4567);
1243 VecW target_0123 = vecw_blendv(vecw_slli(loader, 8), vecw_slli(loader, 1), m8);
1244 target_iter3[vidx] = vecw_movemask(target_0123);
1245 target_0123 = vecw_slli(target_0123, 2);
1246 target_iter2[vidx] = vecw_movemask(target_0123);
1247 target_0123 = vecw_slli(target_0123, 2);
1248 target_iter1[vidx] = vecw_movemask(target_0123);
1249 target_0123 = vecw_slli(target_0123, 2);
1250 target_iter0[vidx] = vecw_movemask(target_0123);
1251 }
1252 source_iter = &(source_iter[(2 * kPglNypTransposeBatch) / kBytesPerVec]);
1253 target_iter0 = &(target_iter7[write_v8ui_stride]);
1254 }
1255 if (buf1_fullrow_ct == buf1_row_ct) {
1256 return;
1257 }
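  // Trailing partial row: by the function contract noted above, at most 4
  // valid output rows remain, so only target_iter0..target_iter3 are needed.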
1258 Vec8thUint* target_iter1 = &(target_iter0[write_v8ui_stride]);
1259 Vec8thUint* target_iter2 = &(target_iter1[write_v8ui_stride]);
1260 Vec8thUint* target_iter3 = &(target_iter2[write_v8ui_stride]);
1261 for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
1262 const VecW loader = source_iter[vidx];
1263 VecW target_0123 = vecw_blendv(vecw_slli(loader, 8), vecw_slli(loader, 1), m8);
1264 target_iter3[vidx] = vecw_movemask(target_0123);
1265 target_0123 = vecw_slli(target_0123, 2);
1266 target_iter2[vidx] = vecw_movemask(target_0123);
1267 target_0123 = vecw_slli(target_0123, 2);
1268 target_iter1[vidx] = vecw_movemask(target_0123);
1269 target_0123 = vecw_slli(target_0123, 2);
1270 target_iter0[vidx] = vecw_movemask(target_0123);
1271 }
1272 }
1273 #else // !__LP64__
1274 static_assert(kWordsPerVec == 1, "TransposeNypblock32() needs to be updated.");
1275 void TransposeNypblock32(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, unsigned char* __restrict buf0, unsigned char* __restrict buf1) {
1276 // buf0 and buf1 must each be vector-aligned and have size 16k
1277 // defining them as unsigned char* might prevent a strict-aliasing issue?
1278 // (might need to go through greater contortions to actually be safe?)
1279 const uint32_t buf_row_ct = NypCtToByteCt(write_batch_size);
1280 // fold the first 6 shuffles into the initial ingestion loop
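  // (Each buf0 row collects byte i from every input row, i.e. input columns
  // 4i..4i+3 across all rows, then zero-fills the remainder so the word-based
  // shuffle passes below never need bounds checks.)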
1281 const unsigned char* initial_read_iter = R_CAST(const unsigned char*, read_iter);
1282 const unsigned char* initial_read_end = &(initial_read_iter[buf_row_ct]);
1283 unsigned char* initial_target_iter = buf0;
1284 const uint32_t read_byte_stride = read_ul_stride * kBytesPerWord;
1285 const uint32_t read_batch_rem = kNypsPerCacheline - read_batch_size;
1286 for (; initial_read_iter != initial_read_end; ++initial_read_iter) {
1287 const unsigned char* read_iter_tmp = initial_read_iter;
1288 for (uint32_t ujj = 0; ujj != read_batch_size; ++ujj) {
1289 *initial_target_iter++ = *read_iter_tmp;
1290 read_iter_tmp = &(read_iter_tmp[read_byte_stride]);
1291 }
1292 initial_target_iter = memsetua(initial_target_iter, 0, read_batch_rem);
1293 }
1294
1295 // second-to-last shuffle, 8 bit spacing -> 4
1296 const VecW* source_iter = R_CAST(VecW*, buf0);
1297 uintptr_t* target_iter0 = R_CAST(uintptr_t*, buf1);
1298 const uint32_t write_word_ct = NypCtToWordCt(read_batch_size);
1299 const uint32_t penult_inner_loop_iter_ct = 2 * write_word_ct;
1300 const uint32_t cur_write_skip = 2 * kWordsPerCacheline - penult_inner_loop_iter_ct;
1301 for (uint32_t uii = 0; uii != buf_row_ct; ++uii) {
1302 uintptr_t* target_iter1 = &(target_iter0[kWordsPerCacheline * 2]);
1303 for (uint32_t ujj = 0; ujj != penult_inner_loop_iter_ct; ++ujj) {
1304 const uintptr_t source_word_lo = *source_iter++;
1305 const uintptr_t source_word_hi = *source_iter++;
1306 uintptr_t target_word0_lo = source_word_lo & kMask0F0F;
1307 uintptr_t target_word1_lo = (source_word_lo >> 4) & kMask0F0F;
1308 uintptr_t target_word0_hi = source_word_hi & kMask0F0F;
1309 uintptr_t target_word1_hi = (source_word_hi >> 4) & kMask0F0F;
1310 target_word0_lo = (target_word0_lo | (target_word0_lo >> 4)) & kMask00FF;
1311 target_word1_lo = (target_word1_lo | (target_word1_lo >> 4)) & kMask00FF;
1312 target_word0_hi = (target_word0_hi | (target_word0_hi >> 4)) & kMask00FF;
1313 target_word1_hi = (target_word1_hi | (target_word1_hi >> 4)) & kMask00FF;
1314 target_word0_lo = target_word0_lo | (target_word0_lo >> kBitsPerWordD4);
1315 target_word1_lo = target_word1_lo | (target_word1_lo >> kBitsPerWordD4);
1316 target_word0_hi = target_word0_hi | (target_word0_hi >> kBitsPerWordD4);
1317 target_word1_hi = target_word1_hi | (target_word1_hi >> kBitsPerWordD4);
1318 *target_iter0++ = S_CAST(Halfword, target_word0_lo) | (target_word0_hi << kBitsPerWordD2);
1319 *target_iter1++ = S_CAST(Halfword, target_word1_lo) | (target_word1_hi << kBitsPerWordD2);
1320 }
1321 source_iter = &(source_iter[2 * cur_write_skip]);
1322 target_iter0 = &(target_iter1[cur_write_skip]);
1323 }
1324
1325 // last shuffle, 4 bit spacing -> 2
1326 source_iter = R_CAST(VecW*, buf1);
1327 target_iter0 = write_iter;
1328 const uint32_t last_loop_iter_ct = DivUp(write_batch_size, 2);
1329 for (uint32_t uii = 0; uii != last_loop_iter_ct; ++uii) {
1330 uintptr_t* target_iter1 = &(target_iter0[write_ul_stride]);
1331 for (uint32_t ujj = 0; ujj != write_word_ct; ++ujj) {
1332 const uintptr_t source_word_lo = *source_iter++;
1333 const uintptr_t source_word_hi = *source_iter++;
1334 uintptr_t target_word0_lo = source_word_lo & kMask3333;
1335 uintptr_t target_word1_lo = (source_word_lo >> 2) & kMask3333;
1336 uintptr_t target_word0_hi = source_word_hi & kMask3333;
1337 uintptr_t target_word1_hi = (source_word_hi >> 2) & kMask3333;
1338 target_word0_lo = (target_word0_lo | (target_word0_lo >> 2)) & kMask0F0F;
1339 target_word1_lo = (target_word1_lo | (target_word1_lo >> 2)) & kMask0F0F;
1340 target_word0_hi = (target_word0_hi | (target_word0_hi >> 2)) & kMask0F0F;
1341 target_word1_hi = (target_word1_hi | (target_word1_hi >> 2)) & kMask0F0F;
1342 target_word0_lo = (target_word0_lo | (target_word0_lo >> 4)) & kMask00FF;
1343 target_word1_lo = (target_word1_lo | (target_word1_lo >> 4)) & kMask00FF;
1344 target_word0_hi = (target_word0_hi | (target_word0_hi >> 4)) & kMask00FF;
1345 target_word1_hi = (target_word1_hi | (target_word1_hi >> 4)) & kMask00FF;
1346 target_word0_lo = target_word0_lo | (target_word0_lo >> kBitsPerWordD4);
1347 target_word1_lo = target_word1_lo | (target_word1_lo >> kBitsPerWordD4);
1348 target_word0_hi = target_word0_hi | (target_word0_hi >> kBitsPerWordD4);
1349 target_word1_hi = target_word1_hi | (target_word1_hi >> kBitsPerWordD4);
1350 target_iter0[ujj] = S_CAST(Halfword, target_word0_lo) | (target_word0_hi << kBitsPerWordD2);
1351 target_iter1[ujj] = S_CAST(Halfword, target_word1_lo) | (target_word1_hi << kBitsPerWordD2);
1352 }
1353 source_iter = &(source_iter[2 * (kWordsPerCacheline - write_word_ct)]);
1354 target_iter0 = &(target_iter1[write_ul_stride]);
1355 }
1356 }
1357 #endif // !__LP64__
1358
1359 void BiallelicDosage16Invert(uint32_t dosage_ct, uint16_t* dosage_main) {
1360 // replace each x with (32768 - x).
1361 // compiler is smart enough to vectorize this.
1362 for (uint32_t uii = 0; uii != dosage_ct; ++uii) {
1363 dosage_main[uii] = 32768 - dosage_main[uii];
1364 }
1365 }
1366
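// Negates each signed phased-dosage difference; this is the dphase-track
// counterpart of the dosage inversion above.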
1367 void BiallelicDphase16Invert(uint32_t dphase_ct, int16_t* dphase_delta) {
1368 for (uint32_t uii = 0; uii != dphase_ct; ++uii) {
1369 dphase_delta[uii] = -dphase_delta[uii];
1370 }
1371 }
1372
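// Missing genotypes are encoded as 0b11, so (word & (word >> 1)) has the low
// bit of each 2-bit entry set exactly for missing samples, and
// PackWordToHalfwordMask5555() condenses that to one bit per sample.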
1373 void GenoarrToMissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict missingness) {
1374 const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
1375 Halfword* missingness_alias = R_CAST(Halfword*, missingness);
1376 for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
1377 const uintptr_t cur_geno_word = genoarr[widx];
1378 missingness_alias[widx] = PackWordToHalfwordMask5555(cur_geno_word & (cur_geno_word >> 1));
1379 }
1380 if (sample_ctl2 % 2) {
1381 missingness_alias[sample_ctl2] = 0;
1382 }
1383 }
1384
1385 void GenoarrToNonmissingnessUnsafe(const uintptr_t* __restrict genoarr, uint32_t sample_ct, uintptr_t* __restrict nonmissingness) {
1386 const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
1387 Halfword* nonmissingness_alias = R_CAST(Halfword*, nonmissingness);
1388 for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
1389 const uintptr_t cur_geno_word = genoarr[widx];
1390 nonmissingness_alias[widx] = PackWordToHalfwordMask5555(~(cur_geno_word & (cur_geno_word >> 1)));
1391 }
1392 }
1393
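// With the 0=hom-ref, 1=het, 2=hom-alt, 3=missing encoding, the low bit of
// ~genotype is set for {0, 2} (homozygous) and the high bit for {0, 1}
// (hom-ref or het); the two packed outputs below collect exactly those bits.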
1394 void SplitHomRef2hetUnsafeW(const uintptr_t* genoarr, uint32_t inword_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf) {
1395 Halfword* hom_alias = R_CAST(Halfword*, hom_buf);
1396 Halfword* ref2het_alias = R_CAST(Halfword*, ref2het_buf);
1397 for (uint32_t widx = 0; widx != inword_ct; ++widx) {
1398 const uintptr_t geno_word = genoarr[widx];
1399 hom_alias[widx] = PackWordToHalfwordMask5555(~geno_word);
1400 ref2het_alias[widx] = PackWordToHalfwordMaskAAAA(~geno_word);
1401 }
1402 }
1403
1404 void SplitHomRef2het(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* __restrict hom_buf, uintptr_t* __restrict ref2het_buf) {
1405 const uint32_t full_outword_ct = sample_ct / kBitsPerWord;
1406 SplitHomRef2hetUnsafeW(genoarr, full_outword_ct * 2, hom_buf, ref2het_buf);
1407 const uint32_t remainder = sample_ct % kBitsPerWord;
1408 if (remainder) {
1409 uintptr_t geno_word = genoarr[full_outword_ct * 2];
1410 uintptr_t hom_word = PackWordToHalfwordMask5555(~geno_word);
1411 uintptr_t ref2het_word = PackWordToHalfwordMaskAAAA(~geno_word);
1412 if (remainder > kBitsPerWordD2) {
1413 geno_word = genoarr[full_outword_ct * 2 + 1];
1414 hom_word |= S_CAST(uintptr_t, PackWordToHalfwordMask5555(~geno_word)) << kBitsPerWordD2;
1415 ref2het_word |= S_CAST(uintptr_t, PackWordToHalfwordMaskAAAA(~geno_word)) << kBitsPerWordD2;
1416 }
1417 const uintptr_t cur_mask = (k1LU << remainder) - 1;
1418 hom_buf[full_outword_ct] = hom_word & cur_mask;
1419 ref2het_buf[full_outword_ct] = ref2het_word & cur_mask;
1420 }
1421 }
1422
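// Lookup-table naming convention used below: "<entry count>x<bytes per output
// value>[bx<values per entry>]".  E.g. a 256x2bx4 table has 256 entries, each
// holding four 2-byte output values (one per genotype in the input byte).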
1423 // Note that once the result elements are only 4 bits wide, shuffle8 should
1424 // beat a 256x4 lookup table when SSE4 is available. (possible todo: see if
1425 // shuffle8-based loop also wins here.)
1426 void GenoarrLookup256x1bx4(const uintptr_t* genoarr, const void* table256x1bx4, uint32_t sample_ct, void* __restrict result) {
1427 const uint32_t* table_alias = S_CAST(const uint32_t*, table256x1bx4);
1428 const unsigned char* genoarr_alias = R_CAST(const unsigned char*, genoarr);
1429 uint32_t* result_alias = S_CAST(uint32_t*, result);
1430 const uint32_t full_byte_ct = sample_ct / 4;
1431 for (uint32_t byte_idx = 0; byte_idx != full_byte_ct; ++byte_idx) {
1432 result_alias[byte_idx] = table_alias[genoarr_alias[byte_idx]];
1433 }
1434 const uint32_t remainder = sample_ct % 4;
1435 if (remainder) {
1436 unsigned char* result_last = R_CAST(unsigned char*, &(result_alias[full_byte_ct]));
1437 uintptr_t geno_byte = genoarr_alias[full_byte_ct];
1438 for (uint32_t uii = 0; uii != remainder; ++uii) {
1439 result_last[uii] = table_alias[geno_byte & 3];
1440 geno_byte >>= 2;
1441 }
1442 }
1443 }
1444
1445 void GenoarrLookup16x4bx2(const uintptr_t* genoarr, const void* table16x4bx2, uint32_t sample_ct, void* __restrict result) {
1446 const uint64_t* table_alias = S_CAST(const uint64_t*, table16x4bx2);
1447 uint64_t* result_iter = S_CAST(uint64_t*, result);
1448 const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1449 uint32_t loop_len = kBitsPerWordD4;
1450 uintptr_t geno_word = 0;
1451 for (uint32_t widx = 0; ; ++widx) {
1452 if (widx >= sample_ctl2m1) {
1453 if (widx > sample_ctl2m1) {
1454 if (sample_ct % 2) {
1455 memcpy(result_iter, &(table_alias[geno_word & 3]), 4);
1456 }
1457 return;
1458 }
1459 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1460 }
1461 geno_word = genoarr[widx];
1462 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1463 const uintptr_t cur_2geno = geno_word & 15;
1464 *result_iter++ = table_alias[cur_2geno];
1465 geno_word >>= 4;
1466 }
1467 }
1468 }
1469
1470 // this might be important for genovec -> AlleleCode expansion
1471 void GenoarrLookup256x2bx4(const uintptr_t* genoarr, const void* table256x2bx4, uint32_t sample_ct, void* __restrict result) {
1472 const uint64_t* table_alias = S_CAST(const uint64_t*, table256x2bx4);
1473 const unsigned char* genoarr_alias = R_CAST(const unsigned char*, genoarr);
1474 uint64_t* result_alias = S_CAST(uint64_t*, result);
1475 const uint32_t full_byte_ct = sample_ct / 4;
1476 for (uint32_t byte_idx = 0; byte_idx != full_byte_ct; ++byte_idx) {
1477 result_alias[byte_idx] = table_alias[genoarr_alias[byte_idx]];
1478 }
1479 const uint32_t remainder = sample_ct % 4;
1480 if (remainder) {
1481 uint16_t* result_last = R_CAST(uint16_t*, &(result_alias[full_byte_ct]));
1482 uintptr_t geno_byte = genoarr_alias[full_byte_ct];
1483 for (uint32_t uii = 0; uii != remainder; ++uii) {
1484 memcpy_k(&(result_last[uii]), &(table_alias[geno_byte & 3]), 2);
1485 geno_byte >>= 2;
1486 }
1487 }
1488 }
1489
1490 #ifdef __LP64__
1491 void GenoarrLookup4x16b(const uintptr_t* genoarr, const void* table4x16b, uint32_t sample_ct, void* result) {
1492 const __m128i* table_alias = S_CAST(const __m128i*, table4x16b);
1493 __m128i* result_iter = S_CAST(__m128i*, result);
1494 const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1495 uint32_t loop_len = kBitsPerWordD2;
1496 uintptr_t geno_word = 0;
1497 for (uint32_t widx = 0; ; ++widx) {
1498 if (widx >= sample_ctl2m1) {
1499 if (widx > sample_ctl2m1) {
1500 return;
1501 }
1502 loop_len = ModNz(sample_ct, kBitsPerWordD2);
1503 }
1504 geno_word = genoarr[widx];
1505 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1506 _mm_storeu_si128(result_iter, table_alias[geno_word & 3]);
1507 ++result_iter;
1508 geno_word >>= 2;
1509 }
1510 }
1511 }
1512
1513 void GenoarrLookup16x8bx2(const uintptr_t* genoarr, const void* table16x8bx2, uint32_t sample_ct, void* __restrict result) {
1514 const __m128i* table_alias = S_CAST(const __m128i*, table16x8bx2);
1515 __m128i* result_iter = S_CAST(__m128i*, result);
1516 const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1517 uint32_t loop_len = kBitsPerWordD4;
1518 uintptr_t geno_word = 0;
1519 for (uint32_t widx = 0; ; ++widx) {
1520 if (widx >= sample_ctl2m1) {
1521 if (widx > sample_ctl2m1) {
1522 if (sample_ct % 2) {
1523 memcpy(result_iter, &(table_alias[geno_word & 3]), 8);
1524 }
1525 return;
1526 }
1527 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1528 }
1529 geno_word = genoarr[widx];
1530 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1531 const uintptr_t cur_2geno = geno_word & 15;
1532 _mm_storeu_si128(result_iter, table_alias[cur_2geno]);
1533 ++result_iter;
1534 geno_word >>= 4;
1535 }
1536 }
1537 }
1538
1539 void GenoarrLookup256x4bx4(const uintptr_t* genoarr, const void* table256x4bx4, uint32_t sample_ct, void* __restrict result) {
1540 const __m128i* table_alias = S_CAST(const __m128i*, table256x4bx4);
1541 const unsigned char* genoarr_alias = R_CAST(const unsigned char*, genoarr);
1542 __m128i* result_alias = S_CAST(__m128i*, result);
1543 const uint32_t full_byte_ct = sample_ct / 4;
1544 for (uint32_t byte_idx = 0; byte_idx != full_byte_ct; ++byte_idx) {
1545 _mm_storeu_si128(&(result_alias[byte_idx]), table_alias[genoarr_alias[byte_idx]]);
1546 }
1547 const uint32_t remainder = sample_ct % 4;
1548 if (remainder) {
1549 uint32_t* result_last = R_CAST(uint32_t*, &(result_alias[full_byte_ct]));
1550 uintptr_t geno_byte = genoarr_alias[full_byte_ct];
1551 for (uint32_t uii = 0; uii != remainder; ++uii) {
1552 memcpy(&(result_last[uii]), &(table_alias[geno_byte & 3]), 4);
1553 geno_byte >>= 2;
1554 }
1555 }
1556 }
1557 #else
1558 void GenoarrLookup4x16b(const uintptr_t* genoarr, const void* table4x16b, uint32_t sample_ct, void* result) {
1559 const uint64_t* table_alias = S_CAST(const uint64_t*, table4x16b);
1560 uint64_t* result_iter = S_CAST(uint64_t*, result);
1561 const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1562 uint32_t loop_len = kBitsPerWordD2;
1563 uintptr_t geno_word = 0;
1564 for (uint32_t widx = 0; ; ++widx) {
1565 if (widx >= sample_ctl2m1) {
1566 if (widx > sample_ctl2m1) {
1567 return;
1568 }
1569 loop_len = ModNz(sample_ct, kBitsPerWordD2);
1570 }
1571 geno_word = genoarr[widx];
1572 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1573 memcpy(result_iter, &(table_alias[(geno_word & 3) * 2]), 16);
1574 result_iter = &(result_iter[2]);
1575 geno_word >>= 2;
1576 }
1577 }
1578 }
1579
1580 void GenoarrLookup16x8bx2(const uintptr_t* genoarr, const void* table16x8bx2, uint32_t sample_ct, void* __restrict result) {
1581 const uint64_t* table_alias = S_CAST(const uint64_t*, table16x8bx2);
1582 uint64_t* result_iter = S_CAST(uint64_t*, result);
1583 const uint32_t sample_ctl2m1 = (sample_ct - 1) / kBitsPerWordD2;
1584 uint32_t loop_len = kBitsPerWordD4;
1585 uintptr_t geno_word = 0;
1586 for (uint32_t widx = 0; ; ++widx) {
1587 if (widx >= sample_ctl2m1) {
1588 if (widx > sample_ctl2m1) {
1589 if (sample_ct % 2) {
1590 memcpy(result_iter, &(table_alias[(geno_word & 3) * 2]), 8);
1591 }
1592 return;
1593 }
1594 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1595 }
1596 geno_word = genoarr[widx];
1597 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1598 const uintptr_t cur_2geno = geno_word & 15;
1599 memcpy(result_iter, &(table_alias[cur_2geno * 2]), 16);
1600 result_iter = &(result_iter[2]);
1601 geno_word >>= 4;
1602 }
1603 }
1604 }
1605
1606 void GenoarrLookup256x4bx4(const uintptr_t* genoarr, const void* table256x4bx4, uint32_t sample_ct, void* __restrict result) {
1607 const uint32_t* table_alias = S_CAST(const uint32_t*, table256x4bx4);
1608 const unsigned char* genoarr_alias = R_CAST(const unsigned char*, genoarr);
1609 uint32_t* result_alias = S_CAST(uint32_t*, result);
1610 const uint32_t full_byte_ct = sample_ct / 4;
1611 for (uint32_t byte_idx = 0; byte_idx != full_byte_ct; ++byte_idx) {
1612 memcpy(&(result_alias[byte_idx * 4]), &(table_alias[genoarr_alias[byte_idx] * 4]), 16);
1613 }
1614 const uint32_t remainder = sample_ct % 4;
1615 if (remainder) {
1616 uint32_t* result_last = &(result_alias[full_byte_ct * 4]);
1617 uintptr_t geno_byte = genoarr_alias[full_byte_ct];
1618 for (uint32_t uii = 0; uii != remainder; ++uii) {
1619 memcpy(&(result_last[uii]), &(table_alias[(geno_byte & 3) * 4]), 4);
1620 geno_byte >>= 2;
1621 }
1622 }
1623 }
1624 #endif
1625
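// The InitLookup* functions below expand sparsely pre-filled tables: the
// caller supplies only the output value for each single genotype (with all
// higher-order genotypes zero), and the loops replicate those four values
// across every remaining genotype combination.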
1626 void InitLookup16x4bx2(void* table16x4bx2) {
1627 uint32_t* table_iter = S_CAST(uint32_t*, table16x4bx2);
1628 uint32_t vals[4];
1629 vals[0] = table_iter[0];
1630 table_iter[1] = vals[0];
1631 vals[1] = table_iter[2];
1632 table_iter[3] = vals[0];
1633 vals[2] = table_iter[4];
1634 table_iter[5] = vals[0];
1635 vals[3] = table_iter[6];
1636 table_iter[7] = vals[0];
1637 table_iter = &(table_iter[8]);
1638 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1639 const uint32_t cur_high = vals[high_idx];
1640 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1641 *table_iter++ = vals[low_idx];
1642 *table_iter++ = cur_high;
1643 }
1644 }
1645 }
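
// Illustrative usage sketch (not part of the library; genovec, sample_ct, and
// dst are assumed caller-provided): to expand a genotype vector to one float
// per sample, a caller could fill the four base entries, expand, then look up:
//   uint64_t table[16];
//   unsigned char* table_uc = (unsigned char*)table;
//   const float geno_to_val[4] = {0.0f, 1.0f, 2.0f, -9.0f};  // arbitrary values
//   memcpy(&table_uc[0], &geno_to_val[0], 4);
//   memcpy(&table_uc[8], &geno_to_val[1], 4);
//   memcpy(&table_uc[16], &geno_to_val[2], 4);
//   memcpy(&table_uc[24], &geno_to_val[3], 4);
//   InitLookup16x4bx2(table);
//   GenoarrLookup16x4bx2(genovec, table, sample_ct, dst);  // dst: float[sample_ct]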
1646
1647 void InitLookup16x8bx2(void* table16x8bx2) {
1648 uint64_t* table_iter = S_CAST(uint64_t*, table16x8bx2);
1649 uint64_t vals[4];
1650 vals[0] = table_iter[0];
1651 table_iter[1] = vals[0];
1652 vals[1] = table_iter[2];
1653 table_iter[3] = vals[0];
1654 vals[2] = table_iter[4];
1655 table_iter[5] = vals[0];
1656 vals[3] = table_iter[6];
1657 table_iter[7] = vals[0];
1658 table_iter = &(table_iter[8]);
1659 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1660 // bugfix (20 Jun 2018): cur_high needs to be a uint64_t, not a uint32_t
1661 const uint64_t cur_high = vals[high_idx];
1662 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1663 *table_iter++ = vals[low_idx];
1664 *table_iter++ = cur_high;
1665 }
1666 }
1667 }
1668
1669 void InitLookup256x1bx4(void* table256x1bx4) {
1670 unsigned char* table_iter = S_CAST(unsigned char*, table256x1bx4);
1671 unsigned char vals[4];
1672 vals[0] = table_iter[0];
1673 vals[1] = table_iter[4];
1674 vals[2] = table_iter[8];
1675 vals[3] = table_iter[12];
1676 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
1677 const uint32_t cur_high = vals[high_idx];
1678 for (uint32_t second_idx = 0; second_idx != 4; ++second_idx) {
1679 const uint32_t cur_second = vals[second_idx];
1680 for (uint32_t third_idx = 0; third_idx != 4; ++third_idx) {
1681 const uint32_t cur_third = vals[third_idx];
1682 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1683 *table_iter++ = vals[low_idx];
1684 *table_iter++ = cur_third;
1685 *table_iter++ = cur_second;
1686 *table_iter++ = cur_high;
1687 }
1688 }
1689 }
1690 }
1691 }
1692
1693 void InitLookup256x2bx4(void* table256x2bx4) {
1694 uint16_t* table_iter = S_CAST(uint16_t*, table256x2bx4);
1695 uint16_t vals[4];
1696 vals[0] = table_iter[0];
1697 vals[1] = table_iter[4];
1698 vals[2] = table_iter[8];
1699 vals[3] = table_iter[12];
1700 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
1701 const uint32_t cur_high = vals[high_idx];
1702 for (uint32_t second_idx = 0; second_idx != 4; ++second_idx) {
1703 const uint32_t cur_second = vals[second_idx];
1704 for (uint32_t third_idx = 0; third_idx != 4; ++third_idx) {
1705 const uint32_t cur_third = vals[third_idx];
1706 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1707 *table_iter++ = vals[low_idx];
1708 *table_iter++ = cur_third;
1709 *table_iter++ = cur_second;
1710 *table_iter++ = cur_high;
1711 }
1712 }
1713 }
1714 }
1715 }
1716
1717 void InitLookup256x4bx4(void* table256x4bx4) {
1718 uint32_t* table_iter = S_CAST(uint32_t*, table256x4bx4);
1719 uint32_t vals[4];
1720 vals[0] = table_iter[0];
1721 vals[1] = table_iter[4];
1722 vals[2] = table_iter[8];
1723 vals[3] = table_iter[12];
1724 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
1725 const uint32_t cur_high = vals[high_idx];
1726 for (uint32_t second_idx = 0; second_idx != 4; ++second_idx) {
1727 const uint32_t cur_second = vals[second_idx];
1728 for (uint32_t third_idx = 0; third_idx != 4; ++third_idx) {
1729 const uint32_t cur_third = vals[third_idx];
1730 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1731 *table_iter++ = vals[low_idx];
1732 *table_iter++ = cur_third;
1733 *table_iter++ = cur_second;
1734 *table_iter++ = cur_high;
1735 }
1736 }
1737 }
1738 }
1739 }
1740
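// Index layout for the 56x4bx2 table: bits 0..3 hold two genotypes, bits 4..5
// the two phasepresent bits, and phaseinfo is xor'ed into bit 1 (first
// sample) and bit 2 (second sample).  Since only hets (genotype 01) can be
// phased, the largest reachable index is 55.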
1741 void PhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x4bx2, uint32_t sample_ct, void* __restrict result) {
1742 const uint64_t* table_alias = S_CAST(const uint64_t*, table56x4bx2);
1743 const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
1744 const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
1745 const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
1746 uint64_t* result_iter = S_CAST(uint64_t*, result);
1747 uint32_t loop_len = kBitsPerWordD4;
1748 uintptr_t geno_word = 0;
1749 uintptr_t phasepresent_hw_shifted = 0;
1750 uintptr_t phaseinfo_hw_shifted = 0;
1751 for (uint32_t widx = 0; ; ++widx) {
1752 if (widx >= sample_ctl2_m1) {
1753 if (widx > sample_ctl2_m1) {
1754 if (sample_ct % 2) {
1755 uintptr_t cur_idx = (geno_word & 3);
1756 // assume trailing bits of phasepresent/phaseinfo clear
1757 // phaseinfo_hw_shifted not necessarily updated, so need if-statement
1758 // bugfix (25 Jun 2018): must only consider active bit of
1759 // phasepresent_hw_shifted, not the already-processed ones
1760 if (phasepresent_hw_shifted & 16) {
1761 cur_idx ^= 16 | (phaseinfo_hw_shifted & 2);
1762 }
1763 memcpy(result_iter, &(table_alias[cur_idx]), 4);
1764 }
1765 return;
1766 }
1767 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1768 }
1769 geno_word = genoarr[widx];
1770 phasepresent_hw_shifted = phasepresent_alias[widx];
1771 if (!phasepresent_hw_shifted) {
1772 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1773 *result_iter++ = table_alias[geno_word & 15];
1774 geno_word >>= 4;
1775 }
1776 } else {
1777 phasepresent_hw_shifted = phasepresent_hw_shifted << 4;
1778 phaseinfo_hw_shifted = phaseinfo_alias[widx];
1779
1780 // note that this must be on a separate line (or we have to static_cast)
1781 phaseinfo_hw_shifted = phaseinfo_hw_shifted << 1;
1782
1783 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1784 const uintptr_t cur_idx = ((geno_word & 15) | (phasepresent_hw_shifted & 48)) ^ (phaseinfo_hw_shifted & 6);
1785 *result_iter++ = table_alias[cur_idx];
1786 geno_word >>= 4;
1787 phasepresent_hw_shifted >>= 2;
1788 phaseinfo_hw_shifted >>= 2;
1789 }
1790 }
1791 }
1792 }
1793
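// Expands a table where the caller has pre-filled six 4-byte values: the four
// unphased genotype results at uint32 offsets 0/2/4/6, plus the phased-het
// results (unflipped at entry [17][0], flipped at entry [19][0]).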
1794 void InitPhaseLookup4b(void* table56x4bx2) {
1795 uint32_t* table_iter = S_CAST(uint32_t*, table56x4bx2);
1796 uint32_t vals[4];
1797 vals[0] = table_iter[0];
1798 table_iter[1] = vals[0];
1799 vals[1] = table_iter[2];
1800 table_iter[3] = vals[0];
1801 vals[2] = table_iter[4];
1802 table_iter[5] = vals[0];
1803 vals[3] = table_iter[6];
1804 table_iter[7] = vals[0];
1805 table_iter = &(table_iter[8]);
1806 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1807 const uint32_t cur_high = vals[high_idx];
1808 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1809 *table_iter++ = vals[low_idx];
1810 *table_iter++ = cur_high;
1811 }
1812 }
1813 // [16][0]..[31][1]: bit 4 is set
1814 // low bits must be 01 or 11
1815 const uint32_t val_phaseinfo0 = table_iter[2];
1816 table_iter[3] = vals[0];
1817 const uint32_t val_phaseinfo1 = table_iter[6];
1818 table_iter[7] = vals[0];
1819 table_iter = &(table_iter[8]);
1820 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1821 const uint32_t cur_high = vals[high_idx];
1822 table_iter[2] = val_phaseinfo0;
1823 table_iter[3] = cur_high;
1824 table_iter[6] = val_phaseinfo1;
1825 table_iter[7] = cur_high;
1826 table_iter = &(table_iter[8]);
1827 }
1828 // [32][0]..[39][1]: bit 5 set, bit 4 unset
1829 // high bits must be 00 or 01
1830 for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
1831 const uint32_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
1832 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1833 *table_iter++ = vals[low_idx];
1834 *table_iter++ = cur_high;
1835 }
1836 }
1837 table_iter = &(table_iter[16]);
1838 // [48][0]..[55][1]: bits 4 and 5 set
1839 for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
1840 const uint32_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
1841 table_iter[2] = val_phaseinfo0;
1842 table_iter[3] = cur_high;
1843 table_iter[6] = val_phaseinfo1;
1844 table_iter[7] = cur_high;
1845 table_iter = &(table_iter[8]);
1846 }
1847 }
1848
1849 #ifdef __LP64__
1850 void PhaseLookup8b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x8bx2, uint32_t sample_ct, void* __restrict result) {
1851 const __m128i* table_alias = S_CAST(const __m128i*, table56x8bx2);
1852 const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
1853 const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
1854 const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
1855 __m128i* result_iter = S_CAST(__m128i*, result);
1856 uint32_t loop_len = kBitsPerWordD4;
1857 uintptr_t geno_word = 0;
1858 uintptr_t phasepresent_hw_shifted = 0;
1859 uintptr_t phaseinfo_hw_shifted = 0;
1860 for (uint32_t widx = 0; ; ++widx) {
1861 if (widx >= sample_ctl2_m1) {
1862 if (widx > sample_ctl2_m1) {
1863 if (sample_ct % 2) {
1864 uintptr_t cur_idx = (geno_word & 3);
1865 if (phasepresent_hw_shifted & 16) {
1866 cur_idx ^= 16 | (phaseinfo_hw_shifted & 2);
1867 }
1868 memcpy(result_iter, &(table_alias[cur_idx]), 8);
1869 }
1870 return;
1871 }
1872 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1873 }
1874 geno_word = genoarr[widx];
1875 phasepresent_hw_shifted = phasepresent_alias[widx];
1876 if (!phasepresent_hw_shifted) {
1877 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1878 _mm_storeu_si128(result_iter, table_alias[geno_word & 15]);
1879 ++result_iter;
1880 geno_word >>= 4;
1881 }
1882 } else {
1883 phasepresent_hw_shifted = phasepresent_hw_shifted << 4;
1884 phaseinfo_hw_shifted = phaseinfo_alias[widx];
1885 phaseinfo_hw_shifted = phaseinfo_hw_shifted << 1;
1886 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1887 const uintptr_t cur_idx = ((geno_word & 15) | (phasepresent_hw_shifted & 48)) ^ (phaseinfo_hw_shifted & 6);
1888 _mm_storeu_si128(result_iter, table_alias[cur_idx]);
1889 ++result_iter;
1890 geno_word >>= 4;
1891 phasepresent_hw_shifted >>= 2;
1892 phaseinfo_hw_shifted >>= 2;
1893 }
1894 }
1895 }
1896 }
1897 #else
1898 void PhaseLookup8b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const void* table56x8bx2, uint32_t sample_ct, void* __restrict result) {
1899 const uint64_t* table_alias = S_CAST(const uint64_t*, table56x8bx2);
1900 const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
1901 const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
1902 const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
1903 uint64_t* result_iter = S_CAST(uint64_t*, result);
1904 uint32_t loop_len = kBitsPerWordD4;
1905 uintptr_t geno_word = 0;
1906 uintptr_t phasepresent_hw_shifted = 0;
1907 uintptr_t phaseinfo_hw_shifted = 0;
1908 for (uint32_t widx = 0; ; ++widx) {
1909 if (widx >= sample_ctl2_m1) {
1910 if (widx > sample_ctl2_m1) {
1911 if (sample_ct % 2) {
1912 uintptr_t cur_idx = (geno_word & 3);
1913 if (phasepresent_hw_shifted & 16) {
1914 cur_idx ^= 16 | (phaseinfo_hw_shifted & 2);
1915 }
1916 memcpy(result_iter, &(table_alias[cur_idx * 2]), 8);
1917 }
1918 return;
1919 }
1920 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
1921 }
1922 geno_word = genoarr[widx];
1923 phasepresent_hw_shifted = phasepresent_alias[widx];
1924 if (!phasepresent_hw_shifted) {
1925 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1926 memcpy(result_iter, &(table_alias[(geno_word & 15) * 2]), 16);
1927 result_iter = &(result_iter[2]);
1928 geno_word >>= 4;
1929 }
1930 } else {
1931 phasepresent_hw_shifted = phasepresent_hw_shifted << 4;
1932 phaseinfo_hw_shifted = phaseinfo_alias[widx];
1933 phaseinfo_hw_shifted = phaseinfo_hw_shifted << 1;
1934 for (uint32_t uii = 0; uii != loop_len; ++uii) {
1935 const uintptr_t cur_idx = ((geno_word & 15) | (phasepresent_hw_shifted & 48)) ^ (phaseinfo_hw_shifted & 6);
1936         memcpy(result_iter, &(table_alias[cur_idx * 2]), 16);
        result_iter = &(result_iter[2]);  // advance output pointer (matches the unphased branch above)
1937 geno_word >>= 4;
1938 phasepresent_hw_shifted >>= 2;
1939 phaseinfo_hw_shifted >>= 2;
1940 }
1941 }
1942 }
1943 }
1944 #endif
1945
1946 void InitPhaseLookup8b(void* table56x8bx2) {
1947 uint64_t* table_iter = S_CAST(uint64_t*, table56x8bx2);
1948 uint64_t vals[4];
1949 vals[0] = table_iter[0];
1950 table_iter[1] = vals[0];
1951 vals[1] = table_iter[2];
1952 table_iter[3] = vals[0];
1953 vals[2] = table_iter[4];
1954 table_iter[5] = vals[0];
1955 vals[3] = table_iter[6];
1956 table_iter[7] = vals[0];
1957 table_iter = &(table_iter[8]);
1958 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1959 const uint64_t cur_high = vals[high_idx];
1960 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1961 *table_iter++ = vals[low_idx];
1962 *table_iter++ = cur_high;
1963 }
1964 }
1965 // [16][0]..[31][1]: bit 4 is set
1966 // low bits must be 01 or 11
1967 const uint64_t val_phaseinfo0 = table_iter[2];
1968 table_iter[3] = vals[0];
1969 const uint64_t val_phaseinfo1 = table_iter[6];
1970 table_iter[7] = vals[0];
1971 table_iter = &(table_iter[8]);
1972 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
1973 const uint64_t cur_high = vals[high_idx];
1974 table_iter[2] = val_phaseinfo0;
1975 table_iter[3] = cur_high;
1976 table_iter[6] = val_phaseinfo1;
1977 table_iter[7] = cur_high;
1978 table_iter = &(table_iter[8]);
1979 }
1980 // [32][0]..[39][1]: bit 5 set, bit 4 unset
1981 // high bits must be 00 or 01
1982 for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
1983 const uint64_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
1984 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
1985 *table_iter++ = vals[low_idx];
1986 *table_iter++ = cur_high;
1987 }
1988 }
1989 table_iter = &(table_iter[16]);
1990 // [48][0]..[55][1]: bits 4 and 5 set
1991 for (uint32_t high_idx = 0; high_idx != 2; ++high_idx) {
1992 const uint64_t cur_high = high_idx? val_phaseinfo0 : val_phaseinfo1;
1993 table_iter[2] = val_phaseinfo0;
1994 table_iter[3] = cur_high;
1995 table_iter[6] = val_phaseinfo1;
1996 table_iter[7] = cur_high;
1997 table_iter = &(table_iter[8]);
1998 }
1999 }
2000
2001 // bits 0..3: two genotypes
2002 // bits 4..5: two (phasepresent | sex_male) bits
2003 // bits 1,3: unpacked phaseinfo xor
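// Unlike PhaseLookup4b, phaseinfo is xor'ed directly into the genotype word
// before lookup, and bits 4..5 can be set for any genotype, so all 64 table
// entries are reachable.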
2004 void PhaseXNohhLookup4b(const uintptr_t* genoarr, const uintptr_t* phasepresent, const uintptr_t* phaseinfo, const uintptr_t* sex_male, const void* table64x4bx2, uint32_t sample_ct, void* result) {
2005 const uint64_t* table_alias = S_CAST(const uint64_t*, table64x4bx2);
2006 const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2007 const Halfword* phasepresent_alias = R_CAST(const Halfword*, phasepresent);
2008 const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
2009 const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
2010 uint64_t* result_iter = S_CAST(uint64_t*, result);
2011 uint32_t loop_len = kBitsPerWordD4;
2012 uintptr_t geno_word_xored = 0;
2013 uintptr_t male_or_phasepresent_hw_shifted = 0;
2014 for (uint32_t widx = 0; ; ++widx) {
2015 if (widx >= sample_ctl2_m1) {
2016 if (widx > sample_ctl2_m1) {
2017 if (sample_ct % 2) {
2018 uintptr_t cur_idx = (geno_word_xored & 3) | (male_or_phasepresent_hw_shifted & 16);
2019 memcpy(result_iter, &(table_alias[cur_idx]), 4);
2020 }
2021 return;
2022 }
2023 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2024 }
2025 geno_word_xored = genoarr[widx];
2026 male_or_phasepresent_hw_shifted = sex_male_alias[widx];
2027 const uintptr_t phasepresent_hw = phasepresent_alias[widx];
2028 male_or_phasepresent_hw_shifted |= phasepresent_hw;
2029 male_or_phasepresent_hw_shifted <<= 4;
2030 if (!phasepresent_hw) {
2031 for (uint32_t uii = 0; uii != loop_len; ++uii) {
2032 *result_iter++ = table_alias[(geno_word_xored & 15) | (male_or_phasepresent_hw_shifted & 48)];
2033 geno_word_xored >>= 4;
2034 male_or_phasepresent_hw_shifted >>= 2;
2035 }
2036 } else {
2037 geno_word_xored ^= UnpackHalfwordToWordShift1(phaseinfo_alias[widx]);
2038 for (uint32_t uii = 0; uii != loop_len; ++uii) {
2039 const uintptr_t cur_idx = (geno_word_xored & 15) | (male_or_phasepresent_hw_shifted & 48);
2040 *result_iter++ = table_alias[cur_idx];
2041 geno_word_xored >>= 4;
2042 male_or_phasepresent_hw_shifted >>= 2;
2043 }
2044 }
2045 }
2046 }
2047
2048 void InitPhaseXNohhLookup4b(void* table64x4bx2) {
2049 uint32_t* table_iter = S_CAST(uint32_t*, table64x4bx2);
2050 uint32_t vals[4];
2051 vals[0] = table_iter[0];
2052 table_iter[1] = vals[0];
2053 vals[1] = table_iter[2];
2054 table_iter[3] = vals[0];
2055 vals[2] = table_iter[4];
2056 table_iter[5] = vals[0];
2057 vals[3] = table_iter[6];
2058 table_iter[7] = vals[0];
2059 table_iter = &(table_iter[8]);
2060 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2061 const uint32_t cur_high = vals[high_idx];
2062 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2063 *table_iter++ = vals[low_idx];
2064 *table_iter++ = cur_high;
2065 }
2066 }
2067 // [16][0]..[31][1]: bit 4 is set
2068 uint32_t male_or_phasepresent_vals[4];
2069 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2070 male_or_phasepresent_vals[low_idx] = *table_iter++;
2071 *table_iter++ = vals[0];
2072 }
2073 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2074 const uint32_t cur_high = vals[high_idx];
2075 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2076 *table_iter++ = male_or_phasepresent_vals[low_idx];
2077 *table_iter++ = cur_high;
2078 }
2079 }
2080 // [32][0]..[47][1]: bit 5 set, bit 4 unset
2081 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2082 const uint32_t cur_high = male_or_phasepresent_vals[high_idx];
2083 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2084 *table_iter++ = vals[low_idx];
2085 *table_iter++ = cur_high;
2086 }
2087 }
2088 // [48][0]..[63][1]: bits 4 and 5 set
2089 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2090 const uint32_t cur_high = male_or_phasepresent_vals[high_idx];
2091 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2092 *table_iter++ = male_or_phasepresent_vals[low_idx];
2093 *table_iter++ = cur_high;
2094 }
2095 }
2096 }
2097
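// Same 64-entry table layout as above, but bits 4..5 of the index are driven
// by sex_male alone.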
2098 void GenoarrSexLookup4b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x4bx2, uint32_t sample_ct, void* result) {
2099 const uint64_t* table_alias = S_CAST(const uint64_t*, table64x4bx2);
2100 const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2101 const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
2102 uint64_t* result_iter = S_CAST(uint64_t*, result);
2103 uint32_t loop_len = kBitsPerWordD4;
2104 uintptr_t geno_word = 0;
2105 uintptr_t male_hw_shifted = 0;
2106 for (uint32_t widx = 0; ; ++widx) {
2107 if (widx >= sample_ctl2_m1) {
2108 if (widx > sample_ctl2_m1) {
2109 if (sample_ct % 2) {
2110 uintptr_t cur_idx = (geno_word & 3) | (male_hw_shifted & 16);
2111 memcpy(result_iter, &(table_alias[cur_idx]), 4);
2112 }
2113 return;
2114 }
2115 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2116 }
2117 geno_word = genoarr[widx];
2118 male_hw_shifted = sex_male_alias[widx];
2119 male_hw_shifted <<= 4;
2120 for (uint32_t uii = 0; uii != loop_len; ++uii) {
2121 *result_iter++ = table_alias[(geno_word & 15) | (male_hw_shifted & 48)];
2122 geno_word >>= 4;
2123 male_hw_shifted >>= 2;
2124 }
2125 }
2126 }
2127
2128 void InitPhaseXNohhLookup8b(void* table64x8bx2) {
2129 uint64_t* table_iter = S_CAST(uint64_t*, table64x8bx2);
2130 uint64_t vals[4];
2131 vals[0] = table_iter[0];
2132 table_iter[1] = vals[0];
2133 vals[1] = table_iter[2];
2134 table_iter[3] = vals[0];
2135 vals[2] = table_iter[4];
2136 table_iter[5] = vals[0];
2137 vals[3] = table_iter[6];
2138 table_iter[7] = vals[0];
2139 table_iter = &(table_iter[8]);
2140 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2141 const uint64_t cur_high = vals[high_idx];
2142 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2143 *table_iter++ = vals[low_idx];
2144 *table_iter++ = cur_high;
2145 }
2146 }
2147 // [16][0]..[31][1]: bit 4 is set
2148 uint64_t male_or_phasepresent_vals[4];
2149 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2150 male_or_phasepresent_vals[low_idx] = *table_iter++;
2151 *table_iter++ = vals[0];
2152 }
2153 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2154 const uint64_t cur_high = vals[high_idx];
2155 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2156 *table_iter++ = male_or_phasepresent_vals[low_idx];
2157 *table_iter++ = cur_high;
2158 }
2159 }
2160 // [32][0]..[47][1]: bit 5 set, bit 4 unset
2161 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2162 const uint64_t cur_high = male_or_phasepresent_vals[high_idx];
2163 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2164 *table_iter++ = vals[low_idx];
2165 *table_iter++ = cur_high;
2166 }
2167 }
2168 // [48][0]..[63][1]: bits 4 and 5 set
2169 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2170 const uint64_t cur_high = male_or_phasepresent_vals[high_idx];
2171 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2172 *table_iter++ = male_or_phasepresent_vals[low_idx];
2173 *table_iter++ = cur_high;
2174 }
2175 }
2176 }
2177
2178 #ifdef __LP64__
2179 void GenoarrSexLookup8b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x8bx2, uint32_t sample_ct, void* result) {
2180 const __m128i* table_alias = S_CAST(const __m128i*, table64x8bx2);
2181 const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2182 const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
2183 __m128i* result_iter = S_CAST(__m128i*, result);
2184 uint32_t loop_len = kBitsPerWordD4;
2185 uintptr_t geno_word = 0;
2186 uintptr_t male_hw_shifted = 0;
2187 for (uint32_t widx = 0; ; ++widx) {
2188 if (widx >= sample_ctl2_m1) {
2189 if (widx > sample_ctl2_m1) {
2190 if (sample_ct % 2) {
2191 uintptr_t cur_idx = (geno_word & 3) | (male_hw_shifted & 16);
2192 memcpy(result_iter, &(table_alias[cur_idx]), 8);
2193 }
2194 return;
2195 }
2196 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2197 }
2198 geno_word = genoarr[widx];
2199 male_hw_shifted = sex_male_alias[widx];
2200 male_hw_shifted <<= 4;
2201 for (uint32_t uii = 0; uii != loop_len; ++uii) {
2202 _mm_storeu_si128(result_iter, table_alias[(geno_word & 15) | (male_hw_shifted & 48)]);
2203 ++result_iter;
2204 geno_word >>= 4;
2205 male_hw_shifted >>= 2;
2206 }
2207 }
2208 }
2209 #else
2210 void GenoarrSexLookup8b(const uintptr_t* genoarr, const uintptr_t* sex_male, const void* table64x8bx2, uint32_t sample_ct, void* result) {
2211 const uint64_t* table_alias = S_CAST(const uint64_t*, table64x8bx2);
2212 const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2213 const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
2214 uint64_t* result_iter = S_CAST(uint64_t*, result);
2215 uint32_t loop_len = kBitsPerWordD4;
2216 uintptr_t geno_word = 0;
2217 uintptr_t male_hw_shifted = 0;
2218 for (uint32_t widx = 0; ; ++widx) {
2219 if (widx >= sample_ctl2_m1) {
2220 if (widx > sample_ctl2_m1) {
2221 if (sample_ct % 2) {
2222 uintptr_t cur_idx = (geno_word & 3) | (male_hw_shifted & 16);
2223 memcpy(result_iter, &(table_alias[cur_idx * 2]), 8);
2224 }
2225 return;
2226 }
2227 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2228 }
2229 geno_word = genoarr[widx];
2230 male_hw_shifted = sex_male_alias[widx];
2231 male_hw_shifted <<= 4;
2232 for (uint32_t uii = 0; uii != loop_len; ++uii) {
2233 memcpy(result_iter, &(table_alias[((geno_word & 15) | (male_hw_shifted & 48)) * 2]), 16);
2234 result_iter = &(result_iter[2]);
2235 geno_word >>= 4;
2236 male_hw_shifted >>= 2;
2237 }
2238 }
2239 }
2240 #endif
2241
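// Index layout for the 246-entry VCF phase tables: bits 0..3 hold two
// genotypes, bits 4..5 the two cur_phased bits, and bits 6..7 the two
// phaseinfo bits.  Since phaseinfo is only set for phased hets, the largest
// reachable index is 245.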
2242 void VcfPhaseLookup4b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x4bx2, uint32_t sample_ct, void* __restrict result) {
2243 const uint64_t* table_alias = S_CAST(const uint64_t*, table246x4bx2);
2244 const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2245 const Halfword* cur_phased_alias = R_CAST(const Halfword*, cur_phased);
2246 const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
2247 uint64_t* result_iter = S_CAST(uint64_t*, result);
2248 uint32_t loop_len = kBitsPerWordD4;
2249 uintptr_t geno_word = 0;
2250 uintptr_t cur_phased_hw_shifted = 0;
2251 uintptr_t phaseinfo_hw_shifted = 0;
2252 for (uint32_t widx = 0; ; ++widx) {
2253 if (widx >= sample_ctl2_m1) {
2254 if (widx > sample_ctl2_m1) {
2255 if (sample_ct % 2) {
2256 uintptr_t cur_idx = (geno_word & 3) | (cur_phased_hw_shifted & 16) | (phaseinfo_hw_shifted & 64);
2257 memcpy(result_iter, &(table_alias[cur_idx]), 4);
2258 }
2259 return;
2260 }
2261 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2262 }
2263 geno_word = genoarr[widx];
2264 cur_phased_hw_shifted = cur_phased_alias[widx];
2265 if (!cur_phased_hw_shifted) {
2266 for (uint32_t uii = 0; uii != loop_len; ++uii) {
2267 *result_iter++ = table_alias[geno_word & 15];
2268 geno_word >>= 4;
2269 }
2270 } else {
2271 cur_phased_hw_shifted = cur_phased_hw_shifted << 4;
2272 phaseinfo_hw_shifted = phaseinfo_alias[widx];
2273
2274 // note that this must be on a separate line (or we have to static_cast)
2275 phaseinfo_hw_shifted = phaseinfo_hw_shifted << 6;
2276
2277 for (uint32_t uii = 0; uii != loop_len; ++uii) {
2278 const uintptr_t cur_idx = (geno_word & 15) | (cur_phased_hw_shifted & 48) | (phaseinfo_hw_shifted & 192);
2279 *result_iter++ = table_alias[cur_idx];
2280 geno_word >>= 4;
2281 cur_phased_hw_shifted >>= 2;
2282 phaseinfo_hw_shifted >>= 2;
2283 }
2284 }
2285 }
2286 }
2287
2288 void InitVcfPhaseLookup4b(void* table246x4bx2) {
2289 uint32_t* table_iter = S_CAST(uint32_t*, table246x4bx2);
2290 uint32_t unphased_vals[4];
2291 unphased_vals[0] = table_iter[0];
2292 table_iter[1] = unphased_vals[0];
2293 unphased_vals[1] = table_iter[2];
2294 table_iter[3] = unphased_vals[0];
2295 unphased_vals[2] = table_iter[4];
2296 table_iter[5] = unphased_vals[0];
2297 unphased_vals[3] = table_iter[6];
2298 table_iter[7] = unphased_vals[0];
2299 table_iter = &(table_iter[8]);
2300 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2301 const uint32_t cur_high = unphased_vals[high_idx];
2302 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2303 *table_iter++ = unphased_vals[low_idx];
2304 *table_iter++ = cur_high;
2305 }
2306 }
2307 // [16][0]..[31][1]: first entry is phased and unflipped, second is unphased
2308 uint32_t phased_unflipped_vals[4];
2309 phased_unflipped_vals[0] = table_iter[0];
2310 table_iter[1] = unphased_vals[0];
2311 phased_unflipped_vals[1] = table_iter[2];
2312 table_iter[3] = unphased_vals[0];
2313 phased_unflipped_vals[2] = table_iter[4];
2314 table_iter[5] = unphased_vals[0];
2315 phased_unflipped_vals[3] = table_iter[6];
2316 table_iter[7] = unphased_vals[0];
2317 table_iter = &(table_iter[8]);
2318 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2319 const uint32_t cur_high = unphased_vals[high_idx];
2320 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2321 *table_iter++ = phased_unflipped_vals[low_idx];
2322 *table_iter++ = cur_high;
2323 }
2324 }
2325 // [32][0]..[63][1]: second entry is phased and unflipped
2326 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2327 const uint32_t cur_high = phased_unflipped_vals[high_idx];
2328 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2329 *table_iter++ = unphased_vals[low_idx];
2330 *table_iter++ = cur_high;
2331 }
2332 }
2333 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2334 const uint32_t cur_high = phased_unflipped_vals[high_idx];
2335 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2336 *table_iter++ = phased_unflipped_vals[low_idx];
2337 *table_iter++ = cur_high;
2338 }
2339 }
2340 // [64][0]..[79][1] should be impossible
2341 table_iter = &(table_iter[32]);
2342 // [80][0]..[95][1]: first entry is phased and flipped, second is unphased
2343 // genotype must be 01
2344 const uint32_t phased_flipped_01 = table_iter[2];
2345 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2346 table_iter[2] = phased_flipped_01;
2347 table_iter[3] = unphased_vals[high_idx];
2348 table_iter = &(table_iter[8]);
2349 }
2350 // [96][0]..[111][1] should be impossible
2351 table_iter = &(table_iter[32]);
2352 // [112][0]..[127][1]: first entry phased-flipped, second phased-unflipped
2353 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2354 table_iter[2] = phased_flipped_01;
2355 table_iter[3] = phased_unflipped_vals[high_idx];
2356 table_iter = &(table_iter[8]);
2357 }
2358 // [128][0]..[163][1] should be impossible
2359 table_iter = &(table_iter[72]);
2360 // [164][0]..[167][1]: second entry phased-flipped, first entry unphased
2361 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2362 *table_iter++ = unphased_vals[low_idx];
2363 *table_iter++ = phased_flipped_01;
2364 }
2365 // [168][0]..[179][1] should be impossible
2366 table_iter = &(table_iter[24]);
2367 // [180][0]..[183][1]: second entry phased-flipped, first phased-unflipped
2368 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2369 *table_iter++ = phased_unflipped_vals[low_idx];
2370 *table_iter++ = phased_flipped_01;
2371 }
2372 // [184][0]..[244][1] should be impossible
2373 // [245][0]..[245][1]: both phased-flipped
2374 table_iter[122] = phased_flipped_01;
2375 table_iter[123] = phased_flipped_01;
2376 }
2377
2378 void VcfPhaseLookup2b(const uintptr_t* genoarr, const uintptr_t* cur_phased, const uintptr_t* phaseinfo, const void* table246x2bx2, uint32_t sample_ct, void* __restrict result) {
2379 const uint32_t* table_alias = S_CAST(const uint32_t*, table246x2bx2);
2380 const uint32_t sample_ctl2_m1 = (sample_ct - 1) / kBitsPerWordD2;
2381 const Halfword* cur_phased_alias = R_CAST(const Halfword*, cur_phased);
2382 const Halfword* phaseinfo_alias = R_CAST(const Halfword*, phaseinfo);
2383 uint32_t* result_iter = S_CAST(uint32_t*, result);
2384 uint32_t loop_len = kBitsPerWordD4;
2385 uintptr_t geno_word = 0;
2386 uintptr_t cur_phased_hw_shifted = 0;
2387 uintptr_t phaseinfo_hw_shifted = 0;
2388 for (uint32_t widx = 0; ; ++widx) {
2389 if (widx >= sample_ctl2_m1) {
2390 if (widx > sample_ctl2_m1) {
2391 if (sample_ct % 2) {
2392 uintptr_t cur_idx = (geno_word & 3) | (cur_phased_hw_shifted & 16) | (phaseinfo_hw_shifted & 64);
2393 memcpy(result_iter, &(table_alias[cur_idx]), 2);
2394 }
2395 return;
2396 }
2397 loop_len = ModNz(sample_ct, kBitsPerWordD2) / 2;
2398 }
2399 geno_word = genoarr[widx];
2400 cur_phased_hw_shifted = cur_phased_alias[widx];
2401 if (!cur_phased_hw_shifted) {
2402 for (uint32_t uii = 0; uii != loop_len; ++uii) {
2403 *result_iter++ = table_alias[geno_word & 15];
2404 geno_word >>= 4;
2405 }
2406 } else {
2407 cur_phased_hw_shifted = cur_phased_hw_shifted << 4;
2408 phaseinfo_hw_shifted = phaseinfo_alias[widx];
2409
2410 // note that this must be on a separate line (or we have to static_cast)
2411 phaseinfo_hw_shifted = phaseinfo_hw_shifted << 6;
2412
2413 for (uint32_t uii = 0; uii != loop_len; ++uii) {
2414 const uintptr_t cur_idx = (geno_word & 15) | (cur_phased_hw_shifted & 48) | (phaseinfo_hw_shifted & 192);
2415 *result_iter++ = table_alias[cur_idx];
2416 geno_word >>= 4;
2417 cur_phased_hw_shifted >>= 2;
2418 phaseinfo_hw_shifted >>= 2;
2419 }
2420 }
2421 }
2422 }
2423
2424 void InitVcfPhaseLookup2b(void* table246x2bx2) {
2425 uint16_t* table_iter = S_CAST(uint16_t*, table246x2bx2);
2426 uint16_t unphased_vals[4];
2427 unphased_vals[0] = table_iter[0];
2428 table_iter[1] = unphased_vals[0];
2429 unphased_vals[1] = table_iter[2];
2430 table_iter[3] = unphased_vals[0];
2431 unphased_vals[2] = table_iter[4];
2432 table_iter[5] = unphased_vals[0];
2433 unphased_vals[3] = table_iter[6];
2434 table_iter[7] = unphased_vals[0];
2435 table_iter = &(table_iter[8]);
2436 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2437 const uint32_t cur_high = unphased_vals[high_idx];
2438 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2439 *table_iter++ = unphased_vals[low_idx];
2440 *table_iter++ = cur_high;
2441 }
2442 }
2443 // [16][0]..[31][1]: first entry is phased and unflipped, second is unphased
2444 uint16_t phased_unflipped_vals[4];
2445 phased_unflipped_vals[0] = table_iter[0];
2446 table_iter[1] = unphased_vals[0];
2447 phased_unflipped_vals[1] = table_iter[2];
2448 table_iter[3] = unphased_vals[0];
2449 phased_unflipped_vals[2] = table_iter[4];
2450 table_iter[5] = unphased_vals[0];
2451 phased_unflipped_vals[3] = table_iter[6];
2452 table_iter[7] = unphased_vals[0];
2453 table_iter = &(table_iter[8]);
2454 for (uint32_t high_idx = 1; high_idx != 4; ++high_idx) {
2455 const uint32_t cur_high = unphased_vals[high_idx];
2456 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2457 *table_iter++ = phased_unflipped_vals[low_idx];
2458 *table_iter++ = cur_high;
2459 }
2460 }
2461 // [32][0]..[63][1]: second entry is phased and unflipped
2462 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2463 const uint32_t cur_high = phased_unflipped_vals[high_idx];
2464 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2465 *table_iter++ = unphased_vals[low_idx];
2466 *table_iter++ = cur_high;
2467 }
2468 }
2469 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2470 const uint32_t cur_high = phased_unflipped_vals[high_idx];
2471 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2472 *table_iter++ = phased_unflipped_vals[low_idx];
2473 *table_iter++ = cur_high;
2474 }
2475 }
2476 // [64][0]..[79][1] should be impossible
2477 table_iter = &(table_iter[32]);
2478 // [80][0]..[95][1]: first entry is phased and flipped, second is unphased
2479 // genotype must be 01
2480 const uint32_t phased_flipped_01 = table_iter[2];
2481 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2482 table_iter[2] = phased_flipped_01;
2483 table_iter[3] = unphased_vals[high_idx];
2484 table_iter = &(table_iter[8]);
2485 }
2486 // [96][0]..[111][1] should be impossible
2487 table_iter = &(table_iter[32]);
2488 // [112][0]..[127][1]: first entry phased-flipped, second phased-unflipped
2489 for (uint32_t high_idx = 0; high_idx != 4; ++high_idx) {
2490 table_iter[2] = phased_flipped_01;
2491 table_iter[3] = phased_unflipped_vals[high_idx];
2492 table_iter = &(table_iter[8]);
2493 }
2494 // [128][0]..[163][1] should be impossible
2495 table_iter = &(table_iter[72]);
2496 // [164][0]..[167][1]: second entry phased-flipped, first entry unphased
2497 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2498 *table_iter++ = unphased_vals[low_idx];
2499 *table_iter++ = phased_flipped_01;
2500 }
2501 // [168][0]..[179][1] should be impossible
2502 table_iter = &(table_iter[24]);
2503 // [180][0]..[183][1]: second entry phased-flipped, first phased-unflipped
2504 for (uint32_t low_idx = 0; low_idx != 4; ++low_idx) {
2505 *table_iter++ = phased_unflipped_vals[low_idx];
2506 *table_iter++ = phased_flipped_01;
2507 }
2508 // [184][0]..[244][1] should be impossible
2509 // [245][0]..[245][1]: both phased-flipped
2510 table_iter[122] = phased_flipped_01;
2511 table_iter[123] = phased_flipped_01;
2512 }
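// For reference (inferred from VcfPhaseLookup2b() above, not an independent
// spec): the 246-entry table is indexed by an 8-bit value where bits 0-1 and
// 2-3 hold the two samples' genotype codes, bits 4-5 their cur_phased bits,
// and bits 6-7 their phaseinfo (phase-flip) bits.  The "should be impossible"
// ranges are those where a flip bit is set for a sample that is not
// simultaneously phased and heterozygous (genotype code 1).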


void ClearGenoarrMissing1bit8Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals) {
  const uint32_t orig_subset_size = *subset_sizep;
  Halfword* subset_alias = R_CAST(Halfword*, subset);
  uint32_t read_idx = 0;
  // deliberate overflow
  for (uint32_t read_widx = UINT32_MAX; ; ) {
    uint32_t subset_bits;
    do {
      subset_bits = subset_alias[++read_widx];
    } while (!subset_bits);
    uintptr_t detect_11 = genoarr[read_widx];
    detect_11 = detect_11 & (detect_11 >> 1) & kMask5555;
    if (detect_11) {
      uint32_t detect_11_hw = PackWordToHalfword(detect_11);
      const uint32_t joint_u32 = subset_bits & detect_11_hw;
      if (joint_u32) {
        uintptr_t lowbit = joint_u32 & (-joint_u32);
        uint32_t write_idx = read_idx + PopcountWord(subset_bits & (lowbit - 1));
        read_idx = write_idx + 1;
        uint32_t subset_bits_write = subset_bits ^ lowbit;
        unsigned char* sparse_vals_uc = S_CAST(unsigned char*, sparse_vals);
        subset_bits &= -(2 * lowbit);
        for (; read_idx != orig_subset_size; ++read_idx) {
#ifdef USE_AVX2
          if (!subset_bits) {
            subset_alias[read_widx] = subset_bits_write;
            do {
              subset_bits = subset_alias[++read_widx];
            } while (!subset_bits);
            subset_bits_write = subset_bits;
            detect_11 = genoarr[read_widx];
            detect_11 = detect_11 & (detect_11 >> 1);
            detect_11_hw = PackWordToHalfwordMask5555(detect_11);
          }
          lowbit = subset_bits & (-subset_bits);
          subset_bits ^= lowbit;
          if (lowbit & detect_11_hw) {
            subset_bits_write ^= lowbit;
            continue;
          }
#else
          if (!subset_bits) {
            subset_alias[read_widx] = subset_bits_write;
            do {
              subset_bits = subset_alias[++read_widx];
            } while (!subset_bits);
            subset_bits_write = subset_bits;
            detect_11 = genoarr[read_widx];
            detect_11 = detect_11 & (detect_11 >> 1);
          }
          lowbit = subset_bits & (-subset_bits);
          subset_bits ^= lowbit;
          if ((lowbit * lowbit) & detect_11) {
            subset_bits_write ^= lowbit;
            continue;
          }
#endif
          sparse_vals_uc[write_idx++] = sparse_vals_uc[read_idx];
        }
        subset_alias[read_widx] = subset_bits_write;
        *subset_sizep = write_idx;
        return;
      }
    }
    read_idx += PopcountWord(subset_bits);
    if (read_idx == orig_subset_size) {
      return;
    }
  }
}
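// Illustrative example for ClearGenoarrMissing1bit8Unsafe() (a sketch, not
// part of the library): if subset has bits set for samples {1, 4, 7},
// sparse_vals is {a, b, c}, and genoarr stores genotype code 3 (missing) for
// sample 4 only, then on return subset has bits set for {1, 7}, sparse_vals
// is compacted to {a, c}, and *subset_sizep is 2.  The ...1bit16Unsafe()
// variant below does the same with 2-byte sparse_vals entries.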

void ClearGenoarrMissing1bit16Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals) {
  const uint32_t orig_subset_size = *subset_sizep;
  Halfword* subset_alias = R_CAST(Halfword*, subset);
  uint32_t read_idx = 0;
  // deliberate overflow
  for (uint32_t read_widx = UINT32_MAX; ; ) {
    uint32_t subset_bits;
    do {
      subset_bits = subset_alias[++read_widx];
    } while (!subset_bits);
    uintptr_t detect_11 = genoarr[read_widx];
    detect_11 = detect_11 & (detect_11 >> 1) & kMask5555;
    if (detect_11) {
      uint32_t detect_11_hw = PackWordToHalfword(detect_11);
      const uint32_t joint_u32 = subset_bits & detect_11_hw;
      if (joint_u32) {
        uintptr_t lowbit = joint_u32 & (-joint_u32);
        uint32_t write_idx = read_idx + PopcountWord(subset_bits & (lowbit - 1));
        read_idx = write_idx + 1;
        uint32_t subset_bits_write = subset_bits ^ lowbit;
        uint16_t* sparse_vals_u16 = S_CAST(uint16_t*, sparse_vals);
        subset_bits &= -(2 * lowbit);
        for (; read_idx != orig_subset_size; ++read_idx) {
#ifdef USE_AVX2
          if (!subset_bits) {
            subset_alias[read_widx] = subset_bits_write;
            do {
              subset_bits = subset_alias[++read_widx];
            } while (!subset_bits);
            subset_bits_write = subset_bits;
            detect_11 = genoarr[read_widx];
            detect_11 = detect_11 & (detect_11 >> 1);
            detect_11_hw = PackWordToHalfwordMask5555(detect_11);
          }
          lowbit = subset_bits & (-subset_bits);
          subset_bits ^= lowbit;
          if (lowbit & detect_11_hw) {
            subset_bits_write ^= lowbit;
            continue;
          }
#else
          if (!subset_bits) {
            subset_alias[read_widx] = subset_bits_write;
            do {
              subset_bits = subset_alias[++read_widx];
            } while (!subset_bits);
            subset_bits_write = subset_bits;
            detect_11 = genoarr[read_widx];
            detect_11 = detect_11 & (detect_11 >> 1);
          }
          lowbit = subset_bits & (-subset_bits);
          subset_bits ^= lowbit;
          if ((lowbit * lowbit) & detect_11) {
            subset_bits_write ^= lowbit;
            continue;
          }
#endif
          sparse_vals_u16[write_idx++] = sparse_vals_u16[read_idx];
        }
        subset_alias[read_widx] = subset_bits_write;
        *subset_sizep = write_idx;
        return;
      }
    }
    read_idx += PopcountWord(subset_bits);
    if (read_idx == orig_subset_size) {
      return;
    }
  }
}

double MultiallelicDiploidMinimac3R2(const uint64_t* __restrict sums, const uint64_t* __restrict hap_ssqs_x2, uint32_t nm_sample_ct, uint32_t allele_ct, uint32_t extra_phased_het_ct) {
  // sums[k] == sum_i [left_dosage_{ik} + right_dosage_{ik}]
  // hap_ssqs_x2[k] ==
  //   2 * sum_i [(left_dosage_{ik})^2 + (right_dosage_{ik})^2]
  // This may be odd, since it's computed as
  //   (left + right)^2 + (left - right)^2
  // and the parities of the two integers can be different.
  // For phased hardcalls, it is fine for the hap_ssqs_x2[k] values to
  // correspond to unphased hardcalls iff extra_phased_het_ct is the number of
  // phased-hets that weren't accounted for in hap_ssqs_x2[]; this makes it
  // straightforward for GetMultiallelicCountsAndDosage16s to stick to the
  // custom internal multiallelic-count functions.
  if (!nm_sample_ct) {
    return (0.0 / 0.0);
  }
  // Allelic phased-dosages are on a (k-1)-dimensional simplex; embed this in
  // R^k as the (1, 0, ..., 0), (0, 1, ..., 0), ..., (0, 0, ..., 1) polytope.
  // Define
  //   m_k := (1/2n) * sum_i [left_dosage_{ik} + right_dosage_{ik}]
  // Minimac3-r2 is defined as empirical phased-dosage variance divided by
  // expected-under-allele-frequencies variance.
  // Expected sum-of-squared-Euclidean-distance with perfect imputation is
  //   2n("1"^2 - sum_k ((m_k)^2))
  // and observed sum-of-squared-distance is
  //   sum_k (sum_i [(left_dosage_{ik})^2 + (right_dosage_{ik})^2] -
  //          2n((m_k)^2))

  // ssq_sum_x2 can be as large as 2^31 * nm_sample_ct; meansq_sum can cancel
  // as little as (1 / allele_ct) of it
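  // 92682 is the smallest integer whose square exceeds 2^33; presumably the
  // threshold was chosen so that, for smaller nm_sample_ct, the
  // ssq_sum_x2 * nm_sample_ct product below (bounded by
  // 2^31 * nm_sample_ct^2) and the other uint64 intermediates stay under
  // 2^64.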
  if (nm_sample_ct < 92682) {
    uint64_t ssq_sum_x2 = extra_phased_het_ct * 0x20000000LLU;
    uint64_t meansq_sum = 0;
    for (uint32_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
      const uint64_t cur_allele_dosage = sums[allele_idx];
      ssq_sum_x2 += hap_ssqs_x2[allele_idx];
      // cur_allele_dosage == 2n * m_k
      // -> meansq_sum becomes 2n * sum_k [2n((m_k)^2)]
      meansq_sum += cur_allele_dosage * cur_allele_dosage;
    }
    const uint64_t observed_variance_times_2n = ssq_sum_x2 * nm_sample_ct - meansq_sum;
    // "1"^2 -> 16384^2 in our units.  So 2n * 2n * "1"^2 is equal to
    // n * n * 16384^2 * 4.
    const uint64_t expected_variance_times_2n = nm_sample_ct * 0x40000000LLU * nm_sample_ct - meansq_sum;
    // mach_r2 == 1 test cases:
    // - AA, AB, BB: 1, 4, 4
    //   sums[0] = 6 * 2^14
    //   sums[1] = 12 * 2^14
    //   ssqs[0] = 8 * 2^28
    //   ssqs[1] = 20 * 2^28
    //   ssq_sum = (8 + 20) * 2^28
    //   meansq_sum = (6 * 6 + 12 * 12) * 2^28
    //   observed_variance = 28 * 9 * 2^28 - 180 * 2^28
    //   expected_variance = (9 * 9 * 4 * 2^28 - 180 * 2^28) / 2
    // - AA, AB, BB, AC, BC, CC: 1, 4, 4, 6, 12, 9
    //   sums[0] = 12 * 2^14
    //   sums[1] = 24 * 2^14
    //   sums[2] = 36 * 2^14
    //   ssqs[0] = 14 * 2^28
    //   ssqs[1] = 32 * 2^28
    //   ssqs[2] = 54 * 2^28
    //   ssq_sum = (14 + 32 + 54) * 2^28
    //   meansq_sum = (12 * 12 + 24 * 24 + 36 * 36) * 2^28
    //   observed_variance = 100 * 36 * 2^28 - 56 * 36 * 2^28
    //   expected_variance = (36 * 36 * 4 * 2^28 - 56 * 36 * 2^28) / 2
    return S_CAST(double, observed_variance_times_2n) / S_CAST(double, expected_variance_times_2n);
  }
  // Need to avoid catastrophic cancellation here.
  const uint64_t nm_sample_ct_x32768 = nm_sample_ct * 0x8000LLU;
  double expected_variance_times_2n = 0.0;
  double observed_variance_times_2n_hi = 0.0;
  int64_t observed_variance_times_2n_lo = 0;
  for (uint32_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
    const uint64_t cur_allele_dosage = sums[allele_idx];
    const uint64_t cur_ssq_x2 = hap_ssqs_x2[allele_idx];
    const double cur_allele_dosaged = u63tod(cur_allele_dosage);
    // Restructure expected_variance as sum/product of
    // guaranteed-nonnegative numbers.
    //
    // cur_allele_dosage == 2n * m_k
    // cur_allele_dosage[k] * ("1" * 2n - cur_allele_dosage[k])
    //   = "1" * 4n^2 * m_k - 4n^2 * m_k^2
    //
    // sum_k (cur_allele_dosage[k] * ("1" * 2n - cur_allele_dosage[k]))
    //   = "1" * 4n^2 * sum_k m_k - 4n^2 * sum_k m_k^2
    //   = "1" * 4n^2 * "1" - 4n^2 * sum_k m_k^2
    //   = 4n^2 * ("1"^2 - sum_k m_k^2)
    // This is 2n times the expected variance.
    expected_variance_times_2n += cur_allele_dosaged * u63tod(nm_sample_ct_x32768 - cur_allele_dosage);
    // Emulate 128-bit integers to make observed_variance computation work.
    // "left" corresponds to the ssq_sum_x2 * nm_sample_ct term, "right"
    // corresponds to meansq_sum.
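    // Each of the two terms is split as hi * 2^32 + lo with lo < 2^32; the hi
    // parts are accumulated in a double and the lo parts in a signed 64-bit
    // integer, and the accumulators are recombined (together with the
    // extra_phased_het_ct correction) after the loop.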
    const uint64_t cur_ssq_x2_hi = cur_ssq_x2 >> 32;
    uint64_t left_lo = (cur_ssq_x2 & 0xffffffffLLU) * nm_sample_ct;
    const uint64_t left_hi = (left_lo >> 32) + cur_ssq_x2_hi * nm_sample_ct;
    left_lo &= 0xffffffffU;
    const uint64_t cur_allele_dosage_lo = cur_allele_dosage & 0xffffffffLLU;
    const uint64_t cur_allele_dosage_hi = cur_allele_dosage >> 32;
    uint64_t right_lo = cur_allele_dosage_lo * cur_allele_dosage_lo;
    const uint64_t right_hi = (right_lo >> 32) + (cur_allele_dosage_lo + cur_allele_dosage) * cur_allele_dosage_hi;
    right_lo &= 0xffffffffU;
    observed_variance_times_2n_hi += u63tod(left_hi - right_hi);
    observed_variance_times_2n_lo += S_CAST(int64_t, left_lo) - S_CAST(int64_t, right_lo);
  }
  const double observed_variance_times_2n = (observed_variance_times_2n_hi * 4294967296.0) + (u31tod(extra_phased_het_ct) * 536870912.0) + observed_variance_times_2n_lo;
  return observed_variance_times_2n / expected_variance_times_2n;
}

void PgrDifflistToGenovecUnsafe(const uintptr_t* __restrict raregeno, const uint32_t* difflist_sample_ids, uintptr_t difflist_common_geno, uint32_t sample_ct, uint32_t difflist_len, uintptr_t* __restrict genovec) {
  // Ok for trailing bits of raregeno to be nonzero.  Does not zero out
  // trailing bits of genovec.
  const uint32_t vec_ct = NypCtToVecCt(sample_ct);
  // could just memset up to word boundary; this should be a bit more
  // vector-instruction-friendly, though
  vecset(genovec, difflist_common_geno * kMask5555, vec_ct);
  const uintptr_t* raregeno_incr = raregeno;
  uint32_t difflist_idx = 0;
  uint32_t difflist_idx_stop = kBitsPerWordD2;
  if (!difflist_common_geno) {
    // faster inner loop since there's no existing value to mask out
    // todo: check if this should just be deleted since the code bloat causes
    // too many more cache misses
    for (; ; difflist_idx_stop += kBitsPerWordD2) {
      if (difflist_idx_stop > difflist_len) {
        if (difflist_idx == difflist_len) {
          return;
        }
        difflist_idx_stop = difflist_len;
      }
      uintptr_t raregeno_word = *raregeno_incr++;
      for (; difflist_idx != difflist_idx_stop; ++difflist_idx) {
        const uint32_t cur_sample_idx = difflist_sample_ids[difflist_idx];
        genovec[cur_sample_idx / kBitsPerWordD2] |= (raregeno_word & 3) << (2 * (cur_sample_idx % kBitsPerWordD2));
        raregeno_word >>= 2;
      }
    }
  }
  for (; ; difflist_idx_stop += kBitsPerWordD2) {
    if (difflist_idx_stop > difflist_len) {
      if (difflist_idx == difflist_len) {
        return;
      }
      difflist_idx_stop = difflist_len;
    }
    uintptr_t raregeno_word = *raregeno_incr++;
    for (; difflist_idx != difflist_idx_stop; ++difflist_idx) {
      const uint32_t cur_sample_idx = difflist_sample_ids[difflist_idx];
      AssignNyparrEntry(cur_sample_idx, raregeno_word & 3, genovec);
      raregeno_word >>= 2;
    }
  }
}
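// Illustrative example for PgrDifflistToGenovecUnsafe() (a sketch, not part
// of the library): with sample_ct = 8, difflist_common_geno = 0,
// difflist_len = 2, difflist_sample_ids = {2, 5}, and raregeno packing the
// codes {1, 3} into its low 4 bits, the result is a genovec where samples 2
// and 5 have codes 1 and 3 respectively and every other sample has code 0.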

const uint16_t kHcToAlleleCodes[1024] = QUAD_TABLE256(0, 0x100, 0x101, 0xffff);

static_assert(sizeof(AlleleCode) == 1, "PglMultiallelicSparseToDenseMiss() needs to be updated.");
void PglMultiallelicSparseToDenseMiss(const PgenVariant* pgvp, uint32_t sample_ct, AlleleCode* __restrict wide_codes) {
  GenoarrLookup256x2bx4(pgvp->genovec, kHcToAlleleCodes, sample_ct, wide_codes);
  const uint32_t patch_01_ct = pgvp->patch_01_ct;
  if (patch_01_ct) {
    const uintptr_t* patch_01_set = pgvp->patch_01_set;
    uintptr_t sample_idx_base = 0;
    uintptr_t cur_bits = patch_01_set[0];
    const AlleleCode* patch_01_vals = pgvp->patch_01_vals;
    AlleleCode* wide_codes1 = &(wide_codes[1]);
    for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
      const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
      wide_codes1[2 * sample_idx] = patch_01_vals[uii];
    }
  }
  const uint32_t patch_10_ct = pgvp->patch_10_ct;
  if (patch_10_ct) {
    const uintptr_t* patch_10_set = pgvp->patch_10_set;
    uintptr_t sample_idx_base = 0;
    uintptr_t cur_bits = patch_10_set[0];
    const DoubleAlleleCode* patch_10_vals_alias = R_CAST(const DoubleAlleleCode*, pgvp->patch_10_vals);
    DoubleAlleleCode* wide_codes_alias = R_CAST(DoubleAlleleCode*, wide_codes);
    for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
      const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
      wide_codes_alias[sample_idx] = patch_10_vals_alias[uii];
    }
  }
}
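// Illustrative example for PglMultiallelicSparseToDenseMiss() (a sketch, not
// part of the library): the kHcToAlleleCodes lookup first expands hardcalls
// 0/1/2/3 to the allele-code byte pairs (0,0)/(0,1)/(1,1)/(missing,missing);
// then, for each sample flagged in patch_01_set, the second code of its (0,1)
// pair is overwritten with the actual rare-allele code from patch_01_vals,
// and for each sample flagged in patch_10_set, both codes are overwritten
// with the corresponding pair from patch_10_vals.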

#ifdef __cplusplus
}  // namespace plink2
#endif