#ifndef __PLINK2_BITS_H__
#define __PLINK2_BITS_H__

// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 3 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library.  If not, see <http://www.gnu.org/licenses/>.


// Bitarray support.  (Inline single-word operations are in plink2_base.h.)

#include "plink2_base.h"

#ifdef __cplusplus
namespace plink2 {
#endif

#if defined(__LP64__) && !defined(USE_AVX2)
// may also want a version which doesn't always apply kMask5555
void Pack32bTo16bMask(const void* words, uintptr_t ct_32b, void* dest);

HEADER_INLINE void PackWordsToHalfwordsMask(const uintptr_t* words, uintptr_t word_ct, Halfword* dest) {
  uintptr_t widx = 0;
  if (word_ct >= (32 / kBytesPerWord)) {
    const uintptr_t ct_32b = word_ct / (32 / kBytesPerWord);
    Pack32bTo16bMask(words, ct_32b, dest);
    widx = ct_32b * (32 / kBytesPerWord);
  }
  for (; widx != word_ct; ++widx) {
    dest[widx] = PackWordToHalfwordMask5555(words[widx]);
  }
}
#else
HEADER_INLINE void PackWordsToHalfwordsMask(const uintptr_t* words, uintptr_t word_ct, Halfword* dest) {
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    dest[widx] = PackWordToHalfwordMask5555(words[widx]);
  }
}
#endif
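
// Usage sketch (illustrative, not part of the original interface): pack the
// even-position bits of each word -- e.g. the low bits of 2-bit-per-entry
// data -- into a dense 1-bit-per-entry Halfword array.  The example values
// below are hypothetical.
/*
HEADER_INLINE void PackWordsToHalfwordsMaskDemo(void) {
  uintptr_t words[1] = {0x5};  // bits 0 and 2 set
  Halfword dest[1];
  PackWordsToHalfwordsMask(words, 1, dest);
  // dest[0] == 3: source bit 0 -> dest bit 0, source bit 2 -> dest bit 1
}
*/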

// ok for ct == 0
void SetAllBits(uintptr_t ct, uintptr_t* bitarr);

void BitvecAnd(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);

void BitvecInvmask(const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);

void BitvecOr(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);

void BitvecInvert(uintptr_t word_ct, uintptr_t* main_bitvec);

// Functions with "adv" in the name generally take an index or char-pointer as
// an argument and return its new value, while "mov" functions take a
// pointer-to-index or pointer-to-char-pointer and move it.

// These return the current index if the corresponding bit satisfies the
// condition.
uintptr_t AdvTo1Bit(const uintptr_t* bitarr, uintptr_t loc);

uintptr_t AdvTo0Bit(const uintptr_t* bitarr, uintptr_t loc);

// uintptr_t NextNonmissingUnsafe(const uintptr_t* genoarr, uintptr_t loc);

uint32_t AdvBoundedTo1Bit(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil);

uintptr_t AdvBoundedTo0Bit(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil);

uint32_t FindLast1BitBefore(const uintptr_t* bitarr, uint32_t loc);
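
// Usage sketch (illustrative): since AdvTo1Bit() returns loc itself when
// bitarr[loc] is already set, it can be chained to find the k-th set bit at
// or after a starting position.  The caller must guarantee enough set bits
// exist; there is no bounds check (use AdvBoundedTo1Bit() for that).
/*
HEADER_INLINE uintptr_t KthSetBitFrom(const uintptr_t* bitarr, uintptr_t loc, uint32_t k) {
  uintptr_t idx = AdvTo1Bit(bitarr, loc);
  while (k--) {
    idx = AdvTo1Bit(bitarr, idx + 1);
  }
  return idx;
}
*/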

// possible todo: check if movemask-based solution is better in AVX2 case
HEADER_INLINE uint32_t AllWordsAreZero(const uintptr_t* word_arr, uintptr_t word_ct) {
  while (word_ct--) {
    if (*word_arr++) {
      return 0;
    }
  }
  return 1;
}

HEADER_INLINE uint32_t AllBitsAreOne(const uintptr_t* bitarr, uintptr_t bit_ct) {
  const uintptr_t fullword_ct = bit_ct / kBitsPerWord;
  for (uintptr_t widx = 0; widx != fullword_ct; ++widx) {
    if (~(bitarr[widx])) {
      return 0;
    }
  }
  const uint32_t trailing_bit_ct = bit_ct % kBitsPerWord;
  return (!trailing_bit_ct) || (!((~(bitarr[fullword_ct])) << (kBitsPerWord - trailing_bit_ct)));
}

uint32_t AllBytesAreX(const unsigned char* bytes, unsigned char match, uintptr_t byte_ct);

// Updated PopcountWords() code is based on
// https://github.com/kimwalisch/libpopcnt .  libpopcnt license text follows.

/*
 * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
 * population count) in an array as quickly as possible using
 * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
 *
 * Copyright (c) 2016 - 2017, Kim Walisch
 * Copyright (c) 2016 - 2017, Wojciech Mula
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef USE_AVX2
// 'Csa' = carry, save, add
// If bb, cc, and *lp are bitvectors, this returns the carry bitvector and sets
// *lp to contain the low-order bits of the sums.  I.e. for each position:
//   if none of bb, cc, and *lp are set, *lp bit is zero and carry bit is zero
//   if exactly 1 is set, *lp bit becomes one and carry bit is zero
//   if exactly 2 are set, *lp bit becomes zero and carry bit is one
//   if all 3 are set, *lp bit becomes one and carry bit is one
HEADER_INLINE VecW Csa256(VecW bb, VecW cc, VecW* lp) {
  const VecW aa = *lp;
  const VecW uu = aa ^ bb;
  *lp = uu ^ cc;
  return (aa & bb) | (uu & cc);
}

HEADER_INLINE VecW CsaOne256(VecW bb, VecW* lp) {
  const VecW aa = *lp;
  *lp = aa ^ bb;
  return aa & bb;
}
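
// Illustrative sketch of how these carry-save adders are typically combined
// (Harley-Seal style), so that the relatively expensive per-vector popcount
// only runs once per four input vectors.  The real PopcountVecsAvx2() below
// uses a deeper ones/twos/fours/... tree; treat this as an explanation of the
// technique rather than the actual implementation.
/*
HEADER_INLINE uintptr_t PopcountVecsCsaSketch(const VecW* vvec, uintptr_t vec_ct) {
  // assumes vec_ct is a multiple of 4
  VecW ones = vecw_setzero();
  VecW twos = vecw_setzero();
  VecW cnt = vecw_setzero();
  for (uintptr_t vidx = 0; vidx != vec_ct; vidx += 4) {
    const VecW twos_a = Csa256(vvec[vidx], vvec[vidx + 1], &ones);
    const VecW twos_b = Csa256(vvec[vidx + 2], vvec[vidx + 3], &ones);
    const VecW fours = Csa256(twos_a, twos_b, &twos);
    // each set bit of 'fours' stands for 4 input bits
    cnt = cnt + PopcountVecAvx2(fours);
  }
  // residuals in 'twos' and 'ones' have weight 2 and 1 respectively
  return 4 * HsumW(cnt) + 2 * HsumW(PopcountVecAvx2(twos)) + HsumW(PopcountVecAvx2(ones));
}
*/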

HEADER_INLINE VecW PopcountVecAvx2(VecW vv) {
  // lookup1[nibble] = 4 + popcount(nibble), lookup2[nibble] = 4 - popcount(nibble)
  const VecW lookup1 = vecw_setr8(4, 5, 5, 6, 5, 6, 6, 7,
                                  5, 6, 6, 7, 6, 7, 7, 8);
  const VecW lookup2 = vecw_setr8(4, 3, 3, 2, 3, 2, 2, 1,
                                  3, 2, 2, 1, 2, 1, 1, 0);

  const VecW m4 = VCONST_W(kMask0F0F);
  const VecW lo = vv & m4;
  const VecW hi = vecw_srli(vv, 4) & m4;
  const VecW popcnt1 = vecw_shuffle8(lookup1, lo);
  const VecW popcnt2 = vecw_shuffle8(lookup2, hi);
  // |(4 + popcount(lo)) - (4 - popcount(hi))| = popcount(lo) + popcount(hi),
  // so the per-8-byte-group byte-sum computed by vecw_sad() yields the
  // popcount of each 64-bit lane.
  return vecw_sad(popcnt1, popcnt2);
}

HEADER_INLINE uintptr_t HsumW(VecW vv) {
  UniVec vu;
  vu.vw = vv;
  return vu.w[0] + vu.w[1] + vu.w[2] + vu.w[3];
  // _mm256_extract_epi64() only worth it if we don't need to extract all the
  // values.
  // (also, I wouldn't be surprised if the compiler recognized the pattern
  // above)
}

// This no longer has any restrictions on vec_ct, though it isn't worth the
// overhead for vec_ct < 16.
uintptr_t PopcountVecsAvx2(const VecW* bit_vvec, uintptr_t vec_ct);

HEADER_INLINE uintptr_t PopcountWords(const uintptr_t* bitvec, uintptr_t word_ct) {
  // Efficiently popcounts bitvec[0..(word_ct - 1)].  In the 64-bit case,
  // bitvec[] must be vector-aligned.
  // The PopcountWordsNzbase() wrapper takes care of starting from a later
  // index.
  uintptr_t tot = 0;
  if (word_ct >= 76) {
    assert(VecIsAligned(bitvec));
    const uintptr_t remainder = word_ct % kWordsPerVec;
    const uintptr_t main_block_word_ct = word_ct - remainder;
    tot = PopcountVecsAvx2(R_CAST(const VecW*, bitvec), main_block_word_ct / kWordsPerVec);
    bitvec = &(bitvec[main_block_word_ct]);
    word_ct = remainder;
  }
  // note that recent clang versions automatically expand this to a
  // full-service routine; takes ~50% longer than PopcountVecsAvx2 on >1kb
  // arrays, but way better than the naive loop
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    tot += PopcountWord(bitvec[widx]);
  }
  return tot;
}
#else  // !USE_AVX2
#  ifdef __LP64__
HEADER_INLINE uintptr_t HsumW(VecW vv) {
  UniVec vu;
  vu.vw = vv;
  return vu.w[0] + vu.w[1];
}
#  else
HEADER_INLINE uintptr_t HsumW(VecW vv) {
  return vv;
}
#  endif

// assumes vec_ct is a multiple of 3
uintptr_t PopcountVecsNoAvx2(const VecW* bit_vvec, uintptr_t vec_ct);

HEADER_INLINE uintptr_t PopcountWords(const uintptr_t* bitvec, uintptr_t word_ct) {
  uintptr_t tot = 0;
#ifndef USE_SSE42
  if (word_ct >= (3 * kWordsPerVec)) {
    // This has an asymptotic ~10% advantage in the SSE4.2 case, but word_ct
    // needs to be in the hundreds before the initial comparison even starts to
    // pay for itself.
    assert(VecIsAligned(bitvec));
    const uintptr_t remainder = word_ct % (3 * kWordsPerVec);
    const uintptr_t main_block_word_ct = word_ct - remainder;
    tot = PopcountVecsNoAvx2(R_CAST(const VecW*, bitvec), main_block_word_ct / kWordsPerVec);
    word_ct = remainder;
    bitvec = &(bitvec[main_block_word_ct]);
  }
#endif
  for (uintptr_t trailing_word_idx = 0; trailing_word_idx != word_ct; ++trailing_word_idx) {
    tot += PopcountWord(bitvec[trailing_word_idx]);
  }
  return tot;
}
#endif  // !USE_AVX2
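
// Usage sketch (illustrative): count the set bits among the first bit_ct bits
// of a bitarray.  PopcountWords() counts whole words, so any bits past bit_ct
// in the last word must already be zero (see ZeroTrailingBits() below), and
// the array should be vector-aligned when word_ct is large enough to hit the
// vectorized path.
/*
HEADER_INLINE uintptr_t CountSetBits(const uintptr_t* bitarr, uintptr_t bit_ct) {
  const uintptr_t word_ct = (bit_ct + kBitsPerWord - 1) / kBitsPerWord;
  return PopcountWords(bitarr, word_ct);
}
*/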

uintptr_t PopcountWordsIntersect(const uintptr_t* __restrict bitvec1_iter, const uintptr_t* __restrict bitvec2_iter, uintptr_t word_ct);

// requires positive word_ct
// stay agnostic a bit longer re: word_ct := DIV_UP(entry_ct, kBitsPerWord)
// vs. word_ct := 1 + (entry_ct / kBitsPerWord)
// (this is a source of bugs, though; interface should probably be changed to
// use entry_ct once multiallelic/dosage implementation is done)
void FillCumulativePopcounts(const uintptr_t* subset_mask, uint32_t word_ct, uint32_t* cumulative_popcounts);

// If idx_list is a list of valid unfiltered indexes, this converts them
// in-place to corresponding filtered indexes.
void UidxsToIdxs(const uintptr_t* subset_mask, const uint32_t* subset_cumulative_popcounts, const uintptr_t idx_list_len, uint32_t* idx_list);

// These functions do not overread, but may write extra bytes up to the word
// boundary.
void Expand1bitTo8(const void* __restrict bytearr, uint32_t input_bit_ct, uint32_t incr, uintptr_t* __restrict dst);

void Expand1bitTo16(const void* __restrict bytearr, uint32_t input_bit_ct, uint32_t incr, uintptr_t* __restrict dst);


// might rename this to IsSet01 (guaranteeing 0/1 return value), and change
// IsSet() to bitarr[idx / kBitsPerWord] & (k1LU << (idx % kBitsPerWord)) since
// I'd expect that to play better with out-of-order execution.  but need to
// benchmark first.
HEADER_INLINE uintptr_t IsSet(const uintptr_t* bitarr, uintptr_t idx) {
  return (bitarr[idx / kBitsPerWord] >> (idx % kBitsPerWord)) & 1;
}

HEADER_INLINE void SetBit(uintptr_t idx, uintptr_t* bitarr) {
  bitarr[idx / kBitsPerWord] |= k1LU << (idx % kBitsPerWord);
}

HEADER_INLINE void ClearBit(uintptr_t idx, uintptr_t* bitarr) {
  bitarr[idx / kBitsPerWord] &= ~(k1LU << (idx % kBitsPerWord));
}

HEADER_INLINE void AssignBit(uintptr_t idx, uintptr_t newbit, uintptr_t* bitarr) {
  const uintptr_t inv_mask = k1LU << (idx % kBitsPerWord);
  uintptr_t* cur_word_ptr = &(bitarr[idx / kBitsPerWord]);
  *cur_word_ptr = ((*cur_word_ptr) & (~inv_mask)) | (inv_mask * newbit);
}
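
// Minimal usage sketch (illustrative): single-bit operations on a small
// stack-allocated bitarray; the 4-word size is arbitrary.
/*
HEADER_INLINE void BitarrBasicsDemo(void) {
  uintptr_t bitarr[4];
  ZeroWArr(4, bitarr);  // all bits cleared
  SetBit(5, bitarr);
  SetBit(70, bitarr);   // lands in the second word on 64-bit builds
  ClearBit(5, bitarr);
  // IsSet(bitarr, 5) is now 0, IsSet(bitarr, 70) is 1
  AssignBit(5, IsSet(bitarr, 70), bitarr);  // copy bit 70's value to bit 5
}
*/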

/*
HEADER_INLINE uintptr_t BitInnerIter1(uintptr_t uidx_base, uintptr_t* cur_bitsp, uintptr_t* cur_uidx_stopp) {
  const uintptr_t cur_bits = *cur_bitsp;
  const uint32_t uidx_start_lowbits = ctzw(*cur_bitsp);
  // Key idea is to iterate over sub-blocks of set bits in a single word, in
  // essentially the same manner as non-AVX2 CopyBitarrSubset() was doing.
  // This particular expression 'finds' the end of the current sub-block.
  const uintptr_t cur_bits_lfill_p1 = (cur_bits | (cur_bits - 1)) + 1;
  *cur_bitsp = cur_bits & cur_bits_lfill_p1;
  uint32_t uidx_stop_lowbits = kBitsPerWord;
  if (cur_bits_lfill_p1) {
    uidx_stop_lowbits = ctzw(cur_bits_lfill_p1);
  }
  *cur_uidx_stopp = uidx_base + uidx_stop_lowbits;
  return uidx_base + uidx_start_lowbits;
}
*/

HEADER_INLINE uintptr_t BitIter1(const uintptr_t* __restrict bitarr, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  if (!cur_bits) {
    uintptr_t widx = (*uidx_basep) / kBitsPerWord;
    do {
      cur_bits = bitarr[++widx];
    } while (!cur_bits);
    *uidx_basep = widx * kBitsPerWord;
  }
  *cur_bitsp = cur_bits & (cur_bits - 1);
  return (*uidx_basep) + ctzw(cur_bits);
}

// Returns lowbit index instead of the full index.
HEADER_INLINE uint32_t BitIter1x(const uintptr_t* __restrict bitarr, uintptr_t* __restrict widxp, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  while (!cur_bits) {
    cur_bits = bitarr[++(*widxp)];
  }
  *cur_bitsp = cur_bits & (cur_bits - 1);
  return ctzw(cur_bits);
}

// Returns isolated lowbit.
HEADER_INLINE uintptr_t BitIter1y(const uintptr_t* __restrict bitarr, uintptr_t* __restrict widxp, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  while (!cur_bits) {
    cur_bits = bitarr[++(*widxp)];
  }
  const uintptr_t shifted_bit = cur_bits & (-cur_bits);
  *cur_bitsp = cur_bits ^ shifted_bit;
  return shifted_bit;
}

HEADER_INLINE void BitIter1Start(const uintptr_t* __restrict bitarr, uintptr_t restart_uidx, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_bitsp) {
  const uintptr_t widx = restart_uidx / kBitsPerWord;
  *cur_bitsp = bitarr[widx] & ((~k0LU) << (restart_uidx % kBitsPerWord));
  *uidx_basep = widx * kBitsPerWord;
}
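
// Usage sketch (illustrative): the standard pattern for iterating over the
// set bits of bitarr starting at bit first_uidx.  set_bit_ct (the number of
// set bits at or after first_uidx, e.g. obtained via PopcountWords()) must
// bound the loop, since BitIter1() itself never checks for the end of the
// array; all variable names here are hypothetical.
/*
HEADER_INLINE void VisitSetBits(const uintptr_t* bitarr, uintptr_t first_uidx, uintptr_t set_bit_ct) {
  uintptr_t uidx_base;
  uintptr_t cur_bits;
  BitIter1Start(bitarr, first_uidx, &uidx_base, &cur_bits);
  for (uintptr_t set_idx = 0; set_idx != set_bit_ct; ++set_idx) {
    const uintptr_t uidx = BitIter1(bitarr, &uidx_base, &cur_bits);
    // ... process set bit at index uidx ...
  }
}
*/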

HEADER_INLINE uintptr_t BitIter1NoAdv(const uintptr_t* __restrict bitarr, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  if (!cur_bits) {
    uintptr_t widx = (*uidx_basep) / kBitsPerWord;
    do {
      cur_bits = bitarr[++widx];
    } while (!cur_bits);
    *uidx_basep = widx * kBitsPerWord;
    *cur_bitsp = cur_bits;
  }
  return (*uidx_basep) + ctzw(cur_bits);
}

HEADER_INLINE uintptr_t BitIter0(const uintptr_t* __restrict bitarr, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_inv_bitsp) {
  uintptr_t cur_inv_bits = *cur_inv_bitsp;
  if (!cur_inv_bits) {
    uintptr_t widx = (*uidx_basep) / kBitsPerWord;
    do {
      cur_inv_bits = ~bitarr[++widx];
    } while (!cur_inv_bits);
    *uidx_basep = widx * kBitsPerWord;
  }
  *cur_inv_bitsp = cur_inv_bits & (cur_inv_bits - 1);
  return (*uidx_basep) + ctzw(cur_inv_bits);
}

HEADER_INLINE void BitIter0Start(const uintptr_t* __restrict bitarr, uintptr_t restart_uidx, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_inv_bitsp) {
  const uintptr_t widx = restart_uidx / kBitsPerWord;
  *cur_inv_bitsp = (~bitarr[widx]) & ((~k0LU) << (restart_uidx % kBitsPerWord));
  *uidx_basep = widx * kBitsPerWord;
}

HEADER_INLINE uintptr_t BitIter0NoAdv(const uintptr_t* __restrict bitarr, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_inv_bitsp) {
  uintptr_t cur_inv_bits = *cur_inv_bitsp;
  if (!cur_inv_bits) {
    uintptr_t widx = (*uidx_basep) / kBitsPerWord;
    do {
      cur_inv_bits = ~bitarr[++widx];
    } while (!cur_inv_bits);
    *uidx_basep = widx * kBitsPerWord;
    *cur_inv_bitsp = cur_inv_bits;
  }
  return (*uidx_basep) + ctzw(cur_inv_bits);
}

// todo: test this against extracting a nonmissing bitarr first
/*
HEADER_INLINE void NextNonmissingUnsafeCk32(const uintptr_t* __restrict genoarr, uint32_t* __restrict loc_ptr) {
  if (GetNyparrEntry(genoarr, *loc_ptr) == 3) {
    *loc_ptr = NextNonmissingUnsafe(genoarr, *loc_ptr);
  }
}
*/

// Equivalent to popcount_bit_idx(subset_mask, 0, raw_idx).
HEADER_INLINE uint32_t RawToSubsettedPos(const uintptr_t* subset_mask, const uint32_t* subset_cumulative_popcounts, uint32_t raw_idx) {
  // this should be much better than keeping a uidx_to_idx array!
  // (update: there are more compact indexes, but postpone for now, this is
  // nice and simple and gets us most of what we need.)
  const uint32_t raw_widx = raw_idx / kBitsPerWord;
  return subset_cumulative_popcounts[raw_widx] + PopcountWord(bzhi(subset_mask[raw_widx], raw_idx % kBitsPerWord));
}
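
// Usage sketch (illustrative): build the cumulative-popcount index once, then
// translate unfiltered ("raw") indexes to filtered indexes in O(1).
// cumulative_popcounts_buf is a hypothetical caller-provided buffer with
// raw_word_ct entries (normally arena-allocated).
/*
HEADER_INLINE uint32_t RawToSubsettedPosDemo(const uintptr_t* subset_mask, uint32_t raw_word_ct, uint32_t* cumulative_popcounts_buf, uint32_t raw_idx) {
  FillCumulativePopcounts(subset_mask, raw_word_ct, cumulative_popcounts_buf);
  return RawToSubsettedPos(subset_mask, cumulative_popcounts_buf, raw_idx);
}
*/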

HEADER_INLINE void ZeroTrailingBits(uintptr_t bit_ct, uintptr_t* bitarr) {
  const uint32_t trail_ct = bit_ct % kBitsPerWord;
  if (trail_ct) {
    bitarr[bit_ct / kBitsPerWord] = bzhi(bitarr[bit_ct / kBitsPerWord], trail_ct);
  }
}

#ifdef __LP64__
HEADER_INLINE void ZeroTrailingWords(uint32_t word_ct, uintptr_t* bitvec) {
  const uint32_t remainder = word_ct % kWordsPerVec;
  if (remainder) {
    ZeroWArr(kWordsPerVec - remainder, &(bitvec[word_ct]));
  }
}
#else
HEADER_INLINE void ZeroTrailingWords(__maybe_unused uint32_t word_ct, __maybe_unused uintptr_t* bitvec) {
}
#endif

// output_bit_idx_end is practically always subset_size
void CopyBitarrSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_mask, uint32_t output_bit_idx_end, uintptr_t* __restrict output_bitarr);

// expand_size + read_start_bit must be positive.
void ExpandBytearr(const void* __restrict compact_bitarr, const uintptr_t* __restrict expand_mask, uint32_t word_ct, uint32_t expand_size, uint32_t read_start_bit, uintptr_t* __restrict target);

// equivalent to calling ExpandBytearr() followed by CopyBitarrSubset()
void ExpandThenSubsetBytearr(const void* __restrict compact_bitarr, const uintptr_t* __restrict expand_mask, const uintptr_t* __restrict subset_mask, uint32_t expand_size, uint32_t subset_size, uint32_t read_start_bit, uintptr_t* __restrict target);

// mid_popcount must be positive
void ExpandBytearrNested(const void* __restrict compact_bitarr, const uintptr_t* __restrict mid_bitarr, const uintptr_t* __restrict top_expand_mask, uint32_t word_ct, uint32_t mid_popcount, uint32_t mid_start_bit, uintptr_t* __restrict mid_target, uintptr_t* __restrict compact_target);

// mid_popcount must be positive
// if mid_start_bit == 1, mid_popcount should not include that bit
void ExpandThenSubsetBytearrNested(const void* __restrict compact_bitarr, const uintptr_t* __restrict mid_bitarr, const uintptr_t* __restrict top_expand_mask, const uintptr_t* __restrict subset_mask, uint32_t subset_size, uint32_t mid_popcount, uint32_t mid_start_bit, uintptr_t* __restrict mid_target, uintptr_t* __restrict compact_target);

// these don't read past the end of bitarr
uintptr_t PopcountBytes(const void* bitarr, uintptr_t byte_ct);
uintptr_t PopcountBytesMasked(const void* bitarr, const uintptr_t* mask_arr, uintptr_t byte_ct);


// TransposeNypblock(), which is more plink-specific, is in pgenlib_misc
CONSTI32(kPglBitTransposeBatch, kBitsPerCacheline);
CONSTI32(kPglBitTransposeWords, kWordsPerCacheline);
// * Up to 512x512; vecaligned_buf must have size 64k
// * write_iter must be allocated up to at least
//   RoundUpPow2(write_batch_size, 2) rows
// * We use pointers with different types to read from and write to buf0/buf1,
//   so defining the base type as unsigned char* is theoretically necessary to
//   avoid breaking strict-aliasing rules, while the restrict qualifiers should
//   tell the compiler it doesn't need to be paranoid about writes to one of
//   the buffers screwing with reads from the other.
#ifdef __LP64__
CONSTI32(kPglBitTransposeBufbytes, (kPglBitTransposeBatch * kPglBitTransposeBatch) / (CHAR_BIT / 2));
void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_row_ct, uint32_t write_row_ct, uintptr_t* write_iter, VecW* __restrict buf0, VecW* __restrict buf1);

HEADER_INLINE void TransposeBitblock(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* vecaligned_buf) {
  TransposeBitblock64(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, vecaligned_buf, &(vecaligned_buf[kPglBitTransposeBufbytes / (2 * kBytesPerVec)]));
}

#else  // !__LP64__
CONSTI32(kPglBitTransposeBufbytes, (kPglBitTransposeBatch * kPglBitTransposeBatch) / (CHAR_BIT / 2));
void TransposeBitblock32(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* __restrict buf0, VecW* __restrict buf1);

// If this ever needs to be called on an input byte array, read_iter could be
// changed to const void*; in that case, read_ul_stride should be changed to a
// byte count.
HEADER_INLINE void TransposeBitblock(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* vecaligned_buf) {
  TransposeBitblock32(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, vecaligned_buf, &(vecaligned_buf[kPglBitTransposeBufbytes / (2 * kBytesPerVec)]));
}
#endif

CONSTI32(kPglBitTransposeBufwords, kPglBitTransposeBufbytes / kBytesPerWord);
CONSTI32(kPglBitTransposeBufvecs, kPglBitTransposeBufbytes / kBytesPerVec);

CONSTI32(kPglNybbleTransposeBatch, kNybblesPerCacheline);
CONSTI32(kPglNybbleTransposeWords, kWordsPerCacheline);

CONSTI32(kPglNybbleTransposeBufbytes, (kPglNybbleTransposeBatch * kPglNybbleTransposeBatch) / 2);

// up to 128x128; vecaligned_buf must have size 8k
// now ok for write_iter to not be padded when write_batch_size odd
void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, VecW* vecaligned_buf);

#ifdef __LP64__
#  ifdef USE_AVX2
extern const unsigned char kLeadMask[2 * kBytesPerVec] __attribute__ ((aligned (64)));
#  else
extern const unsigned char kLeadMask[2 * kBytesPerVec] __attribute__ ((aligned (32)));
#  endif
#endif

uintptr_t BytesumArr(const void* bytearr, uintptr_t byte_ct);

uintptr_t CountByte(const void* bytearr, unsigned char ucc, uintptr_t byte_ct);

uintptr_t CountU16(const void* u16arr, uint16_t usii, uintptr_t u16_ct);


// Applies sample_include to {src_subset, src_vals}.
uint32_t Copy1bit8Subset(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uintptr_t* __restrict sample_include, uint32_t src_subset_size, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals);

uint32_t Copy1bit16Subset(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uintptr_t* __restrict sample_include, uint32_t src_subset_size, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals);

#ifdef __cplusplus
}  // namespace plink2
#endif

#endif  // __PLINK2_BITS_H__