#ifndef __PLINK2_BITS_H__
#define __PLINK2_BITS_H__

// This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by the
// Free Software Foundation; either version 3 of the License, or (at your
// option) any later version.
//
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
// for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library. If not, see <http://www.gnu.org/licenses/>.


// Bitarray support. (Inline single-word operations are in plink2_base.h.)

#include "plink2_base.h"

#ifdef __cplusplus
namespace plink2 {
#endif

#if defined(__LP64__) && !defined(USE_AVX2)
// may also want a version which doesn't always apply kMask5555
void Pack32bTo16bMask(const void* words, uintptr_t ct_32b, void* dest);

HEADER_INLINE void PackWordsToHalfwordsMask(const uintptr_t* words, uintptr_t word_ct, Halfword* dest) {
  uintptr_t widx = 0;
  if (word_ct >= (32 / kBytesPerWord)) {
    const uintptr_t ct_32b = word_ct / (32 / kBytesPerWord);
    Pack32bTo16bMask(words, ct_32b, dest);
    widx = ct_32b * (32 / kBytesPerWord);
  }
  for (; widx != word_ct; ++widx) {
    dest[widx] = PackWordToHalfwordMask5555(words[widx]);
  }
}
#else
HEADER_INLINE void PackWordsToHalfwordsMask(const uintptr_t* words, uintptr_t word_ct, Halfword* dest) {
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    dest[widx] = PackWordToHalfwordMask5555(words[widx]);
  }
}
#endif
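
// Illustrative usage sketch (not part of the original header; assumes
// PackWordToHalfwordMask5555() from plink2_base.h packs the even-indexed,
// i.e. kMask5555, bits of each word into one halfword):
/*
  uintptr_t geno_words[2] = {0x5555555555555555LLU, 0x0000000000000005LLU};
  Halfword halfwords[2];
  PackWordsToHalfwordsMask(geno_words, 2, halfwords);
  // On a 64-bit build: halfwords[0] == 0xffffffffU, halfwords[1] == 0x3U.
*/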

// ok for ct == 0
void SetAllBits(uintptr_t ct, uintptr_t* bitarr);

void BitvecAnd(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);

void BitvecInvmask(const uintptr_t* __restrict exclude_bitvec, uintptr_t word_ct, uintptr_t* __restrict main_bitvec);

void BitvecOr(const uintptr_t* __restrict arg_bitvec, uintptr_t word_ct, uintptr_t* main_bitvec);

void BitvecInvert(uintptr_t word_ct, uintptr_t* main_bitvec);

// Functions with "adv" in the name generally take an index or char-pointer as
// an argument and return its new value, while "mov" functions take a
// pointer-to-index or pointer-to-char-pointer and move it.

// These return the current index if the corresponding bit satisfies the
// condition.
uintptr_t AdvTo1Bit(const uintptr_t* bitarr, uintptr_t loc);

uintptr_t AdvTo0Bit(const uintptr_t* bitarr, uintptr_t loc);

// uintptr_t NextNonmissingUnsafe(const uintptr_t* genoarr, uintptr_t loc);

uint32_t AdvBoundedTo1Bit(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil);

uintptr_t AdvBoundedTo0Bit(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil);

uint32_t FindLast1BitBefore(const uintptr_t* bitarr, uint32_t loc);
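
// Illustrative scan sketch (not from the original header; assumes
// AdvBoundedTo1Bit() returns ceil when no set bit is found below it):
/*
  for (uint32_t loc = AdvBoundedTo1Bit(bitarr, 0, bit_ct); loc != bit_ct;
       loc = AdvBoundedTo1Bit(bitarr, loc + 1, bit_ct)) {
    // ... bit 'loc' is set ...
  }
*/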

// possible todo: check if movemask-based solution is better in AVX2 case
HEADER_INLINE uint32_t AllWordsAreZero(const uintptr_t* word_arr, uintptr_t word_ct) {
  while (word_ct--) {
    if (*word_arr++) {
      return 0;
    }
  }
  return 1;
}

HEADER_INLINE uint32_t AllBitsAreOne(const uintptr_t* bitarr, uintptr_t bit_ct) {
  const uintptr_t fullword_ct = bit_ct / kBitsPerWord;
  for (uintptr_t widx = 0; widx != fullword_ct; ++widx) {
    if (~(bitarr[widx])) {
      return 0;
    }
  }
  const uint32_t trailing_bit_ct = bit_ct % kBitsPerWord;
  // Left-shifting the complement discards the bits past bit_ct; the shifted
  // value is nonzero iff one of the trailing_bit_ct low bits is unset.
  return (!trailing_bit_ct) || (!((~(bitarr[fullword_ct])) << (kBitsPerWord - trailing_bit_ct)));
}

uint32_t AllBytesAreX(const unsigned char* bytes, unsigned char match, uintptr_t byte_ct);

// Updated PopcountWords() code is based on
// https://github.com/kimwalisch/libpopcnt . libpopcnt license text follows.

/*
 * libpopcnt.h - C/C++ library for counting the number of 1 bits (bit
 * population count) in an array as quickly as possible using
 * specialized CPU instructions i.e. POPCNT, AVX2, AVX512, NEON.
 *
 * Copyright (c) 2016 - 2017, Kim Walisch
 * Copyright (c) 2016 - 2017, Wojciech Mula
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef USE_AVX2
// 'Csa' = carry, save, add
// If bb, cc, and *lp are bitvectors, this returns the carry bitvector and sets
// *lp to contain the low-order bits of the sums. I.e. for each position:
//   if none of bb, cc, and *lp are set, *lp bit is zero and carry bit is zero
//   if exactly 1 is set, *lp bit becomes one and carry bit is zero
//   if exactly 2 are set, *lp bit becomes zero and carry bit is one
//   if all 3 are set, *lp bit becomes one and carry bit is one
HEADER_INLINE VecW Csa256(VecW bb, VecW cc, VecW* lp) {
  const VecW aa = *lp;
  const VecW uu = aa ^ bb;
  *lp = uu ^ cc;
  return (aa & bb) | (uu & cc);
}
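
// Equivalent scalar identity (illustrative note, not from the original
// header): for any words a, b, c, with lo = a ^ b ^ c and
// carry = (a & b) | ((a ^ b) & c) as computed by Csa256() above,
//   PopcountWord(a) + PopcountWord(b) + PopcountWord(c)
//     == PopcountWord(lo) + 2 * PopcountWord(carry)
// which is why chaining carry-save steps lets many vectors be popcounted with
// only a few final per-vector popcounts.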

HEADER_INLINE VecW CsaOne256(VecW bb, VecW* lp) {
  const VecW aa = *lp;
  *lp = aa ^ bb;
  return aa & bb;
}

HEADER_INLINE VecW PopcountVecAvx2(VecW vv) {
  const VecW lookup1 = vecw_setr8(4, 5, 5, 6, 5, 6, 6, 7,
                                  5, 6, 6, 7, 6, 7, 7, 8);
  const VecW lookup2 = vecw_setr8(4, 3, 3, 2, 3, 2, 2, 1,
                                  3, 2, 2, 1, 2, 1, 1, 0);
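  // lookup1 maps a nybble x to popcount(x) + 4 and lookup2 maps x to
  // 4 - popcount(x), so (assuming vecw_sad() is a byte-wise sum of absolute
  // differences) the final step below yields popcount(low nybble) +
  // popcount(high nybble) for each byte, accumulated per 64-bit lane.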

  const VecW m4 = VCONST_W(kMask0F0F);
  const VecW lo = vv & m4;
  const VecW hi = vecw_srli(vv, 4) & m4;
  const VecW popcnt1 = vecw_shuffle8(lookup1, lo);
  const VecW popcnt2 = vecw_shuffle8(lookup2, hi);
  return vecw_sad(popcnt1, popcnt2);
}

HEADER_INLINE uintptr_t HsumW(VecW vv) {
  UniVec vu;
  vu.vw = vv;
  return vu.w[0] + vu.w[1] + vu.w[2] + vu.w[3];
  // _mm256_extract_epi64() only worth it if we don't need to extract all the
  // values.
  // (also, I wouldn't be surprised if the compiler recognized the pattern
  // above)
}

// This no longer has any restrictions on vec_ct, though it isn't worth the
// overhead for vec_ct < 16.
uintptr_t PopcountVecsAvx2(const VecW* bit_vvec, uintptr_t vec_ct);

HEADER_INLINE uintptr_t PopcountWords(const uintptr_t* bitvec, uintptr_t word_ct) {
  // Efficiently popcounts bitvec[0..(word_ct - 1)]. In the 64-bit case,
  // bitvec[] must be 16-byte aligned.
  // The PopcountWordsNzbase() wrapper takes care of starting from a later
  // index.
  uintptr_t tot = 0;
  if (word_ct >= 76) {
    assert(VecIsAligned(bitvec));
    const uintptr_t remainder = word_ct % kWordsPerVec;
    const uintptr_t main_block_word_ct = word_ct - remainder;
    tot = PopcountVecsAvx2(R_CAST(const VecW*, bitvec), main_block_word_ct / kWordsPerVec);
    bitvec = &(bitvec[main_block_word_ct]);
    word_ct = remainder;
  }
  // note that recent clang versions automatically expand this to a
  // full-service routine; takes ~50% longer than PopcountVecsAvx2 on >1kb
  // arrays, but way better than the naive loop
  for (uintptr_t widx = 0; widx != word_ct; ++widx) {
    tot += PopcountWord(bitvec[widx]);
  }
  return tot;
}
#else // !USE_AVX2
# ifdef __LP64__
HEADER_INLINE uintptr_t HsumW(VecW vv) {
  UniVec vu;
  vu.vw = vv;
  return vu.w[0] + vu.w[1];
}
# else
HEADER_INLINE uintptr_t HsumW(VecW vv) {
  return vv;
}
# endif

// assumes vec_ct is a multiple of 3
uintptr_t PopcountVecsNoAvx2(const VecW* bit_vvec, uintptr_t vec_ct);

HEADER_INLINE uintptr_t PopcountWords(const uintptr_t* bitvec, uintptr_t word_ct) {
  uintptr_t tot = 0;
#ifndef USE_SSE42
  if (word_ct >= (3 * kWordsPerVec)) {
    // This has an asymptotic ~10% advantage in the SSE4.2 case, but word_ct
    // needs to be in the hundreds before the initial comparison even starts
    // to pay for itself.
    assert(VecIsAligned(bitvec));
    const uintptr_t remainder = word_ct % (3 * kWordsPerVec);
    const uintptr_t main_block_word_ct = word_ct - remainder;
    tot = PopcountVecsNoAvx2(R_CAST(const VecW*, bitvec), main_block_word_ct / kWordsPerVec);
    word_ct = remainder;
    bitvec = &(bitvec[main_block_word_ct]);
  }
#endif
  for (uintptr_t trailing_word_idx = 0; trailing_word_idx != word_ct; ++trailing_word_idx) {
    tot += PopcountWord(bitvec[trailing_word_idx]);
  }
  return tot;
}
#endif // !USE_AVX2

uintptr_t PopcountWordsIntersect(const uintptr_t* __restrict bitvec1_iter, const uintptr_t* __restrict bitvec2_iter, uintptr_t word_ct);

// requires positive word_ct
// stay agnostic a bit longer re: word_ct := DIV_UP(entry_ct, kBitsPerWord)
// vs. word_ct := 1 + (entry_ct / kBitsPerWord)
// (this is a source of bugs, though; interface should probably be changed to
// use entry_ct once multiallelic/dosage implementation is done)
void FillCumulativePopcounts(const uintptr_t* subset_mask, uint32_t word_ct, uint32_t* cumulative_popcounts);

// If idx_list is a list of valid unfiltered indexes, this converts them
// in-place to corresponding filtered indexes.
void UidxsToIdxs(const uintptr_t* subset_mask, const uint32_t* subset_cumulative_popcounts, const uintptr_t idx_list_len, uint32_t* idx_list);

// These functions do not overread, but may write extra bytes up to the word
// boundary.
void Expand1bitTo8(const void* __restrict bytearr, uint32_t input_bit_ct, uint32_t incr, uintptr_t* __restrict dst);

void Expand1bitTo16(const void* __restrict bytearr, uint32_t input_bit_ct, uint32_t incr, uintptr_t* __restrict dst);


// might rename this to IsSet01 (guaranteeing 0/1 return value), and change
// IsSet() to bitarr[idx / kBitsPerWord] & (k1LU << (idx % kBitsPerWord)) since
// I'd expect that to play better with out-of-order execution. but need to
// benchmark first.
HEADER_INLINE uintptr_t IsSet(const uintptr_t* bitarr, uintptr_t idx) {
  return (bitarr[idx / kBitsPerWord] >> (idx % kBitsPerWord)) & 1;
}

HEADER_INLINE void SetBit(uintptr_t idx, uintptr_t* bitarr) {
  bitarr[idx / kBitsPerWord] |= k1LU << (idx % kBitsPerWord);
}

HEADER_INLINE void ClearBit(uintptr_t idx, uintptr_t* bitarr) {
  bitarr[idx / kBitsPerWord] &= ~(k1LU << (idx % kBitsPerWord));
}

HEADER_INLINE void AssignBit(uintptr_t idx, uintptr_t newbit, uintptr_t* bitarr) {
  const uintptr_t inv_mask = k1LU << (idx % kBitsPerWord);
  uintptr_t* cur_word_ptr = &(bitarr[idx / kBitsPerWord]);
  *cur_word_ptr = ((*cur_word_ptr) & (~inv_mask)) | (inv_mask * newbit);
}

/*
HEADER_INLINE uintptr_t BitInnerIter1(uintptr_t uidx_base, uintptr_t* cur_bitsp, uintptr_t* cur_uidx_stopp) {
  const uintptr_t cur_bits = *cur_bitsp;
  const uint32_t uidx_start_lowbits = ctzw(*cur_bitsp);
  // Key idea is to iterate over sub-blocks of set bits in a single word, in
  // essentially the same manner as non-AVX2 CopyBitarrSubset() was doing.
  // This particular expression 'finds' the end of the current sub-block.
  const uintptr_t cur_bits_lfill_p1 = (cur_bits | (cur_bits - 1)) + 1;
  *cur_bitsp = cur_bits & cur_bits_lfill_p1;
  uint32_t uidx_stop_lowbits = kBitsPerWord;
  if (cur_bits_lfill_p1) {
    uidx_stop_lowbits = ctzw(cur_bits_lfill_p1);
  }
  *cur_uidx_stopp = uidx_base + uidx_stop_lowbits;
  return uidx_base + uidx_start_lowbits;
}
*/

HEADER_INLINE uintptr_t BitIter1(const uintptr_t* __restrict bitarr, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  if (!cur_bits) {
    uintptr_t widx = (*uidx_basep) / kBitsPerWord;
    do {
      cur_bits = bitarr[++widx];
    } while (!cur_bits);
    *uidx_basep = widx * kBitsPerWord;
  }
  *cur_bitsp = cur_bits & (cur_bits - 1);
  return (*uidx_basep) + ctzw(cur_bits);
}
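
// Typical iteration sketch (illustrative names, not from the original
// header): visit every set bit of sample_include, where sample_ct is the
// total number of set bits.
/*
  uintptr_t sample_uidx_base = 0;
  uintptr_t cur_bits = sample_include[0];
  for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
    const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
    // ... process entry sample_uidx ...
  }
*/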

// Returns lowbit index instead of the full index.
HEADER_INLINE uint32_t BitIter1x(const uintptr_t* __restrict bitarr, uintptr_t* __restrict widxp, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  while (!cur_bits) {
    cur_bits = bitarr[++(*widxp)];
  }
  *cur_bitsp = cur_bits & (cur_bits - 1);
  return ctzw(cur_bits);
}

// Returns isolated lowbit.
HEADER_INLINE uintptr_t BitIter1y(const uintptr_t* __restrict bitarr, uintptr_t* __restrict widxp, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  while (!cur_bits) {
    cur_bits = bitarr[++(*widxp)];
  }
  const uintptr_t shifted_bit = cur_bits & (-cur_bits);
  *cur_bitsp = cur_bits ^ shifted_bit;
  return shifted_bit;
}

HEADER_INLINE void BitIter1Start(const uintptr_t* __restrict bitarr, uintptr_t restart_uidx, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_bitsp) {
  const uintptr_t widx = restart_uidx / kBitsPerWord;
  *cur_bitsp = bitarr[widx] & ((~k0LU) << (restart_uidx % kBitsPerWord));
  *uidx_basep = widx * kBitsPerWord;
}
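
// Sketch of starting iteration at a nonzero bit index (illustrative names,
// not from the original header): BitIter1Start() initializes the
// (uidx_base, cur_bits) pair so that subsequent BitIter1() calls begin at
// first_uidx.
/*
  uintptr_t variant_uidx_base, cur_bits;
  BitIter1Start(variant_include, first_uidx, &variant_uidx_base, &cur_bits);
  for (uint32_t uii = 0; uii != variant_ct; ++uii) {
    const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
    // ... process variant_uidx ...
  }
*/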

HEADER_INLINE uintptr_t BitIter1NoAdv(const uintptr_t* __restrict bitarr, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_bitsp) {
  uintptr_t cur_bits = *cur_bitsp;
  if (!cur_bits) {
    uintptr_t widx = (*uidx_basep) / kBitsPerWord;
    do {
      cur_bits = bitarr[++widx];
    } while (!cur_bits);
    *uidx_basep = widx * kBitsPerWord;
    *cur_bitsp = cur_bits;
  }
  return (*uidx_basep) + ctzw(cur_bits);
}

HEADER_INLINE uintptr_t BitIter0(const uintptr_t* __restrict bitarr, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_inv_bitsp) {
  uintptr_t cur_inv_bits = *cur_inv_bitsp;
  if (!cur_inv_bits) {
    uintptr_t widx = (*uidx_basep) / kBitsPerWord;
    do {
      cur_inv_bits = ~bitarr[++widx];
    } while (!cur_inv_bits);
    *uidx_basep = widx * kBitsPerWord;
  }
  *cur_inv_bitsp = cur_inv_bits & (cur_inv_bits - 1);
  return (*uidx_basep) + ctzw(cur_inv_bits);
}

HEADER_INLINE void BitIter0Start(const uintptr_t* __restrict bitarr, uintptr_t restart_uidx, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_inv_bitsp) {
  const uintptr_t widx = restart_uidx / kBitsPerWord;
  *cur_inv_bitsp = (~bitarr[widx]) & ((~k0LU) << (restart_uidx % kBitsPerWord));
  *uidx_basep = widx * kBitsPerWord;
}

HEADER_INLINE uintptr_t BitIter0NoAdv(const uintptr_t* __restrict bitarr, uintptr_t* __restrict uidx_basep, uintptr_t* __restrict cur_inv_bitsp) {
  uintptr_t cur_inv_bits = *cur_inv_bitsp;
  if (!cur_inv_bits) {
    uintptr_t widx = (*uidx_basep) / kBitsPerWord;
    do {
      cur_inv_bits = ~bitarr[++widx];
    } while (!cur_inv_bits);
    *uidx_basep = widx * kBitsPerWord;
    *cur_inv_bitsp = cur_inv_bits;
  }
  return (*uidx_basep) + ctzw(cur_inv_bits);
}

// todo: test this against extracting a nonmissing bitarr first
/*
HEADER_INLINE void NextNonmissingUnsafeCk32(const uintptr_t* __restrict genoarr, uint32_t* __restrict loc_ptr) {
  if (GetNyparrEntry(genoarr, *loc_ptr) == 3) {
    *loc_ptr = NextNonmissingUnsafe(genoarr, *loc_ptr);
  }
}
*/

// Equivalent to popcount_bit_idx(subset_mask, 0, raw_idx).
HEADER_INLINE uint32_t RawToSubsettedPos(const uintptr_t* subset_mask, const uint32_t* subset_cumulative_popcounts, uint32_t raw_idx) {
  // this should be much better than keeping a uidx_to_idx array!
  // (update: there are more compact indexes, but postpone for now, this is
  // nice and simple and gets us most of what we need.)
  const uint32_t raw_widx = raw_idx / kBitsPerWord;
  return subset_cumulative_popcounts[raw_widx] + PopcountWord(bzhi(subset_mask[raw_widx], raw_idx % kBitsPerWord));
}
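
// Illustrative usage sketch (not from the original header): translating a raw
// index into its position within the subset defined by subset_mask.
/*
  const uint32_t raw_word_ct = (raw_entry_ct + kBitsPerWord - 1) / kBitsPerWord;
  uint32_t* cumulative_popcounts;  // allocate raw_word_ct entries
  FillCumulativePopcounts(subset_mask, raw_word_ct, cumulative_popcounts);
  // Valid whenever bit raw_idx is set in subset_mask:
  const uint32_t subsetted_idx = RawToSubsettedPos(subset_mask, cumulative_popcounts, raw_idx);
*/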

HEADER_INLINE void ZeroTrailingBits(uintptr_t bit_ct, uintptr_t* bitarr) {
  const uint32_t trail_ct = bit_ct % kBitsPerWord;
  if (trail_ct) {
    bitarr[bit_ct / kBitsPerWord] = bzhi(bitarr[bit_ct / kBitsPerWord], trail_ct);
  }
}
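
// Sketch (not from the original header): word-granularity helpers such as
// PopcountWords() count every bit in the final word, so clear the trailing
// bits first when only the first bit_ct bits are meaningful.
/*
  ZeroTrailingBits(bit_ct, bitarr);
  const uintptr_t set_ct = PopcountWords(bitarr, (bit_ct + kBitsPerWord - 1) / kBitsPerWord);
*/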

#ifdef __LP64__
HEADER_INLINE void ZeroTrailingWords(uint32_t word_ct, uintptr_t* bitvec) {
  const uint32_t remainder = word_ct % kWordsPerVec;
  if (remainder) {
    ZeroWArr(kWordsPerVec - remainder, &(bitvec[word_ct]));
  }
}
#else
HEADER_INLINE void ZeroTrailingWords(__maybe_unused uint32_t word_ct, __maybe_unused uintptr_t* bitvec) {
}
#endif

// output_bit_idx_end is practically always subset_size
void CopyBitarrSubset(const uintptr_t* __restrict raw_bitarr, const uintptr_t* __restrict subset_mask, uint32_t output_bit_idx_end, uintptr_t* __restrict output_bitarr);

// expand_size + read_start_bit must be positive.
void ExpandBytearr(const void* __restrict compact_bitarr, const uintptr_t* __restrict expand_mask, uint32_t word_ct, uint32_t expand_size, uint32_t read_start_bit, uintptr_t* __restrict target);

// equivalent to calling ExpandBytearr() followed by CopyBitarrSubset()
void ExpandThenSubsetBytearr(const void* __restrict compact_bitarr, const uintptr_t* __restrict expand_mask, const uintptr_t* __restrict subset_mask, uint32_t expand_size, uint32_t subset_size, uint32_t read_start_bit, uintptr_t* __restrict target);

// mid_popcount must be positive
void ExpandBytearrNested(const void* __restrict compact_bitarr, const uintptr_t* __restrict mid_bitarr, const uintptr_t* __restrict top_expand_mask, uint32_t word_ct, uint32_t mid_popcount, uint32_t mid_start_bit, uintptr_t* __restrict mid_target, uintptr_t* __restrict compact_target);

// mid_popcount must be positive
// if mid_start_bit == 1, mid_popcount should not include that bit
void ExpandThenSubsetBytearrNested(const void* __restrict compact_bitarr, const uintptr_t* __restrict mid_bitarr, const uintptr_t* __restrict top_expand_mask, const uintptr_t* __restrict subset_mask, uint32_t subset_size, uint32_t mid_popcount, uint32_t mid_start_bit, uintptr_t* __restrict mid_target, uintptr_t* __restrict compact_target);

// these don't read past the end of bitarr
uintptr_t PopcountBytes(const void* bitarr, uintptr_t byte_ct);
uintptr_t PopcountBytesMasked(const void* bitarr, const uintptr_t* mask_arr, uintptr_t byte_ct);


// TransposeNypblock(), which is more plink-specific, is in pgenlib_misc
CONSTI32(kPglBitTransposeBatch, kBitsPerCacheline);
CONSTI32(kPglBitTransposeWords, kWordsPerCacheline);
// * Up to 512x512; vecaligned_buf must have size 64k
// * write_iter must be allocated up to at least
//   RoundUpPow2(write_batch_size, 2) rows
// * We use pointers with different types to read from and write to buf0/buf1,
//   so defining the base type as unsigned char* is theoretically necessary to
//   avoid breaking strict-aliasing rules, while the restrict qualifiers should
//   tell the compiler it doesn't need to be paranoid about writes to one of
//   the buffers screwing with reads from the other.
#ifdef __LP64__
CONSTI32(kPglBitTransposeBufbytes, (kPglBitTransposeBatch * kPglBitTransposeBatch) / (CHAR_BIT / 2));
void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_row_ct, uint32_t write_row_ct, uintptr_t* write_iter, VecW* __restrict buf0, VecW* __restrict buf1);

HEADER_INLINE void TransposeBitblock(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* vecaligned_buf) {
  TransposeBitblock64(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, vecaligned_buf, &(vecaligned_buf[kPglBitTransposeBufbytes / (2 * kBytesPerVec)]));
}

#else // !__LP64__
CONSTI32(kPglBitTransposeBufbytes, (kPglBitTransposeBatch * kPglBitTransposeBatch) / (CHAR_BIT / 2));
void TransposeBitblock32(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* __restrict buf0, VecW* __restrict buf1);

// If this ever needs to be called on an input byte array, read_iter could be
// changed to const void*; in that case, read_ul_stride should be changed to a
// byte count.
HEADER_INLINE void TransposeBitblock(const uintptr_t* read_iter, uintptr_t read_ul_stride, uintptr_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* write_iter, VecW* vecaligned_buf) {
  TransposeBitblock32(read_iter, read_ul_stride, write_ul_stride, read_batch_size, write_batch_size, write_iter, vecaligned_buf, &(vecaligned_buf[kPglBitTransposeBufbytes / (2 * kBytesPerVec)]));
}
#endif

CONSTI32(kPglBitTransposeBufwords, kPglBitTransposeBufbytes / kBytesPerWord);
CONSTI32(kPglBitTransposeBufvecs, kPglBitTransposeBufbytes / kBytesPerVec);

CONSTI32(kPglNybbleTransposeBatch, kNybblesPerCacheline);
CONSTI32(kPglNybbleTransposeWords, kWordsPerCacheline);

CONSTI32(kPglNybbleTransposeBufbytes, (kPglNybbleTransposeBatch * kPglNybbleTransposeBatch) / 2);

// up to 128x128; vecaligned_buf must have size 8k
// now ok for write_iter to not be padded when write_batch_size odd
void TransposeNybbleblock(const uintptr_t* read_iter, uint32_t read_ul_stride, uint32_t write_ul_stride, uint32_t read_batch_size, uint32_t write_batch_size, uintptr_t* __restrict write_iter, VecW* vecaligned_buf);

#ifdef __LP64__
# ifdef USE_AVX2
extern const unsigned char kLeadMask[2 * kBytesPerVec] __attribute__ ((aligned (64)));
# else
extern const unsigned char kLeadMask[2 * kBytesPerVec] __attribute__ ((aligned (32)));
# endif
#endif

uintptr_t BytesumArr(const void* bytearr, uintptr_t byte_ct);

uintptr_t CountByte(const void* bytearr, unsigned char ucc, uintptr_t byte_ct);

uintptr_t CountU16(const void* u16arr, uint16_t usii, uintptr_t u16_ct);


// Applies sample_include to {src_subset, src_vals}.
uint32_t Copy1bit8Subset(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uintptr_t* __restrict sample_include, uint32_t src_subset_size, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals);

uint32_t Copy1bit16Subset(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uintptr_t* __restrict sample_include, uint32_t src_subset_size, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals);

#ifdef __cplusplus
} // namespace plink2
#endif

#endif // __PLINK2_BITS_H__