1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "pgenlib_read.h"
19 
20 #include <errno.h>
21 
22 #ifndef NO_MMAP
23 #  include <sys/types.h>  // fstat()
24 #  include <sys/stat.h>  // open(), fstat()
25 #  include <sys/mman.h>  // mmap()
26 #  include <fcntl.h>  // open()
27 #  include <unistd.h>  // fstat()
28 #endif
29 
30 #ifdef __cplusplus
31 namespace plink2 {
32 #endif
33 
GetPgrp(PgenReader * pgr_ptr)34 static inline PgenReaderMain* GetPgrp(PgenReader* pgr_ptr) {
35   return &GET_PRIVATE(*pgr_ptr, m);
36 }
37 
GetSicp(PgrSampleSubsetIndex pssi)38 static inline const uint32_t* GetSicp(PgrSampleSubsetIndex pssi) {
39   return GET_PRIVATE(pssi, cumulative_popcounts);
40 }
41 
42 #ifdef __arm__
43 #  error "Unaligned accesses in SmallGenoarrCount3FreqIncr()."
44 #endif
SmallGenoarrCount3FreqIncr(const uintptr_t * genoarr_iter,uint32_t byte_ct,uint32_t * even_ctp,uint32_t * odd_ctp,uint32_t * bothset_ctp)45 void SmallGenoarrCount3FreqIncr(const uintptr_t* genoarr_iter, uint32_t byte_ct, uint32_t* even_ctp, uint32_t* odd_ctp, uint32_t* bothset_ctp) {
46   for (uint32_t bytes_left = byte_ct; ; ) {
47     uintptr_t cur_geno_word;
48     if (bytes_left < kBytesPerWord) {
49       if (!bytes_left) {
50         return;
51       }
52       cur_geno_word = ProperSubwordLoad(genoarr_iter, bytes_left);
53       bytes_left = 0;
54     } else {
55       cur_geno_word = *genoarr_iter++;
56       bytes_left -= kBytesPerWord;
57     }
58     const uintptr_t cur_geno_word_high = kMask5555 & (cur_geno_word >> 1);
59     *even_ctp += Popcount01Word(cur_geno_word & kMask5555);
60     *odd_ctp += Popcount01Word(cur_geno_word_high);
61     *bothset_ctp += Popcount01Word(cur_geno_word & cur_geno_word_high);
62   }
63 }
64 
65 void GenoarrbCountFreqs(const unsigned char* genoarrb, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
66   // does not read past the end of genoarrb
67   uint32_t lead_byte_ct = (-R_CAST(uintptr_t, genoarrb)) % kBytesPerVec;
68   uint32_t even_ct = 0;
69   uint32_t odd_ct = 0;
70   uint32_t bothset_ct = 0;
71   const uintptr_t* genoarrb_iter;
72   uint32_t trail_ct;
73   if (sample_ct > lead_byte_ct * 4 + (6 * kNypsPerVec)) {
74     const uint32_t remaining_sample_ct = sample_ct - 4 * lead_byte_ct;
75     // strictly speaking, this relies on undefined behavior: see e.g.
76     // http://pzemtsov.github.io/2016/11/06/bug-story-alignment-on-x86.html
77     // Probably want to search out all instances of __arm__ and make the code
78     // standard-compliant, if that can be done without a speed penalty.  Though
79     // it makes sense to wait until more is known about Apple's MacBook
80     // processor plans...
81     SmallGenoarrCount3FreqIncr(R_CAST(const uintptr_t*, genoarrb), lead_byte_ct, &even_ct, &odd_ct, &bothset_ct);
82     genoarrb_iter = R_CAST(const uintptr_t*, &(genoarrb[lead_byte_ct]));
83     const uint32_t remaining_full_vec_ct = remaining_sample_ct / kNypsPerVec;
84     uint32_t even_ct_incr;
85     uint32_t odd_ct_incr;
86     uint32_t bothset_ct_incr;
87     const uint32_t vec_ct = remaining_full_vec_ct - (remaining_full_vec_ct % 6);
88     Count3FreqVec6(R_CAST(const VecW*, genoarrb_iter), vec_ct, &even_ct_incr, &odd_ct_incr, &bothset_ct_incr);
89     even_ct += even_ct_incr;
90     odd_ct += odd_ct_incr;
91     bothset_ct += bothset_ct_incr;
92     genoarrb_iter = &(genoarrb_iter[kWordsPerVec * vec_ct]);
93     trail_ct = remaining_sample_ct - (vec_ct * kNypsPerVec);
94   } else {
95     genoarrb_iter = R_CAST(const uintptr_t*, genoarrb);
96     trail_ct = sample_ct;
97   }
98   const uint32_t trail_byte_ct = NypCtToByteCt(trail_ct);
99   SmallGenoarrCount3FreqIncr(genoarrb_iter, trail_byte_ct, &even_ct, &odd_ct, &bothset_ct);
100   genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
101   genocounts[1] = even_ct - bothset_ct;
102   genocounts[2] = odd_ct - bothset_ct;
103   genocounts[3] = bothset_ct;
104 }
105 
106 #ifdef __arm__
107 #  error "Unaligned accesses in GenoarrbCountSubsetFreqs()."
108 #endif
// Counts the 2-bit genotype values (0..3) in genoarrb, restricted to the
// samples selected by sample_include_interleaved_vec, and writes the four
// counts to genocounts[0..3].
//
// genoarrb: packed 2-bit genotypes for raw_sample_ct samples.
// sample_include_interleaved_vec: sample mask in an interleaved layout; as
//   the loops below show, one group of mask words serves two consecutive
//   genotype vectors (even bit positions for the first, odd bit positions for
//   the second).
// raw_sample_ct: number of entries in genoarrb.
// sample_ct: number of selected samples; only used to derive genocounts[0].
//
// The first (multiple-of-6) block of vectors is handled by
// CountSubset3FreqVec6(), which initializes even_ct/odd_ct/bothset_ct; the
// remaining vectors are processed word-by-word, with subword loads for the
// final partial vector.
void GenoarrbCountSubsetFreqs(const unsigned char* genoarrb, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // does not read past the end of genoarrb
  const uint32_t raw_sample_ctv2 = NypCtToVecCt(raw_sample_ct);
  // Number of selected samples with low bit set / high bit set / both bits
  // set.  These are written by CountSubset3FreqVec6() before any increment.
  uint32_t even_ct;
  uint32_t odd_ct;
  uint32_t bothset_ct;
  // Largest multiple of 6 that is <= raw_sample_ctv2.
  uint32_t vec_idx = raw_sample_ctv2 - (raw_sample_ctv2 % 6);
  CountSubset3FreqVec6(R_CAST(const VecW*, genoarrb), R_CAST(const VecW*, sample_include_interleaved_vec), vec_idx, &even_ct, &odd_ct, &bothset_ct);
  // Resume right after the vectors consumed by CountSubset3FreqVec6().
  const uintptr_t* genoarrb_iter = &(R_CAST(const uintptr_t*, genoarrb)[kWordsPerVec * vec_idx]);
  // The mask advances at half the rate of the genotype data (vec_idx is even
  // here, since it is a multiple of 6).
#ifdef __LP64__
  const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[vec_idx * (kWordsPerVec / 2)]);
#else
  // bugfix (19 Jul 2018): (kWordsPerVec / 2) doesn't work in 32-bit case
  const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[(vec_idx * kWordsPerVec) / 2]);
#endif
#ifdef USE_AVX2
  // Index of the first half-vector that is not completely filled by
  // genoarrb's bytes.
  const uint32_t halfvec_idx_trail = (raw_sample_ct + 3) / (kBitsPerVec / 4);
  uintptr_t mask_base1 = 0;
  uintptr_t mask_base2 = 0;
  uintptr_t mask_base3 = 0;
  uintptr_t mask_base4 = 0;
  for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
    uintptr_t mask_word1;
    uintptr_t mask_word2;
    uintptr_t mask_word3;
    uintptr_t mask_word4;
    if (!(vec_idx % 2)) {
      // Even vector: load four fresh mask words and use their even bits.
      mask_base1 = *interleaved_mask_iter++;
      mask_base2 = *interleaved_mask_iter++;
      mask_base3 = *interleaved_mask_iter++;
      mask_base4 = *interleaved_mask_iter++;
      mask_word1 = mask_base1 & kMask5555;
      mask_word2 = mask_base2 & kMask5555;
      mask_word3 = mask_base3 & kMask5555;
      mask_word4 = mask_base4 & kMask5555;
    } else {
      // Odd vector: reuse the previously loaded mask words' odd bits.
      mask_word1 = (mask_base1 >> 1) & kMask5555;
      mask_word2 = (mask_base2 >> 1) & kMask5555;
      mask_word3 = (mask_base3 >> 1) & kMask5555;
      mask_word4 = (mask_base4 >> 1) & kMask5555;
    }
    // Each vector is processed as two half-vectors of two words each.
    uint32_t vechalf_idx = 0;
    while (1) {
      uintptr_t cur_geno_word1;
      uintptr_t cur_geno_word2;
      if (2 * vec_idx + vechalf_idx < halfvec_idx_trail) {
        cur_geno_word1 = *genoarrb_iter++;
        cur_geno_word2 = *genoarrb_iter++;
      } else {
        const uint32_t remaining_byte_ct = NypCtToByteCt(raw_sample_ct) % kBytesPerVec;
        // todo: check if this harms usual-case loop efficiency
        // Force loop exit after this half-vector.
        vechalf_idx = 1;
        // NOTE(review): remaining_byte_ct is reduced modulo kBytesPerVec (a
        // full vector), while this branch loads at most two words.  If the
        // partial data starts in the second half-vector, the byte count
        // passed to ProperSubwordLoad() below could be kBytesPerWord or more
        // -- confirm that this is handled or cannot occur.
        if (remaining_byte_ct < kBytesPerWord) {
          cur_geno_word1 = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct);
          cur_geno_word2 = 0;
        } else {
          cur_geno_word1 = *genoarrb_iter++;
          cur_geno_word2 = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct - kBytesPerWord);
        }
      }
      const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
      const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
      // All operands only have even bits set, so word1's bits can be shifted
      // into the odd positions and both words counted with one popcount.
      even_ct += PopcountWord(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
      odd_ct += PopcountWord((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
      bothset_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
      if (vechalf_idx) {
        break;
      }
      ++vechalf_idx;
      // Second half-vector uses the third and fourth mask words.
      mask_word1 = mask_word3;
      mask_word2 = mask_word4;
    }
  }
#else  // not USE_AVX2
  // Index of the first vector that is not completely filled by genoarrb's
  // bytes.
  const uint32_t vec_idx_trail = (raw_sample_ct + 3) / kNypsPerVec;
#  ifdef __LP64__
  uintptr_t mask_base1 = 0;
  uintptr_t mask_base2 = 0;
  for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
    uintptr_t mask_word1;
    uintptr_t mask_word2;
    if (!(vec_idx % 2)) {
      // Even vector: load two fresh mask words and use their even bits.
      mask_base1 = *interleaved_mask_iter++;
      mask_base2 = *interleaved_mask_iter++;
      mask_word1 = mask_base1 & kMask5555;
      mask_word2 = mask_base2 & kMask5555;
    } else {
      // Odd vector: reuse the previously loaded mask words' odd bits.
      mask_word1 = (mask_base1 >> 1) & kMask5555;
      mask_word2 = (mask_base2 >> 1) & kMask5555;
    }
    uintptr_t cur_geno_word1;
    uintptr_t cur_geno_word2;
    if (vec_idx < vec_idx_trail) {
      cur_geno_word1 = *genoarrb_iter++;
      cur_geno_word2 = *genoarrb_iter++;
    } else {
      // Partial final vector: only request the bytes that belong to genoarrb.
      const uint32_t remaining_byte_ct = NypCtToByteCt(raw_sample_ct) % kBytesPerVec;
      if (remaining_byte_ct < kBytesPerWord) {
        cur_geno_word1 = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct);
        cur_geno_word2 = 0;
      } else {
        cur_geno_word1 = *genoarrb_iter++;
        // NOTE(review): when remaining_byte_ct == kBytesPerWord this passes a
        // byte count of 0 -- confirm ProperSubwordLoad() tolerates that.
        cur_geno_word2 = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct - kBytesPerWord);
      }
    }
    const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
    const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
#    ifdef USE_SSE42
    // Pack the two even-bit-only words into one word, then popcount once.
    even_ct += PopcountWord(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
    odd_ct += PopcountWord((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
    bothset_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
#    else
    // Add the two even-bit-only words (each 2-bit field becomes 0..2), then
    // sum the 2-bit fields.
    even_ct += NypsumWord((cur_geno_word1 & mask_word1) + (cur_geno_word2 & mask_word2));
    odd_ct += NypsumWord(cur_geno_word1_high_masked + cur_geno_word2_high_masked);
    bothset_ct += NypsumWord((cur_geno_word1 & cur_geno_word1_high_masked) + (cur_geno_word2 & cur_geno_word2_high_masked));
#    endif
  }
#  else  // not __LP64__
  uintptr_t mask_base = 0;
  for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
    uintptr_t mask_word;
    if (!(vec_idx % 2)) {
      // Even index: load a fresh mask word and use its even bits.
      mask_base = *interleaved_mask_iter++;
      mask_word = mask_base & kMask5555;
    } else {
      // Odd index: reuse the previously loaded mask word's odd bits.
      mask_word = (mask_base >> 1) & kMask5555;
    }
    uintptr_t cur_geno_word;
    if (vec_idx < vec_idx_trail) {
      cur_geno_word = *genoarrb_iter++;
    } else {
      // Partial final word: only request the bytes that belong to genoarrb.
      const uint32_t remaining_byte_ct = NypCtToByteCt(raw_sample_ct) % kBytesPerVec;
      cur_geno_word = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct);
    }
    const uintptr_t cur_geno_word_high_masked = mask_word & (cur_geno_word >> 1);
    even_ct += Popcount01Word(cur_geno_word & mask_word);
    odd_ct += Popcount01Word(cur_geno_word_high_masked);
    bothset_ct += Popcount01Word(cur_geno_word & cur_geno_word_high_masked);
  }
#  endif  // not __LP64__
#endif  // not USE_AVX2
  // even_ct and odd_ct both include the both-bits-set entries, so:
  //   #0 = total - even - odd + both, #1 = even - both, #2 = odd - both,
  //   #3 = both.
  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
  genocounts[1] = even_ct - bothset_ct;
  genocounts[2] = odd_ct - bothset_ct;
  genocounts[3] = bothset_ct;
}
255 
256 void GenoarrCountFreqs(const uintptr_t* genoarr, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
257   // this masks out trailing genoarr bits
258   const uint32_t sample_ct_remainder = sample_ct % kBitsPerWordD2;
259   GenoarrCountFreqsUnsafe(genoarr, sample_ct - sample_ct_remainder, genocounts);
260   if (sample_ct_remainder) {
261     uintptr_t cur_geno_word = bzhi(genoarr[sample_ct / kBitsPerWordD2], 2 * sample_ct_remainder);
262     const uintptr_t cur_geno_word_high = kMask5555 & (cur_geno_word >> 1);
263     const uint32_t even_ct = Popcount01Word(cur_geno_word & kMask5555);
264     const uint32_t odd_ct = Popcount01Word(cur_geno_word_high);
265     const uint32_t bothset_ct = Popcount01Word(cur_geno_word & cur_geno_word_high);
266     genocounts[0] += sample_ct_remainder + bothset_ct - even_ct - odd_ct;
267     genocounts[1] += even_ct - bothset_ct;
268     genocounts[2] += odd_ct - bothset_ct;
269     genocounts[3] += bothset_ct;
270   }
271 }
272 
GenovecNonmissingToZeroUnsafe(uint32_t sample_ct,uintptr_t * genovec)273 void GenovecNonmissingToZeroUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
274   // sets 1 and 2 to zero; leaves 3s untouched.
275   const uint32_t vec_ct = NypCtToVecCt(sample_ct);
276   assert(VecIsAligned(genovec));
277   const VecW m1 = VCONST_W(kMask5555);
278   VecW* vptr = R_CAST(VecW*, genovec);
279   for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
280     VecW cur_vec = vptr[vidx];
281     const VecW cur_vec_rshifted = vecw_srli(cur_vec, 1);
282     cur_vec = cur_vec & m1;
283     cur_vec = cur_vec & cur_vec_rshifted;
284     vptr[vidx] = cur_vec | vecw_slli(cur_vec, 1);
285   }
286 }
287 
GenovecNonzeroToMissingUnsafe(uint32_t sample_ct,uintptr_t * genovec)288 void GenovecNonzeroToMissingUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
289   // converts 1s and 2s to 3s, leaves zeroes untouched.
290   const uint32_t vec_ct = NypCtToVecCt(sample_ct);
291   assert(VecIsAligned(genovec));
292   const VecW m1 = VCONST_W(kMask5555);
293   VecW* vptr = R_CAST(VecW*, genovec);
294   for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
295     VecW cur_vec = vptr[vidx];
296     const VecW cur_vec_rshifted = vecw_srli(cur_vec, 1);
297     cur_vec = cur_vec | cur_vec_rshifted;
298     cur_vec = cur_vec & m1;
299     vptr[vidx] = cur_vec | vecw_slli(cur_vec, 1);
300   }
301 }
302 
GenovecNontwoToMissingUnsafe(uint32_t sample_ct,uintptr_t * genovec)303 void GenovecNontwoToMissingUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
304   // 0 -> 3, 1 -> 3.
305   const uint32_t vec_ct = NypCtToVecCt(sample_ct);
306   assert(VecIsAligned(genovec));
307   const VecW not_m1 = VCONST_W(kMaskAAAA);
308   VecW* vptr = R_CAST(VecW*, genovec);
309   for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
310     const VecW cur_vec = vptr[vidx];
311     const VecW cur_vec_hi = vecw_and_notfirst(cur_vec, not_m1);
312     const VecW cur_or = cur_vec_hi | vecw_srli(cur_vec_hi, 1);
313     vptr[vidx] = cur_vec | cur_or;
314   }
315 }
316 
GenovecNonzeroToMissingThenInvertUnsafe(uint32_t sample_ct,uintptr_t * genovec)317 void GenovecNonzeroToMissingThenInvertUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
318   // 0 -> 2, 1 -> 3, 2 -> 3
319   const uint32_t vec_ct = NypCtToVecCt(sample_ct);
320   assert(VecIsAligned(genovec));
321   const VecW not_m1 = VCONST_W(kMaskAAAA);
322   VecW* vptr = R_CAST(VecW*, genovec);
323   for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
324     const VecW cur_vec = vptr[vidx];
325     vptr[vidx] = cur_vec | vecw_srli(cur_vec, 1) | not_m1;
326   }
327 }
328 
GenovecInvertThenNonzeroToMissingUnsafe(uint32_t sample_ct,uintptr_t * genovec)329 void GenovecInvertThenNonzeroToMissingUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
330   // 0 -> 3, 1 -> 3, 2 -> 0
331   const uint32_t vec_ct = NypCtToVecCt(sample_ct);
332   assert(VecIsAligned(genovec));
333   const VecW m1 = VCONST_W(kMask5555);
334   VecW* vptr = R_CAST(VecW*, genovec);
335   for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
336     const VecW cur_vec = vptr[vidx];
337     const VecW cur_vec_rshifted = vecw_srli(cur_vec, 1);
338     const VecW not2 = vecw_and_notfirst(vecw_and_notfirst(cur_vec, cur_vec_rshifted), m1);
339     vptr[vidx] = not2 | vecw_slli(not2, 1);
340   }
341 }
342 
BiallelicDiploidMinimac3R2(uint64_t alt1_dosage,uint64_t hap_alt1_ssq_x2,uint32_t nm_sample_ct)343 double BiallelicDiploidMinimac3R2(uint64_t alt1_dosage, uint64_t hap_alt1_ssq_x2, uint32_t nm_sample_ct) {
344   if (!nm_sample_ct) {
345     return (0.0 / 0.0);
346   }
347 
348   const uint64_t nm_sample_ct_x32768 = nm_sample_ct * 0x8000LLU;
349   if (nm_sample_ct < 131072) {
350     const uint64_t alt1_dosage_sq = alt1_dosage * alt1_dosage;
351     const uint64_t observed_variance_times_2n = hap_alt1_ssq_x2 * nm_sample_ct - alt1_dosage * alt1_dosage;
352     const uint64_t expected_variance_times_2n = nm_sample_ct_x32768 * alt1_dosage - alt1_dosage_sq;
353     return S_CAST(double, observed_variance_times_2n) / S_CAST(double, expected_variance_times_2n);
354   }
355   // Need to avoid catastrophic cancellation here.
356   const double alt1_dosaged = u63tod(alt1_dosage);
357   const double expected_variance_times_2n = alt1_dosaged * u63tod(nm_sample_ct_x32768 - alt1_dosage);
358   const uint64_t hap_alt1_ssq_x2_hi = hap_alt1_ssq_x2 >> 32;
359   uint64_t left_lo = (hap_alt1_ssq_x2 & 0xffffffffLLU) * nm_sample_ct;
360   const uint64_t left_hi = (left_lo >> 32) + hap_alt1_ssq_x2_hi * nm_sample_ct;
361   left_lo &= 0xffffffffU;
362   const uint64_t alt1_dosage_lo = alt1_dosage & 0xffffffffLLU;
363   const uint64_t alt1_dosage_hi = alt1_dosage >> 32;
364   uint64_t right_lo = alt1_dosage_lo * alt1_dosage_lo;
365   const uint64_t right_hi = (right_lo >> 32) + (alt1_dosage_lo + alt1_dosage) * alt1_dosage_hi;
366   right_lo &= 0xffffffffU;
367   const double observed_variance_times_2n_hi = u63tod(left_hi - right_hi);
368   const int64_t observed_variance_times_2n_lo = S_CAST(int64_t, left_lo) - S_CAST(int64_t, right_lo);
369   const double observed_variance_times_2n = (observed_variance_times_2n_hi * 4294967296.0) + observed_variance_times_2n_lo;
370   return observed_variance_times_2n / expected_variance_times_2n;
371 }
372 
PreinitPgfi(PgenFileInfo * pgfip)373 void PreinitPgfi(PgenFileInfo* pgfip) {
374   pgfip->shared_ff = nullptr;
375   pgfip->block_base = nullptr;
376   // we want this for proper handling of e.g. sites-only VCFs
377   pgfip->nonref_flags = nullptr;
378 }
379 
CountPgfiAllocCachelinesRequired(uint32_t raw_variant_ct)380 uint32_t CountPgfiAllocCachelinesRequired(uint32_t raw_variant_ct) {
381   // assumes variable-width variant records, otherwise pgfi.vrtypes and
382   // pgfi.vr_fpos can just be nullptr.
383 
384   // vrtypes: 1 byte per entry, (raw_variant_ct + 1) entries
385   uint32_t cachelines_required = 1 + (raw_variant_ct / kCacheline);
386 
387   // var_fpos: 8 bytes per entry, (raw_variant_ct + 1) entries
388   cachelines_required += 1 + (raw_variant_ct / kInt64PerCacheline);
389   return cachelines_required;
390 }
391 
CountPgrAllocCachelinesRequired(uint32_t raw_sample_ct,PgenGlobalFlags gflags,uint32_t max_allele_ct,uint32_t fread_buf_byte_ct)392 uint32_t CountPgrAllocCachelinesRequired(uint32_t raw_sample_ct, PgenGlobalFlags gflags, uint32_t max_allele_ct, uint32_t fread_buf_byte_ct) {
393   // ldbase_raw_genovec: always needed, 2 bits per entry, up to raw_sample_ct
394   // entries
395   const uint32_t genovec_cacheline_req = NypCtToCachelineCt(raw_sample_ct);
396   const uint32_t bitvec_cacheline_req = BitCtToCachelineCt(raw_sample_ct);
397   uint32_t cachelines_required = genovec_cacheline_req;
398   // fread_buf.  fread_buf_byte_ct should be zero if mmap() is being used.
399   // DivUp() won't overflow since fread_buf_byte_ct requirement can't exceed
400   // kPglMaxBytesPerVariant, which is sufficiently far from 2^32.
401   cachelines_required += DivUp(fread_buf_byte_ct, kCacheline);
402 
403   const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1;
404   const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor);
405   if ((gflags & kfPgenGlobalDifflistOrLdPresent) || (max_allele_ct > 2)) {
406     // workspace_difflist_sample_ids
407     // bugfix: must add 1 since several routines add a terminator element
408     cachelines_required += 1 + (max_difflist_entry_ct_base / kInt32PerCacheline);
409   }
410   if (gflags & kfPgenGlobalDifflistOrLdPresent) {
411     // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);
412     // workspace_raregeno_vec
413     cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base);
414 
415     // workspace_raregeno_tmp_loadbuf
416     cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base);
417 
418     if (ld_compression_present) {
419       // ldbase_genovec
420       cachelines_required += genovec_cacheline_req;
421 
422       // ldbase_raregeno
423       cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base);
424 
425       // ldbase_difflist_sample_ids
426       cachelines_required += 1 + (max_difflist_entry_ct_base / kInt32PerCacheline);
427     }
428   }
429   const PgenGlobalFlags gflags_hphase_dosage = gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent);
430   if ((max_allele_ct > 2) || gflags_hphase_dosage) {
431     cachelines_required += genovec_cacheline_req;  // workspace_vec
432     if (max_allele_ct > 2) {
433       // workspace_aux1x_present
434       cachelines_required += bitvec_cacheline_req;
435       // workspace_imp_r2
436       cachelines_required += Int64CtToCachelineCt(2 * max_allele_ct);
437     }
438     if (gflags & kfPgenGlobalHardcallPhasePresent) {
439       // workspace_all_hets, workspace_subset
440       cachelines_required += bitvec_cacheline_req * 2;
441     }
442     if (gflags & kfPgenGlobalDosagePresent) {
443       // aux track #3: usually bitarray tracking which samples have dosage info
444       // (may be stored on disk as a dosage list)
445       cachelines_required += bitvec_cacheline_req;
446       if (gflags & kfPgenGlobalDosagePhasePresent) {
447         // aux track #7: bitarray tracking which dosage entries are phased
448         cachelines_required += bitvec_cacheline_req;
449 
450         // phased aux tracks #4,8: 2 bytes per sample
451         // There may be overflow risk here in the future.
452         // (commented out since caller always provides this buffer for now)
453         // cachelines_required += DivUp(2 * k1LU * raw_sample_ct, kCacheline);
454       }
455       // unphased aux track #4: 2 bytes per sample
456       // cachelines_required += DivUp(2 * k1LU * raw_sample_ct, kCacheline);
457 
458       // may need deltalist64 workspace in multiallelic dosage case
459     }
460   }
461   return cachelines_required;
462 }
463 
464 static_assert(kPglMaxAltAlleleCt == 254, "Need to update PgfiInitPhase1().");
PgfiInitPhase1(const char * fname,uint32_t raw_variant_ct,uint32_t raw_sample_ct,uint32_t use_mmap,PgenHeaderCtrl * header_ctrl_ptr,PgenFileInfo * pgfip,uintptr_t * pgfi_alloc_cacheline_ct_ptr,char * errstr_buf)465 PglErr PgfiInitPhase1(const char* fname, uint32_t raw_variant_ct, uint32_t raw_sample_ct, uint32_t use_mmap, PgenHeaderCtrl* header_ctrl_ptr, PgenFileInfo* pgfip, uintptr_t* pgfi_alloc_cacheline_ct_ptr, char* errstr_buf) {
466   pgfip->var_fpos = nullptr;
467   pgfip->vrtypes = nullptr;
468   pgfip->allele_idx_offsets = nullptr;
469   pgfip->nonref_flags = nullptr;
470 
471   // Caller is currently expected to reset max_allele_ct if allele_idx_offsets
472   // is preloaded... need to fix this interface.
473   pgfip->max_allele_ct = 2;
474   // pgfip->max_dosage_allele_ct = 0;
475 
476   pgfip->block_base = nullptr;
477   // this should force overflow when value is uninitialized.
478   pgfip->block_offset = 1LLU << 63;
479 
480   uint64_t fsize;
481   const unsigned char* fread_ptr;
482   FILE* shared_ff = nullptr;
483   unsigned char small_readbuf[3];
484 #ifdef NO_MMAP
485   if (unlikely(use_mmap)) {
486     pgfip->shared_ff = nullptr;  // this must be initialized before block_base
487     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase1() use_mmap parameter is nonzero, but pgenlib was not compiled with mmap support.\n");
488     return kPglRetImproperFunctionCall;
489   }
490 #else
491   if (use_mmap) {
492     pgfip->shared_ff = nullptr;  // this must be initialized before block_base
493     int32_t file_handle = open(fname, O_RDONLY);
494     if (unlikely(file_handle < 0)) {
495       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Failed to open %s : %s.\n", fname, strerror(errno));
496       return kPglRetOpenFail;
497     }
498     struct stat statbuf;
499     if (unlikely(fstat(file_handle, &statbuf) < 0)) {
500       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Failed to open %s : %s.\n", fname, strerror(errno));
501       return kPglRetOpenFail;
502     }
503     fsize = statbuf.st_size;
504     pgfip->block_offset = 0;
505     pgfip->file_size = fsize;
506     pgfip->block_base = S_CAST(const unsigned char*, mmap(0, pgfip->file_size, PROT_READ, MAP_SHARED, file_handle, 0));
507     if (unlikely(R_CAST(uintptr_t, pgfip->block_base) == (~k0LU))) {
508       pgfip->block_base = nullptr;
509       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s read failure: %s.\n", fname, strerror(errno));
510       return kPglRetReadFail;
511     }
512     // this provided less than a ~5% boost on OS X; mmap still took >80% longer
513     // than fread on an 85GB file there
514     // try MAP_POPULATE on Linux?
515     // madvise((unsigned char*)(pgfip->block_base), fsize, MADV_SEQUENTIAL);
516     close(file_handle);
517     // update (7 Jan 2018): drop support for zero-sample and zero-variant
518     // files, not worth the development cost
519     if (unlikely(fsize < 4)) {
520       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s is too small to be a valid .pgen file.\n", fname);
521       return kPglRetMalformedInput;
522     }
523     fread_ptr = pgfip->block_base;
524   }
525 #endif
526   else {
527     shared_ff = fopen(fname, FOPEN_RB);
528     pgfip->shared_ff = shared_ff;
529     if (unlikely(!shared_ff)) {
530       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Failed to open %s : %s.\n", fname, strerror(errno));
531       return kPglRetOpenFail;
532     }
533     if (unlikely(fseeko(shared_ff, 0, SEEK_END))) {
534       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s read failure: %s.\n", fname, strerror(errno));
535       return kPglRetReadFail;
536     }
537     fsize = ftello(shared_ff);
538     if (unlikely(fsize < 4)) {
539       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s is too small to be a valid .pgen file.\n", fname);
540       return kPglRetMalformedInput;
541     }
542     rewind(shared_ff);
543     if (unlikely(!fread_unlocked(small_readbuf, 3, 1, shared_ff))) {
544       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s read failure: %s.\n", fname, strerror(errno));
545       return kPglRetReadFail;
546     }
547     fread_ptr = small_readbuf;
548   }
549   // deliberate underflow
550   if (unlikely(((raw_variant_ct - 1) > 0x7ffffffc) && (raw_variant_ct != UINT32_MAX))) {
551     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid raw_variant_ct function parameter.\n");
552     return kPglRetImproperFunctionCall;
553   }
554   if (unlikely(((raw_sample_ct - 1) > 0x7ffffffd) && (raw_sample_ct != UINT32_MAX))) {
555     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid raw_sample_ct function parameter.\n");
556     return kPglRetImproperFunctionCall;
557   }
558   if (unlikely(!memequal_k(fread_ptr, "l\x1b", 2))) {
559     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s is not a .pgen file (first two bytes don't match the magic number).\n", fname);
560     return kPglRetMalformedInput;
561   }
562   const uint32_t file_type_code = fread_ptr[2];
563   *header_ctrl_ptr = 0;
564   if (file_type_code < 2) {
565     // plink 1 binary
566     if (unlikely(!file_type_code)) {
567       // sample-major.  validate file size here so we don't have to recheck it
568       if ((raw_sample_ct != UINT32_MAX) && (raw_variant_ct != UINT32_MAX)) {
569         const uint64_t fsize_expected = 3 + S_CAST(uint64_t, raw_sample_ct) * NypCtToByteCt(raw_variant_ct);
570         if (fsize != fsize_expected) {
571           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Unexpected PLINK 1 sample-major .bed file size (%" PRIu64 " bytes expected).\n", fsize_expected);
572           return kPglRetMalformedInput;
573         }
574       }
575       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: pgenlib does not directly support sample-major PLINK 1 .bed files.\n(However, PLINK 2 automatically transposes and compresses them for you.)\n");
576       return kPglRetSampleMajorBed;
577     }
578     if (unlikely(raw_sample_ct == UINT32_MAX)) {
579       // either .fam must be loaded first, or user must provide sample count
580       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase1() must be called with an accurate raw_sample_ct value, since %s is a PLINK 1 .bed file.\n", fname);
581       return kPglRetImproperFunctionCall;
582     }
583     const uint32_t const_vrec_width = NypCtToByteCt(raw_sample_ct);
584     if (raw_variant_ct == UINT32_MAX) {
585       // allow raw_variant_ct to be inferred
586       uint64_t quotient = (fsize - 3) / const_vrec_width;
587       if (unlikely((quotient > 0x7fffffffU) || (quotient * const_vrec_width + 3 != fsize))) {
588         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Unexpected PLINK 1 .bed file size (since raw_sample_ct was %u, [file size - 3] should be divisible by %u and the quotient should be smaller than 2^31).\n", raw_sample_ct, const_vrec_width);
589         return kPglRetMalformedInput;
590       }
591       raw_variant_ct = quotient;
592     } else {
593       if (unlikely(S_CAST(uint64_t, raw_variant_ct) * const_vrec_width + 3 != fsize)) {
594         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Unexpected PLINK 1 .bed file size (expected %" PRIu64 " bytes).\n", S_CAST(uint64_t, raw_variant_ct) * const_vrec_width + 3);
595         return kPglRetMalformedInput;
596       }
597     }
598     pgfip->raw_variant_ct = raw_variant_ct;
599     pgfip->raw_sample_ct = raw_sample_ct;
600     pgfip->const_fpos_offset = 3;
601 
602     pgfip->const_vrtype = kPglVrtypePlink1;
603     pgfip->const_vrec_width = const_vrec_width;
604     pgfip->gflags = kfPgenGlobalAllNonref;
605     *pgfi_alloc_cacheline_ct_ptr = 0;
606     return kPglRetSuccess;
607   }
608 
609   if (unlikely(fsize < 12)) {
610     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s is too small to be a valid .pgen file.\n", fname);
611     return kPglRetMalformedInput;
612   }
613 #ifndef NO_MMAP
614   if (use_mmap) {
615     memcpy(&(pgfip->raw_variant_ct), &(fread_ptr[3]), sizeof(int32_t));
616     memcpy(&(pgfip->raw_sample_ct), &(fread_ptr[7]), sizeof(int32_t));
617     memcpy(header_ctrl_ptr, &(fread_ptr[11]), 1);
618   } else {
619 #endif
620     if (unlikely(
621             (!fread_unlocked(&(pgfip->raw_variant_ct), sizeof(int32_t), 1, shared_ff)) ||
622             (!fread_unlocked(&(pgfip->raw_sample_ct), sizeof(int32_t), 1, shared_ff)) ||
623             (!fread_unlocked(header_ctrl_ptr, 1, 1, shared_ff)))) {
624       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s read failure: %s.\n", fname, strerror(errno));
625       return kPglRetReadFail;
626     }
627 #ifndef NO_MMAP
628   }
629 #endif
630   PgenHeaderCtrl header_ctrl = *header_ctrl_ptr;
631   if (raw_variant_ct == UINT32_MAX) {
632     raw_variant_ct = pgfip->raw_variant_ct;
633     // deliberate underflow
634     if (unlikely((raw_variant_ct - 1) > 0x7ffffffc)) {
635       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid variant count in .pgen file.\n");
636       return kPglRetMalformedInput;
637     }
638   } else if (unlikely(raw_variant_ct != pgfip->raw_variant_ct)) {
639     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase1() was called with raw_variant_ct == %u, but %s contains %u variant%s.\n", raw_variant_ct, fname, pgfip->raw_variant_ct, (pgfip->raw_variant_ct == 1)? "" : "s");
640     return kPglRetInconsistentInput;
641   }
642   if (raw_sample_ct == UINT32_MAX) {
643     raw_sample_ct = pgfip->raw_sample_ct;
644     // deliberate underflow
645     if (unlikely((raw_sample_ct - 1) > 0x7ffffffd)) {
646       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid sample count in .pgen file.\n");
647       return kPglRetMalformedInput;
648     }
649   } else if (unlikely(raw_sample_ct != pgfip->raw_sample_ct)) {
650     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase1() was called with raw_sample_ct == %u, but %s contains %u sample%s.\n", raw_sample_ct, fname, pgfip->raw_sample_ct, (pgfip->raw_sample_ct == 1)? "" : "s");
651     return kPglRetInconsistentInput;
652   }
653   pgfip->gflags = kfPgenGlobal0;
654   pgfip->const_fpos_offset = 12;
655 
656   // explicit storage of "is this reference allele untrusted?"
657   // need caller to allocate this
658   uint32_t nonref_flags_storage = header_ctrl >> 6;
659   if (nonref_flags_storage == 3) {
660     pgfip->const_fpos_offset += DivUp(raw_variant_ct, CHAR_BIT);
661   } else if (nonref_flags_storage == 2) {
662     pgfip->gflags |= kfPgenGlobalAllNonref;
663   }
664 
665   if (file_type_code < 16) {
666     // plink 2 binary, single constant-width vrtype
667     if (unlikely(file_type_code > 4)) {
668       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Third byte of %s does not correspond to a storage mode supported by this version of pgenlib.\n", fname);
669       return kPglRetNotYetSupported;
670     }
671     if (unlikely(header_ctrl & 63)) {
672       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Third byte of %s corresponds to a fixed-width storage mode, but twelfth byte is only consistent with a variable-width mode.\n", fname);
673       return kPglRetMalformedInput;
674     }
675     uint32_t vrtype = 0;
676     uintptr_t const_vrec_width = NypCtToByteCt(raw_sample_ct);
677     if (file_type_code == 3) {
678       vrtype = 0x40;
679       const_vrec_width += raw_sample_ct * 2;
680       pgfip->gflags |= kfPgenGlobalDosagePresent;
681     } else if (file_type_code == 4) {
682       vrtype = 0xc0;
683       const_vrec_width += raw_sample_ct * 4;
684       pgfip->gflags |= kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent;
685     }
686     if (unlikely(S_CAST(uint64_t, raw_variant_ct) * const_vrec_width + pgfip->const_fpos_offset != fsize)) {
687       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Unexpected .pgen file size (expected %" PRIu64 " bytes).\n", S_CAST(uint64_t, raw_variant_ct) * const_vrec_width + pgfip->const_fpos_offset);
688       return kPglRetMalformedInput;
689     }
690     pgfip->const_vrtype = vrtype;
691     pgfip->const_vrec_width = const_vrec_width;
692     *pgfi_alloc_cacheline_ct_ptr = 0;
693     return kPglRetSuccess;
694   }
695   if (unlikely(file_type_code >= 0x11)) {
696     // todo: 0x11 phase sets (maybe not before 2021, though)
697     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Third byte of %s does not correspond to a storage mode supported by this version of pgenlib.\n", fname);
698     return kPglRetNotYetSupported;
699   }
700   // plink 2 binary, general-purpose
701   pgfip->const_vrtype = UINT32_MAX;
702   pgfip->const_vrec_width = 0;
703   const uintptr_t alt_allele_ct_byte_ct = (header_ctrl >> 4) & 3;
704   if (unlikely(alt_allele_ct_byte_ct > 1)) {
705     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: This version of pgenlib does not support >254 alternate alleles for a single variant.\n");
706     return kPglRetNotYetSupported;
707   }
708 
709   // 8 extra bytes per vblock, to support fast random access
710   const uintptr_t vblock_ct = DivUp(raw_variant_ct, kPglVblockSize);
711 
712   uint64_t vrtype_and_vrec_len_bit_cost;
713   if (header_ctrl & 8) {
714     // Special header_ctrl modes:
715     //   8: 1 bit per fused vrtype-length.  Unset = vrtype 5, set = vrtype 0.
716     //   9: 2 bits, multiallelic.  0 = vrtype 5, 1 = vrtype 0, 2-3 = vrtype
717     //      8 with that many more bytes than vrtype 0.  Note that this is
718     //      limited to 16 ALT alleles.
719     //   10: 2 bits, phased.  0 = vrtype 5, 1 = vrtype 0, 2-3 = vrtype 16
720     //       with that many minus 1 bytes beyond vrtype 0.  While this is also
721     //       aimed at the single-sample use case, it technically supports up to
722     //       15 always-phased or 7 partially-phased samples.
723     //   11: 4 bits, multiallelic + phased.  0 = vrtype 5, 1 = vrtype 0,
724     //       2-7 = vrtype 8 with that many bytes beyond vrtype 0, 9 = vrtype 16
725     //       phase info requiring just 1 byte, 10-15 = vrtype 24 with (x-7)
726     //       extra bytes required between multiallelic and phased tracks.
727     //   12: 2 bits, dosage, must be single-sample.  0 = vrtype 5,
728     //       1 = vrtype 0, 2 = vrtype 0x45 with 2 bytes, 3 = vrtype 0x40 with 3
729     //       total bytes.
730     //   13: reserved for single-sample multiallelic + dosage.
731     //   14: 4 bits, phased + dosage, must be single-sample.  0 and 1 as usual,
732     //       3 = vrtype 16 with 1 phaseinfo byte, 4 = vrtype 0x45 with 2 bytes,
733     //       5 = vrtype 0x40 with 3 total bytes, 12 = vrtype 0xc5 with 4 total
734     //       bytes, 13 = vrtype 0xc0 with 5 total bytes, 15 = vrtype 0xe0 with
735     //       6 total bytes
736     //   15: reserved for single-sample multiallelic + phased dosage.
737     const uint32_t header_ctrl_low3 = header_ctrl & 7;
738     // this can be a table lookup once 13/15 are implemented
739     if (!header_ctrl_low3) {
740       vrtype_and_vrec_len_bit_cost = 1;
741     } else if ((header_ctrl_low3 == 3) || (header_ctrl_low3 == 6)) {
742       vrtype_and_vrec_len_bit_cost = 4;
743     } else if (likely(header_ctrl_low3 <= 4)) {
744       vrtype_and_vrec_len_bit_cost = 2;
745     } else {
746       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Twelfth byte of %s does not correspond to a format supported by this version of pgenlib.\n", fname);
747       return kPglRetNotYetSupported;
748     }
749   } else {
750     // set this to *4* if true, 0 if false
751     const uint32_t phase_or_dosage_present_x4 = header_ctrl & 4;
752     // vrtype entries = 4 bits if no phase/dosage, 8 otherwise
753     // var_fpos entries = 8 + (8 * (header_ctrl & 3)) bits
754     vrtype_and_vrec_len_bit_cost = 12 + phase_or_dosage_present_x4 + 8 * (header_ctrl & 3);
755   }
756   pgfip->const_fpos_offset += (raw_sample_ct * vrtype_and_vrec_len_bit_cost + 7) / 8 + (raw_sample_ct * alt_allele_ct_byte_ct) + (8 * vblock_ct);
757   *pgfi_alloc_cacheline_ct_ptr = CountPgfiAllocCachelinesRequired(raw_variant_ct);
758   return kPglRetSuccess;
759 }
760 
// Compile-time tripwire: the build fails if kPglMaxAltAlleleCt is ever changed
// from 254, as a reminder to revisit PgfiInitPhase2().  That function
// currently asserts alt_allele_ct_byte_ct == 1 whenever alt allele counts are
// stored in the header, which presumably is what ties it to this limit.
static_assert(kPglMaxAltAlleleCt == 254, "Need to update PgfiInitPhase2().");
PgfiInitPhase2(PgenHeaderCtrl header_ctrl,uint32_t allele_cts_already_loaded,uint32_t nonref_flags_already_loaded,uint32_t use_blockload,uint32_t vblock_idx_start,uint32_t vidx_end,uint32_t * max_vrec_width_ptr,PgenFileInfo * pgfip,unsigned char * pgfi_alloc,uintptr_t * pgr_alloc_cacheline_ct_ptr,char * errstr_buf)762 PglErr PgfiInitPhase2(PgenHeaderCtrl header_ctrl, uint32_t allele_cts_already_loaded, uint32_t nonref_flags_already_loaded, uint32_t use_blockload, uint32_t vblock_idx_start, uint32_t vidx_end, uint32_t* max_vrec_width_ptr, PgenFileInfo* pgfip, unsigned char* pgfi_alloc, uintptr_t* pgr_alloc_cacheline_ct_ptr, char* errstr_buf) {
763   // *max_vrec_width_ptr technically only needs to be set in single-variant
764   // fread() mode, but its computation is not currently optimized out in the
765   // other two modes.
766 
767   // possible todo: add option to skip validation when allele_cts/nonref_flags
768   // are already loaded.  but let's play it safe for now.
769   const uint32_t raw_variant_ct = pgfip->raw_variant_ct;
770   const uint32_t const_vrec_width = pgfip->const_vrec_width;
771   *pgr_alloc_cacheline_ct_ptr = 0;
772 
773   // Note that this is a rather hefty stack allocation.
774   unsigned char loadbuf[kPglVblockSize * 4];
775 
776   uintptr_t* allele_idx_offsets_iter = pgfip->allele_idx_offsets;
777   uintptr_t prev_allele_idx_offset = 0;
778   if (allele_idx_offsets_iter) {
779     if (!allele_cts_already_loaded) {
780       *allele_idx_offsets_iter = 0;
781     } else {
782       prev_allele_idx_offset = *allele_idx_offsets_iter;
783     }
784     ++allele_idx_offsets_iter;
785   }
786   if (!raw_variant_ct) {
787     return kPglRetSuccess;
788   }
789   const uint32_t nonref_flags_stored = ((header_ctrl >> 6) == 3);
790   unsigned char* nonref_flags_iter = R_CAST(unsigned char*, pgfip->nonref_flags);
791   const unsigned char* fread_ptr = nullptr;  // maybe-uninitialized warning
792   FILE* shared_ff = pgfip->shared_ff;
793   if (const_vrec_width) {
794     // no allele counts to verify if fixed-width
795     // always need ldbase_raw_genovec
796     *pgr_alloc_cacheline_ct_ptr = NypCtToCachelineCt(pgfip->raw_sample_ct);
797     *max_vrec_width_ptr = const_vrec_width;
798 #ifdef NO_MMAP
799     assert(shared_ff);
800 #else
801     if (!shared_ff) {
802       if (unlikely(use_blockload)) {
803         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase2() cannot be called with use_blockload set when PgfiInitPhase1() had use_mmap set.\n");
804         return kPglRetImproperFunctionCall;
805       }
806       if ((!(header_ctrl & 192)) || (pgfip->const_vrtype == kPglVrtypePlink1)) {
807         return kPglRetSuccess;
808       }
809       fread_ptr = &(pgfip->block_base[12]);
810       const uint32_t nonref_flags_byte_ct = DivUp(raw_variant_ct, CHAR_BIT);
811       if (!nonref_flags_already_loaded) {
812         if (nonref_flags_stored) {
813           memcpy(nonref_flags_iter, fread_ptr, nonref_flags_byte_ct);
814         }
815         return kPglRetSuccess;
816       }
817       if (nonref_flags_stored) {
818         if (unlikely(!memequal(nonref_flags_iter, fread_ptr, nonref_flags_byte_ct))) {
819           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
820           return kPglRetInconsistentInput;
821         }
822         return kPglRetSuccess;
823       }
824       if (header_ctrl & 64) {
825         // all ref
826         if (unlikely(!AllWordsAreZero(pgfip->nonref_flags, BitCtToWordCt(raw_variant_ct)))) {
827           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
828           return kPglRetInconsistentInput;
829         }
830         return kPglRetSuccess;
831       }
832       // all nonref
833       if (unlikely(!AllBitsAreOne(pgfip->nonref_flags, raw_variant_ct))) {
834         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
835         return kPglRetInconsistentInput;
836       }
837       return kPglRetSuccess;
838     }
839 #endif
840     if (!use_blockload) {
841       // using fread() single-variant-at-a-time, need pgr.fread_buf
842       *pgr_alloc_cacheline_ct_ptr += DivUp(const_vrec_width, kCacheline);
843     }
844     if ((!(header_ctrl & 192)) || (pgfip->const_vrtype == kPglVrtypePlink1)) {
845       return kPglRetSuccess;
846     }
847     if ((header_ctrl >> 6) == 1) {
848       // all ref
849       if (nonref_flags_already_loaded) {
850         if (unlikely(!AllWordsAreZero(pgfip->nonref_flags, BitCtToWordCt(raw_variant_ct)))) {
851           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
852           return kPglRetInconsistentInput;
853         }
854       }
855       return kPglRetSuccess;
856     }
857     if ((header_ctrl >> 6) == 2) {
858       // all nonref
859       if (nonref_flags_already_loaded) {
860         if (unlikely(!AllBitsAreOne(pgfip->nonref_flags, raw_variant_ct))) {
861           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
862           return kPglRetInconsistentInput;
863         }
864       }
865       return kPglRetSuccess;
866     }
867     // _last more useful than _end iff we just refer to the number of elements
868     // in the block and have no use for a _stop pointer
869     unsigned char* nonref_flags_last = &(nonref_flags_iter[((raw_variant_ct - 1) / (kPglVblockSize * 32)) * (kPglVblockSize * 4)]);
870     uint32_t cur_byte_ct = kPglVblockSize * 4;
871     for (; ; nonref_flags_iter = &(nonref_flags_iter[cur_byte_ct])) {
872       if (nonref_flags_iter >= nonref_flags_last) {
873         if (nonref_flags_iter > nonref_flags_last) {
874           return kPglRetSuccess;
875         }
876         cur_byte_ct = 1 + ((raw_variant_ct - 1) % (kPglVblockSize * 32)) / CHAR_BIT;
877       }
878       unsigned char* loadptr = nonref_flags_already_loaded? loadbuf : nonref_flags_iter;
879       if (unlikely(!fread_unlocked(loadptr, cur_byte_ct, 1, shared_ff))) {
880         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
881         return kPglRetReadFail;
882       }
883       if (nonref_flags_already_loaded) {
884         if (unlikely(!memequal(nonref_flags_iter, loadbuf, cur_byte_ct))) {
885           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
886           return kPglRetInconsistentInput;
887         }
888       }
889     }
890   }
891 
892   const uint32_t raw_sample_ct = pgfip->raw_sample_ct;
893   unsigned char* vrtypes_iter = pgfi_alloc;
894   pgfip->vrtypes = vrtypes_iter;
895   uint64_t* var_fpos_iter = R_CAST(uint64_t*, &(vrtypes_iter[RoundUpPow2(raw_variant_ct + 1, kCacheline)]));
896   pgfip->var_fpos = var_fpos_iter;
897   uint32_t vblock_ct_m1 = (raw_variant_ct - 1) / kPglVblockSize;
898   uint32_t max_vrec_width = 0;
899   uint64_t cur_fpos;
900 #ifdef NO_MMAP
901   assert(shared_ff);
902 #else
903   if (!shared_ff) {
904     if (unlikely(use_blockload)) {
905       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase2() cannot be called with use_blockload set when PgfiInitPhase1() had use_mmap set.\n");
906       return kPglRetImproperFunctionCall;
907     }
908     fread_ptr = &(pgfip->block_base[12 + 8 * vblock_idx_start]);
909     memcpy(&cur_fpos, fread_ptr, sizeof(int64_t));
910     fread_ptr = &(fread_ptr[(vblock_ct_m1 + 1 - vblock_idx_start) * sizeof(int64_t)]);
911   } else {
912 #endif
913     if (vblock_idx_start) {
914       if (unlikely(fseeko(shared_ff, vblock_idx_start * sizeof(int64_t), SEEK_CUR))) {
915         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
916         return kPglRetReadFail;
917       }
918     }
919     if (unlikely(!fread_unlocked(&cur_fpos, sizeof(int64_t), 1, shared_ff))) {
920       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
921       return kPglRetReadFail;
922     }
923     // May also need to load the rest of these values in the future, if we want
924     // to support dynamic insertion into a memory-mapped file.  But skip them
925     // for now.
926     if (unlikely(fseeko(shared_ff, (vblock_ct_m1 - vblock_idx_start) * sizeof(int64_t), SEEK_CUR))) {
927       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
928       return kPglRetReadFail;
929     }
930 #ifndef NO_MMAP
931   }
932 #endif
933   const uint32_t vrtype_and_fpos_storage = header_ctrl & 15;
934   const uint32_t alt_allele_ct_byte_ct = (header_ctrl >> 4) & 3;
935   if (alt_allele_ct_byte_ct) {
936     assert(alt_allele_ct_byte_ct == 1);
937     if (unlikely(!allele_idx_offsets_iter)) {
938       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: pgfip->allele_idx_offsets must be allocated before PgfiInitPhase2() is called.\n");
939       return kPglRetImproperFunctionCall;
940     }
941   }
942   uint32_t vblock_idx = vblock_idx_start;
943   vblock_ct_m1 = (vidx_end - 1) / kPglVblockSize;
944   if (vblock_idx) {
945     uintptr_t header_vblock_byte_ct = kPglVblockSize * alt_allele_ct_byte_ct;
946     if (nonref_flags_stored) {
947       header_vblock_byte_ct += kPglVblockSize / CHAR_BIT;
948     }
949     if (vrtype_and_fpos_storage & 8) {
950       header_vblock_byte_ct += kPglVblockSize >> (10 - vrtype_and_fpos_storage);
951     } else {
952       if (!(vrtype_and_fpos_storage & 4)) {
953         header_vblock_byte_ct += kPglVblockSize / 2;
954       } else {
955         header_vblock_byte_ct += kPglVblockSize;
956       }
957       header_vblock_byte_ct += kPglVblockSize * (1 + (vrtype_and_fpos_storage & 3));
958     }
959 #ifndef NO_MMAP
960     if (!shared_ff) {
961       fread_ptr = &(fread_ptr[header_vblock_byte_ct * S_CAST(uint64_t, vblock_idx)]);
962     } else {
963 #endif
964       if (unlikely(fseeko(shared_ff, header_vblock_byte_ct * S_CAST(uint64_t, vblock_idx), SEEK_CUR))) {
965         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
966         return kPglRetReadFail;
967       }
968 #ifndef NO_MMAP
969     }
970 #endif
971   }
972   uint32_t cur_vblock_variant_ct = kPglVblockSize;
973   uint32_t max_allele_ct = pgfip->max_allele_ct;
974   for (; ; ++vblock_idx) {
975     if (vblock_idx >= vblock_ct_m1) {
976       if (vblock_idx > vblock_ct_m1) {
977         // finish up
978 #ifndef NO_MMAP
979         // now > instead of != to allow additional information to be stored
980         // between header and first variant record
981         if (!shared_ff) {
982           if (unlikely(S_CAST(uintptr_t, fread_ptr - pgfip->block_base) > pgfip->var_fpos[0])) {
983             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid .pgen header.\n");
984             return kPglRetMalformedInput;
985           }
986         } else {
987 #endif
988           if (unlikely(S_CAST(uint64_t, ftello(shared_ff)) > pgfip->var_fpos[0])) {
989             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid .pgen header.\n");
990             return kPglRetMalformedInput;
991           }
992 #ifndef NO_MMAP
993         }
994 #endif
995         pgfip->var_fpos[vidx_end] = cur_fpos;
996         pgfip->max_allele_ct = max_allele_ct;
997         // if difflist/LD might be present, scan for them in a way that's
998         // likely to terminate quickly
999         PgenGlobalFlags new_gflags = kfPgenGlobal0;
1000         if (vrtype_and_fpos_storage != 8) {
1001           const uint32_t trailing_byte_ct = vidx_end & (kBytesPerVec - 1);
1002           if (trailing_byte_ct) {
1003             memset(&(pgfip->vrtypes[vidx_end]), 0, kBytesPerVec - trailing_byte_ct);
1004           }
1005           const VecW* vrtypes_alias_start = R_CAST(VecW*, pgfip->vrtypes);
1006           const VecW* vrtypes_alias_end = &(vrtypes_alias_start[DivUp(vidx_end, kBytesPerVec)]);
1007           if (vblock_idx_start) {
1008             vrtypes_alias_start = &(vrtypes_alias_start[vblock_idx_start * (kPglVblockSize / kBytesPerVec)]);
1009           }
1010           const VecW* vrtypes_alias_iter = vrtypes_alias_start;
1011           if (vrtype_and_fpos_storage < 8) {
1012             for (; vrtypes_alias_iter != vrtypes_alias_end; ++vrtypes_alias_iter) {
1013               const VecW cur_vvec = *vrtypes_alias_iter;
1014 #ifdef __LP64__
1015               const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5);
1016               const VecW cur_vvec_bit1 = vecw_slli(cur_vvec, 6);
1017               // check if any vrtype has bit 1 set and bit 2 clear
1018               if (vecw_movemask(vecw_and_notfirst(cur_vvec_bit2, cur_vvec_bit1))) {
1019                 new_gflags |= kfPgenGlobalLdCompressionPresent | kfPgenGlobalDifflistOrLdPresent;
1020                 break;
1021               }
1022               const VecW cur_vvec_bit0 = vecw_slli(cur_vvec, 7);
1023               if (vecw_movemask(cur_vvec_bit0 | cur_vvec_bit2)) {
1024                 // this catches onebit
1025                 new_gflags |= kfPgenGlobalDifflistOrLdPresent;
1026               }
1027 #else
1028               const uintptr_t cur_vvec_shifted = cur_vvec >> 1;
1029               // check if any vrtype has bit 1 set and bit 2 clear
1030               if (vecw_and_notfirst(cur_vvec_shifted, cur_vvec) & (2 * kMask0101)) {
1031                 new_gflags |= kfPgenGlobalLdCompressionPresent | kfPgenGlobalDifflistOrLdPresent;
1032                 break;
1033               }
1034               if (cur_vvec & (5 * kMask0101)) {
1035                 // this catches onebit
1036                 new_gflags |= kfPgenGlobalDifflistOrLdPresent;
1037               }
1038 #endif
1039             }
1040           }
1041           if (vrtype_and_fpos_storage >= 4) {
1042             // Likely for one of {hphase, dosage} to be present without the
1043             // other; make this scan faster in that case, at the cost of
1044             // failing to early-exit when both are present.
1045             // This is also suboptimal for the vrtype_and_fpos_storage > 8
1046             // special encodings.
1047             VecW or_vvec = vecw_setzero();
1048             for (vrtypes_alias_iter = vrtypes_alias_start; vrtypes_alias_iter != vrtypes_alias_end; ++vrtypes_alias_iter) {
1049               or_vvec |= *vrtypes_alias_iter;
1050             }
1051 #ifdef __LP64__
1052             const VecW or_vvec_bit3 = vecw_slli(or_vvec, 4);
1053             if (vecw_movemask(or_vvec_bit3)) {
1054               // note that, if no phase or dosage data is present, we don't
1055               // look for multiallelic hardcalls.
1056               new_gflags |= kfPgenGlobalMultiallelicHardcallFound;
1057             }
1058             const VecW or_vvec_bit4 = vecw_slli(or_vvec, 3);
1059             if (vecw_movemask(or_vvec_bit4)) {
1060               new_gflags |= kfPgenGlobalHardcallPhasePresent;
1061             }
1062             const VecW or_vvec_bit5 = vecw_slli(or_vvec, 2);
1063             const VecW or_vvec_bit6 = vecw_slli(or_vvec, 1);
1064             if (vecw_movemask(or_vvec_bit5 | or_vvec_bit6)) {
1065               new_gflags |= kfPgenGlobalDosagePresent;
1066               if (vecw_movemask(or_vvec)) {
1067                 new_gflags |= kfPgenGlobalDosagePhasePresent;
1068               }
1069             }
1070 #else
1071             if (or_vvec & (8 * kMask0101)) {
1072               new_gflags |= kfPgenGlobalMultiallelicHardcallFound;
1073             }
1074             if (or_vvec & (0x10 * kMask0101)) {
1075               new_gflags |= kfPgenGlobalHardcallPhasePresent;
1076             }
1077             if (or_vvec & (0x60 * kMask0101)) {
1078               new_gflags |= kfPgenGlobalDosagePresent;
1079               if (or_vvec & (0x80 * kMask0101)) {
1080                 new_gflags |= kfPgenGlobalDosagePhasePresent;
1081               }
1082             }
1083 #endif
1084           }
1085           if (vrtype_and_fpos_storage > 8) {
1086             if (vrtype_and_fpos_storage == 12) {
1087               max_vrec_width = 3;
1088             } else if (vrtype_and_fpos_storage == 14) {
1089               max_vrec_width = 6;
1090             } else {
1091               max_vrec_width = NypCtToByteCt(raw_sample_ct);
1092               if (vrtype_and_fpos_storage == 9) {
1093                 max_vrec_width += 3;
1094               } else if (vrtype_and_fpos_storage == 10) {
1095                 max_vrec_width += 2;
1096               } else {
1097                 // 11
1098                 max_vrec_width += 8;
1099               }
1100               // 13 and 15 not specified yet
1101             }
1102           } else if (!(vrtype_and_fpos_storage & 3)) {
1103             // 1 byte per vrec_len entry, don't bother to determine true
1104             // maximum
1105             max_vrec_width = 255;
1106           }
1107           pgfip->gflags |= new_gflags;
1108         } else {
1109           // vrtype_and_fpos_storage == 8.
1110           max_vrec_width = NypCtToByteCt(raw_sample_ct);
1111         }
1112         *pgr_alloc_cacheline_ct_ptr = CountPgrAllocCachelinesRequired(raw_sample_ct, new_gflags, max_allele_ct, (shared_ff && (!use_blockload))? max_vrec_width : 0);
1113         *max_vrec_width_ptr = max_vrec_width;
1114         return kPglRetSuccess;
1115       }
1116       cur_vblock_variant_ct = ModNz(vidx_end, kPglVblockSize);
1117     }
1118     // 1. handle vrtypes and var_fpos.
1119     if (vrtype_and_fpos_storage >= 8) {
1120       // Special encodings.
1121       uint32_t log2_entry_bit_width = 1;
1122       unsigned char vrtype_table[16];
1123       uint32_t vrec_len_table[16];
1124       vrtype_table[0] = 5;
1125       vrtype_table[1] = 0;
1126       vrec_len_table[0] = 0;
1127       const uint32_t raw_sample_ct4 = NypCtToByteCt(raw_sample_ct);
1128       vrec_len_table[1] = raw_sample_ct4;
1129       if (vrtype_and_fpos_storage == 8) {
1130         log2_entry_bit_width = 0;
1131       } else if (vrtype_and_fpos_storage == 9) {
1132         vrtype_table[2] = 8;
1133         vrtype_table[3] = 8;
1134         vrec_len_table[2] = raw_sample_ct4 + 2;
1135         vrec_len_table[3] = raw_sample_ct4 + 3;
1136       } else if (vrtype_and_fpos_storage == 10) {
1137         vrtype_table[2] = 16;
1138         vrtype_table[3] = 16;
1139         vrec_len_table[2] = raw_sample_ct4 + 1;
1140         vrec_len_table[3] = raw_sample_ct4 + 2;
1141       } else if (vrtype_and_fpos_storage == 11) {
1142         log2_entry_bit_width = 2;
1143         vrtype_table[2] = 8;
1144         vrtype_table[3] = 8;
1145         vrtype_table[4] = 8;
1146         vrtype_table[5] = 8;
1147         vrtype_table[6] = 8;
1148         vrtype_table[7] = 8;
1149         // 8 invalid
1150         vrtype_table[9] = 16;
1151         vrtype_table[10] = 24;
1152         vrtype_table[11] = 24;
1153         vrtype_table[12] = 24;
1154         vrtype_table[13] = 24;
1155         vrtype_table[14] = 24;
1156         vrtype_table[15] = 24;
1157         vrec_len_table[9] = raw_sample_ct4 + 1;
1158         for (uint32_t uii = 2; uii < 8; ++uii) {
1159           vrec_len_table[uii] = raw_sample_ct4 + uii;
1160           vrec_len_table[uii + 8] = raw_sample_ct4 + 1 + uii;
1161         }
1162       } else if (vrtype_and_fpos_storage == 12) {
1163         assert(raw_sample_ct == 1);
1164         vrtype_table[2] = 0x45;
1165         vrtype_table[3] = 0x40;
1166         vrec_len_table[2] = 2;
1167         vrec_len_table[3] = 3;
1168       } else {
1169         // 14 is only remaining possibility for now
1170         assert(raw_sample_ct == 1);
1171         log2_entry_bit_width = 2;
1172         vrtype_table[3] = 0x10;
1173         vrtype_table[4] = 0x45;
1174         vrtype_table[5] = 0x40;
1175         vrtype_table[12] = 0xc5;
1176         vrtype_table[13] = 0xc0;
1177         vrtype_table[15] = 0xe0;
1178         vrec_len_table[3] = 2;
1179         vrec_len_table[4] = 2;
1180         vrec_len_table[5] = 3;
1181         vrec_len_table[12] = 4;
1182         vrec_len_table[13] = 5;
1183         vrec_len_table[15] = 6;
1184       }
1185       const uint32_t entry_bit_width = 1 << log2_entry_bit_width;
1186       const uint32_t entry_mask = (1 << entry_bit_width) - 1;
1187       const uint32_t cur_byte_ct = 1 + ((cur_vblock_variant_ct - 1) >> (3 - log2_entry_bit_width));
1188       const uintptr_t* loadbuf_iter;
1189 #ifdef __arm__
1190 #  error "Unaligned accesses in PgfiInitPhase2()."
1191 #endif
1192 #ifndef NO_MMAP
1193       if (!shared_ff) {
1194         loadbuf_iter = R_CAST(const uintptr_t*, fread_ptr);
1195         fread_ptr = &(fread_ptr[cur_byte_ct]);
1196       } else {
1197 #endif
1198         if (unlikely(!fread_unlocked(loadbuf, cur_byte_ct, 1, shared_ff))) {
1199           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1200           return kPglRetReadFail;
1201         }
1202         loadbuf_iter = R_CAST(const uintptr_t*, loadbuf);
1203 #ifndef NO_MMAP
1204       }
1205 #endif
1206       const uint32_t log2_entries_per_word = kBitsPerWordLog2 - log2_entry_bit_width;
1207       const uint32_t block_len = 1 << log2_entries_per_word;
1208       uint32_t cur_vblock_idx = 0;
1209       uint32_t cur_vblock_idx_stop = block_len;
1210       for (; ; cur_vblock_idx_stop += block_len) {
1211         if (cur_vblock_idx_stop > cur_vblock_variant_ct) {
1212           if (cur_vblock_idx == cur_vblock_variant_ct) {
1213             break;
1214           }
1215           cur_vblock_idx_stop = cur_vblock_variant_ct;
1216         }
1217         uintptr_t input_word = *loadbuf_iter++;
1218         for (; cur_vblock_idx != cur_vblock_idx_stop; ++cur_vblock_idx) {
1219           const uint32_t input_word_masked = input_word & entry_mask;
1220           *vrtypes_iter++ = vrtype_table[input_word_masked];
1221           *var_fpos_iter++ = cur_fpos;
1222           cur_fpos += vrec_len_table[input_word_masked];
1223           input_word >>= entry_bit_width;
1224         }
1225       }
1226     } else {
1227       if (vrtype_and_fpos_storage < 4) {
1228         // no phase or dosage present, 4-bit vrtypes
1229         const uint32_t cur_byte_ct = DivUp(cur_vblock_variant_ct, 2);
1230 #ifndef NO_MMAP
1231         if (shared_ff) {
1232 #endif
1233           if (unlikely(!fread_unlocked(loadbuf, cur_byte_ct, 1, shared_ff))) {
1234             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1235             return kPglRetReadFail;
1236           }
1237           fread_ptr = loadbuf;
1238 #ifndef NO_MMAP
1239         }
1240 #endif
1241         const uint32_t word_write_ct = DivUp(cur_vblock_variant_ct, kBytesPerWord);
1242         uintptr_t* vrtypes_alias_fullword = R_CAST(uintptr_t*, vrtypes_iter);
1243         const Halfword* loadbuf_alias_halfword = R_CAST(const Halfword*, fread_ptr);
1244         for (uint32_t widx = 0; widx != word_write_ct; ++widx) {
1245           uintptr_t ww = loadbuf_alias_halfword[widx];
1246 #ifdef USE_AVX2
1247           // speed advantage is small on my Mac since compiler auto-vectorizes
1248           // the code below?
1249           vrtypes_alias_fullword[widx] = _pdep_u64(ww, kMask0F0F);
1250 #else
1251 #  ifdef __LP64__
1252           ww = (ww | (ww << 16)) & kMask0000FFFF;
1253 #  endif
1254           ww = (ww | (ww << 8)) & kMask00FF;
1255           vrtypes_alias_fullword[widx] = (ww | (ww << 4)) & kMask0F0F;
1256 #endif  // !USE_AVX2
1257         }
1258         const uint32_t last_word_byte_ct = cur_vblock_variant_ct % kBytesPerWord;
1259         vrtypes_iter = &(vrtypes_iter[cur_vblock_variant_ct]);
1260         if (last_word_byte_ct) {
1261           ProperSubwordStore(0, kBytesPerWord - last_word_byte_ct, vrtypes_iter);
1262         } else {
1263           // must guarantee a trailing zero for is_ldbase check to work
1264           vrtypes_iter[0] = 0;
1265         }
1266 #ifndef NO_MMAP
1267         if (!shared_ff) {
1268           fread_ptr = &(fread_ptr[cur_byte_ct]);
1269         }
1270 #endif
1271       } else {
1272         // phase and dosage
1273 #ifndef NO_MMAP
1274         if (shared_ff) {
1275 #endif
1276           if (unlikely(!fread_unlocked(vrtypes_iter, cur_vblock_variant_ct, 1, shared_ff))) {
1277             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1278             return kPglRetReadFail;
1279           }
1280 #ifndef NO_MMAP
1281         } else {
1282           memcpy(vrtypes_iter, fread_ptr, cur_vblock_variant_ct);
1283         }
1284 #endif
1285         const uint32_t last_word_byte_ct = cur_vblock_variant_ct % kBytesPerWord;
1286         vrtypes_iter = &(vrtypes_iter[cur_vblock_variant_ct]);
1287         if (last_word_byte_ct) {
1288           ProperSubwordStore(0, kBytesPerWord - last_word_byte_ct, vrtypes_iter);
1289         } else {
1290           // must guarantee a trailing zero for is_ldbase check to work
1291           vrtypes_iter[0] = 0;
1292         }
1293 #ifndef NO_MMAP
1294         if (!shared_ff) {
1295           fread_ptr = &(fread_ptr[cur_vblock_variant_ct]);
1296         }
1297 #endif
1298       }
1299       const uint32_t bytes_per_entry = 1 + (vrtype_and_fpos_storage & 3);
1300       const uint32_t cur_byte_ct = cur_vblock_variant_ct * bytes_per_entry;
1301 #ifndef NO_MMAP
1302       if (shared_ff) {
1303 #endif
1304         if (unlikely(!fread_unlocked(loadbuf, cur_byte_ct, 1, shared_ff))) {
1305           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1306           return kPglRetReadFail;
1307         }
1308         fread_ptr = loadbuf;
1309 #ifndef NO_MMAP
1310       }
1311 #endif
1312       if (bytes_per_entry == 1) {
1313         for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1314           var_fpos_iter[cur_vblock_vidx] = cur_fpos;
1315           uint32_t cur_vrec_len = fread_ptr[cur_vblock_vidx];
1316           cur_fpos += cur_vrec_len;
1317           // no need for correct max_vrec_width
1318         }
1319       } else if (bytes_per_entry == 2) {
1320         for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1321           var_fpos_iter[cur_vblock_vidx] = cur_fpos;
1322           uint16_t cur_vrec_len;
1323           memcpy_k(&cur_vrec_len, &(fread_ptr[cur_vblock_vidx * 2]), 2);
1324           cur_fpos += cur_vrec_len;
1325           if (cur_vrec_len > max_vrec_width) {
1326             // todo: check whether we're better off just assuming 2^16 - 1
1327             max_vrec_width = cur_vrec_len;
1328           }
1329         }
1330       } else if (bytes_per_entry == 3) {
1331         for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1332           var_fpos_iter[cur_vblock_vidx] = cur_fpos;
1333           uint32_t cur_vrec_len;
1334           // safe to read a byte past the end, since that's either in loadbuf
1335           // or, in mmap case, we can't be at the end of a valid file
1336           memcpy(&cur_vrec_len, &(fread_ptr[cur_vblock_vidx * 3]), sizeof(int32_t));
1337           cur_vrec_len &= 0xffffff;
1338           cur_fpos += cur_vrec_len;
1339           if (cur_vrec_len > max_vrec_width) {
1340             max_vrec_width = cur_vrec_len;
1341           }
1342         }
1343       } else {
1344         for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1345           var_fpos_iter[cur_vblock_vidx] = cur_fpos;
1346           uint32_t cur_vrec_len;
1347           memcpy(&cur_vrec_len, &(fread_ptr[cur_vblock_vidx * 4]), 4);
1348           cur_fpos += cur_vrec_len;
1349           if (cur_vrec_len > max_vrec_width) {
1350             max_vrec_width = cur_vrec_len;
1351           }
1352         }
1353 #ifdef __LP64__
1354         if (unlikely(max_vrec_width > kPglMaxBytesPerVariant)) {
1355           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid .pgen header.\n");
1356           return kPglRetMalformedInput;
1357         }
1358 #else
1359         if (unlikely(max_vrec_width > kMaxBytesPerIO)) {
1360           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Variant records too large for 32-bit pgenlib.\n");
1361           return kPglRetNomem;
1362         }
1363 #endif
1364       }
1365       var_fpos_iter = &(var_fpos_iter[cur_vblock_variant_ct]);
1366 #ifndef NO_MMAP
1367       if (!shared_ff) {
1368         fread_ptr = &(fread_ptr[cur_byte_ct]);
1369       }
1370 #endif
1371     }
1372     // 2. allele counts?
1373     if (alt_allele_ct_byte_ct) {
1374       assert(alt_allele_ct_byte_ct == 1);
1375 #ifndef NO_MMAP
1376       if (shared_ff) {
1377 #endif
1378         if (unlikely(!fread_unlocked(loadbuf, cur_vblock_variant_ct * alt_allele_ct_byte_ct, 1, shared_ff))) {
1379           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1380           return kPglRetReadFail;
1381         }
1382         fread_ptr = loadbuf;
1383 #ifndef NO_MMAP
1384       }
1385 #endif
1386       // max_allele_ct scan can probably be sped up with _mm{256}_max_epu8()?
1387       // probably can't do much for main loop (at least in sizeof(AlleleCode)
1388       // == 1 case)
1389       if (allele_cts_already_loaded) {
1390         // todo: update this for multibyte AlleleCode
1391         for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1392           const uintptr_t cur_allele_idx_offset = allele_idx_offsets_iter[cur_vblock_vidx];
1393           const uint32_t cur_allele_ct = fread_ptr[cur_vblock_vidx];
1394           if (unlikely((cur_allele_idx_offset - prev_allele_idx_offset) != cur_allele_ct)) {
1395             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded allele_idx_offsets do not match values in .pgen file.\n");
1396             return kPglRetInconsistentInput;
1397           }
1398           prev_allele_idx_offset = cur_allele_idx_offset;
1399           if (cur_allele_ct > max_allele_ct) {
1400             max_allele_ct = cur_allele_ct;
1401           }
1402         }
1403       } else {
1404         for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1405           const uint32_t cur_allele_ct = fread_ptr[cur_vblock_vidx];
1406           allele_idx_offsets_iter[cur_vblock_vidx] = prev_allele_idx_offset;
1407           prev_allele_idx_offset += cur_allele_ct;
1408           if (cur_allele_ct > max_allele_ct) {
1409             max_allele_ct = cur_allele_ct;
1410           }
1411         }
1412       }
1413       allele_idx_offsets_iter = &(allele_idx_offsets_iter[cur_vblock_variant_ct]);
1414 #ifndef NO_MMAP
1415       if (!shared_ff) {
1416         fread_ptr = &(fread_ptr[cur_vblock_variant_ct * alt_allele_ct_byte_ct]);
1417       }
1418 #endif
1419     }
1420     // 3. nonref flags?
1421     if (nonref_flags_stored) {
1422       const uint32_t cur_byte_ct = DivUp(cur_vblock_variant_ct, CHAR_BIT);
1423 #ifndef NO_MMAP
1424       if (!shared_ff) {
1425         if (nonref_flags_already_loaded) {
1426           if (unlikely(!memequal(nonref_flags_iter, fread_ptr, cur_byte_ct))) {
1427             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
1428             return kPglRetInconsistentInput;
1429           }
1430         } else {
1431           memcpy(nonref_flags_iter, fread_ptr, cur_byte_ct);
1432         }
1433         fread_ptr = &(fread_ptr[cur_byte_ct]);
1434       } else {
1435 #endif
1436         unsigned char* loadptr = nonref_flags_already_loaded? loadbuf : nonref_flags_iter;
1437         if (unlikely(!fread_unlocked(loadptr, cur_byte_ct, 1, shared_ff))) {
1438           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1439           return kPglRetReadFail;
1440         }
1441         if (nonref_flags_already_loaded) {
1442           if (unlikely(!memequal(nonref_flags_iter, loadbuf, cur_byte_ct))) {
1443             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
1444             return kPglRetInconsistentInput;
1445           }
1446         }
1447 #ifndef NO_MMAP
1448       }
1449 #endif
1450       nonref_flags_iter = &(nonref_flags_iter[cur_byte_ct]);
1451     }
1452   }
1453 }
1454 
GetLdbaseVidx(const unsigned char * vrtypes,uint32_t cur_vidx)1455 uint32_t GetLdbaseVidx(const unsigned char* vrtypes, uint32_t cur_vidx) {
1456 #ifdef __LP64__
1457   const VecW* vrtypes_valias = R_CAST(const VecW*, vrtypes);
1458   const uint32_t cur_vidx_orig_remainder = cur_vidx % kBytesPerVec;
1459   uint32_t vidx_vec_idx = cur_vidx / kBytesPerVec;
1460   Vec8thUint v8ui = 0;
1461   if (cur_vidx_orig_remainder) {
1462     const VecW cur_vvec = vrtypes_valias[vidx_vec_idx];
1463     // non-ld: ((bit 2) OR (NOT bit 1))
1464     const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5);
1465     const VecW inv_cur_vvec_bit1 = ~vecw_slli(cur_vvec, 6);
1466     v8ui = vecw_movemask(cur_vvec_bit2 | inv_cur_vvec_bit1);
1467     v8ui = bzhi(v8ui, cur_vidx_orig_remainder);
1468   }
1469   while (!v8ui) {
1470     const VecW cur_vvec = vrtypes_valias[--vidx_vec_idx];
1471     const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5);
1472     const VecW inv_cur_vvec_bit1 = ~vecw_slli(cur_vvec, 6);
1473     v8ui = vecw_movemask(cur_vvec_bit2 | inv_cur_vvec_bit1);
1474   }
1475   return (vidx_vec_idx * kBytesPerVec) + bsru32(v8ui);
1476 #else
1477   const uintptr_t* vrtypes_walias = R_CAST(const uintptr_t*, vrtypes);
1478   const uint32_t cur_vidx_orig_remainder = cur_vidx % kBytesPerWord;
1479   uint32_t vidx_word_idx = (cur_vidx - 1) / kBytesPerWord;
1480   uintptr_t cur_vrtypes_word = vrtypes_walias[vidx_word_idx];
1481   if (cur_vidx_orig_remainder) {
1482     // make sure we don't detect a byte after the current position.
1483     cur_vrtypes_word = bzhi(cur_vrtypes_word, CHAR_BIT * cur_vidx_orig_remainder);
1484     cur_vrtypes_word |= (kMask0101 * 2) << (CHAR_BIT * cur_vidx_orig_remainder);
1485   }
1486   while (1) {
1487     // ((bit 2) OR (NOT bit 1)) for each byte.  (possible experiment: see if
1488     // the same assembly is generated if this expression is rewritten to use
1489     // ands/nots.)
1490     const uintptr_t detect_non_ld_word = ((cur_vrtypes_word >> 1) | (~cur_vrtypes_word)) & (kMask0101 * 2);
1491     if (detect_non_ld_word) {
1492       // find the highest-order set bit in detect_non_ld_word; this corresponds
1493       // to the last non-LD-compressed byte (assuming little-endian).
1494       const uint32_t new_ldbase_vidx_loworder = bsrw(detect_non_ld_word) / CHAR_BIT;
1495       return (vidx_word_idx * kBytesPerWord) + new_ldbase_vidx_loworder;
1496     }
1497     // everything LD-compressed in the current block.  move back 8 bytes in the
1498     // array (or 4-bytes for 32-bit build).
1499     cur_vrtypes_word = vrtypes_walias[--vidx_word_idx];
1500   }
1501 #endif
1502 }
1503 
PgfiMultireadGetCachelineReq(const uintptr_t * variant_include,const PgenFileInfo * pgfip,uint32_t variant_ct,uint32_t block_size)1504 uint64_t PgfiMultireadGetCachelineReq(const uintptr_t* variant_include, const PgenFileInfo* pgfip, uint32_t variant_ct, uint32_t block_size) {
1505   // if block_size < kPglVblockSize, it's ideal for it to be a power of 2 (to
1506   // avoid unnecessary vblock crossing), but that's not required.
1507   const uint32_t raw_variant_ct = pgfip->raw_variant_ct;
1508   if (variant_ct == raw_variant_ct) {
1509     variant_include = nullptr;
1510   }
1511   uint32_t block_ct_m1 = 0;
1512   if (raw_variant_ct < block_size) {
1513     block_size = raw_variant_ct;
1514   } else {
1515     block_ct_m1 = (raw_variant_ct - 1) / block_size;
1516   }
1517   const uint64_t* var_fpos = pgfip->var_fpos;
1518   if ((!variant_include) && (!var_fpos)) {
1519     return DivUpU64(S_CAST(uint64_t, pgfip->const_vrec_width) * block_size, kCacheline);
1520   }
1521   uint64_t max_block_byte_ct = 0;
1522   uint32_t max_block_variant_ct = 0;
1523   for (uint32_t block_idx = 0; ; ++block_idx) {
1524     uint32_t variant_uidx_start = block_idx * block_size;
1525     uint32_t variant_uidx_end = variant_uidx_start + block_size;
1526     if (block_idx >= block_ct_m1) {
1527       if (block_idx > block_ct_m1) {
1528         break;
1529       }
1530       variant_uidx_end = raw_variant_ct;
1531     }
1532     if (variant_include) {
1533       variant_uidx_start = AdvBoundedTo1Bit(variant_include, variant_uidx_start, variant_uidx_end);
1534       if (variant_uidx_start == variant_uidx_end) {
1535         continue;
1536       }
1537       variant_uidx_end = 1 + FindLast1BitBefore(variant_include, variant_uidx_end);
1538     }
1539     if (var_fpos) {
1540       if (pgfip->vrtypes && ((pgfip->vrtypes[variant_uidx_start] & 6) == 2)) {
1541         // need to start loading from LD-buddy
1542         variant_uidx_start = GetLdbaseVidx(pgfip->vrtypes, variant_uidx_start);
1543       }
1544       uint64_t cur_block_byte_ct = var_fpos[variant_uidx_end] - var_fpos[variant_uidx_start];
1545       if (cur_block_byte_ct > max_block_byte_ct) {
1546         max_block_byte_ct = cur_block_byte_ct;
1547       }
1548     } else {
1549       // no LD compression here
1550       const uint32_t cur_block_variant_ct = variant_uidx_end - variant_uidx_start;
1551       if (cur_block_variant_ct > max_block_variant_ct) {
1552         max_block_variant_ct = cur_block_variant_ct;
1553         if (cur_block_variant_ct == block_size) {
1554           // no larger value possible, terminate search
1555           break;
1556         }
1557       }
1558     }
1559   }
1560   if (!var_fpos) {
1561     max_block_byte_ct = max_block_variant_ct * S_CAST(uint64_t, pgfip->const_vrec_width);
1562   }
1563   return DivUpU64(max_block_byte_ct, kCacheline);
1564 }
1565 
PglErr PgfiMultiread(const uintptr_t* variant_include, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t load_variant_ct, PgenFileInfo* pgfip) {
  // Reads the records of load_variant_ct variants in
  // [variant_uidx_start, variant_uidx_end) from pgfip->shared_ff into
  // pgfip->block_base, using one or more fseeko()+fread calls.
  // - variant_include, if non-null, selects which variants in the range are
  //   wanted.  It is dereferenced whenever the remaining range is longer than
  //   load_variant_ct, so it must be non-null in that case.
  // - pgfip->block_offset is set to the file position of the first byte
  //   loaded; a record at file position p is stored at
  //   block_base[p - block_offset].
  // Returns kPglRetReadFail if a seek or read fails, kPglRetSuccess otherwise.
  //
  // we could permit 0, but that encourages lots of unnecessary thread wakeups
  assert(load_variant_ct);
  if (variant_include) {
    // skip to the first included variant
    variant_uidx_start = AdvTo1Bit(variant_include, variant_uidx_start);
  }
  assert(variant_uidx_start < pgfip->raw_variant_ct);
  uint64_t block_offset;
  if (pgfip->vrtypes && ((pgfip->vrtypes[variant_uidx_start] & 6) == 2)) {
    // need to start loading from LD-buddy
    // assume for now that we can't skip any variants between the LD-buddy and
    // the actual first variant; should remove this assumption later
    block_offset = pgfip->var_fpos[GetLdbaseVidx(pgfip->vrtypes, variant_uidx_start)];
  } else {
    block_offset = GetPgfiFpos(pgfip, variant_uidx_start);
  }
  pgfip->block_offset = block_offset;
  uint64_t next_read_start_fpos = block_offset;
  // break this up into multiple freads whenever this lets us skip an entire
  // disk block
  // (possible todo: make the disk block size a parameter of this function)
  do {
    // Each outer iteration issues exactly one seek + read covering
    // [cur_read_start_fpos, cur_read_end_fpos).
    const uint64_t cur_read_start_fpos = next_read_start_fpos;
    uint32_t cur_read_uidx_end;
    uint64_t cur_read_end_fpos;
    // Inner loop: extend the current read one run of included variants at a
    // time, until either everything is covered or the gap to the next run is
    // large enough to be worth a separate read.
    while (1) {
      cur_read_uidx_end = variant_uidx_end;
      if (cur_read_uidx_end - variant_uidx_start == load_variant_ct) {
        // Every remaining variant in the range is wanted: read through the
        // end of the range and finish.
        cur_read_end_fpos = GetPgfiFpos(pgfip, cur_read_uidx_end);
        load_variant_ct = 0;
        break;
      }
      // End of the current run of included variants.
      cur_read_uidx_end = AdvTo0Bit(variant_include, variant_uidx_start);
      cur_read_end_fpos = GetPgfiFpos(pgfip, cur_read_uidx_end);
      load_variant_ct -= cur_read_uidx_end - variant_uidx_start;
      if (!load_variant_ct) {
        break;
      }
      // Start of the next run of included variants.
      variant_uidx_start = AdvTo1Bit(variant_include, cur_read_uidx_end);
      next_read_start_fpos = GetPgfiFpos(pgfip, variant_uidx_start);
      if (pgfip->vrtypes && ((pgfip->vrtypes[variant_uidx_start] & 6) == 2)) {
        // Next run starts with an LD-compressed variant, so its LD base must
        // be loaded too.
        const uint32_t variant_read_uidx_start = GetLdbaseVidx(pgfip->vrtypes, variant_uidx_start);
        if (variant_read_uidx_start <= cur_read_uidx_end) {
          // LD base is at or before the end of the current read, so there is
          // no gap to skip: keep extending the current read.
          continue;
        }
        next_read_start_fpos = pgfip->var_fpos[variant_read_uidx_start];
      }
      // bugfix: can't use do..while, since previous "continue" needs to skip
      // this check
      // Compares kDiskBlockSize-aligned positions: break (start a new read)
      // only if the next read starts in a later disk block than the one
      // containing (cur_read_end_fpos + kDiskBlockSize + 1).
      if (RoundDownPow2U64(cur_read_end_fpos + kDiskBlockSize + 1LLU, kDiskBlockSize) < RoundDownPow2U64(next_read_start_fpos, kDiskBlockSize)) {
        // minor bugfix (7 Jul 2017): break, not continue
        break;
      }
    }
    if (unlikely(fseeko(pgfip->shared_ff, cur_read_start_fpos, SEEK_SET))) {
      return kPglRetReadFail;
    }
    uintptr_t len = cur_read_end_fpos - cur_read_start_fpos;
    // block_base is cast to a writable pointer here; destination is offset so
    // that file positions map directly onto block_base.
    if (unlikely(fread_checked(K_CAST(unsigned char*, &(pgfip->block_base[cur_read_start_fpos - block_offset])), len, pgfip->shared_ff))) {
      return kPglRetReadFail;
    }
  } while (load_variant_ct);
  return kPglRetSuccess;
}
1630 
1631 
void PreinitPgr(PgenReader* pgr_ptr) {
  // Marks the reader as having no open file handle.  Presumably this lets
  // cleanup be called safely on a reader that was never fully initialized --
  // confirm against CleanupPgr().
  GetPgrp(pgr_ptr)->ff = nullptr;
}
1636 
PgrInit(const char * fname,uint32_t max_vrec_width,PgenFileInfo * pgfip,PgenReader * pgr_ptr,unsigned char * pgr_alloc)1637 PglErr PgrInit(const char* fname, uint32_t max_vrec_width, PgenFileInfo* pgfip, PgenReader* pgr_ptr, unsigned char* pgr_alloc) {
1638   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
1639   // See CountPgrAllocCachelinesRequired().
1640   // Could add a debug mode.
1641 
1642   // Mode 1 (mmap): block_base initialized, shared_ff == nullptr.  fname must
1643   //   be nullptr.
1644   // Mode 2 (block-fread): block_base initialized, shared_ff != nullptr.  fname
1645   //   must be nullptr.
1646   // Mode 3 (per-variant fread): block_base == nullptr.  fname must be
1647   //   non-null, though it isn't actually referenced during the first
1648   //   PgenReader initialization (instead shared_ff is moved).
1649   unsigned char* pgr_alloc_iter = pgr_alloc;
1650   if (pgfip->block_base != nullptr) {
1651     if (unlikely(fname != nullptr)) {
1652       return kPglRetImproperFunctionCall;
1653     }
1654     pgrp->ff = nullptr;  // make sure CleanupPgr() doesn't break
1655   } else {
1656     if (pgfip->shared_ff != nullptr) {
1657       if (unlikely(fname == nullptr)) {
1658         return kPglRetImproperFunctionCall;
1659       }
1660       // move instead of close/reopen.
1661       pgrp->ff = pgfip->shared_ff;
1662       pgfip->shared_ff = nullptr;
1663     } else {
1664       pgrp->ff = fopen(fname, FOPEN_RB);
1665       if (unlikely(!pgrp->ff)) {
1666         return kPglRetOpenFail;
1667       }
1668     }
1669     // now that arbitrary info can be stored between header and first variant
1670     // record, always seek.
1671     uint64_t seek_pos;
1672     if (pgfip->var_fpos) {
1673       seek_pos = pgfip->var_fpos[0];
1674     } else {
1675       seek_pos = pgfip->const_fpos_offset;
1676     }
1677     if (unlikely(fseeko(pgrp->ff, seek_pos, SEEK_SET))) {
1678       return kPglRetReadFail;
1679     }
1680   }
1681   pgrp->fi = *pgfip;  // struct copy
1682   if (fname) {
1683     // Mode 3 per-reader load buffer
1684     pgrp->fread_buf = pgr_alloc_iter;
1685     pgr_alloc_iter = &(pgr_alloc_iter[RoundUpPow2(max_vrec_width, kCacheline)]);
1686   }
1687   pgrp->fp_vidx = 0;
1688   pgrp->ldbase_vidx = UINT32_MAX;
1689   pgrp->ldbase_stypes = kfPgrLdcache0;
1690   pgrp->ldbase_genovec = nullptr;
1691   pgrp->ldbase_raregeno = nullptr;
1692   pgrp->ldbase_difflist_sample_ids = nullptr;
1693 
1694   const PgenGlobalFlags gflags = pgrp->fi.gflags;
1695   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
1696   const uint32_t genovec_bytes_req = NypCtToCachelineCt(raw_sample_ct) * kCacheline;
1697   pgrp->ldbase_raw_genovec = R_CAST(uintptr_t*, pgr_alloc_iter);
1698   pgr_alloc_iter = &(pgr_alloc_iter[genovec_bytes_req]);
1699   const uint32_t bitvec_bytes_req = BitCtToCachelineCt(raw_sample_ct) * kCacheline;
1700   const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1;
1701   const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor);
1702   const uint32_t max_allele_ct = pgrp->fi.max_allele_ct;
1703   pgrp->workspace_difflist_sample_ids = nullptr;
1704   if ((gflags & kfPgenGlobalDifflistOrLdPresent) || (max_allele_ct > 2)) {
1705     pgrp->workspace_difflist_sample_ids = R_CAST(uint32_t*, pgr_alloc_iter);
1706     pgr_alloc_iter = &(pgr_alloc_iter[(1 + (max_difflist_entry_ct_base / kInt32PerCacheline)) * (kCacheline * k1LU)]);
1707   }
1708   if (gflags & kfPgenGlobalDifflistOrLdPresent) {
1709     // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);
1710 
1711     pgrp->workspace_raregeno_vec = R_CAST(uintptr_t*, pgr_alloc_iter);
1712     pgr_alloc_iter = &(pgr_alloc_iter[NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline]);
1713 
1714     pgrp->workspace_raregeno_tmp_loadbuf = R_CAST(uintptr_t*, pgr_alloc_iter);
1715     pgr_alloc_iter = &(pgr_alloc_iter[NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline]);
1716 
1717     if (ld_compression_present) {
1718       pgrp->ldbase_genovec = R_CAST(uintptr_t*, pgr_alloc_iter);
1719       pgr_alloc_iter = &(pgr_alloc_iter[genovec_bytes_req]);
1720 
1721       pgrp->ldbase_raregeno = R_CAST(uintptr_t*, pgr_alloc_iter);
1722       pgr_alloc_iter = &(pgr_alloc_iter[NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline]);
1723 
1724       pgrp->ldbase_difflist_sample_ids = R_CAST(uint32_t*, pgr_alloc_iter);
1725       pgr_alloc_iter = &(pgr_alloc_iter[(1 + (max_difflist_entry_ct_base / kInt32PerCacheline)) * (kCacheline * k1LU)]);
1726     }
1727   } else {
1728     pgrp->workspace_raregeno_vec = nullptr;
1729     pgrp->workspace_raregeno_tmp_loadbuf = nullptr;
1730   }
1731   pgrp->workspace_vec = nullptr;
1732   pgrp->workspace_aux1x_present = nullptr;
1733   pgrp->workspace_imp_r2 = nullptr;
1734   pgrp->workspace_all_hets = nullptr;
1735   pgrp->workspace_subset = nullptr;
1736   const PgenGlobalFlags gflags_hphase_dosage = gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent);
1737   if ((max_allele_ct > 2) || gflags_hphase_dosage) {
1738     pgrp->workspace_vec = R_CAST(uintptr_t*, pgr_alloc_iter);
1739     pgr_alloc_iter = &(pgr_alloc_iter[genovec_bytes_req]);
1740     if (max_allele_ct > 2) {
1741       pgrp->workspace_aux1x_present = R_CAST(uintptr_t*, pgr_alloc_iter);
1742       pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
1743       pgrp->workspace_imp_r2 = R_CAST(uint64_t*, pgr_alloc_iter);
1744       pgr_alloc_iter = &(pgr_alloc_iter[Int64CtToCachelineCt(2 * max_allele_ct) * (kCacheline * k1LU)]);
1745     }
1746     if (gflags & kfPgenGlobalHardcallPhasePresent) {
1747       pgrp->workspace_all_hets = R_CAST(uintptr_t*, pgr_alloc_iter);
1748       pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
1749       pgrp->workspace_subset = R_CAST(uintptr_t*, pgr_alloc_iter);
1750       pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
1751     }
1752     pgrp->workspace_dosage_present = nullptr;
1753     pgrp->workspace_dphase_present = nullptr;
1754     if (gflags & kfPgenGlobalDosagePresent) {
1755       pgrp->workspace_dosage_present = R_CAST(uintptr_t*, pgr_alloc_iter);
1756       pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
1757       if (gflags & kfPgenGlobalDosagePhasePresent) {
1758         pgrp->workspace_dphase_present = R_CAST(uintptr_t*, pgr_alloc_iter);
1759       }
1760       // pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
1761     }
1762   }
1763   return kPglRetSuccess;
1764 }
1765 
PgrPlink1ToPlink2InplaceUnsafe(uint32_t sample_ct,uintptr_t * genovec)1766 void PgrPlink1ToPlink2InplaceUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
1767   // 00 -> 10, 01 -> 11, 10 -> 01, 11 -> 00
1768   // new low bit  = [old low] ^ [old high]
1769   // new high bit = ~[old high]
1770   // "unsafe" because trailing bits are not zeroed out.
1771   const uint32_t vec_ct = NypCtToVecCt(sample_ct);
1772   const VecW m1 = VCONST_W(kMask5555);
1773   const VecW not_m1 = VCONST_W(kMaskAAAA);
1774   VecW* vptr = R_CAST(VecW*, genovec);
1775   for (uint32_t vidx = 0; vidx != vec_ct; vidx++) {
1776     const VecW not_cur_vec_high = vecw_and_notfirst(vptr[vidx], not_m1);
1777     vptr[vidx] = (vecw_and_notfirst(vptr[vidx], m1) ^ vecw_srli(not_cur_vec_high, 1)) | not_cur_vec_high;
1778   }
1779 }
1780 
PgrPlink2ToPlink1InplaceUnsafe(uint32_t sample_ct,uintptr_t * genovec)1781 void PgrPlink2ToPlink1InplaceUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
1782   // 00 -> 11, 01 -> 10, 10 -> 00, 11 -> 01
1783   // new low bit  = [old low] ^ (~[old high])
1784   // new high bit = ~[old high]
1785   const uint32_t vec_ct = NypCtToVecCt(sample_ct);
1786   const VecW not_m1 = VCONST_W(kMaskAAAA);
1787   VecW* vptr = R_CAST(VecW*, genovec);
1788   for (uint32_t vidx = 0; vidx != vec_ct; vidx++) {
1789     VecW cur_vec = vptr[vidx];
1790     VecW not_cur_vec_high = vecw_and_notfirst(cur_vec, not_m1);
1791     vptr[vidx] = (vecw_and_notfirst(not_m1, cur_vec) ^ vecw_srli(not_cur_vec_high, 1)) | not_cur_vec_high;
1792   }
1793 }
1794 
ParseDifflistHeader(const unsigned char * fread_end,uint32_t raw_sample_ct,const unsigned char ** fread_pp,uintptr_t * raregeno_buf,const unsigned char ** difflist_group_info_ptr,uint32_t * difflist_len_ptr)1795 PglErr ParseDifflistHeader(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* raregeno_buf, const unsigned char** difflist_group_info_ptr, uint32_t* difflist_len_ptr) {
1796   // Can be used for deltalists as well: pass raregeno_buf == nullptr.
1797   // Trailing bits of raregeno may not be zeroed out.
1798   // Will need a separate 64-bit version of this for multiallelic dosages.
1799   const uint32_t difflist_len = GetVint31(fread_end, fread_pp);
1800   // moved here to address maybe-uninitialized warnings
1801   *difflist_group_info_ptr = *fread_pp;
1802   *difflist_len_ptr = difflist_len;
1803   if (!difflist_len) {
1804     return kPglRetSuccess;
1805   }
1806   if (unlikely(difflist_len > raw_sample_ct / kPglMaxDifflistLenDivisor)) {
1807     // automatically catches GetVint31() failure
1808     return kPglRetMalformedInput;
1809   }
1810   const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize);
1811   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
1812   const uint32_t difflist_index_byte_ct = group_ct * (sample_id_byte_ct + 1) - 1;
1813   if (PtrAddCk(fread_end, difflist_index_byte_ct, fread_pp)) {
1814     return kPglRetMalformedInput;
1815   }
1816   if (!raregeno_buf) {
1817     // for sample ID lists without 2-bit genotype info, used for sparse dosage
1818     return kPglRetSuccess;
1819   }
1820   const uint32_t raregeno_byte_ct = NypCtToByteCt(difflist_len);
1821   const unsigned char* raregeno_start = *fread_pp;
1822   if (PtrAddCk(fread_end, raregeno_byte_ct, fread_pp)) {
1823     return kPglRetMalformedInput;
1824   }
1825   // possible todo: just return raregeno_start, and let the caller perform this
1826   // copy
1827   memcpy(raregeno_buf, raregeno_start, raregeno_byte_ct);
1828   return kPglRetSuccess;
1829 }
1830 
ParseAndSaveDifflist(const unsigned char * fread_end,uint32_t raw_sample_ct,const unsigned char ** fread_pp,uintptr_t * __restrict raregeno,uint32_t * __restrict difflist_sample_ids,uint32_t * __restrict difflist_len_ptr)1831 PglErr ParseAndSaveDifflist(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
1832   // Appropriate when we need to iterate through the difflist multiple times.
1833   // Other functions are more efficient if we only need to process the list
1834   // once.
1835   // Trailing bits of raregeno may not be zeroed out.
1836   const unsigned char* group_info_iter;
1837   PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno, &group_info_iter, difflist_len_ptr);
1838   uint32_t difflist_len = *difflist_len_ptr;
1839   // todo: check if difflist_len == 0 early exit is a net positive or negative
1840   // on a few test datasets
1841   if (reterr || (!difflist_len)) {
1842     return reterr;
1843   }
1844   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
1845   uint32_t* difflist_sample_ids_iter = difflist_sample_ids;
1846   for (uint32_t difflist_remaining = difflist_len; ; ) {
1847     const uint32_t* difflist_sample_ids_stop;
1848     if (difflist_remaining < kPglDifflistGroupSize) {
1849       if (!difflist_remaining) {
1850         return kPglRetSuccess;
1851       }
1852       difflist_sample_ids_stop = &(difflist_sample_ids_iter[difflist_remaining]);
1853       difflist_remaining = 0;
1854     } else {
1855       difflist_sample_ids_stop = &(difflist_sample_ids_iter[kPglDifflistGroupSize]);
1856       difflist_remaining -= kPglDifflistGroupSize;
1857     }
1858     // can't use uint32_t assignment trick for now since there's a corner case
1859     // where that would read past the end of the mapped address range
1860     uintptr_t raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
1861     group_info_iter = &(group_info_iter[sample_id_byte_ct]);
1862     while (1) {
1863 #ifndef __LP64__
1864       // perform more frequent checks in 32-bit build since raw_sample_idx may
1865       // overflow
1866       // misses "small negative" malformed input, but it'll catch data
1867       // corruption with very high probability
1868       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
1869         return kPglRetMalformedInput;
1870       }
1871 #endif
1872       *difflist_sample_ids_iter++ = raw_sample_idx;
1873       if (difflist_sample_ids_iter == difflist_sample_ids_stop) {
1874         break;
1875       }
1876       raw_sample_idx += GetVint31(fread_end, fread_pp);
1877     }
1878 #ifdef __LP64__
1879     if (unlikely(raw_sample_idx >= raw_sample_ct)) {
1880       return kPglRetMalformedInput;
1881     }
1882 #endif
1883   }
1884   return kPglRetSuccess;
1885 }
1886 
ParseAndSaveDifflistProperSubset(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t raw_sample_ct,const unsigned char ** fread_pp,uintptr_t * __restrict raregeno,uint32_t * __restrict difflist_sample_ids,uint32_t * __restrict difflist_len_ptr,uintptr_t * __restrict raregeno_workspace)1887 PglErr ParseAndSaveDifflistProperSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr, uintptr_t* __restrict raregeno_workspace) {
1888   // Requires a PROPER subset.  Might want to just merge this with
1889   // ParseAndSaveDifflist() and rename appropriately.
1890   // Trailing bits of raregeno are zeroed out.
1891   uint32_t raw_difflist_len;
1892   const unsigned char* group_info_iter;
1893   PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &raw_difflist_len);
1894   if (reterr || (!raw_difflist_len)) {
1895     *difflist_len_ptr = 0;
1896     return reterr;
1897   }
1898   const uint32_t subgroup_idx_last = (raw_difflist_len - 1) / kBitsPerWordD2;
1899   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
1900   uintptr_t* raregeno_workspace_iter = raregeno_workspace;
1901   uintptr_t* raregeno_iter = raregeno;
1902   uint32_t* difflist_sample_ids_iter = difflist_sample_ids;
1903 
1904   // technically doesn't need to be initialized, but I have principles
1905   uintptr_t raw_sample_idx = 0;
1906 
1907   uintptr_t raregeno_word = 0;
1908   uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1;
1909   uint32_t difflist_len_lowbits = 0;
1910   for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
1911     if (subgroup_idx >= subgroup_idx_last) {
1912       if (subgroup_idx > subgroup_idx_last) {
1913         if (difflist_len_lowbits) {
1914           *raregeno_iter = raregeno_word;
1915         }
1916         *difflist_len_ptr = S_CAST(uintptr_t, difflist_sample_ids_iter - difflist_sample_ids) + difflist_len_lowbits;
1917         return kPglRetSuccess;
1918       }
1919       subgroup_len_m1 &= raw_difflist_len - 1;
1920     }
1921     // We need to consume a new rare genotype word every 32 entries, and pull a
1922     // raw sample index from the difflist header every 64 entries.  So it's
1923     // best to make the inner loop have a period of 32 (call this a 'subgroup',
1924     // where 'group' refers to a set of 64 entries).
1925     if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
1926 #ifdef __LP64__
1927       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
1928         return kPglRetMalformedInput;
1929       }
1930 #endif
1931       raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
1932       group_info_iter = &(group_info_iter[sample_id_byte_ct]);
1933     } else {
1934       raw_sample_idx += GetVint31(fread_end, fread_pp);
1935     }
1936     uintptr_t raregeno_workspace_word = *raregeno_workspace_iter++;
1937     for (uint32_t raw_difflist_idx_lowbits = 0; ; ++raw_difflist_idx_lowbits) {
1938 #ifndef __LP64__
1939       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
1940         return kPglRetMalformedInput;
1941       }
1942 #endif
1943       if (IsSet(sample_include, raw_sample_idx)) {
1944         raregeno_word |= ((raregeno_workspace_word >> (2 * raw_difflist_idx_lowbits)) & 3) << (difflist_len_lowbits * 2);
1945         difflist_sample_ids_iter[difflist_len_lowbits] = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx);
1946         if (difflist_len_lowbits++ == (kBitsPerWordD2 - 1)) {
1947           *raregeno_iter++ = raregeno_word;
1948           raregeno_word = 0;
1949           difflist_len_lowbits = 0;
1950           difflist_sample_ids_iter = &(difflist_sample_ids_iter[kBitsPerWordD2]);
1951         }
1952       }
1953       if (raw_difflist_idx_lowbits == subgroup_len_m1) {
1954         break;
1955       }
1956       raw_sample_idx += GetVint31(fread_end, fread_pp);
1957     }
1958   }
1959 }
1960 
ParseLdAndMergeDifflistSubset(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,const uintptr_t * __restrict ldbase_raregeno,const uint32_t * __restrict ldbase_difflist_sample_ids,uint32_t ldbase_difflist_len,uintptr_t ldbase_common_geno,uint32_t raw_sample_ct,uint32_t sample_ct,const unsigned char ** fread_pp,uintptr_t * __restrict merged_raregeno,uint32_t * __restrict merged_difflist_sample_ids,uint32_t * __restrict merged_difflist_len_ptr,uintptr_t * __restrict diff_from_ldbase_raregeno_iter)1961 PglErr ParseLdAndMergeDifflistSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict ldbase_raregeno, const uint32_t* __restrict ldbase_difflist_sample_ids, uint32_t ldbase_difflist_len, uintptr_t ldbase_common_geno, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict merged_raregeno, uint32_t* __restrict merged_difflist_sample_ids, uint32_t* __restrict merged_difflist_len_ptr, uintptr_t* __restrict diff_from_ldbase_raregeno_iter) {
1962   // Used when the ldbase variant was saved as a difflist, and it's useful to
1963   // process the current variant as a difflist.
1964   // * Assumes ldbase_difflist_sample_ids[ldbase_difflist_len]==sample_ct.
1965   // * Assumes sample_include == nullptr if no subsetting needed.  (Otherwise,
1966   //   it'll still work, but performance will be worse.)
1967   // Trailing bits of merged_raregeno may not be zeroed out.
1968   // Caller is responsible for inverting ldbase_common_geno and merged_raregeno
1969   // afterward if necessary.
1970   assert(ldbase_difflist_sample_ids[ldbase_difflist_len] == sample_ct);
1971   uint32_t diff_from_ldbase_len;
1972   const unsigned char* group_info_iter;
1973   PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, diff_from_ldbase_raregeno_iter, &group_info_iter, &diff_from_ldbase_len);
1974   if (unlikely(reterr)) {
1975     return reterr;
1976   }
1977   if (!diff_from_ldbase_len) {
1978     memcpy(merged_difflist_sample_ids, ldbase_difflist_sample_ids, ldbase_difflist_len * sizeof(int32_t));
1979     *merged_difflist_len_ptr = ldbase_difflist_len;
1980     CopyNyparr(ldbase_raregeno, ldbase_difflist_len, merged_raregeno);
1981     return kPglRetSuccess;
1982   }
1983   const uint32_t subgroup_idx_last = (diff_from_ldbase_len - 1) / kBitsPerWordD2;
1984   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
1985   uintptr_t* merged_raregeno_iter = merged_raregeno;
1986   uint32_t* merged_difflist_sample_ids_iter = merged_difflist_sample_ids;
1987   uintptr_t merged_raregeno_word = 0;
1988   uintptr_t ldbase_raregeno_word = 0;
1989   uintptr_t diff_from_ldbase_raregeno_word = 0;
1990   uint32_t ldbase_sample_idx = ldbase_difflist_sample_ids[0];
1991   uintptr_t raw_sample_idx = 0;
1992   uintptr_t cur_geno = 0;
1993   uint32_t sample_idx = 0;
1994   uint32_t ldbase_difflist_idx = 0;
1995   uint32_t done = 0;
1996   uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1;
1997   uint32_t merge_idx_lowbits = 0;
1998   for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
1999     uint32_t diff_from_ldbase_idx_lowbits = 0;
2000     if (subgroup_idx >= subgroup_idx_last) {
2001       if (subgroup_idx > subgroup_idx_last) {
2002         done = 1;
2003         sample_idx = sample_ct;
2004         goto ParseLdAndMergeDifflistSubset_finish;
2005       }
2006       subgroup_len_m1 &= diff_from_ldbase_len - 1;
2007     }
2008     if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
2009       raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
2010       group_info_iter = &(group_info_iter[sample_id_byte_ct]);
2011     } else {
2012       raw_sample_idx += GetVint31(fread_end, fread_pp);
2013     }
2014     diff_from_ldbase_raregeno_word = *diff_from_ldbase_raregeno_iter++;
2015     for (; ; ++diff_from_ldbase_idx_lowbits) {
2016       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
2017         return kPglRetMalformedInput;
2018       }
2019       cur_geno = diff_from_ldbase_raregeno_word & 3;
2020       if ((!sample_include) || IsSet(sample_include, raw_sample_idx)) {
2021         sample_idx = sample_include? RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx) : raw_sample_idx;
2022       ParseLdAndMergeDifflistSubset_finish:
2023         while (ldbase_sample_idx < sample_idx) {
2024           // replace with blocked copy?
2025           if (!(ldbase_difflist_idx % kBitsPerWordD2)) {
2026             ldbase_raregeno_word = ldbase_raregeno[ldbase_difflist_idx / kBitsPerWordD2];
2027           }
2028           *merged_difflist_sample_ids_iter++ = ldbase_sample_idx;
2029           merged_raregeno_word |= (ldbase_raregeno_word & 3) << (2 * merge_idx_lowbits);
2030           if (merge_idx_lowbits++ == (kBitsPerWordD2 - 1)) {
2031             *merged_raregeno_iter++ = merged_raregeno_word;
2032             merged_raregeno_word = 0;
2033             merge_idx_lowbits = 0;
2034           }
2035           ++ldbase_difflist_idx;
2036           ldbase_raregeno_word >>= 2;
2037           ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx];
2038         }
2039         if (ldbase_sample_idx == sample_idx) {
2040           if (done) {
2041             if (merge_idx_lowbits) {
2042               *merged_raregeno_iter = merged_raregeno_word;
2043             }
2044             *merged_difflist_len_ptr = merged_difflist_sample_ids_iter - merged_difflist_sample_ids;
2045             return kPglRetSuccess;
2046           }
2047           if (!(ldbase_difflist_idx % kBitsPerWordD2)) {
2048             ldbase_raregeno_word = ldbase_raregeno[ldbase_difflist_idx / kBitsPerWordD2];
2049           }
2050           ++ldbase_difflist_idx;
2051           ldbase_raregeno_word >>= 2;
2052           ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx];
2053         }
2054         if (cur_geno != ldbase_common_geno) {
2055           *merged_difflist_sample_ids_iter++ = sample_idx;
2056           merged_raregeno_word |= cur_geno << (2 * merge_idx_lowbits);
2057           if (merge_idx_lowbits++ == (kBitsPerWordD2 - 1)) {
2058             *merged_raregeno_iter++ = merged_raregeno_word;
2059             merged_raregeno_word = 0;
2060             merge_idx_lowbits = 0;
2061           }
2062         }
2063       }
2064       if (diff_from_ldbase_idx_lowbits == subgroup_len_m1) {
2065         break;
2066       }
2067       raw_sample_idx += GetVint31(fread_end, fread_pp);
2068       diff_from_ldbase_raregeno_word >>= 2;
2069     }
2070   }
2071 }
2072 
2073 /*
2074 void PrunedDifflistToGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t sample_ct, uint32_t difflist_common_geno, uint32_t difflist_len, uintptr_t* __restrict genovec) {
2075   // Designed to be used after genovec subsetting.  Assumes all difflist
2076   // entries are valid.  Ok for trailing bits of raregeno to be nonzero.  Does
2077   // not zero out trailing bits of genovec.
2078   const uint32_t vec_ct = NypCtToVecCt(sample_ct);
2079   vecset(genovec, difflist_common_geno * kMask5555, vec_ct);
2080   if (!difflist_len) {
2081     return;
2082   }
2083   const uintptr_t* raregeno_incr = raregeno;
2084   const uint32_t* difflist_sample_ids_iter = difflist_sample_ids;
2085   const uint32_t* difflist_sample_ids_end = &(difflist_sample_ids[difflist_len]);
2086   // don't think there's a point to separating out the
2087   // difflist_common_geno == 0 case here, since the RawToSubsettedPos
2088   // operation is a bit expensive
2089   while (1) {
2090     // er, get rid of this undefined behavior if we uncomment this function
2091     const uint32_t* difflist_sample_ids_stop = &(difflist_sample_ids_iter[kBitsPerWordD2]);
2092     uintptr_t raregeno_word = *raregeno_incr++;
2093     if (difflist_sample_ids_stop > difflist_sample_ids_end) {
2094       if (difflist_sample_ids_iter == difflist_sample_ids_end) {
2095         return;
2096       }
2097       difflist_sample_ids_stop = difflist_sample_ids_end;
2098     }
2099     while (1) {
2100       const uint32_t cur_sample_idx = *difflist_sample_ids_iter;
2101       const uint32_t cur_subsetted_pos = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, cur_sample_idx);
2102       AssignNyparrEntry(cur_subsetted_pos, raregeno_word & 3, genovec);
2103       if (difflist_sample_ids_iter++ == difflist_sample_ids_stop) {
2104         break;
2105       }
2106       raregeno_word >>= 2;
2107     }
2108   }
2109 }
2110 */
2111 
ParseAndApplyDifflist(const unsigned char * fread_end,const unsigned char ** fread_pp,PgenReaderMain * pgrp,uintptr_t * __restrict genoarr)2112 PglErr ParseAndApplyDifflist(const unsigned char* fread_end, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
2113   // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
2114   // Cannot occur after genoarr subsetting since the difflist sample indexes
2115   // will be incorrect.
2116   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2117   uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
2118   const unsigned char* group_info_iter;
2119   uint32_t difflist_len;
2120   PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len);
2121   if (reterr || (!difflist_len)) {
2122     return reterr;
2123   }
2124   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
2125   const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
2126   uintptr_t raw_sample_idx = 0;
2127   for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
2128     uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
2129     if (subgroup_idx >= subgroup_idx_last) {
2130       if (subgroup_idx > subgroup_idx_last) {
2131         return kPglRetSuccess;
2132       }
2133       remaining_deltas_in_subgroup &= difflist_len - 1;
2134     }
2135     if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
2136       raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
2137       group_info_iter = &(group_info_iter[sample_id_byte_ct]);
2138     } else {
2139       raw_sample_idx += GetVint31(fread_end, fread_pp);
2140     }
2141     uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
2142     // This loop tends to be the decompression bottleneck.  Tried to modify it
2143     // to process 4 entries at a time, but that didn't end up helping.
2144     for (; ; --remaining_deltas_in_subgroup) {
2145       // always check, since otherwise AssignNyparrEntry() can scribble
2146       // over arbitrary memory
2147       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
2148         return kPglRetMalformedInput;
2149       }
2150       const uintptr_t cur_geno = cur_raregeno_word & 3;
2151       AssignNyparrEntry(raw_sample_idx, cur_geno, genoarr);
2152       if (!remaining_deltas_in_subgroup) {
2153         break;
2154       }
2155       raw_sample_idx += GetVint31(fread_end, fread_pp);
2156       cur_raregeno_word >>= 2;
2157     }
2158   }
2159 }
2160 
2161 // could merge ParseAndApplyDifflist() with this?
ParseAndApplyDifflistSubset(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,const unsigned char ** fread_pp,PgenReaderMain * pgrp,uintptr_t * __restrict genoarr)2162 PglErr ParseAndApplyDifflistSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
2163   // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
2164   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2165   if (sample_ct == raw_sample_ct) {
2166     return ParseAndApplyDifflist(fread_end, fread_pp, pgrp, genoarr);
2167   }
2168   uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
2169   const unsigned char* group_info_iter;
2170   uint32_t difflist_len;
2171   PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len);
2172   if (reterr || (!difflist_len)) {
2173     return reterr;
2174   }
2175   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
2176   const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
2177   uintptr_t raw_sample_idx = 0;
2178   for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
2179     uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
2180     if (subgroup_idx >= subgroup_idx_last) {
2181       if (subgroup_idx > subgroup_idx_last) {
2182         return kPglRetSuccess;
2183       }
2184       remaining_deltas_in_subgroup &= difflist_len - 1;
2185     }
2186     if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
2187       raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
2188       group_info_iter = &(group_info_iter[sample_id_byte_ct]);
2189     } else {
2190       raw_sample_idx += GetVint31(fread_end, fread_pp);
2191     }
2192     uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
2193     // This loop tends to be the decompression bottleneck.  Tried to modify it
2194     // to process 4 entries at a time, but that didn't end up helping.
2195     for (; ; --remaining_deltas_in_subgroup) {
2196       // always check, since otherwise AssignNyparrEntry() can scribble
2197       // over arbitrary memory
2198       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
2199         return kPglRetMalformedInput;
2200       }
2201       if (IsSet(sample_include, raw_sample_idx)) {
2202         const uintptr_t cur_geno = cur_raregeno_word & 3;
2203         AssignNyparrEntry(RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx), cur_geno, genoarr);
2204       }
2205       if (!remaining_deltas_in_subgroup) {
2206         break;
2207       }
2208       raw_sample_idx += GetVint31(fread_end, fread_pp);
2209       cur_raregeno_word >>= 2;
2210     }
2211   }
2212 }
2213 
2214 // vector-alignment preferred
ParseOnebitUnsafe(const unsigned char * fread_end,const unsigned char ** fread_pp,PgenReaderMain * pgrp,uintptr_t * __restrict genoarr)2215 PglErr ParseOnebitUnsafe(const unsigned char* fread_end, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
2216   // doesn't zero out trailing genoarr bits
2217   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2218   const uint32_t common2_and_bitarray_byte_ct = (raw_sample_ct + 15) / CHAR_BIT;
2219   const unsigned char* onebit_main_iter = *fread_pp;
2220   if (PtrAddCk(fread_end, common2_and_bitarray_byte_ct, fread_pp)) {
2221     return kPglRetMalformedInput;
2222   }
2223   const uintptr_t common2_code = *onebit_main_iter++;
2224   const uintptr_t word_base = (common2_code / 4) * kMask5555;
2225   const uintptr_t common_code_delta = common2_code & 3;
2226   uint32_t genoarr_widx = 0;
2227 #if defined(__LP64__) && !defined(USE_AVX2)
2228   // this is slower in AVX2 case
2229   const uint32_t read_hw_ct = raw_sample_ct / kBitsPerWordD2;
2230   if (read_hw_ct >= 2 * kWordsPerVec) {
2231     const uint32_t read_vec_ct = raw_sample_ct / kBitsPerVec;
2232     const VecW* onebit_main_valias = R_CAST(const VecW*, onebit_main_iter);
2233     const VecW m4 = VCONST_W(kMask0F0F);
2234 #  ifdef USE_SSE42
2235     // 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, 85 if the codes
2236     // are 0 and 1
2237     const VecW lookup = {word_base + common_code_delta * 0x1514111005040100LLU,
2238                          word_base + common_code_delta * 0x5554515045444140LLU};
2239 #  else
2240     const VecW m1 = VCONST_W(kMask5555);
2241     const VecW m2 = VCONST_W(kMask3333);
2242     const VecW vec_base = VCONST_W(word_base);
2243     const VecW vec_delta = VCONST_W(common_code_delta * kMask5555);
2244 #  endif
2245     VecW* genoarr_valias = R_CAST(VecW*, genoarr);
2246     for (uint32_t vidx = 0; vidx != read_vec_ct; ++vidx) {
2247       const VecW cur_vec = vecw_loadu(&(onebit_main_valias[vidx]));
2248       const VecW vec_even = cur_vec & m4;
2249       const VecW vec_odd = vecw_srli(cur_vec, 4) & m4;
2250       VecW vec_lo = vecw_unpacklo8(vec_even, vec_odd);
2251       VecW vec_hi = vecw_unpackhi8(vec_even, vec_odd);
2252 #  ifdef USE_SSE42
2253       vec_lo = vecw_shuffle8(lookup, vec_lo);
2254       vec_hi = vecw_shuffle8(lookup, vec_hi);
2255 #  else
2256       // unpack bytes, then use as mask for vec_add.
2257       vec_lo = (vec_lo | vecw_slli(vec_lo, 2)) & m2;
2258       vec_hi = (vec_hi | vecw_slli(vec_hi, 2)) & m2;
2259       vec_lo = (vec_lo | vecw_slli(vec_lo, 1)) & m1;
2260       vec_hi = (vec_hi | vecw_slli(vec_hi, 1)) & m1;
2261       vec_lo = vec_lo | vecw_slli(vec_lo, 1);
2262       vec_hi = vec_hi | vecw_slli(vec_hi, 1);
2263       vec_lo = vec_base + (vec_delta & vec_lo);
2264       vec_hi = vec_base + (vec_delta & vec_hi);
2265 #  endif
2266       genoarr_valias[2 * vidx] = vec_lo;
2267       genoarr_valias[2 * vidx + 1] = vec_hi;
2268     }
2269     genoarr_widx = read_vec_ct * (2 * kWordsPerVec);
2270   }
2271 #endif
2272   const uint32_t genoarr_widx_trail = (raw_sample_ct + 7) / kBitsPerWordD2;
2273   const uint32_t genoarr_widx_end = NypCtToWordCt(raw_sample_ct);
2274 #  ifdef __arm__
2275 #    error "Unaligned accesses in ParseOnebitUnsafe()."
2276 #  endif
2277   const Halfword* onebit_main_alias = R_CAST(const Halfword*, onebit_main_iter);
2278   for (; ; ++genoarr_widx) {
2279     uintptr_t ww;
2280     if (genoarr_widx >= genoarr_widx_trail) {
2281       // might want to modify to not go here if last read is an entire halfword
2282       if (genoarr_widx == genoarr_widx_end) {
2283         break;
2284       }
2285       ww = ProperSubwordLoad(&(onebit_main_alias[genoarr_widx_trail]), 1 + (((raw_sample_ct - 1) % kBitsPerWordD2) / CHAR_BIT));
2286     } else {
2287       ww = onebit_main_alias[genoarr_widx];
2288     }
2289     // apply middle-out operation
2290     // 64-bit:
2291     //   const uintptr_t middle_out_result = (ww | (ww << 31)) & kMask5555;
2292     // 32-bit:
2293     //   *genoarr_iter++ = word_base + (ww & kMask5555) * common_code_delta;
2294     //   *genoarr_iter++ = word_base + ((ww >> 1) & kMask5555) * common_code_delta;
2295     // (scrapped since the time savings don't seem to be worth the extra
2296     // end-of-vector corner cases, apparently the extra operations here are
2297     // sufficiently cheap, or even negative-cost in AVX2 case)
2298 
2299     ww = UnpackHalfwordToWord(ww);
2300     genoarr[genoarr_widx] = word_base + ww * common_code_delta;
2301   }
2302   return ParseAndApplyDifflist(fread_end, fread_pp, pgrp, genoarr);
2303 }
2304 
2305 // vector-alignment preferred
Parse1or2bitGenoarrUnsafe(const unsigned char * fread_end,uint32_t vrtype,const unsigned char ** fread_pp,PgenReaderMain * pgrp,uintptr_t * __restrict genoarr)2306 PglErr Parse1or2bitGenoarrUnsafe(const unsigned char* fread_end, uint32_t vrtype, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
2307   // Side effect: may use pgrp->workspace_raregeno_tmp_loadbuf.
2308   // Does not update fp_vidx, does not rotate plink1-formatted data (since it's
2309   // better to do that post-subsetting)
2310   if (vrtype & 3) {
2311     return ParseOnebitUnsafe(fread_end, fread_pp, pgrp, genoarr);
2312   }
2313   // uncompressed storage
2314   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2315   const uint32_t genoarr_byte_ct = NypCtToByteCt(raw_sample_ct);
2316   const unsigned char* src_genodata = *fread_pp;
2317   if (PtrAddCk(fread_end, genoarr_byte_ct, fread_pp)) {
2318     return kPglRetMalformedInput;
2319   }
2320   memcpy(genoarr, src_genodata, genoarr_byte_ct);
2321   return kPglRetSuccess;
2322 }
2323 
ParseNonLdGenovecSubsetUnsafe(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vrtype,const unsigned char ** fread_pp,PgenReaderMain * pgrp,uintptr_t * __restrict genovec)2324 PglErr ParseNonLdGenovecSubsetUnsafe(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vrtype, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genovec) {
2325   // Side effects:
2326   //   may use pgrp->workspace_raregeno_tmp_loadbuf
2327   //   fills pgrp->ldbase_raw_genovec iff (!(vrtype & 4)) and
2328   //     subsetting_required (does not update ldbase_stypes, caller's
2329   //     responsibility to care)
2330   // See comments on Parse1or2bitGenoarrUnsafe().
2331   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2332   if (!(vrtype & 4)) {
2333     const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
2334     uintptr_t* raw_genovec = subsetting_required? pgrp->ldbase_raw_genovec : genovec;
2335     PglErr reterr = Parse1or2bitGenoarrUnsafe(fread_end, vrtype, fread_pp, pgrp, raw_genovec);
2336     if ((!subsetting_required) || reterr) {
2337       return reterr;
2338     }
2339     CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
2340     return kPglRetSuccess;
2341   }
2342   const uint32_t vrtype_low2 = vrtype & 3;
2343   if (vrtype_low2 != 1) {
2344     const uint32_t vec_ct = NypCtToVecCt(sample_ct);
2345 
2346     // This memset is frequently the limiting operation.  This suggests that we
2347     // should eventually make more use of the DifflistOrGenovec interface.
2348     vecset(genovec, vrtype_low2 * kMask5555, vec_ct);
2349     return ParseAndApplyDifflistSubset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, fread_pp, pgrp, genovec);
2350   }
2351   // all homozygous-ref special case
2352   ZeroWArr(NypCtToWordCt(sample_ct), genovec);
2353   return kPglRetSuccess;
2354 }
2355 
InitReadPtrs(uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,const unsigned char ** fread_endp)2356 BoolErr InitReadPtrs(uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp) {
2357   const unsigned char* block_base = pgrp->fi.block_base;
2358   if (block_base != nullptr) {
2359     // possible todo: special handling of end of vblock
2360     const uint64_t block_offset = pgrp->fi.block_offset;
2361     *fread_pp = &(block_base[GetPgfiFpos(&(pgrp->fi), vidx) - block_offset]);
2362     *fread_endp = &(block_base[GetPgfiFpos(&(pgrp->fi), vidx + 1) - block_offset]);
2363 
2364     // still a useful hint to LdLoadNecessary()
2365     pgrp->fp_vidx = vidx + 1;
2366 
2367     return 0;
2368   }
2369   if (pgrp->fp_vidx != vidx) {
2370     if (unlikely(fseeko(pgrp->ff, GetPgfiFpos(&(pgrp->fi), vidx), SEEK_SET))) {
2371       return 1;
2372     }
2373   }
2374   const uintptr_t cur_vrec_width = GetPgfiVrecWidth(&(pgrp->fi), vidx);
2375 #ifdef __LP64__
2376   if (unlikely(fread_checked(pgrp->fread_buf, cur_vrec_width, pgrp->ff))) {
2377     return 1;
2378   }
2379 #else
2380   // cur_vrec_width < 2^31 since otherwise we error out on initialization
2381   if (unlikely(!fread_unlocked(pgrp->fread_buf, cur_vrec_width, 1, pgrp->ff))) {
2382     return 1;
2383   }
2384 #endif
2385   *fread_pp = pgrp->fread_buf;
2386   *fread_endp = &(pgrp->fread_buf[cur_vrec_width]);
2387   pgrp->fp_vidx = vidx + 1;
2388   return 0;
2389 }
2390 
uint32_t LdLoadNecessary(uint32_t cur_vidx, PgenReaderMain* pgrp) {
  // Reports (1 = yes, 0 = no) whether the LD base variant must be loaded in
  // addition to the current variant, assuming we need (possibly subsetted)
  // hardcalls.
  // Important: as a side effect, pgrp->ldbase_vidx is updated whenever 1 is
  // returned.
  // bugfix (22 May 2018): this only checked whether ldbase_stypes was nonzero;
  // there was an AllHets + cache-clear edge case where that's not good enough.
  // now that AllHets has been removed, though, it should be safe again.
  if (pgrp->ldbase_stypes && (cur_vidx == pgrp->fp_vidx)) {
    // ldbase variant guaranteed to be up-to-date if we didn't skip the last
    // variant, and cache wasn't cleared
    assert(pgrp->ldbase_stypes & (kfPgrLdcacheNyp | kfPgrLdcacheDifflist | kfPgrLdcacheRawNyp));
    return 0;
  }
  // Find the last vrtypes[] value before vrtypes[cur_vidx] with bit 1 unset or
  // bit 2 set.
  const uint32_t wanted_ldbase_vidx = GetLdbaseVidx(pgrp->fi.vrtypes, cur_vidx);
  if (pgrp->ldbase_vidx == wanted_ldbase_vidx) {
    return 0;
  }
  pgrp->ldbase_vidx = wanted_ldbase_vidx;
  return 1;
}
2415 
2416 // Fills dest with subsetted ldbase contents, and ensures ldcache is filled so
2417 // no explicit reload of ldbase is needed for next variant if we're extracting
2418 // the same sample subset.  (Reload is occasionally needed if next variant is
2419 // multiallelic or phased, we only prevent that when convenient.)
LdLoadAndCopyGenovecSubset(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,uintptr_t * dest)2420 PglErr LdLoadAndCopyGenovecSubset(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uintptr_t* dest) {
2421   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2422   if (LdLoadNecessary(vidx, pgrp)) {
2423     const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
2424     const unsigned char* fread_ptr;
2425     const unsigned char* fread_end;
2426     if (unlikely(InitReadPtrs(ldbase_vidx, pgrp, &fread_ptr, &fread_end))) {
2427       return kPglRetReadFail;
2428     }
2429     const uint32_t vrtype = pgrp->fi.vrtypes[ldbase_vidx];
2430     PglErr reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, dest);
2431     pgrp->ldbase_stypes = ((sample_ct != raw_sample_ct) && (!(vrtype & 4)))? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp) : kfPgrLdcacheNyp;
2432     CopyNyparr(dest, sample_ct, pgrp->ldbase_genovec);
2433     return reterr;
2434   }
2435   if (pgrp->ldbase_stypes & kfPgrLdcacheNyp) {
2436     CopyNyparr(pgrp->ldbase_genovec, sample_ct, dest);
2437   } else {
2438     if ((pgrp->ldbase_stypes & kfPgrLdcacheRawNyp) && (sample_ct == raw_sample_ct)) {
2439       CopyNyparr(pgrp->ldbase_raw_genovec, sample_ct, dest);
2440     } else if (pgrp->ldbase_stypes & kfPgrLdcacheDifflist) {
2441       // rematerialize-from-difflist is cheap.
2442       PgrDifflistToGenovecUnsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3, sample_ct, pgrp->ldbase_difflist_len, dest);
2443     } else {
2444       CopyNyparrNonemptySubset(pgrp->ldbase_raw_genovec, sample_include, pgrp->fi.raw_sample_ct, sample_ct, dest);
2445       CopyNyparr(dest, sample_ct, pgrp->ldbase_genovec);
2446       pgrp->ldbase_stypes |= kfPgrLdcacheNyp;
2447     }
2448   }
2449   return kPglRetSuccess;
2450 }
2451 
2452 // fread_pp should be non-null iff this is being called by an internal function
2453 // as part of a more complex read.
2454 // in multiallelic case:
2455 //   hom-ref = 0
2456 //   het-ref = 1
2457 //   two nonref = 2
2458 //   missing = 3
ReadGenovecSubsetUnsafe(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,const unsigned char ** fread_endp,uintptr_t * __restrict genovec)2459 PglErr ReadGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec) {
2460   // Side effects:
2461   //   may use pgr.workspace_raregeno_tmp_loadbuf (any difflist)
2462   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
2463   const uint32_t maintrack_vrtype = vrtype & 7;
2464   if (VrtypeLdCompressed(maintrack_vrtype)) {
2465     // LD compression
2466     PglErr reterr = LdLoadAndCopyGenovecSubset(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, genovec);
2467     if (unlikely(reterr)) {
2468       return reterr;
2469     }
2470     const unsigned char* fread_ptr;
2471     const unsigned char* fread_end;
2472     if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
2473       return kPglRetReadFail;
2474     }
2475     reterr = ParseAndApplyDifflistSubset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, &fread_ptr, pgrp, genovec);
2476     if (unlikely(reterr)) {
2477       return reterr;
2478     }
2479     if (maintrack_vrtype == 3) {
2480       GenovecInvertUnsafe(sample_ct, genovec);
2481     }
2482     if (fread_pp) {
2483       *fread_pp = fread_ptr;
2484       *fread_endp = fread_end;
2485     }
2486     return kPglRetSuccess;
2487   }
2488   const unsigned char* fread_ptr;
2489   const unsigned char* fread_end = nullptr;  // maybe-uninitialized warning
2490   // tried inserting special-case code for the plink1 case to avoid a copy, and
2491   // it was actually slower
2492   if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
2493     return kPglRetReadFail;
2494   }
2495   // tried to add more sophisticated caching, but turns out it isn't worth it
2496   PglErr reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, maintrack_vrtype, &fread_ptr, pgrp, genovec);
2497   if (unlikely(reterr)) {
2498     return reterr;
2499   }
2500   if (vrtype == kPglVrtypePlink1) {
2501     PgrPlink1ToPlink2InplaceUnsafe(sample_ct, genovec);
2502   } else {
2503     const uint32_t is_ldbase = pgrp->fi.vrtypes && VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
2504     const uint32_t ldbase_raw_genovec_saved = (sample_ct != pgrp->fi.raw_sample_ct) && (!(maintrack_vrtype & 4));
2505     if (is_ldbase) {
2506       CopyNyparr(genovec, sample_ct, pgrp->ldbase_genovec);
2507       pgrp->ldbase_vidx = vidx;
2508       // may be better to just always set to kfPgrLdcacheNyp?  this depends
2509       // on multiallelic code
2510       pgrp->ldbase_stypes = ldbase_raw_genovec_saved? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp) : kfPgrLdcacheNyp;
2511     } else if (ldbase_raw_genovec_saved) {
2512       // bugfix (22 Sep 2018): when accessing variants out of order, need to
2513       // note that we just clobbered the cache
2514       pgrp->ldbase_stypes &= ~kfPgrLdcacheRawNyp;
2515     }
2516   }
2517   if (fread_pp) {
2518     *fread_pp = fread_ptr;
2519     *fread_endp = fread_end;
2520   }
2521   return kPglRetSuccess;
2522 }
2523 
PgrGet(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict genovec)2524 PglErr PgrGet(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec) {
2525   if (!sample_ct) {
2526     return kPglRetSuccess;
2527   }
2528   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
2529   assert(vidx < pgrp->fi.raw_variant_ct);
2530   return ReadGenovecSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
2531 }
2532 
2533 // Fills dest with ldbase contents, and ensures ldcache is filled so no
2534 // explicit reload of ldbase is needed for next variant.
LdLoadAndCopyRawGenovec(uint32_t subsetting_required,uint32_t vidx,PgenReaderMain * pgrp,uintptr_t * dest)2535 PglErr LdLoadAndCopyRawGenovec(uint32_t subsetting_required, uint32_t vidx, PgenReaderMain* pgrp, uintptr_t* dest) {
2536   const uint32_t genovec_byte_ct = NypCtToVecCt(pgrp->fi.raw_sample_ct) * kBytesPerVec;
2537   if (LdLoadNecessary(vidx, pgrp) || (subsetting_required && (!(pgrp->ldbase_stypes & kfPgrLdcacheRawNyp)))) {
2538     const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
2539     const unsigned char* fread_ptr;
2540     const unsigned char* fread_end;
2541     if (unlikely(InitReadPtrs(ldbase_vidx, pgrp, &fread_ptr, &fread_end))) {
2542       return kPglRetReadFail;
2543     }
2544     const uint32_t vrtype = pgrp->fi.vrtypes[ldbase_vidx];
2545     pgrp->ldbase_stypes = kfPgrLdcacheRawNyp;
2546     assert((vrtype & 7) != 5); // all-hom-ref can't be ldbase
2547     uintptr_t* raw_genovec = pgrp->ldbase_raw_genovec;
2548     PglErr reterr;
2549     if (!(vrtype & 4)) {
2550       reterr = Parse1or2bitGenoarrUnsafe(fread_end, vrtype, &fread_ptr, pgrp, raw_genovec);
2551     } else {
2552       const uint32_t vrtype_low2 = vrtype & 3;
2553       vecset(raw_genovec, vrtype_low2 * kMask5555, DivUp(genovec_byte_ct, kBytesPerVec));
2554       reterr = ParseAndApplyDifflist(fread_end, &fread_ptr, pgrp, raw_genovec);
2555     }
2556     memcpy(dest, raw_genovec, genovec_byte_ct);
2557     return reterr;
2558   }
2559   if (pgrp->ldbase_stypes & kfPgrLdcacheRawNyp) {
2560     memcpy(dest, pgrp->ldbase_raw_genovec, genovec_byte_ct);
2561   } else {
2562     // no subsetting, can use regular Ldcache entries
2563     const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2564     if (pgrp->ldbase_stypes & kfPgrLdcacheNyp) {
2565       memcpy(dest, pgrp->ldbase_genovec, genovec_byte_ct);
2566     } else {
2567       PgrDifflistToGenovecUnsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3, raw_sample_ct, pgrp->ldbase_difflist_len, dest);
2568     }
2569   }
2570   return kPglRetSuccess;
2571 }
2572 
2573 // Does not zero out trailing bits.
2574 // Requires fread_pp and fread_endp to be non-null for now.
ReadRawGenovec(uint32_t subsetting_required,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,const unsigned char ** fread_endp,uintptr_t * raw_genovec)2575 PglErr ReadRawGenovec(uint32_t subsetting_required, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* raw_genovec) {
2576   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
2577   const uint32_t maintrack_vrtype = vrtype & 7;
2578   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2579   if (VrtypeLdCompressed(maintrack_vrtype)) {
2580     // LD compression
2581     PglErr reterr = LdLoadAndCopyRawGenovec(subsetting_required, vidx, pgrp, raw_genovec);
2582     if (unlikely(reterr)) {
2583       return reterr;
2584     }
2585     if (unlikely(InitReadPtrs(vidx, pgrp, fread_pp, fread_endp))) {
2586       return kPglRetReadFail;
2587     }
2588     reterr = ParseAndApplyDifflist(*fread_endp, fread_pp, pgrp, raw_genovec);
2589     if (unlikely(reterr)) {
2590       return reterr;
2591     }
2592     if (maintrack_vrtype == 3) {
2593       GenovecInvertUnsafe(raw_sample_ct, raw_genovec);
2594     }
2595     return kPglRetSuccess;
2596   }
2597   if (unlikely(InitReadPtrs(vidx, pgrp, fread_pp, fread_endp))) {
2598     return kPglRetReadFail;
2599   }
2600   const unsigned char* fread_end = *fread_endp;
2601   PglErr reterr;
2602   if (!(vrtype & 4)) {
2603     reterr = Parse1or2bitGenoarrUnsafe(fread_end, vrtype, fread_pp, pgrp, raw_genovec);
2604   } else {
2605     const uint32_t vrtype_low2 = vrtype & 3;
2606     if (vrtype_low2 == 1) {
2607       ZeroWArr(NypCtToWordCt(raw_sample_ct), raw_genovec);
2608       // all-hom-ref can't be ldbase
2609       return kPglRetSuccess;
2610     }
2611     const uint32_t vec_ct = NypCtToVecCt(raw_sample_ct);
2612     vecset(raw_genovec, vrtype_low2 * kMask5555, vec_ct);
2613     reterr = ParseAndApplyDifflist(fread_end, fread_pp, pgrp, raw_genovec);
2614   }
2615   if (vrtype == kPglVrtypePlink1) {
2616     PgrPlink1ToPlink2InplaceUnsafe(raw_sample_ct, raw_genovec);
2617   } else {
2618     const uint32_t is_ldbase = pgrp->fi.vrtypes && VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
2619     if (is_ldbase) {
2620       CopyNyparr(raw_genovec, raw_sample_ct, pgrp->ldbase_raw_genovec);
2621       pgrp->ldbase_vidx = vidx;
2622       pgrp->ldbase_stypes = kfPgrLdcacheRawNyp;
2623     }
2624   }
2625   return reterr;
2626 }
2627 /*
2628 void CopyAndSubsetDifflist(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_raregeno, const uint32_t* __restrict raw_difflist_sample_ids, uint32_t raw_difflist_len, uintptr_t* __restrict new_raregeno, uint32_t* __restrict new_difflist_sample_ids, uint32_t* __restrict new_difflist_len_ptr) {
2629   // Trailing bits of new_raregeno are zeroed out.
2630   if (!raw_difflist_len) {
2631     *new_difflist_len_ptr = 0;
2632     return;
2633   }
2634   const uintptr_t* raw_raregeno_incr = raw_raregeno;
2635   const uint32_t* raw_difflist_sample_ids_iter = raw_difflist_sample_ids;
2636   const uint32_t* raw_difflist_sample_ids_last = &(raw_difflist_sample_ids[RoundDownPow2(raw_difflist_len - 1, kBitsPerWordD2)]);
2637   uintptr_t* new_raregeno_incr = new_raregeno;
2638   uintptr_t new_raregeno_word = 0;
2639   uint32_t new_difflist_len = 0;
2640   uint32_t block_len_m1 = kBitsPerWordD2 - 1;
2641   while (1) {
2642     if (raw_difflist_sample_ids_iter >= raw_difflist_sample_ids_last) {
2643       if (raw_difflist_sample_ids_iter > raw_difflist_sample_ids_last) {
2644         if (new_difflist_len % kBitsPerWordD2) {
2645           *new_raregeno_incr = new_raregeno_word;
2646         }
2647         *new_difflist_len_ptr = new_difflist_len;
2648         return;
2649       }
2650       block_len_m1 &= raw_difflist_len - 1;
2651     }
2652     uintptr_t raw_raregeno_word = *raw_raregeno_incr++;
2653     uint32_t raw_difflist_idx_lowbits = 0;
2654     while (1) {
2655       const uint32_t raw_sample_idx = raw_difflist_sample_ids_iter[raw_difflist_idx_lowbits];
2656       if (IsSet(sample_include, raw_sample_idx)) {
2657         new_difflist_sample_ids[new_difflist_len] = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx);
2658         new_raregeno_word |= ((raw_raregeno_word >> (2 * raw_difflist_idx_lowbits)) & 3) << (2 * (new_difflist_len % kBitsPerWordD2));
2659         ++new_difflist_len;
2660         if (!(new_difflist_len % kBitsPerWordD2)) {
2661           *new_raregeno_incr++ = new_raregeno_word;
2662           new_raregeno_word = 0;
2663         }
2664       }
2665       if (raw_difflist_idx_lowbits == block_len_m1) {
2666         break;
2667       }
2668       ++raw_difflist_idx_lowbits;
2669     }
2670     raw_difflist_sample_ids_iter = &(raw_difflist_sample_ids_iter[kBitsPerWordD2]);
2671   }
2672 }
2673 */
2674 
// Populates pgrp->ldbase_genovec or
// pgrp->ldbase_{raregeno,difflist_sample_ids,difflist_len}, depending on
// storage type.
// Currently just called by ReadDifflistOrGenovecSubsetUnsafe(), which isn't
// exploited by plink2 yet.
//
// Returns kPglRetSuccess immediately when LdLoadNecessary() reports that no
// reload is needed.  Otherwise returns kPglRetReadFail on an fseeko()/fread
// failure, or the record parser's return code.  pgrp->ldbase_stypes is set to
// describe whichever cache representation was filled.
PglErr LdLoadMinimalSubsetIfNecessary(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp) {
  if (!LdLoadNecessary(vidx, pgrp)) {
    return kPglRetSuccess;
  }
  // NOTE(review): ldbase_vidx is read only after LdLoadNecessary();
  // presumably that call updates it to the LD base of vidx -- confirm.
  const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
  const uint64_t cur_vidx_fpos = pgrp->fi.var_fpos[ldbase_vidx];
  const uint32_t ldbase_vrtype = pgrp->fi.vrtypes[ldbase_vidx];
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  // With subsetting, genovec-type records are parsed into ldbase_raw_genovec
  // and subsetted into ldbase_genovec afterward; without it, they are parsed
  // straight into ldbase_genovec.
  uintptr_t* raw_genovec = subsetting_required? pgrp->ldbase_raw_genovec : pgrp->ldbase_genovec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  const unsigned char* block_base = pgrp->fi.block_base;
  PglErr reterr = kPglRetSuccess;
  if (block_base != nullptr) {
    // In-memory block: point directly at the record's bytes.
    {
      const uint64_t block_offset = pgrp->fi.block_offset;
      fread_ptr = &(block_base[cur_vidx_fpos - block_offset]);
      fread_end = &(block_base[pgrp->fi.var_fpos[ldbase_vidx + 1] - block_offset]);
    }
    if (!(ldbase_vrtype & 4)) {
      reterr = Parse1or2bitGenoarrUnsafe(fread_end, ldbase_vrtype, &fread_ptr, pgrp, raw_genovec);
    // Shared tail for every genovec-type (non-difflist) LD base.  The
    // file-reading branch below jumps here too, with reterr and raw_genovec
    // already set.
    LdLoadMinimalSubsetIfNecessary_genovec_finish:
      pgrp->ldbase_stypes = subsetting_required? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp) : kfPgrLdcacheNyp;
      if ((!subsetting_required) || reterr) {
        return reterr;
      }
      CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, pgrp->ldbase_genovec);
      return kPglRetSuccess;
    }
    pgrp->fp_vidx = ldbase_vidx + 1;
  } else {
    // No in-memory block: seek to the record and read it through pgrp->ff.
    if (unlikely(fseeko(pgrp->ff, pgrp->fi.var_fpos[ldbase_vidx], SEEK_SET))) {
      return kPglRetReadFail;
    }
    const uintptr_t cur_vrec_width = pgrp->fi.var_fpos[ldbase_vidx + 1] - cur_vidx_fpos;
    pgrp->fp_vidx = ldbase_vidx + 1;
    if (!(ldbase_vrtype & 7)) {
      // don't actually need to fread the whole record in this case
      const uint32_t raw_sample_ct4 = NypCtToByteCt(raw_sample_ct);
      if (unlikely(!fread_unlocked(raw_genovec, raw_sample_ct4, 1, pgrp->ff))) {
        return kPglRetReadFail;
      }
      if (raw_sample_ct4 != cur_vrec_width) {
        // ensure this doesn't match
        // (only part of the record was consumed, so the file position is not
        // at the start of variant ldbase_vidx + 1)
        pgrp->fp_vidx = 0;
      }
      goto LdLoadMinimalSubsetIfNecessary_genovec_finish;
    }
    if (unlikely(!fread_unlocked(pgrp->fread_buf, cur_vrec_width, 1, pgrp->ff))) {
      return kPglRetReadFail;
    }
    fread_ptr = pgrp->fread_buf;
    fread_end = &(pgrp->fread_buf[cur_vrec_width]);
    if (!(ldbase_vrtype & 4)) {
      // The low-3-bits == 0 case was handled above, so the remaining
      // non-difflist records are parsed as 1-bit.
      reterr = ParseOnebitUnsafe(fread_end, &fread_ptr, pgrp, raw_genovec);
      goto LdLoadMinimalSubsetIfNecessary_genovec_finish;
    }
  }
  // Difflist-type LD base: keep it in difflist form, subsetting while
  // parsing if necessary.
  uint32_t ldbase_difflist_len;
  if (!subsetting_required) {
    reterr = ParseAndSaveDifflist(fread_end, raw_sample_ct, &fread_ptr, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, &ldbase_difflist_len);
  } else {
    reterr = ParseAndSaveDifflistProperSubset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_sample_ct, &fread_ptr, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, &ldbase_difflist_len, pgrp->workspace_raregeno_tmp_loadbuf);
  }
  if (unlikely(reterr)) {
    return reterr;
  }
  pgrp->ldbase_difflist_len = ldbase_difflist_len;
  // Terminate the saved ID list with sample_ct as a one-past-the-end entry.
  pgrp->ldbase_difflist_sample_ids[ldbase_difflist_len] = sample_ct;
  pgrp->ldbase_stypes = kfPgrLdcacheDifflist;
  return kPglRetSuccess;
}
2753 
ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t max_simple_difflist_len,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,const unsigned char ** fread_endp,uintptr_t * __restrict genovec,uint32_t * difflist_common_geno_ptr,uintptr_t * __restrict main_raregeno,uint32_t * __restrict difflist_sample_ids,uint32_t * __restrict difflist_len_ptr)2754 PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
2755   assert(vidx < pgrp->fi.raw_variant_ct);
2756   assert(sample_ct);
2757   assert(max_simple_difflist_len < sample_ct);
2758   // Side effects:
2759   //   may use pgr.workspace_raregeno_tmp_loadbuf
2760   // Trailing bits of genovec/main_raregeno may not be zeroed out.
2761   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
2762   const uint32_t maintrack_vrtype = vrtype & 7;
2763   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2764   const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
2765   // const uint32_t multiallelic_hc_present = fread_pp && VrtypeMultiallelic(vrtype);
2766   if (VrtypeLdCompressed(maintrack_vrtype)) {
2767     // LD compression
2768 
2769     // note that this can currently load a difflist longer than
2770     // max_simple_difflist_len
2771     PglErr reterr = LdLoadMinimalSubsetIfNecessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
2772     if (unlikely(reterr)) {
2773       return reterr;
2774     }
2775     const unsigned char* fread_ptr;
2776     const unsigned char* fread_end;
2777     if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
2778       return kPglRetReadFail;
2779     }
2780     const uint32_t ld_invert = (maintrack_vrtype == 3);
2781     if (pgrp->ldbase_stypes & kfPgrLdcacheDifflist) {
2782       const uint32_t ldbase_common_geno = pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3;
2783       // unnecessary for this to branch on LD difflist length, since that's
2784       // limited to 3/4 of the ldbase difflist length.
2785       *difflist_common_geno_ptr = ldbase_common_geno;
2786       reterr = ParseLdAndMergeDifflistSubset(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->ldbase_difflist_len, ldbase_common_geno, raw_sample_ct, sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, pgrp->workspace_raregeno_tmp_loadbuf);
2787       if (unlikely(reterr)) {
2788         return reterr;
2789       }
2790       if (ld_invert) {
2791         *difflist_common_geno_ptr = (6 - ldbase_common_geno) & 3;
2792         GenovecInvertUnsafe(*difflist_len_ptr, main_raregeno);
2793       }
2794       return kPglRetSuccess;
2795     }
2796     if (pgrp->ldbase_stypes & kfPgrLdcacheNyp) {
2797       CopyNyparr(pgrp->ldbase_genovec, sample_ct, genovec);
2798     } else {
2799       assert(pgrp->ldbase_stypes & kfPgrLdcacheRawNyp);
2800       CopyNyparrNonemptySubset(pgrp->ldbase_raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
2801       CopyNyparr(genovec, sample_ct, pgrp->ldbase_genovec);
2802       pgrp->ldbase_stypes |= kfPgrLdcacheNyp;
2803     }
2804     *difflist_common_geno_ptr = UINT32_MAX;
2805     reterr = ParseAndApplyDifflistSubset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, &fread_ptr, pgrp, genovec);
2806     if (unlikely(reterr)) {
2807       return reterr;
2808     }
2809     if (ld_invert) {
2810       GenovecInvertUnsafe(sample_ct, genovec);
2811     }
2812     if (fread_pp) {
2813       *fread_pp = fread_ptr;
2814       *fread_endp = fread_end;
2815     }
2816     return kPglRetSuccess;
2817   }
2818   const unsigned char* fread_ptr;
2819   const unsigned char* fread_end = nullptr;  // maybe-uninitialized warning
2820   if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
2821     return kPglRetReadFail;
2822   }
2823   const uint32_t is_ldbase = pgrp->fi.vrtypes && VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
2824   const uint32_t saved_difflist_len = VrtypeDifflist(vrtype)? PeekVint31(fread_ptr, fread_end) : raw_sample_ct;
2825   pgrp->ldbase_vidx = vidx;
2826   // no limit is slightly better than /16 but substantially worse than /32 on
2827   // the large test dataset (/64 is slightly worse than /32)
2828   // no limit is best on the small test dataset
2829   if (saved_difflist_len > max_simple_difflist_len) {
2830     *difflist_common_geno_ptr = UINT32_MAX;
2831     PglErr reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, genovec);
2832     if (unlikely(reterr)) {
2833       return reterr;
2834     }
2835     const uint32_t ldbase_raw_genovec_saved = (subsetting_required && (!(vrtype & 4)));
2836     if (is_ldbase) {
2837       CopyNyparr(genovec, sample_ct, pgrp->ldbase_genovec);
2838       pgrp->ldbase_stypes = ldbase_raw_genovec_saved? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp) : kfPgrLdcacheNyp;
2839     } else if (ldbase_raw_genovec_saved) {
2840       // bugfix (22 Sep 2018)
2841       pgrp->ldbase_stypes &= ~kfPgrLdcacheRawNyp;
2842     }
2843     if (vrtype == kPglVrtypePlink1) {
2844       PgrPlink1ToPlink2InplaceUnsafe(sample_ct, genovec);
2845     }
2846     if (fread_pp) {
2847       *fread_pp = fread_ptr;
2848       *fread_endp = fread_end;
2849     }
2850     return kPglRetSuccess;
2851   }
2852   *difflist_common_geno_ptr = vrtype & 3;
2853   PglErr reterr;
2854   if (!subsetting_required) {
2855     reterr = ParseAndSaveDifflist(fread_end, raw_sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
2856   } else {
2857     reterr = ParseAndSaveDifflistProperSubset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, pgrp->workspace_raregeno_tmp_loadbuf);
2858   }
2859   if (unlikely(reterr)) {
2860     return kPglRetMalformedInput;
2861   }
2862   if (is_ldbase) {
2863     const uint32_t difflist_len = *difflist_len_ptr;
2864     pgrp->ldbase_stypes = kfPgrLdcacheDifflist;
2865     pgrp->ldbase_difflist_len = difflist_len;
2866     CopyNyparr(main_raregeno, difflist_len, pgrp->ldbase_raregeno);
2867     memcpy(pgrp->ldbase_difflist_sample_ids, difflist_sample_ids, difflist_len * sizeof(int32_t));
2868     pgrp->ldbase_difflist_sample_ids[difflist_len] = sample_ct;
2869   }
2870   if (fread_pp) {
2871     *fread_pp = fread_ptr;
2872     *fread_endp = fread_end;
2873   }
2874   return kPglRetSuccess;
2875 }
2876 
PgrGetDifflistOrGenovec(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t max_simple_difflist_len,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict genovec,uint32_t * difflist_common_geno_ptr,uintptr_t * __restrict main_raregeno,uint32_t * __restrict difflist_sample_ids,uint32_t * __restrict difflist_len_ptr)2877 PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
2878   if (!sample_ct) {
2879     *difflist_common_geno_ptr = UINT32_MAX;
2880     return kPglRetSuccess;
2881   }
2882   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
2883   assert(vidx < pgrp->fi.raw_variant_ct);
2884   return ReadDifflistOrGenovecSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, max_simple_difflist_len, vidx, pgrp, nullptr, nullptr, genovec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
2885 }
2886 
// Updates genocounts[] (already holding the LD reference variant's counts)
// to reflect the difflist stored at *fread_pp, without materializing the
// patched genotype vector.
// Returns the ParseDifflistHeader() error if there is one, kPglRetSuccess
// (genocounts untouched) for an empty difflist, and kPglRetMalformedInput
// when a decoded sample index fails the raw_sample_ct bounds check.
PglErr LdSubsetAdjustGenocounts(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict ldbase_genovec, uint32_t raw_sample_ct, const unsigned char** fread_pp, STD_ARRAY_REF(uint32_t, 4) genocounts, uintptr_t* __restrict raregeno_workspace) {
  // * sample_include assumed to be nullptr if no subsetting required
  // * Assumes genocounts[] is initialized to the proper values for the LD
  //   reference variant (including subsetting).
  // * Tried a hybrid implementation which allowed the base variant to be saved
  //   as a difflist; turns out it's practically always better to unpack to a
  //   genovec first.
  // * There are two modes:
  //   1. If sample_include is nullptr, we're not selecting a sample subset.
  //   2. If sample_include and sample_include_cumulative_popcounts are both
  //      non-null, we're computing counts over a sample subset, and
  //      ldbase_genovec is assumed to be subsetted.
  //   Experimented with a third mode where ldbase_genovec was replaced with
  //   ldbase_raw_genovec in the subsetted case, but that didn't seem to pay
  //   off.
  // * This is the main frequency-counting bottleneck.
  uint32_t raw_difflist_len;
  const unsigned char* group_info_iter;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &raw_difflist_len);
  if (reterr || (!raw_difflist_len)) {
    return reterr;
  }
  // Each raregeno word covers one subgroup of kBitsPerWordD2 entries.
  const uint32_t subgroup_idx_last = (raw_difflist_len - 1) / kBitsPerWordD2;
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
  uintptr_t raw_sample_idx = 0;
  // delta_counts[4 * new_geno + old_geno] tallies the difflist entries that
  // replace LD-base genotype old_geno with new_geno.
  STD_ARRAY_DECL(uint32_t, 16, delta_counts);
  STD_ARRAY_FILL0(delta_counts);
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        // All entries processed.  For g in {0,1,2}, incr<g> is (entries
        // changing to g) minus (entries changing away from g); genotype 3
        // absorbs the remainder so the total count is unchanged.
        const int32_t incr0 = delta_counts[1] + delta_counts[2] + delta_counts[3] - delta_counts[4] - delta_counts[8] - delta_counts[12];
        const int32_t incr1 = delta_counts[4] + delta_counts[6] + delta_counts[7] - delta_counts[1] - delta_counts[9] - delta_counts[13];
        const int32_t incr2 = delta_counts[8] + delta_counts[9] + delta_counts[11] - delta_counts[2] - delta_counts[6] - delta_counts[14];
        genocounts[0] += incr0;
        genocounts[1] += incr1;
        genocounts[2] += incr2;
        genocounts[3] -= incr0 + incr1 + incr2;
        return kPglRetSuccess;
      }
      // Final subgroup may be partial.
      remaining_deltas_in_subgroup &= raw_difflist_len - 1;
    }
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
      // Start of a new group: its first sample ID is stored explicitly in
      // the group-info table instead of as a delta.
#ifdef __LP64__
      // 64-bit builds validate the running index only here, i.e. once per
      // group, before it is overwritten.
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    uintptr_t cur_raregeno_word = *raregeno_workspace_iter++;
    if (!sample_include) {
      for (; ; --remaining_deltas_in_subgroup) {
#ifndef __LP64__
        // 32-bit builds validate every decoded index.
        if (unlikely(raw_sample_idx >= raw_sample_ct)) {
          return kPglRetMalformedInput;
        }
#endif
        const uintptr_t cur_geno = cur_raregeno_word & 3;
        delta_counts[cur_geno * 4 + GetNyparrEntry(ldbase_genovec, raw_sample_idx)] += 1;
        if (!remaining_deltas_in_subgroup) {
          break;
        }
        raw_sample_idx += GetVint31(fread_end, fread_pp);
        cur_raregeno_word >>= 2;
      }
    } else {
      for (; ; --remaining_deltas_in_subgroup) {
#ifndef __LP64__
        if (unlikely(raw_sample_idx >= raw_sample_ct)) {
          return kPglRetMalformedInput;
        }
#endif
        // Entries for excluded samples are skipped; included ones are looked
        // up at their subsetted position in ldbase_genovec.
        if (IsSet(sample_include, raw_sample_idx)) {
          const uintptr_t cur_geno = cur_raregeno_word & 3;
          const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx);
          delta_counts[cur_geno * 4 + GetNyparrEntry(ldbase_genovec, sample_idx)] += 1;
        }
        if (!remaining_deltas_in_subgroup) {
          break;
        }
        raw_sample_idx += GetVint31(fread_end, fread_pp);
        cur_raregeno_word >>= 2;
      }
    }
  }
}
2978 
SkipDeltalistIds(const unsigned char * fread_end,const unsigned char * group_info,uint32_t difflist_len,uint32_t raw_sample_ct,uint32_t has_genotypes,const unsigned char ** fread_pp)2979 PglErr SkipDeltalistIds(const unsigned char* fread_end, const unsigned char* group_info, uint32_t difflist_len, uint32_t raw_sample_ct, uint32_t has_genotypes, const unsigned char** fread_pp) {
2980   assert(difflist_len);
2981   // fread_pp is a pure output parameter here
2982   const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize);
2983   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
2984   const unsigned char* extra_byte_cts = &(group_info[group_ct * sample_id_byte_ct]);
2985   const uint32_t extra_byte_tot = BytesumArr(extra_byte_cts, group_ct - 1);
2986 
2987   // (group_ct - 1) for extra_byte_cts
2988   // (difflist_len + 3) / 4 for raregeno
2989   // (group_ct - 1) * (kPglDifflistGroupSize - 1) + extra_byte_tot for
2990   //   all but last ID block
2991   // total = (group_ct - 1) * kPglDifflistGroupSize + extra_byte_tot +
2992   //         (difflist_len + 3) / 4
2993 #ifdef __arm__
2994 #  error "Unaligned accesses in SkipDeltalistIds()."
2995 #endif
2996   const unsigned char* iddiff_start = &(extra_byte_cts[(group_ct - 1) * kPglDifflistGroupSize + extra_byte_tot]);
2997   if (has_genotypes) {
2998     iddiff_start = &(iddiff_start[NypCtToByteCt(difflist_len)]);
2999   }
3000   const uintptr_t* fread_alias = R_CAST(const uintptr_t*, iddiff_start);
3001   const uintptr_t* fread_alias_stop = R_CAST(const uintptr_t*, &(fread_end[-S_CAST(int32_t, kBytesPerWord)]));
3002   uint32_t remaining_id_ct = (difflist_len - 1) % kPglDifflistGroupSize;
3003 #ifdef __LP64__
3004   while (remaining_id_ct >= kBytesPerVec) {
3005     if (unlikely(fread_alias > fread_alias_stop)) {
3006       return kPglRetMalformedInput;
3007     }
3008     const VecW vv = vecw_loadu(R_CAST(const VecW*, fread_alias));
3009     fread_alias = &(fread_alias[kWordsPerVec]);
3010     const uint32_t highbits = vecw_movemask(vv);
3011     remaining_id_ct -= kBytesPerVec - PopcountVec8thUint(highbits);
3012   }
3013 #endif
3014   while (remaining_id_ct >= kBytesPerWord) {
3015     // scan a word at a time, count number of high bits set
3016     if (unlikely(fread_alias > fread_alias_stop)) {
3017       return kPglRetMalformedInput;
3018     }
3019 #ifdef USE_SSE42
3020     const uintptr_t ww = (*fread_alias++) & (0x80 * kMask0101);
3021     remaining_id_ct -= kBytesPerWord - PopcountWord(ww);
3022 #else
3023     const uintptr_t ww = ((*fread_alias++) >> 7) & kMask0101;
3024     remaining_id_ct -= kBytesPerWord - ((ww * kMask0101) >> (kBitsPerWord - 8));
3025 #endif
3026   }
3027   const unsigned char* fread_ptr = R_CAST(const unsigned char*, fread_alias);
3028   if (!remaining_id_ct) {
3029     *fread_pp = fread_ptr;
3030     return kPglRetSuccess;
3031   }
3032   --remaining_id_ct;
3033   while (likely(fread_ptr < fread_end)) {
3034     if ((*fread_ptr++) <= 127) {
3035       if (!remaining_id_ct) {
3036         *fread_pp = fread_ptr;
3037         return kPglRetSuccess;
3038       }
3039       --remaining_id_ct;
3040     }
3041   }
3042   return kPglRetMalformedInput;
3043 }
3044 
// Computes genotype counts for a difflist-encoded record at *fread_pp
// without building a genotype vector.
// genocounts[] is zeroed first.  Every sample not named by the difflist (or
// every sample, when the list is empty or its header fails to parse) is
// attributed to common_geno.
// Returns the ParseDifflistHeader() / SkipDeltalistIds() error if there is
// one, and kPglRetMalformedInput when a decoded sample index fails the
// raw_sample_ct bounds check.
PglErr CountparseDifflistSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t common_geno, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, STD_ARRAY_REF(uint32_t, 4) genocounts, uintptr_t* __restrict raregeno_workspace) {
  const unsigned char* group_info_iter;
  uint32_t difflist_len;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &difflist_len);
  STD_ARRAY_REF_FILL0(4, genocounts);
  if (reterr || (!difflist_len)) {
    genocounts[common_geno] = sample_ct;
    return reterr;
  }
  if (raw_sample_ct == sample_ct) {
    // No subsetting: the sample IDs are irrelevant, so just tally the packed
    // rare genotypes and skip past the ID section.
    ZeroTrailingNyps(difflist_len, raregeno_workspace);
    GenoarrCountFreqsUnsafe(raregeno_workspace, difflist_len, genocounts);
    genocounts[common_geno] = sample_ct - difflist_len;
    // bugfix (26 Mar 2019): forgot to advance fread_pp
    return SkipDeltalistIds(fread_end, group_info_iter, difflist_len, raw_sample_ct, 1, fread_pp);
  }
  // Subsetted: decode each sample ID and count only the included samples.
  // Each raregeno word covers one subgroup of kBitsPerWordD2 entries.
  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
  uintptr_t raw_sample_idx = 0;
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        // All entries processed: whatever wasn't counted above belongs to
        // the common genotype.
        genocounts[common_geno] = sample_ct - genocounts[0] - genocounts[1] - genocounts[2] - genocounts[3];
        return kPglRetSuccess;
      }
      // Final subgroup may be partial.
      remaining_deltas_in_subgroup &= difflist_len - 1;
    }
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
      // Start of a new group: its first sample ID is stored explicitly in
      // the group-info table instead of as a delta.
#ifdef __LP64__
      // 64-bit builds validate the running index only here, i.e. once per
      // group, before it is overwritten.
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    uintptr_t cur_raregeno_word = *raregeno_workspace_iter++;
    for (; ; --remaining_deltas_in_subgroup) {
#ifndef __LP64__
      // 32-bit builds validate every decoded index.
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      if (IsSet(sample_include, raw_sample_idx)) {
        const uintptr_t cur_geno = cur_raregeno_word & 3;
        genocounts[cur_geno] += 1;
      }
      if (!remaining_deltas_in_subgroup) {
        break;
      }
      raw_sample_idx += GetVint31(fread_end, fread_pp);
      cur_raregeno_word >>= 2;
    }
  }
}
3104 
3105 // 1-bit, unsubsetted: count 1-bit array, then count raregeno
3106 // 1-bit, subsetted: count [1-bit array AND sample_include], iterate through
3107 //   difflist
3108 PglErr CountparseOnebitSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, STD_ARRAY_REF(uint32_t, 4) genocounts, uintptr_t* __restrict raregeno_workspace) {
3109   const uint32_t initial_bitarray_byte_ct = DivUp(raw_sample_ct, CHAR_BIT);
3110   const unsigned char* onebit_main_iter = *fread_pp;
3111   if (PtrAddCk(fread_end, initial_bitarray_byte_ct + 1, fread_pp)) {
3112     return kPglRetMalformedInput;
3113   }
3114   const uint32_t common2_code = *onebit_main_iter++;
3115   const uint32_t geno_code_low = common2_code / 4;
3116   const uint32_t geno_code_high = (common2_code & 3) + geno_code_low;
3117 #ifdef __arm__
3118 #  error "Unaligned accesses in CountparseOnebitSubset()."
3119 #endif
3120   uint32_t high_geno_ct;
3121   if (raw_sample_ct == sample_ct) {
3122     high_geno_ct = PopcountBytes(onebit_main_iter, initial_bitarray_byte_ct);
3123   } else {
3124     high_geno_ct = PopcountBytesMasked(onebit_main_iter, sample_include, initial_bitarray_byte_ct);
3125   }
3126   const unsigned char* group_info_iter;
3127   uint32_t difflist_len;
3128   PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &difflist_len);
3129   STD_ARRAY_REF_FILL0(4, genocounts);
3130   if (reterr || (!difflist_len)) {
3131     genocounts[geno_code_low] = sample_ct - high_geno_ct;
3132     genocounts[geno_code_high] = high_geno_ct;
3133     return reterr;
3134   }
3135   if (raw_sample_ct == sample_ct) {
3136     ZeroTrailingNyps(difflist_len, raregeno_workspace);
3137     GenoarrCountFreqsUnsafe(raregeno_workspace, difflist_len, genocounts);
3138     genocounts[geno_code_low] = sample_ct - difflist_len - high_geno_ct;
3139     genocounts[geno_code_high] = high_geno_ct;
3140     // bugfix (26 Mar 2019): forgot to advance fread_pp
3141     return SkipDeltalistIds(fread_end, group_info_iter, difflist_len, raw_sample_ct, 1, fread_pp);
3142   }
3143   const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
3144   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
3145   const uintptr_t* onebitarr = R_CAST(const uintptr_t*, onebit_main_iter);
3146   uintptr_t* raregeno_workspace_iter = raregeno_workspace;
3147   uintptr_t raw_sample_idx = 0;
3148   for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
3149     uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
3150     if (subgroup_idx >= subgroup_idx_last) {
3151       if (subgroup_idx > subgroup_idx_last) {
3152         // avoid read-after-write dependency?
3153         genocounts[geno_code_low] = sample_ct - high_geno_ct - genocounts[0] - genocounts[1] - genocounts[2] - genocounts[3];
3154         genocounts[geno_code_high] = high_geno_ct;
3155         return kPglRetSuccess;
3156       }
3157       remaining_deltas_in_subgroup &= difflist_len - 1;
3158     }
3159     if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
3160 #ifdef __LP64__
3161       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
3162         return kPglRetMalformedInput;
3163       }
3164 #endif
3165       raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
3166       group_info_iter = &(group_info_iter[sample_id_byte_ct]);
3167     } else {
3168       raw_sample_idx += GetVint31(fread_end, fread_pp);
3169     }
3170     uintptr_t cur_raregeno_word = *raregeno_workspace_iter++;
3171     for (; ; --remaining_deltas_in_subgroup) {
3172 #ifndef __LP64__
3173       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
3174         return kPglRetMalformedInput;
3175       }
3176 #endif
3177       if (IsSet(sample_include, raw_sample_idx)) {
3178         const uintptr_t cur_geno = cur_raregeno_word & 3;
3179         genocounts[cur_geno] += 1;
3180         high_geno_ct -= IsSet(onebitarr, raw_sample_idx);
3181       }
3182       if (!remaining_deltas_in_subgroup) {
3183         break;
3184       }
3185       raw_sample_idx += GetVint31(fread_end, fread_pp);
3186       cur_raregeno_word >>= 2;
3187     }
3188   }
3189 }
3190 
3191 // loads ldbase variant if necessary, guarantees pgrp->ldbase_genovec is filled
3192 // on return
3193 // only called by GetBasicGenotypeCounts(), usually LdLoadAndCopy... is better
LdLoadGenovecSubsetIfNecessary(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp)3194 PglErr LdLoadGenovecSubsetIfNecessary(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp) {
3195   if (LdLoadNecessary(vidx, pgrp)) {
3196     const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
3197     const unsigned char* fread_ptr;
3198     const unsigned char* fread_end;
3199     if (unlikely(InitReadPtrs(ldbase_vidx, pgrp, &fread_ptr, &fread_end))) {
3200       return kPglRetReadFail;
3201     }
3202     const uint32_t vrtype = pgrp->fi.vrtypes[ldbase_vidx];
3203     // bugfix (6 Mar 2019): ldbase_raw_genovec is only filled in (!difflist) &&
3204     //   subsetting_required case; (!difflist) isn't enough
3205     pgrp->ldbase_stypes = ((vrtype & 4) || (sample_ct == pgrp->fi.raw_sample_ct))? kfPgrLdcacheNyp : (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp);
3206     return ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, pgrp->ldbase_genovec);
3207   }
3208   if (!(pgrp->ldbase_stypes & kfPgrLdcacheNyp)) {
3209     if (pgrp->ldbase_stypes & kfPgrLdcacheDifflist) {
3210       PgrDifflistToGenovecUnsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3, sample_ct, pgrp->ldbase_difflist_len, pgrp->ldbase_genovec);
3211     } else {
3212       assert(pgrp->ldbase_stypes & kfPgrLdcacheRawNyp);
3213       CopyNyparrNonemptySubset(pgrp->ldbase_raw_genovec, sample_include, pgrp->fi.raw_sample_ct, sample_ct, pgrp->ldbase_genovec);
3214     }
3215     pgrp->ldbase_stypes |= kfPgrLdcacheNyp;
3216   }
3217   return kPglRetSuccess;
3218 }
3219 
// Computes the four basic hardcall genotype counts for variant vidx over the
// given sample subset, choosing a counting strategy from the variant's
// record type (vrtype):
//   - LD-compressed: start from the LD-base variant's cached counts and
//     adjust them with the current record;
//   - non-LD record that is itself the LD base of the next variant: unpack
//     into pgrp->ldbase_genovec, count that, and cache the counts;
//   - difflist record (vrtype & 4): count the difflist directly;
//   - 1-bit record (vrtype & 1): count the bitarray plus its difflist;
//   - otherwise: count the plain 2-bit genotype array.
// sample_include_interleaved_vec is only consumed by the plain 2-bit
// subsetted path.  When unphased_het_ctp is non-null, the start of the
// record's phase track is additionally inspected to derive the number of
// unphased hets.
// Returns kPglRetReadFail if the read pointers cannot be initialized,
// kPglRetMalformedInput if the record is too short, any error code produced
// by the per-format helper, and kPglRetSuccess otherwise.
PglErr GetBasicGenotypeCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uint32_t* unphased_het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // genocounts[0] := ref/ref, genocounts[1] := ref/altx,
  // genocounts[2] := altx/alty, genocounts[3] := missing
  // If unphased_het_ctp is non-null, this assumes multiallelic hardcalls are
  // not present, phased hardcalls are present, we aren't subsetting, and
  // unphased_het_ct is initialized to zero.
  assert(vidx < pgrp->fi.raw_variant_ct);
  assert(sample_ct);
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  const unsigned char* fread_ptr;
  const unsigned char* fread_end = nullptr;  // maybe-uninitialized warning
  PglErr reterr;
  if (VrtypeLdCompressed(vrtype)) {
    // LD compression
    // Make sure the LD-base genotypes are available before positioning the
    // read pointers on the current variant's record.
    reterr = LdLoadGenovecSubsetIfNecessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
      return kPglRetReadFail;
    }
    if (!(pgrp->ldbase_stypes & kfPgrLdcacheBasicGenocounts)) {
      // LD-base counts are not cached yet: compute and remember them.
      ZeroTrailingNyps(sample_ct, pgrp->ldbase_genovec);
      GenoarrCountFreqsUnsafe(pgrp->ldbase_genovec, sample_ct, pgrp->ldbase_basic_genocounts);
      pgrp->ldbase_stypes |= kfPgrLdcacheBasicGenocounts;
    }
    // Start from the LD-base counts, then apply this record's differences.
    STD_ARRAY_COPY(pgrp->ldbase_basic_genocounts, 4, genocounts);
    reterr = LdSubsetAdjustGenocounts(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_genovec, raw_sample_ct, &fread_ptr, genocounts, pgrp->workspace_raregeno_tmp_loadbuf);
    if (vrtype & 1) {
      // inverted
      // Genotypes 0 and 2 trade places, so swap their counts.
      const uint32_t tmpval = genocounts[0];
      genocounts[0] = genocounts[2];
      genocounts[2] = tmpval;
    }
  } else {
    if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
      return kPglRetReadFail;
    }
    // This variant acts as an LD base when the following variant is
    // LD-compressed.
    // NOTE(review): reads vrtypes[vidx + 1] without an explicit bound --
    // presumably the vrtypes array has a readable trailing entry; confirm.
    const uint32_t is_ldbase = pgrp->fi.vrtypes && VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
    if (is_ldbase) {
      // difflists are very efficient to count directly when not subsetting
      // (since we can entirely ignore the sample IDs), but it's often better
      // to unpack them first when subsetting.

      // ...er, the statement above is a lie, unpack-first almost always seems
      // to be better.
      pgrp->ldbase_vidx = vidx;
      // this may be slowed down by the LD caching change.
      reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, pgrp->ldbase_genovec);
      // NOTE(review): the counts and cache flags below are updated even when
      // reterr is nonzero; the error is only returned at the end.
      ZeroTrailingNyps(sample_ct, pgrp->ldbase_genovec);
      GenoarrCountFreqsUnsafe(pgrp->ldbase_genovec, sample_ct, genocounts);
      STD_ARRAY_COPY(genocounts, 4, pgrp->ldbase_basic_genocounts);
      // Same rule as in LdLoadGenovecSubsetIfNecessary(): the raw-genovec
      // flag is only set for a subsetted, non-difflist record.
      pgrp->ldbase_stypes = (subsetting_required && (!(vrtype & 4)))? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp | kfPgrLdcacheBasicGenocounts) : (kfPgrLdcacheNyp | kfPgrLdcacheBasicGenocounts);
    } else if (vrtype & 4) {
      const uint32_t vrtype_low2 = vrtype & 3;
      if (vrtype_low2 != 1) {
        // Difflist record; the low two vrtype bits give the common genotype.
        reterr = CountparseDifflistSubset(fread_end, sample_include, vrtype & 3, raw_sample_ct, sample_ct, &fread_ptr, genocounts, pgrp->workspace_raregeno_tmp_loadbuf);
      } else {
        // This vrtype has no data to parse: every sample is counted as
        // genotype 0.
        genocounts[0] = sample_ct;
        genocounts[1] = 0;
        genocounts[2] = 0;
        genocounts[3] = 0;
        reterr = kPglRetSuccess;
      }
    } else if (vrtype & 1) {
      reterr = CountparseOnebitSubset(fread_end, sample_include, raw_sample_ct, sample_ct, &fread_ptr, genocounts, pgrp->workspace_raregeno_tmp_loadbuf);
    } else {
      // Plain 2-bit genotype array covering all raw samples.
      const uint32_t genovec_byte_ct = NypCtToByteCt(raw_sample_ct);
      const unsigned char* genoarrb = fread_ptr;
      if (PtrAddCk(fread_end, genovec_byte_ct, &fread_ptr)) {
        return kPglRetMalformedInput;
      }
      // The word-based counter is only used when the buffer is
      // vector-aligned; otherwise fall back to the byte-pointer variant.
      const uint32_t genoarrb_is_unaligned = R_CAST(uintptr_t, genoarrb) % kBytesPerVec;
      if (!subsetting_required) {
        if (genoarrb_is_unaligned) {
          GenoarrbCountFreqs(genoarrb, raw_sample_ct, genocounts);
        } else {
          GenoarrCountFreqs(R_CAST(const uintptr_t*, genoarrb), raw_sample_ct, genocounts);
        }
      } else {
        GenoarrbCountSubsetFreqs(genoarrb, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
      }
      if (vrtype == kPglVrtypePlink1) {
        // PLINK 1 records use a different genotype coding, so permute the raw
        // counts into the genocounts[] order documented above.
        // [3] -> [0]
        // [2] -> [1]
        // [1] -> [3]
        // [0] -> [2]
        const uint32_t save2 = genocounts[0];
        const uint32_t save3 = genocounts[1];
        genocounts[0] = genocounts[3];
        genocounts[1] = genocounts[2];
        genocounts[2] = save2;
        genocounts[3] = save3;
      }
      reterr = kPglRetSuccess;
    }
  }
  if ((!unphased_het_ctp) || reterr) {
    return reterr;
  }
  // Unphased-het request: fread_ptr now points just past the hardcall data.
  assert((!subsetting_required) && ((vrtype & 0x18) == 0x10));
  const uint32_t het_ct = genocounts[1];
  // The bytes examined hold (1 + het_ct) bits: a leading flag bit followed by
  // one bit per het.
  const uint32_t aux2_first_part_byte_ct = 1 + (het_ct / CHAR_BIT);
  if (PtrCheck(fread_end, fread_ptr, aux2_first_part_byte_ct)) {
    return kPglRetMalformedInput;
  }
  const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
  if (explicit_phasepresent) {
    // otherwise initial value of 0 is correct
    // The popcount includes the set flag bit itself, hence the "+ 1".
    *unphased_het_ctp = het_ct + 1 - PopcountBytes(fread_ptr, aux2_first_part_byte_ct);
  }
  return kPglRetSuccess;
}
3335 
3336 PglErr PgrGetCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts) {
3337   if (!sample_ct) {
3338     STD_ARRAY_REF_FILL0(4, genocounts);
3339     return kPglRetSuccess;
3340   }
3341   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
3342   assert(vidx < pgrp->fi.raw_variant_ct);
3343   return GetBasicGenotypeCounts(sample_include, sample_include_interleaved_vec, GetSicp(pssi), sample_ct, vidx, pgrp, nullptr, genocounts);
3344 }
3345 
// Vectorized core of CountNyp(): returns how many 2-bit entries ("nyps") in
// the first vec_ct vectors of nyp_vvec become 00 after XORing with nyp_word,
// i.e. how many equal the corresponding 2-bit field of nyp_word.
// vec_ct must be a multiple of 6.
// Ok for nyp_vvec to be unaligned.
uint32_t CountNypVec6(const VecW* nyp_vvec, uintptr_t nyp_word, uint32_t vec_ct) {
  assert(!(vec_ct % 6));
  const VecW m0 = vecw_setzero();
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  const VecW xor_vvec = vecw_set1(nyp_word);
  const VecW* nyp_vvec_iter = nyp_vvec;
  // The byte-sum of each outer block is folded into acc one iteration late
  // (prev_sad_result is added at the start of the next round and on exit).
  VecW prev_sad_result = vecw_setzero();
  VecW acc = vecw_setzero();
  // Outer blocks cover at most 60 vectors = 10 inner passes.  Assuming the
  // masks are the usual alternating-bit patterns their names suggest, each
  // inner pass adds at most 24 to a byte lane of inner_acc, so the lanes stay
  // <= 240 and cannot overflow before the byte-sum.
  uintptr_t cur_incr = 60;
  for (; ; vec_ct -= cur_incr) {
    if (vec_ct < 60) {
      if (!vec_ct) {
        acc = acc + prev_sad_result;
        return HsumW(acc);
      }
      // Final, shorter block.
      cur_incr = vec_ct;
    }
    VecW inner_acc = vecw_setzero();
    const VecW* nyp_vvec_stop = &(nyp_vvec_iter[cur_incr]);
    do {
      // Six vectors per pass, handled as three pairs.  After the XOR, a
      // matching nyp is 00; (x >> 1) | x has a clear low bit exactly for
      // those nyps.  vecw_and_notfirst() presumably computes (~first) &
      // second, leaving a 1 in the low bit of each matching nyp -- verify
      // against its definition.
      VecW loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      VecW loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      VecW count1 = vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, m1);
      VecW count2 = vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, m1);

      loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      count1 = count1 + vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, m1);
      count2 = count2 + vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, m1);

      loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      count1 = count1 + vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, m1);
      count2 = count2 + vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, m1);

      // Widen the partial sums: 2-bit fields -> 4-bit fields -> bytes.
      count1 = (count1 & m2) + (vecw_srli(count1, 2) & m2);
      count1 = count1 + (count2 & m2) + (vecw_srli(count2, 2) & m2);
      inner_acc = inner_acc + (count1 & m4) + (vecw_srli(count1, 4) & m4);
    } while (nyp_vvec_iter < nyp_vvec_stop);
    acc = acc + prev_sad_result;
    prev_sad_result = vecw_bytesum(inner_acc, m0);
  }
}
3392 
3393 // Ok for nyparr to be unaligned.  Ok if unsafe to read trailing bytes of
3394 // nyparr.
CountNyp(const void * nyparr,uintptr_t nyp_word,uint32_t nyp_ct)3395 uint32_t CountNyp(const void* nyparr, uintptr_t nyp_word, uint32_t nyp_ct) {
3396   const uint32_t fullword_ct = nyp_ct / kBitsPerWordD2;
3397   uint32_t word_idx = fullword_ct - (fullword_ct % (6 * kWordsPerVec));
3398   uint32_t tot = CountNypVec6(S_CAST(const VecW*, nyparr), nyp_word, word_idx / kWordsPerVec);
3399   const uintptr_t* nypvec = S_CAST(const uintptr_t*, nyparr);
3400   for (; word_idx != fullword_ct; ++word_idx) {
3401     const uintptr_t cur_word = nypvec[word_idx] ^ nyp_word;
3402     tot += Popcount01Word(Word00(cur_word));
3403   }
3404   const uint32_t trailing_nyp_ct = nyp_ct % kBitsPerWordD2;
3405   if (trailing_nyp_ct) {
3406     const uint32_t trailing_byte_ct = DivUp(trailing_nyp_ct, (CHAR_BIT / 2));
3407     uintptr_t cur_word = SubwordLoad(&(nypvec[fullword_ct]), trailing_byte_ct) ^ nyp_word;
3408     cur_word = bzhi(Word00(cur_word), trailing_nyp_ct * 2);
3409     tot += Popcount01Word(cur_word);
3410   }
3411   return tot;
3412 }
3413 
3414 /*
3415 uint32_t CountNypSubsetVec6(const VecW* __restrict nyp_vvec, const VecW* __restrict interleaved_mask_vvec, uintptr_t nyp_word, uint32_t vec_ct) {
3416   assert(!(vec_ct % 6));
3417   const VecW m0 = vecw_setzero();
3418   const VecW m1 = VCONST_W(kMask5555);
3419   const VecW m2 = VCONST_W(kMask3333);
3420   const VecW m4 = VCONST_W(kMask0F0F);
3421   const VecW xor_vvec = vecw_set1(nyp_word);
3422   const VecW* nyp_vvec_iter = nyp_vvec;
3423   const VecW* interleaved_mask_vvec_iter = interleaved_mask_vvec;
3424   VecW prev_sad_result = vecw_setzero();
3425   VecW acc = vecw_setzero();
3426   uintptr_t cur_incr = 60;
3427   while (1) {
3428     if (vec_ct < 60) {
3429       if (!vec_ct) {
3430         acc = acc + prev_sad_result;
3431         return HsumW(acc);
3432       }
3433       cur_incr = vec_ct;
3434     }
3435     VecW inner_acc = vecw_setzero();
3436     const VecW* nyp_vvec_stop = &(nyp_vvec_iter[cur_incr]);
3437     vec_ct -= cur_incr;
3438     do {
3439       VecW mask1 = *interleaved_mask_vvec_iter++;
3440       VecW loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3441       VecW mask2 = vecw_srli(mask1, 1) & m1;
3442       VecW loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3443       mask1 = mask1 & m1;
3444       VecW count1 = vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, mask1);
3445       VecW count2 = vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, mask2);
3446 
3447       mask1 = *interleaved_mask_vvec_iter++;
3448       loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3449       mask2 = vecw_srli(mask1, 1) & m1;
3450       loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3451       mask1 = mask1 & m1;
3452       count1 = count1 + vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, mask1);
3453       count2 = count2 + vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, mask2);
3454 
3455       mask1 = *interleaved_mask_vvec_iter++;
3456       loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3457       mask2 = vecw_srli(mask2, 1) & m1;
3458       loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3459       mask1 = mask1 & m1;
3460       count1 = count1 + vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, mask1);
3461       count2 = count2 + vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, mask2);
3462 
3463       count1 = (count1 & m2) + (vecw_srli(count1, 2) & m2);
3464       count1 = count1 + (count2 & m2) + (vecw_srli(count2, 2) & m2);
3465       inner_acc = inner_acc + (count1 & m4) + (vecw_srli(count1, 4) & m4);
3466     } while (nyp_vvec_iter < nyp_vvec_stop);
3467     acc = acc + prev_sad_result;
3468     prev_sad_result = vecw_bytesum(inner_acc, m0);
3469   }
3470 }
3471 
3472 uint32_t CountNypSubset(const uintptr_t* __restrict nypvec, const uintptr_t* __restrict interleaved_vec, uintptr_t nyp_word, uint32_t raw_nyp_ct) {
3473   // simplified GenoarrCountSubsetFreqs()
3474   const uint32_t raw_nyp_ctv2 = NypCtToVecCt(raw_nyp_ct);
3475 #ifdef __LP64__
3476   uint32_t vec_idx = raw_nyp_ctv2 - (raw_nyp_ctv2 % 6);
3477   uint32_t tot = CountNypSubsetVec6(R_CAST(const VecW*, nypvec), R_CAST(const VecW*, interleaved_vec), nyp_word, vec_idx);
3478   const uintptr_t* nypvec_iter = &(nypvec[kWordsPerVec * vec_idx]);
3479   const uintptr_t* interleaved_mask_iter = &(interleaved_vec[(kWordsPerVec / 2) * vec_idx]);
3480 #  ifdef USE_AVX2
3481   uintptr_t mask_base1 = 0;
3482   uintptr_t mask_base2 = 0;
3483   uintptr_t mask_base3 = 0;
3484   uintptr_t mask_base4 = 0;
3485   for (; vec_idx != raw_nyp_ctv2; ++vec_idx) {
3486     uintptr_t mask_word1;
3487     uintptr_t mask_word2;
3488     uintptr_t mask_word3;
3489     uintptr_t mask_word4;
3490     if (!(vec_idx % 2)) {
3491       mask_base1 = *interleaved_mask_iter++;
3492       mask_base2 = *interleaved_mask_iter++;
3493       mask_base3 = *interleaved_mask_iter++;
3494       mask_base4 = *interleaved_mask_iter++;
3495       mask_word1 = mask_base1 & kMask5555;
3496       mask_word2 = mask_base2 & kMask5555;
3497       mask_word3 = mask_base3 & kMask5555;
3498       mask_word4 = mask_base4 & kMask5555;
3499     } else {
3500       mask_word1 = (mask_base1 >> 1) & kMask5555;
3501       mask_word2 = (mask_base2 >> 1) & kMask5555;
3502       mask_word3 = (mask_base3 >> 1) & kMask5555;
3503       mask_word4 = (mask_base4 >> 1) & kMask5555;
3504     }
3505     uint32_t uii = 0;
3506     while (1) {
3507       const uintptr_t cur_geno_word1 = (*nypvec_iter++) ^ nyp_word;
3508       const uintptr_t cur_geno_word2 = (*nypvec_iter++) ^ nyp_word;
3509       const uintptr_t masked1 = mask_word1 & (~(cur_geno_word1 | (cur_geno_word1 >> 1)));
3510       const uintptr_t masked2 = mask_word2 & (~(cur_geno_word2 | (cur_geno_word2 >> 1)));
3511       tot += PopcountWord((masked1 << 1) | masked2);
3512       if (uii) {
3513         break;
3514       }
3515       ++uii;
3516       mask_word1 = mask_word3;
3517       mask_word2 = mask_word4;
3518     }
3519   }
3520 #  else  // not USE_AVX2
3521   uintptr_t mask_base1 = 0;
3522   uintptr_t mask_base2 = 0;
3523   for (; vec_idx != raw_nyp_ctv2; ++vec_idx) {
3524     uintptr_t mask_word1;
3525     uintptr_t mask_word2;
3526     if (!(vec_idx % 2)) {
3527       mask_base1 = *interleaved_mask_iter++;
3528       mask_base2 = *interleaved_mask_iter++;
3529       mask_word1 = mask_base1 & kMask5555;
3530       mask_word2 = mask_base2 & kMask5555;
3531     } else {
3532       mask_word1 = (mask_base1 >> 1) & kMask5555;
3533       mask_word2 = (mask_base2 >> 1) & kMask5555;
3534     }
3535     const uintptr_t cur_geno_word1 = (*nypvec_iter++) ^ nyp_word;
3536     const uintptr_t cur_geno_word2 = (*nypvec_iter++) ^ nyp_word;
3537     const uintptr_t masked1 = mask_word1 & (~(cur_geno_word1 | (cur_geno_word1 >> 1)));
3538     const uintptr_t masked2 = mask_word2 & (~(cur_geno_word2 | (cur_geno_word2 >> 1)));
3539 #    ifdef USE_SSE42
3540     tot += PopcountWord((masked1 << 1) | masked2);
3541 #    else
3542     tot += NypsumWord(masked1 + masked2);
3543 #    endif
3544   }
3545 #  endif  // not USE_AVX2
3546 #else  // not __LP64__
3547   uint32_t word_idx = raw_nyp_ctv2 - (raw_nyp_ctv2 % 6);
3548   uint32_t tot = CountNypSubsetVec6(R_CAST(const VecW*, nypvec), R_CAST(const VecW*, interleaved_vec), nyp_word, word_idx);
3549   const uintptr_t* interleaved_mask_iter = &(interleaved_vec[word_idx / 2]);
3550   uintptr_t mask_base = 0;
3551   for (; word_idx != raw_nyp_ctv2; ++word_idx) {
3552     uintptr_t mask_word;
3553     if (!(word_idx % 2)) {
3554       mask_base = *interleaved_mask_iter++;
3555       mask_word = mask_base & kMask5555;
3556     } else {
3557       mask_word = (mask_base >> 1) & kMask5555;
3558     }
3559     const uintptr_t cur_geno_word = nypvec[word_idx] ^ nyp_word;
3560     const uintptr_t masked = mask_word & (~(cur_geno_word | (cur_geno_word >> 1)));
3561     tot += Popcount01Word(masked);
3562   }
3563 #endif
3564   return tot;
3565 }
3566 */
3567 
// Vectorized core of CountNybble(): returns how many 4-bit entries in the
// first vec_ct vectors of nybble_vvec become 0000 after XORing with
// nybble_word, i.e. how many equal the corresponding nybble of nybble_word.
// Ok for nybble_vvec to be unaligned.
uint32_t CountNybbleVec(const VecW* nybble_vvec, uintptr_t nybble_word, uint32_t vec_ct) {
  const VecW m0 = vecw_setzero();
  const VecW alld15 = VCONST_W(kMask1111);
  const VecW m4 = VCONST_W(kMask0F0F);
  const VecW xor_vvec = vecw_set1(nybble_word);
  const VecW* nybble_vvec_iter = nybble_vvec;
  // The byte-sum of each outer block is folded into acc one iteration late
  // (prev_sad_result is added at the start of the next round and on exit).
  VecW prev_sad_result = vecw_setzero();
  VecW acc = vecw_setzero();
  // Each inner pass adds at most 1 to every 4-bit lane of inner_acc, so
  // capping a block at 15 vectors keeps the lanes within 4 bits.
  uintptr_t cur_incr = 15;
  for (; ; vec_ct -= cur_incr) {
    if (vec_ct < 15) {
      if (!vec_ct) {
        acc = acc + prev_sad_result;
        return HsumW(acc);
      }
      // Final, shorter block.
      cur_incr = vec_ct;
    }
    VecW inner_acc = vecw_setzero();
    const VecW* nybble_vvec_stop = &(nybble_vvec_iter[cur_incr]);
    do {
      VecW loader = vecw_loadu(nybble_vvec_iter++) ^ xor_vvec;
      // DetectAllZeroNybbles() followed by right-shift-3 is the same number of
      // operations, can see if that's any faster in practice
      // OR all four bits of each nybble down into its lowest bit; that bit
      // stays clear only for nybbles that matched.  vecw_and_notfirst()
      // presumably computes (~first) & second -- verify against its
      // definition.
      loader = vecw_srli(loader, 1) | loader;
      loader = vecw_srli(loader, 2) | loader;
      inner_acc = inner_acc + vecw_and_notfirst(loader, alld15);
    } while (nybble_vvec_iter < nybble_vvec_stop);
    // Widen the 4-bit lane counts into bytes before the horizontal byte-sum.
    inner_acc = (inner_acc & m4) + (vecw_srli(inner_acc, 4) & m4);
    acc = acc + prev_sad_result;
    prev_sad_result = vecw_bytesum(inner_acc, m0);
  }
}
3601 
CountNybble(const void * nybblearr,uintptr_t nybble_word,uintptr_t nybble_ct)3602 uint32_t CountNybble(const void* nybblearr, uintptr_t nybble_word, uintptr_t nybble_ct) {
3603   const uint32_t fullword_ct = nybble_ct / kBitsPerWordD4;
3604   uint32_t tot = CountNybbleVec(S_CAST(const VecW*, nybblearr), nybble_word, fullword_ct / kWordsPerVec);
3605   const uintptr_t* nybblevec = S_CAST(const uintptr_t*, nybblearr);
3606 #ifdef __LP64__
3607   for (uint32_t word_idx = RoundDownPow2(fullword_ct, kWordsPerVec); word_idx != fullword_ct; ++word_idx) {
3608     uintptr_t cur_word = nybblevec[word_idx] ^ nybble_word;
3609     cur_word = cur_word | (cur_word >> 1);
3610     cur_word = cur_word | (cur_word >> 2);
3611     tot += Popcount0001Word((~cur_word) & kMask1111);
3612   }
3613 #endif
3614   const uint32_t trailing_nybble_ct = nybble_ct % kBitsPerWordD4;
3615   if (trailing_nybble_ct) {
3616     const uint32_t trailing_byte_ct = DivUp(trailing_nybble_ct, (CHAR_BIT / 4));
3617     uintptr_t cur_word = SubwordLoad(&(nybblevec[fullword_ct]), trailing_byte_ct) ^ nybble_word;
3618     cur_word = cur_word | (cur_word >> 1);
3619     cur_word = cur_word | (cur_word >> 2);
3620     cur_word = bzhi((~cur_word) & kMask1111, trailing_nybble_ct * 4);
3621 #if defined(USE_SSE42) || !defined(__LP64__)
3622     tot += Popcount0001Word(cur_word);
3623 #else
3624     // minor optimization, can't overflow
3625     tot += (cur_word * kMask1111) >> 60;
3626 #endif
3627   }
3628   return tot;
3629 }
3630 
3631 // similar to ParseAndSaveDifflist()
ParseAndSaveDeltalist(const unsigned char * fread_end,uint32_t raw_sample_ct,const unsigned char ** fread_pp,uint32_t * __restrict deltalist,uint32_t * __restrict deltalist_len_ptr)3632 PglErr ParseAndSaveDeltalist(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uint32_t* __restrict deltalist, uint32_t* __restrict deltalist_len_ptr) {
3633   const unsigned char* group_info_iter;
3634   PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, deltalist_len_ptr);
3635   const uint32_t deltalist_len = *deltalist_len_ptr;
3636   if (reterr || (!deltalist_len)) {
3637     return reterr;
3638   }
3639   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
3640   const uint32_t group_idx_last = (deltalist_len - 1) / kPglDifflistGroupSize;
3641   uint32_t* deltalist_iter = deltalist;
3642   uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
3643   for (uint32_t group_idx = 0; ; ++group_idx) {
3644     if (group_idx >= group_idx_last) {
3645       if (group_idx > group_idx_last) {
3646         return kPglRetSuccess;
3647       }
3648       group_len_m1 &= deltalist_len - 1;
3649     }
3650     uintptr_t raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
3651     group_info_iter = &(group_info_iter[sample_id_byte_ct]);
3652     for (uint32_t raw_deltalist_idx_lowbits = 0; ; ++raw_deltalist_idx_lowbits) {
3653       // always check, otherwise we may scribble over arbitrary memory
3654       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
3655         return kPglRetMalformedInput;
3656       }
3657       deltalist_iter[raw_deltalist_idx_lowbits] = raw_sample_idx;
3658       if (raw_deltalist_idx_lowbits == group_len_m1) {
3659         break;
3660       }
3661       raw_sample_idx += GetVint31(fread_end, fread_pp);
3662     }
3663     deltalist_iter = &(deltalist_iter[group_len_m1 + 1]);
3664   }
3665 }
3666 
// Counts how many entries of a deltalist refer to samples flagged in
// sample_include, without materializing the list.
// - fread_end/fread_pp: end of the record buffer and the current read
//   position; *fread_pp is advanced as the list is consumed.
// - intersect_ctp: output; number of list entries whose sample is included
//   (set to 0 when the header fails to parse or the list is empty).
// - raw_deltalist_len_ptr: output; total number of entries in the list.
// Returns the ParseDifflistHeader() error code if it fails,
// kPglRetMalformedInput if a sample index fails the range check, and
// kPglRetSuccess otherwise.
PglErr CountDeltalistIntersect(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, const unsigned char** fread_pp, uint32_t* __restrict intersect_ctp, uint32_t* __restrict raw_deltalist_len_ptr) {
  // Requires a PROPER subset.
  const unsigned char* group_info_iter;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, raw_deltalist_len_ptr);
  const uint32_t raw_deltalist_len = *raw_deltalist_len_ptr;
  if (reterr || (!raw_deltalist_len)) {
    *intersect_ctp = 0;
    return reterr;
  }
  const uint32_t group_idx_last = (raw_deltalist_len - 1) / kPglDifflistGroupSize;
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  uintptr_t intersect_ct = 0;

  // technically doesn't need to be initialized, but I have principles
  uintptr_t raw_sample_idx = 0;

  uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
  for (uint32_t group_idx = 0; ; ++group_idx) {
    if (group_idx >= group_idx_last) {
      if (group_idx > group_idx_last) {
        // All groups consumed.  On error returns inside the loop,
        // *intersect_ctp is left unwritten.
        *intersect_ctp = intersect_ct;
        return kPglRetSuccess;
      }
      // Final (possibly partial) group: masking with (raw_deltalist_len - 1)
      // yields its entry count minus one.
      group_len_m1 &= raw_deltalist_len - 1;
    }
    // We need to pull a raw sample index from the deltalist header every 64
    // entries.
#ifdef __LP64__
    // NOTE(review): 64-bit builds validate the running index only here, once
    // per group, before it is overwritten.  The index is passed to IsSet()
    // below before this check sees it, and the last group's final index is
    // not re-checked before the success return; confirm that this is the
    // intended tradeoff.
    if (unlikely(raw_sample_idx >= raw_sample_ct)) {
      return kPglRetMalformedInput;
    }
#endif
    raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
    group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    for (uint32_t raw_deltalist_idx_lowbits = 0; ; ++raw_deltalist_idx_lowbits) {
#ifndef __LP64__
      // Builds without __LP64__ validate every index before it is used.
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      intersect_ct += IsSet(sample_include, raw_sample_idx);
      if (raw_deltalist_idx_lowbits == group_len_m1) {
        break;
      }
      // Remaining indices within a group are varint-encoded deltas.
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
  }
}
3715 
CountAux1aDense(const void * patch_01_fvals,uint32_t allele_ct,uint32_t allele_idx,uint32_t raw_01_ct,uint32_t rare01_ct)3716 uint32_t CountAux1aDense(const void* patch_01_fvals, uint32_t allele_ct, uint32_t allele_idx, uint32_t raw_01_ct, uint32_t rare01_ct) {
3717   // The 'f' in patch_01_fset/patch_01_fvals is to distinguish the in-file
3718   // representation from the returned AlleleCode*-based representation.
3719   if (allele_idx == 1) {
3720     // safe to ignore allele codes
3721     return raw_01_ct - rare01_ct;
3722   }
3723   if (allele_ct < 5) {
3724     if (allele_ct == 3) {
3725       return rare01_ct;
3726     }
3727     // need to count matches
3728     const uint32_t allele_code_byte_ct = DivUp(rare01_ct, 8);
3729     const uint32_t alt3_ct = PopcountBytes(patch_01_fvals, allele_code_byte_ct);
3730     if (allele_idx == 3) {
3731       return alt3_ct;
3732     }
3733     return rare01_ct - alt3_ct;
3734   }
3735   if (allele_ct < 19) {
3736     if (allele_ct < 7) {
3737       return CountNyp(patch_01_fvals, (allele_idx - 2) * kMask5555, rare01_ct);
3738     }
3739     return CountNybble(patch_01_fvals, (allele_idx - 2) * kMask1111, rare01_ct);
3740   }
3741   return CountByte(patch_01_fvals, allele_idx - 2, rare01_ct);
3742 }
3743 
// Returns the bit width of one aux1a allele code for a variant with allele_ct
// alleles: allele_ct - 3 below 5 alleles (0 for 3, 1 for 4), 2 for 5-6,
// 4 for 7-18, and 8 from 19 upward.
uint32_t GetAux1aWidth(uint32_t allele_ct) {
  if (allele_ct >= 19) {
    return 8;
  }
  if (allele_ct >= 7) {
    return 4;
  }
  if (allele_ct >= 5) {
    return 2;
  }
  return allele_ct - 3;
}
3756 
3757 // Returns allele_code_width.  Other return values are inaccurate for allele_ct
3758 // == 3, since it's assumed that they're unused in that case.
GetAux1aConsts(uint32_t allele_ct,uintptr_t * detect_mask_hi_ptr,uintptr_t * detect_mask_lo_ptr,uint32_t * allele_code_logwidth_ptr)3759 uint32_t GetAux1aConsts(uint32_t allele_ct, uintptr_t* detect_mask_hi_ptr, uintptr_t* detect_mask_lo_ptr, uint32_t* allele_code_logwidth_ptr) {
3760   if (allele_ct < 7) {
3761     if (allele_ct < 5) {
3762       *detect_mask_hi_ptr = ~k0LU;
3763       *detect_mask_lo_ptr = ~k0LU;
3764       *allele_code_logwidth_ptr = 0;
3765       return allele_ct - 3;
3766     }
3767     *detect_mask_hi_ptr = kMaskAAAA;
3768     *detect_mask_lo_ptr = kMask5555;
3769     *allele_code_logwidth_ptr = 1;
3770     return 2;
3771   }
3772   if (allele_ct < 19) {
3773     *detect_mask_hi_ptr = kMask1111 * 8;
3774     *detect_mask_lo_ptr = kMask1111;
3775     *allele_code_logwidth_ptr = 2;
3776     return 4;
3777   }
3778   *detect_mask_hi_ptr = kMask0101 * 0x80;
3779   *detect_mask_lo_ptr = kMask0101;
3780   *allele_code_logwidth_ptr = 3;
3781   return 8;
3782 }
3783 
3784 // Advances *fread_pp past aux1a, and sets *het_ctp to the number of ref-altx
3785 // hets where x == allele_idx in sample_include.  (If allele_idx == 1, *het_ctp
3786 // is raw_01_ct - [# of aux1a entries] when there's no subsetting.)
3787 // Note that raw_01_ct must be an un-subsetted count.
3788 // Ok for subsetted_01_ct to be uninitialized if not subsetting, or allele_idx
3789 // != 1.
3790 // sample_include assumed to be nullptr if no subsetting required
CountAux1a(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uintptr_t * __restrict raw_genoarr,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t allele_idx,uint32_t raw_01_ct,uint32_t subsetted_01_ct,const unsigned char ** fread_pp,uint32_t * __restrict het_ctp,uint32_t * __restrict deltalist_workspace)3791 PglErr CountAux1a(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx, uint32_t raw_01_ct, uint32_t subsetted_01_ct, const unsigned char** fread_pp, uint32_t* __restrict het_ctp, uint32_t* __restrict deltalist_workspace) {
3792   if (aux1a_mode == 15) {
3793     if (allele_idx == 1) {
3794       if (sample_include) {
3795         *het_ctp = subsetted_01_ct;
3796       } else {
3797         *het_ctp = raw_01_ct;
3798       }
3799     } else {
3800       *het_ctp = 0;
3801     }
3802     return kPglRetSuccess;
3803   }
3804   const uint32_t ignore_01_fvals = (allele_idx == 1) || (allele_ct == 3);
3805   uintptr_t detect_mask_hi;
3806   uintptr_t detect_mask_lo;
3807   uint32_t allele_code_logwidth;
3808   const uint32_t allele_code_width = GetAux1aConsts(allele_ct, &detect_mask_hi, &detect_mask_lo, &allele_code_logwidth);
3809   const uintptr_t xor_word = (allele_idx - 2) * detect_mask_lo;
3810   if (!aux1a_mode) {
3811     // 01-collapsed bitarray
3812     const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
3813     const uint32_t rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
3814 #ifdef __arm__
3815 #  error "Unaligned accesses in CountAux1a()."
3816 #endif
3817     const uintptr_t* patch_01_fsetw = R_CAST(const uintptr_t*, *fread_pp);
3818     *fread_pp += fset_byte_ct;
3819     const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
3820     const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
3821     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
3822       return kPglRetMalformedInput;
3823     }
3824     if (!sample_include) {
3825       *het_ctp = CountAux1aDense(patch_01_fvalsw, allele_ct, allele_idx, raw_01_ct, rare01_ct);
3826       return kPglRetSuccess;
3827     }
3828     const Halfword* sample_include_hw = R_CAST(const Halfword*, sample_include);
3829     uintptr_t sample_hwidx = 0;
3830     uintptr_t cur_raw_genoarr_hets = Word01(raw_genoarr[0]);
3831     const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
3832     const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
3833     uintptr_t fvals_bits = 0;
3834     uint32_t fvals_widx = 0;
3835     uint32_t subsetted_hetx_ct = 0;
3836     uint32_t loop_len = kBitsPerWord;
3837     uint32_t rare01_lowbits = kBitsPerWord;
3838     for (uint32_t fset_widx = 0; ; ++fset_widx) {
3839       uintptr_t fset_bits;
3840       if (fset_widx >= fset_word_ct_m1) {
3841         if (fset_widx > fset_word_ct_m1) {
3842           break;
3843         }
3844         fset_bits = SubwordLoad(&(patch_01_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
3845         loop_len = ModNz(raw_01_ct, kBitsPerWord);
3846       } else {
3847         fset_bits = patch_01_fsetw[fset_widx];
3848       }
3849       // format 0, sample_include non-null
3850       if (ignore_01_fvals) {
3851         for (uint32_t uii = 0; uii != loop_len; ++uii) {
3852           while (!cur_raw_genoarr_hets) {
3853             cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
3854           }
3855           if (fset_bits & 1) {
3856             // Considered replacing cur_raw_genoarr_hets with the result of
3857             // two PackWordToHalfword() operations, since that keeps all
3858             // the sample word-indexes aligned.  Couldn't justify it given
3859             // the expected sparsity of this case, though.
3860             const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
3861             subsetted_hetx_ct += (sample_include_hw[sample_hwidx] >> sample_uidx_lowbits) & 1;
3862           }
3863           cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
3864           fset_bits = fset_bits >> 1;
3865         }
3866       } else {
3867         for (uint32_t uii = 0; uii != loop_len; ++uii) {
3868           while (!cur_raw_genoarr_hets) {
3869             cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
3870           }
3871           if (fset_bits & 1) {
3872             if (rare01_lowbits == kBitsPerWord) {
3873               if (fvals_widx == fvals_word_ct_m1) {
3874                 fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
3875               } else {
3876                 fvals_bits = patch_01_fvalsw[fvals_widx];
3877               }
3878               fvals_bits = fvals_bits ^ xor_word;
3879               fvals_bits = (detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)))) >> (allele_code_width - 1);
3880               // unnecessary to apply bzhi here
3881               ++fvals_widx;
3882               rare01_lowbits = 0;
3883             }
3884             if (fvals_bits & (k1LU << rare01_lowbits)) {
3885               const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
3886               subsetted_hetx_ct += (sample_include_hw[sample_hwidx] >> sample_uidx_lowbits) & 1;
3887             }
3888             rare01_lowbits += allele_code_width;
3889           }
3890           cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
3891           fset_bits = fset_bits >> 1;
3892         }
3893       }
3894     }
3895     if (allele_idx == 1) {
3896       *het_ctp = subsetted_01_ct - subsetted_hetx_ct;
3897     } else {
3898       *het_ctp = subsetted_hetx_ct;
3899     }
3900     return kPglRetSuccess;
3901   }
3902   // mode 1: difflist.
3903   if (!sample_include) {
3904     const unsigned char* group_info_iter;
3905     uint32_t rare01_ct;
3906     PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare01_ct);
3907     // rare01_ct == 0 should be impossible
3908     if (unlikely(reterr)) {
3909       return reterr;
3910     }
3911     reterr = SkipDeltalistIds(fread_end, group_info_iter, rare01_ct, raw_sample_ct, 1, fread_pp);
3912     if (unlikely(reterr)) {
3913       return reterr;
3914     }
3915     const unsigned char* patch_01_fvals = *fread_pp;
3916     const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
3917     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
3918       return kPglRetMalformedInput;
3919     }
3920 
3921     *het_ctp = CountAux1aDense(patch_01_fvals, allele_ct, allele_idx, raw_01_ct, rare01_ct);
3922     return kPglRetSuccess;
3923   }
3924   if (ignore_01_fvals) {
3925     // Don't need to save deltalist contents in this case.
3926     uint32_t subsetted_hetx_ct;
3927     uint32_t rare01_ct;
3928     PglErr reterr = CountDeltalistIntersect(fread_end, sample_include, raw_sample_ct, fread_pp, &subsetted_hetx_ct, &rare01_ct);
3929     if (unlikely(reterr)) {
3930       return reterr;
3931     }
3932     if (allele_idx == 1) {
3933       *het_ctp = subsetted_01_ct - subsetted_hetx_ct;
3934       const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
3935       if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
3936         return kPglRetMalformedInput;
3937       }
3938     } else {
3939       *het_ctp = subsetted_hetx_ct;
3940     }
3941     return kPglRetSuccess;
3942   }
3943   // Save deltalist elements, iterate.
3944   uint32_t rare01_ct;
3945   PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare01_ct);
3946   if (unlikely(reterr)) {
3947     return reterr;
3948   }
3949   const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
3950   const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
3951   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
3952     return kPglRetMalformedInput;
3953   }
3954   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
3955   uint32_t subsetted_hetx_ct = 0;
3956   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
3957     uintptr_t fvals_bits;
3958     if (fvals_widx >= fvals_word_ct_m1) {
3959       if (fvals_widx > fvals_word_ct_m1) {
3960         break;
3961       }
3962       fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
3963     } else {
3964       fvals_bits = patch_01_fvalsw[fvals_widx];
3965     }
3966     fvals_bits = fvals_bits ^ xor_word;
3967     fvals_bits = detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)));
3968     if (fvals_widx == fvals_word_ct_m1) {
3969       fvals_bits = bzhi_max(fvals_bits, ModNz(rare01_ct << allele_code_logwidth, kBitsPerWord));
3970     }
3971     if (!fvals_bits) {
3972       continue;
3973     }
3974     const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - allele_code_logwidth)]);
3975     do {
3976       const uint32_t rare01_idx_lowbits = ctzw(fvals_bits) >> allele_code_logwidth;
3977       const uint32_t sample_uidx = cur_deltalist_base[rare01_idx_lowbits];
3978       subsetted_hetx_ct += IsSet(sample_include, sample_uidx);
3979       fvals_bits &= fvals_bits - 1;
3980     } while (fvals_bits);
3981   }
3982   *het_ctp = subsetted_hetx_ct;
3983   return kPglRetSuccess;
3984 }
3985 
CountAux1bDense(const void * patch_10_fvals,uint32_t allele_ct,uint32_t allele_idx_m1,uint32_t raw_10_ct,uint32_t rare10_ct,uint32_t * __restrict het_ctp,uint32_t * __restrict hom_ctp)3986 void CountAux1bDense(const void* patch_10_fvals, uint32_t allele_ct, uint32_t allele_idx_m1, uint32_t raw_10_ct, uint32_t rare10_ct, uint32_t* __restrict het_ctp, uint32_t* __restrict hom_ctp) {
3987   uint32_t matching_hom_ct = 0;
3988   uint32_t het_incr;
3989   if (allele_ct < 6) {
3990     if (allele_ct == 3) {
3991       const uint32_t allele_code_byte_ct = DivUp(rare10_ct, 8);
3992       matching_hom_ct = PopcountBytes(patch_10_fvals, allele_code_byte_ct);
3993       het_incr = rare10_ct - matching_hom_ct;
3994     } else {
3995       // 2+2 bits
3996       het_incr = CountNyp(patch_10_fvals, allele_idx_m1 * kMask5555, rare10_ct * 2);
3997       if (allele_idx_m1) {
3998         matching_hom_ct = CountNybble(patch_10_fvals, allele_idx_m1 * kMask5555, rare10_ct);
3999       }
4000     }
4001   } else {
4002     if (allele_ct < 18) {
4003       // 4+4 bits
4004       het_incr = CountNybble(patch_10_fvals, allele_idx_m1 * kMask1111, rare10_ct * 2);
4005       if (allele_idx_m1) {
4006         matching_hom_ct = CountByte(patch_10_fvals, allele_idx_m1 * 0x11, rare10_ct);
4007       }
4008     } else {
4009       // 8+8 bits
4010       het_incr = CountByte(patch_10_fvals, allele_idx_m1 * 0x11, rare10_ct * 2);
4011       if (allele_idx_m1) {
4012         matching_hom_ct = CountU16(patch_10_fvals, allele_idx_m1 * 0x1111, rare10_ct);
4013       }
4014     }
4015   }
4016   if (!allele_idx_m1) {
4017     *hom_ctp = raw_10_ct - rare10_ct;
4018   } else {
4019     het_incr -= 2 * matching_hom_ct;
4020     *hom_ctp = matching_hom_ct;
4021   }
4022   *het_ctp += het_incr;
4023 }
4024 
4025 // Returns allele_code_logwidth.
GetAux1bConsts(uint32_t allele_ct,uintptr_t * detect_hom_mask_lo_ptr)4026 uint32_t GetAux1bConsts(uint32_t allele_ct, uintptr_t* detect_hom_mask_lo_ptr) {
4027   if (allele_ct < 6) {
4028     if (allele_ct == 3) {
4029       *detect_hom_mask_lo_ptr = ~k0LU;
4030       return 0;
4031     }
4032     *detect_hom_mask_lo_ptr = kMask1111;
4033     return 1;
4034   }
4035   if (allele_ct < 18) {
4036     *detect_hom_mask_lo_ptr = kMask0101;
4037     return 2;
4038   }
4039   *detect_hom_mask_lo_ptr = kMask0001;
4040   return 3;
4041 }
4042 
// Advances *fread_pp past aux1b; increments *het_ctp by the number of
// altx-alty genotypes in aux1b and sample_include with one allele ==
// allele_idx; and sets *hom_ctp to the number of such hom-allele_idx genotypes
// present.  (For allele_idx == 1, *hom_ctp is equal to raw_10_ct -
// <# of aux1b entries> when there's no subsetting.)
// Trailing bits of raw_genoarr must be cleared.
// Ok for subsetted_10_ct to be uninitialized if not subsetting, or allele_idx
// != 1.
// sample_include assumed to be nullptr if no subsetting required
//
// aux1b_mode selects the in-file representation handled here:
//   15: no aux1b track; nothing is read and *het_ctp is left untouched.
//   0:  bitarray with one bit per raw 10 genotype (patch_10_fset), followed by
//       a packed array of allele-code pairs (patch_10_fvals).
//   otherwise: deltalist of sample indexes, followed by patch_10_fvals.
// deltalist_workspace is only written in the subsetted deltalist case.
// On every error return visible here, *hom_ctp has not been written.
PglErr CountAux1b(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx, uint32_t raw_10_ct, uint32_t subsetted_10_ct, const unsigned char** fread_pp, uint32_t* __restrict het_ctp, uint32_t* __restrict hom_ctp, uint32_t* __restrict deltalist_workspace) {
  if (aux1b_mode == 15) {
    // No patches: every 10 genotype is hom-alt1.
    if (allele_idx == 1) {
      if (sample_include) {
        *hom_ctp = subsetted_10_ct;
      } else {
        *hom_ctp = raw_10_ct;
      }
    } else {
      *hom_ctp = 0;
    }
    return kPglRetSuccess;
  }
  uintptr_t detect_hom_mask_lo;
  const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
  const uint32_t allele_code_width = 1U << allele_code_logwidth;
  // An entry holds two allele codes, except when the code width is a single
  // bit (allele_ct == 3), where the entry is just that one bit.
  const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
  const uint32_t code10_width = 1U << code10_logwidth;
  const uint32_t allele_idx_m1 = allele_idx - 1;
  // kBitsPerWord forces a fvals word load on the first patched entry.
  uint32_t rare10_lowbits = kBitsPerWord;
  if (!aux1b_mode) {
    // 10-collapsed bitarray
    const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
    const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
#ifdef __arm__
#  error "Unaligned accesses in CountAux1b()."
#endif
    const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
    *fread_pp += fset_byte_ct;
    const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
    const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, 8);
    if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    if (!sample_include) {
      CountAux1bDense(patch_10_fvalsw, allele_ct, allele_idx_m1, raw_10_ct, rare10_ct, het_ctp, hom_ctp);
      return kPglRetSuccess;
    }
    // Each raw_genoarr word covers half a word's worth of samples, so
    // sample_include is indexed by halfword in step with it.
    const Halfword* sample_include_hw = R_CAST(const Halfword*, sample_include);
    uintptr_t sample_hwidx = 0;
    uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
    const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
    const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
    uintptr_t fvals_bits = 0;
    uint32_t fvals_widx = 0;
    uint32_t loop_len = kBitsPerWord;
    if ((!allele_idx_m1) || (allele_ct == 3)) {
      // bugfix (29 Dec 2019)
      const uintptr_t detect_alt1_mask_hi = detect_hom_mask_lo << (allele_code_width - 1);
      uint32_t subsetted_rare10_ct = 0;
      uint32_t het_1x_ct = 0;
      for (uint32_t fset_widx = 0; ; ++fset_widx) {
        uintptr_t fset_bits;
        if (fset_widx >= fset_word_ct_m1) {
          if (fset_widx > fset_word_ct_m1) {
            break;
          }
          // Last word may be partial; avoid reading past the bitarray.
          fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
          loop_len = ModNz(raw_10_ct, kBitsPerWord);
        } else {
          fset_bits = patch_10_fsetw[fset_widx];
        }
        for (uint32_t uii = 0; uii != loop_len; ++uii) {
          // Advance to the raw_genoarr word containing the next 10 entry.
          while (!cur_raw_genoarr_xys) {
            cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
          }
          if (fset_bits & 1) {
            if (rare10_lowbits == kBitsPerWord) {
              if (fvals_widx == fvals_word_ct_m1) {
                fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
              } else {
                fvals_bits = patch_10_fvalsw[fvals_widx];
              }
              // This sets each fvals_bits entry to 1 iff the patch genotype is
              // ALT1-ALTx, i.e. the original low bits were zero.
              fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
              // unnecessary to apply bzhi here
              ++fvals_widx;
              rare10_lowbits = 0;
            }
            const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
            if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
              ++subsetted_rare10_ct;
              het_1x_ct += (fvals_bits >> rare10_lowbits) & 1;
            }
            rare10_lowbits += code10_width;
          }
          cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
          fset_bits = fset_bits >> 1;
        }
      }
      if (allele_ct == 3) {
        if (allele_idx_m1) {
          // allele_idx == 2 with 3 alleles: patched entries that are not
          // ALT1-ALT2 are counted as hom-ALT2.
          *hom_ctp = subsetted_rare10_ct - het_1x_ct;
          *het_ctp += het_1x_ct;
          return kPglRetSuccess;
        }
      }
      // allele_idx == 1: unpatched 10 genotypes in the subset are hom-ALT1.
      *hom_ctp = subsetted_10_ct - subsetted_rare10_ct;
      *het_ctp += het_1x_ct;
      return kPglRetSuccess;
    }
    // allele_idx > 1, allele_ct > 3
    // Masks covering the low/high bit of BOTH allele codes in each entry.
    const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
    const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
    // XORing with this turns every code equal to allele_idx_m1 into zero.
    const uintptr_t xor_word = allele_idx_m1 * detect_all_mask_lo;
    uint32_t matching_allele_ct = 0;  // 2x hom + 1x het
    uint32_t matching_het_or_hom_ct = 0;
    for (uint32_t fset_widx = 0; ; ++fset_widx) {
      uintptr_t fset_bits;
      if (fset_widx >= fset_word_ct_m1) {
        if (fset_widx > fset_word_ct_m1) {
          break;
        }
        fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
        loop_len = ModNz(raw_10_ct, kBitsPerWord);
      } else {
        fset_bits = patch_10_fsetw[fset_widx];
      }
      for (uint32_t uii = 0; uii != loop_len; ++uii) {
        while (!cur_raw_genoarr_xys) {
          cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
        }
        if (fset_bits & 1) {
          if (rare10_lowbits == kBitsPerWord) {
            if (fvals_widx == fvals_word_ct_m1) {
              fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
            } else {
              fvals_bits = patch_10_fvalsw[fvals_widx];
            }
            fvals_bits ^= xor_word;
            fvals_bits = (detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)))) >> (allele_code_width - 1);
            // unnecessary to apply bzhi or detect_hom_mask_lo here
            // Adds the two per-code match flags of each entry, so the low two
            // bits of each entry hold 0, 1 (het) or 2 (hom).
            fvals_bits = fvals_bits + (fvals_bits >> allele_code_width);
            ++fvals_widx;
            rare10_lowbits = 0;
          }
          const uintptr_t cur_hit_ct = (fvals_bits >> rare10_lowbits) & 3;
          rare10_lowbits += code10_width;
          if (cur_hit_ct) {
            const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
            if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
              ++matching_het_or_hom_ct;
              matching_allele_ct += cur_hit_ct;
            }
          }
        }
        cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
        fset_bits = fset_bits >> 1;
      }
    }
    // (2 * hom + het) - (hom + het) == hom
    const uint32_t matching_hom_ct = matching_allele_ct - matching_het_or_hom_ct;
    *hom_ctp = matching_hom_ct;
    *het_ctp += matching_het_or_hom_ct - matching_hom_ct;
    return kPglRetSuccess;
  }
  // mode 1: difflist.
  if (!sample_include) {
    const unsigned char* group_info_iter;
    uint32_t rare10_ct;
    PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare10_ct);
    // rare10_ct == 0 should be impossible
    if (unlikely(reterr)) {
      return reterr;
    }
    reterr = SkipDeltalistIds(fread_end, group_info_iter, rare10_ct, raw_sample_ct, 0, fread_pp);
    if (unlikely(reterr)) {
      return reterr;
    }
    const unsigned char* patch_10_fvals = *fread_pp;
    const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
    if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    CountAux1bDense(patch_10_fvals, allele_ct, allele_idx_m1, raw_10_ct, rare10_ct, het_ctp, hom_ctp);
    return kPglRetSuccess;
  }
  // Save deltalist elements, iterate.
  uint32_t rare10_ct;
  PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
  if (unlikely(reterr)) {
    return reterr;
  }
  const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
  const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
  if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
    return kPglRetMalformedInput;
  }
  const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
  if ((!allele_idx_m1) || (allele_ct == 3)) {
    const uintptr_t detect_alt1_mask_hi = detect_hom_mask_lo << (allele_code_width - 1);
    uint32_t subsetted_rare10_ct = 0;
    uint32_t het_1x_ct = 0;
    // Number of entries per fvals word.
    uint32_t loop_len = kBitsPerWord >> code10_logwidth;
    for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
      uintptr_t fvals_bits;
      if (fvals_widx >= fvals_word_ct_m1) {
        if (fvals_widx > fvals_word_ct_m1) {
          break;
        }
        fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
        // Number of entries in the final (possibly partial) word.
        loop_len = 1 + ((rare10_ct - 1) & ((kBitsPerWord >> code10_logwidth) - 1));
      } else {
        fvals_bits = patch_10_fvalsw[fvals_widx];
      }
      // Low bit of each entry is set iff the entry's low allele code is zero.
      fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
      // First saved sample index corresponding to this fvals word.
      const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
      for (uint32_t uii = 0; uii != loop_len; ++uii) {
        const uint32_t sample_uidx = cur_deltalist_base[uii];
        if (IsSet(sample_include, sample_uidx)) {
          ++subsetted_rare10_ct;
          het_1x_ct += (fvals_bits >> (uii << code10_logwidth)) & 1;
        }
      }
    }
    if (allele_ct == 3) {
      if (allele_idx_m1) {
        *hom_ctp = subsetted_rare10_ct - het_1x_ct;
        *het_ctp += het_1x_ct;
        return kPglRetSuccess;
      }
    }
    *hom_ctp = subsetted_10_ct - subsetted_rare10_ct;
    *het_ctp += het_1x_ct;
    return kPglRetSuccess;
  }
  // allele_idx > 1, allele_ct > 3
  const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
  const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
  // Repurposed from here on: keeps the low two bits of each entry.
  detect_hom_mask_lo = detect_hom_mask_lo * 3;
  const uintptr_t xor_word = allele_idx_m1 * detect_all_mask_lo;
  uint32_t matching_het_or_hom_ct = 0;
  uint32_t matching_hom_ct = 0;
  for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
    uintptr_t fvals_bits;
    if (fvals_widx >= fvals_word_ct_m1) {
      if (fvals_widx > fvals_word_ct_m1) {
        break;
      }
      fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
    } else {
      fvals_bits = patch_10_fvalsw[fvals_widx];
    }
    // High bit of each code slot is set iff the code equals allele_idx_m1.
    fvals_bits = fvals_bits ^ xor_word;
    fvals_bits = detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)));
    if (fvals_widx == fvals_word_ct_m1) {
      // Clear matches in the zero-padding past the last entry.
      fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
    }
    if (!fvals_bits) {
      continue;
    }
    // Sum the two per-code match flags: each entry's low two bits become
    // 1 (one code matches) or 2 (both match), so exactly one bit is set per
    // matching entry and its parity distinguishes het from hom.
    fvals_bits = fvals_bits >> (allele_code_width - 1);
    fvals_bits = (fvals_bits + (fvals_bits >> allele_code_width)) & detect_hom_mask_lo;
    const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
    do {
      const uint32_t bit_idx = ctzw(fvals_bits);
      const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
      if (IsSet(sample_include, sample_uidx)) {
        ++matching_het_or_hom_ct;
        // Odd bit index == the "2" bit of the entry == hom.
        matching_hom_ct += bit_idx & 1;
      }
      fvals_bits &= fvals_bits - 1;
    } while (fvals_bits);
  }
  *hom_ctp = matching_hom_ct;
  *het_ctp += matching_het_or_hom_ct - matching_hom_ct;
  return kPglRetSuccess;
}
4320 
4321 PglErr PgrGetInv1Counts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts) {
4322   // May use workspace_vec and workspace_difflist_sample_ids.
4323   if (!sample_ct) {
4324     STD_ARRAY_REF_FILL0(4, genocounts);
4325     return kPglRetSuccess;
4326   }
4327   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
4328   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
4329   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
4330   PglErr reterr;
4331   if ((!allele_idx) || (!allele_idx_offsets)) {
4332   PgrGetInv1Counts_biallelic:
4333     reterr = GetBasicGenotypeCounts(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, genocounts);
4334     if (allele_idx) {
4335       const uint32_t homref_ct = genocounts[0];
4336       genocounts[0] = genocounts[2];
4337       genocounts[2] = homref_ct;
4338     }
4339     return reterr;
4340   }
4341   const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
4342   if (allele_ct == 2) {
4343     goto PgrGetInv1Counts_biallelic;
4344   }
4345   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
4346   const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
4347   uintptr_t* tmp_genovec = pgrp->workspace_vec;
4348   const unsigned char* fread_ptr;
4349   const unsigned char* fread_end;
4350   reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, tmp_genovec);
4351   if (unlikely(reterr)) {
4352     return reterr;
4353   }
4354   ZeroTrailingNyps(raw_sample_ct, tmp_genovec);
4355   const uint32_t aux1_first_byte = *fread_ptr++;
4356   const uint32_t aux1a_mode = aux1_first_byte & 15;
4357   const uint32_t aux1b_mode = aux1_first_byte >> 4;
4358   // raw_01_ct not needed when aux1a uses difflist form and subsetting is
4359   // occurring; same applies to raw_10_ct.
4360   uint32_t raw_01_ct = 0;
4361   uint32_t raw_10_ct = 0;
4362   if ((!subsetting_required) || (!aux1a_mode) || (!aux1b_mode)) {
4363     GenoarrCountFreqsUnsafe(tmp_genovec, raw_sample_ct, genocounts);
4364     raw_01_ct = genocounts[1];
4365     raw_10_ct = genocounts[2];
4366   }
4367   uint32_t subsetted_01_ct = 0;
4368   uint32_t subsetted_10_ct = 0;
4369   if (subsetting_required) {
4370     // need accurate subsetted missing count for allele_idx > 1 case
4371     GenoarrCountSubsetFreqs(tmp_genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
4372     subsetted_01_ct = genocounts[1];
4373     subsetted_10_ct = genocounts[2];
4374   } else {
4375     sample_include = nullptr;
4376   }
4377   uint32_t het_ct;
4378   reterr = CountAux1a(fread_end, sample_include, tmp_genovec, aux1a_mode, raw_sample_ct, allele_ct, allele_idx, raw_01_ct, subsetted_01_ct, &fread_ptr, &het_ct, pgrp->workspace_difflist_sample_ids);
4379   if (unlikely(reterr)) {
4380     return reterr;
4381   }
4382   uint32_t hom_ct;
4383   reterr = CountAux1b(fread_end, sample_include, tmp_genovec, aux1b_mode, raw_sample_ct, allele_ct, allele_idx, raw_10_ct, subsetted_10_ct, &fread_ptr, &het_ct, &hom_ct, pgrp->workspace_difflist_sample_ids);
4384   genocounts[0] = hom_ct;
4385   genocounts[1] = het_ct;
4386   genocounts[2] = sample_ct - genocounts[3] - hom_ct - het_ct;
4387   return reterr;
4388 }
4389 
4390 // sample_include assumed to be nullptr if no subsetting required
GenoarrAux1aUpdate(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,const uintptr_t * __restrict raw_genoarr,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t allele_idx,uintptr_t lshifted_bit,uint32_t raw_01_ct,const unsigned char ** fread_pp,uintptr_t * __restrict target_genoarr,uint32_t * __restrict deltalist_workspace)4391 PglErr GenoarrAux1aUpdate(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx, uintptr_t lshifted_bit, uint32_t raw_01_ct, const unsigned char** fread_pp, uintptr_t* __restrict target_genoarr, uint32_t* __restrict deltalist_workspace) {
4392   if (aux1a_mode == 15) {
4393     return kPglRetSuccess;
4394   }
4395   const uint32_t ignore_01_fvals = (allele_idx == 1) || (allele_ct == 3);
4396   uintptr_t detect_mask_hi;
4397   uintptr_t detect_mask_lo;
4398   uint32_t allele_code_logwidth;
4399   const uint32_t allele_code_width = GetAux1aConsts(allele_ct, &detect_mask_hi, &detect_mask_lo, &allele_code_logwidth);
4400   const uintptr_t xor_word = (allele_idx - 2) * detect_mask_lo;
4401   if (!aux1a_mode) {
4402 #ifdef __arm__
4403 #  error "Unaligned accesses in GenoarrAux1aUpdate()."
4404 #endif
4405     const uintptr_t* patch_01_fsetw = R_CAST(const uintptr_t*, *fread_pp);
4406     const uint32_t fset_byte_ct = DivUp(raw_01_ct, 8);
4407     uint32_t rare01_ct = 0;
4408     if (allele_ct > 3) {
4409       rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
4410     }
4411     *fread_pp += fset_byte_ct;
4412     const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
4413     uintptr_t sample_hwidx = 0;
4414     uintptr_t cur_raw_genoarr_hets = Word01(raw_genoarr[0]);
4415     uint32_t loop_len = kBitsPerWord;
4416     const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
4417     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
4418       return kPglRetMalformedInput;
4419     }
4420     const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
4421     const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
4422     const uint32_t lshift = lshifted_bit - 1;
4423     uintptr_t fvals_bits = 0;
4424     uint32_t fvals_widx = 0;
4425     uint32_t rare01_lowbits = kBitsPerWord;
4426     for (uint32_t fset_widx = 0; ; ++fset_widx) {
4427       uintptr_t fset_bits;
4428       if (fset_widx >= fset_word_ct_m1) {
4429         if (fset_widx > fset_word_ct_m1) {
4430           return kPglRetSuccess;
4431         }
4432         fset_bits = SubwordLoad(&(patch_01_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
4433         loop_len = ModNz(raw_01_ct, kBitsPerWord);
4434       } else {
4435         fset_bits = patch_01_fsetw[fset_widx];
4436       }
4437       if (!sample_include) {
4438         if (ignore_01_fvals) {
4439           for (uint32_t uii = 0; uii != loop_len; ++uii) {
4440             while (!cur_raw_genoarr_hets) {
4441               cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
4442             }
4443             if (fset_bits & 1) {
4444               // ref/altx present for x>1.  Change genovec entry from 01 to 11
4445               // (or 11 -> 01 in allele_idx == 2, allele_ct == 3 case; same xor
4446               // operation works for that)
4447               const uintptr_t lowbit = cur_raw_genoarr_hets & (-cur_raw_genoarr_hets);
4448               target_genoarr[sample_hwidx] ^= lowbit << lshift;
4449             }
4450             cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
4451             fset_bits = fset_bits >> 1;
4452           }
4453         } else {
4454           for (uint32_t uii = 0; uii != loop_len; ++uii) {
4455             while (!cur_raw_genoarr_hets) {
4456               cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
4457             }
4458             if (fset_bits & 1) {
4459               if (rare01_lowbits == kBitsPerWord) {
4460                 if (fvals_widx == fvals_word_ct_m1) {
4461                   fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4462                 } else {
4463                   fvals_bits = patch_01_fvalsw[fvals_widx];
4464                 }
4465                 fvals_bits = fvals_bits ^ xor_word;
4466                 fvals_bits = (detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)))) >> (allele_code_width - 1);
4467                 // unnecessary to apply bzhi here
4468                 ++fvals_widx;
4469                 rare01_lowbits = 0;
4470               }
4471               if (fvals_bits & (k1LU << rare01_lowbits)) {
4472                 const uintptr_t lowbit = cur_raw_genoarr_hets & (-cur_raw_genoarr_hets);
4473                 target_genoarr[sample_hwidx] ^= lowbit << lshift;
4474               }
4475               rare01_lowbits += allele_code_width;
4476             }
4477             cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
4478             fset_bits = fset_bits >> 1;
4479           }
4480         }
4481       } else {
4482         // format 0, sample_include non-null
4483         if (ignore_01_fvals) {
4484           for (uint32_t uii = 0; uii != loop_len; ++uii) {
4485             while (!cur_raw_genoarr_hets) {
4486               cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
4487             }
4488             if (fset_bits & 1) {
4489               // Considered replacing cur_raw_genoarr_hets with the result of
4490               // two PackWordToHalfword() operations, since that keeps all
4491               // the sample word-indexes aligned.  Couldn't justify it given
4492               // the expected sparsity of this case, though.
4493               const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
4494               if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
4495                 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
4496                 target_genoarr[sample_idx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_idx % kBitsPerWordD2));
4497               }
4498             }
4499             cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
4500             fset_bits = fset_bits >> 1;
4501           }
4502         } else {
4503           for (uint32_t uii = 0; uii != loop_len; ++uii) {
4504             while (!cur_raw_genoarr_hets) {
4505               cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
4506             }
4507             if (fset_bits & 1) {
4508               if (rare01_lowbits == kBitsPerWord) {
4509                 if (fvals_widx == fvals_word_ct_m1) {
4510                   fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4511                 } else {
4512                   fvals_bits = patch_01_fvalsw[fvals_widx];
4513                 }
4514                 fvals_bits = fvals_bits ^ xor_word;
4515                 fvals_bits = (detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)))) >> (allele_code_width - 1);
4516                 // unnecessary to apply bzhi here
4517                 ++fvals_widx;
4518                 rare01_lowbits = 0;
4519               }
4520               if (fvals_bits & (k1LU << rare01_lowbits)) {
4521                 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
4522                 if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
4523                   const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
4524                   target_genoarr[sample_idx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_idx % kBitsPerWordD2));
4525                 }
4526               }
4527               rare01_lowbits += allele_code_width;
4528             }
4529             cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
4530             fset_bits = fset_bits >> 1;
4531           }
4532         }
4533       }
4534     }
4535   }
4536   // aux1a_mode == 1
4537   uint32_t rare01_ct;
4538   // Might hardcode the ParseAndSaveDeltalist logic later, but lets get
4539   // this working first.
4540   PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare01_ct);
4541   if (unlikely(reterr)) {
4542     return reterr;
4543   }
4544   const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
4545   const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
4546   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
4547     return kPglRetMalformedInput;
4548   }
4549   if (ignore_01_fvals) {
4550     if (!sample_include) {
4551       for (uint32_t rare01_idx = 0; rare01_idx != rare01_ct; ++rare01_idx) {
4552         const uint32_t sample_uidx = deltalist_workspace[rare01_idx];
4553         // todo: benchmark against k1LU << (lshift + ...)
4554         target_genoarr[sample_uidx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_uidx % kBitsPerWordD2));
4555       }
4556       return kPglRetSuccess;
4557     }
4558     for (uint32_t rare01_idx = 0; rare01_idx != rare01_ct; ++rare01_idx) {
4559       const uint32_t sample_uidx = deltalist_workspace[rare01_idx];
4560       // could wrap this boilerplate
4561       const uint32_t sample_widx = sample_uidx / kBitsPerWord;
4562       const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
4563       const uintptr_t sample_include_word = sample_include[sample_widx];
4564       if (sample_include_word & lowbit) {
4565         const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
4566         target_genoarr[sample_idx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_idx % kBitsPerWordD2));
4567       }
4568     }
4569     return kPglRetSuccess;
4570   }
4571   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
4572   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
4573     uintptr_t fvals_bits;
4574     if (fvals_widx >= fvals_word_ct_m1) {
4575       if (fvals_widx > fvals_word_ct_m1) {
4576         return kPglRetSuccess;
4577       }
4578       fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4579     } else {
4580       fvals_bits = patch_01_fvalsw[fvals_widx];
4581     }
4582     fvals_bits = fvals_bits ^ xor_word;
4583     fvals_bits = detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)));
4584     if (fvals_widx == fvals_word_ct_m1) {
4585       fvals_bits = bzhi_max(fvals_bits, ModNz(rare01_ct << allele_code_logwidth, kBitsPerWord));
4586     }
4587     if (!fvals_bits) {
4588       continue;
4589     }
4590     const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - allele_code_logwidth)]);
4591     if (!sample_include) {
4592       do {
4593         const uint32_t rare01_idx_lowbits = ctzw(fvals_bits) >> allele_code_logwidth;
4594         const uint32_t sample_uidx = cur_deltalist_base[rare01_idx_lowbits];
4595         target_genoarr[sample_uidx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_uidx % kBitsPerWordD2));
4596         fvals_bits &= fvals_bits - 1;
4597       } while (fvals_bits);
4598     } else {
4599       do {
4600         const uint32_t rare01_idx_lowbits = ctzw(fvals_bits) >> allele_code_logwidth;
4601         const uint32_t sample_uidx = cur_deltalist_base[rare01_idx_lowbits];
4602         const uint32_t sample_widx = sample_uidx / kBitsPerWord;
4603         const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
4604         const uintptr_t sample_include_word = sample_include[sample_widx];
4605         if (sample_include_word & lowbit) {
4606           const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
4607           target_genoarr[sample_idx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_idx % kBitsPerWordD2));
4608         }
4609         fvals_bits &= fvals_bits - 1;
4610       } while (fvals_bits);
4611     }
4612   }
4613 }
4614 
4615 // sample_include assumed to be nullptr if no subsetting required
GenoarrAux1bStandardUpdate(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t allele_idx,uint32_t raw_10_ct,const unsigned char ** fread_pp,uintptr_t * __restrict target_genoarr,uint32_t * __restrict deltalist_workspace)4616 PglErr GenoarrAux1bStandardUpdate(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict target_genoarr, uint32_t* __restrict deltalist_workspace) {
4617   if (aux1b_mode == 15) {
4618     return kPglRetSuccess;
4619   }
4620   const uint32_t allele_idx_m1 = allele_idx - 1;
4621   uintptr_t detect_hom_mask_lo;
4622   const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
4623   const uint32_t allele_code_width = 1U << allele_code_logwidth;
4624   const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
4625   const uint32_t code10_width = 1U << code10_logwidth;
4626   uint32_t rare10_lowbits = kBitsPerWord;
4627   if (!aux1b_mode) {
4628 #ifdef __arm__
4629 #  error "Unaligned accesses in GenoarrAux1bStandardUpdate()."
4630 #endif
4631     const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
4632     const uint32_t fset_byte_ct = DivUp(raw_10_ct, 8);
4633     const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
4634     const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
4635     *fread_pp += fset_byte_ct;
4636     uintptr_t sample_hwidx = 0;
4637     uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
4638     const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
4639     const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, CHAR_BIT);
4640     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
4641       return kPglRetMalformedInput;
4642     }
4643     const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
4644     uintptr_t fvals_bits = 0;
4645     uint32_t fvals_widx = 0;
4646     uint32_t loop_len = kBitsPerWord;
4647     if ((!allele_idx_m1) || (allele_ct == 3)) {
4648       // bugfix (29 Dec 2019)
4649       const uintptr_t detect_alt1_mask_hi = detect_hom_mask_lo << (allele_code_width - 1);
4650       // If allele_ct == 3:
4651       //   code10_width = 1
4652       //   0 -> 1/2, 1 -> 2/2
4653       //   if allele_idx == 1:
4654       //     we want to convert 2 -> 1 for 1/2 genotypes, and 2 -> 0 for 2/2.
4655       //   if allele_idx == 2:
4656       //     we want to convert 0 -> 1 for 1/2 genotypes, and 0 -> 2 for 2/2.
4657       // If allele_ct == 4 (allele_idx == 1 forced):
4658       //   allele_code_width = 2
4659       //   code10_width = 4
4660       //   we want to convert 2 -> 1 for 1/x genotypes, and 2 -> 0 otherwise.
4661       const uint32_t lowcode_add = 2 - allele_idx_m1;
4662       for (uint32_t fset_widx = 0; ; ++fset_widx) {
4663         uintptr_t fset_bits;
4664         if (fset_widx >= fset_word_ct_m1) {
4665           if (fset_widx > fset_word_ct_m1) {
4666             break;
4667           }
4668           fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
4669           loop_len = ModNz(raw_10_ct, kBitsPerWord);
4670         } else {
4671           fset_bits = patch_10_fsetw[fset_widx];
4672         }
4673         if (!sample_include) {
4674           for (uint32_t uii = 0; uii != loop_len; ++uii) {
4675             while (!cur_raw_genoarr_xys) {
4676               cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
4677             }
4678             if (fset_bits & 1) {
4679               if (rare10_lowbits == kBitsPerWord) {
4680                 if (fvals_widx == fvals_word_ct_m1) {
4681                   fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4682                 } else {
4683                   fvals_bits = patch_10_fvalsw[fvals_widx];
4684                 }
4685                 // modify to het 1/x = 1, otherwise 0, except in allele_idx ==
4686                 // 2 special case.
4687                 if (!allele_idx_m1) {
4688                   fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
4689                 }
4690                 // unnecessary to apply bzhi here
4691                 ++fvals_widx;
4692                 rare10_lowbits = 0;
4693               }
4694               const uint32_t cur_lowcode0 = (fvals_bits >> rare10_lowbits) & 1;
4695               rare10_lowbits += code10_width;
4696               const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
4697               target_genoarr[sample_hwidx] ^= lowbit * (lowcode_add + cur_lowcode0);
4698             }
4699             cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
4700             fset_bits = fset_bits >> 1;
4701           }
4702         } else {
4703           // sample_include non-null
4704           for (uint32_t uii = 0; uii != loop_len; ++uii) {
4705             while (!cur_raw_genoarr_xys) {
4706               cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
4707             }
4708             if (fset_bits & 1) {
4709               if (rare10_lowbits == kBitsPerWord) {
4710                 if (fvals_widx == fvals_word_ct_m1) {
4711                   fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4712                 } else {
4713                   fvals_bits = patch_10_fvalsw[fvals_widx];
4714                 }
4715                 // modify to het 1/x = 1, otherwise 0, except in allele_idx ==
4716                 // 2 special case
4717                 if (!allele_idx_m1) {
4718                   fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
4719                 }
4720                 // unnecessary to apply bzhi here
4721                 ++fvals_widx;
4722                 rare10_lowbits = 0;
4723               }
4724               const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
4725               if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
4726                 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
4727                 const uintptr_t cur_lowcode0 = (fvals_bits >> rare10_lowbits) & 1;
4728                 const uintptr_t shifted_xor_mult = (lowcode_add + cur_lowcode0) << (2 * (sample_idx % kBitsPerWordD2));
4729                 target_genoarr[sample_idx / kBitsPerWordD2] ^= shifted_xor_mult;
4730               }
4731               rare10_lowbits += code10_width;
4732             }
4733             cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
4734             fset_bits = fset_bits >> 1;
4735           }
4736         }
4737       }
4738       return kPglRetSuccess;
4739     }
4740     // allele_idx > 1, allele_ct > 3
4741     const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
4742     const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
4743     const uintptr_t xor_word = allele_idx_m1 * detect_all_mask_lo;
4744     for (uint32_t fset_widx = 0; ; ++fset_widx) {
4745       uintptr_t fset_bits;
4746       if (fset_widx >= fset_word_ct_m1) {
4747         if (fset_widx > fset_word_ct_m1) {
4748           break;
4749         }
4750         fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
4751         loop_len = ModNz(raw_10_ct, kBitsPerWord);
4752       } else {
4753         fset_bits = patch_10_fsetw[fset_widx];
4754       }
4755       if (!sample_include) {
4756         for (uint32_t uii = 0; uii != loop_len; ++uii) {
4757           while (!cur_raw_genoarr_xys) {
4758             cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
4759           }
4760           if (fset_bits & 1) {
4761             if (rare10_lowbits == kBitsPerWord) {
4762               if (fvals_widx == fvals_word_ct_m1) {
4763                 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4764               } else {
4765                 fvals_bits = patch_10_fvalsw[fvals_widx];
4766               }
4767               // modify to hom = 2, het = 1, neither = 0
4768               fvals_bits = fvals_bits ^ xor_word;
4769               fvals_bits = (detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)))) >> (allele_code_width - 1);
4770               // unnecessary to apply bzhi or detect_hom_mask_lo here
4771               fvals_bits = fvals_bits + (fvals_bits >> allele_code_width);
4772               ++fvals_widx;
4773               rare10_lowbits = 0;
4774             }
4775             const uintptr_t cur_hit_ct = (fvals_bits >> rare10_lowbits) & 3;
4776             rare10_lowbits += code10_width;
4777             if (cur_hit_ct) {
4778               const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
4779               target_genoarr[sample_hwidx] ^= lowbit * cur_hit_ct;
4780             }
4781           }
4782           cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
4783           fset_bits = fset_bits >> 1;
4784         }
4785       } else {
4786         for (uint32_t uii = 0; uii != loop_len; ++uii) {
4787           while (!cur_raw_genoarr_xys) {
4788             cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
4789           }
4790           if (fset_bits & 1) {
4791             if (rare10_lowbits == kBitsPerWord) {
4792               if (fvals_widx == fvals_word_ct_m1) {
4793                 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4794               } else {
4795                 fvals_bits = patch_10_fvalsw[fvals_widx];
4796               }
4797               // modify to hom = 2, het = 1, neither = 0
4798               fvals_bits = fvals_bits ^ xor_word;
4799               fvals_bits = (detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)))) >> (allele_code_width - 1);
4800               if (fvals_widx == fvals_word_ct_m1) {
4801                 fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct * code10_width, kBitsPerWord));
4802               }
4803               fvals_bits = fvals_bits + (fvals_bits >> allele_code_width);
4804               ++fvals_widx;
4805               rare10_lowbits = 0;
4806             }
4807             const uintptr_t cur_hit_ct = (fvals_bits >> rare10_lowbits) & 3;
4808             rare10_lowbits += code10_width;
4809             if (cur_hit_ct) {
4810               const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
4811               if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
4812                 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
4813                 target_genoarr[sample_idx / kBitsPerWordD2] ^= cur_hit_ct << (2 * (sample_idx % kBitsPerWordD2));
4814               }
4815             }
4816           }
4817           cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
4818           fset_bits = fset_bits >> 1;
4819         }
4820       }
4821     }
4822     return kPglRetSuccess;
4823   }
4824   // aux1b_mode == 1
4825   uint32_t rare10_ct;
4826   PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
4827   if (unlikely(reterr)) {
4828     return reterr;
4829   }
4830   const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
4831   const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
4832   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
4833     return kPglRetMalformedInput;
4834   }
4835   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
4836   if ((!allele_idx_m1) || (allele_ct == 3)) {
4837     // bugfix (29 Dec 2019)
4838     const uintptr_t detect_alt1_mask_hi = detect_hom_mask_lo << (allele_code_width - 1);
4839     const uintptr_t lowcode_add = 2 - allele_idx_m1;
4840     uint32_t loop_len = kBitsPerWord >> code10_logwidth;
4841     for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
4842       uintptr_t fvals_bits;
4843       if (fvals_widx >= fvals_word_ct_m1) {
4844         if (fvals_widx > fvals_word_ct_m1) {
4845           break;
4846         }
4847         fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4848         loop_len = 1 + ((rare10_ct - 1) & ((kBitsPerWord >> code10_logwidth) - 1));
4849       } else {
4850         fvals_bits = patch_10_fvalsw[fvals_widx];
4851       }
4852       if (!allele_idx_m1) {
4853         fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
4854       }
4855       const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
4856       if (!sample_include) {
4857         for (uint32_t uii = 0; uii != loop_len; ++uii) {
4858           const uint32_t sample_uidx = cur_deltalist_base[uii];
4859           const uintptr_t cur_lowcode0 = fvals_bits & 1;
4860           const uintptr_t shifted_xor_mult = (lowcode_add + cur_lowcode0) << (2 * (sample_uidx % kBitsPerWordD2));
4861           target_genoarr[sample_uidx / kBitsPerWordD2] ^= shifted_xor_mult;
4862           fvals_bits = fvals_bits >> code10_width;
4863         }
4864       } else {
4865         for (uint32_t uii = 0; uii != loop_len; ++uii) {
4866           const uint32_t sample_uidx = cur_deltalist_base[uii];
4867           const uint32_t sample_widx = sample_uidx / kBitsPerWord;
4868           const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
4869           const uintptr_t sample_include_word = sample_include[sample_widx];
4870           if (sample_include_word & lowbit) {
4871             const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
4872             const uintptr_t cur_lowcode0 = fvals_bits & 1;
4873             const uintptr_t shifted_xor_mult = (lowcode_add + cur_lowcode0) << (2 * (sample_idx % kBitsPerWordD2));
4874             target_genoarr[sample_idx / kBitsPerWordD2] ^= shifted_xor_mult;
4875           }
4876           fvals_bits = fvals_bits >> code10_width;
4877         }
4878       }
4879     }
4880     return kPglRetSuccess;
4881   }
4882   // allele_idx > 1, allele_ct > 3
4883   const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
4884   const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
4885   detect_hom_mask_lo = detect_hom_mask_lo * 3;
4886   const uintptr_t xor_word = allele_idx_m1 * detect_all_mask_lo;
4887   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
4888     uintptr_t fvals_bits;
4889     if (fvals_widx >= fvals_word_ct_m1) {
4890       if (fvals_widx > fvals_word_ct_m1) {
4891         break;
4892       }
4893       fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4894     } else {
4895       fvals_bits = patch_10_fvalsw[fvals_widx];
4896     }
4897     fvals_bits = fvals_bits ^ xor_word;
4898     fvals_bits = detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)));
4899     if (fvals_widx == fvals_word_ct_m1) {
4900       fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
4901     }
4902     if (!fvals_bits) {
4903       continue;
4904     }
4905     fvals_bits = fvals_bits >> (allele_code_width - 1);
4906     fvals_bits = (fvals_bits + (fvals_bits >> allele_code_width)) & detect_hom_mask_lo;
4907     const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
4908     if (!sample_include) {
4909       do {
4910         const uint32_t bit_idx = ctzw(fvals_bits);
4911         const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
4912         target_genoarr[sample_uidx / kBitsPerWordD2] ^= k1LU << ((bit_idx % 2) + 2 * (sample_uidx % kBitsPerWordD2));
4913         fvals_bits &= fvals_bits - 1;
4914       } while (fvals_bits);
4915     } else {
4916       do {
4917         const uint32_t bit_idx = ctzw(fvals_bits);
4918         const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
4919         const uint32_t sample_widx = sample_uidx / kBitsPerWord;
4920         const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
4921         const uintptr_t sample_include_word = sample_include[sample_widx];
4922         if (sample_include_word & lowbit) {
4923           const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
4924           target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << ((bit_idx % 2) + 2 * (sample_idx % kBitsPerWordD2));
4925         }
4926         fvals_bits &= fvals_bits - 1;
4927       } while (fvals_bits);
4928     }
4929   }
4930   return kPglRetSuccess;
4931 }
4932 
4933 // if aux1b_het_present is true, aux1b_hets becomes a 1-bit-per-sample bitarray
4934 // with the positions of altx/alty hets in aux1b.
GetAux1bHets(const unsigned char * fread_end,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_10_ct,const unsigned char ** fread_pp,uintptr_t * __restrict aux1b_hets,uint32_t * __restrict aux1b_het_presentp,uint32_t * __restrict deltalist_workspace)4935 PglErr GetAux1bHets(const unsigned char* fread_end, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict aux1b_hets, uint32_t* __restrict aux1b_het_presentp, uint32_t* __restrict deltalist_workspace) {
4936   if (aux1b_mode == 15) {
4937     *aux1b_het_presentp = 0;
4938     return kPglRetSuccess;
4939   }
4940   uintptr_t detect_hom_mask_lo;
4941   const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
4942   const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
4943   const uint32_t code10_width = 1U << code10_logwidth;
4944   const uint32_t allele_code_width = 1U << allele_code_logwidth;
4945   const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
4946   const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
4947   Halfword* aux1b_hets_alias = R_CAST(Halfword*, aux1b_hets);
4948   uint32_t rare10_lowbits = kBitsPerWord;
4949   uint32_t aux1b_het_present = 0;
4950   if (!aux1b_mode) {
4951 #ifdef __arm__
4952 #  error "Unaligned accesses in GetAux1bHets()."
4953 #endif
4954     const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
4955     const uint32_t fset_byte_ct = DivUp(raw_10_ct, 8);
4956     const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
4957     const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
4958     *fread_pp += fset_byte_ct;
4959     uintptr_t sample_hwidx = 0;
4960     uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
4961     const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
4962     const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, CHAR_BIT);
4963     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
4964       return kPglRetMalformedInput;
4965     }
4966     const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
4967     uintptr_t fvals_bits = 0;
4968     uint32_t fvals_widx = 0;
4969     uint32_t loop_len = kBitsPerWord;
4970     for (uint32_t fset_widx = 0; ; ++fset_widx) {
4971       uintptr_t fset_bits;
4972       if (fset_widx >= fset_word_ct_m1) {
4973         if (fset_widx > fset_word_ct_m1) {
4974           break;
4975         }
4976         fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
4977         loop_len = ModNz(raw_10_ct, kBitsPerWord);
4978       } else {
4979         fset_bits = patch_10_fsetw[fset_widx];
4980       }
4981       for (uint32_t uii = 0; uii != loop_len; ++uii) {
4982         while (!cur_raw_genoarr_xys) {
4983           cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
4984         }
4985         if (fset_bits & 1) {
4986           if (rare10_lowbits == kBitsPerWord) {
4987             if (fvals_widx == fvals_word_ct_m1) {
4988               fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4989             } else {
4990               fvals_bits = patch_10_fvalsw[fvals_widx];
4991             }
4992             // allele_ct == 3: just invert raw fvals_bits
4993             // allele_ct > 3: shift by allele_code_width, xor with self so that
4994             // 0 == hom, detect nonzero by inverting the usual check
4995             if (allele_ct == 3) {
4996               fvals_bits = ~fvals_bits;
4997             } else {
4998               fvals_bits = fvals_bits ^ (fvals_bits << allele_code_width);
4999               // conveniently, removing a ~ here is equivalent to inverting the
5000               // relevant bits of the final result
5001               fvals_bits = detect_hom_mask_lo & ((fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)) >> (code10_width - 1));
5002             }
5003             // bzhi only relevant for detecting if there are any hets at all
5004             if (!aux1b_het_present) {
5005               if (fvals_widx == fvals_word_ct_m1) {
5006                 fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct * code10_width, kBitsPerWord));
5007               }
5008               if (fvals_bits) {
5009                 // lazy-initialize
5010                 aux1b_het_present = 1;
5011                 ZeroHwArr(2 * BitCtToWordCt(raw_sample_ct), aux1b_hets_alias);
5012               }
5013             }
5014             ++fvals_widx;
5015             rare10_lowbits = 0;
5016           }
5017           if (fvals_bits & (k1LU << rare10_lowbits)) {
5018             const uint32_t bit_idx = ctzw(cur_raw_genoarr_xys) / 2;
5019             aux1b_hets_alias[sample_hwidx] |= 1U << bit_idx;
5020           }
5021           rare10_lowbits += code10_width;
5022         }
5023         cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5024         fset_bits = fset_bits >> 1;
5025       }
5026     }
5027     *aux1b_het_presentp = aux1b_het_present;
5028     return kPglRetSuccess;
5029   }
5030   // aux1b_mode == 1
5031   uint32_t rare10_ct;
5032   PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
5033   if (unlikely(reterr)) {
5034     return reterr;
5035   }
5036   const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
5037   const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
5038   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
5039     return kPglRetMalformedInput;
5040   }
5041   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
5042   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
5043     uintptr_t fvals_bits;
5044     if (fvals_widx >= fvals_word_ct_m1) {
5045       if (fvals_widx > fvals_word_ct_m1) {
5046         break;
5047       }
5048       fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5049     } else {
5050       fvals_bits = patch_10_fvalsw[fvals_widx];
5051     }
5052     if (allele_ct == 3) {
5053       fvals_bits = ~fvals_bits;
5054     } else {
5055       fvals_bits = fvals_bits ^ (fvals_bits << allele_code_width);
5056       fvals_bits = detect_hom_mask_lo & ((fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)) >> (code10_width - 1));
5057     }
5058     if (fvals_widx == fvals_word_ct_m1) {
5059       fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
5060     }
5061     if (!fvals_bits) {
5062       continue;
5063     }
5064     if (!aux1b_het_present) {
5065       aux1b_het_present = 1;
5066       ZeroHwArr(2 * BitCtToWordCt(raw_sample_ct), aux1b_hets_alias);
5067     }
5068     const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
5069     do {
5070       const uint32_t bit_idx = ctzw(fvals_bits);
5071       const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
5072       aux1b_hets_alias[sample_uidx / kBitsPerWordD2] |= 1U << (sample_uidx % kBitsPerWordD2);
5073       fvals_bits &= fvals_bits - 1;
5074     } while (fvals_bits);
5075   }
5076   *aux1b_het_presentp = aux1b_het_present;
5077   return kPglRetSuccess;
5078 }
5079 
Get1Multiallelic(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,const unsigned char ** fread_endp,uintptr_t * __restrict all_hets,uintptr_t * __restrict allele_countvec,uintptr_t ** subsetted_10hetp)5080 PglErr Get1Multiallelic(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict all_hets, uintptr_t* __restrict allele_countvec, uintptr_t** subsetted_10hetp) {
5081   // sample_ct > 0; either allele_idx > 1 or ((allele_idx == 1) &&
5082   // multiallelic_hc_present)
5083   // subsetted_10het assumed to be initialized to nullptr, if present at all
5084   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
5085   const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
5086   uintptr_t* raw_genovec = pgrp->workspace_vec;
5087   const unsigned char* fread_ptr;
5088   const unsigned char* fread_end;
5089   PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
5090   if (unlikely(reterr)) {
5091     return reterr;
5092   }
5093 
5094   const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
5095   const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
5096   CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, allele_countvec);
5097   ZeroTrailingNyps(raw_sample_ct, raw_genovec);
5098   if (fread_pp) {
5099     *fread_endp = fread_end;
5100     if (all_hets) {
5101       PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
5102     }
5103   }
5104   if (allele_idx != 1) {
5105     GenovecNonmissingToZeroUnsafe(sample_ct, allele_countvec);
5106     if (!multiallelic_hc_present) {
5107       if (fread_pp) {
5108         *fread_pp = fread_ptr;
5109       }
5110       return kPglRetSuccess;
5111     }
5112   }
5113   const uint32_t aux1_first_byte = *fread_ptr++;
5114   const uint32_t aux1a_mode = aux1_first_byte & 15;
5115   const uint32_t aux1b_mode = aux1_first_byte >> 4;
5116   // only need to initialize these in dense modes
5117   uint32_t raw_01_ct = 0;
5118   uint32_t raw_10_ct = 0;
5119   if ((!aux1a_mode) || (!aux1b_mode)) {
5120     GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
5121   }
5122 
5123   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
5124   const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
5125   if (!subsetting_required) {
5126     sample_include = nullptr;
5127   }
5128   // allele_idx == 1 case:
5129   //   allele_countvec currently contains ALT counts; we want to reduce them to
5130   //   ALT1 counts.  This can be done with the following steps:
5131   //   1. For every element of patch_01_fset, reduce the value from 1 to 0.  We
5132   //      don't actually need to look at patch_01_fvals.
5133   //   2. For every element of patch_10_fset, reduce the value from 2 depending
5134   //      on the low bit(s) of the patch_01_fvals entry (reduce to 0 unless low
5135   //      bit(s) are all zero).
5136   // allele_idx > 1 case:
5137   //   1. For every element of patch_01_fset, set a 1 for each matching value
5138   //      of patch_01_fvals.
5139   //   2. For every element of patch_10_fset, set a 1 for each het-matching
5140   //      value of patch_10_fvals, and a 2 for each hom-match.
5141   uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
5142   // Two cases:
5143   // - If allele_idx == 1, convert all aux1a entries from 01 to 00.
5144   // - Otherwise, for each matching aux1a entry, convert from 00 to 01.
5145   reterr = GenoarrAux1aUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, allele_idx, 1, raw_01_ct, &fread_ptr, allele_countvec, deltalist_workspace);
5146   if (unlikely(reterr)) {
5147     return reterr;
5148   }
5149   const unsigned char* aux1b_start = fread_ptr;
5150   reterr = GenoarrAux1bStandardUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, allele_idx, raw_10_ct, &fread_ptr, allele_countvec, deltalist_workspace);
5151   if ((!fread_pp) || reterr) {
5152     return reterr;
5153   }
5154   *fread_pp = fread_ptr;
5155   if (all_hets) {
5156     // can merge this with GenovecAux1bStandardUpdate if this is ever a
5157     // significant bottleneck
5158     uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
5159     uint32_t aux1b_het_present;
5160     reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &aux1b_start, aux1b_hets, &aux1b_het_present, deltalist_workspace);
5161     if (unlikely(reterr)) {
5162       return reterr;
5163     }
5164     if (aux1b_het_present) {
5165       BitvecOr(aux1b_hets, BitCtToWordCt(raw_sample_ct), all_hets);
5166       if (!sample_include) {
5167         *subsetted_10hetp = aux1b_hets;
5168       } else {
5169         // Don't need raw_genovec any more.
5170         CopyBitarrSubset(aux1b_hets, sample_include, sample_ct, raw_genovec);
5171         *subsetted_10hetp = raw_genovec;
5172       }
5173     }
5174   }
5175   return kPglRetSuccess;
5176 }
5177 
IMPLPgrGet1(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReaderMain * pgrp,uintptr_t * __restrict allele_countvec)5178 PglErr IMPLPgrGet1(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_countvec) {
5179   if (!sample_ct) {
5180     return kPglRetSuccess;
5181   }
5182   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
5183   const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
5184   if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
5185     PglErr reterr = ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, allele_countvec);
5186     if (unlikely(reterr)) {
5187       return reterr;
5188     }
5189     if (!allele_idx) {
5190       GenovecInvertUnsafe(sample_ct, allele_countvec);
5191     }
5192     return kPglRetSuccess;
5193   }
5194   return Get1Multiallelic(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, nullptr, nullptr, nullptr, allele_countvec, nullptr);
5195 }
5196 
IMPLPgrGetInv1(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReaderMain * pgrp,uintptr_t * __restrict allele_invcountvec)5197 PglErr IMPLPgrGetInv1(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec) {
5198   if (!sample_ct) {
5199     return kPglRetSuccess;
5200   }
5201   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
5202   const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
5203   if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
5204     PglErr reterr = ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, allele_invcountvec);
5205     if (unlikely(reterr)) {
5206       return reterr;
5207     }
5208     if (allele_idx) {
5209       GenovecInvertUnsafe(sample_ct, allele_invcountvec);
5210     }
5211     return kPglRetSuccess;
5212   }
5213   PglErr reterr = Get1Multiallelic(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, nullptr, nullptr, nullptr, allele_invcountvec, nullptr);
5214   GenovecInvertUnsafe(sample_ct, allele_invcountvec);
5215   return reterr;
5216 }
5217 
// Re-expresses the hardcalls in genovec relative to the allele pair
// (allele_idx0, allele_idx1).  Callers must guarantee
// allele_idx0 < allele_idx1 and allele_idx0 < 2.
//
// Provided no multiallelic hardcalls are present, the result encodes
// 0 = 0/0, 1 = 0/1, 2 = 1/1, and 3 = anything else, where "0" and "1" now
// refer to allele_idx0 and allele_idx1 respectively.
void Rotate2(uint32_t allele_idx0, uint32_t allele_idx1, uint32_t sample_ct, uintptr_t* genovec) {
  if (allele_idx0) {
    // allele_idx0 == 1 under the stated preconditions.
    GenovecInvertThenNonzeroToMissingUnsafe(sample_ct, genovec);
    return;
  }
  // allele_idx0 == 0: the (0, 1) pair needs no change at all.
  if (allele_idx1 > 1) {
    GenovecNonzeroToMissingUnsafe(sample_ct, genovec);
  }
}
5230 
SkipAux1a(const unsigned char * fread_end,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_01_ct,const unsigned char ** fread_pp)5231 PglErr SkipAux1a(const unsigned char* fread_end, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_01_ct, const unsigned char** fread_pp) {
5232   if (aux1a_mode == 15) {
5233     return kPglRetSuccess;
5234   }
5235   uint32_t rare01_ct;
5236   if (!aux1a_mode) {
5237     const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
5238     rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
5239     *fread_pp += fset_byte_ct;
5240   } else {
5241     const unsigned char* group_info_iter;
5242     PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare01_ct);
5243     if (unlikely(reterr)) {
5244       return reterr;
5245     }
5246     reterr = SkipDeltalistIds(fread_end, group_info_iter, rare01_ct, raw_sample_ct, 0, fread_pp);
5247     if (unlikely(reterr)) {
5248       return reterr;
5249     }
5250   }
5251   const uint32_t fvals_byte_ct = GetAux1aAlleleEntryByteCt(allele_ct, rare01_ct);
5252   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
5253     return kPglRetMalformedInput;
5254   }
5255   return kPglRetSuccess;
5256 }
5257 
5258 // sample_include assumed to be nullptr if no subsetting required
GenoarrAux1bUpdate2(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t allele_idx0,uint32_t allele_idx1,uint32_t raw_10_ct,const unsigned char ** fread_pp,uintptr_t * __restrict target_genoarr,uint32_t * __restrict deltalist_workspace)5259 PglErr GenoarrAux1bUpdate2(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx0, uint32_t allele_idx1, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict target_genoarr, uint32_t* __restrict deltalist_workspace) {
5260   // Possible aux1b updates:
5261   // - allele_idx0 == 0:
5262   //     allele_idx1 == 1: all altx/alty including a rarealt from 10 to 11
5263   //     allele_idx1 > 1: set one rarealtx/rarealtx from 11 to 10
5264   //
5265   // - allele_idx0 == 1: change all alt1/rarealtx from 00 to 01,
5266   //   rarealtx/rarealtx from 00 to 10, and all other aux1b entries to missing.
5267   //   This can use the same driver as Get1Multiallelic.
5268   //
5269   // - allele_idx0 > 1: change all rarealtx/rarealtx from missing to 00,
5270   //   rarealtx/rarealty to 01, and rarealty/rarealty to 10.
5271   if (aux1b_mode == 15) {
5272     return kPglRetSuccess;
5273   }
5274   if (allele_idx0 == 1) {
5275     return GenoarrAux1bStandardUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genoarr, aux1b_mode, raw_sample_ct, allele_ct, allele_idx1, raw_10_ct, fread_pp, target_genoarr, deltalist_workspace);
5276   }
5277   uintptr_t detect_hom_mask_lo;
5278   const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
5279   const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
5280   const uint32_t code10_width = 1U << code10_logwidth;
5281   const uintptr_t detect_hom_mask_hi = detect_hom_mask_lo << (code10_width - 1);
5282   uintptr_t xor_word2 = allele_idx1 - 1;
5283   // fortunately, this sequence of operations happens to work for allele_ct ==
5284   // 3
5285   xor_word2 = xor_word2 | (xor_word2 << (code10_width / 2));
5286   xor_word2 = xor_word2 * detect_hom_mask_lo;
5287   uint32_t rare10_lowbits = kBitsPerWord;
5288   if (!aux1b_mode) {
5289 #ifdef __arm__
5290 #  error "Unaligned accesses in GenoarrAux1bUpdate2()."
5291 #endif
5292     const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
5293     const uint32_t fset_byte_ct = DivUp(raw_10_ct, 8);
5294     const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
5295     const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
5296     *fread_pp += fset_byte_ct;
5297     uintptr_t sample_hwidx = 0;
5298     uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
5299     const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
5300     const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, CHAR_BIT);
5301     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
5302       return kPglRetMalformedInput;
5303     }
5304     const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
5305     uintptr_t fvals_bits = 0;
5306     uint32_t fvals_widx = 0;
5307     uint32_t loop_len = kBitsPerWord;
5308     if (!allele_idx0) {
5309       for (uint32_t fset_widx = 0; ; ++fset_widx) {
5310         uintptr_t fset_bits;
5311         if (fset_widx >= fset_word_ct_m1) {
5312           if (fset_widx > fset_word_ct_m1) {
5313             return kPglRetSuccess;
5314           }
5315           fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
5316           loop_len = ModNz(raw_10_ct, kBitsPerWord);
5317         } else {
5318           fset_bits = patch_10_fsetw[fset_widx];
5319         }
5320         if (!sample_include) {
5321           if (allele_idx1 == 1) {
5322             // All aux1b 10 -> 11.  Ignore aux1b_fvals.
5323             for (uint32_t uii = 0; uii != loop_len; ++uii) {
5324               while (!cur_raw_genoarr_xys) {
5325                 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5326               }
5327               if (fset_bits & 1) {
5328                 const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
5329                 target_genoarr[sample_hwidx] ^= lowbit;
5330               }
5331               cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5332               fset_bits = fset_bits >> 1;
5333             }
5334           } else {
5335             // hom-altx 11 -> 10.
5336             for (uint32_t uii = 0; uii != loop_len; ++uii) {
5337               while (!cur_raw_genoarr_xys) {
5338                 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5339               }
5340               if (fset_bits & 1) {
5341                 if (rare10_lowbits == kBitsPerWord) {
5342                   if (fvals_widx == fvals_word_ct_m1) {
5343                     fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5344                   } else {
5345                     fvals_bits = patch_10_fvalsw[fvals_widx];
5346                   }
5347                   fvals_bits = fvals_bits ^ xor_word2;
5348                   fvals_bits = (detect_hom_mask_hi & (~(fvals_bits | ((fvals_bits | detect_hom_mask_hi) - detect_hom_mask_lo)))) >> (code10_width - 1);
5349                   // unnecessary to apply bzhi here
5350                   ++fvals_widx;
5351                   rare10_lowbits = 0;
5352                 }
5353                 if (fvals_bits & (k1LU << rare10_lowbits)) {
5354                   const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
5355                   target_genoarr[sample_hwidx] ^= lowbit;
5356                 }
5357                 rare10_lowbits += code10_width;
5358               }
5359               cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5360               fset_bits = fset_bits >> 1;
5361             }
5362           }
5363         } else {
5364           // sample_include non-null
5365           if (allele_idx1 == 1) {
5366             for (uint32_t uii = 0; uii != loop_len; ++uii) {
5367               while (!cur_raw_genoarr_xys) {
5368                 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5369               }
5370               if (fset_bits & 1) {
5371                 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
5372                 if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
5373                   const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
5374                   target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << (2 * (sample_idx % kBitsPerWordD2));
5375                 }
5376                 rare10_lowbits += code10_width;
5377               }
5378               cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5379               fset_bits = fset_bits >> 1;
5380             }
5381           } else {
5382             for (uint32_t uii = 0; uii != loop_len; ++uii) {
5383               while (!cur_raw_genoarr_xys) {
5384                 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5385               }
5386               if (fset_bits & 1) {
5387                 if (rare10_lowbits == kBitsPerWord) {
5388                   if (fvals_widx == fvals_word_ct_m1) {
5389                     fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5390                   } else {
5391                     fvals_bits = patch_10_fvalsw[fvals_widx];
5392                   }
5393                   fvals_bits = fvals_bits ^ xor_word2;
5394                   fvals_bits = (detect_hom_mask_hi & (~(fvals_bits | ((fvals_bits | detect_hom_mask_hi) - detect_hom_mask_lo)))) >> (code10_width - 1);
5395                   // unnecessary to apply bzhi here
5396                   ++fvals_widx;
5397                   rare10_lowbits = 0;
5398                 }
5399                 if (fvals_bits & (k1LU << rare10_lowbits)) {
5400                   const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
5401                   if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
5402                     const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
5403                     target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << (2 * (sample_idx % kBitsPerWordD2));
5404                   }
5405                 }
5406                 rare10_lowbits += code10_width;
5407               }
5408               cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5409               fset_bits = fset_bits >> 1;
5410             }
5411           }
5412         }
5413       }
5414     }
5415     // 2 <= allele_idx0 < allele_idx1 (so allele_ct > 3 guaranteed)
5416     uintptr_t xor_word1 = allele_idx1 - 1;
5417     uintptr_t xor_word0 = allele_idx0 - 1;
5418     xor_word1 = xor_word0 | (xor_word1 << (code10_width / 2));
5419     xor_word0 = xor_word0 | (xor_word0 << (code10_width / 2));
5420     xor_word1 *= detect_hom_mask_lo;
5421     xor_word0 *= detect_hom_mask_lo;
5422     for (uint32_t fset_widx = 0; ; ++fset_widx) {
5423       uintptr_t fset_bits;
5424       if (fset_widx >= fset_word_ct_m1) {
5425         if (fset_widx > fset_word_ct_m1) {
5426           return kPglRetSuccess;
5427         }
5428         fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
5429         loop_len = ModNz(raw_10_ct, kBitsPerWord);
5430       } else {
5431         fset_bits = patch_10_fsetw[fset_widx];
5432       }
5433       if (!sample_include) {
5434         for (uint32_t uii = 0; uii != loop_len; ++uii) {
5435           while (!cur_raw_genoarr_xys) {
5436             cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5437           }
5438           if (fset_bits & 1) {
5439             if (rare10_lowbits == kBitsPerWord) {
5440               if (fvals_widx == fvals_word_ct_m1) {
5441                 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5442               } else {
5443                 fvals_bits = patch_10_fvalsw[fvals_widx];
5444               }
5445               uintptr_t match0 = fvals_bits ^ xor_word0;
5446               uintptr_t match1 = fvals_bits ^ xor_word1;
5447               uintptr_t match2 = fvals_bits ^ xor_word2;
5448               match0 = detect_hom_mask_hi & (~(match0 | ((match0 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5449               match1 = detect_hom_mask_hi & (~(match1 | ((match1 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5450               match2 = detect_hom_mask_hi & (~(match2 | ((match2 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5451               // Now want match0 -> 11, match1 -> 10, and match2 -> 01.
5452               fvals_bits = ((match0 | match1) >> (code10_width - 2)) | ((match0 | match2) >> (code10_width - 1));
5453               // unnecessary to apply bzhi here
5454               ++fvals_widx;
5455               rare10_lowbits = 0;
5456             }
5457             const uintptr_t xor_val = (fvals_bits >> rare10_lowbits) & 3;
5458             if (xor_val) {
5459               const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
5460               target_genoarr[sample_hwidx] ^= lowbit * xor_val;
5461             }
5462             rare10_lowbits += code10_width;
5463           }
5464           cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5465           fset_bits = fset_bits >> 1;
5466         }
5467       } else {
5468         // sample_include non-null
5469         for (uint32_t uii = 0; uii != loop_len; ++uii) {
5470           while (!cur_raw_genoarr_xys) {
5471             cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5472           }
5473           if (fset_bits & 1) {
5474             if (rare10_lowbits == kBitsPerWord) {
5475               if (fvals_widx == fvals_word_ct_m1) {
5476                 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5477               } else {
5478                 fvals_bits = patch_10_fvalsw[fvals_widx];
5479               }
5480               uintptr_t match0 = fvals_bits ^ xor_word0;
5481               uintptr_t match1 = fvals_bits ^ xor_word1;
5482               uintptr_t match2 = fvals_bits ^ xor_word2;
5483               match0 = detect_hom_mask_hi & (~(match0 | ((match0 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5484               match1 = detect_hom_mask_hi & (~(match1 | ((match1 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5485               match2 = detect_hom_mask_hi & (~(match2 | ((match2 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5486               fvals_bits = ((match0 | match1) >> (code10_width - 2)) | ((match0 | match2) >> (code10_width - 1));
5487               // unnecessary to apply bzhi here
5488               ++fvals_widx;
5489               rare10_lowbits = 0;
5490             }
5491             const uintptr_t xor_val = (fvals_bits >> rare10_lowbits) & 3;
5492             if (xor_val) {
5493               const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
5494               if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
5495                 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
5496                 target_genoarr[sample_idx / kBitsPerWordD2] ^= xor_val << (2 * (sample_idx % kBitsPerWordD2));
5497               }
5498             }
5499             rare10_lowbits += code10_width;
5500           }
5501           cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5502           fset_bits = fset_bits >> 1;
5503         }
5504       }
5505     }
5506   }
5507   // aux1b_mode == 1
5508   uint32_t rare10_ct;
5509   PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
5510   if (unlikely(reterr)) {
5511     return reterr;
5512   }
5513   const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
5514   const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
5515   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
5516     return kPglRetMalformedInput;
5517   }
5518   if (allele_idx1 == 1) {
5519     if (!sample_include) {
5520       for (uint32_t rare10_idx = 0; rare10_idx != rare10_ct; ++rare10_idx) {
5521         const uint32_t sample_uidx = deltalist_workspace[rare10_idx];
5522         target_genoarr[sample_uidx / kBitsPerWordD2] ^= k1LU << (2 * (sample_uidx % kBitsPerWordD2));
5523       }
5524       return kPglRetSuccess;
5525     }
5526     for (uint32_t rare10_idx = 0; rare10_idx != rare10_ct; ++rare10_idx) {
5527       const uint32_t sample_uidx = deltalist_workspace[rare10_idx];
5528       const uint32_t sample_widx = sample_uidx / kBitsPerWord;
5529       const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
5530       const uintptr_t sample_include_word = sample_include[sample_widx];
5531       if (sample_include_word & lowbit) {
5532         const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
5533         target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << (2 * (sample_idx % kBitsPerWordD2));
5534       }
5535     }
5536     return kPglRetSuccess;
5537   }
5538   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
5539   if (!allele_idx0) {
5540     for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
5541       uintptr_t fvals_bits;
5542       if (fvals_widx >= fvals_word_ct_m1) {
5543         if (fvals_widx > fvals_word_ct_m1) {
5544           return kPglRetSuccess;
5545         }
5546         fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5547       } else {
5548         fvals_bits = patch_10_fvalsw[fvals_widx];
5549       }
5550       fvals_bits = fvals_bits ^ xor_word2;
5551       fvals_bits = detect_hom_mask_hi & (~(fvals_bits | ((fvals_bits | detect_hom_mask_hi) - detect_hom_mask_lo)));
5552       if (fvals_widx == fvals_word_ct_m1) {
5553         fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
5554       }
5555       if (!fvals_bits) {
5556         continue;
5557       }
5558       const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
5559       if (!sample_include) {
5560         do {
5561           const uint32_t bit_idx = ctzw(fvals_bits);
5562           const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
5563           target_genoarr[sample_uidx / kBitsPerWordD2] ^= k1LU << (2 * (sample_uidx % kBitsPerWordD2));
5564           fvals_bits &= fvals_bits - 1;
5565         } while (fvals_bits);
5566       } else {
5567         do {
5568           const uint32_t bit_idx = ctzw(fvals_bits);
5569           const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
5570           const uint32_t sample_widx = sample_uidx / kBitsPerWord;
5571           const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
5572           const uintptr_t sample_include_word = sample_include[sample_widx];
5573           if (sample_include_word & lowbit) {
5574             const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
5575             target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << (2 * (sample_idx % kBitsPerWordD2));
5576           }
5577           fvals_bits &= fvals_bits - 1;
5578         } while (fvals_bits);
5579       }
5580     }
5581   }
5582   // 2 <= allele_idx0 < allele_idx1
5583   uintptr_t xor_word1 = allele_idx1 - 1;
5584   uintptr_t xor_word0 = allele_idx0 - 1;
5585   xor_word1 = xor_word0 | (xor_word1 << (code10_width / 2));
5586   xor_word0 = xor_word0 | (xor_word0 << (code10_width / 2));
5587   xor_word1 *= detect_hom_mask_lo;
5588   xor_word0 *= detect_hom_mask_lo;
5589   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
5590     uintptr_t fvals_bits;
5591     if (fvals_widx >= fvals_word_ct_m1) {
5592       if (fvals_widx > fvals_word_ct_m1) {
5593         return kPglRetSuccess;
5594       }
5595       fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5596     } else {
5597       fvals_bits = patch_10_fvalsw[fvals_widx];
5598     }
5599     uintptr_t match0 = fvals_bits ^ xor_word0;
5600     uintptr_t match1 = fvals_bits ^ xor_word1;
5601     uintptr_t match2 = fvals_bits ^ xor_word2;
5602     match0 = detect_hom_mask_hi & (~(match0 | ((match0 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5603     match1 = detect_hom_mask_hi & (~(match1 | ((match1 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5604     match2 = detect_hom_mask_hi & (~(match2 | ((match2 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5605     // since code10_width >= 4, we can use match0 == 3 (mod 4), match1 == 2
5606     // (mod 4), match2 == 1 (mod 4) representation.
5607     fvals_bits = (match0 >> (code10_width - 4)) | (match1 >> (code10_width - 3)) | (match2 >> (code10_width - 2));
5608     if (fvals_widx == fvals_word_ct_m1) {
5609       fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
5610     }
5611     if (!fvals_bits) {
5612       continue;
5613     }
5614     const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
5615     if (!sample_include) {
5616       do {
5617         const uintptr_t bit_idx = ctzw(fvals_bits);
5618         const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
5619         target_genoarr[sample_uidx / kBitsPerWordD2] ^= (bit_idx & 3) << (2 * (sample_uidx % kBitsPerWordD2));
5620         fvals_bits &= fvals_bits - 1;
5621       } while (fvals_bits);
5622     } else {
5623       do {
5624         const uintptr_t bit_idx = ctzw(fvals_bits);
5625         const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
5626         const uint32_t sample_widx = sample_uidx / kBitsPerWord;
5627         const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
5628         const uintptr_t sample_include_word = sample_include[sample_widx];
5629         if (sample_include_word & lowbit) {
5630           const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
5631           target_genoarr[sample_idx / kBitsPerWordD2] ^= (bit_idx & 3) << (2 * (sample_idx % kBitsPerWordD2));
5632         }
5633         fvals_bits &= fvals_bits - 1;
5634       } while (fvals_bits);
5635     }
5636   }
5637 }
5638 
IMPLPgrGet2(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx0,uint32_t allele_idx1,PgenReaderMain * pgrp,uintptr_t * __restrict genovec)5639 PglErr IMPLPgrGet2(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReaderMain* pgrp, uintptr_t* __restrict genovec) {
5640   assert(allele_idx0 != allele_idx1);
5641   if (!sample_ct) {
5642     return kPglRetSuccess;
5643   }
5644   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
5645   const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
5646   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
5647   const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
5648   if (!multiallelic_hc_present) {
5649     if ((allele_idx0 > 1) && (allele_idx1 > 1)) {
5650       // Trivial all-missing case.
5651       SetAllBits(2 * sample_ct, genovec);
5652       return kPglRetSuccess;
5653     }
5654     PglErr reterr = ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
5655     if (unlikely(reterr)) {
5656       return reterr;
5657     }
5658     if (allele_idx0 < allele_idx1) {
5659       Rotate2(allele_idx0, allele_idx1, sample_ct, genovec);
5660       return kPglRetSuccess;
5661     }
5662     if (allele_idx0 == 1) {
5663       GenovecInvertUnsafe(sample_ct, genovec);
5664       return kPglRetSuccess;
5665     }
5666     if (!allele_idx1) {
5667       GenovecNonzeroToMissingThenInvertUnsafe(sample_ct, genovec);
5668       return kPglRetSuccess;
5669     }
5670     GenovecNontwoToMissingUnsafe(sample_ct, genovec);
5671     return kPglRetSuccess;
5672   }
5673   uintptr_t* raw_genovec = pgrp->workspace_vec;
5674   const unsigned char* fread_ptr;
5675   const unsigned char* fread_end;
5676   PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
5677   if (unlikely(reterr)) {
5678     return reterr;
5679   }
5680   ZeroTrailingNyps(raw_sample_ct, raw_genovec);
5681 
5682   uint32_t invert = 0;
5683   if (allele_idx0 > allele_idx1) {
5684     const uint32_t swap = allele_idx0;
5685     allele_idx0 = allele_idx1;
5686     allele_idx1 = swap;
5687     invert = 1;
5688   }
5689   if (allele_idx0 > 1) {
5690     SetAllBits(2 * sample_ct, genovec);
5691   } else {
5692     CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
5693     Rotate2(allele_idx0, allele_idx1, sample_ct, genovec);
5694   }
5695   const uint32_t aux1_first_byte = *fread_ptr++;
5696   const uint32_t aux1a_mode = aux1_first_byte & 15;
5697   const uint32_t aux1b_mode = aux1_first_byte >> 4;
5698   uint32_t raw_01_ct = 0;
5699   uint32_t raw_10_ct = 0;
5700   if ((!aux1a_mode) || (!aux1b_mode)) {
5701     GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
5702   }
5703   if (!subsetting_required) {
5704     sample_include = nullptr;
5705   }
5706   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
5707   const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
5708   uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
5709   if (!allele_idx0) {
5710     // Two cases:
5711     // - If allele_idx == 1, convert all aux1a entries from 01 to 11.
5712     // - Otherwise, for each matching aux1a entry, convert from 11 to 01.
5713     reterr = GenoarrAux1aUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, allele_idx1, 2, raw_01_ct, &fread_ptr, genovec, deltalist_workspace);
5714   } else {
5715     reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr);
5716   }
5717   if (unlikely(reterr)) {
5718     return reterr;
5719   }
5720   reterr = GenoarrAux1bUpdate2(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, allele_idx0, allele_idx1, raw_10_ct, &fread_ptr, genovec, deltalist_workspace);
5721   if (unlikely(reterr)) {
5722     return reterr;
5723   }
5724   if (invert) {
5725     GenovecInvertUnsafe(sample_ct, genovec);
5726   }
5727   return kPglRetSuccess;
5728 }
5729 
PreinitPgv(PgenVariant * pgvp)5730 void PreinitPgv(PgenVariant* pgvp) {
5731   pgvp->genovec = nullptr;
5732   pgvp->patch_01_set = nullptr;
5733   pgvp->patch_01_vals = nullptr;
5734   pgvp->patch_10_set = nullptr;
5735   pgvp->patch_10_vals = nullptr;
5736   pgvp->phasepresent = nullptr;
5737   pgvp->phaseinfo = nullptr;
5738   pgvp->dosage_present = nullptr;
5739   pgvp->dosage_main = nullptr;
5740   pgvp->multidosage_present = nullptr;
5741   pgvp->multidosage_cts = nullptr;
5742   pgvp->multidosage_codes = nullptr;
5743   pgvp->multidosage_vals = nullptr;
5744   pgvp->dphase_present = nullptr;
5745   pgvp->dphase_delta = nullptr;
5746   pgvp->multidphase_present = nullptr;
5747   pgvp->multidphase_cts = nullptr;
5748   pgvp->multidphase_codes = nullptr;
5749   pgvp->multidphase_delta = nullptr;
5750 
5751   pgvp->patch_01_ct = 0;
5752   pgvp->patch_10_ct = 0;
5753   pgvp->phasepresent_ct = 0;
5754   pgvp->dosage_ct = 0;
5755   pgvp->multidosage_sample_ct = 0;
5756   pgvp->dphase_ct = 0;
5757   pgvp->multidphase_sample_ct = 0;
5758 }
5759 
5760 // similar to ParseAndSaveDifflist()
ParseAndSaveDeltalistAsBitarr(const unsigned char * fread_end,uint32_t raw_sample_ct,const unsigned char ** fread_pp,uintptr_t * deltalist_include,uint32_t * deltalist_len_ptr)5761 PglErr ParseAndSaveDeltalistAsBitarr(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* deltalist_include, uint32_t* deltalist_len_ptr) {
5762   const unsigned char* group_info_iter;
5763   PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, deltalist_len_ptr);
5764   const uint32_t deltalist_len = *deltalist_len_ptr;
5765   if (reterr || (!deltalist_len)) {
5766     return reterr;
5767   }
5768   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
5769   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
5770   const uint32_t group_idx_last = (deltalist_len - 1) / kPglDifflistGroupSize;
5771   ZeroWArr(raw_sample_ctl, deltalist_include);
5772   uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
5773   for (uint32_t group_idx = 0; ; ++group_idx) {
5774     if (group_idx >= group_idx_last) {
5775       if (group_idx > group_idx_last) {
5776         return kPglRetSuccess;
5777       }
5778       group_len_m1 &= deltalist_len - 1;
5779     }
5780     uintptr_t raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
5781     group_info_iter = &(group_info_iter[sample_id_byte_ct]);
5782     for (uint32_t raw_deltalist_idx_lowbits = 0; ; ++raw_deltalist_idx_lowbits) {
5783       // always check, otherwise we may scribble over arbitrary memory
5784       if (unlikely(raw_sample_idx >= raw_sample_ct)) {
5785         return kPglRetMalformedInput;
5786       }
5787       SetBit(raw_sample_idx, deltalist_include);
5788       if (raw_deltalist_idx_lowbits == group_len_m1) {
5789         break;
5790       }
5791       raw_sample_idx += GetVint31(fread_end, fread_pp);
5792     }
5793   }
5794 }
5795 
5796 // These functions do not overread, but may write extra bytes up to the word
5797 // boundary.
Expand2bitTo8(const void * __restrict bytearr,uint32_t input_nyp_ct,uint32_t incr,uintptr_t * __restrict dst)5798 void Expand2bitTo8(const void* __restrict bytearr, uint32_t input_nyp_ct, uint32_t incr, uintptr_t* __restrict dst) {
5799   const unsigned char* src_iter = S_CAST(const unsigned char*, bytearr);
5800   const uint32_t input_byte_ct = DivUp(input_nyp_ct, 4);
5801 #ifdef __arm__
5802 #  error "Unaligned accesses in Expand2bitTo8()."
5803 #endif
5804 #ifdef __LP64__
5805   const uint32_t input_vec_ct = input_byte_ct / kBytesPerVec;
5806   unsigned char* dst_iter = R_CAST(unsigned char*, dst);
5807   if (input_vec_ct) {
5808     const VecW mincr = R_CAST(VecW, vecuc_set1(incr));
5809     const VecW m03 = VCONST_W(kMask0303);
5810     for (uint32_t vec_idx = 0; vec_idx != input_vec_ct; ++vec_idx) {
5811       VecW cur_vec = vecw_loadu(src_iter);
5812       src_iter = &(src_iter[kBytesPerVec]);
5813 #  ifdef USE_AVX2
5814       // (todo: benchmark against just reading 8 bytes at a time and
5815       // broadcasting.)
5816       // midswapped_vec contains {0-1-2-3, 4-5-6-7, ..., 12-13-14-15,
5817       //                          32-33-34-35, ..., 44-45-46-47,
5818       //                          16-17-18-19, ..., 28-29-30-31,
5819       //                          48-49-50-51, ..., 60-61-62-63,
5820       //                          64-65-66-67, ..., 76-77-78-79,
5821       //                          96-97-98-99, ..., 108-109-110-111,
5822       //                          80-81-82-83, ..., 92-93-94-95,
5823       //                          112-113-114-115, ..., 124-125-126-127}
5824       // 0xd8: {0, 2, 1, 3}
5825       const __m256i midswapped_vec = _mm256_shuffle_epi32(R_CAST(__m256i, cur_vec), 0xd8);
5826       // This operation is also used in FillInterleavedMaskVec().
5827       // cur_vec now contains {0-1-2-3, 4-5-6-7, 8-9-10-11, 12-13-14-15,
5828       //                       32-33-34-35, ..., 44-45-46-47,
5829       //                       64-65-66-67, ..., 76-77-78-79,
5830       //                       96-97-98-99, ..., 108-109-110-111,
5831       //                       16-17-18-19, ..., 28-29-30-31,
5832       //                       48-49-50-51, ..., 60-61-62-63,
5833       //                       80-81-82-83, ..., 92-93-94-95,
5834       //                       112-113-114-115, ..., 124-125-126-127}
5835       cur_vec = vecw_permute0xd8_if_avx2(R_CAST(VecW, midswapped_vec));
5836 #  endif
5837       // AVX2:
5838       //   vec_even contains {0-1, 4-5, 8-9, 12-13, 32-33, ..., 44-45,
5839       //                      64-65, ..., 76-77, 96-97, ..., 108-109,
5840       //                      16-17, ..., 28-29, 48-49, ..., 60-61,
5841       //                      80-81, ..., 92-93, 112-113, ..., 124-125}
5842       //   vec_odd contains {2-3, 6-7, 10-11, 14-15, 34-35, ..., 46-47,
5843       //                     66-67, ..., 78-79, 98-99, ..., 110-111,
5844       //                     18-19, ..., 30-31, 50-51, ..., 62-63,
5845       //                     82-83, ..., 94-95, 114-115, ..., 126-127}
5846       // SSE2:
5847       //   vec_even contains {0-1, 4-5, 8-9, ..., 60-61}
5848       //   vec_odd contains {2-3, 6-7, 10-11, ..., 62-63}
5849       const VecW vec_even = cur_vec;
5850       const VecW vec_odd = vecw_srli(cur_vec, 4);
5851 
5852       // AVX2:
5853       //   vec01 contains {0-1, 2-3, 4-5, ..., 14-15, 32-33, ..., 46-47,
5854       //                   16-17, ..., 30-31, 48-49, ..., 62-63}
5855       //   vec23 contains {64-65, 66-67, ..., 78-79, 96-97, ..., 110-111,
5856       //                   80-81, ..., 94-95, 112-113, ..., 126-127}
5857       // SSE2:
5858       //   vec01 contains {0-1, 2-3, 4-5, 6-7, ..., 30-31}
5859       //   vec23 contains {32-33, 34-35, 36-37, 38-39, ..., 62-63}
5860       const VecW vec01 = vecw_unpacklo8(vec_even, vec_odd);
5861       const VecW vec23 = vecw_unpackhi8(vec_even, vec_odd);
5862 
5863       // AVX2:
5864       //   vec01_even contains {0, 2, 4, ..., 14, 32, 34, ..., 46,
5865       //                        16, 18, ..., 30, 48, 50, ..., 62}
5866       //   vec01_odd contains {1, 3, 5, ..., 15, 33, 35, ..., 47,
5867       //                       17, 19, ..., 31, 49, 51, ..., 63}
5868       // SSE2:
5869       //   vec01_even contains {0, 2, 4, 6, ..., 30}
5870       //   vec01_odd contains {1, 3, 5, 7, ..., 31}
5871       const VecW vec01_even = vec01 & m03;
5872       const VecW vec01_odd = vecw_srli(vec01, 2) & m03;
5873 
5874       // AVX2:
5875       //   vecw_unpacklo8() contains {0, 1, ..., 15, 16, ..., 31}
5876       //   vecw_unpachhi8() contains {32, 33, ..., 47, 48, ..., 63}
5877       // SSE2:
5878       //   vecw_unpacklo8() contains {0, 1, ..., 15}
5879       //   vecw_unpachhi8() contains {16, 17, ..., 31}
5880       vecw_storeu(dst_iter, mincr + vecw_unpacklo8(vec01_even, vec01_odd));
5881       dst_iter = &(dst_iter[kBytesPerVec]);
5882       vecw_storeu(dst_iter, mincr + vecw_unpackhi8(vec01_even, vec01_odd));
5883       dst_iter = &(dst_iter[kBytesPerVec]);
5884       const VecW vec23_odd = vecw_srli(vec23, 2) & m03;
5885       const VecW vec23_even = vec23 & m03;
5886       vecw_storeu(dst_iter, mincr + vecw_unpacklo8(vec23_even, vec23_odd));
5887       dst_iter = &(dst_iter[kBytesPerVec]);
5888       vecw_storeu(dst_iter, mincr + vecw_unpackhi8(vec23_even, vec23_odd));
5889       dst_iter = &(dst_iter[kBytesPerVec]);
5890     }
5891   }
5892   const uint32_t remainder = input_byte_ct % kBytesPerVec;
5893   if (remainder) {
5894     const uint32_t full_qw_ct = remainder / sizeof(Quarterword);
5895     const Quarterword* src_alias = R_CAST(const Quarterword*, src_iter);
5896     const uintptr_t incr_word = kMask0101 * incr;
5897     uintptr_t* dstw = R_CAST(uintptr_t*, dst_iter);
5898     for (uint32_t uii = 0; uii != full_qw_ct; ++uii) {
5899       const uintptr_t cur_2byte = src_alias[uii];
5900       dstw[uii] = incr_word + Unpack0303(cur_2byte);
5901     }
5902     if (input_byte_ct % 2) {
5903       uintptr_t cur_byte = src_iter[remainder - 1];
5904 #  ifdef USE_AVX2
5905       cur_byte = _pdep_u64(cur_byte, kMask0303);
5906 #  else
5907       cur_byte = cur_byte | (cur_byte << 12);
5908       cur_byte = (cur_byte | (cur_byte << 6)) & kMask0303;
5909 #  endif
5910       dstw[full_qw_ct] = incr_word + cur_byte;
5911     }
5912   }
5913 #else  // !__LP64__
5914   const Quarterword* src_alias = R_CAST(const Quarterword*, src_iter);
5915   const uintptr_t incr_word = kMask0101 * incr;
5916   uintptr_t* dstw = R_CAST(uintptr_t*, dst);
5917   for (uint32_t uii = 0; uii != input_byte_ct; ++uii) {
5918     const uintptr_t cur_2byte = src_alias[uii];
5919     dstw[uii] = incr_word + Unpack0303(cur_2byte);
5920   }
5921 #endif
5922 }
5923 
Expand4bitTo8(const void * __restrict bytearr,uint32_t input_nybble_ct,uint32_t incr,uintptr_t * __restrict dst)5924 void Expand4bitTo8(const void* __restrict bytearr, uint32_t input_nybble_ct, uint32_t incr, uintptr_t* __restrict dst) {
5925   const unsigned char* src_iter = R_CAST(const unsigned char*, bytearr);
5926   const uint32_t input_byte_ct = DivUp(input_nybble_ct, 2);
5927 #ifdef __LP64__
5928   const uint32_t input_vec_ct = input_byte_ct / kBytesPerVec;
5929   unsigned char* dst_iter = R_CAST(unsigned char*, dst);
5930   if (input_vec_ct) {
5931     const VecW mincr = R_CAST(VecW, vecuc_set1(incr));
5932     const VecW m4 = VCONST_W(kMask0F0F);
5933     for (uint32_t vec_idx = 0; vec_idx != input_vec_ct; ++vec_idx) {
5934       VecW cur_vec = vecw_loadu(src_iter);
5935       src_iter = &(src_iter[kBytesPerVec]);
5936       cur_vec = vecw_permute0xd8_if_avx2(cur_vec);
5937       // AVX2:
5938       //   vec_even contains {0, 2, 4, ..., 14, 32, 34, ..., 46,
5939       //                      16, 18, ..., 30, 48, ... 62}
5940       //   vec_odd contains {1, 3, 5, ..., 15, 33, 35, ..., 47,
5941       //                     17, 19, ..., 31, 49, ..., 63}
5942       // SSE2:
5943       //   vec_even contains {0, 2, 4, ..., 30}
5944       //   vec_odd contains {1, 3, 5, ..., 31}
5945       const VecW vec_even = cur_vec & m4;
5946       const VecW vec_odd = vecw_srli(cur_vec, 4) & m4;
5947 
5948       // AVX2:
5949       //   vec_lo contains {0, 1, ..., 31}
5950       //   vec_hi contains {32, 33, ..., 63}
5951       // SSE2:
5952       //   vec_lo contains {0, 1, 2, ..., 15}
5953       //   vec_hi contains {16, 17, 18, ..., 31}
5954       const VecW vec_lo = vecw_unpacklo8(vec_even, vec_odd);
5955       const VecW vec_hi = vecw_unpackhi8(vec_even, vec_odd);
5956       vecw_storeu(dst_iter, mincr + vec_lo);
5957       dst_iter = &(dst_iter[kBytesPerVec]);
5958       vecw_storeu(dst_iter, mincr + vec_hi);
5959       dst_iter = &(dst_iter[kBytesPerVec]);
5960     }
5961   }
5962   const uint32_t remainder = input_byte_ct % kBytesPerVec;
5963   if (remainder) {
5964     const Halfword* src_alias = R_CAST(const Halfword*, src_iter);
5965     uintptr_t incr_word = kMask0101 * incr;
5966     const uint32_t hw_ct_m1 = (remainder - 1) / sizeof(Halfword);
5967     uintptr_t* dstw = R_CAST(uintptr_t*, dst_iter);
5968     for (uint32_t hwidx = 0; ; ++hwidx) {
5969       uint32_t cur_4byte;
5970       if (hwidx >= hw_ct_m1) {
5971         if (hwidx > hw_ct_m1) {
5972           break;
5973         }
5974         cur_4byte = SubU32Load(&(src_alias[hwidx]), ModNz(remainder, 4));
5975       } else {
5976         cur_4byte = src_alias[hwidx];
5977       }
5978       dstw[hwidx] = incr_word + Unpack0F0F(cur_4byte);
5979     }
5980   }
5981 #else
5982   unsigned char* dst_iter = R_CAST(unsigned char*, dst);
5983   for (uint32_t uii = 0; uii < input_byte_ct; ++uii) {
5984     uint32_t cur_byte = src_iter[uii];
5985     *dst_iter++ = (cur_byte & 15) + incr;
5986     *dst_iter++ = (cur_byte >> 4) + incr;
5987   }
5988 #endif
5989 }
5990 
5991 static_assert(sizeof(AlleleCode) == 1, "GetAux1aCodes() must be updated.");
GetAux1aCodes(const unsigned char * fread_end,uint32_t rare01_ct,uint32_t allele_ct,const unsigned char ** fread_pp,AlleleCode * __restrict patch_01_vals)5992 PglErr GetAux1aCodes(const unsigned char* fread_end, uint32_t rare01_ct, uint32_t allele_ct, const unsigned char** fread_pp, AlleleCode* __restrict patch_01_vals) {
5993   if (allele_ct == 3) {
5994     memset(patch_01_vals, 2, rare01_ct);
5995     return kPglRetSuccess;
5996   }
5997   const unsigned char* patch_01_fvals = *fread_pp;
5998   if (allele_ct == 4) {
5999     const uint32_t patch_01_fvals_byte_ct = DivUp(rare01_ct, CHAR_BIT);
6000     if (PtrAddCk(fread_end, patch_01_fvals_byte_ct, fread_pp)) {
6001       return kPglRetMalformedInput;
6002     }
6003     Expand1bitTo8(patch_01_fvals, rare01_ct, 2, R_CAST(uintptr_t*, patch_01_vals));
6004     return kPglRetSuccess;
6005   }
6006   if (allele_ct < 7) {
6007     const uint32_t patch_01_fvals_byte_ct = DivUp(rare01_ct, 4);
6008     if (PtrAddCk(fread_end, patch_01_fvals_byte_ct, fread_pp)) {
6009       return kPglRetMalformedInput;
6010     }
6011     Expand2bitTo8(patch_01_fvals, rare01_ct, 2, R_CAST(uintptr_t*, patch_01_vals));
6012     return kPglRetSuccess;
6013   }
6014   if (allele_ct < 19) {
6015     const uint32_t patch_01_fvals_byte_ct = DivUp(rare01_ct, 2);
6016     if (PtrAddCk(fread_end, patch_01_fvals_byte_ct, fread_pp)) {
6017       return kPglRetMalformedInput;
6018     }
6019     Expand4bitTo8(patch_01_fvals, rare01_ct, 2, R_CAST(uintptr_t*, patch_01_vals));
6020     return kPglRetSuccess;
6021   }
6022   if (PtrAddCk(fread_end, rare01_ct, fread_pp)) {
6023     return kPglRetMalformedInput;
6024   }
6025   // todo: verify the compiler recognizes this
6026   for (uint32_t uii = 0; uii < rare01_ct; ++uii) {
6027     patch_01_vals[uii] = patch_01_fvals[uii] + 2;
6028   }
6029   return kPglRetSuccess;
6030 }
6031 
6032 // Assumes aux1a_mode != 15.
ExportAux1a(const unsigned char * fread_end,const uintptr_t * __restrict raw_genoarr,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_01_ct,const unsigned char ** fread_pp,uintptr_t * __restrict patch_01_set,AlleleCode * __restrict patch_01_vals,uint32_t * __restrict rare01_ctp)6033 PglErr ExportAux1a(const unsigned char* fread_end, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_01_ct, const unsigned char** fread_pp, uintptr_t* __restrict patch_01_set, AlleleCode* __restrict patch_01_vals, uint32_t* __restrict rare01_ctp) {
6034   uint32_t rare01_ct;
6035   if (!aux1a_mode) {
6036     const unsigned char* patch_01_fset = *fread_pp;
6037     const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
6038     if (PtrAddCk(fread_end, fset_byte_ct, fread_pp)) {
6039       return kPglRetMalformedInput;
6040     }
6041     rare01_ct = PopcountBytes(patch_01_fset, fset_byte_ct);
6042     ExpandBytearrFromGenoarr(patch_01_fset, raw_genoarr, kMask5555, NypCtToWordCt(raw_sample_ct), raw_01_ct, 0, patch_01_set);
6043   } else {
6044     if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, fread_pp, patch_01_set, &rare01_ct))) {
6045       return kPglRetMalformedInput;
6046     }
6047   }
6048   *rare01_ctp = rare01_ct;
6049   return GetAux1aCodes(fread_end, rare01_ct, allele_ct, fread_pp, patch_01_vals);
6050 }
6051 
ExportAux1aProperSubset(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,const uintptr_t * __restrict raw_genoarr,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t sample_ct,uint32_t allele_ct,uint32_t raw_01_ct,const unsigned char ** fread_pp,uintptr_t * __restrict dst_01_set,AlleleCode * __restrict dst_01_vals,uint32_t * __restrict rare01_ctp,uint32_t * __restrict deltalist_workspace)6052 PglErr ExportAux1aProperSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t allele_ct, uint32_t raw_01_ct, const unsigned char** fread_pp, uintptr_t* __restrict dst_01_set, AlleleCode* __restrict dst_01_vals, uint32_t* __restrict rare01_ctp, uint32_t* __restrict deltalist_workspace) {
6053   const uint32_t allele_code_width = GetAux1aWidth(allele_ct);
6054   const uintptr_t allele_code_mask = (1U << allele_code_width) - 1;
6055   memset(dst_01_set, 0, BitCtToWordCt(sample_ct) * sizeof(intptr_t));
6056   AlleleCode* dst_01_vals_iter = dst_01_vals;
6057   if (!aux1a_mode) {
6058 #ifdef __arm__
6059 #  error "Unaligned accesses in ExportAux1aProperSubset()."
6060 #endif
6061     // similar to GenoarrAux1aUpdate()
6062     const uintptr_t* patch_01_fsetw = R_CAST(const uintptr_t*, *fread_pp);
6063     const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
6064     const uint32_t rare01_ct = PopcountBytes(patch_01_fsetw, fset_byte_ct);
6065     if (PtrAddCk(fread_end, fset_byte_ct, fread_pp)) {
6066       return kPglRetMalformedInput;
6067     }
6068     const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
6069     uintptr_t sample_hwidx = 0;
6070     uintptr_t cur_raw_genoarr_hets = Word01(raw_genoarr[0]);
6071     uint32_t loop_len = kBitsPerWord;
6072     const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
6073     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
6074       return kPglRetMalformedInput;
6075     }
6076     const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
6077     const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
6078     uintptr_t fvals_bits = 0;
6079     uint32_t fvals_widx = 0;
6080     uint32_t rare01_lowbits = kBitsPerWord;
6081     for (uint32_t fset_widx = 0; ; ++fset_widx) {
6082       uintptr_t fset_bits;
6083       if (fset_widx >= fset_word_ct_m1) {
6084         if (fset_widx > fset_word_ct_m1) {
6085           break;
6086         }
6087         fset_bits = SubwordLoad(&(patch_01_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
6088         loop_len = ModNz(raw_01_ct, kBitsPerWord);
6089       } else {
6090         fset_bits = patch_01_fsetw[fset_widx];
6091       }
6092       if (allele_ct == 3) {
6093         for (uint32_t uii = 0; uii != loop_len; ++uii) {
6094           while (!cur_raw_genoarr_hets) {
6095             cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
6096           }
6097           if (fset_bits & 1) {
6098             const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
6099             if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
6100               const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
6101               SetBit(sample_idx, dst_01_set);
6102               *dst_01_vals_iter++ = 2;
6103             }
6104           }
6105           cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
6106           fset_bits = fset_bits >> 1;
6107         }
6108       } else {
6109         for (uint32_t uii = 0; uii != loop_len; ++uii) {
6110           while (!cur_raw_genoarr_hets) {
6111             cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
6112           }
6113           if (fset_bits & 1) {
6114             if (rare01_lowbits == kBitsPerWord) {
6115               if (fvals_widx == fvals_word_ct_m1) {
6116                 fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
6117               } else {
6118                 fvals_bits = patch_01_fvalsw[fvals_widx];
6119               }
6120               // unnecessary to apply bzhi here
6121               ++fvals_widx;
6122               rare01_lowbits = 0;
6123             }
6124             const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
6125             if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
6126               const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
6127               SetBit(sample_idx, dst_01_set);
6128               *dst_01_vals_iter++ = 2 + ((fvals_bits >> rare01_lowbits) & allele_code_mask);
6129             }
6130             rare01_lowbits += allele_code_width;
6131           }
6132           cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
6133           fset_bits = fset_bits >> 1;
6134         }
6135       }
6136     }
6137     *rare01_ctp = dst_01_vals_iter - dst_01_vals;
6138     return kPglRetSuccess;
6139   }
6140   // aux1a_mode == 1
6141   uint32_t rare01_ct;
6142   PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare01_ct);
6143   if (unlikely(reterr)) {
6144     return reterr;
6145   }
6146   const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
6147   const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
6148   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
6149     return kPglRetMalformedInput;
6150   }
6151   if (allele_ct == 3) {
6152     for (uint32_t rare01_idx = 0; rare01_idx != rare01_ct; ++rare01_idx) {
6153       const uint32_t sample_uidx = deltalist_workspace[rare01_idx];
6154       // could wrap this boilerplate
6155       const uint32_t sample_widx = sample_uidx / kBitsPerWord;
6156       const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
6157       const uintptr_t sample_include_word = sample_include[sample_widx];
6158       if (sample_include_word & lowbit) {
6159         const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
6160         SetBit(sample_idx, dst_01_set);
6161         *dst_01_vals_iter++ = 2;
6162       }
6163     }
6164     *rare01_ctp = dst_01_vals_iter - dst_01_vals;
6165     return kPglRetSuccess;
6166   }
6167   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
6168   const uint32_t allele_code_logwidth = ctzu32(allele_code_width);
6169   uint32_t loop_len = kBitsPerWord >> allele_code_logwidth;
6170   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
6171     uintptr_t fvals_bits;
6172     if (fvals_widx >= fvals_word_ct_m1) {
6173       if (fvals_widx > fvals_word_ct_m1) {
6174         break;
6175       }
6176       fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
6177       loop_len = 1 + ((rare01_ct - 1) & (loop_len - 1));
6178     } else {
6179       fvals_bits = patch_01_fvalsw[fvals_widx];
6180     }
6181     const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - allele_code_logwidth)]);
6182     for (uint32_t uii = 0; uii != loop_len; ++uii) {
6183       const uint32_t sample_uidx = cur_deltalist_base[uii];
6184       const uint32_t sample_widx = sample_uidx / kBitsPerWord;
6185       const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
6186       const uintptr_t sample_include_word = sample_include[sample_widx];
6187       if (sample_include_word & lowbit) {
6188         const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
6189         SetBit(sample_idx, dst_01_set);
6190         *dst_01_vals_iter++ = 2 + ((fvals_bits >> (uii << allele_code_logwidth)) & allele_code_mask);
6191       }
6192     }
6193   }
6194   *rare01_ctp = dst_01_vals_iter - dst_01_vals;
6195   return kPglRetSuccess;
6196 }
6197 
6198 static_assert(sizeof(AlleleCode) == 1, "GetAux1bCodes() must be updated.");
GetAux1bCodes(const unsigned char * fread_end,uint32_t rare10_ct,uint32_t allele_ct,const unsigned char ** fread_pp,AlleleCode * __restrict patch_10_vals)6199 PglErr GetAux1bCodes(const unsigned char* fread_end, uint32_t rare10_ct, uint32_t allele_ct, const unsigned char** fread_pp, AlleleCode* __restrict patch_10_vals) {
6200   const unsigned char* patch_10_fvals = *fread_pp;
6201   if (allele_ct == 3) {
6202     // 1 bit, distinguishes between 0x0201 and 0x0202
6203     const uint32_t patch_10_fvals_byte_ct = DivUp(rare10_ct, CHAR_BIT);
6204     if (PtrAddCk(fread_end, patch_10_fvals_byte_ct, fread_pp)) {
6205       return kPglRetMalformedInput;
6206     }
6207     Expand1bitTo16(patch_10_fvals, rare10_ct, 0x0201, R_CAST(uintptr_t*, patch_10_vals));
6208     return kPglRetSuccess;
6209   }
6210   const uint32_t rare10_ct_x2 = rare10_ct * 2;
6211   if (allele_ct < 6) {
6212     // 2+2 bits, add 1
6213     const uint32_t patch_10_fvals_byte_ct = DivUp(rare10_ct, 2);
6214     if (PtrAddCk(fread_end, patch_10_fvals_byte_ct, fread_pp)) {
6215       return kPglRetMalformedInput;
6216     }
6217     Expand2bitTo8(patch_10_fvals, rare10_ct_x2, 1, R_CAST(uintptr_t*, patch_10_vals));
6218     return kPglRetSuccess;
6219   }
6220   if (allele_ct < 18) {
6221     // 4+4 bits
6222     if (PtrAddCk(fread_end, rare10_ct, fread_pp)) {
6223       return kPglRetMalformedInput;
6224     }
6225     Expand4bitTo8(patch_10_fvals, rare10_ct_x2, 1, R_CAST(uintptr_t*, patch_10_vals));
6226     return kPglRetSuccess;
6227   }
6228   if (PtrAddCk(fread_end, rare10_ct_x2, fread_pp)) {
6229     return kPglRetMalformedInput;
6230   }
6231   // todo: verify the compiler recognizes this
6232   for (uint32_t uii = 0; uii < rare10_ct_x2; ++uii) {
6233     patch_10_vals[uii] = patch_10_fvals[uii] + 1;
6234   }
6235   return kPglRetSuccess;
6236 }
6237 
6238 // Assumes aux1b_mode != 15.
ExportAux1b(const unsigned char * fread_end,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_10_ct,const unsigned char ** fread_pp,uintptr_t * __restrict patch_10_set,AlleleCode * __restrict patch_10_vals,uint32_t * __restrict rare10_ctp)6239 PglErr ExportAux1b(const unsigned char* fread_end, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict patch_10_set, AlleleCode* __restrict patch_10_vals, uint32_t* __restrict rare10_ctp) {
6240   uint32_t rare10_ct;
6241   if (!aux1b_mode) {
6242     const unsigned char* patch_10_fset = *fread_pp;
6243     const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
6244     if (PtrAddCk(fread_end, fset_byte_ct, fread_pp)) {
6245       return kPglRetMalformedInput;
6246     }
6247     rare10_ct = PopcountBytes(patch_10_fset, fset_byte_ct);
6248     ExpandBytearrFromGenoarr(patch_10_fset, raw_genoarr, kMaskAAAA, NypCtToWordCt(raw_sample_ct), raw_10_ct, 0, patch_10_set);
6249   } else {
6250     if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, fread_pp, patch_10_set, &rare10_ct))) {
6251       return kPglRetMalformedInput;
6252     }
6253   }
6254   *rare10_ctp = rare10_ct;
6255   return GetAux1bCodes(fread_end, rare10_ct, allele_ct, fread_pp, patch_10_vals);
6256 }
6257 
ExportAux1bProperSubset(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t sample_ct,uint32_t allele_ct,uint32_t raw_10_ct,const unsigned char ** fread_pp,uintptr_t * __restrict dst_10_set,AlleleCode * __restrict dst_10_vals,uint32_t * __restrict rare10_ctp,uint32_t * __restrict deltalist_workspace)6258 PglErr ExportAux1bProperSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict dst_10_set, AlleleCode* __restrict dst_10_vals, uint32_t* __restrict rare10_ctp, uint32_t* __restrict deltalist_workspace) {
6259   uintptr_t detect_hom_mask_lo;  // unused
6260   const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
6261   const uint32_t allele_code_width = 1U << allele_code_logwidth;
6262   const uintptr_t allele_code_mask = (1U << allele_code_width) - 1;
6263   const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
6264   const uint32_t code10_width = 1U << code10_logwidth;
6265   memset(dst_10_set, 0, BitCtToWordCt(sample_ct) * sizeof(intptr_t));
6266   AlleleCode* dst_10_vals_iter = dst_10_vals;
6267   if (!aux1b_mode) {
6268 #ifdef __arm__
6269 #  error "Unaligned accesses in ExportAux1bProperSubset()."
6270 #endif
6271     const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
6272     const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
6273     const uint32_t rare10_ct = PopcountBytes(patch_10_fsetw, fset_byte_ct);
6274     if (PtrAddCk(fread_end, fset_byte_ct, fread_pp)) {
6275       return kPglRetMalformedInput;
6276     }
6277     const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
6278     uintptr_t sample_hwidx = 0;
6279     uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
6280     uint32_t loop_len = kBitsPerWord;
6281     const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, 8);
6282     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
6283       return kPglRetMalformedInput;
6284     }
6285     const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
6286     const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
6287     uintptr_t fvals_bits = 0;
6288     uint32_t fvals_widx = 0;
6289     uint32_t rare10_lowbits = kBitsPerWord;
6290     for (uint32_t fset_widx = 0; ; ++fset_widx) {
6291       uintptr_t fset_bits;
6292       if (fset_widx >= fset_word_ct_m1) {
6293         if (fset_widx > fset_word_ct_m1) {
6294           break;
6295         }
6296         fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
6297         loop_len = ModNz(raw_10_ct, kBitsPerWord);
6298       } else {
6299         fset_bits = patch_10_fsetw[fset_widx];
6300       }
6301       if (allele_ct == 3) {
6302         for (uint32_t uii = 0; uii != loop_len; ++uii) {
6303           while (!cur_raw_genoarr_xys) {
6304             cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
6305           }
6306           if (fset_bits & 1) {
6307             if (rare10_lowbits == kBitsPerWord) {
6308               if (fvals_widx == fvals_word_ct_m1) {
6309                 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
6310               } else {
6311                 fvals_bits = patch_10_fvalsw[fvals_widx];
6312               }
6313               // unnecessary to apply bzhi here
6314               ++fvals_widx;
6315               rare10_lowbits = 0;
6316             }
6317             const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
6318             if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
6319               const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
6320               SetBit(sample_idx, dst_10_set);
6321               *dst_10_vals_iter++ = 1 + ((fvals_bits >> rare10_lowbits) & 1);
6322               *dst_10_vals_iter++ = 2;
6323             }
6324             ++rare10_lowbits;
6325           }
6326           cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
6327           fset_bits = fset_bits >> 1;
6328         }
6329       } else {
6330         for (uint32_t uii = 0; uii != loop_len; ++uii) {
6331           while (!cur_raw_genoarr_xys) {
6332             cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
6333           }
6334           if (fset_bits & 1) {
6335             if (rare10_lowbits == kBitsPerWord) {
6336               if (fvals_widx == fvals_word_ct_m1) {
6337                 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
6338               } else {
6339                 fvals_bits = patch_10_fvalsw[fvals_widx];
6340               }
6341               // unnecessary to apply bzhi here
6342               ++fvals_widx;
6343               rare10_lowbits = 0;
6344             }
6345             const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
6346             if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
6347               const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
6348               SetBit(sample_idx, dst_10_set);
6349               const uintptr_t cur_code_pair = fvals_bits >> rare10_lowbits;
6350               const uint32_t cur_code_hi = (cur_code_pair >> allele_code_width) & allele_code_mask;
6351               const uint32_t cur_code_lo = cur_code_pair & allele_code_mask;
6352               *dst_10_vals_iter++ = 1 + cur_code_lo;
6353               *dst_10_vals_iter++ = 1 + cur_code_hi;
6354             }
6355             rare10_lowbits += code10_width;
6356           }
6357           cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
6358           fset_bits = fset_bits >> 1;
6359         }
6360       }
6361     }
6362     *rare10_ctp = S_CAST(uintptr_t, dst_10_vals_iter - dst_10_vals) / 2;
6363     return kPglRetSuccess;
6364   }
6365   // aux1b_mode == 1
6366   uint32_t rare10_ct;
6367   PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
6368   if (unlikely(reterr)) {
6369     return reterr;
6370   }
6371   const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
6372   const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, 8);
6373   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
6374     return kPglRetMalformedInput;
6375   }
6376   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
6377   uint32_t loop_len = kBitsPerWord >> code10_logwidth;
6378   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
6379     uintptr_t fvals_bits;
6380     if (fvals_widx >= fvals_word_ct_m1) {
6381       if (fvals_widx > fvals_word_ct_m1) {
6382         break;
6383       }
6384       fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
6385       loop_len = 1 + ((rare10_ct - 1) & (loop_len - 1));
6386     } else {
6387       fvals_bits = patch_10_fvalsw[fvals_widx];
6388     }
6389     const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
6390     if (allele_ct == 3) {
6391       for (uint32_t uii = 0; uii != loop_len; ++uii) {
6392         const uint32_t sample_uidx = cur_deltalist_base[uii];
6393         const uint32_t sample_widx = sample_uidx / kBitsPerWord;
6394         const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
6395         const uintptr_t sample_include_word = sample_include[sample_widx];
6396         if (sample_include_word & lowbit) {
6397           const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
6398           SetBit(sample_idx, dst_10_set);
6399           *dst_10_vals_iter++ = 1 + ((fvals_bits >> uii) & 1);
6400           *dst_10_vals_iter++ = 2;
6401         }
6402       }
6403     } else {
6404       for (uint32_t uii = 0; uii != loop_len; ++uii) {
6405         const uint32_t sample_uidx = cur_deltalist_base[uii];
6406         const uint32_t sample_widx = sample_uidx / kBitsPerWord;
6407         const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
6408         const uintptr_t sample_include_word = sample_include[sample_widx];
6409         if (sample_include_word & lowbit) {
6410           const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
6411           SetBit(sample_idx, dst_10_set);
6412           const uintptr_t cur_code_pair = fvals_bits >> (uii << code10_logwidth);
6413           const uint32_t cur_code_hi = (cur_code_pair >> allele_code_width) & allele_code_mask;
6414           const uint32_t cur_code_lo = cur_code_pair & allele_code_mask;
6415           *dst_10_vals_iter++ = 1 + cur_code_lo;
6416           *dst_10_vals_iter++ = 1 + cur_code_hi;
6417         }
6418       }
6419     }
6420   }
6421   *rare10_ctp = S_CAST(uintptr_t, dst_10_vals_iter - dst_10_vals) / 2;
6422   return kPglRetSuccess;
6423 }
6424 
6425 // Assumes sample_ct > 0, multiallelic-hc track is present, and patch_01_ct and
6426 // patch_10_ct are zero-initialized.
GetMultiallelicCodes(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,const unsigned char ** fread_endp,uintptr_t * __restrict all_hets,PgenVariant * pgvp)6427 PglErr GetMultiallelicCodes(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict all_hets, PgenVariant* pgvp) {
6428   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
6429   uint32_t subsetting_required = (sample_ct != raw_sample_ct);
6430   uintptr_t* raw_genovec = pgrp->workspace_vec;
6431   const unsigned char* fread_ptr;
6432   const unsigned char* fread_end;
6433   PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
6434   if (unlikely(reterr)) {
6435     return reterr;
6436   }
6437   CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, pgvp->genovec);
6438   ZeroTrailingNyps(raw_sample_ct, raw_genovec);
6439   const uint32_t aux1_first_byte = *fread_ptr++;
6440   const uint32_t aux1a_mode = aux1_first_byte & 15;
6441   const uint32_t aux1b_mode = aux1_first_byte >> 4;
6442   uint32_t raw_01_ct = 0;
6443   uint32_t raw_10_ct = 0;
6444   if ((!aux1a_mode) || (!aux1b_mode)) {
6445     GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
6446   }
6447   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
6448   const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
6449   uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
6450   if (aux1a_mode != 15) {
6451     if (!subsetting_required) {
6452       reterr = ExportAux1a(fread_end, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr, pgvp->patch_01_set, pgvp->patch_01_vals, &(pgvp->patch_01_ct));
6453     } else {
6454       reterr = ExportAux1aProperSubset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1a_mode, raw_sample_ct, sample_ct, allele_ct, raw_01_ct, &fread_ptr, pgvp->patch_01_set, pgvp->patch_01_vals, &(pgvp->patch_01_ct), deltalist_workspace);
6455     }
6456     if (unlikely(reterr)) {
6457       return reterr;
6458     }
6459   }
6460   const unsigned char* aux1b_start = fread_ptr;
6461   if (aux1b_mode != 15) {
6462     if (!subsetting_required) {
6463       reterr = ExportAux1b(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, pgvp->patch_10_set, pgvp->patch_10_vals, &(pgvp->patch_10_ct));
6464     } else {
6465       reterr = ExportAux1bProperSubset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1b_mode, raw_sample_ct, sample_ct, allele_ct, raw_10_ct, &fread_ptr, pgvp->patch_10_set, pgvp->patch_10_vals, &(pgvp->patch_10_ct), deltalist_workspace);
6466     }
6467     if (unlikely(reterr)) {
6468       return reterr;
6469     }
6470   }
6471   if (fread_pp) {
6472     *fread_pp = fread_ptr;
6473     *fread_endp = fread_end;
6474     if (all_hets) {
6475       PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
6476       if (aux1b_mode != 15) {
6477         // can merge this with ExportAux1b functions later
6478         uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
6479         uint32_t aux1b_het_present;
6480         reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &aux1b_start, aux1b_hets, &aux1b_het_present, deltalist_workspace);
6481         if (unlikely(reterr)) {
6482           return reterr;
6483         }
6484         if (aux1b_het_present) {
6485           BitvecOr(aux1b_hets, BitCtToWordCt(raw_sample_ct), all_hets);
6486         }
6487       }
6488     }
6489   }
6490   return kPglRetSuccess;
6491 }
6492 
PgrGetM(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)6493 PglErr PgrGetM(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
6494   pgvp->patch_01_ct = 0;
6495   pgvp->patch_10_ct = 0;
6496   if (!sample_ct) {
6497     return kPglRetSuccess;
6498   }
6499   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
6500   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
6501   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
6502   const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
6503   if (!multiallelic_hc_present) {
6504     return ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, pgvp->genovec);
6505   }
6506   return GetMultiallelicCodes(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, nullptr, pgvp);
6507 }
6508 
// Writes one halfword to all_hets_hw per genoarr word, derived from the
// entries whose low bit is 1 and high bit is 0.
// Requires trailing bits of genoarr to be zeroed out.  Does not update the
// last all_hets[] halfword if raw_sample_ctl2 is odd.
void DetectGenoarrHetsHw(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, Halfword* all_hets_hw) {
  const uintptr_t* genoarr_end = &(genoarr[raw_sample_ctl2]);
  Halfword* write_iter = all_hets_hw;
  for (const uintptr_t* read_iter = genoarr; read_iter != genoarr_end; ++read_iter) {
    const uintptr_t geno_word = *read_iter;
    // low 1, high 0
    const uintptr_t het_lowbits = geno_word & (~(geno_word >> 1));
    *write_iter++ = PackWordToHalfwordMask5555(het_lowbits);
  }
}
6518 
PgrDetectGenoarrHetsMultiallelic(const uintptr_t * __restrict genoarr,const uintptr_t * __restrict patch_10_set,const AlleleCode * __restrict patch_10_vals,uint32_t raw_sample_ct,uintptr_t * __restrict all_hets)6519 void PgrDetectGenoarrHetsMultiallelic(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t raw_sample_ct, uintptr_t* __restrict all_hets) {
6520   const Halfword* patch_10_set_alias = R_CAST(const Halfword*, patch_10_set);
6521   const AlleleCode* patch_10_vals_iter = patch_10_vals;
6522   const uint32_t word_ct_m1 = (raw_sample_ct - 1) / kBitsPerWordD2;
6523   Halfword* all_hets_hw = R_CAST(Halfword*, all_hets);
6524   for (uint32_t widx = 0; ; ++widx) {
6525     uintptr_t cur_geno_word;
6526     if (widx >= word_ct_m1) {
6527       if (widx > word_ct_m1) {
6528         if (widx % 2) {
6529           all_hets_hw[widx] = 0;
6530         }
6531         return;
6532       }
6533       const uint32_t final_ct = ModNz(raw_sample_ct, kBitsPerWordD2);
6534       cur_geno_word = bzhi_max(genoarr[widx], 2 * final_ct);
6535     } else {
6536       cur_geno_word = genoarr[widx];
6537     }
6538     uint32_t patch_10_hw = patch_10_set_alias[widx];
6539     uint32_t cur_hets = Pack01ToHalfword(cur_geno_word);
6540     while (patch_10_hw) {
6541       const AlleleCode code1 = *patch_10_vals_iter++;
6542       const AlleleCode code2 = *patch_10_vals_iter++;
6543       const uint32_t lowbit = patch_10_hw & (-patch_10_hw);
6544       if (code1 != code2) {
6545         cur_hets |= lowbit;
6546       }
6547       patch_10_hw ^= lowbit;
6548     }
6549     all_hets_hw[widx] = cur_hets;
6550   }
6551 }
6552 
SkipAux1b(const unsigned char * fread_end,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_10_ct,const unsigned char ** fread_pp)6553 PglErr SkipAux1b(const unsigned char* fread_end, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp) {
6554   if (aux1b_mode == 15) {
6555     return kPglRetSuccess;
6556   }
6557   uint32_t rare10_ct;
6558   if (!aux1b_mode) {
6559     const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
6560     rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
6561     *fread_pp += fset_byte_ct;
6562   } else {
6563     const unsigned char* group_info_iter;
6564     PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare10_ct);
6565     if (unlikely(reterr)) {
6566       return reterr;
6567     }
6568     reterr = SkipDeltalistIds(fread_end, group_info_iter, rare10_ct, raw_sample_ct, 0, fread_pp);
6569     if (unlikely(reterr)) {
6570       return reterr;
6571     }
6572   }
6573   const uint32_t fvals_byte_ct = GetAux1bAlleleEntryByteCt(allele_ct, rare10_ct);
6574   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
6575     return kPglRetMalformedInput;
6576   }
6577   return kPglRetSuccess;
6578 }
6579 
SkipAux1(const unsigned char * fread_end,const uintptr_t * __restrict raw_genovec,uint32_t raw_sample_ct,uint32_t allele_ct,const unsigned char ** fread_pp)6580 PglErr SkipAux1(const unsigned char* fread_end, const uintptr_t* __restrict raw_genovec, uint32_t raw_sample_ct, uint32_t allele_ct, const unsigned char** fread_pp) {
6581   const uint32_t aux1_first_byte = **fread_pp;
6582   (*fread_pp) += 1;
6583   const uint32_t aux1a_mode = aux1_first_byte & 15;
6584   const uint32_t aux1b_mode = aux1_first_byte >> 4;
6585   uint32_t raw_01_ct = 0;
6586   uint32_t raw_10_ct = 0;
6587   if ((!aux1a_mode) || (!aux1b_mode)) {
6588     GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
6589   }
6590   PglErr reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, fread_pp);
6591   if (unlikely(reterr)) {
6592     return reterr;
6593   }
6594   return SkipAux1b(fread_end, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, fread_pp);
6595 }
6596 
6597 // sample_include assumed to be nullptr if no subsetting required
6598 // subsetted_10het should only be provided when you explicitly want to exclude
6599 // those phase entries
6600 // set phasepresent == phaseinfo == nullptr if you want to skip the entire
6601 // track; ok for phasepresent_ct_ptr to be nullptr too in that case
6602 // (also see SkipAux2() and GetPhasepresentAndSkipPhaseinfo() below)
ParseAux2Subset(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uintptr_t * __restrict all_hets,const uintptr_t * __restrict subsetted_10het,uint32_t raw_sample_ct,uint32_t sample_ct,const unsigned char ** fread_pp,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr,uintptr_t * __restrict workspace_subset)6603 PglErr ParseAux2Subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict all_hets, const uintptr_t* __restrict subsetted_10het, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr, uintptr_t* __restrict workspace_subset) {
6604   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
6605   const uint32_t het_ct = PopcountWords(all_hets, raw_sample_ctl);
6606   if (unlikely(!het_ct)) {
6607     // there shouldn't be a hphase track at all in this case, het_ct is not
6608     // computed off a subset
6609     return kPglRetMalformedInput;
6610   }
6611   const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
6612   const unsigned char* aux2_start = *fread_pp;
6613   if (!(aux2_start[0] & 1)) {
6614     // phase always present
6615     if (PtrAddCk(fread_end, 1 + (het_ct / CHAR_BIT), fread_pp)) {
6616       return kPglRetMalformedInput;
6617     }
6618     if (!phaseinfo) {
6619       // for internal callers which just want to skip aux2
6620       return kPglRetSuccess;
6621     }
6622     if (!sample_include) {
6623       memcpy(phasepresent, all_hets, raw_sample_ctl * kBytesPerWord);
6624       ExpandBytearr(aux2_start, all_hets, raw_sample_ctl, het_ct, 1, phaseinfo);
6625       if (!subsetted_10het) {
6626         *phasepresent_ct_ptr = het_ct;
6627         return kPglRetSuccess;
6628       }
6629     } else {
6630       CopyBitarrSubset(all_hets, sample_include, sample_ct, phasepresent);
6631       if (AllWordsAreZero(phasepresent, sample_ctl)) {
6632         *phasepresent_ct_ptr = 0;
6633         // bugfix (7 Dec 2017): clear sample_ctl words here, not raw_sample_ctl
6634         ZeroWArr(sample_ctl, phaseinfo);
6635         return kPglRetSuccess;
6636       }
6637       ExpandThenSubsetBytearr(aux2_start, all_hets, sample_include, het_ct, sample_ct, 1, phaseinfo);
6638     }
6639     // bugfix (25 Feb 2020): forgot to mask out subsetted_10het here
6640   } else {
6641     const uint32_t het_ctdl = het_ct / kBitsPerWord;
6642 
6643     // explicit phasepresent
6644     const uintptr_t* aux2_first_part = R_CAST(const uintptr_t*, aux2_start);
6645     uintptr_t* aux2_first_part_copy = workspace_subset;
6646     aux2_first_part_copy[het_ctdl] = 0;
6647     memcpy(aux2_first_part_copy, aux2_first_part, 1 + (het_ct / CHAR_BIT));
6648     const uint32_t raw_phasepresent_ct = PopcountWords(aux2_first_part_copy, het_ctdl + 1) - 1;
6649     if (unlikely(!raw_phasepresent_ct)) {
6650       // there shouldn't be a hphase track at all in this case
6651       return kPglRetMalformedInput;
6652     }
6653     const unsigned char* aux2_second_part = &(aux2_start[1 + (het_ct / CHAR_BIT)]);
6654     *fread_pp = aux2_second_part;
6655     if (PtrAddCk(fread_end, DivUp(raw_phasepresent_ct, CHAR_BIT), fread_pp)) {
6656       return kPglRetMalformedInput;
6657     }
6658     if (!phaseinfo) {
6659       return kPglRetSuccess;
6660     }
6661     if (!sample_include) {
6662       ExpandBytearrNested(aux2_second_part, aux2_first_part_copy, all_hets, sample_ctl, raw_phasepresent_ct, 1, phasepresent, phaseinfo);
6663       if (!subsetted_10het) {
6664         *phasepresent_ct_ptr = raw_phasepresent_ct;
6665         return kPglRetSuccess;
6666       }
6667     } else {
6668       // could skip if intersection of phasepresent with sample_include is
6669       // empty, but this function call should be fast enough there anyway?
6670       ExpandThenSubsetBytearrNested(aux2_second_part, aux2_first_part_copy, all_hets, sample_include, sample_ct, raw_phasepresent_ct, 1, phasepresent, phaseinfo);
6671     }
6672   }
6673   if (subsetted_10het) {
6674     BitvecInvmask(subsetted_10het, sample_ctl, phasepresent);
6675   }
6676   *phasepresent_ct_ptr = PopcountWords(phasepresent, sample_ctl);
6677   return kPglRetSuccess;
6678 }
6679 
SkipAux2(const unsigned char * fread_end,uint32_t het_ct,const unsigned char ** fread_pp,uint32_t * __restrict phasepresent_ctp)6680 PglErr SkipAux2(const unsigned char* fread_end, uint32_t het_ct, const unsigned char** fread_pp, uint32_t* __restrict phasepresent_ctp) {
6681   const unsigned char* aux2_start = *fread_pp;
6682   const uint32_t aux2_first_part_byte_ct = 1 + (het_ct / CHAR_BIT);
6683   if (PtrAddCk(fread_end, aux2_first_part_byte_ct, fread_pp)) {
6684     return kPglRetMalformedInput;
6685   }
6686   if (!(aux2_start[0] & 1)) {
6687     if (phasepresent_ctp) {
6688       *phasepresent_ctp = het_ct;
6689     }
6690     return kPglRetSuccess;
6691   }
6692   const uint32_t phasepresent_ct = PopcountBytes(aux2_start, aux2_first_part_byte_ct) - 1;
6693   if (phasepresent_ctp) {
6694     *phasepresent_ctp = phasepresent_ct;
6695   }
6696   if (PtrAddCk(fread_end, DivUp(phasepresent_ct, CHAR_BIT), fread_pp)) {
6697     return kPglRetMalformedInput;
6698   }
6699   return kPglRetSuccess;
6700 }
6701 
6702 // If fread_pp/fread_endp are non-null, this always moves fread_ptr to the end
6703 // of aux2.  Set phasepresent/phaseinfo to nullptr when you don't actually care
6704 // about the contents of aux2.
6705 // In multiallelic case, this guarantees phasepresent bits are only set at
6706 // ref/altx hets, not at altx/alty hets.  (We don't currently guarantee this
6707 // for phaseinfo, since popcounts on that array are meaningless.)  Yes, this is
6708 // mildly annoying, but the code would be messier if the ordering of
6709 // multiallelic-hardcall and hardcall-phase info were swapped.
ReadGenovecHphaseSubsetUnsafe(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,const unsigned char ** fread_endp,uintptr_t * __restrict genovec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * phasepresent_ct_ptr)6710 PglErr ReadGenovecHphaseSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* phasepresent_ct_ptr) {
6711   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
6712   if ((!(vrtype & 0x18)) || ((!fread_pp) && (!VrtypeHphase(vrtype)))) {
6713     *phasepresent_ct_ptr = 0;
6714     return ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, fread_pp, fread_endp, genovec);
6715   }
6716   // Either hphase track is present; or if it's absent, multiallelic track is
6717   // present and we were asked to advance fread_ptr to the end of aux2.
6718   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
6719   const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
6720   uintptr_t* raw_genovec = (subsetting_required || VrtypeMultiallelicHc(vrtype))? pgrp->workspace_vec : genovec;
6721   const unsigned char* fread_ptr;
6722   const unsigned char* fread_end;
6723   PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
6724   if (unlikely(reterr)) {
6725     return reterr;
6726   }
6727   ZeroTrailingNyps(raw_sample_ct, raw_genovec);
6728   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
6729   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
6730   if (raw_genovec != genovec) {
6731     CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
6732     if (!VrtypeHphase(vrtype)) {
6733       // only possible if multiallelic track present and fread_ptr must be
6734       // advanced to end of aux2
6735       *fread_pp = fread_ptr;
6736       *fread_endp = fread_end;
6737       return SkipAux1(fread_end, raw_genovec, raw_sample_ct, allele_ct, fread_pp);
6738     }
6739   }
6740   uintptr_t* all_hets = pgrp->workspace_all_hets;
6741   PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
6742   uintptr_t* subsetted_10het = nullptr;
6743   if (VrtypeMultiallelicHc(vrtype)) {
6744     const uint32_t aux1_first_byte = *fread_ptr++;
6745     const uint32_t aux1a_mode = aux1_first_byte & 15;
6746     const uint32_t aux1b_mode = aux1_first_byte >> 4;
6747     uint32_t raw_01_ct = 0;
6748     uint32_t raw_10_ct = 0;
6749     if ((!aux1a_mode) || (!aux1b_mode)) {
6750       GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
6751     }
6752     reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr);
6753     if (unlikely(reterr)) {
6754       return reterr;
6755     }
6756     // 1. fill workspace_aux1x_present with aux1b
6757     // 2. clear bit for each hom-altx call in aux1b
6758     // 3. bitvec-or to set new workspace_all_hets bits
6759     // 4. if not subsetting, set subsetted_10het := workspace_all_hets
6760     //    if subsetting, copy-subset to pgrp->workspace_vec and set to that
6761     //    if AllWordsAreZero, keep as nullptr
6762     uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
6763     uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
6764     uint32_t aux1b_het_present;
6765     reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, aux1b_hets, &aux1b_het_present, deltalist_workspace);
6766     if (unlikely(reterr)) {
6767       return reterr;
6768     }
6769     if (aux1b_het_present) {
6770       BitvecOr(aux1b_hets, BitCtToWordCt(raw_sample_ct), all_hets);
6771       if (!subsetting_required) {
6772         subsetted_10het = aux1b_hets;
6773       } else {
6774         // Don't need raw_genovec any more.
6775         CopyBitarrSubset(aux1b_hets, sample_include, sample_ct, raw_genovec);
6776         subsetted_10het = raw_genovec;
6777       }
6778     }
6779   }
6780   reterr = ParseAux2Subset(fread_end, subsetting_required? sample_include : nullptr, all_hets, subsetted_10het, raw_sample_ct, sample_ct, &fread_ptr, phasepresent, phaseinfo, phasepresent_ct_ptr, pgrp->workspace_subset);
6781   if (fread_pp) {
6782     *fread_pp = fread_ptr;
6783     *fread_endp = fread_end;
6784   }
6785   return reterr;
6786 }
6787 
PgrGetP(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict genovec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)6788 PglErr PgrGetP(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
6789   if (!sample_ct) {
6790     *phasepresent_ct_ptr = 0;
6791     return kPglRetSuccess;
6792   }
6793   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
6794   assert(vidx < pgrp->fi.raw_variant_ct);
6795   return ReadGenovecHphaseSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, vidx, pgrp, nullptr, nullptr, genovec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6796 }
6797 
6798 // eventually want to return fread_ptr/fread_end, but not relevant until
6799 // multiallelic dosage working
Get1MP(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReaderMain * pgrp,uintptr_t * __restrict allele_countvec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)6800 PglErr Get1MP(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
6801   // sample_ct > 0; either allele_idx > 1 or ((allele_idx == 1) &&
6802   // multiallelic_hc_present)
6803   const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
6804   if (!VrtypeHphase(vrtype)) {
6805     *phasepresent_ct_ptr = 0;
6806     return IMPLPgrGet1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_countvec);
6807   }
6808   uintptr_t* all_hets = pgrp->workspace_all_hets;
6809   uintptr_t* subsetted_10het = nullptr;
6810   const unsigned char* fread_ptr;
6811   const unsigned char* fread_end;
6812   PglErr reterr = Get1Multiallelic(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, &fread_ptr, &fread_end, all_hets, allele_countvec, &subsetted_10het);
6813   if (unlikely(reterr)) {
6814     return reterr;
6815   }
6816   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
6817   reterr = ParseAux2Subset(fread_end, (sample_ct != raw_sample_ct)? sample_include : nullptr, all_hets, subsetted_10het, raw_sample_ct, sample_ct, &fread_ptr, phasepresent, phaseinfo, phasepresent_ct_ptr, pgrp->workspace_subset);
6818   // bugfix (7 Sep 2018): Need to postprocess phasepresent when collapsing
6819   // multiple alleles.
6820   if (reterr || (!(*phasepresent_ct_ptr))) {
6821     return reterr;
6822   }
6823 
6824   // Might want to make this its own function.
6825   const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
6826   Halfword* phasepresent_alias = R_CAST(Halfword*, phasepresent);
6827   for (uint32_t hwidx = 0; hwidx != sample_ctl2; ++hwidx) {
6828     phasepresent_alias[hwidx] &= Pack01ToHalfword(allele_countvec[hwidx]);
6829   }
6830   *phasepresent_ct_ptr = PopcountWords(phasepresent, BitCtToWordCt(sample_ct));
6831 
6832   return kPglRetSuccess;
6833 }
6834 
PgrGet1P(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_countvec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)6835 PglErr PgrGet1P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
6836   if (!sample_ct) {
6837     *phasepresent_ct_ptr = 0;
6838     return kPglRetSuccess;
6839   }
6840   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
6841   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
6842   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
6843   const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
6844   if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
6845     PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, allele_countvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6846     if (allele_idx) {
6847       GenovecInvertUnsafe(sample_ct, allele_countvec);
6848       if (*phasepresent_ct_ptr) {
6849         BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
6850       }
6851     }
6852     return reterr;
6853   }
6854   return Get1MP(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_countvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6855 }
6856 
IMPLPgrGetInv1P(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReaderMain * pgrp,uintptr_t * __restrict allele_invcountvec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)6857 PglErr IMPLPgrGetInv1P(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
6858   if (!sample_ct) {
6859     *phasepresent_ct_ptr = 0;
6860     return kPglRetSuccess;
6861   }
6862   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
6863   const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
6864   if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
6865     PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, allele_invcountvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6866     if (!allele_idx) {
6867       GenovecInvertUnsafe(sample_ct, allele_invcountvec);
6868       if (*phasepresent_ct_ptr) {
6869         BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
6870       }
6871     }
6872     return reterr;
6873   }
6874   PglErr reterr = Get1MP(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6875   if (unlikely(reterr)) {
6876     return reterr;
6877   }
6878   GenovecInvertUnsafe(sample_ct, allele_invcountvec);
6879   if (*phasepresent_ct_ptr) {
6880     BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
6881   }
6882   return kPglRetSuccess;
6883 }
6884 
PgrGet2P(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx0,uint32_t allele_idx1,PgenReader * pgr_ptr,uintptr_t * __restrict genovec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)6885 PglErr PgrGet2P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
6886   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
6887   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
6888   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
6889   if (!VrtypeHphase(vrtype)) {
6890     *phasepresent_ct_ptr = 0;
6891     return IMPLPgrGet2(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx0, allele_idx1, pgrp, genovec);
6892   }
6893   if (!sample_ct) {
6894     *phasepresent_ct_ptr = 0;
6895     return kPglRetSuccess;
6896   }
6897   if (allele_idx0 + allele_idx1 == 1) {
6898     PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genovec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6899     if (allele_idx0) {
6900       GenovecInvertUnsafe(sample_ct, genovec);
6901       if (*phasepresent_ct_ptr) {
6902         BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
6903       }
6904     }
6905     return reterr;
6906   }
6907   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
6908   const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
6909   uintptr_t* raw_genovec = pgrp->workspace_vec;
6910   const unsigned char* fread_ptr;
6911   const unsigned char* fread_end;
6912   PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
6913   if (unlikely(reterr)) {
6914     return reterr;
6915   }
6916   ZeroTrailingNyps(raw_sample_ct, raw_genovec);
6917 
6918   uint32_t invert = 0;
6919   if (allele_idx0 > allele_idx1) {
6920     const uint32_t swap = allele_idx0;
6921     allele_idx0 = allele_idx1;
6922     allele_idx1 = swap;
6923     invert = 1;
6924   }
6925   if (allele_idx0 > 1) {
6926     SetAllBits(2 * sample_ct, genovec);
6927   } else {
6928     CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
6929     // allele_idx1 > 1 guaranteed
6930     if (!allele_idx0) {
6931       GenovecNonzeroToMissingUnsafe(sample_ct, genovec);
6932     } else {
6933       GenovecInvertThenNonzeroToMissingUnsafe(sample_ct, genovec);
6934     }
6935   }
6936   uintptr_t* all_hets = pgrp->workspace_all_hets;
6937   PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
6938   uintptr_t* subsetted_10het = nullptr;
6939   if (!subsetting_required) {
6940     sample_include = nullptr;
6941   }
6942 
6943   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
6944   const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
6945   if (VrtypeMultiallelicHc(vrtype)) {
6946     // This combines ReadGenovecHphaseSubsetUnsafe() and Get2()'s logic.
6947     const uint32_t aux1_first_byte = *fread_ptr++;
6948     const uint32_t aux1a_mode = aux1_first_byte & 15;
6949     const uint32_t aux1b_mode = aux1_first_byte >> 4;
6950     uint32_t raw_01_ct = 0;
6951     uint32_t raw_10_ct = 0;
6952     if ((!aux1a_mode) || (!aux1b_mode)) {
6953       GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
6954     }
6955     uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
6956     if (!allele_idx0) {
6957       // Two cases:
6958       // - If allele_idx == 1, convert all aux1a entries from 01 to 11.
6959       // - Otherwise, for each matching aux1a entry, convert from 11 to 01.
6960       reterr = GenoarrAux1aUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, allele_idx1, 2, raw_01_ct, &fread_ptr, genovec, deltalist_workspace);
6961     } else {
6962       reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr);
6963     }
6964     if (unlikely(reterr)) {
6965       return reterr;
6966     }
6967     const unsigned char* aux1b_start = fread_ptr;
6968     reterr = GenoarrAux1bUpdate2(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, allele_idx0, allele_idx1, raw_10_ct, &fread_ptr, genovec, deltalist_workspace);
6969     if (unlikely(reterr)) {
6970       return reterr;
6971     }
6972     // Can have a modified version of GenoarrAux1bUpdate2() which only requires
6973     // one pass, but let's keep the logic simpler for now since I don't expect
6974     // this function to be used frequently.
6975     uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
6976     uint32_t aux1b_het_present;
6977     reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &aux1b_start, aux1b_hets, &aux1b_het_present, deltalist_workspace);
6978     if (unlikely(reterr)) {
6979       return reterr;
6980     }
6981     if (aux1b_het_present) {
6982       BitvecOr(aux1b_hets, BitCtToWordCt(raw_sample_ct), all_hets);
6983       if (!subsetting_required) {
6984         subsetted_10het = aux1b_hets;
6985       } else {
6986         // Don't need raw_genovec any more.
6987         CopyBitarrSubset(aux1b_hets, sample_include, sample_ct, raw_genovec);
6988         subsetted_10het = raw_genovec;
6989       }
6990     }
6991   }
6992   reterr = ParseAux2Subset(fread_end, sample_include, all_hets, subsetted_10het, raw_sample_ct, sample_ct, &fread_ptr, phasepresent, phaseinfo, phasepresent_ct_ptr, pgrp->workspace_subset);
6993   if (unlikely(reterr)) {
6994     return reterr;
6995   }
6996   if (VrtypeMultiallelicHc(vrtype) && (*phasepresent_ct_ptr)) {
6997     const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
6998     Halfword* phasepresent_alias = R_CAST(Halfword*, phasepresent);
6999     for (uint32_t hwidx = 0; hwidx != sample_ctl2; ++hwidx) {
7000       phasepresent_alias[hwidx] &= Pack01ToHalfword(genovec[hwidx]);
7001     }
7002     *phasepresent_ct_ptr = PopcountWords(phasepresent, BitCtToWordCt(sample_ct));
7003   }
7004   if (invert) {
7005     GenovecInvertUnsafe(sample_ct, genovec);
7006     if (*phasepresent_ct_ptr) {
7007       BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
7008     }
7009   }
7010   return kPglRetSuccess;
7011 }
7012 
PgrGetMP(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)7013 PglErr PgrGetMP(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
7014   pgvp->patch_01_ct = 0;
7015   pgvp->patch_10_ct = 0;
7016   if (!sample_ct) {
7017     pgvp->phasepresent_ct = 0;
7018     return kPglRetSuccess;
7019   }
7020   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
7021   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
7022   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
7023   const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
7024   if (!multiallelic_hc_present) {
7025     return ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, pgvp->genovec, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct));
7026   }
7027   const unsigned char* fread_ptr;
7028   const unsigned char* fread_end;
7029   uintptr_t* all_hets = VrtypeHphase(vrtype)? pgrp->workspace_all_hets : nullptr;
7030   PglErr reterr = GetMultiallelicCodes(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, all_hets? (&fread_ptr) : nullptr, all_hets? (&fread_end) : nullptr, all_hets, pgvp);
7031   if (reterr || (!all_hets)) {
7032     return reterr;
7033   }
7034   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
7035   return ParseAux2Subset(fread_end, (sample_ct != raw_sample_ct)? sample_include : nullptr, all_hets, nullptr, raw_sample_ct, sample_ct, &fread_ptr, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct), pgrp->workspace_subset);
7036 }
7037 
7038 // ok for sample_include to be nullptr if not subsetting, though this is not
7039 // required
ParseDosage16(const unsigned char * fread_ptr,const unsigned char * fread_end,const uintptr_t * __restrict sample_include,uint32_t sample_ct,uint32_t vidx,uint32_t allele_ct,PgenReaderMain * pgrp,uint32_t * __restrict dosage_ct_ptr,uintptr_t * __restrict dphase_present,int16_t * dphase_delta,uint32_t * __restrict dphase_ct_ptr,uintptr_t * __restrict dosage_present,uint16_t * dosage_main)7040 PglErr ParseDosage16(const unsigned char* fread_ptr, const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t sample_ct, uint32_t vidx, uint32_t allele_ct, PgenReaderMain* pgrp, uint32_t* __restrict dosage_ct_ptr, uintptr_t* __restrict dphase_present, int16_t* dphase_delta, uint32_t* __restrict dphase_ct_ptr, uintptr_t* __restrict dosage_present, uint16_t* dosage_main) {
7041   // Side effect: may use pgrp->workspace_dosage_present and
7042   // pgrp->workspace_dphase_present
7043   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
7044   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
7045   const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
7046   uintptr_t* raw_dosage_present = subsetting_required? pgrp->workspace_dosage_present : dosage_present;
7047   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
7048   const uint32_t is_unconditional_dosage = ((vrtype & 0x60) == 0x40);
7049   uint32_t raw_dosage_ct;
7050   if ((vrtype & 0x60) == 0x20) {
7051     // case 1: dosage list
7052     if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, &fread_ptr, raw_dosage_present, &raw_dosage_ct))) {
7053       return kPglRetMalformedInput;
7054     }
7055   } else if (is_unconditional_dosage) {
7056     // case 2: unconditional dosage.  handle separately from other two cases
7057     // since missing values may be present.
7058     SetAllBits(raw_sample_ct, raw_dosage_present);
7059     raw_dosage_ct = raw_sample_ct;
7060   } else {
7061     // case 3: dosage bitarray
7062     raw_dosage_present[raw_sample_ctl - 1] = 0;
7063     const uint32_t raw_sample_ctb = DivUp(raw_sample_ct, CHAR_BIT);
7064     memcpy(raw_dosage_present, fread_ptr, raw_sample_ctb);
7065     fread_ptr = &(fread_ptr[raw_sample_ctb]);
7066     raw_dosage_ct = PopcountWords(raw_dosage_present, raw_sample_ctl);
7067   }
7068   const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
7069   uint32_t dosage_ct;
7070   if (subsetting_required) {
7071     CopyBitarrSubset(raw_dosage_present, sample_include, sample_ct, dosage_present);
7072     dosage_ct = PopcountWords(dosage_present, sample_ctl);
7073   } else {
7074     dosage_ct = raw_dosage_ct;
7075   }
7076   if (dosage_ct_ptr) {
7077     *dosage_ct_ptr = dosage_ct;
7078   }
7079   if (!dosage_ct) {
7080     if (dphase_ct_ptr) {
7081       *dphase_ct_ptr = 0;
7082     }
7083     return kPglRetSuccess;
7084   }
7085 #ifdef __arm__
7086 #  error "Unaligned accesses in ParseDosage16()."
7087 #endif
7088   const uint16_t* dosage_main_read_iter = R_CAST(const uint16_t*, fread_ptr);
7089   uint16_t* dosage_main_write_iter = dosage_main;
7090   uint32_t raw_dphase_ct = 0;
7091   uint32_t dphase_ct = 0;
7092   uintptr_t* raw_dphase_present = nullptr;
7093   if (dphase_present && (vrtype & 0x80)) {
7094     fread_ptr = &(fread_ptr[raw_dosage_ct * 2]);
7095     if (!is_unconditional_dosage) {
7096       const uintptr_t* file_dphase_present = R_CAST(const uintptr_t*, fread_ptr);
7097       fread_ptr = &(fread_ptr[DivUp(raw_dosage_ct, CHAR_BIT)]);
7098       raw_dphase_present = subsetting_required? pgrp->workspace_dphase_present : dphase_present;
7099       ExpandBytearr(file_dphase_present, raw_dosage_present, raw_sample_ctl, raw_dosage_ct, 0, raw_dphase_present);
7100       raw_dphase_ct = PopcountWords(raw_dphase_present, raw_sample_ctl);
7101       dphase_ct = raw_dphase_ct;
7102       if (subsetting_required) {
7103         CopyBitarrSubset(raw_dphase_present, sample_include, sample_ct, dphase_present);
7104         dphase_ct = PopcountWords(dphase_present, sample_ctl);
7105       }
7106     } else {
7107       // raw_dphase_present = raw_dosage_present;
7108       dphase_ct = dosage_ct;
7109       SetAllBits(sample_ct, dphase_present);
7110     }
7111   }
7112   if (!dphase_ct) {
7113     if (allele_ct == 2) {
7114       if (!is_unconditional_dosage) {
7115         if (dosage_ct == raw_dosage_ct) {
7116           memcpy(dosage_main_write_iter, dosage_main_read_iter, dosage_ct * sizeof(int16_t));
7117         } else {
7118           // bugfix (22 May 2017): dosage_entry_idx needs to iterate up to
7119           // raw_dosage_ct, not dosage_ct
7120           uintptr_t widx = ~k0LU;
7121           uint32_t dosage_entry_idx = 0;
7122           do {
7123             uintptr_t cur_bits;
7124             do {
7125               cur_bits = raw_dosage_present[++widx];
7126             } while (!cur_bits);
7127             const uintptr_t sample_include_word = sample_include[widx];
7128             do {
7129               const uintptr_t low_bit = cur_bits & (-cur_bits);
7130               if (sample_include_word & low_bit) {
7131                 *dosage_main_write_iter++ = dosage_main_read_iter[dosage_entry_idx];
7132               }
7133               ++dosage_entry_idx;
7134               cur_bits ^= low_bit;
7135             } while (cur_bits);
7136           } while (dosage_entry_idx != raw_dosage_ct);
7137         }
7138       } else {
7139         if (!subsetting_required) {
7140           for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
7141             const uint16_t cur_dosage = *dosage_main_read_iter++;
7142             if (cur_dosage != 65535) {
7143               *dosage_main_write_iter++ = cur_dosage;
7144             } else {
7145               ClearBit(sample_idx, dosage_present);
7146             }
7147           }
7148         } else {
7149           uintptr_t widx = ~k0LU;
7150           uint32_t sample_idx = 0;
7151           do {
7152             uintptr_t cur_bits;
7153             do {
7154               cur_bits = sample_include[++widx];
7155             } while (!cur_bits);
7156             const uintptr_t sample_uidx_base = widx * kBitsPerWord;
7157             const uint16_t* dosage_main_readp = &(dosage_main_read_iter[sample_uidx_base]);
7158             do {
7159               const uint32_t sample_uidx_lowbits = ctzw(cur_bits);
7160               const uint16_t cur_dosage = dosage_main_readp[sample_uidx_lowbits];
7161               if (cur_dosage != 65535) {
7162                 *dosage_main_write_iter++ = cur_dosage;
7163               } else {
7164                 ClearBit(sample_idx, dosage_present);
7165               }
7166               ++sample_idx;
7167               cur_bits &= cur_bits - 1;
7168             } while (cur_bits);
7169           } while (sample_idx != sample_ct);
7170         }
7171         if (dosage_ct_ptr) {
7172           *dosage_ct_ptr = dosage_main_write_iter - dosage_main;
7173         }
7174       }
7175     } else {
7176       // todo: multiallelic dosage
7177       // need to support downcode to ref/nonref as well as raw load
7178       // (dosage_ct_ptr should be nullptr iff we're doing a raw load)
7179       fputs("multiallelic variants not yet supported by ParseDosage16()\n", stderr);
7180       exit(S_CAST(int32_t, kPglRetNotYetSupported));
7181       return kPglRetSuccess;
7182     }
7183     if (dphase_ct_ptr) {
7184       *dphase_ct_ptr = 0;
7185     }
7186   } else {
7187     // phased dosage
7188     if (allele_ct == 2) {
7189       if (!is_unconditional_dosage) {
7190         if (dphase_ct == raw_dphase_ct) {
7191           memcpy(dosage_main_write_iter, dosage_main_read_iter, dosage_ct * sizeof(int16_t));
7192           memcpy(dphase_delta, fread_ptr, dphase_ct * sizeof(int16_t));
7193           if (dphase_ct_ptr) {
7194             *dphase_ct_ptr = dphase_ct;
7195           }
7196         } else {
7197           uintptr_t widx = ~k0LU;
7198           uint32_t dosage_entry_idx = 0;
7199           do {
7200             uintptr_t cur_bits;
7201             do {
7202               cur_bits = raw_dosage_present[++widx];
7203             } while (!cur_bits);
7204             const uintptr_t sample_include_word = sample_include[widx];
7205             do {
7206               const uintptr_t low_bit = cur_bits & (-cur_bits);
7207               if (sample_include_word & low_bit) {
7208                 *dosage_main_write_iter++ = dosage_main_read_iter[dosage_entry_idx];
7209               }
7210               ++dosage_entry_idx;
7211               cur_bits ^= low_bit;
7212             } while (cur_bits);
7213           } while (dosage_entry_idx != raw_dosage_ct);
7214           widx = ~k0LU;
7215           uint32_t dphase_entry_idx = 0;
7216           const int16_t* dphase_delta_read_alias = R_CAST(const int16_t*, fread_ptr);
7217           int16_t* dphase_delta_write_iter = dphase_delta;
7218           do {
7219             uintptr_t cur_bits;
7220             do {
7221               cur_bits = raw_dphase_present[++widx];
7222             } while (!cur_bits);
7223             const uintptr_t sample_include_word = sample_include[widx];
7224             do {
7225               const uintptr_t low_bit = cur_bits & (-cur_bits);
7226               if (sample_include_word & low_bit) {
7227                 *dphase_delta_write_iter++ = dphase_delta_read_alias[dphase_entry_idx];
7228               }
7229               ++dphase_entry_idx;
7230               cur_bits ^= low_bit;
7231             } while (cur_bits);
7232           } while (dphase_entry_idx != raw_dphase_ct);
7233           if (dphase_ct_ptr) {
7234             *dphase_ct_ptr = dphase_delta_write_iter - dphase_delta;
7235           }
7236         }
7237       } else {
7238         const int16_t* dphase_delta_read = R_CAST(const int16_t*, fread_ptr);
7239         int16_t* dphase_delta_write_iter = dphase_delta;
7240         if (!subsetting_required) {
7241           for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
7242             const uint16_t cur_dosage = *dosage_main_read_iter++;
7243             if (cur_dosage != 65535) {
7244               *dosage_main_write_iter++ = cur_dosage;
7245               const int16_t dphase_delta_val = dphase_delta_read[sample_idx];
7246               if (dphase_delta_val) {
7247                 *dphase_delta_write_iter++ = dphase_delta_val;
7248               } else {
7249                 ClearBit(sample_idx, dphase_present);
7250               }
7251             } else {
7252               // assert(dphase_delta_read[sample_idx] == -32768);
7253               ClearBit(sample_idx, dosage_present);
7254             }
7255           }
7256         } else {
7257           uintptr_t sample_uidx_base = 0;
7258           uintptr_t sample_include_bits = sample_include[0];
7259           for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
7260             const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
7261             const uint16_t cur_dosage = dosage_main_read_iter[sample_uidx];
7262             if (cur_dosage != 65535) {
7263               *dosage_main_write_iter++ = cur_dosage;
7264               const int16_t dphase_delta_val = dphase_delta_read[sample_uidx];
7265               if (dphase_delta_val) {
7266                 *dphase_delta_write_iter++ = dphase_delta_val;
7267               } else {
7268                 ClearBit(sample_idx, dphase_present);
7269               }
7270             } else {
7271               // assert(dphase_delta_read[sample_uidx] == -32768);
7272               ClearBit(sample_idx, dosage_present);
7273             }
7274           }
7275         }
7276         dosage_ct = dosage_main_write_iter - dosage_main;
7277         if (dosage_ct != sample_ct) {
7278           BitvecAnd(dosage_present, sample_ctl, dphase_present);
7279         }
7280         if (dosage_ct_ptr) {
7281           *dosage_ct_ptr = dosage_ct;
7282         }
7283         if (dphase_ct_ptr) {
7284           *dphase_ct_ptr = dphase_delta_write_iter - dphase_delta;
7285         }
7286       }
7287     } else {
7288       // multiallelic subcase
7289       fputs("multiallelic variants not yet supported by ParseDosage16()\n", stderr);
7290       exit(S_CAST(int32_t, kPglRetNotYetSupported));
7291       return kPglRetSuccess;
7292     }
7293   }
7294   return kPglRetSuccess;
7295 }
7296 
IMPLPgrGetD(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,uintptr_t * __restrict genovec,uintptr_t * __restrict dosage_present,uint16_t * dosage_main,uint32_t * dosage_ct_ptr)7297 PglErr IMPLPgrGetD(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) {
7298   assert(vidx < pgrp->fi.raw_variant_ct);
7299   if (!sample_ct) {
7300     *dosage_ct_ptr = 0;
7301     return kPglRetSuccess;
7302   }
7303   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
7304   if ((!VrtypeDosage(vrtype)) || (!dosage_present)) {
7305     *dosage_ct_ptr = 0;
7306     return ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
7307   }
7308   const unsigned char* fread_ptr = nullptr;
7309   const unsigned char* fread_end = nullptr;
7310   uint32_t phasepresent_ct;
7311   PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec, nullptr, nullptr, &phasepresent_ct);
7312   if (unlikely(reterr)) {
7313     return reterr;
7314   }
7315   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
7316   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
7317   return ParseDosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, allele_ct, pgrp, dosage_ct_ptr, nullptr, nullptr, nullptr, dosage_present, dosage_main);
7318 }
7319 
PgrGet1D(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,AlleleCode allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_countvec,uintptr_t * __restrict dosage_present,uint16_t * dosage_main,uint32_t * dosage_ct_ptr)7320 PglErr PgrGet1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) {
7321   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
7322   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
7323   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
7324   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
7325   if ((allele_ct == 2) || (!allele_idx)) {
7326     uint32_t dosage_ct;
7327     PglErr reterr = IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, allele_countvec, dosage_present, dosage_main, &dosage_ct);
7328     if (!allele_idx) {
7329       GenovecInvertUnsafe(sample_ct, allele_countvec);
7330       if (dosage_ct) {
7331         BiallelicDosage16Invert(dosage_ct, dosage_main);
7332       }
7333     }
7334     *dosage_ct_ptr = dosage_ct;
7335     return reterr;
7336   }
7337   const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
7338   if (!VrtypeDosage(vrtype)) {
7339     *dosage_ct_ptr = 0;
7340     return IMPLPgrGet1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_countvec);
7341   }
7342   fputs("multiallelic variants not yet supported by PgrGet1D()\n", stderr);
7343   exit(S_CAST(int32_t, kPglRetNotYetSupported));
7344   return kPglRetSuccess;
7345 }
7346 
PgrGetInv1D(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,AlleleCode allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_invcountvec,uintptr_t * __restrict dosage_present,uint16_t * dosage_main,uint32_t * dosage_ct_ptr)7347 PglErr PgrGetInv1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) {
7348   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
7349   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
7350   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
7351   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
7352   if ((allele_ct == 2) || (!allele_idx)) {
7353     uint32_t dosage_ct;
7354     PglErr reterr = IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, allele_invcountvec, dosage_present, dosage_main, &dosage_ct);
7355     if (allele_idx) {
7356       GenovecInvertUnsafe(sample_ct, allele_invcountvec);
7357       if (dosage_ct) {
7358         BiallelicDosage16Invert(dosage_ct, dosage_main);
7359       }
7360     }
7361     *dosage_ct_ptr = dosage_ct;
7362     return reterr;
7363   }
7364   const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
7365   if (!VrtypeDosage(vrtype)) {
7366     *dosage_ct_ptr = 0;
7367     return IMPLPgrGetInv1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec);
7368   }
7369   fputs("multiallelic variants not yet supported by PgrGetInv1D()\n", stderr);
7370   exit(S_CAST(int32_t, kPglRetNotYetSupported));
7371   return kPglRetSuccess;
7372 }
7373 
GetAux1bHetIncr(const unsigned char * fread_end,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_10_ct,const unsigned char ** fread_pp,uint32_t * __restrict raw_het_ctp)7374 PglErr GetAux1bHetIncr(const unsigned char* fread_end, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uint32_t* __restrict raw_het_ctp) {
7375   if (aux1b_mode == 15) {
7376     return kPglRetSuccess;
7377   }
7378   uint32_t rare10_ct;
7379   if (!aux1b_mode) {
7380     const uint32_t fset_byte_ct = DivUp(raw_10_ct, 8);
7381     rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
7382     *fread_pp += fset_byte_ct;
7383   } else {
7384     // aux1b_mode == 1
7385     const unsigned char* group_info_iter;
7386     PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare10_ct);
7387     if (unlikely(reterr)) {
7388       return reterr;
7389     }
7390     reterr = SkipDeltalistIds(fread_end, group_info_iter, rare10_ct, raw_sample_ct, 0, fread_pp);
7391     if (unlikely(reterr)) {
7392       return reterr;
7393     }
7394   }
7395   uintptr_t detect_hom_mask_lo;
7396   const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
7397   const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
7398 #ifdef __arm__
7399 #  error "Unaligned accesses in GetAux1bHetIncr()."
7400 #endif
7401   const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
7402   const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
7403   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
7404     return kPglRetMalformedInput;
7405   }
7406   if (allele_ct == 3) {
7407     const uint32_t hom22_ct = PopcountBytes(patch_10_fvalsw, fvals_byte_ct);
7408     *raw_het_ctp += rare10_ct - hom22_ct;
7409     return kPglRetSuccess;
7410   }
7411   // possible todo: vectorized het-counter, analogous to CountAux1bDense()
7412   const uint32_t code10_width = 1U << code10_logwidth;
7413   const uint32_t allele_code_width = 1U << allele_code_logwidth;
7414   const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
7415   const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
7416   const uintptr_t detect_hom_mask_hi = detect_hom_mask_lo << (code10_width - 1);
7417   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
7418   uint32_t het_incr = 0;
7419   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
7420     uintptr_t fvals_bits;
7421     if (fvals_widx >= fvals_word_ct_m1) {
7422       if (fvals_widx > fvals_word_ct_m1) {
7423         break;
7424       }
7425       fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
7426     } else {
7427       fvals_bits = patch_10_fvalsw[fvals_widx];
7428     }
7429     // allele_ct > 3 guaranteed
7430     fvals_bits = fvals_bits ^ (fvals_bits << allele_code_width);
7431     fvals_bits = detect_hom_mask_hi & (fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo));
7432     if (fvals_widx == fvals_word_ct_m1) {
7433       fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
7434     }
7435     het_incr += PopcountWord(fvals_bits);
7436   }
7437   *raw_het_ctp += het_incr;
7438   return kPglRetSuccess;
7439 }
7440 
U16VecSum(const uint16_t * __restrict uint16_vec,uint32_t entry_ct)7441 uint64_t U16VecSum(const uint16_t* __restrict uint16_vec, uint32_t entry_ct) {
7442 #ifdef __LP64__
7443   // UniVecHsum32() could overflow once we exceed this
7444   const uint32_t max_loop_len = (131072 / kInt32PerVec) - 1;
7445 
7446   const VecW m16 = VCONST_W(kMask0000FFFF);
7447   const VecW* uint16_vvec_iter = R_CAST(const VecW*, uint16_vec);
7448   uint64_t sum = 0;
7449   for (uint32_t full_vecs_remaining = entry_ct / (kBytesPerVec / sizeof(int16_t)); ; ) {
7450     UniVec acc_even;
7451     UniVec acc_odd;
7452     acc_even.vw = vecw_setzero();
7453     acc_odd.vw = vecw_setzero();
7454     const VecW* uint16_vvec_stop;
7455     if (full_vecs_remaining < max_loop_len) {
7456       if (!full_vecs_remaining) {
7457         const uint32_t trail_ct = entry_ct % (kBytesPerVec / sizeof(int16_t));
7458         uint16_vec = R_CAST(const uint16_t*, uint16_vvec_iter);
7459         for (uint32_t uii = 0; uii != trail_ct; ++uii) {
7460           sum += uint16_vec[uii];
7461         }
7462         return sum;
7463       }
7464       uint16_vvec_stop = &(uint16_vvec_iter[full_vecs_remaining]);
7465       full_vecs_remaining = 0;
7466     } else {
7467       uint16_vvec_stop = &(uint16_vvec_iter[max_loop_len]);
7468       full_vecs_remaining -= max_loop_len;
7469     }
7470     do {
7471       const VecW cur_vec = *uint16_vvec_iter++;
7472       acc_even.vw = acc_even.vw + (cur_vec & m16);
7473       acc_odd.vw = acc_odd.vw + (vecw_srli(cur_vec, 16) & m16);
7474     } while (uint16_vvec_iter < uint16_vvec_stop);
7475     sum += UniVecHsum32(acc_even);
7476     sum += UniVecHsum32(acc_odd);
7477   }
7478 #else
7479   uint64_t sum = 0;
7480   for (uint32_t uii = 0; uii != entry_ct; ++uii) {
7481     sum += uint16_vec[uii];
7482   }
7483   return sum;
7484 #endif
7485 }
7486 
GetPhasepresentAndSkipPhaseinfo(const unsigned char * fread_end,const uintptr_t * __restrict all_hets,uint32_t raw_sample_ct,uint32_t het_ct,const unsigned char ** fread_pp,uintptr_t * __restrict phasepresent,uint32_t * __restrict phasepresent_ctp)7487 PglErr GetPhasepresentAndSkipPhaseinfo(const unsigned char* fread_end, const uintptr_t* __restrict all_hets, uint32_t raw_sample_ct, uint32_t het_ct, const unsigned char** fread_pp, uintptr_t* __restrict phasepresent, uint32_t* __restrict phasepresent_ctp) {
7488   const unsigned char* aux2_start = *fread_pp;
7489   const uint32_t aux2_first_part_byte_ct = 1 + (het_ct / CHAR_BIT);
7490   if (PtrAddCk(fread_end, aux2_first_part_byte_ct, fread_pp)) {
7491     return kPglRetMalformedInput;
7492   }
7493   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
7494   if (!(aux2_start[0] & 1)) {
7495     memcpy(phasepresent, all_hets, raw_sample_ctl * kBytesPerWord);
7496     *phasepresent_ctp = het_ct;
7497     return kPglRetSuccess;
7498   }
7499   const uint32_t phasepresent_ct = PopcountBytes(aux2_start, aux2_first_part_byte_ct) - 1;
7500   if (PtrAddCk(fread_end, DivUp(phasepresent_ct, CHAR_BIT), fread_pp)) {
7501     return kPglRetMalformedInput;
7502   }
7503   *phasepresent_ctp = phasepresent_ct;
7504   ExpandBytearr(aux2_start, all_hets, raw_sample_ctl, het_ct, 1, phasepresent);
7505   return kPglRetSuccess;
7506 }
7507 
GetUnphasedBiallelicHetCt(const uintptr_t * __restrict sample_include,const uintptr_t * raw_genoarr,const unsigned char * fread_ptr,const unsigned char * fread_end,uint32_t subsetted_het_ct,PgenReaderMain * pgrp,uint32_t * unphased_het_ctp)7508 PglErr GetUnphasedBiallelicHetCt(const uintptr_t* __restrict sample_include, const uintptr_t* raw_genoarr, const unsigned char* fread_ptr, const unsigned char* fread_end, uint32_t subsetted_het_ct, PgenReaderMain* pgrp, uint32_t* unphased_het_ctp) {
7509   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
7510   uint32_t raw_het_ct;
7511   if (!sample_include) {
7512     raw_het_ct = subsetted_het_ct;
7513   } else {
7514     raw_het_ct = CountNyp(raw_genoarr, kMask5555, raw_sample_ct);
7515   }
7516   const uint32_t aux2_first_part_byte_ct = 1 + (raw_het_ct / CHAR_BIT);
7517   if (PtrCheck(fread_end, fread_ptr, aux2_first_part_byte_ct)) {
7518     return kPglRetMalformedInput;
7519   }
7520   const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
7521   if (!explicit_phasepresent) {
7522     // initial value of 0 is correct
7523     return kPglRetSuccess;
7524   }
7525   if (raw_het_ct == subsetted_het_ct) {
7526     *unphased_het_ctp = raw_het_ct + 1 - PopcountBytes(fread_ptr, aux2_first_part_byte_ct);
7527     return kPglRetSuccess;
7528   }
7529   // A dedicated counting function would be faster, but this case
7530   // should rarely come up.
7531   uintptr_t* all_hets = pgrp->workspace_all_hets;
7532   PgrDetectGenoarrHets(raw_genoarr, raw_sample_ct, all_hets);
7533   uintptr_t* raw_phasepresent = pgrp->workspace_subset;
7534   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
7535   // todo: compare against ExpandThenSubsetBytearr followed by simple popcount
7536   ExpandBytearr(fread_ptr, all_hets, raw_sample_ctl, raw_het_ct, 1, raw_phasepresent);
7537   *unphased_het_ctp = subsetted_het_ct - PopcountWordsIntersect(raw_phasepresent, sample_include, raw_sample_ctl);
7538   return kPglRetSuccess;
7539 }
7540 
7541 PglErr GetPhasedBiallelicGenotypeSubsetCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uint32_t* unphased_het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts) {
7542   // Currently much less optimized than the other count functions.  (This case
7543   // shouldn't come up much, the user has to be computing minimac-r2 on a file
7544   // with no dosages...)
7545   uintptr_t* raw_genovec = pgrp->workspace_vec;
7546   const unsigned char* fread_ptr;
7547   const unsigned char* fread_end;
7548   PglErr reterr = ReadRawGenovec(1, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
7549   if (unlikely(reterr)) {
7550     return reterr;
7551   }
7552   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
7553   ZeroTrailingNyps(raw_sample_ct, raw_genovec);
7554   GenoarrCountSubsetFreqs(raw_genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
7555   return GetUnphasedBiallelicHetCt(sample_include, raw_genovec, fread_ptr, fread_end, genocounts[1], pgrp, unphased_het_ctp);
7556 }
7557 
7558 // Imputation r^2 computation:
7559 // * This function assumes the biallelic diploid case.  Divide by two to get
7560 //   the biallelic haploid value, for whatever that's worth.
7561 // * chrX requires sex information, so that's handled directly in
7562 //   LoadAlleleAndGenoCountsThread()... er, actually, we just give up on that
7563 //   for now.
7564 // * See PgrGetMDCounts() support functions below for multiallelic-diploid
7565 //   notes.
7566 PglErr GetBasicGenotypeCountsAndDosage16s(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReaderMain* pgrp, double* imp_r2_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages) {
7567   // genocounts[0] := ref/ref, genocounts[1] := ref/altx,
7568   // genocounts[2] := altx/alty, genocounts[3] := missing
7569   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
7570   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
7571   const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
7572   uint32_t unphased_het_ct = 0;
7573   // To avoid LD cache thrashing, we try to either always keep a subsetted
7574   // cache, or never do so.  (Always, when only hardcalls are present;
7575   // otherwise never.)
7576   if ((!(pgrp->fi.gflags & kfPgenGlobalDosagePresent)) ||
7577       ((!(vrtype & 0x60)) && (!subsetting_required))) {
7578     {
7579       const uint32_t need_unphased_het_ct = is_minimac3_r2 && VrtypeHphase(vrtype);
7580       PglErr reterr;
7581       if (!(subsetting_required && need_unphased_het_ct)) {
7582         reterr = GetBasicGenotypeCounts(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, need_unphased_het_ct? (&unphased_het_ct) : nullptr, genocounts);
7583       } else {
7584         reterr = GetPhasedBiallelicGenotypeSubsetCounts(sample_include, sample_include_interleaved_vec, sample_ct, vidx, pgrp, &unphased_het_ct, genocounts);
7585       }
7586       if (unlikely(reterr)) {
7587         return reterr;
7588       }
7589     }
7590   GetBasicGenotypeCountsAndDosage16s_basic_finish:
7591     all_dosages[0] = (genocounts[0] * 2 + genocounts[1]) * 16384LLU;
7592     all_dosages[1] = (genocounts[2] * 2 + genocounts[1]) * 16384LLU;
7593     if (!imp_r2_ptr) {
7594       return kPglRetSuccess;
7595     }
7596     // yeah, it's sinful to implement imputation r2 here...
7597     const uint32_t nm_sample_ct = sample_ct - genocounts[3];
7598     const uint64_t alt1_dosage = genocounts[2] * 0x8000LLU + genocounts[1] * 0x4000LLU;
7599     uint64_t hap_alt1_ssq_x2 = genocounts[2] * 0x40000000LLU + genocounts[1] * 0x10000000LLU;
7600     if (is_minimac3_r2) {
7601       if (!VrtypeHphase(vrtype)) {
7602         unphased_het_ct = genocounts[1];
7603       }
7604       hap_alt1_ssq_x2 += (genocounts[1] - unphased_het_ct) * 0x10000000LLU;
7605     }
7606     *imp_r2_ptr = BiallelicDiploidMinimac3R2(alt1_dosage, hap_alt1_ssq_x2, nm_sample_ct);
7607     if (!is_minimac3_r2) {
7608       *imp_r2_ptr *= 2;
7609     }
7610     return kPglRetSuccess;
7611   }
7612   uintptr_t* raw_genovec = pgrp->workspace_vec;
7613   const unsigned char* fread_ptr;
7614   const unsigned char* fread_end;
7615   PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
7616   if (unlikely(reterr)) {
7617     return reterr;
7618   }
7619   ZeroTrailingNyps(raw_sample_ct, raw_genovec);
7620   if (!subsetting_required) {
7621     GenoarrCountFreqsUnsafe(raw_genovec, raw_sample_ct, genocounts);
7622   } else {
7623     GenoarrCountSubsetFreqs(raw_genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
7624   }
7625   if (!(vrtype & 0x60)) {
7626     if (is_minimac3_r2 && VrtypeHphase(vrtype)) {
7627       assert(!VrtypeMultiallelicHc(vrtype));
7628       reterr = GetUnphasedBiallelicHetCt(subsetting_required? sample_include : nullptr, raw_genovec, fread_ptr, fread_end, genocounts[1], pgrp, &unphased_het_ct);
7629       if (unlikely(reterr)) {
7630         return reterr;
7631       }
7632     }
7633     goto GetBasicGenotypeCountsAndDosage16s_basic_finish;
7634   }
7635   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
7636   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
7637   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
7638   uintptr_t* raw_phasepresent = pgrp->workspace_subset;
7639   uint32_t raw_phasepresent_ct = 0;
7640   if (VrtypeHphase(vrtype)) {
7641     uint32_t raw_het_ct = genocounts[1];  // inaccurate if subsetting_required
7642     if (!is_minimac3_r2) {
7643       if (VrtypeMultiallelicHc(vrtype)) {
7644         const uint32_t aux1_first_byte = *fread_ptr++;
7645         const uint32_t aux1a_mode = aux1_first_byte & 15;
7646         const uint32_t aux1b_mode = aux1_first_byte >> 4;
7647         uint32_t raw_10_ct = 0;
7648         if ((!aux1a_mode) || (!aux1b_mode) || subsetting_required) {
7649           GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_het_ct, &raw_10_ct);
7650         }
7651         reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_het_ct, &fread_ptr);
7652         if (unlikely(reterr)) {
7653           return reterr;
7654         }
7655         reterr = GetAux1bHetIncr(fread_end, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, &raw_het_ct);
7656         if (unlikely(reterr)) {
7657           return reterr;
7658         }
7659       } else if (subsetting_required) {
7660         raw_het_ct = CountNyp(raw_genovec, kMask5555, raw_sample_ct);
7661       }
7662       reterr = SkipAux2(fread_end, raw_het_ct, &fread_ptr, nullptr);
7663       if (unlikely(reterr)) {
7664         return reterr;
7665       }
7666     } else {
7667       assert(!VrtypeMultiallelicHc(vrtype));
7668       uintptr_t* all_hets = pgrp->workspace_all_hets;
7669       PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
7670       if (subsetting_required) {
7671         raw_het_ct = PopcountWords(all_hets, raw_sample_ctl);
7672       }
7673       const uint32_t first_half_byte_ct = 1 + (raw_het_ct / CHAR_BIT);
7674       const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
7675       if (explicit_phasepresent) {
7676         ExpandBytearr(fread_ptr, all_hets, raw_sample_ctl, raw_het_ct, 1, raw_phasepresent);
7677         raw_phasepresent_ct = PopcountBytes(fread_ptr, first_half_byte_ct) - 1;
7678         const uint32_t second_half_byte_ct = DivUp(raw_phasepresent_ct, CHAR_BIT);
7679         fread_ptr = &(fread_ptr[first_half_byte_ct + second_half_byte_ct]);
7680       } else {
7681         raw_phasepresent_ct = raw_het_ct;
7682         memcpy(raw_phasepresent, all_hets, raw_sample_ctl * sizeof(intptr_t));
7683         fread_ptr = &(fread_ptr[first_half_byte_ct]);
7684       }
7685     }
7686   } else if (VrtypeMultiallelicHc(vrtype)) {
7687     reterr = SkipAux1(fread_end, raw_genovec, raw_sample_ct, allele_ct, &fread_ptr);
7688     if (unlikely(reterr)) {
7689       return reterr;
7690     }
7691   }
7692   if (allele_ct != 2) {
7693     // Maybe make this an invalid function call?  If that happens, the
7694     // VrtypeMultiallelicHc() branch above can be removed.
7695     fputs("multiallelic dosages not yet supported by GetBasicGenotypeCountsAndDosage16s()\n", stderr);
7696     exit(S_CAST(int32_t, kPglRetNotYetSupported));
7697     return kPglRetSuccess;
7698   }
7699 
7700   const uint32_t is_unconditional_dosage = ((vrtype & 0x60) == 0x40);
7701   uint64_t alt1_dosage = 0;
7702   uint32_t dosage_ct = 0;
7703   STD_ARRAY_DECL(uint32_t, 4, replaced_genocounts);
7704   if ((!is_minimac3_r2) || (!(vrtype & 0x90))) {
7705     uint64_t alt1_dosage_sq_sum = 0;
7706     if (is_unconditional_dosage) {
7707       // needs to be handled separately from the other cases due to possible
7708       // presence of missing values.
7709       // note that this code will also need to be adjusted when multiallelic
7710       // support is added.
7711 #ifdef __arm__
7712 #  error "Unaligned accesses in GetBasicGenotypeCountsAndDosage16s()."
7713 #endif
7714       STD_ARRAY_FILL0(replaced_genocounts);
7715       const uint16_t* dosage_main = R_CAST(const uint16_t*, fread_ptr);
7716       if (PtrAddCk(fread_end, raw_sample_ct * sizeof(int16_t), &fread_ptr)) {
7717         return kPglRetMalformedInput;
7718       }
7719       if (subsetting_required) {
7720         uintptr_t sample_uidx_base = 0;
7721         uintptr_t sample_include_bits = sample_include[0];
7722         for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
7723           const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
7724           const uintptr_t cur_dosage_val = dosage_main[sample_uidx];
7725           if (cur_dosage_val != 65535) {
7726             alt1_dosage += cur_dosage_val;
7727 
7728             // todo: check if this is slow enough to justify removing it from
7729             // the main loop
7730             alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
7731             ++dosage_ct;
7732           }
7733         }
7734       } else {
7735         for (uint32_t sample_uidx = 0; sample_uidx != sample_ct; ++sample_uidx) {
7736           const uintptr_t cur_dosage_val = dosage_main[sample_uidx];
7737           if (cur_dosage_val != 65535) {
7738             alt1_dosage += cur_dosage_val;
7739             alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
7740             ++dosage_ct;
7741           }
7742         }
7743       }
7744       // update (20 Mar 2019): .pgen specification tightened to remove the need
7745       // to update replaced_genocounts in the main loops above.
7746       STD_ARRAY_COPY(genocounts, 4, replaced_genocounts);
7747       replaced_genocounts[3] = replaced_genocounts[3] + dosage_ct - sample_ct;
7748     } else {
7749       uintptr_t* raw_dosage_present = pgrp->workspace_dosage_present;
7750       uint32_t raw_dosage_ct;
7751       if (!(vrtype & 0x40)) {
7752         // dosage list
7753         if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, &fread_ptr, raw_dosage_present, &raw_dosage_ct))) {
7754           return kPglRetMalformedInput;
7755         }
7756       } else {
7757         // dosage bitarray
7758         raw_dosage_present[raw_sample_ctl - 1] = 0;
7759         const uint32_t raw_sample_ctb = DivUp(raw_sample_ct, CHAR_BIT);
7760         memcpy(raw_dosage_present, fread_ptr, raw_sample_ctb);
7761         fread_ptr = &(fread_ptr[raw_sample_ctb]);
7762         raw_dosage_ct = PopcountWords(raw_dosage_present, raw_sample_ctl);
7763       }
7764       const uint16_t* dosage_main_iter = R_CAST(const uint16_t*, fread_ptr);
7765       if (PtrAddCk(fread_end, raw_dosage_ct * sizeof(int16_t), &fread_ptr)) {
7766         return kPglRetMalformedInput;
7767       }
7768       if (subsetting_required) {
7769         uintptr_t sample_widx = 0;
7770         uintptr_t dosage_present_bits = raw_dosage_present[0];
7771         for (uint32_t dosage_idx = 0; dosage_idx != raw_dosage_ct; ++dosage_idx) {
7772           const uintptr_t lowbit = BitIter1y(raw_dosage_present, &sample_widx, &dosage_present_bits);
7773           if (sample_include[sample_widx] & lowbit) {
7774             const uintptr_t cur_dosage_val = dosage_main_iter[dosage_idx];
7775             alt1_dosage += cur_dosage_val;
7776             alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
7777             ++dosage_ct;
7778           }
7779         }
7780         GenoarrCountSubsetIntersectFreqs(raw_genovec, raw_dosage_present, sample_include, raw_sample_ct, replaced_genocounts);
7781       } else {
7782         if (!imp_r2_ptr) {
7783           for (uint32_t dosage_idx = 0; dosage_idx != raw_dosage_ct; ++dosage_idx) {
7784             alt1_dosage += dosage_main_iter[dosage_idx];
7785           }
7786         } else {
7787           for (uint32_t dosage_idx = 0; dosage_idx != raw_dosage_ct; ++dosage_idx) {
7788             const uintptr_t cur_dosage_val = dosage_main_iter[dosage_idx];
7789             alt1_dosage += cur_dosage_val;
7790             alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
7791           }
7792         }
7793         dosage_ct = raw_dosage_ct;
7794         GenoarrCountSubsetFreqs2(raw_genovec, raw_dosage_present, raw_sample_ct, raw_dosage_ct, replaced_genocounts);
7795       }
7796     }
7797     const uint32_t replaced_ct = replaced_genocounts[0] + replaced_genocounts[1] + replaced_genocounts[2];
7798     const uint32_t remaining_het_ct = genocounts[1] - replaced_genocounts[1];
7799     const uint32_t remaining_hom_alt_ct = genocounts[2] - replaced_genocounts[2];
7800     const uint32_t alt1_ct = 2 * remaining_hom_alt_ct + remaining_het_ct;
7801     alt1_dosage += alt1_ct * 16384LLU;
7802     all_dosages[1] = alt1_dosage;
7803     const uint32_t nondosage_nm_ct = sample_ct - genocounts[3] - replaced_ct;
7804     const uint32_t new_sample_nm_ct = dosage_ct + nondosage_nm_ct;
7805     all_dosages[0] = new_sample_nm_ct * 32768LLU - alt1_dosage;
7806     if (!imp_r2_ptr) {
7807       return kPglRetSuccess;
7808     }
7809     // possible todo: also move all-hardcall-phase-present, no-dosage
7810     // is_minimac3_r2 case under this branch, since we can just set imp_r2 to
7811     // NaN or 1.
7812     // 16384^2, 32768^2
7813     alt1_dosage_sq_sum += remaining_het_ct * 0x10000000LLU + remaining_hom_alt_ct * 0x40000000LLU;
7814     *imp_r2_ptr = BiallelicDiploidMinimac3R2(alt1_dosage, alt1_dosage_sq_sum, new_sample_nm_ct);
7815     if (!is_minimac3_r2) {
7816       *imp_r2_ptr *= 2;
7817     }
7818     return kPglRetSuccess;
7819   }
7820   // Need to deal with implicitly phased dosages.  Best to have raw_genovec,
7821   // raw_phasepresent, dosage_present, and dosage_main all available, then loop
7822   // over everything at once.
7823   // (phaseinfo is irrelevant since only absolute value of (left - right)
7824   // matters.)
7825 
7826   // We have the following 2x2x3 cases to deal with:
7827   // - Subsetted vs. un-subsetted.  Un-subsetted comes up a lot, so we have an
7828   //   optimized code path for it.
7829   // - Unconditional vs. conditional dosage.  Unconditional should not come up
7830   //   much, so we just mock up raw_dosage_present... er, actually, that
7831   //   doesn't work because dosage_main would also need to be collapsed.  Sigh.
7832   //   Ok, it's still handled separately.
7833   // - Only hardcall-phase, vs. only dosage-phase, vs. both.  At least we can
7834   //   merge the "only dosage-phase" and "both" cases.
7835   // So we end up with 8 primary code paths.
7836   // This is kind of a nightmare; it would obviously be nicer to move this
7837   // out of pgenlib_internal, and that may eventually happen.  But we don't
7838   // want users to be discouraged from running --minimac3-r2-filter when it's
7839   // appropriate just because it's a lot slower than other standard filters;
7840   // and this also serves as a testing ground for efficient phased-dosage
7841   // handling strategies.
7842   if (!VrtypeHphase(vrtype)) {
7843     ZeroWArr(raw_sample_ctl, raw_phasepresent);
7844   }
7845   uintptr_t* raw_dosage_present = nullptr;
7846   const uint16_t* dosage_main;
7847   uint32_t raw_dosage_ct = 0;
7848   if (is_unconditional_dosage) {
7849     dosage_main = R_CAST(const uint16_t*, fread_ptr);
7850     if (PtrAddCk(fread_end, raw_sample_ct * sizeof(int16_t), &fread_ptr)) {
7851       return kPglRetMalformedInput;
7852     }
7853     // raw_dosage_ct unused in this case.
7854   } else {
7855     // could move some duplicate code before the big branch
7856     raw_dosage_present = pgrp->workspace_dosage_present;
7857     if (!(vrtype & 0x40)) {
7858       // dosage list
7859       if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, &fread_ptr, raw_dosage_present, &raw_dosage_ct))) {
7860         return kPglRetMalformedInput;
7861       }
7862     } else {
7863       // dosage bitarray
7864       raw_dosage_present[raw_sample_ctl - 1] = 0;
7865       const uint32_t raw_sample_ctb = DivUp(raw_sample_ct, CHAR_BIT);
7866       memcpy(raw_dosage_present, fread_ptr, raw_sample_ctb);
7867       fread_ptr = &(fread_ptr[raw_sample_ctb]);
7868       raw_dosage_ct = PopcountWords(raw_dosage_present, raw_sample_ctl);
7869     }
7870     dosage_main = R_CAST(const uint16_t*, fread_ptr);
7871     if (PtrAddCk(fread_end, raw_dosage_ct * sizeof(int16_t), &fread_ptr)) {
7872       return kPglRetMalformedInput;
7873     }
7874   }
7875   const uint16_t* dosage_main_iter = dosage_main;
7876   uint64_t hap_ssq_x2 = 0;
7877   uint32_t phased_hc_het_ct = 0;
7878   if (!(vrtype & 0x80)) {
7879     if (is_unconditional_dosage) {
7880       if (!subsetting_required) {
7881         const uint32_t raw_sample_ctl_m1 = raw_sample_ctl - 1;
7882         uint32_t loop_len = kBitsPerWord;
7883         for (uint32_t widx = 0; ; ++widx) {
7884           if (widx >= raw_sample_ctl_m1) {
7885             if (widx > raw_sample_ctl_m1) {
7886               break;
7887             }
7888             loop_len = ModNz(raw_sample_ct, kBitsPerWord);
7889           }
7890           uintptr_t phasepresent_word = raw_phasepresent[widx];
7891           for (uint32_t uii = 0; uii != loop_len; ++uii) {
7892             const uintptr_t cur_dosage_val = *dosage_main_iter++;
7893             if (cur_dosage_val != 65535) {
7894               alt1_dosage += cur_dosage_val;
7895               hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7896               ++dosage_ct;
7897               if (phasepresent_word & 1) {
7898                 // For each dosage, when phasepresent bit is set, implicit
7899                 // dphase_delta value is 16384 - |16384 - x|.
7900                 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
7901                 hap_ssq_x2 += homdist * homdist;
7902               }
7903             }
7904             phasepresent_word = phasepresent_word >> 1;
7905           }
7906         }
7907       } else {
7908         for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
7909           uintptr_t sample_include_word = sample_include[widx];
7910           if (!sample_include_word) {
7911             continue;
7912           }
7913           const uintptr_t phasepresent_word = raw_phasepresent[widx];
7914           const uint16_t* cur_dosage_main = &(dosage_main[widx * kBitsPerWord]);
7915           do {
7916             const uint32_t sample_idx_lowbits = ctzw(sample_include_word);
7917             const uintptr_t cur_dosage_val = cur_dosage_main[sample_idx_lowbits];
7918             const uintptr_t lowbit = sample_include_word & (-sample_include_word);
7919             if (cur_dosage_val != 65535) {
7920               alt1_dosage += cur_dosage_val;
7921               hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7922               ++dosage_ct;
7923               if (lowbit & phasepresent_word) {
7924                 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
7925                 hap_ssq_x2 += homdist * homdist;
7926               }
7927             }
7928             sample_include_word ^= lowbit;
7929           } while (sample_include_word);
7930         }
7931       }
7932       STD_ARRAY_COPY(genocounts, 4, replaced_genocounts);
7933       replaced_genocounts[3] = replaced_genocounts[3] + dosage_ct - sample_ct;
7934     } else {  // !is_unconditional_dosage
7935       if (!subsetting_required) {
7936         // phased_hc_het_ct := popcount(phasepresent & (~dosage_present))
7937         phased_hc_het_ct = raw_phasepresent_ct - PopcountWordsIntersect(raw_phasepresent, raw_dosage_present, raw_sample_ctl);
7938 
7939         for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
7940           uintptr_t dosage_present_word = raw_dosage_present[widx];
7941           if (dosage_present_word) {
7942             const uintptr_t phasepresent_word = raw_phasepresent[widx];
7943             do {
7944               const uintptr_t cur_dosage_val = *dosage_main_iter++;
7945               alt1_dosage += cur_dosage_val;
7946               const uintptr_t lowbit = dosage_present_word & (-dosage_present_word);
7947               hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7948               if (lowbit & phasepresent_word) {
7949                 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
7950                 hap_ssq_x2 += homdist * homdist;
7951               }
7952               dosage_present_word ^= lowbit;
7953             } while (dosage_present_word);
7954           }
7955         }
7956         dosage_ct = raw_dosage_ct;
7957         GenoarrCountSubsetFreqs2(raw_genovec, raw_dosage_present, raw_sample_ct, raw_dosage_ct, replaced_genocounts);
7958       } else {
7959         for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
7960           const uintptr_t sample_include_word = sample_include[widx];
7961           uintptr_t dosage_present_word = raw_dosage_present[widx];
7962           if (!sample_include_word) {
7963             dosage_main_iter = &(dosage_main_iter[PopcountWord(dosage_present_word)]);
7964             continue;
7965           }
7966           const uintptr_t phasepresent_word = raw_phasepresent[widx];
7967           phased_hc_het_ct += PopcountWord(sample_include_word & phasepresent_word & (~dosage_present_word));
7968           while (dosage_present_word) {
7969             const uintptr_t lowbit = dosage_present_word & (-dosage_present_word);
7970             if (lowbit & sample_include_word) {
7971               const uintptr_t cur_dosage_val = *dosage_main_iter;
7972               alt1_dosage += cur_dosage_val;
7973               hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7974               ++dosage_ct;
7975               if (lowbit & phasepresent_word) {
7976                 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
7977                 hap_ssq_x2 += homdist * homdist;
7978               }
7979             }
7980             dosage_present_word ^= lowbit;
7981             ++dosage_main_iter;
7982           }
7983         }
7984         GenoarrCountSubsetIntersectFreqs(raw_genovec, raw_dosage_present, sample_include, raw_sample_ct, replaced_genocounts);
7985       }
7986     }
7987   } else {
7988     if (is_unconditional_dosage) {
7989       if (PtrCheck(fread_end, fread_ptr, raw_sample_ct * sizeof(int16_t))) {
7990         return kPglRetMalformedInput;
7991       }
7992       const int16_t* dphase_delta = R_CAST(const int16_t*, fread_ptr);
7993       if (!subsetting_required) {
7994         for (uint32_t sample_uidx = 0; sample_uidx != raw_sample_ct; ++sample_uidx) {
7995           const uintptr_t cur_dosage_val = dosage_main[sample_uidx];
7996           if (cur_dosage_val != 65535) {
7997             alt1_dosage += cur_dosage_val;
7998             hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7999             ++dosage_ct;
8000             // .pgen specification now requires this value to never be missing.
8001             const intptr_t dphase_delta_val = dphase_delta[sample_uidx];
8002             hap_ssq_x2 += dphase_delta_val * dphase_delta_val;
8003           }
8004         }
8005       } else {
8006         for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
8007           uintptr_t sample_include_word = sample_include[widx];
8008           if (!sample_include_word) {
8009             continue;
8010           }
8011           const uint16_t* cur_dosage_main = &(dosage_main[widx * kBitsPerWord]);
8012           const int16_t* cur_dphase_delta = &(dphase_delta[widx * kBitsPerWord]);
8013           do {
8014             const uint32_t sample_idx_lowbits = ctzw(sample_include_word);
8015             const uintptr_t cur_dosage_val = cur_dosage_main[sample_idx_lowbits];
8016             if (cur_dosage_val != 65535) {
8017               alt1_dosage += cur_dosage_val;
8018               hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
8019               ++dosage_ct;
8020               const intptr_t dphase_delta_val = cur_dphase_delta[sample_idx_lowbits];
8021               hap_ssq_x2 += dphase_delta_val * dphase_delta_val;
8022             }
8023             sample_include_word &= sample_include_word - 1;
8024           } while (sample_include_word);
8025         }
8026       }
8027       STD_ARRAY_COPY(genocounts, 4, replaced_genocounts);
8028       replaced_genocounts[3] = replaced_genocounts[3] + dosage_ct - sample_ct;
8029     } else {
8030       const uintptr_t* file_dphase_present = R_CAST(const uintptr_t*, fread_ptr);
8031       const uint32_t raw_dosage_ctb = DivUp(raw_dosage_ct, CHAR_BIT);
8032       if (PtrAddCk(fread_end, raw_dosage_ctb, &fread_ptr)) {
8033         return kPglRetMalformedInput;
8034       }
8035       const uint32_t raw_dphase_ct = PopcountBytes(file_dphase_present, raw_dosage_ctb);
8036       if (PtrCheck(fread_end, fread_ptr, raw_dphase_ct * sizeof(int16_t))) {
8037         return kPglRetMalformedInput;
8038       }
8039       uintptr_t* raw_dphase_present = pgrp->workspace_dphase_present;
8040       ExpandBytearr(file_dphase_present, raw_dosage_present, raw_sample_ctl, raw_dosage_ct, 0, raw_dphase_present);
8041       const int16_t* dphase_delta_iter = R_CAST(const int16_t*, fread_ptr);
8042       if (!subsetting_required) {
8043         phased_hc_het_ct = raw_phasepresent_ct - PopcountWordsIntersect(raw_phasepresent, raw_dosage_present, raw_sample_ctl);
8044 
8045         for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
8046           uintptr_t dosage_present_word = raw_dosage_present[widx];
8047           if (dosage_present_word) {
8048             const uintptr_t phasepresent_word = raw_phasepresent[widx];
8049             const uintptr_t dphase_present_word = raw_dphase_present[widx];
8050             do {
8051               const uintptr_t cur_dosage_val = *dosage_main_iter++;
8052               alt1_dosage += cur_dosage_val;
8053               const uintptr_t lowbit = dosage_present_word & (-dosage_present_word);
8054               hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
8055               if (lowbit & dphase_present_word) {
8056                 const intptr_t dphase_delta_val = *dphase_delta_iter++;
8057                 hap_ssq_x2 += dphase_delta_val * dphase_delta_val;
8058               } else if (lowbit & phasepresent_word) {
8059                 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
8060                 hap_ssq_x2 += homdist * homdist;
8061               }
8062               dosage_present_word ^= lowbit;
8063             } while (dosage_present_word);
8064           }
8065         }
8066         dosage_ct = raw_dosage_ct;
8067         GenoarrCountSubsetFreqs2(raw_genovec, raw_dosage_present, raw_sample_ct, raw_dosage_ct, replaced_genocounts);
8068       } else {
8069         for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
8070           const uintptr_t sample_include_word = sample_include[widx];
8071           const uintptr_t dphase_present_word = raw_dphase_present[widx];
8072           uintptr_t dosage_present_word = raw_dosage_present[widx];
8073           if (!sample_include_word) {
8074             dosage_main_iter = &(dosage_main_iter[PopcountWord(dosage_present_word)]);
8075             dphase_delta_iter = &(dphase_delta_iter[PopcountWord(dphase_present_word)]);
8076             continue;
8077           }
8078           const uintptr_t phasepresent_word = raw_phasepresent[widx];
8079           phased_hc_het_ct += PopcountWord(sample_include_word & phasepresent_word & (~dosage_present_word));
8080           while (dosage_present_word) {
8081             const uintptr_t lowbit = dosage_present_word & (-dosage_present_word);
8082             const uintptr_t dphase_here = lowbit & dphase_present_word;
8083             if (lowbit & sample_include_word) {
8084               const uintptr_t cur_dosage_val = *dosage_main_iter;
8085               alt1_dosage += cur_dosage_val;
8086               hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
8087               ++dosage_ct;
8088               if (dphase_here) {
8089                 const intptr_t dphase_delta_val = *dphase_delta_iter;
8090                 hap_ssq_x2 += dphase_delta_val * dphase_delta_val;
8091               } else if (lowbit & phasepresent_word) {
8092                 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
8093                 hap_ssq_x2 += homdist * homdist;
8094               }
8095             }
8096             dphase_delta_iter += (dphase_here != 0);
8097             dosage_present_word ^= lowbit;
8098             ++dosage_main_iter;
8099           }
8100         }
8101         GenoarrCountSubsetIntersectFreqs(raw_genovec, raw_dosage_present, sample_include, raw_sample_ct, replaced_genocounts);
8102       }
8103     }
8104   }
8105   const uint32_t replaced_ct = replaced_genocounts[0] + replaced_genocounts[1] + replaced_genocounts[2];
8106   const uint32_t remaining_het_ct = genocounts[1] - replaced_genocounts[1];
8107   const uint32_t remaining_hom_alt_ct = genocounts[2] - replaced_genocounts[2];
8108   const uint32_t alt1_ct = 2 * remaining_hom_alt_ct + remaining_het_ct;
8109   alt1_dosage += alt1_ct * 16384LLU;
8110   all_dosages[1] = alt1_dosage;
8111   const uint32_t nondosage_nm_ct = sample_ct - genocounts[3] - replaced_ct;
8112   const uint32_t new_sample_nm_ct = dosage_ct + nondosage_nm_ct;
8113   all_dosages[0] = new_sample_nm_ct * 32768LLU - alt1_dosage;
8114   hap_ssq_x2 += (remaining_het_ct + phased_hc_het_ct) * 0x10000000LLU + remaining_hom_alt_ct * 0x40000000LLU;
8115   *imp_r2_ptr = BiallelicDiploidMinimac3R2(alt1_dosage, hap_ssq_x2, new_sample_nm_ct);
8116   return kPglRetSuccess;
8117 }
8118 
8119 PglErr PgrGetDCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReader* pgr_ptr, double* imp_r2_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages) {
8120   if (!sample_ct) {
8121     STD_ARRAY_REF_FILL0(4, genocounts);
8122     all_dosages[0] = 0;
8123     all_dosages[1] = 0;
8124     if (imp_r2_ptr) {
8125       *imp_r2_ptr = 0.0 / 0.0;
8126     }
8127     return kPglRetSuccess;
8128   }
8129   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8130   assert(vidx < pgrp->fi.raw_variant_ct);
8131   return GetBasicGenotypeCountsAndDosage16s(sample_include, sample_include_interleaved_vec, GetSicp(pssi), sample_ct, vidx, is_minimac3_r2, pgrp, imp_r2_ptr, genocounts, all_dosages);
8132 }
8133 
8134 // Does not zero-initialize results[].
CountAllBytes64(const void * bytearr,uintptr_t byte_ct,uint64_t * __restrict results)8135 void CountAllBytes64(const void* bytearr, uintptr_t byte_ct, uint64_t* __restrict results) {
8136   const unsigned char* bytearr_uc = S_CAST(const unsigned char*, bytearr);
8137   for (uintptr_t ulii = 0; ulii != byte_ct; ++ulii) {
8138     results[bytearr_uc[ulii]] += 1;
8139   }
8140 }
8141 
8142 // Does not zero-initialize results[].
CountAllNybbles64(const void * nybblearr,uintptr_t nybble_ct,uint64_t * __restrict results)8143 void CountAllNybbles64(const void* nybblearr, uintptr_t nybble_ct, uint64_t* __restrict results) {
8144   // possible todo: for sufficiently large nybble_ct, use CountAllBytes and
8145   // then postprocess
8146   const uintptr_t fullbyte_ct = nybble_ct / 2;
8147   const unsigned char* nybblearr_uc = S_CAST(const unsigned char*, nybblearr);
8148   for (uintptr_t ulii = 0; ulii != fullbyte_ct; ++ulii) {
8149     const uint32_t uii = nybblearr_uc[ulii];
8150     results[uii & 15] += 1;
8151     results[uii >> 4] += 1;
8152   }
8153   if (nybble_ct % 2) {
8154     results[nybblearr_uc[fullbyte_ct] & 15] += 1;
8155   }
8156 }
8157 
CountAllAux1aDense(const void * patch_01_fvals,uint32_t allele_ct,uint32_t rare01_ct,uint64_t * __restrict one_cts)8158 void CountAllAux1aDense(const void* patch_01_fvals, uint32_t allele_ct, uint32_t rare01_ct, uint64_t* __restrict one_cts) {
8159   one_cts[1] -= rare01_ct;
8160   if (allele_ct < 5) {
8161     if (allele_ct == 3) {
8162       // all entries are 0/1 -> 0/2
8163       one_cts[2] = rare01_ct;
8164       return;
8165     }
8166     const uint32_t allele_code_byte_ct = DivUp(rare01_ct, 8);
8167     const uint32_t alt3_ct = PopcountBytes(patch_01_fvals, allele_code_byte_ct);
8168     one_cts[2] = rare01_ct - alt3_ct;
8169     one_cts[3] = alt3_ct;
8170     return;
8171   }
8172   if (allele_ct < 19) {
8173     if (allele_ct < 7) {
8174       STD_ARRAY_DECL(uint32_t, 4, rare0het_counts);
8175       GenoarrCountFreqs(R_CAST(const uintptr_t*, patch_01_fvals), rare01_ct, rare0het_counts);
8176       for (uint32_t allele_idx_p2 = 2; allele_idx_p2 != allele_ct; ++allele_idx_p2) {
8177         one_cts[allele_idx_p2] = rare0het_counts[allele_idx_p2 - 2];
8178       }
8179       return;
8180     }
8181     CountAllNybbles64(patch_01_fvals, rare01_ct, &(one_cts[2]));
8182     return;
8183   }
8184   CountAllBytes64(patch_01_fvals, rare01_ct, &(one_cts[2]));
8185 }
8186 
8187 // assumes one_cts[1] initialized to genocounts[1]
8188 // sample_include should be nullptr if we aren't subsetting
CountAllAux1a(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uintptr_t * __restrict raw_genoarr,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_01_ct,const unsigned char ** fread_pp,uint64_t * __restrict one_cts,uint32_t * __restrict deltalist_workspace)8189 PglErr CountAllAux1a(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_01_ct, const unsigned char** fread_pp, uint64_t* __restrict one_cts, uint32_t* __restrict deltalist_workspace) {
8190   if (aux1a_mode == 15) {
8191     return kPglRetSuccess;
8192   }
8193   if (!sample_include) {
8194     uint32_t rare01_ct;
8195     if (!aux1a_mode) {
8196       const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
8197       rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
8198       *fread_pp += fset_byte_ct;
8199     } else {
8200       const unsigned char* group_info_iter;
8201       PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare01_ct);
8202       if (unlikely(reterr)) {
8203         return reterr;
8204       }
8205       reterr = SkipDeltalistIds(fread_end, group_info_iter, rare01_ct, raw_sample_ct, 0, fread_pp);
8206       if (unlikely(reterr)) {
8207         return reterr;
8208       }
8209     }
8210     const unsigned char* patch_01_fvals = *fread_pp;
8211     const uint32_t fvals_byte_ct = GetAux1aAlleleEntryByteCt(allele_ct, rare01_ct);
8212     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8213       return kPglRetMalformedInput;
8214     }
8215     CountAllAux1aDense(patch_01_fvals, allele_ct, rare01_ct, one_cts);
8216     return kPglRetSuccess;
8217   }
8218   const uint32_t allele_code_width = GetAux1aWidth(allele_ct);
8219   const uintptr_t allele_code_mask = (1U << allele_code_width) - 1;
8220   uint64_t* one_cts_offset2 = &(one_cts[2]);
8221   if (!aux1a_mode) {
8222     const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
8223     const uint32_t rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
8224 #ifdef __arm__
8225 #  error "Unaligned accesses in CountAllAux1a()."
8226 #endif
8227     const uintptr_t* patch_01_fsetw = R_CAST(const uintptr_t*, *fread_pp);
8228     *fread_pp += fset_byte_ct;
8229     const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
8230     const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
8231     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8232       return kPglRetMalformedInput;
8233     }
8234     const Halfword* sample_include_hw = R_CAST(const Halfword*, sample_include);
8235     uintptr_t sample_hwidx = 0;
8236     uintptr_t cur_raw_genoarr_hets = Word01(raw_genoarr[0]);
8237     const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
8238     const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
8239     uintptr_t fvals_bits = 0;
8240     uint32_t fvals_widx = 0;
8241     uint32_t subsetted_rare01_ct = 0;
8242     uint32_t loop_len = kBitsPerWord;
8243     uint32_t rare01_lowbits = kBitsPerWord;
8244     for (uint32_t fset_widx = 0; ; ++fset_widx) {
8245       uintptr_t fset_bits;
8246       if (fset_widx >= fset_word_ct_m1) {
8247         if (fset_widx > fset_word_ct_m1) {
8248           break;
8249         }
8250         fset_bits = SubwordLoad(&(patch_01_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
8251         loop_len = ModNz(raw_01_ct, kBitsPerWord);
8252       } else {
8253         fset_bits = patch_01_fsetw[fset_widx];
8254       }
8255       if (allele_ct == 3) {
8256         for (uint32_t uii = 0; uii != loop_len; ++uii) {
8257           while (!cur_raw_genoarr_hets) {
8258             cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
8259           }
8260           if (fset_bits & 1) {
8261             const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
8262             subsetted_rare01_ct += (sample_include_hw[sample_hwidx] >> sample_uidx_lowbits) & 1;
8263           }
8264           cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
8265           fset_bits = fset_bits >> 1;
8266         }
8267       } else {
8268         for (uint32_t uii = 0; uii != loop_len; ++uii) {
8269           while (!cur_raw_genoarr_hets) {
8270             cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
8271           }
8272           if (fset_bits & 1) {
8273             if (rare01_lowbits == kBitsPerWord) {
8274               if (fvals_widx == fvals_word_ct_m1) {
8275                 fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8276               } else {
8277                 fvals_bits = patch_01_fvalsw[fvals_widx];
8278               }
8279               // unnecessary to apply bzhi here
8280               ++fvals_widx;
8281               rare01_lowbits = 0;
8282             }
8283             const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
8284             if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
8285               ++subsetted_rare01_ct;
8286               one_cts_offset2[(fvals_bits >> rare01_lowbits) & allele_code_mask] += 1;
8287             }
8288             rare01_lowbits += allele_code_width;
8289           }
8290           cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
8291           fset_bits = fset_bits >> 1;
8292         }
8293       }
8294     }
8295     one_cts_offset2[-1] -= subsetted_rare01_ct;
8296     if (allele_ct == 3) {
8297       one_cts_offset2[0] = subsetted_rare01_ct;
8298     }
8299     return kPglRetSuccess;
8300   }
8301   // mode 1: difflist.
8302   if (allele_ct == 3) {
8303     // Use CountDeltalistIntersect shortcut here.
8304     uint32_t subsetted_02_ct;
8305     uint32_t rare01_ct;
8306     PglErr reterr = CountDeltalistIntersect(fread_end, sample_include, raw_sample_ct, fread_pp, &subsetted_02_ct, &rare01_ct);
8307     if (unlikely(reterr)) {
8308       return reterr;
8309     }
8310     one_cts_offset2[-1] -= subsetted_02_ct;
8311     one_cts_offset2[0] = subsetted_02_ct;
8312     return kPglRetSuccess;
8313   }
8314   // Save deltalist elements, iterate.
8315   uint32_t rare01_ct;
8316   PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare01_ct);
8317   if (unlikely(reterr)) {
8318     return reterr;
8319   }
8320   const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
8321   const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
8322   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8323     return kPglRetMalformedInput;
8324   }
8325   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
8326   const uint32_t allele_code_logwidth = ctzu32(allele_code_width);
8327   uint32_t subsetted_rare01_ct = 0;
8328   uint32_t loop_len = kBitsPerWord >> allele_code_logwidth;
8329   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
8330     uintptr_t fvals_bits;
8331     if (fvals_widx >= fvals_word_ct_m1) {
8332       if (fvals_widx > fvals_word_ct_m1) {
8333         break;
8334       }
8335       fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8336       loop_len = 1 + ((rare01_ct - 1) & (loop_len - 1));
8337     } else {
8338       fvals_bits = patch_01_fvalsw[fvals_widx];
8339     }
8340     const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - allele_code_logwidth)]);
8341     for (uint32_t uii = 0; uii != loop_len; ++uii) {
8342       const uint32_t sample_uidx = cur_deltalist_base[uii];
8343       if (IsSet(sample_include, sample_uidx)) {
8344         ++subsetted_rare01_ct;
8345         one_cts_offset2[(fvals_bits >> (uii << allele_code_logwidth)) & allele_code_mask] += 1;
8346       }
8347     }
8348   }
8349   one_cts_offset2[-1] -= subsetted_rare01_ct;
8350   return kPglRetSuccess;
8351 }
8352 
CountAllAux1bDense(const void * __restrict patch_10_fvals,uint32_t allele_ct,uint32_t rare10_ct,uint64_t * __restrict one_cts_offset1,uint64_t * __restrict two_cts_offset1)8353 void CountAllAux1bDense(const void* __restrict patch_10_fvals, uint32_t allele_ct, uint32_t rare10_ct, uint64_t* __restrict one_cts_offset1, uint64_t* __restrict two_cts_offset1) {
8354   // probable todo: faster path if two_cts_offset1 == nullptr
8355   const uint32_t allele_ct_m1 = allele_ct - 1;
8356   two_cts_offset1[0] -= rare10_ct;
8357   if (allele_ct_m1 < 5) {
8358     if (allele_ct_m1 == 2) {
8359       const uint32_t allele_code_byte_ct = DivUp(rare10_ct, 8);
8360       const uint32_t hom22_ct = PopcountBytes(patch_10_fvals, allele_code_byte_ct);
8361       const uint32_t het12_ct = rare10_ct - hom22_ct;
8362       one_cts_offset1[0] += het12_ct;
8363       one_cts_offset1[1] += het12_ct;
8364       two_cts_offset1[1] = hom22_ct;
8365       return;
8366     }
8367     STD_ARRAY_DECL(uint32_t, 4, alt_counts);
8368     GenoarrCountFreqs(R_CAST(const uintptr_t*, patch_10_fvals), rare10_ct * 2, alt_counts);
8369     one_cts_offset1[0] += alt_counts[0];
8370     for (uint32_t allele_idx_m1 = 1; allele_idx_m1 != allele_ct_m1; ++allele_idx_m1) {
8371       const uint32_t homxx_ct = CountNybble(patch_10_fvals, allele_idx_m1 * kMask5555, rare10_ct);
8372       one_cts_offset1[allele_idx_m1] += alt_counts[allele_idx_m1] - 2 * homxx_ct;
8373       two_cts_offset1[allele_idx_m1] = homxx_ct;
8374     }
8375     return;
8376   }
8377   const unsigned char* patch_10_fvals_uc = S_CAST(const unsigned char*, patch_10_fvals);
8378   if (allele_ct_m1 < 17) {
8379     // for larger rare10_ct, this should use a byte counter
8380     for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
8381       const uint32_t cur_byte = patch_10_fvals_uc[uii];
8382       const uint32_t cur_byte_hi = cur_byte >> 4;
8383       const uint32_t cur_byte_lo = cur_byte & 15;
8384       if (cur_byte_hi == cur_byte_lo) {
8385         two_cts_offset1[cur_byte_lo] += 1;
8386       } else {
8387         one_cts_offset1[cur_byte_lo] += 1;
8388         one_cts_offset1[cur_byte_hi] += 1;
8389       }
8390     }
8391     return;
8392   }
8393   for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
8394     const uint32_t cur_byte_lo = patch_10_fvals_uc[2 * uii];
8395     const uint32_t cur_byte_hi = patch_10_fvals_uc[2 * uii + 1];
8396     if (cur_byte_hi == cur_byte_lo) {
8397       two_cts_offset1[cur_byte_lo] += 1;
8398     } else {
8399       one_cts_offset1[cur_byte_lo] += 1;
8400       one_cts_offset1[cur_byte_hi] += 1;
8401     }
8402   }
8403 }
8404 
CountAllAux1b(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_10_ct,const unsigned char ** fread_pp,uint64_t * __restrict one_cts,uint64_t * __restrict two_cts,uint32_t * __restrict deltalist_workspace)8405 PglErr CountAllAux1b(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uint64_t* __restrict one_cts, uint64_t* __restrict two_cts, uint32_t* __restrict deltalist_workspace) {
8406   if (aux1b_mode == 15) {
8407     return kPglRetSuccess;
8408   }
8409   uint64_t* one_cts_offset1 = &(one_cts[1]);
8410   uint64_t* two_cts_offset1 = &(two_cts[1]);
8411   if (!sample_include) {
8412     uint32_t rare10_ct;
8413     if (!aux1b_mode) {
8414       const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
8415       rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
8416       *fread_pp += fset_byte_ct;
8417     } else {
8418       const unsigned char* group_info_iter;
8419       PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare10_ct);
8420       if (unlikely(reterr)) {
8421         return reterr;
8422       }
8423       reterr = SkipDeltalistIds(fread_end, group_info_iter, rare10_ct, raw_sample_ct, 0, fread_pp);
8424       if (unlikely(reterr)) {
8425         return reterr;
8426       }
8427     }
8428     const unsigned char* patch_10_fvals = *fread_pp;
8429     const uint32_t fvals_byte_ct = GetAux1bAlleleEntryByteCt(allele_ct, rare10_ct);
8430     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8431       return kPglRetMalformedInput;
8432     }
8433     CountAllAux1bDense(patch_10_fvals, allele_ct, rare10_ct, one_cts_offset1, two_cts_offset1);
8434     return kPglRetSuccess;
8435   }
8436   uintptr_t detect_hom_mask_lo;  // unused
8437   const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
8438   const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
8439   const uint32_t allele_code_width = 1U << allele_code_logwidth;
8440   const uint32_t allele_code_mask = (1U << allele_code_width) - 1;
8441   const uint32_t allele_ct_m1 = allele_ct - 1;
8442   uint32_t rare10_lowbits = kBitsPerWord;
8443   // probable todo: faster paths when two_cts_offset1 == nullptr
8444   if (!aux1b_mode) {
8445     const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
8446     const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
8447 #ifdef __arm__
8448 #  error "Unaligned accesses in CountAllAux1b()."
8449 #endif
8450     const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
8451     *fread_pp += fset_byte_ct;
8452     const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
8453     const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, 8);
8454     if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8455       return kPglRetMalformedInput;
8456     }
8457     const Halfword* sample_include_hw = R_CAST(const Halfword*, sample_include);
8458     uintptr_t sample_hwidx = 0;
8459     uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
8460     const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
8461     const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
8462     const uint32_t code10_width = 1U << code10_logwidth;
8463     uintptr_t fvals_bits = 0;
8464     uint32_t fvals_widx = 0;
8465     uint32_t subsetted_rare10_ct = 0;
8466     uint32_t loop_len = kBitsPerWord;
8467     for (uint32_t fset_widx = 0; ; ++fset_widx) {
8468       uintptr_t fset_bits;
8469       if (fset_widx >= fset_word_ct_m1) {
8470         if (fset_widx > fset_word_ct_m1) {
8471           break;
8472         }
8473         fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
8474         loop_len = ModNz(raw_10_ct, kBitsPerWord);
8475       } else {
8476         fset_bits = patch_10_fsetw[fset_widx];
8477       }
8478       if (allele_ct_m1 == 2) {
8479         for (uint32_t uii = 0; uii != loop_len; ++uii) {
8480           while (!cur_raw_genoarr_xys) {
8481             cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
8482           }
8483           if (fset_bits & 1) {
8484             if (rare10_lowbits == kBitsPerWord) {
8485               if (fvals_widx == fvals_word_ct_m1) {
8486                 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8487               } else {
8488                 fvals_bits = patch_10_fvalsw[fvals_widx];
8489               }
8490               // unnecessary to apply bzhi here
8491               ++fvals_widx;
8492               rare10_lowbits = 0;
8493             }
8494             const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
8495             if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
8496               ++subsetted_rare10_ct;
8497               two_cts_offset1[1] += (fvals_bits >> rare10_lowbits) & 1;
8498             }
8499             ++rare10_lowbits;
8500           }
8501           cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
8502           fset_bits = fset_bits >> 1;
8503         }
8504       } else {
8505         for (uint32_t uii = 0; uii != loop_len; ++uii) {
8506           while (!cur_raw_genoarr_xys) {
8507             cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
8508           }
8509           if (fset_bits & 1) {
8510             if (rare10_lowbits == kBitsPerWord) {
8511               if (fvals_widx == fvals_word_ct_m1) {
8512                 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8513               } else {
8514                 fvals_bits = patch_10_fvalsw[fvals_widx];
8515               }
8516               // unnecessary to apply bzhi here
8517               ++fvals_widx;
8518               rare10_lowbits = 0;
8519             }
8520             const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
8521             if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
8522               ++subsetted_rare10_ct;
8523               const uintptr_t cur_code_pair = fvals_bits >> rare10_lowbits;
8524               const uint32_t cur_code_hi = (cur_code_pair >> allele_code_width) & allele_code_mask;
8525               const uint32_t cur_code_lo = cur_code_pair & allele_code_mask;
8526               if (cur_code_hi == cur_code_lo) {
8527                 two_cts_offset1[cur_code_lo] += 1;
8528               } else {
8529                 one_cts_offset1[cur_code_lo] += 1;
8530                 one_cts_offset1[cur_code_hi] += 1;
8531               }
8532             }
8533             rare10_lowbits += code10_width;
8534           }
8535           cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
8536           fset_bits = fset_bits >> 1;
8537         }
8538       }
8539     }
8540     two_cts_offset1[0] -= subsetted_rare10_ct;
8541     if (allele_ct == 3) {
8542       const uint32_t subsetted_het12_ct = subsetted_rare10_ct - two_cts_offset1[1];
8543       one_cts_offset1[0] += subsetted_het12_ct;
8544       one_cts_offset1[1] += subsetted_het12_ct;
8545     }
8546     return kPglRetSuccess;
8547   }
8548   // Save deltalist elements, iterate.
8549   uint32_t rare10_ct;
8550   PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
8551   if (unlikely(reterr)) {
8552     return reterr;
8553   }
8554   const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
8555   const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, 8);
8556   if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8557     return kPglRetMalformedInput;
8558   }
8559   const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
8560   uint32_t subsetted_rare10_ct = 0;
8561   uint32_t loop_len = kBitsPerWord >> code10_logwidth;
8562   for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
8563     uintptr_t fvals_bits;
8564     if (fvals_widx >= fvals_word_ct_m1) {
8565       if (fvals_widx > fvals_word_ct_m1) {
8566         break;
8567       }
8568       fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8569       loop_len = 1 + ((rare10_ct - 1) & (loop_len - 1));
8570     } else {
8571       fvals_bits = patch_10_fvalsw[fvals_widx];
8572     }
8573     const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
8574     if (allele_ct == 3) {
8575       for (uint32_t uii = 0; uii != loop_len; ++uii) {
8576         const uint32_t sample_uidx = cur_deltalist_base[uii];
8577         if (IsSet(sample_include, sample_uidx)) {
8578           ++subsetted_rare10_ct;
8579           two_cts_offset1[1] += (fvals_bits >> uii) & 1;
8580         }
8581       }
8582     } else {
8583       for (uint32_t uii = 0; uii != loop_len; ++uii) {
8584         const uint32_t sample_uidx = cur_deltalist_base[uii];
8585         if (IsSet(sample_include, sample_uidx)) {
8586           ++subsetted_rare10_ct;
8587           const uintptr_t cur_code_pair = fvals_bits >> (uii << code10_logwidth);
8588           const uint32_t cur_code_hi = (cur_code_pair >> allele_code_width) & allele_code_mask;
8589           const uint32_t cur_code_lo = cur_code_pair & allele_code_mask;
8590           if (cur_code_hi == cur_code_lo) {
8591             two_cts_offset1[cur_code_lo] += 1;
8592           } else {
8593             one_cts_offset1[cur_code_lo] += 1;
8594             one_cts_offset1[cur_code_hi] += 1;
8595           }
8596         }
8597       }
8598     }
8599   }
8600   two_cts_offset1[0] -= subsetted_rare10_ct;
8601   if (allele_ct == 3) {
8602     const uint32_t subsetted_het12_ct = subsetted_rare10_ct - two_cts_offset1[1];
8603     one_cts_offset1[0] += subsetted_het12_ct;
8604     one_cts_offset1[1] += subsetted_het12_ct;
8605   }
8606   return kPglRetSuccess;
8607 }
8608 
// Computes genotype counts, a het count, per-allele dosage sums and
// (optionally) an imputation-r2 value for one multiallelic variant.
//
// Outputs:
//   genocounts:  filled by GenoarrCountFreqsUnsafe() (no subsetting) or
//                GenoarrCountSubsetFreqs() (subsetting) from the raw genovec.
//   *het_ctp:    (sample_ct - genocounts[3]) minus the sum of all per-allele
//                hom counts.
//   all_dosages: for each allele, one_ct * 0x4000 + two_ct * 0x8000.
//   *imp_r2_ptr: if non-null, the result of MultiallelicDiploidMinimac3R2(),
//                doubled when is_minimac3_r2 is zero.
//
// pgrp->workspace_imp_r2 is used as scratch for 2 * allele_ct uint64 counters
// (one_cts[] followed by two_cts[]); two_cts[] is overwritten with
// sums-of-squares before the r2 call.
//
// If the variant record has a dosage track (vrtype & 0x60), a message is
// written to stderr and the process exits: that case is not implemented.
//
// Returns kPglRetSuccess, or the first error reported by the reader/parser
// helpers.
PglErr GetMultiallelicCountsAndDosage16s(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t sample_ct, uint32_t vidx, uint32_t allele_ct, __maybe_unused uint32_t is_minimac3_r2, PgenReaderMain* pgrp, double* __restrict imp_r2_ptr, uint32_t* __restrict het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* all_dosages) {
  // only called on multiallelic variants
  // no dosages for now
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  uintptr_t* raw_genovec = pgrp->workspace_vec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
  if (unlikely(reterr)) {
    return reterr;
  }
  ZeroTrailingNyps(raw_sample_ct, raw_genovec);
  if (!subsetting_required) {
    GenoarrCountFreqsUnsafe(raw_genovec, raw_sample_ct, genocounts);
    // From here on, a null sample_include means "no subsetting".
    sample_include = nullptr;
  } else {
    GenoarrCountSubsetFreqs(raw_genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
  }
  // Seed the per-allele counters from the basic genotype counts: value-1
  // genotypes count once toward alleles 0 and 1, value-0 genotypes are hom
  // for allele 0, and value-2 genotypes are provisionally hom for allele 1.
  // The aux1a/aux1b passes below then correct these counts.
  uint64_t* one_cts = pgrp->workspace_imp_r2;
  uint64_t* two_cts = &(one_cts[allele_ct]);
  one_cts[0] = genocounts[1];
  one_cts[1] = genocounts[1];
  ZeroU64Arr(allele_ct - 2, &(one_cts[2]));
  two_cts[0] = genocounts[0];
  two_cts[1] = genocounts[2];
  ZeroU64Arr(allele_ct - 2, &(two_cts[2]));
  // Cases:
  // - No hardcall-phase present.  Then we don't need to know raw_het_ct.
  // - No multiallelic dosages present, not computing minimac3-r2.  Then we
  //   still don't need to know raw_het_ct.
  // - Otherwise, we need to know raw_het_ct, either for the minimac3-r2
  //   computation or to locate the beginning of aux3/aux4.
  //   If we're computing minimac3-r2, AND
  //     (i) we're subsetting, or
  //     (ii) multiallelic dosages are present,
  //   it's also necessary to compute all_hets, either to compute correct
  //   subsetted minimac3-r2 or to know how many phased-hardcalls are
  //   overridden by phased dosages.
  const uint32_t raw_het_ct_needed = VrtypeHphase(vrtype) && (is_minimac3_r2 || (vrtype & 0x60));
  uintptr_t* all_hets = nullptr;
  const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
  uint32_t raw_het_ct = genocounts[1]; // inaccurate, corrected later if needed
  if (VrtypeMultiallelicHc(vrtype)) {
    // First aux1 byte: low nibble = aux1a mode, high nibble = aux1b mode.
    const uint32_t aux1_first_byte = *fread_ptr++;
    const uint32_t aux1a_mode = aux1_first_byte & 15;
    const uint32_t aux1b_mode = aux1_first_byte >> 4;
    uint32_t raw_10_ct = 0;
    if ((!aux1a_mode) || (!aux1b_mode) || sample_include) {
      GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_het_ct, &raw_10_ct);
    }
    uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
    reterr = CountAllAux1a(fread_end, sample_include, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, raw_het_ct, &fread_ptr, one_cts, deltalist_workspace);
    if (unlikely(reterr)) {
      return reterr;
    }
    // Remember where aux1b starts: GetAux1bHets() below re-parses the track.
    const unsigned char* aux1b_start = fread_ptr;
    reterr = CountAllAux1b(fread_end, sample_include, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, one_cts, two_cts, deltalist_workspace);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (raw_het_ct_needed) {
      if (!sample_include) {
        // Without subsetting the counters are raw counts: value-2 genotypes
        // that did not end up as hom for some alt allele are hets.
        raw_het_ct += genocounts[2];
        for (uint32_t aidx = 1; aidx != allele_ct; ++aidx) {
          raw_het_ct -= two_cts[aidx];
        }
      }
      if (sample_include || (is_minimac3_r2 && (vrtype & 0x60))) {
        all_hets = pgrp->workspace_all_hets;
        PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
        if (aux1b_mode != 15) {
          uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
          uint32_t aux1b_het_present;
          reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &aux1b_start, aux1b_hets, &aux1b_het_present, deltalist_workspace);
          if (unlikely(reterr)) {
            return reterr;
          }
          if (aux1b_het_present) {
            BitvecOr(aux1b_hets, raw_sample_ctl, all_hets);
          }
        }
        if (sample_include) {
          // With subsetting the counters above are subsetted, so the raw het
          // count is taken from the raw all_hets bitmap instead.
          raw_het_ct = PopcountWords(all_hets, raw_sample_ctl);
        }
      }
    }
  }
  uintptr_t* raw_phasepresent = nullptr;
  uint32_t extra_phased_het_ct = 0;
  if (raw_het_ct_needed) {
    if (!all_hets) {
      reterr = SkipAux2(fread_end, raw_het_ct, &fread_ptr, is_minimac3_r2? (&extra_phased_het_ct) : nullptr);
      if (unlikely(reterr)) {
        return reterr;
      }
    } else {
      raw_phasepresent = pgrp->workspace_subset;
      reterr = GetPhasepresentAndSkipPhaseinfo(fread_end, all_hets, raw_sample_ct, raw_het_ct, &fread_ptr, raw_phasepresent, &extra_phased_het_ct);
      if (unlikely(reterr)) {
        return reterr;
      }
      if (sample_include) {
        // Only phased hets inside the sample subset count.
        extra_phased_het_ct = PopcountWordsIntersect(raw_phasepresent, sample_include, raw_sample_ctl);
      }
    }
  }
  if (!(vrtype & 0x60)) {
    // No dosage track: dosages follow directly from the hardcall counts, with
    // one allele copy worth 0x4000 and two copies worth 0x8000.
    uint32_t hom_hc_ct = 0;
    for (uint32_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
      const uint64_t cur_hom_ct = two_cts[allele_idx];
      hom_hc_ct += cur_hom_ct;
      const uint64_t two_dosage = cur_hom_ct * 0x8000LLU;
      const uint64_t dosage_sum = one_cts[allele_idx] * 0x4000LLU + two_dosage;
      all_dosages[allele_idx] = dosage_sum;
      // Repurpose two_cts[] to store ssqs.
      // (one_ct * 0x4000^2 + two_ct * 0x8000^2, factored.)
      two_cts[allele_idx] = (dosage_sum + two_dosage) * 0x4000LLU;
    }
    const uint32_t nm_sample_ct = sample_ct - genocounts[3];
    *het_ctp = nm_sample_ct - hom_hc_ct;
    if (!imp_r2_ptr) {
      return kPglRetSuccess;
    }
    *imp_r2_ptr = MultiallelicDiploidMinimac3R2(all_dosages, two_cts, nm_sample_ct, allele_ct, extra_phased_het_ct);
    if (!is_minimac3_r2) {
      *imp_r2_ptr *= 2;
    }
    return kPglRetSuccess;
  }
  // Dosage track present: not implemented, terminate the process.
  fputs("dosages not yet supported by GetMultiallelicCountsAndDosage16s()\n", stderr);
  exit(S_CAST(int32_t, kPglRetNotYetSupported));
  return kPglRetNotYetSupported;
}
8743 
8744 PglErr PgrGetMDCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReader* pgr_ptr, double* __restrict imp_r2_ptr, uint32_t* __restrict het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages) {
8745   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8746   assert(vidx < pgrp->fi.raw_variant_ct);
8747   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8748   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8749   if (!sample_ct) {
8750     STD_ARRAY_REF_FILL0(4, genocounts);
8751     ZeroU64Arr(allele_ct, all_dosages);
8752     if (imp_r2_ptr) {
8753       *imp_r2_ptr = 0.0 / 0.0;
8754     }
8755     return kPglRetSuccess;
8756   }
8757   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
8758   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
8759   if ((allele_ct == 2) || (!(vrtype & 0x68))) {
8760     PglErr reterr = GetBasicGenotypeCountsAndDosage16s(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, vidx, is_minimac3_r2, pgrp, imp_r2_ptr, genocounts, all_dosages);
8761     *het_ctp = genocounts[1];
8762     ZeroU64Arr(allele_ct - 2, &(all_dosages[2]));
8763     return reterr;
8764   }
8765   return GetMultiallelicCountsAndDosage16s(sample_include, sample_include_interleaved_vec, sample_ct, vidx, allele_ct, is_minimac3_r2, pgrp, imp_r2_ptr, het_ctp, genocounts, all_dosages);
8766 }
8767 
PgrGetMD(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)8768 PglErr PgrGetMD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
8769   pgvp->patch_01_ct = 0;
8770   pgvp->patch_10_ct = 0;
8771   pgvp->dosage_ct = 0;
8772   pgvp->multidosage_sample_ct = 0;
8773   if (!sample_ct) {
8774     return kPglRetSuccess;
8775   }
8776   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8777   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
8778   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8779   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8780   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
8781   if ((allele_ct == 2) || (!(vrtype & 0x68))) {
8782     return IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, pgvp->genovec, pgvp->dosage_present, pgvp->dosage_main, &(pgvp->dosage_ct));
8783   }
8784   const unsigned char* fread_ptr;
8785   const unsigned char* fread_end;
8786   uintptr_t* all_hets = VrtypeHphase(vrtype)? pgrp->workspace_all_hets : nullptr;
8787   if (VrtypeMultiallelicHc(vrtype)) {
8788     PglErr reterr = GetMultiallelicCodes(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, all_hets? (&fread_ptr) : nullptr, all_hets? (&fread_end) : nullptr, all_hets, pgvp);
8789     if (!(vrtype & 0x60)) {
8790       return reterr;
8791     }
8792   } else {
8793     // todo: ReadRawGenovec, etc.
8794   }
8795   fputs("true multiallelic dosages not yet supported by PgrGetMD()\n", stderr);
8796   exit(S_CAST(int32_t, kPglRetNotYetSupported));
8797   return kPglRetSuccess;
8798 }
8799 
IMPLPgrGetDp(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,PgenVariant * pgvp)8800 PglErr IMPLPgrGetDp(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, PgenVariant* pgvp) {
8801   assert(vidx < pgrp->fi.raw_variant_ct);
8802   if (!sample_ct) {
8803     pgvp->phasepresent_ct = 0;
8804     pgvp->dosage_ct = 0;
8805     pgvp->dphase_ct = 0;
8806     return kPglRetSuccess;
8807   }
8808   const unsigned char* fread_ptr = nullptr;
8809   const unsigned char* fread_end = nullptr;
8810   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
8811   const uint32_t dosage_is_present = VrtypeDosage(vrtype);
8812   PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, dosage_is_present? (&fread_ptr) : nullptr, dosage_is_present? (&fread_end) : nullptr, pgvp->genovec, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct));
8813   if (reterr || (!dosage_is_present)) {
8814     pgvp->dosage_ct = 0;
8815     pgvp->dphase_ct = 0;
8816     return reterr;
8817   }
8818   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8819   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8820   return ParseDosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, allele_ct, pgrp, &(pgvp->dosage_ct), pgvp->dphase_present, pgvp->dphase_delta, &(pgvp->dphase_ct), pgvp->dosage_present, pgvp->dosage_main);
8821 }
8822 
PgrGetInv1Dp(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,AlleleCode allele_idx,PgenReader * pgr_ptr,PgenVariant * pgvp)8823 PglErr PgrGetInv1Dp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
8824   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8825   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
8826   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8827   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8828   if ((allele_ct == 2) || (!allele_idx)) {
8829     PglErr reterr = IMPLPgrGetDp(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, pgvp);
8830     if (allele_idx) {
8831       GenovecInvertUnsafe(sample_ct, pgvp->genovec);
8832       if (pgvp->phasepresent_ct) {
8833         BitvecInvert(BitCtToWordCt(sample_ct), pgvp->phaseinfo);
8834       }
8835       if (pgvp->dosage_ct) {
8836         BiallelicDosage16Invert(pgvp->dosage_ct, pgvp->dosage_main);
8837         if (pgvp->dphase_ct) {
8838           BiallelicDphase16Invert(pgvp->dphase_ct, pgvp->dphase_delta);
8839         }
8840       }
8841     }
8842     return reterr;
8843   }
8844   const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
8845   if (!VrtypeDosage(vrtype)) {
8846     pgvp->dosage_ct = 0;
8847     pgvp->dphase_ct = 0;
8848     return IMPLPgrGetInv1P(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, pgvp->genovec, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct));
8849   }
8850   fputs("multiallelic dosage not yet supported by GetInv1Dp()\n", stderr);
8851   exit(S_CAST(int32_t, kPglRetNotYetSupported));
8852   return kPglRetSuccess;
8853 }
8854 
PgrGetMDp(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)8855 PglErr PgrGetMDp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
8856   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8857   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
8858   pgvp->patch_01_ct = 0;
8859   pgvp->patch_10_ct = 0;
8860   pgvp->phasepresent_ct = 0;
8861   pgvp->dosage_ct = 0;
8862   pgvp->multidosage_sample_ct = 0;
8863   pgvp->dphase_ct = 0;
8864   pgvp->multidphase_sample_ct = 0;
8865   if (!sample_ct) {
8866     return kPglRetSuccess;
8867   }
8868   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8869   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8870   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
8871   if ((allele_ct == 2) || (!(vrtype & 0x68))) {
8872     return IMPLPgrGetDp(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, pgvp);
8873   }
8874   const unsigned char* fread_ptr;
8875   const unsigned char* fread_end;
8876   uintptr_t* all_hets = VrtypeHphase(vrtype)? pgrp->workspace_all_hets : nullptr;
8877   if (VrtypeMultiallelicHc(vrtype)) {
8878     PglErr reterr = GetMultiallelicCodes(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, all_hets? (&fread_ptr) : nullptr, all_hets? (&fread_end) : nullptr, all_hets, pgvp);
8879     if (reterr || (!all_hets)) {
8880       return reterr;
8881     }
8882     if (!(vrtype & 0x60)) {
8883       const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
8884       return ParseAux2Subset(fread_end, (sample_ct != raw_sample_ct)? sample_include : nullptr, all_hets, nullptr, raw_sample_ct, sample_ct, &fread_ptr, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct), pgrp->workspace_subset);
8885     }
8886   } else {
8887     // todo: ReadRawGenovec, etc.
8888   }
8889   fputs("true multiallelic dosages not yet supported by PgrGetMDp()\n", stderr);
8890   fprintf(stderr, "%u\n", vidx);
8891   exit(S_CAST(int32_t, kPglRetNotYetSupported));
8892   return kPglRetSuccess;
8893 
8894 }
8895 
8896 static_assert(sizeof(AlleleCode) == 1, "CountAux1bHets() must be updated.");
CountAux1bHets(const AlleleCode * patch_10_vals,uintptr_t rare10_ct)8897 uintptr_t CountAux1bHets(const AlleleCode* patch_10_vals, uintptr_t rare10_ct) {
8898   // Similar to CountByte().
8899   uintptr_t byte_ct = rare10_ct * 2;
8900 #ifdef __LP64__
8901   if (byte_ct < kBytesPerVec) {
8902 #endif
8903     uintptr_t tot = 0;
8904     for (uintptr_t offset = 0; offset < byte_ct; offset += 2) {
8905       tot += (patch_10_vals[offset] != patch_10_vals[offset + 1]);
8906     }
8907     return tot;
8908 #ifdef __LP64__
8909   }
8910   const unsigned char* bytearr_uc_iter = R_CAST(const unsigned char*, patch_10_vals);
8911   const VecW m0 = vecw_setzero();
8912   const VecW m8 = VCONST_W(kMask00FF);
8913   VecW acc = vecw_setzero();
8914   while (byte_ct > 255 * kBytesPerVec) {
8915     VecUc inner_acc = vecuc_setzero();
8916     for (uint32_t uii = 0; uii != 255; ++uii) {
8917       const VecUc cur_vvec = vecuc_loadu(bytearr_uc_iter);
8918       bytearr_uc_iter = &(bytearr_uc_iter[kBytesPerVec]);
8919       const VecUc shifted_vvec = R_CAST(VecUc, vecw_srli(R_CAST(VecW, cur_vvec), 8));
8920       inner_acc = inner_acc - (cur_vvec == shifted_vvec);
8921     }
8922     const VecW partial_sums = R_CAST(VecW, inner_acc) & m8;
8923     acc = acc + vecw_sad(partial_sums, m0);
8924     byte_ct -= 255 * kBytesPerVec;
8925   }
8926   const unsigned char* bytearr_uc_final = &(bytearr_uc_iter[byte_ct - kBytesPerVec]);
8927   VecUc inner_acc = vecuc_setzero();
8928   while (bytearr_uc_iter < bytearr_uc_final) {
8929     const VecUc cur_vvec = vecuc_loadu(bytearr_uc_iter);
8930     bytearr_uc_iter = &(bytearr_uc_iter[kBytesPerVec]);
8931     const VecUc shifted_vvec = R_CAST(VecUc, vecw_srli(R_CAST(VecW, cur_vvec), 8));
8932     inner_acc = inner_acc - (cur_vvec == shifted_vvec);
8933   }
8934   VecUc cur_vvec = vecuc_loadu(bytearr_uc_final);
8935   const uintptr_t overlap_byte_ct = bytearr_uc_iter - bytearr_uc_final;
8936   const VecUc shifted_vvec = R_CAST(VecUc, vecw_srli(R_CAST(VecW, cur_vvec), 8));
8937   const VecUc mask_vvec = vecuc_loadu(&(kLeadMask[kBytesPerVec - overlap_byte_ct]));
8938   cur_vvec = (cur_vvec == shifted_vvec) & mask_vvec;
8939   inner_acc = inner_acc - cur_vvec;
8940   const VecW partial_sums = R_CAST(VecW, inner_acc) & m8;
8941   acc = acc + vecw_sad(partial_sums, m0);
8942   const uintptr_t tot = HsumW(acc);
8943   return rare10_ct - tot;
8944 #endif
8945 }
8946 
PgrGetRaw(uint32_t vidx,PgenGlobalFlags read_gflags,PgenReader * pgr_ptr,uintptr_t ** loadbuf_iter_ptr,unsigned char * loaded_vrtype_ptr)8947 PglErr PgrGetRaw(uint32_t vidx, PgenGlobalFlags read_gflags, PgenReader* pgr_ptr, uintptr_t** loadbuf_iter_ptr, unsigned char* loaded_vrtype_ptr) {
8948   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8949   // currently handles multiallelic hardcalls, hardcall phase, and biallelic
8950   // dosage (both unphased and phased)
8951   // todo: multiallelic dosage
8952   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
8953   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
8954   uintptr_t* genovec = (*loadbuf_iter_ptr);
8955   uintptr_t* loadbuf_iter = &(genovec[NypCtToAlignedWordCt(raw_sample_ct)]);
8956   const uint32_t multiallelic_hc_present = (vrtype / 8) & 1;
8957   const uint32_t save_multiallelic_hc = multiallelic_hc_present && (read_gflags & kfPgenGlobalMultiallelicHardcallFound);
8958   const uint32_t hphase_is_present = (vrtype / 0x10) & 1;
8959   const uint32_t save_hphase = hphase_is_present && (read_gflags & kfPgenGlobalHardcallPhasePresent);
8960   const uint32_t dosage_is_present = (vrtype & 0x60)? 1 : 0;
8961   const uint32_t save_dosage = dosage_is_present && (read_gflags & kfPgenGlobalDosagePresent);
8962 
8963   const uint32_t save_dphase = (vrtype & 0x80) && (read_gflags & kfPgenGlobalDosagePhasePresent);
8964   assert(save_dosage || (!save_dphase));
8965 
8966   if (loaded_vrtype_ptr) {
8967     *loaded_vrtype_ptr = save_multiallelic_hc * 8 + save_hphase * 0x10 + save_dosage * 0x60 + save_dphase * 0x80;
8968   }
8969   const unsigned char* fread_ptr;
8970   const unsigned char* fread_end;
8971   PglErr reterr = ReadRawGenovec(0, vidx, pgrp, &fread_ptr, &fread_end, genovec);
8972   if ((!(multiallelic_hc_present || save_hphase || save_dosage)) || reterr) {
8973     *loadbuf_iter_ptr = loadbuf_iter;
8974     return reterr;
8975   }
8976 
8977   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
8978   ZeroTrailingNyps(raw_sample_ct, genovec);
8979   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8980   const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8981   uint32_t het_ct = 0;
8982   if (multiallelic_hc_present) {
8983     if (!save_multiallelic_hc) {
8984       // todo: erase-alt2+ fast path
8985       // mostly mirror PgrGet2P(0, 1), but a bit of extra logic is needed to
8986       // throw out phased-10het entries
8987       return kPglRetNotYetSupported;
8988     }
8989     // assume we always save multiallelic info
8990     // raw format:
8991     //   rare01_ct, padded out to a word
8992     //   rare10_ct, padded out to a word
8993     //   [round up to vector boundary, for patch_01_set]
8994     //   aux1a, if not mode 15:
8995     //     patch_01_set as bitarray, raw_sample_ctl words
8996     //     patch_01_vals, round up to word boundary
8997     //     [round up to vector boundary, for patch_10_set]
8998     //   aux1b, if not mode 15:
8999     //     patch_10_set as bitarray, raw_sample_ctl words
9000     //     patch_10_vals, round up to word boundary
9001     // round up to vector boundary at end
9002     const uint32_t aux1_first_byte = *fread_ptr++;
9003     const uint32_t aux1a_mode = aux1_first_byte & 15;
9004     const uint32_t aux1b_mode = aux1_first_byte >> 4;
9005     uint32_t raw_10_ct = 0;
9006     if ((!aux1a_mode) || hphase_is_present) {
9007       if (!aux1b_mode) {
9008         GenovecCount12Unsafe(genovec, raw_sample_ct, &het_ct, &raw_10_ct);
9009       } else {
9010         het_ct = CountNyp(genovec, kMask5555, raw_sample_ct);
9011       }
9012     } else if (!aux1b_mode) {
9013       raw_10_ct = CountNyp(genovec, kMaskAAAA, raw_sample_ct);
9014     }
9015     uintptr_t* multihc_raw = loadbuf_iter;
9016     loadbuf_iter = &(loadbuf_iter[RoundUpPow2(2, kWordsPerVec)]);
9017     uint32_t rare01_ct = 0;
9018     if (aux1a_mode != 15) {
9019       uintptr_t* patch_01_set = loadbuf_iter;
9020       loadbuf_iter = &(loadbuf_iter[raw_sample_ctl]);
9021       // (could decide to vector-align patch_01_vals later)
9022       AlleleCode* patch_01_vals = R_CAST(AlleleCode*, loadbuf_iter);
9023       reterr = ExportAux1a(fread_end, genovec, aux1a_mode, raw_sample_ct, allele_ct, het_ct, &fread_ptr, patch_01_set, patch_01_vals, &rare01_ct);
9024       if (unlikely(reterr)) {
9025         return reterr;
9026       }
9027       loadbuf_iter = &(loadbuf_iter[DivUp(rare01_ct, kBytesPerWord / sizeof(AlleleCode))]);
9028       VecAlignUp64(&loadbuf_iter);
9029     }
9030     uint32_t rare10_ct = 0;
9031     if (aux1b_mode != 15) {
9032       uintptr_t* patch_10_set = loadbuf_iter;
9033       loadbuf_iter = &(loadbuf_iter[raw_sample_ctl]);
9034       AlleleCode* patch_10_vals = R_CAST(AlleleCode*, loadbuf_iter);
9035       reterr = ExportAux1b(fread_end, genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, patch_10_set, patch_10_vals, &rare10_ct);
9036       if (unlikely(reterr)) {
9037         return reterr;
9038       }
9039       loadbuf_iter = &(loadbuf_iter[DivUp(rare10_ct, kBytesPerWord / (2 * sizeof(AlleleCode)))]);
9040       VecAlignUp64(&loadbuf_iter);
9041       if (hphase_is_present) {
9042         het_ct += CountAux1bHets(patch_10_vals, rare10_ct);
9043       }
9044     }
9045     multihc_raw[0] = rare01_ct;
9046     multihc_raw[1] = rare10_ct;
9047   } else if (hphase_is_present) {
9048     het_ct = CountNyp(genovec, kMask5555, raw_sample_ct);
9049   }
9050 
9051   if (hphase_is_present) {
9052     if (unlikely(!het_ct)) {
9053       // there shouldn't be a hphase track at all in this case
9054       return kPglRetMalformedInput;
9055     }
9056     const uint32_t het_ctdl = het_ct / kBitsPerWord;
9057     uintptr_t* phaseraw = loadbuf_iter;
9058     const uint32_t first_half_byte_ct = 1 + (het_ct / CHAR_BIT);
9059     if (save_hphase) {
9060       // this needs to be synced with MakePgenThread()
9061 #ifdef __LP64__
9062       // save het_ct later so we can use PopcountWords() below
9063       phaseraw[0] = 0;
9064 #else
9065       phaseraw[0] = het_ct;
9066       phaseraw[1] = 0;
9067 #endif
9068       loadbuf_iter = &(loadbuf_iter[8 / kBytesPerWord]);
9069       loadbuf_iter[het_ctdl] = 0;
9070       memcpy(loadbuf_iter, fread_ptr, first_half_byte_ct);
9071       loadbuf_iter = &(loadbuf_iter[1 + het_ctdl]);
9072     }
9073     const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
9074     const unsigned char* aux2_start = fread_ptr;
9075     fread_ptr = &(fread_ptr[first_half_byte_ct]);
9076     if (explicit_phasepresent) {
9077       uint32_t raw_phasepresent_ct;
9078       if (save_hphase) {
9079 #ifdef __LP64__
9080         raw_phasepresent_ct = PopcountWords(phaseraw, het_ctdl + 2);
9081 #else
9082         raw_phasepresent_ct = PopcountWords(&(phaseraw[2]), het_ctdl + 1);
9083 #endif
9084       } else {
9085         // bugfix (11 Apr 2018): not copied to phaseraw in this case
9086         raw_phasepresent_ct = PopcountBytes(aux2_start, first_half_byte_ct);
9087       }
9088       --raw_phasepresent_ct;
9089       if (unlikely(!raw_phasepresent_ct)) {
9090         // there shouldn't be a hphase track at all in this case, either
9091         return kPglRetMalformedInput;
9092       }
9093       const uint32_t second_half_byte_ct = DivUp(raw_phasepresent_ct, CHAR_BIT);
9094       if (save_hphase) {
9095 #ifdef __LP64__
9096         phaseraw[0] = het_ct | (S_CAST(uint64_t, raw_phasepresent_ct) << 32);
9097 #else
9098         phaseraw[1] = raw_phasepresent_ct;
9099 #endif
9100         memcpy(loadbuf_iter, fread_ptr, second_half_byte_ct);
9101         loadbuf_iter = &(loadbuf_iter[BitCtToWordCt(raw_phasepresent_ct)]);
9102       }
9103       fread_ptr = &(fread_ptr[second_half_byte_ct]);
9104     }
9105 #ifdef __LP64__
9106     if (save_hphase) {
9107       if (!explicit_phasepresent) {
9108         phaseraw[0] = het_ct;
9109       }
9110       VecAlignUp(&loadbuf_iter);
9111     }
9112 #endif
9113   }
9114   if (!save_dosage) {
9115     *loadbuf_iter_ptr = loadbuf_iter;
9116     return kPglRetSuccess;
9117   }
9118   uintptr_t* dosage_present = loadbuf_iter;
9119   const uint32_t raw_sample_ctaw = BitCtToAlignedWordCt(raw_sample_ct);
9120   loadbuf_iter = &(loadbuf_iter[raw_sample_ctaw]);
9121   uint16_t* dosage_main = R_CAST(uint16_t*, loadbuf_iter);
9122   // probable todo: pack this more tightly in the future
9123   const uintptr_t dosage_main_aligned_wordct = kWordsPerVec * DivUp(raw_sample_ct, (kBytesPerVec / sizeof(int16_t)));
9124   loadbuf_iter = &(loadbuf_iter[dosage_main_aligned_wordct]);
9125   uintptr_t* dphase_present = nullptr;
9126   int16_t* dphase_delta = nullptr;
9127   if (save_dphase) {
9128     dphase_present = loadbuf_iter;
9129     loadbuf_iter = &(loadbuf_iter[raw_sample_ctaw]);
9130     dphase_delta = R_CAST(int16_t*, loadbuf_iter);
9131     loadbuf_iter = &(loadbuf_iter[dosage_main_aligned_wordct]);
9132   }
9133   *loadbuf_iter_ptr = loadbuf_iter;
9134   return ParseDosage16(fread_ptr, fread_end, nullptr, raw_sample_ct, vidx, allele_ct, pgrp, nullptr, dphase_present, dphase_delta, nullptr, dosage_present, dosage_main);
9135 }
9136 
9137 
9138 // Currently assumes no phase or multiallelic hardcalls.
9139 // tried to have more custom code, turned out to not be worth it
ReadMissingness(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,const unsigned char ** fread_endp,uintptr_t * __restrict missingness,uintptr_t * __restrict hets,uintptr_t * __restrict genovec_buf)9140 PglErr ReadMissingness(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict missingness, uintptr_t* __restrict hets, uintptr_t* __restrict genovec_buf) {
9141   const unsigned char* fread_ptr;
9142   const unsigned char* fread_end;
9143   PglErr reterr = ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec_buf);
9144   ZeroTrailingNyps(sample_ct, genovec_buf);
9145   GenoarrToMissingnessUnsafe(genovec_buf, sample_ct, missingness);
9146   if (hets) {
9147     PgrDetectGenoarrHetsUnsafe(genovec_buf, NypCtToWordCt(sample_ct), hets);
9148   }
9149   if (fread_pp) {
9150     *fread_pp = fread_ptr;
9151     *fread_endp = fread_end;
9152   }
9153   return reterr;
9154 }
9155 
PgrGetMissingness(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict missingness,uintptr_t * __restrict genovec_buf)9156 PglErr PgrGetMissingness(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict missingness, uintptr_t* __restrict genovec_buf) {
9157   if (!sample_ct) {
9158     return kPglRetSuccess;
9159   }
9160   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
9161   // may as well add a hets parameter?
9162   assert(vidx < pgrp->fi.raw_variant_ct);
9163   return ReadMissingness(sample_include, GetSicp(pssi), sample_ct, vidx, pgrp, nullptr, nullptr, missingness, nullptr, genovec_buf);
9164 }
9165 
PgrGetMissingnessD(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict missingness_hc,uintptr_t * __restrict missingness_dosage,uintptr_t * __restrict hets,uintptr_t * __restrict genovec_buf)9166 PglErr PgrGetMissingnessD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict missingness_hc, uintptr_t* __restrict missingness_dosage, uintptr_t* __restrict hets, uintptr_t* __restrict genovec_buf) {
9167   if (!sample_ct) {
9168     return kPglRetSuccess;
9169   }
9170   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
9171   // sample_include can't be null
9172   // either missingness_hc or missingness_dosage must be non-null
9173   assert(vidx < pgrp->fi.raw_variant_ct);
9174   const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
9175   const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
9176   const uint32_t dosage_is_relevant = missingness_dosage && VrtypeDosage(vrtype);
9177   const uint32_t need_to_skip_aux1or2 = dosage_is_relevant && (vrtype & 0x18);
9178   const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
9179   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
9180   const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
9181   const unsigned char* fread_ptr = nullptr;
9182   const unsigned char* fread_end = nullptr;
9183   uintptr_t* missingness_base = missingness_hc? missingness_hc : missingness_dosage;
9184   if (!need_to_skip_aux1or2) {
9185     PglErr reterr = ReadMissingness(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, dosage_is_relevant? (&fread_ptr) : nullptr, dosage_is_relevant? (&fread_end) : nullptr, missingness_base, hets, genovec_buf);
9186     if (missingness_dosage && missingness_hc) {
9187       memcpy(missingness_dosage, missingness_hc, BitCtToWordCt(sample_ct) * sizeof(intptr_t));
9188     }
9189     if (reterr || (!dosage_is_relevant)) {
9190       return reterr;
9191     }
9192   } else {
9193     PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, genovec_buf);
9194     if (unlikely(reterr)) {
9195       return reterr;
9196     }
9197     ZeroTrailingNyps(raw_sample_ct, genovec_buf);
9198     uintptr_t* subsetted_genovec = pgrp->workspace_vec;
9199     CopyNyparrNonemptySubset(genovec_buf, sample_include, raw_sample_ct, sample_ct, subsetted_genovec);
9200     GenoarrToMissingnessUnsafe(subsetted_genovec, sample_ct, missingness_base);
9201     if (missingness_hc) {
9202       memcpy(missingness_dosage, missingness_hc, BitCtToWordCt(sample_ct) * sizeof(intptr_t));
9203     }
9204 
9205     const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
9206     const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
9207     if (VrtypeHphase(vrtype) || hets) {
9208       uintptr_t* all_hets = pgrp->workspace_all_hets;
9209       PgrDetectGenoarrHets(genovec_buf, raw_sample_ct, all_hets);
9210       if (VrtypeMultiallelicHc(vrtype)) {
9211         // see analogous branch in ReadGenovecHphaseSubsetUnsafe()
9212         // probable todo: make this a separate function
9213         const uint32_t aux1_first_byte = *fread_ptr++;
9214         const uint32_t aux1a_mode = aux1_first_byte & 15;
9215         const uint32_t aux1b_mode = aux1_first_byte >> 4;
9216         uint32_t raw_01_ct = 0;
9217         uint32_t raw_10_ct = 0;
9218         if ((!aux1a_mode) || (!aux1b_mode)) {
9219           GenovecCount12Unsafe(genovec_buf, raw_sample_ct, &raw_01_ct, &raw_10_ct);
9220         }
9221         reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr);
9222         if (unlikely(reterr)) {
9223           return reterr;
9224         }
9225         uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
9226         uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
9227         uint32_t aux1b_het_present;
9228         reterr = GetAux1bHets(fread_end, genovec_buf, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, aux1b_hets, &aux1b_het_present, deltalist_workspace);
9229         if (unlikely(reterr)) {
9230           return reterr;
9231         }
9232         if (aux1b_het_present) {
9233           BitvecOr(aux1b_hets, raw_sample_ctl, all_hets);
9234         }
9235       }
9236       if (hets) {
9237         CopyBitarrSubset(all_hets, sample_include, sample_ct, hets);
9238       }
9239       if (VrtypeHphase(vrtype)) {
9240         reterr = SkipAux2(fread_end, PopcountWords(all_hets, raw_sample_ctl), &fread_ptr, nullptr);
9241         if (unlikely(reterr)) {
9242           return reterr;
9243         }
9244       }
9245     } else {
9246       SkipAux1(fread_end, genovec_buf, raw_sample_ct, allele_ct, &fread_ptr);
9247     }
9248   }
9249   // now perform bitwise andnot with dosage_present
9250   if ((vrtype & 0x60) == 0x40) {
9251     // unconditional dosage.  spot-check the appropriate entries for equality
9252     // to 65535.
9253 #ifdef __arm__
9254 #  error "Unaligned accesses in PgrGetMissingnessPD()."
9255 #endif
9256     const uint16_t* dosage_main = R_CAST(const uint16_t*, fread_ptr);
9257     // bugfix (18 Feb 2019): sample_include is permitted to be nullptr here
9258     if (!subsetting_required) {
9259       // probable todo: faster iteration over set bits
9260       for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
9261         uintptr_t missing_dosage_bits = missingness_dosage[widx];
9262         if (missing_dosage_bits) {
9263           const uint16_t* cur_dosage_main = &(dosage_main[widx * kBitsPerWord]);
9264           do {
9265             uint32_t sample_idx_lowbits = ctzw(missing_dosage_bits);
9266             if (cur_dosage_main[sample_idx_lowbits] != 65535) {
9267               missingness_dosage[widx] ^= missing_dosage_bits & (-missing_dosage_bits);
9268             }
9269             missing_dosage_bits &= missing_dosage_bits - 1;
9270           } while (missing_dosage_bits);
9271         }
9272       }
9273     } else {
9274       uintptr_t sample_uidx_base = 0;
9275       uintptr_t sample_include_bits = sample_include[0];
9276       for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
9277         const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
9278         if (!IsSet(missingness_dosage, sample_idx)) {
9279           continue;
9280         }
9281         if (dosage_main[sample_uidx] != 65535) {
9282           ClearBit(sample_idx, missingness_dosage);
9283         }
9284       }
9285     }
9286     return kPglRetSuccess;
9287   }
9288   uintptr_t* dosage_present = pgrp->workspace_dosage_present;
9289   if ((vrtype & 0x60) == 0x20) {
9290     // dosage list
9291     uint32_t dummy;
9292     if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, &fread_ptr, dosage_present, &dummy))) {
9293       return kPglRetMalformedInput;
9294     }
9295   } else {
9296     // dosage bitarray
9297     dosage_present[raw_sample_ctl - 1] = 0;
9298     const uint32_t raw_sample_ctb = DivUp(raw_sample_ct, CHAR_BIT);
9299     memcpy(dosage_present, fread_ptr, raw_sample_ctb);
9300   }
9301   if (subsetting_required) {
9302     CopyBitarrSubset(dosage_present, sample_include, sample_ct, pgrp->workspace_vec);
9303     dosage_present = pgrp->workspace_vec;
9304   }
9305   BitvecInvmask(dosage_present, BitCtToWordCt(sample_ct), missingness_dosage);
9306   return kPglRetSuccess;
9307 }
9308 
ValidateVint31(const unsigned char * buf_end,const unsigned char ** bufpp,uint32_t * val_ptr)9309 static inline BoolErr ValidateVint31(const unsigned char* buf_end, const unsigned char** bufpp, uint32_t* val_ptr) {
9310   if (unlikely(buf_end <= (*bufpp))) {
9311     return 1;
9312   }
9313   uint32_t vint32 = *((*bufpp)++);
9314   if (vint32 <= 127) {
9315     *val_ptr = vint32;
9316     return 0;
9317   }
9318   vint32 &= 127;
9319   for (uint32_t shift = 7; shift != 28; shift += 7) {
9320     if (unlikely(buf_end == (*bufpp))) {
9321       return 1;
9322     }
9323     uint32_t uii = *((*bufpp)++);
9324     vint32 |= (uii & 127) << shift;
9325     if (uii <= 127) {
9326       *val_ptr = vint32;
9327       return 0;
9328     }
9329   }
9330   if (unlikely(buf_end == (*bufpp))) {
9331     return 1;
9332   }
9333   uint32_t uii = *((*bufpp)++);
9334   if (unlikely(uii > 7)) {
9335     return 1;
9336   }
9337   vint32 |= uii << 28;
9338   *val_ptr = vint32;
9339   return 0;
9340 }
9341 
ValidateDifflistHeader(const unsigned char * fread_end,uint32_t sample_ct,const unsigned char ** fread_pp,uintptr_t * raregeno_buf,const unsigned char ** difflist_group_info_ptr,uint32_t * difflist_len_ptr)9342 BoolErr ValidateDifflistHeader(const unsigned char* fread_end, uint32_t sample_ct, const unsigned char** fread_pp, uintptr_t* raregeno_buf, const unsigned char** difflist_group_info_ptr, uint32_t* difflist_len_ptr) {
9343   // can be used for deltalists: pass raregeno_buf == nullptr.
9344   if (unlikely(ValidateVint31(fread_end, fread_pp, difflist_len_ptr))) {
9345     // todo: ensure fread_pp points to a problematic byte whenever a validate_
9346     // function returns an error, so the error message can provide an accurate
9347     // byte offset.
9348     return 1;
9349   }
9350   const uint32_t difflist_len = *difflist_len_ptr;
9351   *difflist_group_info_ptr = *fread_pp;
9352   if (!difflist_len) {
9353     return 0;
9354   }
9355   if (unlikely(difflist_len > sample_ct / kPglMaxDifflistLenDivisor)) {
9356     return 1;
9357   }
9358   const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize);
9359   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(sample_ct);
9360   const uint32_t difflist_index_byte_ct = group_ct * (sample_id_byte_ct + 1) - 1;
9361   if (PtrAddCk(fread_end, difflist_index_byte_ct, fread_pp)) {
9362     return 1;
9363   }
9364   if (!raregeno_buf) {
9365     return 0;
9366   }
9367   const uint32_t raregeno_byte_ct = NypCtToByteCt(difflist_len);
9368   const unsigned char* raregeno_start = *fread_pp;
9369   if (PtrAddCk(fread_end, raregeno_byte_ct, fread_pp)) {
9370     return 1;
9371   }
9372   memcpy(raregeno_buf, raregeno_start, raregeno_byte_ct);
9373   const uint32_t difflist_len_mod4 = difflist_len % 4;
9374   if (difflist_len_mod4) {
9375     const uint32_t last_raregeno_byte = (*fread_pp)[-1];
9376     if (unlikely(last_raregeno_byte >> (2 * difflist_len_mod4))) {
9377       return 1;
9378     }
9379   }
9380   return 0;
9381 }
9382 
ValidateAndApplyDifflist(const unsigned char * fread_end,uint32_t common2_code,const unsigned char ** fread_pp,PgenReaderMain * pgrp,uintptr_t * __restrict genoarr)9383 BoolErr ValidateAndApplyDifflist(const unsigned char* fread_end, uint32_t common2_code, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
9384   // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
9385   // Similar to ParseAndApplyDifflist(), but with exhaustive input
9386   // validation.
9387   const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
9388   uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
9389   const unsigned char* group_info_iter;
9390   uint32_t difflist_len;
9391   if (unlikely(ValidateDifflistHeader(fread_end, sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len))) {
9392     return 1;
9393   }
9394   if (!difflist_len) {
9395     return 0;
9396   }
9397   const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
9398   if (common2_code) {
9399     // 1-bit format + list of exceptions.  In this case,
9400     //   (i) the length of the exception list must be < (sample_ct / 16)
9401     //   (ii) every raregeno entry must either be one of the two rare genotype
9402     //        values, or involve a rare alt allele.
9403     if (unlikely(difflist_len >= (sample_ct / (2 * kPglMaxDifflistLenDivisor)))) {
9404       return 1;
9405     }
9406     const uintptr_t common_code_delta = common2_code & 3;
9407     const uintptr_t inv_common_word1 = (3 - common2_code / 4) * kMask5555;
9408     const uintptr_t inv_common_word2 = inv_common_word1 - (common_code_delta * kMask5555);
9409     for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
9410       uintptr_t cur_raregeno_word = cur_raregeno_iter[subgroup_idx];
9411       const uintptr_t match1 = Word11(cur_raregeno_word ^ inv_common_word1);
9412       const uintptr_t match2 = Word11(cur_raregeno_word ^ inv_common_word2);
9413       if (subgroup_idx == subgroup_idx_last) {
9414         // ignore trailing bits
9415         const uint32_t lshift = ((-difflist_len) % kBitsPerWordD2) * 2;
9416         if (unlikely((match1 << lshift) || (match2 << lshift))) {
9417           return 1;
9418         }
9419         break;
9420       }
9421       if (unlikely(match1 || match2)) {
9422         // todo: if (multiallelic_hc_present && (!inv_common_word2)), record
9423         // might be fine; but we need to verify these are actually rare alt
9424         // alleles.
9425         // (er, above comment is obsolete)
9426         return 1;
9427       }
9428     }
9429   }
9430   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(sample_ct);
9431   const unsigned char* group_byte_cts_iter = &(group_info_iter[DivUp(difflist_len, kPglDifflistGroupSize) * sample_id_byte_ct]);
9432   const unsigned char* prev_group_start = *fread_pp;
9433 
9434   uintptr_t sample_idx = 0;
9435   for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
9436     uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
9437     if (subgroup_idx >= subgroup_idx_last) {
9438       if (subgroup_idx > subgroup_idx_last) {
9439         return 0;
9440       }
9441       remaining_deltas_in_subgroup &= difflist_len - 1;
9442     }
9443     if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
9444       uintptr_t new_sample_idx_start = SubU32Load(group_info_iter, sample_id_byte_ct);
9445       if (subgroup_idx) {
9446         if (unlikely(sample_idx >= new_sample_idx_start)) {
9447           return 1;
9448         }
9449         const uint32_t group_byte_ct = S_CAST(uint32_t, *group_byte_cts_iter++) + 63;
9450         if (unlikely(S_CAST(uintptr_t, (*fread_pp) - prev_group_start) != group_byte_ct)) {
9451           return 1;
9452         }
9453         prev_group_start = *fread_pp;
9454       }
9455       sample_idx = new_sample_idx_start;
9456       group_info_iter = &(group_info_iter[sample_id_byte_ct]);
9457     } else {
9458       uint32_t sample_idx_incr;
9459       if (unlikely(ValidateVint31(fread_end, fread_pp, &sample_idx_incr) || (!sample_idx_incr))) {
9460         return 1;
9461       }
9462       sample_idx += sample_idx_incr;
9463     }
9464     uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
9465     for (; ; --remaining_deltas_in_subgroup) {
9466       if (unlikely(sample_idx >= sample_ct)) {
9467         return 1;
9468       }
9469       const uintptr_t cur_geno = cur_raregeno_word & 3;
9470       AssignNyparrEntry(sample_idx, cur_geno, genoarr);
9471       if (!remaining_deltas_in_subgroup) {
9472         break;
9473       }
9474       uint32_t sample_idx_incr;
9475       if (unlikely(ValidateVint31(fread_end, fread_pp, &sample_idx_incr) || (!sample_idx_incr))) {
9476         return 1;
9477       }
9478       sample_idx += sample_idx_incr;
9479       cur_raregeno_word >>= 2;
9480     }
9481   }
9482 }
9483 
ValidateOnebit(const unsigned char * fread_end,const unsigned char ** fread_pp,PgenReaderMain * pgrp,uintptr_t * __restrict genoarr)9484 BoolErr ValidateOnebit(const unsigned char* fread_end, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
9485   // ParseOnebitUnsafe() with exhaustive input validation.
9486   const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
9487   const uint32_t common2_and_bitarray_byte_ct = (sample_ct + 15) / CHAR_BIT;
9488   const unsigned char* onebit_main_iter = *fread_pp;
9489   if (PtrAddCk(fread_end, common2_and_bitarray_byte_ct, fread_pp)) {
9490     return 1;
9491   }
9492   const uintptr_t common2_code = *onebit_main_iter++;
9493   const uintptr_t common_code_delta = common2_code & 3;
9494   uintptr_t word_base = common2_code / 4;
9495   if (unlikely((!common_code_delta) || (word_base + common_code_delta > 3))) {
9496     return 1;
9497   }
9498   word_base *= kMask5555;
9499   const uint32_t genoarr_widx_trail = (sample_ct + 7) / kBitsPerWordD2;
9500   const uint32_t genoarr_widx_end = NypCtToWordCt(sample_ct);
9501 #ifdef __arm__
9502 #  error "Unaligned accesses in ValidateOnebit()."
9503 #endif
9504   const Halfword* onebit_main = R_CAST(const Halfword*, onebit_main_iter);
9505   for (uint32_t genoarr_widx = 0; ; ++genoarr_widx) {
9506     uintptr_t ww;
9507     if (genoarr_widx >= genoarr_widx_trail) {
9508       if (genoarr_widx == genoarr_widx_end) {
9509         break;
9510       }
9511       const uint32_t nontrail_byte_ct = ((sample_ct - 1) % kBitsPerWordD2) / CHAR_BIT;
9512       ww = ProperSubwordLoad(&(onebit_main[genoarr_widx_trail]), 1 + nontrail_byte_ct);
9513       const uint32_t sample_ct_mod8 = sample_ct % 8;
9514       if (sample_ct_mod8) {
9515         if (unlikely(ww >> (nontrail_byte_ct * 8 + sample_ct_mod8))) {
9516           return 1;
9517         }
9518       }
9519     } else {
9520       ww = onebit_main[genoarr_widx];
9521     }
9522     ww = UnpackHalfwordToWord(ww);
9523     genoarr[genoarr_widx] = word_base + ww * common_code_delta;
9524   }
9525   return ValidateAndApplyDifflist(fread_end, common2_code, fread_pp, pgrp, genoarr);
9526 }
9527 
9528 // assumes that we aren't dealing with the trivial fixed-width case.
9529 // saves main genotype array to genovec.  does not zero out trailing bits.
ValidateGeno(const unsigned char * fread_end,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,uintptr_t * genovec,char * errstr_buf)9530 BoolErr ValidateGeno(const unsigned char* fread_end, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, uintptr_t* genovec, char* errstr_buf) {
9531   const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
9532   const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
9533   if (VrtypeLdCompressed(vrtype)) {
9534     CopyNyparr(pgrp->ldbase_genovec, sample_ct, genovec);
9535     if (unlikely(ValidateAndApplyDifflist(fread_end, 0, fread_pp, pgrp, genovec))) {
9536       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid LD difflist for (0-based) variant #%u.\n", vidx);
9537       return 1;
9538     }
9539     if (vrtype & 1) {
9540       GenovecInvertUnsafe(sample_ct, genovec);
9541     }
9542     return 0;
9543   }
9544   const uint32_t is_ldbase = VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
9545   if (!(vrtype & 4)) {
9546     if (vrtype & 1) {
9547       if (unlikely(ValidateOnebit(fread_end, fread_pp, pgrp, genovec))) {
9548         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid 1-bit genotype record for (0-based) variant #%u.\n", vidx);
9549         return 1;
9550       }
9551     } else {
9552       const uint32_t genovec_byte_ct = DivUp(sample_ct, 4);
9553       const unsigned char* src_genodata = *fread_pp;
9554       if (PtrAddCk(fread_end, genovec_byte_ct, fread_pp)) {
9555         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid 2-bit genotype record for (0-based) variant #%u\n", vidx);
9556         return 1;
9557       }
9558       memcpy(genovec, src_genodata, genovec_byte_ct);
9559       const uint32_t sample_ct_mod4 = sample_ct % 4;
9560       if (sample_ct_mod4) {
9561         const uint32_t last_geno_byte = (*fread_pp)[-1];
9562         if (unlikely(last_geno_byte >> (2 * sample_ct_mod4))) {
9563           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Last genotype byte for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9564           return 1;
9565         }
9566       }
9567     }
9568   } else {
9569     const uint32_t vrtype_low2 = vrtype & 3;
9570     if (vrtype_low2 != 1) {
9571       const uint32_t vec_ct = NypCtToVecCt(sample_ct);
9572       vecset(genovec, vrtype_low2 * kMask5555, vec_ct);
9573       if (unlikely(ValidateAndApplyDifflist(fread_end, 0, fread_pp, pgrp, genovec))) {
9574         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid genotype difflist for (0-based) variant #%u.\n", vidx);
9575         return 1;
9576       }
9577     } else {
9578       if (is_ldbase) {
9579         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid LD back-reference from variant #%u to all-hom-ref variant #%u.\n", vidx + 1, vidx);
9580         return 1;
9581       }
9582       ZeroWArr(NypCtToWordCt(sample_ct), genovec);
9583     }
9584   }
9585   if (is_ldbase) {
9586     CopyNyparr(genovec, sample_ct, pgrp->ldbase_genovec);
9587   }
9588   return 0;
9589 }
9590 
ValidateAndCountDeltalist(const unsigned char * fread_end,uint32_t sample_ct,const unsigned char ** fread_pp,uint32_t * __restrict deltalist,uint32_t * deltalist_len_ptr)9591 BoolErr ValidateAndCountDeltalist(const unsigned char* fread_end, uint32_t sample_ct, const unsigned char** fread_pp, uint32_t* __restrict deltalist, uint32_t* deltalist_len_ptr) {
9592   // pass deltalist == nullptr when actual bit positions aren't needed
9593   const unsigned char* group_info_iter;
9594   if (unlikely(ValidateDifflistHeader(fread_end, sample_ct, fread_pp, nullptr, &group_info_iter, deltalist_len_ptr))) {
9595     return 1;
9596   }
9597   const uint32_t deltalist_len = *deltalist_len_ptr;
9598   if (!deltalist_len) {
9599     return 0;
9600   }
9601   const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(sample_ct);
9602   const uint32_t group_idx_last = (deltalist_len - 1) / kPglDifflistGroupSize;
9603   const unsigned char* group_byte_cts_iter = &(group_info_iter[DivUp(deltalist_len, kPglDifflistGroupSize) * sample_id_byte_ct]);
9604   const unsigned char* prev_group_start = *fread_pp;
9605   uint32_t* deltalist_iter = deltalist;
9606   uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
9607   uintptr_t sample_idx = 0;
9608   for (uint32_t group_idx = 0; ; ++group_idx) {
9609     if (group_idx >= group_idx_last) {
9610       if (group_idx > group_idx_last) {
9611         return 0;
9612       }
9613       group_len_m1 &= deltalist_len - 1;
9614     }
9615     uintptr_t new_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
9616     if (group_idx) {
9617       if (unlikely(sample_idx >= new_sample_idx)) {
9618         return 1;
9619       }
9620       const uint32_t group_byte_ct = S_CAST(uint32_t, *group_byte_cts_iter++) + 63;
9621       if (unlikely(S_CAST(uintptr_t, (*fread_pp) - prev_group_start) != group_byte_ct)) {
9622         return 1;
9623       }
9624       prev_group_start = *fread_pp;
9625     }
9626     sample_idx = new_sample_idx;
9627     group_info_iter = &(group_info_iter[sample_id_byte_ct]);
9628     for (uint32_t deltalist_idx_lowbits = 0; ; ++deltalist_idx_lowbits) {
9629       if (unlikely(sample_idx >= sample_ct)) {
9630         return 1;
9631       }
9632       if (deltalist_iter) {
9633         *deltalist_iter++ = sample_idx;
9634       }
9635       if (deltalist_idx_lowbits == group_len_m1) {
9636         break;
9637       }
9638       uint32_t sample_idx_incr;
9639       if (unlikely(ValidateVint31(fread_end, fread_pp, &sample_idx_incr) || (!sample_idx_incr))) {
9640         return 1;
9641       }
9642       sample_idx += sample_idx_incr;
9643     }
9644   }
9645 }
9646 
ValidateMultiallelicHc(const unsigned char * fread_end,const uintptr_t * __restrict raw_genovec,uint32_t vidx,uint32_t allele_ct,PgenReaderMain * pgrp,const unsigned char ** fread_pp,uint32_t * __restrict het_ctp,char * __restrict errstr_buf)9647 BoolErr ValidateMultiallelicHc(const unsigned char* fread_end, const uintptr_t* __restrict raw_genovec, uint32_t vidx, uint32_t allele_ct, PgenReaderMain* pgrp, const unsigned char** fread_pp, uint32_t* __restrict het_ctp, char* __restrict errstr_buf) {
9648   if (unlikely(allele_ct <= 2)) {
9649     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic hardcall track present for (0-based) variant #%u, but it apparently has only %u allele%s.\n", vidx, allele_ct, (allele_ct == 1)? "" : "s");
9650     return 1;
9651   }
9652   const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
9653   const uint32_t aux1_first_byte = **fread_pp;
9654   *fread_pp += 1;
9655   if (unlikely(
9656           aux1_first_byte &&
9657           (aux1_first_byte != 1) &&
9658           (aux1_first_byte != 15) &&
9659           (aux1_first_byte != 16) &&
9660           (aux1_first_byte != 17) &&
9661           (aux1_first_byte != 31) &&
9662           (aux1_first_byte != 240) &&
9663           (aux1_first_byte != 241))) {
9664     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic hardcall track mode byte (%u; must be in {0, 1, 15, 16, 17, 31, 240, 241}) in (0-based) variant #%u.\n", aux1_first_byte, vidx);
9665     return 1;
9666   }
9667   const uint32_t aux1a_mode = aux1_first_byte & 15;
9668   const uint32_t aux1b_mode = aux1_first_byte >> 4;
9669   uint32_t raw_01_ct;
9670   uint32_t raw_10_ct;
9671   GenovecCount12Unsafe(raw_genovec, sample_ct, &raw_01_ct, &raw_10_ct);
9672   uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
9673   if (aux1a_mode != 15) {
9674     if (unlikely(!raw_01_ct)) {
9675       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic het-ref hardcall track present for (0-based) variant #%u, but no het-ref calls exist.\n", vidx);
9676       return 1;
9677     }
9678     uint32_t rare01_ct;
9679     if (!aux1a_mode) {
9680       const uint32_t subset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
9681       if (PtrCheck(fread_end, *fread_pp, subset_byte_ct)) {
9682         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall bitarray-subset for (0-based) variant #%u.\n", vidx);
9683         return 1;
9684       }
9685       rare01_ct = PopcountBytes(*fread_pp, subset_byte_ct);
9686       if (unlikely(!rare01_ct)) {
9687         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Empty multiallelic het-ref hardcall bitarray-subset for (0-based) variant #%u.\n", vidx);
9688         return 1;
9689       }
9690       *fread_pp += subset_byte_ct;
9691       const uint32_t raw_01_ct_mod8 = raw_01_ct % 8;
9692       if (raw_01_ct_mod8) {
9693         if (unlikely((*fread_pp)[-1] >> raw_01_ct_mod8)) {
9694           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic het-ref hardcall bitarray-subset for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9695           return 1;
9696         }
9697       }
9698     } else {
9699       if (unlikely(ValidateAndCountDeltalist(fread_end, sample_ct, fread_pp, deltalist_workspace, &rare01_ct) || (!rare01_ct))) {
9700         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall deltalist-subset for (0-based) variant #%u.\n", vidx);
9701         return 1;
9702       }
9703       for (uint32_t uii = 0; uii != rare01_ct; ++uii) {
9704         if (unlikely(GetNyparrEntry(raw_genovec, deltalist_workspace[uii]) != 1)) {
9705           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall deltalist-subset for (0-based) variant #%u (an index doesn't correspond to a het-ref call).\n", vidx);
9706           return 1;
9707         }
9708       }
9709     }
9710     if (allele_ct < 5) {
9711       // Nothing to do for allele_ct == 3.
9712       if (allele_ct == 4) {
9713         // 1-bit entries.  Contents must be in range, so just validate trailing
9714         // bits.
9715         const uint32_t fvals_byte_ct = DivUp(rare01_ct, 8);
9716         if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9717           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9718           return 1;
9719         }
9720         const uint32_t rare01_ct_mod8 = rare01_ct % 8;
9721         if (rare01_ct_mod8) {
9722           if (unlikely((*fread_pp)[-1] >> rare01_ct_mod8)) {
9723             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9724             return 1;
9725           }
9726         }
9727       }
9728     } else {
9729       const unsigned char* fvals = *fread_pp;
9730       if (allele_ct < 19) {
9731         if (allele_ct < 7) {
9732           // 2-bit entries.
9733           const uint32_t fvals_byte_ct = DivUp(rare01_ct, 4);
9734           if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9735             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9736             return 1;
9737           }
9738           if (allele_ct == 5) {
9739             // Contents may be out-of-range.
9740             const uint32_t fullword_ct = fvals_byte_ct / kBytesPerWord;
9741             uint32_t widx = 0;
9742             if (fullword_ct) {
9743               const uintptr_t* fvals_alias = R_CAST(const uintptr_t*, fvals);
9744               for (; widx != fullword_ct; ++widx) {
9745                 const uintptr_t cur_word = fvals_alias[widx];
9746                 if (unlikely(cur_word & (cur_word >> 1) & kMask5555)) {
9747                   snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (out-of-range allele code).\n", vidx);
9748                   return 1;
9749                 }
9750               }
9751             }
9752             for (uint32_t uii = widx * kBytesPerWord; uii != fvals_byte_ct; ++uii) {
9753               const uint32_t cur_byte = fvals[uii];
9754               if (unlikely(cur_byte & (cur_byte >> 1) & 0x55)) {
9755                 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (out-of-range allele code).\n", vidx);
9756                 return 1;
9757               }
9758             }
9759           }
9760           // Validate trailing bits.
9761           const uint32_t rare01_ct_mod4 = rare01_ct % 4;
9762           if (rare01_ct_mod4) {
9763             if (unlikely((*fread_pp)[-1] >> (2 * rare01_ct_mod4))) {
9764               snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9765               return 1;
9766             }
9767           }
9768         } else {
9769           // 4-bit entries.
9770           const uint32_t fvals_byte_ct = DivUp(rare01_ct, 2);
9771           if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9772             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9773             return 1;
9774           }
9775           if (allele_ct != 18) {
9776             // Contents may be out-of-range.
9777             // (Can optimize this loop later.)
9778             const uint32_t max_code = allele_ct - 3;
9779             for (uint32_t uii = 0; uii != fvals_byte_ct; ++uii) {
9780               const uint32_t cur_byte = fvals[uii];
9781               if (unlikely(((cur_byte & 15) > max_code) || ((cur_byte >> 4) > max_code))) {
9782                 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (out-of-range allele code).\n", vidx);
9783                 return 1;
9784               }
9785             }
9786           }
9787           // Validate trailing bits.
9788           const uint32_t rare01_ct_mod2 = rare01_ct % 2;
9789           if (rare01_ct_mod2) {
9790             if (unlikely((*fread_pp)[-1] >> 4)) {
9791               snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9792               return 1;
9793             }
9794           }
9795         }
9796       } else {
9797         // 8-bit entries.
9798         if (PtrAddCk(fread_end, rare01_ct, fread_pp)) {
9799           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9800           return 1;
9801         }
9802         // Can optimize this loop later.
9803         const uint32_t max_code = allele_ct - 3;
9804         for (uint32_t uii = 0; uii != rare01_ct; ++uii) {
9805           const uint32_t cur_byte = fvals[uii];
9806           if (unlikely(cur_byte > max_code)) {
9807             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (out-of-range allele code).\n", vidx);
9808             return 1;
9809           }
9810         }
9811       }
9812     }
9813   }
9814   if (aux1b_mode != 15) {
9815     if (unlikely(!raw_10_ct)) {
9816       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic altxy hardcall track present for (0-based) variant #%u, but no altxy calls exist.\n", vidx);
9817       return 1;
9818     }
9819     uint32_t rare10_ct;
9820     if (!aux1b_mode) {
9821       const uint32_t subset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
9822       if (PtrCheck(fread_end, *fread_pp, subset_byte_ct)) {
9823         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall bitarray-subset for (0-based) variant #%u.\n", vidx);
9824         return 1;
9825       }
9826       rare10_ct = PopcountBytes(*fread_pp, subset_byte_ct);
9827       if (unlikely(!rare10_ct)) {
9828         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Empty multiallelic altxy hardcall bitarray-subset for (0-based) variant #%u.\n", vidx);
9829         return 1;
9830       }
9831       *fread_pp += subset_byte_ct;
9832       const uint32_t raw_10_ct_mod8 = raw_10_ct % 8;
9833       if (raw_10_ct_mod8) {
9834         if (unlikely((*fread_pp)[-1] >> raw_10_ct_mod8)) {
9835           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic altxy hardcall bitarray-subset for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9836           return 1;
9837         }
9838       }
9839     } else {
9840       if (unlikely(ValidateAndCountDeltalist(fread_end, sample_ct, fread_pp, deltalist_workspace, &rare10_ct) || (!rare10_ct))) {
9841         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall deltalist-subset for (0-based) variant #%u.\n", vidx);
9842         return 1;
9843       }
9844       for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
9845         if (unlikely(GetNyparrEntry(raw_genovec, deltalist_workspace[uii]) != 2)) {
9846           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall deltalist-subset for (0-based) variant #%u (an index doesn't correspond to an altxy call).\n", vidx);
9847           return 1;
9848         }
9849       }
9850     }
9851     const unsigned char* fvals = *fread_pp;
9852     uint32_t het_incr;
9853     if (allele_ct < 6) {
9854       if (allele_ct == 3) {
9855         // 1-bit entries.
9856         const uint32_t fvals_byte_ct = DivUp(rare10_ct, 8);
9857         if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9858           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9859           return 1;
9860         }
9861         const uint32_t rare10_ct_mod8 = rare10_ct % 8;
9862         if (rare10_ct_mod8) {
9863           if (unlikely((*fread_pp)[-1] >> rare10_ct_mod8)) {
9864             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9865             return 1;
9866           }
9867         }
9868         het_incr = rare10_ct - PopcountBytes(fvals, fvals_byte_ct);
9869       } else {
9870         // 2+2 bit entries.
9871         const uint32_t fvals_byte_ct = DivUp(rare10_ct, 2);
9872         if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9873           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9874           return 1;
9875         }
9876         // Can optimize this later.
9877         uint64_t nybble_cts[16];
9878         ZeroU64Arr(16, nybble_cts);
9879         CountAllNybbles64(fvals, rare10_ct, nybble_cts);
9880         // 1/1 is invalid here
9881         if (nybble_cts[0]) {
9882           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (out-of-range allele code pair).\n", vidx);
9883           return 1;
9884         }
9885         const uint32_t max_code = allele_ct - 2;
9886         for (uint32_t hi_code = 0; hi_code != 4; ++hi_code) {
9887           uint32_t lo_code = hi_code + 1;
9888           if (hi_code > max_code) {
9889             lo_code = 0;
9890           }
9891           const uint64_t* nybble_cts_offset = &(nybble_cts[hi_code * 4]);
9892           for (; lo_code != 4; ++lo_code) {
9893             if (nybble_cts_offset[lo_code]) {
9894               snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (out-of-range allele code pair).\n", vidx);
9895               return 1;
9896             }
9897           }
9898         }
9899         const uintptr_t rarehom_ct = nybble_cts[5] + nybble_cts[10] + nybble_cts[15];
9900         het_incr = rare10_ct - rarehom_ct;
9901         const uint32_t rare10_ct_mod2 = rare10_ct % 2;
9902         if (rare10_ct_mod2) {
9903           if (unlikely((*fread_pp)[-1] >> 4)) {
9904             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9905             return 1;
9906           }
9907         }
9908       }
9909     } else {
9910       if (allele_ct < 18) {
9911         // 4+4 bit entries.
9912         if (PtrAddCk(fread_end, rare10_ct, fread_pp)) {
9913           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9914           return 1;
9915         }
9916         const uint32_t max_code = allele_ct - 2;
9917         het_incr = 0;
9918         for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
9919           const uint32_t cur_byte = fvals[uii];
9920           const uint32_t lo_code = cur_byte & 15;
9921           const uint32_t hi_code = cur_byte >> 4;
9922           if (unlikely((!hi_code) || (hi_code > max_code) || (lo_code > hi_code))) {
9923             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (out-of-range or misordered allele code pair).\n", vidx);
9924             return 1;
9925           }
9926           het_incr += (lo_code != hi_code);
9927         }
9928       } else {
9929         // 8+8 bit entries
9930         if (PtrAddCk(fread_end, 2 * rare10_ct, fread_pp)) {
9931           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9932           return 1;
9933         }
9934         const uint32_t max_code = allele_ct - 2;
9935         het_incr = 0;
9936         for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
9937           const AlleleCode lo_code = fvals[2 * uii];
9938           const AlleleCode hi_code = fvals[2 * uii + 1];
9939           if (unlikely((!hi_code) || (hi_code > max_code) || (lo_code > hi_code))) {
9940             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (out-of-range or misordered allele code pair).\n", vidx);
9941             return 1;
9942           }
9943           het_incr += (lo_code != hi_code);
9944         }
9945       }
9946     }
9947     *het_ctp += het_incr;
9948   }
9949   return 0;
9950 }
9951 
ValidateHphase(const unsigned char * fread_end,uint32_t vidx,uint32_t het_ct,const unsigned char ** fread_pp,char * errstr_buf)9952 BoolErr ValidateHphase(const unsigned char* fread_end, uint32_t vidx, uint32_t het_ct, const unsigned char** fread_pp, char* errstr_buf) {
9953   if (unlikely(!het_ct)) {
9954     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Hardcall phase track present for (0-based) variant #%u, but there were no heterozygous calls.\n", vidx);
9955     return 1;
9956   }
9957   const uint32_t aux2_first_part_byte_ct = 1 + (het_ct / CHAR_BIT);
9958   const unsigned char* aux2_first_part = *fread_pp;
9959   if (PtrAddCk(fread_end, aux2_first_part_byte_ct, fread_pp)) {
9960     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid hardcall phase track present for (0-based) variant #%u.\n", vidx);
9961     return 1;
9962   }
9963   const uint32_t het_ct_p1_mod8 = (het_ct + 1) % CHAR_BIT;
9964   if (het_ct_p1_mod8) {
9965     // verify trailing bits are zero
9966     if (unlikely((*fread_pp)[-1] >> het_ct_p1_mod8)) {
9967       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Hardcall phase track for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9968       return 1;
9969     }
9970   }
9971   if (!((*aux2_first_part) & 1)) {
9972     // phase always present, "first part" is only part
9973     return 0;
9974   }
9975   const uint32_t phasepresent_ct = PopcountBytes(aux2_first_part, aux2_first_part_byte_ct) - 1;
9976   if (unlikely(!phasepresent_ct)) {
9977     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Hardcall phase track for (0-based) variant #%u does not have any actual phase information.\n", vidx);
9978     return 1;
9979   }
9980   const uint32_t phaseinfo_byte_ct = DivUp(phasepresent_ct, CHAR_BIT);
9981   if (PtrAddCk(fread_end, phaseinfo_byte_ct, fread_pp)) {
9982     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid hardcall phase track present for (0-based) variant #%u.\n", vidx);
9983     return 1;
9984   }
9985   const uint32_t phasepresent_ct_mod8 = phasepresent_ct % 8;
9986   if (phasepresent_ct_mod8) {
9987     if (unlikely((*fread_pp)[-1] >> phasepresent_ct_mod8)) {
9988       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Hardcall phase track for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9989       return 1;
9990     }
9991   }
9992   return 0;
9993 }
9994 
ValidateDosage16(const unsigned char * fread_end,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,char * errstr_buf)9995 PglErr ValidateDosage16(const unsigned char* fread_end, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, char* errstr_buf) {
9996   // similar to ParseDosage16().  doesn't support multiallelic data yet.
9997   const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
9998   const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
9999   if ((vrtype & 0x60) == 0x40) {
10000     // unconditional dosage.  handle separately from other two cases since
10001     // 65535 is valid.
10002 #ifdef __arm__
10003 #  error "Unaligned accesses in ValidateDosage16()."
10004 #endif
10005     const uint16_t* dosage_main = R_CAST(const uint16_t*, *fread_pp);
10006     if (PtrAddCk(fread_end, sample_ct * sizeof(int16_t), fread_pp)) {
10007       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid unconditional dosage track for (0-based) variant #%u.\n", vidx);
10008       return kPglRetMalformedInput;
10009     }
10010     // todo: verify genotype and dosage are consistent
10011     for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
10012       uint16_t cur_dosage_val_p1 = dosage_main[sample_idx];
10013       cur_dosage_val_p1 += 1;  // intentional overflow on 65535
10014       if (unlikely(cur_dosage_val_p1 > 32769)) {
10015         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid unconditional dosage track for (0-based) variant #%u (dosage is greater than 2).\n", vidx);
10016         return kPglRetMalformedInput;
10017       }
10018     }
10019     if (vrtype & 0x80) {
10020       const int16_t* dphase_delta = R_CAST(const int16_t*, *fread_pp);
10021       if (PtrAddCk(fread_end, sample_ct * sizeof(int16_t), fread_pp)) {
10022         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid unconditional phased-dosages for (0-based) variant #%u.\n", vidx);
10023         return kPglRetMalformedInput;
10024       }
10025       for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
10026         const uint16_t dosage_val = dosage_main[sample_idx];
10027         const int16_t dphase_delta_val = dphase_delta[sample_idx];
10028         const uint16_t dpiece0_x2 = dosage_val + dphase_delta_val;
10029         const uint16_t dpiece1_x2 = dosage_val - dphase_delta_val;
10030         // Update (11 May 2018): parity condition removed.
10031         if ((dpiece0_x2 > 32768) || (dpiece1_x2 > 32768)) {
10032           if (unlikely((dphase_delta_val != -32768) || (dosage_val != 65535))) {
10033             snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid unconditional phased-dosages for (0-based) variant #%u.\n", vidx);
10034             return kPglRetMalformedInput;
10035           }
10036         }
10037       }
10038     }
10039     return kPglRetSuccess;
10040   }
10041   uint32_t dosage_ct;
10042   if ((vrtype & 0x60) == 0x20) {
10043     // dosage list
10044     if (unlikely(ValidateAndCountDeltalist(fread_end, sample_ct, fread_pp, nullptr, &dosage_ct))) {
10045       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid dosage list for (0-based) variant #%u.\n", vidx);
10046       return kPglRetMalformedInput;
10047     }
10048   } else {
10049     const uint32_t sample_ctb = DivUp(sample_ct, CHAR_BIT);
10050     if (PtrCheck(fread_end, *fread_pp, sample_ctb)) {
10051       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid dosage subset for (0-based) variant #%u.\n", vidx);
10052       return kPglRetMalformedInput;
10053     }
10054     dosage_ct = PopcountBytes(*fread_pp, sample_ctb);
10055     *fread_pp += sample_ctb;
10056     const uint32_t sample_ct_mod8 = sample_ct % 8;
10057     if (sample_ct_mod8) {
10058       if (unlikely((*fread_pp)[-1] >> sample_ct_mod8)) {
10059         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Dosage subset bitarray for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
10060         return kPglRetMalformedInput;
10061       }
10062     }
10063   }
10064   const uint16_t* dosage_main = R_CAST(const uint16_t*, *fread_pp);
10065   if (PtrAddCk(fread_end, dosage_ct * sizeof(int16_t), fread_pp)) {
10066     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid dosage track for (0-based) variant #%u.\n", vidx);
10067     return kPglRetMalformedInput;
10068   }
10069   for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
10070     if (unlikely(dosage_main[dosage_idx] > 32768)) {
10071       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid dosage track for (0-based) variant #%u (dosage is greater than 2).\n", vidx);
10072       return kPglRetMalformedInput;
10073     }
10074   }
10075   if (vrtype & 0x80) {
10076     const uintptr_t* file_dphase_present = R_CAST(const uintptr_t*, *fread_pp);
10077     const uint32_t dphase_present_byte_ct = DivUp(dosage_ct, CHAR_BIT);
10078     if (PtrAddCk(fread_end, dphase_present_byte_ct, fread_pp)) {
10079       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10080       return kPglRetMalformedInput;
10081     }
10082     const uint32_t trailing_bit_ct = dosage_ct % CHAR_BIT;
10083     if (unlikely(trailing_bit_ct && ((*fread_pp)[-1] & (255 << trailing_bit_ct)))) {
10084       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10085       return kPglRetMalformedInput;
10086     }
10087     const uint16_t* dosage_main_read_iter = dosage_main;
10088     const int16_t* dphase_delta_read_iter = R_CAST(const int16_t*, *fread_pp);
10089     const uint32_t dphase_widx_last = (dphase_present_byte_ct - 1) / kBytesPerWord;
10090     uint32_t loop_end = kBitsPerWord;
10091     for (uint32_t dphase_widx = 0; ; ++dphase_widx) {
10092       uintptr_t ww;
10093       if (dphase_widx >= dphase_widx_last) {
10094         if (dphase_widx > dphase_widx_last) {
10095           break;
10096         }
10097         loop_end = 1 + ((dosage_ct - 1) % kBitsPerWord);
10098         const uint32_t final_byte_ct = DivUp(loop_end, CHAR_BIT);
10099         ww = SubwordLoad(&(file_dphase_present[dphase_widx]), final_byte_ct);
10100       } else {
10101         ww = file_dphase_present[dphase_widx];
10102       }
10103       for (uint32_t dphase_lowbits = 0; dphase_lowbits != loop_end; ++dphase_lowbits, ++dosage_main_read_iter) {
10104         if (!((ww >> dphase_lowbits) & 1)) {
10105           continue;
10106         }
10107         const uint16_t dosage_val = *dosage_main_read_iter;
10108         const int16_t dphase_delta_val = *dphase_delta_read_iter++;
10109         const uint16_t dpiece0_x2 = dosage_val + dphase_delta_val;
10110         const uint16_t dpiece1_x2 = dosage_val - dphase_delta_val;
10111         if (unlikely((dpiece0_x2 > 32768) || (dpiece1_x2 > 32768))) {
10112           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10113           return kPglRetMalformedInput;
10114         }
10115       }
10116     }
10117     if (unlikely(dphase_delta_read_iter == R_CAST(const int16_t*, *fread_pp))) {
10118       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10119       return kPglRetMalformedInput;
10120     }
10121     *fread_pp = R_CAST(const unsigned char*, dphase_delta_read_iter);
10122     if (PtrCheck(fread_end, *fread_pp, 0)) {
10123       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10124       return kPglRetMalformedInput;
10125     }
10126   }
10127   return kPglRetSuccess;
10128 }
10129 
// PgrValidate() has the value 65536 hard-coded in one of its error messages
// ("...when the variant index is a multiple of 65536").  Fail the build if
// kPglVblockSize changes, so that message gets revisited.
static_assert(kPglVblockSize == 65536, "PgrValidate() needs to have an error message updated.");
PgrValidate(PgenReader * pgr_ptr,uintptr_t * genovec_buf,char * errstr_buf)10131 PglErr PgrValidate(PgenReader* pgr_ptr, uintptr_t* genovec_buf, char* errstr_buf) {
10132   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
10133   // Performs all validation which isn't done by pgfi_init_phase{1,2}() and
10134   // PgrInit().
10135   const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
10136   const uint32_t variant_ct = pgrp->fi.raw_variant_ct;
10137   const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
10138   const uint32_t const_vrtype = pgrp->fi.const_vrtype;
10139   if (const_vrtype != UINT32_MAX) {
10140     if (unlikely(allele_idx_offsets && (allele_idx_offsets[variant_ct] != 2 * variant_ct))) {
10141       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pvar file contains multiallelic variant(s), but .%s file does not.\n", (const_vrtype == kPglVrtypePlink1)? "bed" : "pgen");
10142       return kPglRetInconsistentInput;
10143     }
10144     // const uintptr_t const_vrec_width = pgrp->fi.const_vrec_width;
10145     if ((!const_vrtype) || (const_vrtype == kPglVrtypePlink1)) {
10146       // only thing that can go wrong is nonzero trailing bits
10147       const uint32_t dbl_sample_ct_mod4 = 2 * (sample_ct % 4);
10148       if (!dbl_sample_ct_mod4) {
10149         return kPglRetSuccess;
10150       }
10151       for (uint32_t vidx = 0; vidx != variant_ct; ++vidx) {
10152         const unsigned char* fread_ptr;
10153         const unsigned char* fread_end = nullptr;
10154         if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
10155           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
10156           return kPglRetReadFail;
10157         }
10158         const uint32_t last_byte_in_record = fread_end[-1];
10159         if (unlikely(last_byte_in_record >> dbl_sample_ct_mod4)) {
10160           snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Last byte of (0-based) variant #%u has nonzero trailing bits.\n", vidx);
10161           return kPglRetMalformedInput;
10162         }
10163       }
10164       return kPglRetSuccess;
10165     }
10166     // todo: 16-bit dosage entries can't be in [32769,65534]
10167     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Validation of fixed-width dosage formats is not implemented yet.\n");
10168     return kPglRetNotYetSupported;
10169   }
10170   const unsigned char* vrtypes = pgrp->fi.vrtypes;
10171   for (uint32_t vidx = 0; vidx < variant_ct; vidx += kPglVblockSize) {
10172     if (unlikely(VrtypeLdCompressed(vrtypes[vidx]))) {
10173       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: (0-based) variant #%u is LD-compressed; this is prohibited when the variant index is a multiple of 65536.\n", vidx);
10174       return kPglRetMalformedInput;
10175     }
10176   }
10177   // file size may not be validated yet.
10178   uint64_t fsize;
10179   FILE* ff = pgrp->ff;
10180 #ifndef NO_MMAP
10181   if (ff == nullptr) {
10182     // mmap case
10183     fsize = pgrp->fi.file_size;
10184   } else {
10185 #endif
10186     if (unlikely(fseeko(ff, 0, SEEK_END))) {
10187       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
10188       return kPglRetReadFail;
10189     }
10190     fsize = ftello(ff);
10191     pgrp->fp_vidx = 1;  // force fseek when loading first variant
10192 #ifndef NO_MMAP
10193   }
10194 #endif
10195   // todo: modify this check when phase sets are implemented
10196   const uint64_t expected_fsize = pgrp->fi.var_fpos[variant_ct];
10197   if (unlikely(expected_fsize != fsize)) {
10198     snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen header indicates that file size should be %" PRIu64 " bytes, but actual file size is %" PRIu64 " bytes.\n", expected_fsize, fsize);
10199     return kPglRetMalformedInput;
10200   }
10201   const uint32_t vblock_ct = DivUp(variant_ct, kPglVblockSize);
10202   uint32_t header_ctrl = 0;
10203 #ifndef NO_MMAP
10204   if (ff == nullptr) {
10205 #  ifdef __arm__
10206 #    error "Unaligned accesses in PgrValidate()."
10207 #  endif
10208     memcpy(&header_ctrl, &(pgrp->fi.block_base[11]), 1);
10209     // validate the random-access index.
10210     const uint64_t* fpos_index = R_CAST(const uint64_t*, &(pgrp->fi.block_base[12]));
10211     for (uint32_t vblock_idx = 0; vblock_idx != vblock_ct; ++vblock_idx) {
10212       if (unlikely(fpos_index[vblock_idx] != pgrp->fi.var_fpos[vblock_idx * kPglVblockSize])) {
10213         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen header vblock-start index is inconsistent with variant record length index.\n");
10214         return kPglRetMalformedInput;
10215       }
10216     }
10217   } else {
10218 #endif
10219     if (unlikely(fseeko(ff, 11, SEEK_SET))) {
10220       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
10221       return kPglRetReadFail;
10222     }
10223     header_ctrl = getc_unlocked(ff);
10224     if (unlikely(header_ctrl > 255)) {
10225       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
10226       return kPglRetReadFail;
10227     }
10228     for (uint32_t vblock_idx = 0; vblock_idx != vblock_ct; ++vblock_idx) {
10229       uint64_t vblock_start_fpos;
10230       if (unlikely(!fread_unlocked(&vblock_start_fpos, sizeof(int64_t), 1, ff))) {
10231         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
10232         return kPglRetReadFail;
10233       }
10234       if (unlikely(vblock_start_fpos != pgrp->fi.var_fpos[vblock_idx * kPglVblockSize])) {
10235         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen header vblock-start index is inconsistent with variant record length index.\n");
10236         return kPglRetMalformedInput;
10237       }
10238     }
10239 #ifndef NO_MMAP
10240   }
10241 #endif
10242   const uint32_t vrtype_and_fpos_storage = header_ctrl & 15;
10243   const uint32_t alt_allele_ct_byte_ct = (header_ctrl >> 4) & 3;
10244   const uint32_t nonref_flags_stored = ((header_ctrl >> 6) == 3);
10245 
10246   // does not include vrtypes yet
10247   uint64_t vblock_index_byte_ct = kPglVblockSize * (1 + (vrtype_and_fpos_storage & 3) + alt_allele_ct_byte_ct);
10248   if (nonref_flags_stored) {
10249     vblock_index_byte_ct += kPglVblockSize / CHAR_BIT;
10250   }
10251   uint64_t last_vrtype_byte_offset = 0;
10252   uint32_t trailing_shift = 4;
10253   if (vrtype_and_fpos_storage & 8) {
10254     vblock_index_byte_ct += kPglVblockSize >> (10 - vrtype_and_fpos_storage);
10255     if (vrtype_and_fpos_storage == 8) {
10256       const uint32_t variant_ct_mod4 = variant_ct % 4;
10257       if (variant_ct_mod4) {
10258         last_vrtype_byte_offset = 20 + (vblock_ct - 1) * (vblock_index_byte_ct + sizeof(int64_t)) + ((variant_ct % kPglVblockSize) / 4);
10259         trailing_shift = variant_ct_mod4 * 2;
10260       }
10261     } else {
10262       assert(vrtype_and_fpos_storage == 9);
10263       if (variant_ct % 2) {
10264         last_vrtype_byte_offset = 20 + (vblock_ct - 1) * (vblock_index_byte_ct + sizeof(int64_t)) + ((variant_ct % kPglVblockSize) / 2);
10265       }
10266     }
10267   } else if (!(vrtype_and_fpos_storage & 4)) {
10268     vblock_index_byte_ct += kPglVblockSize / 2;
10269     if (variant_ct % 2) {
10270       // bugfix (22 Nov 2017): forgot to add offset in last block
10271       last_vrtype_byte_offset = 20 + (vblock_ct - 1) * (vblock_index_byte_ct + sizeof(int64_t)) + ((variant_ct % kPglVblockSize) / 2);
10272     }
10273     /*
10274   } else {
10275     vblock_index_byte_ct += kPglVblockSize;
10276     */
10277   }
10278   if (last_vrtype_byte_offset) {
10279     uint32_t last_vrtype_byte = 0;
10280 #ifndef NO_MMAP
10281     if (ff == nullptr) {
10282       memcpy(&last_vrtype_byte, &(pgrp->fi.block_base[last_vrtype_byte_offset]), 1);
10283     } else {
10284 #endif
10285       if (unlikely(fseeko(ff, last_vrtype_byte_offset, SEEK_SET))) {
10286         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
10287         return kPglRetReadFail;
10288       }
10289       last_vrtype_byte = getc_unlocked(ff);
10290       if (unlikely(last_vrtype_byte > 255)) {
10291         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
10292         return kPglRetReadFail;
10293       }
10294 #ifndef NO_MMAP
10295     }
10296 #endif
10297     if (unlikely(last_vrtype_byte >> trailing_shift)) {
10298       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Nonzero trailing bits in last vrtype index byte.\n");
10299       return kPglRetMalformedInput;
10300     }
10301   }
10302   const uintptr_t* nonref_flags = pgrp->fi.nonref_flags;
10303   if (nonref_flags) {
10304     const uint32_t variant_ct_modl = variant_ct % kBitsPerWord;
10305     if (variant_ct % CHAR_BIT) {
10306       if (unlikely(nonref_flags[variant_ct / kBitsPerWord] >> variant_ct_modl)) {
10307         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Nonzero trailing bits in last nonref_flags byte.\n");
10308         return kPglRetMalformedInput;
10309       }
10310     }
10311   }
10312 
10313   // could move most of this into plink2_common and make it multithreaded, if
10314   // speed is ever an issue.
10315   uint32_t allele_ct = 2;
10316   for (uint32_t vidx = 0; vidx != variant_ct; ++vidx) {
10317     const unsigned char* fread_ptr;
10318     const unsigned char* fread_end;
10319     if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
10320       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
10321       return kPglRetReadFail;
10322     }
10323     const unsigned char* fread_ptr_start = fread_ptr;
10324     if (unlikely(ValidateGeno(fread_end, vidx, pgrp, &fread_ptr, genovec_buf, errstr_buf))) {
10325       return kPglRetMalformedInput;
10326     }
10327     ZeroTrailingNyps(sample_ct, genovec_buf);
10328     const uint32_t vrtype = vrtypes[vidx];
10329     uint32_t het_ct = CountNyp(genovec_buf, kMask5555, sample_ct);
10330     if (allele_idx_offsets) {
10331       allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
10332     }
10333     if (VrtypeMultiallelicHc(vrtype)) {
10334       if (unlikely(ValidateMultiallelicHc(fread_end, genovec_buf, vidx, allele_ct, pgrp, &fread_ptr, &het_ct, errstr_buf))) {
10335         return kPglRetMalformedInput;
10336       }
10337     }
10338     // don't need genovec_buf to store main genotypes past this point.
10339     if (VrtypeHphase(vrtype)) {
10340       if (unlikely(ValidateHphase(fread_end, vidx, het_ct, &fread_ptr, errstr_buf))) {
10341         return kPglRetMalformedInput;
10342       }
10343     }
10344     if (vrtype & 0xe0) {
10345       if (unlikely((vrtype & 0xe0) == 0x80)) {
10346         snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid record type for (0-based) variant #%u (phased dosage bit set, but main dosage bits unset).\n", vidx);
10347         return kPglRetMalformedInput;
10348       }
10349       PglErr reterr = ValidateDosage16(fread_end, vidx, pgrp, &fread_ptr, errstr_buf);
10350       if (unlikely(reterr)) {
10351         return reterr;
10352       }
10353     }
10354     if (unlikely(fread_ptr != fread_end)) {
10355       // possible todo: tolerate this at the end of a vblock.
10356       snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Extra byte(s) in (0-based) variant record #%u. (record type = %u; expected length = %" PRIuPTR ", actual = %" PRIuPTR ")\n", vidx, vrtype, S_CAST(uintptr_t, fread_ptr - fread_ptr_start), S_CAST(uintptr_t, fread_end - fread_ptr_start));
10357       return kPglRetMalformedInput;
10358     }
10359   }
10360   return kPglRetSuccess;
10361 }
10362 
10363 
CleanupPgfi(PgenFileInfo * pgfip,PglErr * reterrp)10364 BoolErr CleanupPgfi(PgenFileInfo* pgfip, PglErr* reterrp) {
10365   // memory is the responsibility of the caller
10366   if (pgfip->shared_ff) {
10367     if (unlikely(fclose_null(&pgfip->shared_ff))) {
10368       if (*reterrp == kPglRetSuccess) {
10369         *reterrp = kPglRetReadFail;
10370         return 1;
10371       }
10372     }
10373 #ifndef NO_MMAP
10374   } else if (pgfip->block_base != nullptr) {
10375     munmap(K_CAST(unsigned char*, pgfip->block_base), pgfip->file_size);
10376 #endif
10377   }
10378   return 0;
10379 }
10380 
CleanupPgr(PgenReader * pgr_ptr,PglErr * reterrp)10381 BoolErr CleanupPgr(PgenReader* pgr_ptr, PglErr* reterrp) {
10382   PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
10383   // assume file is open if pgr.ff is not null
10384   // memory is the responsibility of the caller for now
10385   if (!pgrp->ff) {
10386     return 0;
10387   }
10388   if (fclose_null(&(pgrp->ff))) {
10389     if (*reterrp == kPglRetSuccess) {
10390       *reterrp = kPglRetReadFail;
10391       return 1;
10392     }
10393   }
10394   return 0;
10395 }
10396 
10397 #ifdef __cplusplus
10398 }  // namespace plink2
10399 #endif
10400