1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "pgenlib_read.h"
19
20 #include <errno.h>
21
22 #ifndef NO_MMAP
23 # include <sys/types.h> // fstat()
24 # include <sys/stat.h> // open(), fstat()
25 # include <sys/mman.h> // mmap()
26 # include <fcntl.h> // open()
27 # include <unistd.h> // fstat()
28 #endif
29
30 #ifdef __cplusplus
31 namespace plink2 {
32 #endif
33
// Returns the private PgenReaderMain member of a PgenReader wrapper.
// (GET_PRIVATE is the project's encapsulation mechanism for C/C++-shared
// structs.)
static inline PgenReaderMain* GetPgrp(PgenReader* pgr_ptr) {
  return &GET_PRIVATE(*pgr_ptr, m);
}
37
// Extracts the cumulative-popcounts pointer hidden inside a
// PgrSampleSubsetIndex (passed by value; only a pointer is copied).
static inline const uint32_t* GetSicp(PgrSampleSubsetIndex pssi) {
  return GET_PRIVATE(pssi, cumulative_popcounts);
}
41
42 #ifdef __arm__
43 # error "Unaligned accesses in SmallGenoarrCount3FreqIncr()."
44 #endif
SmallGenoarrCount3FreqIncr(const uintptr_t * genoarr_iter,uint32_t byte_ct,uint32_t * even_ctp,uint32_t * odd_ctp,uint32_t * bothset_ctp)45 void SmallGenoarrCount3FreqIncr(const uintptr_t* genoarr_iter, uint32_t byte_ct, uint32_t* even_ctp, uint32_t* odd_ctp, uint32_t* bothset_ctp) {
46 for (uint32_t bytes_left = byte_ct; ; ) {
47 uintptr_t cur_geno_word;
48 if (bytes_left < kBytesPerWord) {
49 if (!bytes_left) {
50 return;
51 }
52 cur_geno_word = ProperSubwordLoad(genoarr_iter, bytes_left);
53 bytes_left = 0;
54 } else {
55 cur_geno_word = *genoarr_iter++;
56 bytes_left -= kBytesPerWord;
57 }
58 const uintptr_t cur_geno_word_high = kMask5555 & (cur_geno_word >> 1);
59 *even_ctp += Popcount01Word(cur_geno_word & kMask5555);
60 *odd_ctp += Popcount01Word(cur_geno_word_high);
61 *bothset_ctp += Popcount01Word(cur_geno_word & cur_geno_word_high);
62 }
63 }
64
// Fills genocounts[0..3] with the number of genotype values 0/1/2/3 among
// the first sample_ct 2-bit entries of the possibly-unaligned byte array
// genoarrb.  Guaranteed not to read past the end of genoarrb.
void GenoarrbCountFreqs(const unsigned char* genoarrb, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // does not read past the end of genoarrb
  // Number of bytes before the first vector-aligned address.
  uint32_t lead_byte_ct = (-R_CAST(uintptr_t, genoarrb)) % kBytesPerVec;
  uint32_t even_ct = 0;
  uint32_t odd_ct = 0;
  uint32_t bothset_ct = 0;
  const uintptr_t* genoarrb_iter;
  uint32_t trail_ct;
  if (sample_ct > lead_byte_ct * 4 + (6 * kNypsPerVec)) {
    // Large case: count the unaligned lead bytes word-by-word, the aligned
    // middle with the vectorized 6x-unrolled counter, and the remainder in
    // the shared trailing pass below.
    const uint32_t remaining_sample_ct = sample_ct - 4 * lead_byte_ct;
    // strictly speaking, this relies on undefined behavior: see e.g.
    // http://pzemtsov.github.io/2016/11/06/bug-story-alignment-on-x86.html
    // Probably want to search out all instances of __arm__ and make the code
    // standard-compliant, if that can be done without a speed penalty. Though
    // it makes sense to wait until more is known about Apple's MacBook
    // processor plans...
    SmallGenoarrCount3FreqIncr(R_CAST(const uintptr_t*, genoarrb), lead_byte_ct, &even_ct, &odd_ct, &bothset_ct);
    genoarrb_iter = R_CAST(const uintptr_t*, &(genoarrb[lead_byte_ct]));
    const uint32_t remaining_full_vec_ct = remaining_sample_ct / kNypsPerVec;
    uint32_t even_ct_incr;
    uint32_t odd_ct_incr;
    uint32_t bothset_ct_incr;
    // Count3FreqVec6 processes vectors in groups of 6.
    const uint32_t vec_ct = remaining_full_vec_ct - (remaining_full_vec_ct % 6);
    Count3FreqVec6(R_CAST(const VecW*, genoarrb_iter), vec_ct, &even_ct_incr, &odd_ct_incr, &bothset_ct_incr);
    even_ct += even_ct_incr;
    odd_ct += odd_ct_incr;
    bothset_ct += bothset_ct_incr;
    genoarrb_iter = &(genoarrb_iter[kWordsPerVec * vec_ct]);
    trail_ct = remaining_sample_ct - (vec_ct * kNypsPerVec);
  } else {
    // Small case: skip vectorization entirely, count everything here.
    genoarrb_iter = R_CAST(const uintptr_t*, genoarrb);
    trail_ct = sample_ct;
  }
  const uint32_t trail_byte_ct = NypCtToByteCt(trail_ct);
  SmallGenoarrCount3FreqIncr(genoarrb_iter, trail_byte_ct, &even_ct, &odd_ct, &bothset_ct);
  // Convert {even, odd, both} popcounts to per-genotype counts:
  //   count(1) = even - both, count(2) = odd - both, count(3) = both,
  //   count(0) = sample_ct - count(1) - count(2) - count(3).
  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
  genocounts[1] = even_ct - bothset_ct;
  genocounts[2] = odd_ct - bothset_ct;
  genocounts[3] = bothset_ct;
}
105
106 #ifdef __arm__
107 # error "Unaligned accesses in GenoarrbCountSubsetFreqs()."
108 #endif
// Fills genocounts[0..3] with the genotype value counts over the subset of
// samples selected by sample_include_interleaved_vec.
// * genoarrb is a possibly-unaligned 2-bit genotype array covering
//   raw_sample_ct samples; no bytes past its end are read.
// * sample_include_interleaved_vec is the sample mask in "interleaved
//   vector" layout (even/odd bit planes alternating per vector).
// * sample_ct is the number of selected samples (popcount of the mask);
//   only used to derive genocounts[0] at the end.
void GenoarrbCountSubsetFreqs(const unsigned char* genoarrb, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t raw_sample_ct, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // does not read past the end of genoarrb
  const uint32_t raw_sample_ctv2 = NypCtToVecCt(raw_sample_ct);
  uint32_t even_ct;
  uint32_t odd_ct;
  uint32_t bothset_ct;
  // The main vectorized pass covers a multiple of 6 vectors; the tail is
  // handled word-by-word below (counters are initialized by the call).
  uint32_t vec_idx = raw_sample_ctv2 - (raw_sample_ctv2 % 6);
  CountSubset3FreqVec6(R_CAST(const VecW*, genoarrb), R_CAST(const VecW*, sample_include_interleaved_vec), vec_idx, &even_ct, &odd_ct, &bothset_ct);
  const uintptr_t* genoarrb_iter = &(R_CAST(const uintptr_t*, genoarrb)[kWordsPerVec * vec_idx]);
#ifdef __LP64__
  const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[vec_idx * (kWordsPerVec / 2)]);
#else
  // bugfix (19 Jul 2018): (kWordsPerVec / 2) doesn't work in 32-bit case
  const uintptr_t* interleaved_mask_iter = &(sample_include_interleaved_vec[(vec_idx * kWordsPerVec) / 2]);
#endif
#ifdef USE_AVX2
  // AVX2 tail: each vector spans 4 mask words; genotype words are consumed
  // two at a time ("vector halves"), with a subword load at the very end so
  // we never read past the genotype array.
  const uint32_t halfvec_idx_trail = (raw_sample_ct + 3) / (kBitsPerVec / 4);
  uintptr_t mask_base1 = 0;
  uintptr_t mask_base2 = 0;
  uintptr_t mask_base3 = 0;
  uintptr_t mask_base4 = 0;
  for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
    uintptr_t mask_word1;
    uintptr_t mask_word2;
    uintptr_t mask_word3;
    uintptr_t mask_word4;
    if (!(vec_idx % 2)) {
      // Even vector index: refill the four mask base words and use their
      // even bit plane; odd index reuses the odd plane of the same words.
      mask_base1 = *interleaved_mask_iter++;
      mask_base2 = *interleaved_mask_iter++;
      mask_base3 = *interleaved_mask_iter++;
      mask_base4 = *interleaved_mask_iter++;
      mask_word1 = mask_base1 & kMask5555;
      mask_word2 = mask_base2 & kMask5555;
      mask_word3 = mask_base3 & kMask5555;
      mask_word4 = mask_base4 & kMask5555;
    } else {
      mask_word1 = (mask_base1 >> 1) & kMask5555;
      mask_word2 = (mask_base2 >> 1) & kMask5555;
      mask_word3 = (mask_base3 >> 1) & kMask5555;
      mask_word4 = (mask_base4 >> 1) & kMask5555;
    }
    uint32_t vechalf_idx = 0;
    while (1) {
      uintptr_t cur_geno_word1;
      uintptr_t cur_geno_word2;
      if (2 * vec_idx + vechalf_idx < halfvec_idx_trail) {
        cur_geno_word1 = *genoarrb_iter++;
        cur_geno_word2 = *genoarrb_iter++;
      } else {
        const uint32_t remaining_byte_ct = NypCtToByteCt(raw_sample_ct) % kBytesPerVec;
        // todo: check if this harms usual-case loop efficiency
        vechalf_idx = 1;
        if (remaining_byte_ct < kBytesPerWord) {
          cur_geno_word1 = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct);
          cur_geno_word2 = 0;
        } else {
          cur_geno_word1 = *genoarrb_iter++;
          cur_geno_word2 = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct - kBytesPerWord);
        }
      }
      const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
      const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
      // Interleave the two words' masked bits so one PopcountWord call
      // counts both.
      even_ct += PopcountWord(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
      odd_ct += PopcountWord((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
      bothset_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
      if (vechalf_idx) {
        break;
      }
      ++vechalf_idx;
      mask_word1 = mask_word3;
      mask_word2 = mask_word4;
    }
  }
#else  // not USE_AVX2
  const uint32_t vec_idx_trail = (raw_sample_ct + 3) / kNypsPerVec;
#  ifdef __LP64__
  // SSE2/SSE4.2 tail: two genotype words and two mask words per vector.
  uintptr_t mask_base1 = 0;
  uintptr_t mask_base2 = 0;
  for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
    uintptr_t mask_word1;
    uintptr_t mask_word2;
    if (!(vec_idx % 2)) {
      mask_base1 = *interleaved_mask_iter++;
      mask_base2 = *interleaved_mask_iter++;
      mask_word1 = mask_base1 & kMask5555;
      mask_word2 = mask_base2 & kMask5555;
    } else {
      mask_word1 = (mask_base1 >> 1) & kMask5555;
      mask_word2 = (mask_base2 >> 1) & kMask5555;
    }
    uintptr_t cur_geno_word1;
    uintptr_t cur_geno_word2;
    if (vec_idx < vec_idx_trail) {
      cur_geno_word1 = *genoarrb_iter++;
      cur_geno_word2 = *genoarrb_iter++;
    } else {
      // Final (partial) vector: bounded subword load.
      const uint32_t remaining_byte_ct = NypCtToByteCt(raw_sample_ct) % kBytesPerVec;
      if (remaining_byte_ct < kBytesPerWord) {
        cur_geno_word1 = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct);
        cur_geno_word2 = 0;
      } else {
        cur_geno_word1 = *genoarrb_iter++;
        cur_geno_word2 = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct - kBytesPerWord);
      }
    }
    const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
    const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
#    ifdef USE_SSE42
    even_ct += PopcountWord(((cur_geno_word1 & mask_word1) << 1) | (cur_geno_word2 & mask_word2));
    odd_ct += PopcountWord((cur_geno_word1_high_masked << 1) | cur_geno_word2_high_masked);
    bothset_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
#    else
    // No fast popcount: sum 2-bit fields instead (operands are 01-patterns,
    // so the per-field sums cannot carry).
    even_ct += NypsumWord((cur_geno_word1 & mask_word1) + (cur_geno_word2 & mask_word2));
    odd_ct += NypsumWord(cur_geno_word1_high_masked + cur_geno_word2_high_masked);
    bothset_ct += NypsumWord((cur_geno_word1 & cur_geno_word1_high_masked) + (cur_geno_word2 & cur_geno_word2_high_masked));
#    endif
  }
#  else  // not __LP64__
  // 32-bit tail: one genotype word and one mask word per "vector".
  uintptr_t mask_base = 0;
  for (; vec_idx != raw_sample_ctv2; ++vec_idx) {
    uintptr_t mask_word;
    if (!(vec_idx % 2)) {
      mask_base = *interleaved_mask_iter++;
      mask_word = mask_base & kMask5555;
    } else {
      mask_word = (mask_base >> 1) & kMask5555;
    }
    uintptr_t cur_geno_word;
    if (vec_idx < vec_idx_trail) {
      cur_geno_word = *genoarrb_iter++;
    } else {
      const uint32_t remaining_byte_ct = NypCtToByteCt(raw_sample_ct) % kBytesPerVec;
      cur_geno_word = ProperSubwordLoad(genoarrb_iter, remaining_byte_ct);
    }
    const uintptr_t cur_geno_word_high_masked = mask_word & (cur_geno_word >> 1);
    even_ct += Popcount01Word(cur_geno_word & mask_word);
    odd_ct += Popcount01Word(cur_geno_word_high_masked);
    bothset_ct += Popcount01Word(cur_geno_word & cur_geno_word_high_masked);
  }
#  endif  // not __LP64__
#endif  // not USE_AVX2
  // Convert {even, odd, both} popcounts to per-genotype counts (see
  // GenoarrbCountFreqs for the algebra).
  genocounts[0] = sample_ct + bothset_ct - even_ct - odd_ct;
  genocounts[1] = even_ct - bothset_ct;
  genocounts[2] = odd_ct - bothset_ct;
  genocounts[3] = bothset_ct;
}
255
256 void GenoarrCountFreqs(const uintptr_t* genoarr, uint32_t sample_ct, STD_ARRAY_REF(uint32_t, 4) genocounts) {
257 // this masks out trailing genoarr bits
258 const uint32_t sample_ct_remainder = sample_ct % kBitsPerWordD2;
259 GenoarrCountFreqsUnsafe(genoarr, sample_ct - sample_ct_remainder, genocounts);
260 if (sample_ct_remainder) {
261 uintptr_t cur_geno_word = bzhi(genoarr[sample_ct / kBitsPerWordD2], 2 * sample_ct_remainder);
262 const uintptr_t cur_geno_word_high = kMask5555 & (cur_geno_word >> 1);
263 const uint32_t even_ct = Popcount01Word(cur_geno_word & kMask5555);
264 const uint32_t odd_ct = Popcount01Word(cur_geno_word_high);
265 const uint32_t bothset_ct = Popcount01Word(cur_geno_word & cur_geno_word_high);
266 genocounts[0] += sample_ct_remainder + bothset_ct - even_ct - odd_ct;
267 genocounts[1] += even_ct - bothset_ct;
268 genocounts[2] += odd_ct - bothset_ct;
269 genocounts[3] += bothset_ct;
270 }
271 }
272
GenovecNonmissingToZeroUnsafe(uint32_t sample_ct,uintptr_t * genovec)273 void GenovecNonmissingToZeroUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
274 // sets 1 and 2 to zero; leaves 3s untouched.
275 const uint32_t vec_ct = NypCtToVecCt(sample_ct);
276 assert(VecIsAligned(genovec));
277 const VecW m1 = VCONST_W(kMask5555);
278 VecW* vptr = R_CAST(VecW*, genovec);
279 for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
280 VecW cur_vec = vptr[vidx];
281 const VecW cur_vec_rshifted = vecw_srli(cur_vec, 1);
282 cur_vec = cur_vec & m1;
283 cur_vec = cur_vec & cur_vec_rshifted;
284 vptr[vidx] = cur_vec | vecw_slli(cur_vec, 1);
285 }
286 }
287
GenovecNonzeroToMissingUnsafe(uint32_t sample_ct,uintptr_t * genovec)288 void GenovecNonzeroToMissingUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
289 // converts 1s and 2s to 3s, leaves zeroes untouched.
290 const uint32_t vec_ct = NypCtToVecCt(sample_ct);
291 assert(VecIsAligned(genovec));
292 const VecW m1 = VCONST_W(kMask5555);
293 VecW* vptr = R_CAST(VecW*, genovec);
294 for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
295 VecW cur_vec = vptr[vidx];
296 const VecW cur_vec_rshifted = vecw_srli(cur_vec, 1);
297 cur_vec = cur_vec | cur_vec_rshifted;
298 cur_vec = cur_vec & m1;
299 vptr[vidx] = cur_vec | vecw_slli(cur_vec, 1);
300 }
301 }
302
GenovecNontwoToMissingUnsafe(uint32_t sample_ct,uintptr_t * genovec)303 void GenovecNontwoToMissingUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
304 // 0 -> 3, 1 -> 3.
305 const uint32_t vec_ct = NypCtToVecCt(sample_ct);
306 assert(VecIsAligned(genovec));
307 const VecW not_m1 = VCONST_W(kMaskAAAA);
308 VecW* vptr = R_CAST(VecW*, genovec);
309 for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
310 const VecW cur_vec = vptr[vidx];
311 const VecW cur_vec_hi = vecw_and_notfirst(cur_vec, not_m1);
312 const VecW cur_or = cur_vec_hi | vecw_srli(cur_vec_hi, 1);
313 vptr[vidx] = cur_vec | cur_or;
314 }
315 }
316
GenovecNonzeroToMissingThenInvertUnsafe(uint32_t sample_ct,uintptr_t * genovec)317 void GenovecNonzeroToMissingThenInvertUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
318 // 0 -> 2, 1 -> 3, 2 -> 3
319 const uint32_t vec_ct = NypCtToVecCt(sample_ct);
320 assert(VecIsAligned(genovec));
321 const VecW not_m1 = VCONST_W(kMaskAAAA);
322 VecW* vptr = R_CAST(VecW*, genovec);
323 for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
324 const VecW cur_vec = vptr[vidx];
325 vptr[vidx] = cur_vec | vecw_srli(cur_vec, 1) | not_m1;
326 }
327 }
328
GenovecInvertThenNonzeroToMissingUnsafe(uint32_t sample_ct,uintptr_t * genovec)329 void GenovecInvertThenNonzeroToMissingUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
330 // 0 -> 3, 1 -> 3, 2 -> 0
331 const uint32_t vec_ct = NypCtToVecCt(sample_ct);
332 assert(VecIsAligned(genovec));
333 const VecW m1 = VCONST_W(kMask5555);
334 VecW* vptr = R_CAST(VecW*, genovec);
335 for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
336 const VecW cur_vec = vptr[vidx];
337 const VecW cur_vec_rshifted = vecw_srli(cur_vec, 1);
338 const VecW not2 = vecw_and_notfirst(vecw_and_notfirst(cur_vec, cur_vec_rshifted), m1);
339 vptr[vidx] = not2 | vecw_slli(not2, 1);
340 }
341 }
342
BiallelicDiploidMinimac3R2(uint64_t alt1_dosage,uint64_t hap_alt1_ssq_x2,uint32_t nm_sample_ct)343 double BiallelicDiploidMinimac3R2(uint64_t alt1_dosage, uint64_t hap_alt1_ssq_x2, uint32_t nm_sample_ct) {
344 if (!nm_sample_ct) {
345 return (0.0 / 0.0);
346 }
347
348 const uint64_t nm_sample_ct_x32768 = nm_sample_ct * 0x8000LLU;
349 if (nm_sample_ct < 131072) {
350 const uint64_t alt1_dosage_sq = alt1_dosage * alt1_dosage;
351 const uint64_t observed_variance_times_2n = hap_alt1_ssq_x2 * nm_sample_ct - alt1_dosage * alt1_dosage;
352 const uint64_t expected_variance_times_2n = nm_sample_ct_x32768 * alt1_dosage - alt1_dosage_sq;
353 return S_CAST(double, observed_variance_times_2n) / S_CAST(double, expected_variance_times_2n);
354 }
355 // Need to avoid catastrophic cancellation here.
356 const double alt1_dosaged = u63tod(alt1_dosage);
357 const double expected_variance_times_2n = alt1_dosaged * u63tod(nm_sample_ct_x32768 - alt1_dosage);
358 const uint64_t hap_alt1_ssq_x2_hi = hap_alt1_ssq_x2 >> 32;
359 uint64_t left_lo = (hap_alt1_ssq_x2 & 0xffffffffLLU) * nm_sample_ct;
360 const uint64_t left_hi = (left_lo >> 32) + hap_alt1_ssq_x2_hi * nm_sample_ct;
361 left_lo &= 0xffffffffU;
362 const uint64_t alt1_dosage_lo = alt1_dosage & 0xffffffffLLU;
363 const uint64_t alt1_dosage_hi = alt1_dosage >> 32;
364 uint64_t right_lo = alt1_dosage_lo * alt1_dosage_lo;
365 const uint64_t right_hi = (right_lo >> 32) + (alt1_dosage_lo + alt1_dosage) * alt1_dosage_hi;
366 right_lo &= 0xffffffffU;
367 const double observed_variance_times_2n_hi = u63tod(left_hi - right_hi);
368 const int64_t observed_variance_times_2n_lo = S_CAST(int64_t, left_lo) - S_CAST(int64_t, right_lo);
369 const double observed_variance_times_2n = (observed_variance_times_2n_hi * 4294967296.0) + observed_variance_times_2n_lo;
370 return observed_variance_times_2n / expected_variance_times_2n;
371 }
372
// Puts a PgenFileInfo into a safe pre-initialization state, so cleanup can
// be invoked unconditionally even if a later init step fails.
void PreinitPgfi(PgenFileInfo* pgfip) {
  pgfip->shared_ff = nullptr;
  pgfip->block_base = nullptr;
  // we want this for proper handling of e.g. sites-only VCFs
  pgfip->nonref_flags = nullptr;
}
379
CountPgfiAllocCachelinesRequired(uint32_t raw_variant_ct)380 uint32_t CountPgfiAllocCachelinesRequired(uint32_t raw_variant_ct) {
381 // assumes variable-width variant records, otherwise pgfi.vrtypes and
382 // pgfi.vr_fpos can just be nullptr.
383
384 // vrtypes: 1 byte per entry, (raw_variant_ct + 1) entries
385 uint32_t cachelines_required = 1 + (raw_variant_ct / kCacheline);
386
387 // var_fpos: 8 bytes per entry, (raw_variant_ct + 1) entries
388 cachelines_required += 1 + (raw_variant_ct / kInt64PerCacheline);
389 return cachelines_required;
390 }
391
// Returns the number of cachelines of workspace a single PgenReader needs,
// given the raw sample count, the file's global compression flags, the
// maximum allele count, and the fread buffer size (which should be zero
// when mmap is used).
uint32_t CountPgrAllocCachelinesRequired(uint32_t raw_sample_ct, PgenGlobalFlags gflags, uint32_t max_allele_ct, uint32_t fread_buf_byte_ct) {
  // ldbase_raw_genovec: always needed, 2 bits per entry, up to raw_sample_ct
  // entries
  const uint32_t genovec_cacheline_req = NypCtToCachelineCt(raw_sample_ct);
  const uint32_t bitvec_cacheline_req = BitCtToCachelineCt(raw_sample_ct);
  uint32_t cachelines_required = genovec_cacheline_req;
  // fread_buf. fread_buf_byte_ct should be zero if mmap() is being used.
  // DivUp() won't overflow since fread_buf_byte_ct requirement can't exceed
  // kPglMaxBytesPerVariant, which is sufficiently far from 2^32.
  cachelines_required += DivUp(fread_buf_byte_ct, kCacheline);

  const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1;
  const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor);
  if ((gflags & kfPgenGlobalDifflistOrLdPresent) || (max_allele_ct > 2)) {
    // workspace_difflist_sample_ids
    // bugfix: must add 1 since several routines add a terminator element
    cachelines_required += 1 + (max_difflist_entry_ct_base / kInt32PerCacheline);
  }
  if (gflags & kfPgenGlobalDifflistOrLdPresent) {
    // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);
    // workspace_raregeno_vec
    cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base);

    // workspace_raregeno_tmp_loadbuf
    cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base);

    if (ld_compression_present) {
      // LD-compressed records also need a cached copy of the most recent
      // non-LD record, in both dense and difflist form.
      // ldbase_genovec
      cachelines_required += genovec_cacheline_req;

      // ldbase_raregeno
      cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base);

      // ldbase_difflist_sample_ids
      cachelines_required += 1 + (max_difflist_entry_ct_base / kInt32PerCacheline);
    }
  }
  const PgenGlobalFlags gflags_hphase_dosage = gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent);
  if ((max_allele_ct > 2) || gflags_hphase_dosage) {
    cachelines_required += genovec_cacheline_req;  // workspace_vec
    if (max_allele_ct > 2) {
      // workspace_aux1x_present
      cachelines_required += bitvec_cacheline_req;
      // workspace_imp_r2
      cachelines_required += Int64CtToCachelineCt(2 * max_allele_ct);
    }
    if (gflags & kfPgenGlobalHardcallPhasePresent) {
      // workspace_all_hets, workspace_subset
      cachelines_required += bitvec_cacheline_req * 2;
    }
    if (gflags & kfPgenGlobalDosagePresent) {
      // aux track #3: usually bitarray tracking which samples have dosage info
      // (may be stored on disk as a dosage list)
      cachelines_required += bitvec_cacheline_req;
      if (gflags & kfPgenGlobalDosagePhasePresent) {
        // aux track #7: bitarray tracking which dosage entries are phased
        cachelines_required += bitvec_cacheline_req;

        // phased aux tracks #4,8: 2 bytes per sample
        // There may be overflow risk here in the future.
        // (commented out since caller always provides this buffer for now)
        // cachelines_required += DivUp(2 * k1LU * raw_sample_ct, kCacheline);
      }
      // unphased aux track #4: 2 bytes per sample
      // cachelines_required += DivUp(2 * k1LU * raw_sample_ct, kCacheline);

      // may need deltalist64 workspace in multiallelic dosage case
    }
  }
  return cachelines_required;
}
463
464 static_assert(kPglMaxAltAlleleCt == 254, "Need to update PgfiInitPhase1().");
PgfiInitPhase1(const char * fname,uint32_t raw_variant_ct,uint32_t raw_sample_ct,uint32_t use_mmap,PgenHeaderCtrl * header_ctrl_ptr,PgenFileInfo * pgfip,uintptr_t * pgfi_alloc_cacheline_ct_ptr,char * errstr_buf)465 PglErr PgfiInitPhase1(const char* fname, uint32_t raw_variant_ct, uint32_t raw_sample_ct, uint32_t use_mmap, PgenHeaderCtrl* header_ctrl_ptr, PgenFileInfo* pgfip, uintptr_t* pgfi_alloc_cacheline_ct_ptr, char* errstr_buf) {
466 pgfip->var_fpos = nullptr;
467 pgfip->vrtypes = nullptr;
468 pgfip->allele_idx_offsets = nullptr;
469 pgfip->nonref_flags = nullptr;
470
471 // Caller is currently expected to reset max_allele_ct if allele_idx_offsets
472 // is preloaded... need to fix this interface.
473 pgfip->max_allele_ct = 2;
474 // pgfip->max_dosage_allele_ct = 0;
475
476 pgfip->block_base = nullptr;
477 // this should force overflow when value is uninitialized.
478 pgfip->block_offset = 1LLU << 63;
479
480 uint64_t fsize;
481 const unsigned char* fread_ptr;
482 FILE* shared_ff = nullptr;
483 unsigned char small_readbuf[3];
484 #ifdef NO_MMAP
485 if (unlikely(use_mmap)) {
486 pgfip->shared_ff = nullptr; // this must be initialized before block_base
487 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase1() use_mmap parameter is nonzero, but pgenlib was not compiled with mmap support.\n");
488 return kPglRetImproperFunctionCall;
489 }
490 #else
491 if (use_mmap) {
492 pgfip->shared_ff = nullptr; // this must be initialized before block_base
493 int32_t file_handle = open(fname, O_RDONLY);
494 if (unlikely(file_handle < 0)) {
495 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Failed to open %s : %s.\n", fname, strerror(errno));
496 return kPglRetOpenFail;
497 }
498 struct stat statbuf;
499 if (unlikely(fstat(file_handle, &statbuf) < 0)) {
500 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Failed to open %s : %s.\n", fname, strerror(errno));
501 return kPglRetOpenFail;
502 }
503 fsize = statbuf.st_size;
504 pgfip->block_offset = 0;
505 pgfip->file_size = fsize;
506 pgfip->block_base = S_CAST(const unsigned char*, mmap(0, pgfip->file_size, PROT_READ, MAP_SHARED, file_handle, 0));
507 if (unlikely(R_CAST(uintptr_t, pgfip->block_base) == (~k0LU))) {
508 pgfip->block_base = nullptr;
509 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s read failure: %s.\n", fname, strerror(errno));
510 return kPglRetReadFail;
511 }
512 // this provided less than a ~5% boost on OS X; mmap still took >80% longer
513 // than fread on an 85GB file there
514 // try MAP_POPULATE on Linux?
515 // madvise((unsigned char*)(pgfip->block_base), fsize, MADV_SEQUENTIAL);
516 close(file_handle);
517 // update (7 Jan 2018): drop support for zero-sample and zero-variant
518 // files, not worth the development cost
519 if (unlikely(fsize < 4)) {
520 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s is too small to be a valid .pgen file.\n", fname);
521 return kPglRetMalformedInput;
522 }
523 fread_ptr = pgfip->block_base;
524 }
525 #endif
526 else {
527 shared_ff = fopen(fname, FOPEN_RB);
528 pgfip->shared_ff = shared_ff;
529 if (unlikely(!shared_ff)) {
530 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Failed to open %s : %s.\n", fname, strerror(errno));
531 return kPglRetOpenFail;
532 }
533 if (unlikely(fseeko(shared_ff, 0, SEEK_END))) {
534 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s read failure: %s.\n", fname, strerror(errno));
535 return kPglRetReadFail;
536 }
537 fsize = ftello(shared_ff);
538 if (unlikely(fsize < 4)) {
539 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s is too small to be a valid .pgen file.\n", fname);
540 return kPglRetMalformedInput;
541 }
542 rewind(shared_ff);
543 if (unlikely(!fread_unlocked(small_readbuf, 3, 1, shared_ff))) {
544 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s read failure: %s.\n", fname, strerror(errno));
545 return kPglRetReadFail;
546 }
547 fread_ptr = small_readbuf;
548 }
549 // deliberate underflow
550 if (unlikely(((raw_variant_ct - 1) > 0x7ffffffc) && (raw_variant_ct != UINT32_MAX))) {
551 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid raw_variant_ct function parameter.\n");
552 return kPglRetImproperFunctionCall;
553 }
554 if (unlikely(((raw_sample_ct - 1) > 0x7ffffffd) && (raw_sample_ct != UINT32_MAX))) {
555 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid raw_sample_ct function parameter.\n");
556 return kPglRetImproperFunctionCall;
557 }
558 if (unlikely(!memequal_k(fread_ptr, "l\x1b", 2))) {
559 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s is not a .pgen file (first two bytes don't match the magic number).\n", fname);
560 return kPglRetMalformedInput;
561 }
562 const uint32_t file_type_code = fread_ptr[2];
563 *header_ctrl_ptr = 0;
564 if (file_type_code < 2) {
565 // plink 1 binary
566 if (unlikely(!file_type_code)) {
567 // sample-major. validate file size here so we don't have to recheck it
568 if ((raw_sample_ct != UINT32_MAX) && (raw_variant_ct != UINT32_MAX)) {
569 const uint64_t fsize_expected = 3 + S_CAST(uint64_t, raw_sample_ct) * NypCtToByteCt(raw_variant_ct);
570 if (fsize != fsize_expected) {
571 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Unexpected PLINK 1 sample-major .bed file size (%" PRIu64 " bytes expected).\n", fsize_expected);
572 return kPglRetMalformedInput;
573 }
574 }
575 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: pgenlib does not directly support sample-major PLINK 1 .bed files.\n(However, PLINK 2 automatically transposes and compresses them for you.)\n");
576 return kPglRetSampleMajorBed;
577 }
578 if (unlikely(raw_sample_ct == UINT32_MAX)) {
579 // either .fam must be loaded first, or user must provide sample count
580 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase1() must be called with an accurate raw_sample_ct value, since %s is a PLINK 1 .bed file.\n", fname);
581 return kPglRetImproperFunctionCall;
582 }
583 const uint32_t const_vrec_width = NypCtToByteCt(raw_sample_ct);
584 if (raw_variant_ct == UINT32_MAX) {
585 // allow raw_variant_ct to be inferred
586 uint64_t quotient = (fsize - 3) / const_vrec_width;
587 if (unlikely((quotient > 0x7fffffffU) || (quotient * const_vrec_width + 3 != fsize))) {
588 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Unexpected PLINK 1 .bed file size (since raw_sample_ct was %u, [file size - 3] should be divisible by %u and the quotient should be smaller than 2^31).\n", raw_sample_ct, const_vrec_width);
589 return kPglRetMalformedInput;
590 }
591 raw_variant_ct = quotient;
592 } else {
593 if (unlikely(S_CAST(uint64_t, raw_variant_ct) * const_vrec_width + 3 != fsize)) {
594 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Unexpected PLINK 1 .bed file size (expected %" PRIu64 " bytes).\n", S_CAST(uint64_t, raw_variant_ct) * const_vrec_width + 3);
595 return kPglRetMalformedInput;
596 }
597 }
598 pgfip->raw_variant_ct = raw_variant_ct;
599 pgfip->raw_sample_ct = raw_sample_ct;
600 pgfip->const_fpos_offset = 3;
601
602 pgfip->const_vrtype = kPglVrtypePlink1;
603 pgfip->const_vrec_width = const_vrec_width;
604 pgfip->gflags = kfPgenGlobalAllNonref;
605 *pgfi_alloc_cacheline_ct_ptr = 0;
606 return kPglRetSuccess;
607 }
608
609 if (unlikely(fsize < 12)) {
610 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s is too small to be a valid .pgen file.\n", fname);
611 return kPglRetMalformedInput;
612 }
613 #ifndef NO_MMAP
614 if (use_mmap) {
615 memcpy(&(pgfip->raw_variant_ct), &(fread_ptr[3]), sizeof(int32_t));
616 memcpy(&(pgfip->raw_sample_ct), &(fread_ptr[7]), sizeof(int32_t));
617 memcpy(header_ctrl_ptr, &(fread_ptr[11]), 1);
618 } else {
619 #endif
620 if (unlikely(
621 (!fread_unlocked(&(pgfip->raw_variant_ct), sizeof(int32_t), 1, shared_ff)) ||
622 (!fread_unlocked(&(pgfip->raw_sample_ct), sizeof(int32_t), 1, shared_ff)) ||
623 (!fread_unlocked(header_ctrl_ptr, 1, 1, shared_ff)))) {
624 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: %s read failure: %s.\n", fname, strerror(errno));
625 return kPglRetReadFail;
626 }
627 #ifndef NO_MMAP
628 }
629 #endif
630 PgenHeaderCtrl header_ctrl = *header_ctrl_ptr;
631 if (raw_variant_ct == UINT32_MAX) {
632 raw_variant_ct = pgfip->raw_variant_ct;
633 // deliberate underflow
634 if (unlikely((raw_variant_ct - 1) > 0x7ffffffc)) {
635 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid variant count in .pgen file.\n");
636 return kPglRetMalformedInput;
637 }
638 } else if (unlikely(raw_variant_ct != pgfip->raw_variant_ct)) {
639 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase1() was called with raw_variant_ct == %u, but %s contains %u variant%s.\n", raw_variant_ct, fname, pgfip->raw_variant_ct, (pgfip->raw_variant_ct == 1)? "" : "s");
640 return kPglRetInconsistentInput;
641 }
642 if (raw_sample_ct == UINT32_MAX) {
643 raw_sample_ct = pgfip->raw_sample_ct;
644 // deliberate underflow
645 if (unlikely((raw_sample_ct - 1) > 0x7ffffffd)) {
646 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid sample count in .pgen file.\n");
647 return kPglRetMalformedInput;
648 }
649 } else if (unlikely(raw_sample_ct != pgfip->raw_sample_ct)) {
650 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase1() was called with raw_sample_ct == %u, but %s contains %u sample%s.\n", raw_sample_ct, fname, pgfip->raw_sample_ct, (pgfip->raw_sample_ct == 1)? "" : "s");
651 return kPglRetInconsistentInput;
652 }
653 pgfip->gflags = kfPgenGlobal0;
654 pgfip->const_fpos_offset = 12;
655
656 // explicit storage of "is this reference allele untrusted?"
657 // need caller to allocate this
658 uint32_t nonref_flags_storage = header_ctrl >> 6;
659 if (nonref_flags_storage == 3) {
660 pgfip->const_fpos_offset += DivUp(raw_variant_ct, CHAR_BIT);
661 } else if (nonref_flags_storage == 2) {
662 pgfip->gflags |= kfPgenGlobalAllNonref;
663 }
664
665 if (file_type_code < 16) {
666 // plink 2 binary, single constant-width vrtype
667 if (unlikely(file_type_code > 4)) {
668 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Third byte of %s does not correspond to a storage mode supported by this version of pgenlib.\n", fname);
669 return kPglRetNotYetSupported;
670 }
671 if (unlikely(header_ctrl & 63)) {
672 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Third byte of %s corresponds to a fixed-width storage mode, but twelfth byte is only consistent with a variable-width mode.\n", fname);
673 return kPglRetMalformedInput;
674 }
675 uint32_t vrtype = 0;
676 uintptr_t const_vrec_width = NypCtToByteCt(raw_sample_ct);
677 if (file_type_code == 3) {
678 vrtype = 0x40;
679 const_vrec_width += raw_sample_ct * 2;
680 pgfip->gflags |= kfPgenGlobalDosagePresent;
681 } else if (file_type_code == 4) {
682 vrtype = 0xc0;
683 const_vrec_width += raw_sample_ct * 4;
684 pgfip->gflags |= kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent;
685 }
686 if (unlikely(S_CAST(uint64_t, raw_variant_ct) * const_vrec_width + pgfip->const_fpos_offset != fsize)) {
687 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Unexpected .pgen file size (expected %" PRIu64 " bytes).\n", S_CAST(uint64_t, raw_variant_ct) * const_vrec_width + pgfip->const_fpos_offset);
688 return kPglRetMalformedInput;
689 }
690 pgfip->const_vrtype = vrtype;
691 pgfip->const_vrec_width = const_vrec_width;
692 *pgfi_alloc_cacheline_ct_ptr = 0;
693 return kPglRetSuccess;
694 }
695 if (unlikely(file_type_code >= 0x11)) {
696 // todo: 0x11 phase sets (maybe not before 2021, though)
697 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Third byte of %s does not correspond to a storage mode supported by this version of pgenlib.\n", fname);
698 return kPglRetNotYetSupported;
699 }
700 // plink 2 binary, general-purpose
701 pgfip->const_vrtype = UINT32_MAX;
702 pgfip->const_vrec_width = 0;
703 const uintptr_t alt_allele_ct_byte_ct = (header_ctrl >> 4) & 3;
704 if (unlikely(alt_allele_ct_byte_ct > 1)) {
705 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: This version of pgenlib does not support >254 alternate alleles for a single variant.\n");
706 return kPglRetNotYetSupported;
707 }
708
709 // 8 extra bytes per vblock, to support fast random access
710 const uintptr_t vblock_ct = DivUp(raw_variant_ct, kPglVblockSize);
711
712 uint64_t vrtype_and_vrec_len_bit_cost;
713 if (header_ctrl & 8) {
714 // Special header_ctrl modes:
715 // 8: 1 bit per fused vrtype-length. Unset = vrtype 5, set = vrtype 0.
716 // 9: 2 bits, multiallelic. 0 = vrtype 5, 1 = vrtype 0, 2-3 = vrtype
717 // 8 with that many more bytes than vrtype 0. Note that this is
718 // limited to 16 ALT alleles.
719 // 10: 2 bits, phased. 0 = vrtype 5, 1 = vrtype 0, 2-3 = vrtype 16
720 // with that many minus 1 bytes beyond vrtype 0. While this is also
721 // aimed at the single-sample use case, it technically supports up to
722 // 15 always-phased or 7 partially-phased samples.
723 // 11: 4 bits, multiallelic + phased. 0 = vrtype 5, 1 = vrtype 0,
724 // 2-7 = vrtype 8 with that many bytes beyond vrtype 0, 9 = vrtype 16
725 // phase info requiring just 1 byte, 10-15 = vrtype 24 with (x-7)
726 // extra bytes required between multiallelic and phased tracks.
727 // 12: 2 bits, dosage, must be single-sample. 0 = vrtype 5,
728 // 1 = vrtype 0, 2 = vrtype 0x45 with 2 bytes, 3 = vrtype 0x40 with 3
729 // total bytes.
730 // 13: reserved for single-sample multiallelic + dosage.
731 // 14: 4 bits, phased + dosage, must be single-sample. 0 and 1 as usual,
732 // 3 = vrtype 16 with 1 phaseinfo byte, 4 = vrtype 0x45 with 2 bytes,
733 // 5 = vrtype 0x40 with 3 total bytes, 12 = vrtype 0xc5 with 4 total
734 // bytes, 13 = vrtype 0xc0 with 5 total bytes, 15 = vrtype 0xe0 with
735 // 6 total bytes
736 // 15: reserved for single-sample multiallelic + phased dosage.
737 const uint32_t header_ctrl_low3 = header_ctrl & 7;
738 // this can be a table lookup once 13/15 are implemented
739 if (!header_ctrl_low3) {
740 vrtype_and_vrec_len_bit_cost = 1;
741 } else if ((header_ctrl_low3 == 3) || (header_ctrl_low3 == 6)) {
742 vrtype_and_vrec_len_bit_cost = 4;
743 } else if (likely(header_ctrl_low3 <= 4)) {
744 vrtype_and_vrec_len_bit_cost = 2;
745 } else {
746 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Twelfth byte of %s does not correspond to a format supported by this version of pgenlib.\n", fname);
747 return kPglRetNotYetSupported;
748 }
749 } else {
750 // set this to *4* if true, 0 if false
751 const uint32_t phase_or_dosage_present_x4 = header_ctrl & 4;
752 // vrtype entries = 4 bits if no phase/dosage, 8 otherwise
753 // var_fpos entries = 8 + (8 * (header_ctrl & 3)) bits
754 vrtype_and_vrec_len_bit_cost = 12 + phase_or_dosage_present_x4 + 8 * (header_ctrl & 3);
755 }
756 pgfip->const_fpos_offset += (raw_sample_ct * vrtype_and_vrec_len_bit_cost + 7) / 8 + (raw_sample_ct * alt_allele_ct_byte_ct) + (8 * vblock_ct);
757 *pgfi_alloc_cacheline_ct_ptr = CountPgfiAllocCachelinesRequired(raw_variant_ct);
758 return kPglRetSuccess;
759 }
760
761 static_assert(kPglMaxAltAlleleCt == 254, "Need to update PgfiInitPhase2().");
PgfiInitPhase2(PgenHeaderCtrl header_ctrl,uint32_t allele_cts_already_loaded,uint32_t nonref_flags_already_loaded,uint32_t use_blockload,uint32_t vblock_idx_start,uint32_t vidx_end,uint32_t * max_vrec_width_ptr,PgenFileInfo * pgfip,unsigned char * pgfi_alloc,uintptr_t * pgr_alloc_cacheline_ct_ptr,char * errstr_buf)762 PglErr PgfiInitPhase2(PgenHeaderCtrl header_ctrl, uint32_t allele_cts_already_loaded, uint32_t nonref_flags_already_loaded, uint32_t use_blockload, uint32_t vblock_idx_start, uint32_t vidx_end, uint32_t* max_vrec_width_ptr, PgenFileInfo* pgfip, unsigned char* pgfi_alloc, uintptr_t* pgr_alloc_cacheline_ct_ptr, char* errstr_buf) {
763 // *max_vrec_width_ptr technically only needs to be set in single-variant
764 // fread() mode, but its computation is not currently optimized out in the
765 // other two modes.
766
767 // possible todo: add option to skip validation when allele_cts/nonref_flags
768 // are already loaded. but let's play it safe for now.
769 const uint32_t raw_variant_ct = pgfip->raw_variant_ct;
770 const uint32_t const_vrec_width = pgfip->const_vrec_width;
771 *pgr_alloc_cacheline_ct_ptr = 0;
772
773 // Note that this is a rather hefty stack allocation.
774 unsigned char loadbuf[kPglVblockSize * 4];
775
776 uintptr_t* allele_idx_offsets_iter = pgfip->allele_idx_offsets;
777 uintptr_t prev_allele_idx_offset = 0;
778 if (allele_idx_offsets_iter) {
779 if (!allele_cts_already_loaded) {
780 *allele_idx_offsets_iter = 0;
781 } else {
782 prev_allele_idx_offset = *allele_idx_offsets_iter;
783 }
784 ++allele_idx_offsets_iter;
785 }
786 if (!raw_variant_ct) {
787 return kPglRetSuccess;
788 }
789 const uint32_t nonref_flags_stored = ((header_ctrl >> 6) == 3);
790 unsigned char* nonref_flags_iter = R_CAST(unsigned char*, pgfip->nonref_flags);
791 const unsigned char* fread_ptr = nullptr; // maybe-uninitialized warning
792 FILE* shared_ff = pgfip->shared_ff;
793 if (const_vrec_width) {
794 // no allele counts to verify if fixed-width
795 // always need ldbase_raw_genovec
796 *pgr_alloc_cacheline_ct_ptr = NypCtToCachelineCt(pgfip->raw_sample_ct);
797 *max_vrec_width_ptr = const_vrec_width;
798 #ifdef NO_MMAP
799 assert(shared_ff);
800 #else
801 if (!shared_ff) {
802 if (unlikely(use_blockload)) {
803 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase2() cannot be called with use_blockload set when PgfiInitPhase1() had use_mmap set.\n");
804 return kPglRetImproperFunctionCall;
805 }
806 if ((!(header_ctrl & 192)) || (pgfip->const_vrtype == kPglVrtypePlink1)) {
807 return kPglRetSuccess;
808 }
809 fread_ptr = &(pgfip->block_base[12]);
810 const uint32_t nonref_flags_byte_ct = DivUp(raw_variant_ct, CHAR_BIT);
811 if (!nonref_flags_already_loaded) {
812 if (nonref_flags_stored) {
813 memcpy(nonref_flags_iter, fread_ptr, nonref_flags_byte_ct);
814 }
815 return kPglRetSuccess;
816 }
817 if (nonref_flags_stored) {
818 if (unlikely(!memequal(nonref_flags_iter, fread_ptr, nonref_flags_byte_ct))) {
819 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
820 return kPglRetInconsistentInput;
821 }
822 return kPglRetSuccess;
823 }
824 if (header_ctrl & 64) {
825 // all ref
826 if (unlikely(!AllWordsAreZero(pgfip->nonref_flags, BitCtToWordCt(raw_variant_ct)))) {
827 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
828 return kPglRetInconsistentInput;
829 }
830 return kPglRetSuccess;
831 }
832 // all nonref
833 if (unlikely(!AllBitsAreOne(pgfip->nonref_flags, raw_variant_ct))) {
834 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
835 return kPglRetInconsistentInput;
836 }
837 return kPglRetSuccess;
838 }
839 #endif
840 if (!use_blockload) {
841 // using fread() single-variant-at-a-time, need pgr.fread_buf
842 *pgr_alloc_cacheline_ct_ptr += DivUp(const_vrec_width, kCacheline);
843 }
844 if ((!(header_ctrl & 192)) || (pgfip->const_vrtype == kPglVrtypePlink1)) {
845 return kPglRetSuccess;
846 }
847 if ((header_ctrl >> 6) == 1) {
848 // all ref
849 if (nonref_flags_already_loaded) {
850 if (unlikely(!AllWordsAreZero(pgfip->nonref_flags, BitCtToWordCt(raw_variant_ct)))) {
851 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
852 return kPglRetInconsistentInput;
853 }
854 }
855 return kPglRetSuccess;
856 }
857 if ((header_ctrl >> 6) == 2) {
858 // all nonref
859 if (nonref_flags_already_loaded) {
860 if (unlikely(!AllBitsAreOne(pgfip->nonref_flags, raw_variant_ct))) {
861 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
862 return kPglRetInconsistentInput;
863 }
864 }
865 return kPglRetSuccess;
866 }
867 // _last more useful than _end iff we just refer to the number of elements
868 // in the block and have no use for a _stop pointer
869 unsigned char* nonref_flags_last = &(nonref_flags_iter[((raw_variant_ct - 1) / (kPglVblockSize * 32)) * (kPglVblockSize * 4)]);
870 uint32_t cur_byte_ct = kPglVblockSize * 4;
871 for (; ; nonref_flags_iter = &(nonref_flags_iter[cur_byte_ct])) {
872 if (nonref_flags_iter >= nonref_flags_last) {
873 if (nonref_flags_iter > nonref_flags_last) {
874 return kPglRetSuccess;
875 }
876 cur_byte_ct = 1 + ((raw_variant_ct - 1) % (kPglVblockSize * 32)) / CHAR_BIT;
877 }
878 unsigned char* loadptr = nonref_flags_already_loaded? loadbuf : nonref_flags_iter;
879 if (unlikely(!fread_unlocked(loadptr, cur_byte_ct, 1, shared_ff))) {
880 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
881 return kPglRetReadFail;
882 }
883 if (nonref_flags_already_loaded) {
884 if (unlikely(!memequal(nonref_flags_iter, loadbuf, cur_byte_ct))) {
885 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
886 return kPglRetInconsistentInput;
887 }
888 }
889 }
890 }
891
892 const uint32_t raw_sample_ct = pgfip->raw_sample_ct;
893 unsigned char* vrtypes_iter = pgfi_alloc;
894 pgfip->vrtypes = vrtypes_iter;
895 uint64_t* var_fpos_iter = R_CAST(uint64_t*, &(vrtypes_iter[RoundUpPow2(raw_variant_ct + 1, kCacheline)]));
896 pgfip->var_fpos = var_fpos_iter;
897 uint32_t vblock_ct_m1 = (raw_variant_ct - 1) / kPglVblockSize;
898 uint32_t max_vrec_width = 0;
899 uint64_t cur_fpos;
900 #ifdef NO_MMAP
901 assert(shared_ff);
902 #else
903 if (!shared_ff) {
904 if (unlikely(use_blockload)) {
905 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: PgfiInitPhase2() cannot be called with use_blockload set when PgfiInitPhase1() had use_mmap set.\n");
906 return kPglRetImproperFunctionCall;
907 }
908 fread_ptr = &(pgfip->block_base[12 + 8 * vblock_idx_start]);
909 memcpy(&cur_fpos, fread_ptr, sizeof(int64_t));
910 fread_ptr = &(fread_ptr[(vblock_ct_m1 + 1 - vblock_idx_start) * sizeof(int64_t)]);
911 } else {
912 #endif
913 if (vblock_idx_start) {
914 if (unlikely(fseeko(shared_ff, vblock_idx_start * sizeof(int64_t), SEEK_CUR))) {
915 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
916 return kPglRetReadFail;
917 }
918 }
919 if (unlikely(!fread_unlocked(&cur_fpos, sizeof(int64_t), 1, shared_ff))) {
920 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
921 return kPglRetReadFail;
922 }
923 // May also need to load the rest of these values in the future, if we want
924 // to support dynamic insertion into a memory-mapped file. But skip them
925 // for now.
926 if (unlikely(fseeko(shared_ff, (vblock_ct_m1 - vblock_idx_start) * sizeof(int64_t), SEEK_CUR))) {
927 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
928 return kPglRetReadFail;
929 }
930 #ifndef NO_MMAP
931 }
932 #endif
933 const uint32_t vrtype_and_fpos_storage = header_ctrl & 15;
934 const uint32_t alt_allele_ct_byte_ct = (header_ctrl >> 4) & 3;
935 if (alt_allele_ct_byte_ct) {
936 assert(alt_allele_ct_byte_ct == 1);
937 if (unlikely(!allele_idx_offsets_iter)) {
938 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: pgfip->allele_idx_offsets must be allocated before PgfiInitPhase2() is called.\n");
939 return kPglRetImproperFunctionCall;
940 }
941 }
942 uint32_t vblock_idx = vblock_idx_start;
943 vblock_ct_m1 = (vidx_end - 1) / kPglVblockSize;
944 if (vblock_idx) {
945 uintptr_t header_vblock_byte_ct = kPglVblockSize * alt_allele_ct_byte_ct;
946 if (nonref_flags_stored) {
947 header_vblock_byte_ct += kPglVblockSize / CHAR_BIT;
948 }
949 if (vrtype_and_fpos_storage & 8) {
950 header_vblock_byte_ct += kPglVblockSize >> (10 - vrtype_and_fpos_storage);
951 } else {
952 if (!(vrtype_and_fpos_storage & 4)) {
953 header_vblock_byte_ct += kPglVblockSize / 2;
954 } else {
955 header_vblock_byte_ct += kPglVblockSize;
956 }
957 header_vblock_byte_ct += kPglVblockSize * (1 + (vrtype_and_fpos_storage & 3));
958 }
959 #ifndef NO_MMAP
960 if (!shared_ff) {
961 fread_ptr = &(fread_ptr[header_vblock_byte_ct * S_CAST(uint64_t, vblock_idx)]);
962 } else {
963 #endif
964 if (unlikely(fseeko(shared_ff, header_vblock_byte_ct * S_CAST(uint64_t, vblock_idx), SEEK_CUR))) {
965 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
966 return kPglRetReadFail;
967 }
968 #ifndef NO_MMAP
969 }
970 #endif
971 }
972 uint32_t cur_vblock_variant_ct = kPglVblockSize;
973 uint32_t max_allele_ct = pgfip->max_allele_ct;
974 for (; ; ++vblock_idx) {
975 if (vblock_idx >= vblock_ct_m1) {
976 if (vblock_idx > vblock_ct_m1) {
977 // finish up
978 #ifndef NO_MMAP
979 // now > instead of != to allow additional information to be stored
980 // between header and first variant record
981 if (!shared_ff) {
982 if (unlikely(S_CAST(uintptr_t, fread_ptr - pgfip->block_base) > pgfip->var_fpos[0])) {
983 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid .pgen header.\n");
984 return kPglRetMalformedInput;
985 }
986 } else {
987 #endif
988 if (unlikely(S_CAST(uint64_t, ftello(shared_ff)) > pgfip->var_fpos[0])) {
989 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid .pgen header.\n");
990 return kPglRetMalformedInput;
991 }
992 #ifndef NO_MMAP
993 }
994 #endif
995 pgfip->var_fpos[vidx_end] = cur_fpos;
996 pgfip->max_allele_ct = max_allele_ct;
997 // if difflist/LD might be present, scan for them in a way that's
998 // likely to terminate quickly
999 PgenGlobalFlags new_gflags = kfPgenGlobal0;
1000 if (vrtype_and_fpos_storage != 8) {
1001 const uint32_t trailing_byte_ct = vidx_end & (kBytesPerVec - 1);
1002 if (trailing_byte_ct) {
1003 memset(&(pgfip->vrtypes[vidx_end]), 0, kBytesPerVec - trailing_byte_ct);
1004 }
1005 const VecW* vrtypes_alias_start = R_CAST(VecW*, pgfip->vrtypes);
1006 const VecW* vrtypes_alias_end = &(vrtypes_alias_start[DivUp(vidx_end, kBytesPerVec)]);
1007 if (vblock_idx_start) {
1008 vrtypes_alias_start = &(vrtypes_alias_start[vblock_idx_start * (kPglVblockSize / kBytesPerVec)]);
1009 }
1010 const VecW* vrtypes_alias_iter = vrtypes_alias_start;
1011 if (vrtype_and_fpos_storage < 8) {
1012 for (; vrtypes_alias_iter != vrtypes_alias_end; ++vrtypes_alias_iter) {
1013 const VecW cur_vvec = *vrtypes_alias_iter;
1014 #ifdef __LP64__
1015 const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5);
1016 const VecW cur_vvec_bit1 = vecw_slli(cur_vvec, 6);
1017 // check if any vrtype has bit 1 set and bit 2 clear
1018 if (vecw_movemask(vecw_and_notfirst(cur_vvec_bit2, cur_vvec_bit1))) {
1019 new_gflags |= kfPgenGlobalLdCompressionPresent | kfPgenGlobalDifflistOrLdPresent;
1020 break;
1021 }
1022 const VecW cur_vvec_bit0 = vecw_slli(cur_vvec, 7);
1023 if (vecw_movemask(cur_vvec_bit0 | cur_vvec_bit2)) {
1024 // this catches onebit
1025 new_gflags |= kfPgenGlobalDifflistOrLdPresent;
1026 }
1027 #else
1028 const uintptr_t cur_vvec_shifted = cur_vvec >> 1;
1029 // check if any vrtype has bit 1 set and bit 2 clear
1030 if (vecw_and_notfirst(cur_vvec_shifted, cur_vvec) & (2 * kMask0101)) {
1031 new_gflags |= kfPgenGlobalLdCompressionPresent | kfPgenGlobalDifflistOrLdPresent;
1032 break;
1033 }
1034 if (cur_vvec & (5 * kMask0101)) {
1035 // this catches onebit
1036 new_gflags |= kfPgenGlobalDifflistOrLdPresent;
1037 }
1038 #endif
1039 }
1040 }
1041 if (vrtype_and_fpos_storage >= 4) {
1042 // Likely for one of {hphase, dosage} to be present without the
1043 // other; make this scan faster in that case, at the cost of
1044 // failing to early-exit when both are present.
1045 // This is also suboptimal for the vrtype_and_fpos_storage > 8
1046 // special encodings.
1047 VecW or_vvec = vecw_setzero();
1048 for (vrtypes_alias_iter = vrtypes_alias_start; vrtypes_alias_iter != vrtypes_alias_end; ++vrtypes_alias_iter) {
1049 or_vvec |= *vrtypes_alias_iter;
1050 }
1051 #ifdef __LP64__
1052 const VecW or_vvec_bit3 = vecw_slli(or_vvec, 4);
1053 if (vecw_movemask(or_vvec_bit3)) {
1054 // note that, if no phase or dosage data is present, we don't
1055 // look for multiallelic hardcalls.
1056 new_gflags |= kfPgenGlobalMultiallelicHardcallFound;
1057 }
1058 const VecW or_vvec_bit4 = vecw_slli(or_vvec, 3);
1059 if (vecw_movemask(or_vvec_bit4)) {
1060 new_gflags |= kfPgenGlobalHardcallPhasePresent;
1061 }
1062 const VecW or_vvec_bit5 = vecw_slli(or_vvec, 2);
1063 const VecW or_vvec_bit6 = vecw_slli(or_vvec, 1);
1064 if (vecw_movemask(or_vvec_bit5 | or_vvec_bit6)) {
1065 new_gflags |= kfPgenGlobalDosagePresent;
1066 if (vecw_movemask(or_vvec)) {
1067 new_gflags |= kfPgenGlobalDosagePhasePresent;
1068 }
1069 }
1070 #else
1071 if (or_vvec & (8 * kMask0101)) {
1072 new_gflags |= kfPgenGlobalMultiallelicHardcallFound;
1073 }
1074 if (or_vvec & (0x10 * kMask0101)) {
1075 new_gflags |= kfPgenGlobalHardcallPhasePresent;
1076 }
1077 if (or_vvec & (0x60 * kMask0101)) {
1078 new_gflags |= kfPgenGlobalDosagePresent;
1079 if (or_vvec & (0x80 * kMask0101)) {
1080 new_gflags |= kfPgenGlobalDosagePhasePresent;
1081 }
1082 }
1083 #endif
1084 }
1085 if (vrtype_and_fpos_storage > 8) {
1086 if (vrtype_and_fpos_storage == 12) {
1087 max_vrec_width = 3;
1088 } else if (vrtype_and_fpos_storage == 14) {
1089 max_vrec_width = 6;
1090 } else {
1091 max_vrec_width = NypCtToByteCt(raw_sample_ct);
1092 if (vrtype_and_fpos_storage == 9) {
1093 max_vrec_width += 3;
1094 } else if (vrtype_and_fpos_storage == 10) {
1095 max_vrec_width += 2;
1096 } else {
1097 // 11
1098 max_vrec_width += 8;
1099 }
1100 // 13 and 15 not specified yet
1101 }
1102 } else if (!(vrtype_and_fpos_storage & 3)) {
1103 // 1 byte per vrec_len entry, don't bother to determine true
1104 // maximum
1105 max_vrec_width = 255;
1106 }
1107 pgfip->gflags |= new_gflags;
1108 } else {
1109 // vrtype_and_fpos_storage == 8.
1110 max_vrec_width = NypCtToByteCt(raw_sample_ct);
1111 }
1112 *pgr_alloc_cacheline_ct_ptr = CountPgrAllocCachelinesRequired(raw_sample_ct, new_gflags, max_allele_ct, (shared_ff && (!use_blockload))? max_vrec_width : 0);
1113 *max_vrec_width_ptr = max_vrec_width;
1114 return kPglRetSuccess;
1115 }
1116 cur_vblock_variant_ct = ModNz(vidx_end, kPglVblockSize);
1117 }
1118 // 1. handle vrtypes and var_fpos.
1119 if (vrtype_and_fpos_storage >= 8) {
1120 // Special encodings.
1121 uint32_t log2_entry_bit_width = 1;
1122 unsigned char vrtype_table[16];
1123 uint32_t vrec_len_table[16];
1124 vrtype_table[0] = 5;
1125 vrtype_table[1] = 0;
1126 vrec_len_table[0] = 0;
1127 const uint32_t raw_sample_ct4 = NypCtToByteCt(raw_sample_ct);
1128 vrec_len_table[1] = raw_sample_ct4;
1129 if (vrtype_and_fpos_storage == 8) {
1130 log2_entry_bit_width = 0;
1131 } else if (vrtype_and_fpos_storage == 9) {
1132 vrtype_table[2] = 8;
1133 vrtype_table[3] = 8;
1134 vrec_len_table[2] = raw_sample_ct4 + 2;
1135 vrec_len_table[3] = raw_sample_ct4 + 3;
1136 } else if (vrtype_and_fpos_storage == 10) {
1137 vrtype_table[2] = 16;
1138 vrtype_table[3] = 16;
1139 vrec_len_table[2] = raw_sample_ct4 + 1;
1140 vrec_len_table[3] = raw_sample_ct4 + 2;
1141 } else if (vrtype_and_fpos_storage == 11) {
1142 log2_entry_bit_width = 2;
1143 vrtype_table[2] = 8;
1144 vrtype_table[3] = 8;
1145 vrtype_table[4] = 8;
1146 vrtype_table[5] = 8;
1147 vrtype_table[6] = 8;
1148 vrtype_table[7] = 8;
1149 // 8 invalid
1150 vrtype_table[9] = 16;
1151 vrtype_table[10] = 24;
1152 vrtype_table[11] = 24;
1153 vrtype_table[12] = 24;
1154 vrtype_table[13] = 24;
1155 vrtype_table[14] = 24;
1156 vrtype_table[15] = 24;
1157 vrec_len_table[9] = raw_sample_ct4 + 1;
1158 for (uint32_t uii = 2; uii < 8; ++uii) {
1159 vrec_len_table[uii] = raw_sample_ct4 + uii;
1160 vrec_len_table[uii + 8] = raw_sample_ct4 + 1 + uii;
1161 }
1162 } else if (vrtype_and_fpos_storage == 12) {
1163 assert(raw_sample_ct == 1);
1164 vrtype_table[2] = 0x45;
1165 vrtype_table[3] = 0x40;
1166 vrec_len_table[2] = 2;
1167 vrec_len_table[3] = 3;
1168 } else {
1169 // 14 is only remaining possibility for now
1170 assert(raw_sample_ct == 1);
1171 log2_entry_bit_width = 2;
1172 vrtype_table[3] = 0x10;
1173 vrtype_table[4] = 0x45;
1174 vrtype_table[5] = 0x40;
1175 vrtype_table[12] = 0xc5;
1176 vrtype_table[13] = 0xc0;
1177 vrtype_table[15] = 0xe0;
1178 vrec_len_table[3] = 2;
1179 vrec_len_table[4] = 2;
1180 vrec_len_table[5] = 3;
1181 vrec_len_table[12] = 4;
1182 vrec_len_table[13] = 5;
1183 vrec_len_table[15] = 6;
1184 }
1185 const uint32_t entry_bit_width = 1 << log2_entry_bit_width;
1186 const uint32_t entry_mask = (1 << entry_bit_width) - 1;
1187 const uint32_t cur_byte_ct = 1 + ((cur_vblock_variant_ct - 1) >> (3 - log2_entry_bit_width));
1188 const uintptr_t* loadbuf_iter;
1189 #ifdef __arm__
1190 # error "Unaligned accesses in PgfiInitPhase2()."
1191 #endif
1192 #ifndef NO_MMAP
1193 if (!shared_ff) {
1194 loadbuf_iter = R_CAST(const uintptr_t*, fread_ptr);
1195 fread_ptr = &(fread_ptr[cur_byte_ct]);
1196 } else {
1197 #endif
1198 if (unlikely(!fread_unlocked(loadbuf, cur_byte_ct, 1, shared_ff))) {
1199 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1200 return kPglRetReadFail;
1201 }
1202 loadbuf_iter = R_CAST(const uintptr_t*, loadbuf);
1203 #ifndef NO_MMAP
1204 }
1205 #endif
1206 const uint32_t log2_entries_per_word = kBitsPerWordLog2 - log2_entry_bit_width;
1207 const uint32_t block_len = 1 << log2_entries_per_word;
1208 uint32_t cur_vblock_idx = 0;
1209 uint32_t cur_vblock_idx_stop = block_len;
1210 for (; ; cur_vblock_idx_stop += block_len) {
1211 if (cur_vblock_idx_stop > cur_vblock_variant_ct) {
1212 if (cur_vblock_idx == cur_vblock_variant_ct) {
1213 break;
1214 }
1215 cur_vblock_idx_stop = cur_vblock_variant_ct;
1216 }
1217 uintptr_t input_word = *loadbuf_iter++;
1218 for (; cur_vblock_idx != cur_vblock_idx_stop; ++cur_vblock_idx) {
1219 const uint32_t input_word_masked = input_word & entry_mask;
1220 *vrtypes_iter++ = vrtype_table[input_word_masked];
1221 *var_fpos_iter++ = cur_fpos;
1222 cur_fpos += vrec_len_table[input_word_masked];
1223 input_word >>= entry_bit_width;
1224 }
1225 }
1226 } else {
1227 if (vrtype_and_fpos_storage < 4) {
1228 // no phase or dosage present, 4-bit vrtypes
1229 const uint32_t cur_byte_ct = DivUp(cur_vblock_variant_ct, 2);
1230 #ifndef NO_MMAP
1231 if (shared_ff) {
1232 #endif
1233 if (unlikely(!fread_unlocked(loadbuf, cur_byte_ct, 1, shared_ff))) {
1234 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1235 return kPglRetReadFail;
1236 }
1237 fread_ptr = loadbuf;
1238 #ifndef NO_MMAP
1239 }
1240 #endif
1241 const uint32_t word_write_ct = DivUp(cur_vblock_variant_ct, kBytesPerWord);
1242 uintptr_t* vrtypes_alias_fullword = R_CAST(uintptr_t*, vrtypes_iter);
1243 const Halfword* loadbuf_alias_halfword = R_CAST(const Halfword*, fread_ptr);
1244 for (uint32_t widx = 0; widx != word_write_ct; ++widx) {
1245 uintptr_t ww = loadbuf_alias_halfword[widx];
1246 #ifdef USE_AVX2
1247 // speed advantage is small on my Mac since compiler auto-vectorizes
1248 // the code below?
1249 vrtypes_alias_fullword[widx] = _pdep_u64(ww, kMask0F0F);
1250 #else
1251 # ifdef __LP64__
1252 ww = (ww | (ww << 16)) & kMask0000FFFF;
1253 # endif
1254 ww = (ww | (ww << 8)) & kMask00FF;
1255 vrtypes_alias_fullword[widx] = (ww | (ww << 4)) & kMask0F0F;
1256 #endif // !USE_AVX2
1257 }
1258 const uint32_t last_word_byte_ct = cur_vblock_variant_ct % kBytesPerWord;
1259 vrtypes_iter = &(vrtypes_iter[cur_vblock_variant_ct]);
1260 if (last_word_byte_ct) {
1261 ProperSubwordStore(0, kBytesPerWord - last_word_byte_ct, vrtypes_iter);
1262 } else {
1263 // must guarantee a trailing zero for is_ldbase check to work
1264 vrtypes_iter[0] = 0;
1265 }
1266 #ifndef NO_MMAP
1267 if (!shared_ff) {
1268 fread_ptr = &(fread_ptr[cur_byte_ct]);
1269 }
1270 #endif
1271 } else {
1272 // phase and dosage
1273 #ifndef NO_MMAP
1274 if (shared_ff) {
1275 #endif
1276 if (unlikely(!fread_unlocked(vrtypes_iter, cur_vblock_variant_ct, 1, shared_ff))) {
1277 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1278 return kPglRetReadFail;
1279 }
1280 #ifndef NO_MMAP
1281 } else {
1282 memcpy(vrtypes_iter, fread_ptr, cur_vblock_variant_ct);
1283 }
1284 #endif
1285 const uint32_t last_word_byte_ct = cur_vblock_variant_ct % kBytesPerWord;
1286 vrtypes_iter = &(vrtypes_iter[cur_vblock_variant_ct]);
1287 if (last_word_byte_ct) {
1288 ProperSubwordStore(0, kBytesPerWord - last_word_byte_ct, vrtypes_iter);
1289 } else {
1290 // must guarantee a trailing zero for is_ldbase check to work
1291 vrtypes_iter[0] = 0;
1292 }
1293 #ifndef NO_MMAP
1294 if (!shared_ff) {
1295 fread_ptr = &(fread_ptr[cur_vblock_variant_ct]);
1296 }
1297 #endif
1298 }
1299 const uint32_t bytes_per_entry = 1 + (vrtype_and_fpos_storage & 3);
1300 const uint32_t cur_byte_ct = cur_vblock_variant_ct * bytes_per_entry;
1301 #ifndef NO_MMAP
1302 if (shared_ff) {
1303 #endif
1304 if (unlikely(!fread_unlocked(loadbuf, cur_byte_ct, 1, shared_ff))) {
1305 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1306 return kPglRetReadFail;
1307 }
1308 fread_ptr = loadbuf;
1309 #ifndef NO_MMAP
1310 }
1311 #endif
1312 if (bytes_per_entry == 1) {
1313 for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1314 var_fpos_iter[cur_vblock_vidx] = cur_fpos;
1315 uint32_t cur_vrec_len = fread_ptr[cur_vblock_vidx];
1316 cur_fpos += cur_vrec_len;
1317 // no need for correct max_vrec_width
1318 }
1319 } else if (bytes_per_entry == 2) {
1320 for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1321 var_fpos_iter[cur_vblock_vidx] = cur_fpos;
1322 uint16_t cur_vrec_len;
1323 memcpy_k(&cur_vrec_len, &(fread_ptr[cur_vblock_vidx * 2]), 2);
1324 cur_fpos += cur_vrec_len;
1325 if (cur_vrec_len > max_vrec_width) {
1326 // todo: check whether we're better off just assuming 2^16 - 1
1327 max_vrec_width = cur_vrec_len;
1328 }
1329 }
1330 } else if (bytes_per_entry == 3) {
1331 for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1332 var_fpos_iter[cur_vblock_vidx] = cur_fpos;
1333 uint32_t cur_vrec_len;
1334 // safe to read a byte past the end, since that's either in loadbuf
1335 // or, in mmap case, we can't be at the end of a valid file
1336 memcpy(&cur_vrec_len, &(fread_ptr[cur_vblock_vidx * 3]), sizeof(int32_t));
1337 cur_vrec_len &= 0xffffff;
1338 cur_fpos += cur_vrec_len;
1339 if (cur_vrec_len > max_vrec_width) {
1340 max_vrec_width = cur_vrec_len;
1341 }
1342 }
1343 } else {
1344 for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1345 var_fpos_iter[cur_vblock_vidx] = cur_fpos;
1346 uint32_t cur_vrec_len;
1347 memcpy(&cur_vrec_len, &(fread_ptr[cur_vblock_vidx * 4]), 4);
1348 cur_fpos += cur_vrec_len;
1349 if (cur_vrec_len > max_vrec_width) {
1350 max_vrec_width = cur_vrec_len;
1351 }
1352 }
1353 #ifdef __LP64__
1354 if (unlikely(max_vrec_width > kPglMaxBytesPerVariant)) {
1355 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid .pgen header.\n");
1356 return kPglRetMalformedInput;
1357 }
1358 #else
1359 if (unlikely(max_vrec_width > kMaxBytesPerIO)) {
1360 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Variant records too large for 32-bit pgenlib.\n");
1361 return kPglRetNomem;
1362 }
1363 #endif
1364 }
1365 var_fpos_iter = &(var_fpos_iter[cur_vblock_variant_ct]);
1366 #ifndef NO_MMAP
1367 if (!shared_ff) {
1368 fread_ptr = &(fread_ptr[cur_byte_ct]);
1369 }
1370 #endif
1371 }
1372 // 2. allele counts?
1373 if (alt_allele_ct_byte_ct) {
1374 assert(alt_allele_ct_byte_ct == 1);
1375 #ifndef NO_MMAP
1376 if (shared_ff) {
1377 #endif
1378 if (unlikely(!fread_unlocked(loadbuf, cur_vblock_variant_ct * alt_allele_ct_byte_ct, 1, shared_ff))) {
1379 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1380 return kPglRetReadFail;
1381 }
1382 fread_ptr = loadbuf;
1383 #ifndef NO_MMAP
1384 }
1385 #endif
1386 // max_allele_ct scan can probably be sped up with _mm{256}_max_epu8()?
1387 // probably can't do much for main loop (at least in sizeof(AlleleCode)
1388 // == 1 case)
1389 if (allele_cts_already_loaded) {
1390 // todo: update this for multibyte AlleleCode
1391 for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1392 const uintptr_t cur_allele_idx_offset = allele_idx_offsets_iter[cur_vblock_vidx];
1393 const uint32_t cur_allele_ct = fread_ptr[cur_vblock_vidx];
1394 if (unlikely((cur_allele_idx_offset - prev_allele_idx_offset) != cur_allele_ct)) {
1395 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded allele_idx_offsets do not match values in .pgen file.\n");
1396 return kPglRetInconsistentInput;
1397 }
1398 prev_allele_idx_offset = cur_allele_idx_offset;
1399 if (cur_allele_ct > max_allele_ct) {
1400 max_allele_ct = cur_allele_ct;
1401 }
1402 }
1403 } else {
1404 for (uint32_t cur_vblock_vidx = 0; cur_vblock_vidx != cur_vblock_variant_ct; ++cur_vblock_vidx) {
1405 const uint32_t cur_allele_ct = fread_ptr[cur_vblock_vidx];
1406 allele_idx_offsets_iter[cur_vblock_vidx] = prev_allele_idx_offset;
1407 prev_allele_idx_offset += cur_allele_ct;
1408 if (cur_allele_ct > max_allele_ct) {
1409 max_allele_ct = cur_allele_ct;
1410 }
1411 }
1412 }
1413 allele_idx_offsets_iter = &(allele_idx_offsets_iter[cur_vblock_variant_ct]);
1414 #ifndef NO_MMAP
1415 if (!shared_ff) {
1416 fread_ptr = &(fread_ptr[cur_vblock_variant_ct * alt_allele_ct_byte_ct]);
1417 }
1418 #endif
1419 }
1420 // 3. nonref flags?
1421 if (nonref_flags_stored) {
1422 const uint32_t cur_byte_ct = DivUp(cur_vblock_variant_ct, CHAR_BIT);
1423 #ifndef NO_MMAP
1424 if (!shared_ff) {
1425 if (nonref_flags_already_loaded) {
1426 if (unlikely(!memequal(nonref_flags_iter, fread_ptr, cur_byte_ct))) {
1427 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
1428 return kPglRetInconsistentInput;
1429 }
1430 } else {
1431 memcpy(nonref_flags_iter, fread_ptr, cur_byte_ct);
1432 }
1433 fread_ptr = &(fread_ptr[cur_byte_ct]);
1434 } else {
1435 #endif
1436 unsigned char* loadptr = nonref_flags_already_loaded? loadbuf : nonref_flags_iter;
1437 if (unlikely(!fread_unlocked(loadptr, cur_byte_ct, 1, shared_ff))) {
1438 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
1439 return kPglRetReadFail;
1440 }
1441 if (nonref_flags_already_loaded) {
1442 if (unlikely(!memequal(nonref_flags_iter, loadbuf, cur_byte_ct))) {
1443 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Loaded nonref_flags do not match values in .pgen file.\n");
1444 return kPglRetInconsistentInput;
1445 }
1446 }
1447 #ifndef NO_MMAP
1448 }
1449 #endif
1450 nonref_flags_iter = &(nonref_flags_iter[cur_byte_ct]);
1451 }
1452 }
1453 }
1454
uint32_t GetLdbaseVidx(const unsigned char* vrtypes, uint32_t cur_vidx) {
  // Returns the index of the closest variant before cur_vidx which is NOT
  // LD-compressed, i.e. whose vrtype byte has (bit 2) set or (bit 1) clear.
  // Assumes such a variant exists below cur_vidx; otherwise the backward scan
  // runs off the front of the array.  (NOTE(review): presumably the .pgen
  // format guarantees a non-LD-compressed variant at each vblock start —
  // confirm against the format spec.)
#ifdef __LP64__
  const VecW* vrtypes_valias = R_CAST(const VecW*, vrtypes);
  const uint32_t cur_vidx_orig_remainder = cur_vidx % kBytesPerVec;
  uint32_t vidx_vec_idx = cur_vidx / kBytesPerVec;
  Vec8thUint v8ui = 0;
  if (cur_vidx_orig_remainder) {
    const VecW cur_vvec = vrtypes_valias[vidx_vec_idx];
    // non-ld: ((bit 2) OR (NOT bit 1))
    // shift moves bit 2 of each vrtype byte into that byte's sign bit (7);
    // likewise bit 1 below.  movemask then yields one "non-LD" flag per byte.
    const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5);
    const VecW inv_cur_vvec_bit1 = ~vecw_slli(cur_vvec, 6);
    v8ui = vecw_movemask(cur_vvec_bit2 | inv_cur_vvec_bit1);
    // zero out flags at positions >= cur_vidx within this vector, so we only
    // detect earlier variants.
    v8ui = bzhi(v8ui, cur_vidx_orig_remainder);
  }
  while (!v8ui) {
    // walk backward one vector at a time until some byte is non-LD.
    const VecW cur_vvec = vrtypes_valias[--vidx_vec_idx];
    const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5);
    const VecW inv_cur_vvec_bit1 = ~vecw_slli(cur_vvec, 6);
    v8ui = vecw_movemask(cur_vvec_bit2 | inv_cur_vvec_bit1);
  }
  // highest set flag = closest preceding non-LD variant.
  return (vidx_vec_idx * kBytesPerVec) + bsru32(v8ui);
#else
  const uintptr_t* vrtypes_walias = R_CAST(const uintptr_t*, vrtypes);
  const uint32_t cur_vidx_orig_remainder = cur_vidx % kBytesPerWord;
  // (cur_vidx - 1) so that, when cur_vidx is word-aligned, we start at the
  // previous word (mirroring the pre-decrement in the 64-bit loop above).
  uint32_t vidx_word_idx = (cur_vidx - 1) / kBytesPerWord;
  uintptr_t cur_vrtypes_word = vrtypes_walias[vidx_word_idx];
  if (cur_vidx_orig_remainder) {
    // make sure we don't detect a byte after the current position.
    cur_vrtypes_word = bzhi(cur_vrtypes_word, CHAR_BIT * cur_vidx_orig_remainder);
    // set bit 1 of every byte at/after cur_vidx, so those bytes read as
    // LD-compressed and are skipped by the detection expression below.
    cur_vrtypes_word |= (kMask0101 * 2) << (CHAR_BIT * cur_vidx_orig_remainder);
  }
  while (1) {
    // ((bit 2) OR (NOT bit 1)) for each byte. (possible experiment: see if
    // the same assembly is generated if this expression is rewritten to use
    // ands/nots.)
    const uintptr_t detect_non_ld_word = ((cur_vrtypes_word >> 1) | (~cur_vrtypes_word)) & (kMask0101 * 2);
    if (detect_non_ld_word) {
      // find the highest-order set bit in detect_non_ld_word; this corresponds
      // to the last non-LD-compressed byte (assuming little-endian).
      const uint32_t new_ldbase_vidx_loworder = bsrw(detect_non_ld_word) / CHAR_BIT;
      return (vidx_word_idx * kBytesPerWord) + new_ldbase_vidx_loworder;
    }
    // everything LD-compressed in the current block. move back 8 bytes in the
    // array (or 4-bytes for 32-bit build).
    cur_vrtypes_word = vrtypes_walias[--vidx_word_idx];
  }
#endif
}
1503
uint64_t PgfiMultireadGetCachelineReq(const uintptr_t* variant_include, const PgenFileInfo* pgfip, uint32_t variant_ct, uint32_t block_size) {
  // Returns the number of cachelines needed by a buffer large enough to hold
  // any single block of (up to) block_size variant records loaded by
  // PgfiMultiread(), restricted to the variants in variant_include.
  // if block_size < kPglVblockSize, it's ideal for it to be a power of 2 (to
  // avoid unnecessary vblock crossing), but that's not required.
  const uint32_t raw_variant_ct = pgfip->raw_variant_ct;
  if (variant_ct == raw_variant_ct) {
    // no proper subsetting; simplifies the scan below.
    variant_include = nullptr;
  }
  uint32_t block_ct_m1 = 0;
  if (raw_variant_ct < block_size) {
    block_size = raw_variant_ct;
  } else {
    block_ct_m1 = (raw_variant_ct - 1) / block_size;
  }
  const uint64_t* var_fpos = pgfip->var_fpos;
  if ((!variant_include) && (!var_fpos)) {
    // fixed-width records, no subsetting: worst case is simply a full block.
    return DivUpU64(S_CAST(uint64_t, pgfip->const_vrec_width) * block_size, kCacheline);
  }
  uint64_t max_block_byte_ct = 0;
  uint32_t max_block_variant_ct = 0;
  for (uint32_t block_idx = 0; ; ++block_idx) {
    uint32_t variant_uidx_start = block_idx * block_size;
    uint32_t variant_uidx_end = variant_uidx_start + block_size;
    if (block_idx >= block_ct_m1) {
      if (block_idx > block_ct_m1) {
        break;
      }
      // last block may be shorter than block_size.
      variant_uidx_end = raw_variant_ct;
    }
    if (variant_include) {
      // shrink the range to [first included, last included + 1) within this
      // block.
      variant_uidx_start = AdvBoundedTo1Bit(variant_include, variant_uidx_start, variant_uidx_end);
      if (variant_uidx_start == variant_uidx_end) {
        // no included variants in this block.
        continue;
      }
      variant_uidx_end = 1 + FindLast1BitBefore(variant_include, variant_uidx_end);
    }
    if (var_fpos) {
      if (pgfip->vrtypes && ((pgfip->vrtypes[variant_uidx_start] & 6) == 2)) {
        // need to start loading from LD-buddy
        variant_uidx_start = GetLdbaseVidx(pgfip->vrtypes, variant_uidx_start);
      }
      uint64_t cur_block_byte_ct = var_fpos[variant_uidx_end] - var_fpos[variant_uidx_start];
      if (cur_block_byte_ct > max_block_byte_ct) {
        max_block_byte_ct = cur_block_byte_ct;
      }
    } else {
      // no LD compression here
      const uint32_t cur_block_variant_ct = variant_uidx_end - variant_uidx_start;
      if (cur_block_variant_ct > max_block_variant_ct) {
        max_block_variant_ct = cur_block_variant_ct;
        if (cur_block_variant_ct == block_size) {
          // no larger value possible, terminate search
          break;
        }
      }
    }
  }
  if (!var_fpos) {
    // fixed-width records: convert the worst-case variant count to bytes.
    max_block_byte_ct = max_block_variant_ct * S_CAST(uint64_t, pgfip->const_vrec_width);
  }
  return DivUpU64(max_block_byte_ct, kCacheline);
}
1565
PglErr PgfiMultiread(const uintptr_t* variant_include, uint32_t variant_uidx_start, uint32_t variant_uidx_end, uint32_t load_variant_ct, PgenFileInfo* pgfip) {
  // Reads the raw records of the load_variant_ct included variants in
  // [variant_uidx_start, variant_uidx_end) into pgfip->block_base, and sets
  // pgfip->block_offset to the file offset of the first loaded byte.
  // NOTE(review): when variant_include == nullptr, this assumes
  // load_variant_ct == variant_uidx_end - variant_uidx_start; otherwise the
  // AdvTo0Bit() call below would dereference a null pointer — confirm the
  // caller contract.
  // we could permit 0, but that encourages lots of unnecessary thread wakeups
  assert(load_variant_ct);
  if (variant_include) {
    variant_uidx_start = AdvTo1Bit(variant_include, variant_uidx_start);
  }
  assert(variant_uidx_start < pgfip->raw_variant_ct);
  uint64_t block_offset;
  if (pgfip->vrtypes && ((pgfip->vrtypes[variant_uidx_start] & 6) == 2)) {
    // need to start loading from LD-buddy
    // assume for now that we can't skip any variants between the LD-buddy and
    // the actual first variant; should remove this assumption later
    block_offset = pgfip->var_fpos[GetLdbaseVidx(pgfip->vrtypes, variant_uidx_start)];
  } else {
    block_offset = GetPgfiFpos(pgfip, variant_uidx_start);
  }
  pgfip->block_offset = block_offset;
  uint64_t next_read_start_fpos = block_offset;
  // break this up into multiple freads whenever this lets us skip an entire
  // disk block
  // (possible todo: make the disk block size a parameter of this function)
  do {
    const uint64_t cur_read_start_fpos = next_read_start_fpos;
    uint32_t cur_read_uidx_end;
    uint64_t cur_read_end_fpos;
    // determine how far the current contiguous read extends: keep absorbing
    // runs of excluded variants whenever skipping them can't save a full disk
    // block.
    while (1) {
      cur_read_uidx_end = variant_uidx_end;
      if (cur_read_uidx_end - variant_uidx_start == load_variant_ct) {
        // all remaining variants are contiguous; this is the final read.
        cur_read_end_fpos = GetPgfiFpos(pgfip, cur_read_uidx_end);
        load_variant_ct = 0;
        break;
      }
      // end the read at the next excluded variant, for now.
      cur_read_uidx_end = AdvTo0Bit(variant_include, variant_uidx_start);
      cur_read_end_fpos = GetPgfiFpos(pgfip, cur_read_uidx_end);
      load_variant_ct -= cur_read_uidx_end - variant_uidx_start;
      if (!load_variant_ct) {
        break;
      }
      variant_uidx_start = AdvTo1Bit(variant_include, cur_read_uidx_end);
      next_read_start_fpos = GetPgfiFpos(pgfip, variant_uidx_start);
      if (pgfip->vrtypes && ((pgfip->vrtypes[variant_uidx_start] & 6) == 2)) {
        // next included variant is LD-compressed; its buddy must be loaded
        // too.
        const uint32_t variant_read_uidx_start = GetLdbaseVidx(pgfip->vrtypes, variant_uidx_start);
        if (variant_read_uidx_start <= cur_read_uidx_end) {
          // the LD-buddy lies within the current read span; keep extending.
          continue;
        }
        next_read_start_fpos = pgfip->var_fpos[variant_read_uidx_start];
      }
      // bugfix: can't use do..while, since previous "continue" needs to skip
      // this check
      if (RoundDownPow2U64(cur_read_end_fpos + kDiskBlockSize + 1LLU, kDiskBlockSize) < RoundDownPow2U64(next_read_start_fpos, kDiskBlockSize)) {
        // minor bugfix (7 Jul 2017): break, not continue
        break;
      }
    }
    if (unlikely(fseeko(pgfip->shared_ff, cur_read_start_fpos, SEEK_SET))) {
      return kPglRetReadFail;
    }
    uintptr_t len = cur_read_end_fpos - cur_read_start_fpos;
    // destination offset within block_base mirrors the file offset relative
    // to the block start.
    if (unlikely(fread_checked(K_CAST(unsigned char*, &(pgfip->block_base[cur_read_start_fpos - block_offset])), len, pgfip->shared_ff))) {
      return kPglRetReadFail;
    }
  } while (load_variant_ct);
  return kPglRetSuccess;
}
1630
1631
void PreinitPgr(PgenReader* pgr_ptr) {
  // Mark the file handle as unopened, so cleanup is safe even if PgrInit()
  // is never called (or fails early).
  GetPgrp(pgr_ptr)->ff = nullptr;
}
1636
PglErr PgrInit(const char* fname, uint32_t max_vrec_width, PgenFileInfo* pgfip, PgenReader* pgr_ptr, unsigned char* pgr_alloc) {
  // Initializes a per-thread PgenReader against pgfip, carving all workspace
  // buffers this reader needs out of the caller-provided pgr_alloc arena.
  // Which buffers are carved depends on pgfip's global flags and max allele
  // count; the layout must stay in sync with
  // CountPgrAllocCachelinesRequired().
  PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
  // See CountPgrAllocCachelinesRequired().
  // Could add a debug mode.

  // Mode 1 (mmap): block_base initialized, shared_ff == nullptr.  fname must
  //   be nullptr.
  // Mode 2 (block-fread): block_base initialized, shared_ff != nullptr.  fname
  //   must be nullptr.
  // Mode 3 (per-variant fread): block_base == nullptr.  fname must be
  //   non-null, though it isn't actually referenced during the first
  //   PgenReader initialization (instead shared_ff is moved).
  unsigned char* pgr_alloc_iter = pgr_alloc;
  if (pgfip->block_base != nullptr) {
    // Modes 1/2: data is accessed through pgfip, no per-reader file handle.
    if (unlikely(fname != nullptr)) {
      return kPglRetImproperFunctionCall;
    }
    pgrp->ff = nullptr;  // make sure CleanupPgr() doesn't break
  } else {
    if (pgfip->shared_ff != nullptr) {
      if (unlikely(fname == nullptr)) {
        return kPglRetImproperFunctionCall;
      }
      // move instead of close/reopen.
      pgrp->ff = pgfip->shared_ff;
      pgfip->shared_ff = nullptr;
    } else {
      pgrp->ff = fopen(fname, FOPEN_RB);
      if (unlikely(!pgrp->ff)) {
        return kPglRetOpenFail;
      }
    }
    // now that arbitrary info can be stored between header and first variant
    // record, always seek.
    uint64_t seek_pos;
    if (pgfip->var_fpos) {
      seek_pos = pgfip->var_fpos[0];
    } else {
      seek_pos = pgfip->const_fpos_offset;
    }
    if (unlikely(fseeko(pgrp->ff, seek_pos, SEEK_SET))) {
      return kPglRetReadFail;
    }
  }
  pgrp->fi = *pgfip;  // struct copy
  if (fname) {
    // Mode 3 per-reader load buffer
    pgrp->fread_buf = pgr_alloc_iter;
    pgr_alloc_iter = &(pgr_alloc_iter[RoundUpPow2(max_vrec_width, kCacheline)]);
  }
  // no variant loaded yet, and no LD-decompression base cached.
  pgrp->fp_vidx = 0;
  pgrp->ldbase_vidx = UINT32_MAX;
  pgrp->ldbase_stypes = kfPgrLdcache0;
  pgrp->ldbase_genovec = nullptr;
  pgrp->ldbase_raregeno = nullptr;
  pgrp->ldbase_difflist_sample_ids = nullptr;

  const PgenGlobalFlags gflags = pgrp->fi.gflags;
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  // all buffers below are cacheline-aligned slices of pgr_alloc.
  const uint32_t genovec_bytes_req = NypCtToCachelineCt(raw_sample_ct) * kCacheline;
  pgrp->ldbase_raw_genovec = R_CAST(uintptr_t*, pgr_alloc_iter);
  pgr_alloc_iter = &(pgr_alloc_iter[genovec_bytes_req]);
  const uint32_t bitvec_bytes_req = BitCtToCachelineCt(raw_sample_ct) * kCacheline;
  const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1;
  const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor);
  const uint32_t max_allele_ct = pgrp->fi.max_allele_ct;
  pgrp->workspace_difflist_sample_ids = nullptr;
  if ((gflags & kfPgenGlobalDifflistOrLdPresent) || (max_allele_ct > 2)) {
    pgrp->workspace_difflist_sample_ids = R_CAST(uint32_t*, pgr_alloc_iter);
    pgr_alloc_iter = &(pgr_alloc_iter[(1 + (max_difflist_entry_ct_base / kInt32PerCacheline)) * (kCacheline * k1LU)]);
  }
  if (gflags & kfPgenGlobalDifflistOrLdPresent) {
    // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);

    pgrp->workspace_raregeno_vec = R_CAST(uintptr_t*, pgr_alloc_iter);
    pgr_alloc_iter = &(pgr_alloc_iter[NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline]);

    pgrp->workspace_raregeno_tmp_loadbuf = R_CAST(uintptr_t*, pgr_alloc_iter);
    pgr_alloc_iter = &(pgr_alloc_iter[NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline]);

    if (ld_compression_present) {
      // LD-decompression cache: base genovec plus difflist representation.
      pgrp->ldbase_genovec = R_CAST(uintptr_t*, pgr_alloc_iter);
      pgr_alloc_iter = &(pgr_alloc_iter[genovec_bytes_req]);

      pgrp->ldbase_raregeno = R_CAST(uintptr_t*, pgr_alloc_iter);
      pgr_alloc_iter = &(pgr_alloc_iter[NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline]);

      pgrp->ldbase_difflist_sample_ids = R_CAST(uint32_t*, pgr_alloc_iter);
      pgr_alloc_iter = &(pgr_alloc_iter[(1 + (max_difflist_entry_ct_base / kInt32PerCacheline)) * (kCacheline * k1LU)]);
    }
  } else {
    pgrp->workspace_raregeno_vec = nullptr;
    pgrp->workspace_raregeno_tmp_loadbuf = nullptr;
  }
  pgrp->workspace_vec = nullptr;
  pgrp->workspace_aux1x_present = nullptr;
  pgrp->workspace_imp_r2 = nullptr;
  pgrp->workspace_all_hets = nullptr;
  pgrp->workspace_subset = nullptr;
  const PgenGlobalFlags gflags_hphase_dosage = gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent);
  if ((max_allele_ct > 2) || gflags_hphase_dosage) {
    pgrp->workspace_vec = R_CAST(uintptr_t*, pgr_alloc_iter);
    pgr_alloc_iter = &(pgr_alloc_iter[genovec_bytes_req]);
    if (max_allele_ct > 2) {
      // multiallelic-specific buffers.
      pgrp->workspace_aux1x_present = R_CAST(uintptr_t*, pgr_alloc_iter);
      pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
      pgrp->workspace_imp_r2 = R_CAST(uint64_t*, pgr_alloc_iter);
      pgr_alloc_iter = &(pgr_alloc_iter[Int64CtToCachelineCt(2 * max_allele_ct) * (kCacheline * k1LU)]);
    }
    if (gflags & kfPgenGlobalHardcallPhasePresent) {
      pgrp->workspace_all_hets = R_CAST(uintptr_t*, pgr_alloc_iter);
      pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
      pgrp->workspace_subset = R_CAST(uintptr_t*, pgr_alloc_iter);
      pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
    }
    pgrp->workspace_dosage_present = nullptr;
    pgrp->workspace_dphase_present = nullptr;
    if (gflags & kfPgenGlobalDosagePresent) {
      pgrp->workspace_dosage_present = R_CAST(uintptr_t*, pgr_alloc_iter);
      pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
      if (gflags & kfPgenGlobalDosagePhasePresent) {
        pgrp->workspace_dphase_present = R_CAST(uintptr_t*, pgr_alloc_iter);
      }
      // pgr_alloc_iter = &(pgr_alloc_iter[bitvec_bytes_req]);
    }
  }
  return kPglRetSuccess;
}
1765
void PgrPlink1ToPlink2InplaceUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
  // Converts PLINK 1 2-bit genotype encoding to PLINK 2, in place:
  //   00 -> 10, 01 -> 11, 10 -> 01, 11 -> 00
  // i.e. new low bit = [old low] ^ [old high], new high bit = ~[old high].
  // "unsafe" because trailing bits are not zeroed out.
  const VecW low_mask = VCONST_W(kMask5555);
  const VecW high_mask = VCONST_W(kMaskAAAA);
  VecW* vec_iter = R_CAST(VecW*, genovec);
  VecW* vec_stop = &(vec_iter[NypCtToVecCt(sample_ct)]);
  for (; vec_iter != vec_stop; ++vec_iter) {
    const VecW orig_vec = *vec_iter;
    const VecW inverted_high = vecw_and_notfirst(orig_vec, high_mask);
    *vec_iter = (vecw_and_notfirst(orig_vec, low_mask) ^ vecw_srli(inverted_high, 1)) | inverted_high;
  }
}
1780
void PgrPlink2ToPlink1InplaceUnsafe(uint32_t sample_ct, uintptr_t* genovec) {
  // Converts PLINK 2 2-bit genotype encoding back to PLINK 1, in place:
  //   00 -> 11, 01 -> 10, 10 -> 00, 11 -> 01
  // i.e. new low bit = [old low] ^ (~[old high]), new high bit = ~[old high].
  // "unsafe" because trailing bits are not zeroed out.
  const VecW high_mask = VCONST_W(kMaskAAAA);
  VecW* vec_iter = R_CAST(VecW*, genovec);
  VecW* vec_stop = &(vec_iter[NypCtToVecCt(sample_ct)]);
  for (; vec_iter != vec_stop; ++vec_iter) {
    const VecW orig_vec = *vec_iter;
    const VecW inverted_high = vecw_and_notfirst(orig_vec, high_mask);
    *vec_iter = (vecw_and_notfirst(high_mask, orig_vec) ^ vecw_srli(inverted_high, 1)) | inverted_high;
  }
}
1794
ParseDifflistHeader(const unsigned char * fread_end,uint32_t raw_sample_ct,const unsigned char ** fread_pp,uintptr_t * raregeno_buf,const unsigned char ** difflist_group_info_ptr,uint32_t * difflist_len_ptr)1795 PglErr ParseDifflistHeader(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* raregeno_buf, const unsigned char** difflist_group_info_ptr, uint32_t* difflist_len_ptr) {
1796 // Can be used for deltalists as well: pass raregeno_buf == nullptr.
1797 // Trailing bits of raregeno may not be zeroed out.
1798 // Will need a separate 64-bit version of this for multiallelic dosages.
1799 const uint32_t difflist_len = GetVint31(fread_end, fread_pp);
1800 // moved here to address maybe-uninitialized warnings
1801 *difflist_group_info_ptr = *fread_pp;
1802 *difflist_len_ptr = difflist_len;
1803 if (!difflist_len) {
1804 return kPglRetSuccess;
1805 }
1806 if (unlikely(difflist_len > raw_sample_ct / kPglMaxDifflistLenDivisor)) {
1807 // automatically catches GetVint31() failure
1808 return kPglRetMalformedInput;
1809 }
1810 const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize);
1811 const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
1812 const uint32_t difflist_index_byte_ct = group_ct * (sample_id_byte_ct + 1) - 1;
1813 if (PtrAddCk(fread_end, difflist_index_byte_ct, fread_pp)) {
1814 return kPglRetMalformedInput;
1815 }
1816 if (!raregeno_buf) {
1817 // for sample ID lists without 2-bit genotype info, used for sparse dosage
1818 return kPglRetSuccess;
1819 }
1820 const uint32_t raregeno_byte_ct = NypCtToByteCt(difflist_len);
1821 const unsigned char* raregeno_start = *fread_pp;
1822 if (PtrAddCk(fread_end, raregeno_byte_ct, fread_pp)) {
1823 return kPglRetMalformedInput;
1824 }
1825 // possible todo: just return raregeno_start, and let the caller perform this
1826 // copy
1827 memcpy(raregeno_buf, raregeno_start, raregeno_byte_ct);
1828 return kPglRetSuccess;
1829 }
1830
PglErr ParseAndSaveDifflist(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
  // Decodes a full difflist: on success, raregeno holds the packed 2-bit
  // genotypes, difflist_sample_ids[] the decoded absolute sample indexes, and
  // *difflist_len_ptr the entry count.
  // Appropriate when we need to iterate through the difflist multiple times.
  // Other functions are more efficient if we only need to process the list
  // once.
  // Trailing bits of raregeno may not be zeroed out.
  const unsigned char* group_info_iter;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno, &group_info_iter, difflist_len_ptr);
  uint32_t difflist_len = *difflist_len_ptr;
  // todo: check if difflist_len == 0 early exit is a net positive or negative
  // on a few test datasets
  if (reterr || (!difflist_len)) {
    return reterr;
  }
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  uint32_t* difflist_sample_ids_iter = difflist_sample_ids;
  for (uint32_t difflist_remaining = difflist_len; ; ) {
    const uint32_t* difflist_sample_ids_stop;
    if (difflist_remaining < kPglDifflistGroupSize) {
      if (!difflist_remaining) {
        return kPglRetSuccess;
      }
      // final (partial) group.
      difflist_sample_ids_stop = &(difflist_sample_ids_iter[difflist_remaining]);
      difflist_remaining = 0;
    } else {
      difflist_sample_ids_stop = &(difflist_sample_ids_iter[kPglDifflistGroupSize]);
      difflist_remaining -= kPglDifflistGroupSize;
    }
    // Each group begins with an absolute sample index stored in
    // sample_id_byte_ct bytes; later entries are vint-encoded deltas.
    // can't use uint32_t assignment trick for now since there's a corner case
    // where that would read past the end of the mapped address range
    uintptr_t raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
    group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    while (1) {
#ifndef __LP64__
      // perform more frequent checks in 32-bit build since raw_sample_idx may
      // overflow
      // misses "small negative" malformed input, but it'll catch data
      // corruption with very high probability
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      *difflist_sample_ids_iter++ = raw_sample_idx;
      if (difflist_sample_ids_iter == difflist_sample_ids_stop) {
        break;
      }
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
#ifdef __LP64__
    // 64-bit build: deltas are nonnegative, so indexes within a group only
    // increase; validating the final value catches any out-of-range entry.
    if (unlikely(raw_sample_idx >= raw_sample_ct)) {
      return kPglRetMalformedInput;
    }
#endif
  }
  // unreachable: the loop above always exits via a return.
  return kPglRetSuccess;
}
1886
PglErr ParseAndSaveDifflistProperSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr, uintptr_t* __restrict raregeno_workspace) {
  // Like ParseAndSaveDifflist(), but keeps only entries whose raw sample
  // index is in sample_include; kept entries are written with subsetted
  // sample indexes and their 2-bit genotypes repacked contiguously.
  // Requires a PROPER subset.  Might want to just merge this with
  // ParseAndSaveDifflist() and rename appropriately.
  // Trailing bits of raregeno are zeroed out.
  uint32_t raw_difflist_len;
  const unsigned char* group_info_iter;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &raw_difflist_len);
  if (reterr || (!raw_difflist_len)) {
    *difflist_len_ptr = 0;
    return reterr;
  }
  const uint32_t subgroup_idx_last = (raw_difflist_len - 1) / kBitsPerWordD2;
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
  uintptr_t* raregeno_iter = raregeno;
  uint32_t* difflist_sample_ids_iter = difflist_sample_ids;

  // technically doesn't need to be initialized, but I have principles
  uintptr_t raw_sample_idx = 0;

  uintptr_t raregeno_word = 0;
  uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1;
  uint32_t difflist_len_lowbits = 0;
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        // flush the final partial raregeno word, then report the subsetted
        // length.
        if (difflist_len_lowbits) {
          *raregeno_iter = raregeno_word;
        }
        *difflist_len_ptr = S_CAST(uintptr_t, difflist_sample_ids_iter - difflist_sample_ids) + difflist_len_lowbits;
        return kPglRetSuccess;
      }
      // last subgroup may be partial.
      subgroup_len_m1 &= raw_difflist_len - 1;
    }
    // We need to consume a new rare genotype word every 32 entries, and pull a
    // raw sample index from the difflist header every 64 entries.  So it's
    // best to make the inner loop have a period of 32 (call this a 'subgroup',
    // where 'group' refers to a set of 64 entries).
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
#ifdef __LP64__
      // 64-bit build: indexes only increase within a group, so validating at
      // group boundaries is sufficient.
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    uintptr_t raregeno_workspace_word = *raregeno_workspace_iter++;
    for (uint32_t raw_difflist_idx_lowbits = 0; ; ++raw_difflist_idx_lowbits) {
#ifndef __LP64__
      // 32-bit build: check every entry since raw_sample_idx may overflow.
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      if (IsSet(sample_include, raw_sample_idx)) {
        // keep this entry: repack its 2-bit genotype and store the subsetted
        // sample index.
        raregeno_word |= ((raregeno_workspace_word >> (2 * raw_difflist_idx_lowbits)) & 3) << (difflist_len_lowbits * 2);
        difflist_sample_ids_iter[difflist_len_lowbits] = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx);
        if (difflist_len_lowbits++ == (kBitsPerWordD2 - 1)) {
          // output word is full; flush it.
          *raregeno_iter++ = raregeno_word;
          raregeno_word = 0;
          difflist_len_lowbits = 0;
          difflist_sample_ids_iter = &(difflist_sample_ids_iter[kBitsPerWordD2]);
        }
      }
      if (raw_difflist_idx_lowbits == subgroup_len_m1) {
        break;
      }
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
  }
}
1960
PglErr ParseLdAndMergeDifflistSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict ldbase_raregeno, const uint32_t* __restrict ldbase_difflist_sample_ids, uint32_t ldbase_difflist_len, uintptr_t ldbase_common_geno, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict merged_raregeno, uint32_t* __restrict merged_difflist_sample_ids, uint32_t* __restrict merged_difflist_len_ptr, uintptr_t* __restrict diff_from_ldbase_raregeno_iter) {
  // Used when the ldbase variant was saved as a difflist, and it's useful to
  // process the current variant as a difflist.
  // * Assumes ldbase_difflist_sample_ids[ldbase_difflist_len]==sample_ct.
  // * Assumes sample_include == nullptr if no subsetting needed.  (Otherwise,
  //   it'll still work, but performance will be worse.)
  // Trailing bits of merged_raregeno may not be zeroed out.
  // Caller is responsible for inverting ldbase_common_geno and merged_raregeno
  // afterward if necessary.
  //
  // This is a merge of two sample-index-sorted difflists: the cached ldbase
  // difflist, and the newly-parsed diff-from-ldbase difflist.  At equal
  // sample indexes the new entry wins; new entries whose genotype equals
  // ldbase_common_geno are dropped (they revert to the common value).
  assert(ldbase_difflist_sample_ids[ldbase_difflist_len] == sample_ct);
  uint32_t diff_from_ldbase_len;
  const unsigned char* group_info_iter;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, diff_from_ldbase_raregeno_iter, &group_info_iter, &diff_from_ldbase_len);
  if (unlikely(reterr)) {
    return reterr;
  }
  if (!diff_from_ldbase_len) {
    // current variant is identical to the ldbase variant.
    memcpy(merged_difflist_sample_ids, ldbase_difflist_sample_ids, ldbase_difflist_len * sizeof(int32_t));
    *merged_difflist_len_ptr = ldbase_difflist_len;
    CopyNyparr(ldbase_raregeno, ldbase_difflist_len, merged_raregeno);
    return kPglRetSuccess;
  }
  const uint32_t subgroup_idx_last = (diff_from_ldbase_len - 1) / kBitsPerWordD2;
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  uintptr_t* merged_raregeno_iter = merged_raregeno;
  uint32_t* merged_difflist_sample_ids_iter = merged_difflist_sample_ids;
  uintptr_t merged_raregeno_word = 0;
  uintptr_t ldbase_raregeno_word = 0;
  uintptr_t diff_from_ldbase_raregeno_word = 0;
  uint32_t ldbase_sample_idx = ldbase_difflist_sample_ids[0];
  uintptr_t raw_sample_idx = 0;
  uintptr_t cur_geno = 0;
  uint32_t sample_idx = 0;
  uint32_t ldbase_difflist_idx = 0;
  uint32_t done = 0;
  uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1;
  uint32_t merge_idx_lowbits = 0;
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    uint32_t diff_from_ldbase_idx_lowbits = 0;
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        // new difflist exhausted; enter the merge step once more with
        // sample_idx == sample_ct (the sentinel) to flush the remaining
        // ldbase entries.
        done = 1;
        sample_idx = sample_ct;
        goto ParseLdAndMergeDifflistSubset_finish;
      }
      // last subgroup may be partial.
      subgroup_len_m1 &= diff_from_ldbase_len - 1;
    }
    // new absolute sample index every 64 entries; vint deltas in between.
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
      raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    diff_from_ldbase_raregeno_word = *diff_from_ldbase_raregeno_iter++;
    for (; ; ++diff_from_ldbase_idx_lowbits) {
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
      cur_geno = diff_from_ldbase_raregeno_word & 3;
      if ((!sample_include) || IsSet(sample_include, raw_sample_idx)) {
        sample_idx = sample_include? RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx) : raw_sample_idx;
      ParseLdAndMergeDifflistSubset_finish:
        // copy ldbase entries with smaller sample indexes than the current
        // new entry (all remaining ones when sample_idx == sample_ct).
        while (ldbase_sample_idx < sample_idx) {
          // replace with blocked copy?
          if (!(ldbase_difflist_idx % kBitsPerWordD2)) {
            ldbase_raregeno_word = ldbase_raregeno[ldbase_difflist_idx / kBitsPerWordD2];
          }
          *merged_difflist_sample_ids_iter++ = ldbase_sample_idx;
          merged_raregeno_word |= (ldbase_raregeno_word & 3) << (2 * merge_idx_lowbits);
          if (merge_idx_lowbits++ == (kBitsPerWordD2 - 1)) {
            *merged_raregeno_iter++ = merged_raregeno_word;
            merged_raregeno_word = 0;
            merge_idx_lowbits = 0;
          }
          ++ldbase_difflist_idx;
          ldbase_raregeno_word >>= 2;
          ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx];
        }
        if (ldbase_sample_idx == sample_idx) {
          if (done) {
            // both lists exhausted (sample_idx == sample_ct sentinel matched);
            // flush the final partial word and report the merged length.
            if (merge_idx_lowbits) {
              *merged_raregeno_iter = merged_raregeno_word;
            }
            *merged_difflist_len_ptr = merged_difflist_sample_ids_iter - merged_difflist_sample_ids;
            return kPglRetSuccess;
          }
          // same sample in both lists: skip the ldbase entry, the new entry
          // overrides it.
          if (!(ldbase_difflist_idx % kBitsPerWordD2)) {
            ldbase_raregeno_word = ldbase_raregeno[ldbase_difflist_idx / kBitsPerWordD2];
          }
          ++ldbase_difflist_idx;
          ldbase_raregeno_word >>= 2;
          ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx];
        }
        if (cur_geno != ldbase_common_geno) {
          // emit the new entry, unless it just restores the common genotype.
          *merged_difflist_sample_ids_iter++ = sample_idx;
          merged_raregeno_word |= cur_geno << (2 * merge_idx_lowbits);
          if (merge_idx_lowbits++ == (kBitsPerWordD2 - 1)) {
            *merged_raregeno_iter++ = merged_raregeno_word;
            merged_raregeno_word = 0;
            merge_idx_lowbits = 0;
          }
        }
      }
      if (diff_from_ldbase_idx_lowbits == subgroup_len_m1) {
        break;
      }
      raw_sample_idx += GetVint31(fread_end, fread_pp);
      diff_from_ldbase_raregeno_word >>= 2;
    }
  }
}
2072
2073 /*
2074 void PrunedDifflistToGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t sample_ct, uint32_t difflist_common_geno, uint32_t difflist_len, uintptr_t* __restrict genovec) {
2075 // Designed to be used after genovec subsetting. Assumes all difflist
2076 // entries are valid. Ok for trailing bits of raregeno to be nonzero. Does
2077 // not zero out trailing bits of genovec.
2078 const uint32_t vec_ct = NypCtToVecCt(sample_ct);
2079 vecset(genovec, difflist_common_geno * kMask5555, vec_ct);
2080 if (!difflist_len) {
2081 return;
2082 }
2083 const uintptr_t* raregeno_incr = raregeno;
2084 const uint32_t* difflist_sample_ids_iter = difflist_sample_ids;
2085 const uint32_t* difflist_sample_ids_end = &(difflist_sample_ids[difflist_len]);
2086 // don't think there's a point to separating out the
2087 // difflist_common_geno == 0 case here, since the RawToSubsettedPos
2088 // operation is a bit expensive
2089 while (1) {
2090 // er, get rid of this undefined behavior if we uncomment this function
2091 const uint32_t* difflist_sample_ids_stop = &(difflist_sample_ids_iter[kBitsPerWordD2]);
2092 uintptr_t raregeno_word = *raregeno_incr++;
2093 if (difflist_sample_ids_stop > difflist_sample_ids_end) {
2094 if (difflist_sample_ids_iter == difflist_sample_ids_end) {
2095 return;
2096 }
2097 difflist_sample_ids_stop = difflist_sample_ids_end;
2098 }
2099 while (1) {
2100 const uint32_t cur_sample_idx = *difflist_sample_ids_iter;
2101 const uint32_t cur_subsetted_pos = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, cur_sample_idx);
2102 AssignNyparrEntry(cur_subsetted_pos, raregeno_word & 3, genovec);
2103 if (difflist_sample_ids_iter++ == difflist_sample_ids_stop) {
2104 break;
2105 }
2106 raregeno_word >>= 2;
2107 }
2108 }
2109 }
2110 */
2111
PglErr ParseAndApplyDifflist(const unsigned char* fread_end, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
  // Decodes the difflist starting at *fread_pp and patches each listed
  // {raw sample index, genotype} entry into genoarr; genoarr is expected to
  // already hold the difflist-common genotype everywhere else.
  // Returns kPglRetMalformedInput if a decoded sample index is out of range.
  // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
  // Cannot occur after genoarr subsetting since the difflist sample indexes
  // will be incorrect.
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
  const unsigned char* group_info_iter;
  uint32_t difflist_len;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len);
  if (reterr || (!difflist_len)) {
    return reterr;
  }
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
  uintptr_t raw_sample_idx = 0;
  // One outer iteration per kBitsPerWordD2 difflist entries, i.e. one word of
  // packed 2-bit raregeno values.
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        return kPglRetSuccess;
      }
      // Final subgroup may be partial; since kBitsPerWordD2 is a power of 2,
      // this mask computes (difflist_len - 1) % kBitsPerWordD2.
      remaining_deltas_in_subgroup &= difflist_len - 1;
    }
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
      // At each group boundary, the sample index is stored as an absolute
      // value (sample_id_byte_ct bytes) instead of a vint-encoded delta.
      raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
    // This loop tends to be the decompression bottleneck. Tried to modify it
    // to process 4 entries at a time, but that didn't end up helping.
    for (; ; --remaining_deltas_in_subgroup) {
      // always check, since otherwise AssignNyparrEntry() can scribble
      // over arbitrary memory
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
      const uintptr_t cur_geno = cur_raregeno_word & 3;
      AssignNyparrEntry(raw_sample_idx, cur_geno, genoarr);
      if (!remaining_deltas_in_subgroup) {
        break;
      }
      raw_sample_idx += GetVint31(fread_end, fread_pp);
      cur_raregeno_word >>= 2;
    }
  }
}
2160
2161 // could merge ParseAndApplyDifflist() with this?
PglErr ParseAndApplyDifflistSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
  // Like ParseAndApplyDifflist(), but genoarr is indexed by subsetted sample
  // position: entries whose raw sample index is not in sample_include are
  // skipped, and the rest are remapped through
  // sample_include_cumulative_popcounts before being written.
  // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  if (sample_ct == raw_sample_ct) {
    // No subsetting: delegate to the simpler direct-indexing routine.
    return ParseAndApplyDifflist(fread_end, fread_pp, pgrp, genoarr);
  }
  uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
  const unsigned char* group_info_iter;
  uint32_t difflist_len;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len);
  if (reterr || (!difflist_len)) {
    return reterr;
  }
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
  uintptr_t raw_sample_idx = 0;
  // One outer iteration per word of packed 2-bit raregeno values.
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        return kPglRetSuccess;
      }
      // Final subgroup may be partial; mask yields
      // (difflist_len - 1) % kBitsPerWordD2.
      remaining_deltas_in_subgroup &= difflist_len - 1;
    }
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
      // Group boundary: absolute sample index instead of vint delta.
      raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
    // This loop tends to be the decompression bottleneck. Tried to modify it
    // to process 4 entries at a time, but that didn't end up helping.
    for (; ; --remaining_deltas_in_subgroup) {
      // always check, since otherwise AssignNyparrEntry() can scribble
      // over arbitrary memory
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
      if (IsSet(sample_include, raw_sample_idx)) {
        const uintptr_t cur_geno = cur_raregeno_word & 3;
        AssignNyparrEntry(RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx), cur_geno, genoarr);
      }
      if (!remaining_deltas_in_subgroup) {
        break;
      }
      raw_sample_idx += GetVint31(fread_end, fread_pp);
      cur_raregeno_word >>= 2;
    }
  }
}
2213
2214 // vector-alignment preferred
PglErr ParseOnebitUnsafe(const unsigned char* fread_end, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
  // Decodes a 1-bit-per-sample main track: a leading byte describing the two
  // most common genotype codes, then a bitarray selecting between them, then
  // a difflist patching the remaining (rare) genotypes.
  // doesn't zero out trailing genoarr bits
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  // 1 leading byte + ceil(raw_sample_ct / 8) bitarray bytes.
  const uint32_t common2_and_bitarray_byte_ct = (raw_sample_ct + 15) / CHAR_BIT;
  const unsigned char* onebit_main_iter = *fread_pp;
  if (PtrAddCk(fread_end, common2_and_bitarray_byte_ct, fread_pp)) {
    return kPglRetMalformedInput;
  }
  // Leading byte: bits 2+ hold the first common genotype code (replicated
  // into every 2-bit slot of word_base); low 2 bits hold the value added to
  // it when the corresponding bitarray bit is set.
  const uintptr_t common2_code = *onebit_main_iter++;
  const uintptr_t word_base = (common2_code / 4) * kMask5555;
  const uintptr_t common_code_delta = common2_code & 3;
  uint32_t genoarr_widx = 0;
#if defined(__LP64__) && !defined(USE_AVX2)
  // this is slower in AVX2 case
  const uint32_t read_hw_ct = raw_sample_ct / kBitsPerWordD2;
  if (read_hw_ct >= 2 * kWordsPerVec) {
    // Vectorized main loop: expand each input byte (8 samples) into 16
    // genotype bits, one output vector pair per input vector.
    const uint32_t read_vec_ct = raw_sample_ct / kBitsPerVec;
    const VecW* onebit_main_valias = R_CAST(const VecW*, onebit_main_iter);
    const VecW m4 = VCONST_W(kMask0F0F);
# ifdef USE_SSE42
    // 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, 85 if the codes
    // are 0 and 1
    const VecW lookup = {word_base + common_code_delta * 0x1514111005040100LLU,
                         word_base + common_code_delta * 0x5554515045444140LLU};
# else
    const VecW m1 = VCONST_W(kMask5555);
    const VecW m2 = VCONST_W(kMask3333);
    const VecW vec_base = VCONST_W(word_base);
    const VecW vec_delta = VCONST_W(common_code_delta * kMask5555);
# endif
    VecW* genoarr_valias = R_CAST(VecW*, genoarr);
    for (uint32_t vidx = 0; vidx != read_vec_ct; ++vidx) {
      const VecW cur_vec = vecw_loadu(&(onebit_main_valias[vidx]));
      // Split each byte into low/high nibbles (4 samples each), then
      // interleave so each output vector covers a contiguous sample range.
      const VecW vec_even = cur_vec & m4;
      const VecW vec_odd = vecw_srli(cur_vec, 4) & m4;
      VecW vec_lo = vecw_unpacklo8(vec_even, vec_odd);
      VecW vec_hi = vecw_unpackhi8(vec_even, vec_odd);
# ifdef USE_SSE42
      // Nibble -> 8 genotype bits via byte shuffle against the lookup table.
      vec_lo = vecw_shuffle8(lookup, vec_lo);
      vec_hi = vecw_shuffle8(lookup, vec_hi);
# else
      // unpack bytes, then use as mask for vec_add.
      vec_lo = (vec_lo | vecw_slli(vec_lo, 2)) & m2;
      vec_hi = (vec_hi | vecw_slli(vec_hi, 2)) & m2;
      vec_lo = (vec_lo | vecw_slli(vec_lo, 1)) & m1;
      vec_hi = (vec_hi | vecw_slli(vec_hi, 1)) & m1;
      vec_lo = vec_lo | vecw_slli(vec_lo, 1);
      vec_hi = vec_hi | vecw_slli(vec_hi, 1);
      vec_lo = vec_base + (vec_delta & vec_lo);
      vec_hi = vec_base + (vec_delta & vec_hi);
# endif
      genoarr_valias[2 * vidx] = vec_lo;
      genoarr_valias[2 * vidx + 1] = vec_hi;
    }
    genoarr_widx = read_vec_ct * (2 * kWordsPerVec);
  }
#endif
  // Scalar tail (and entire loop in the 32-bit / AVX2 cases): one halfword of
  // input bits produces one word of 2-bit genotypes.
  const uint32_t genoarr_widx_trail = (raw_sample_ct + 7) / kBitsPerWordD2;
  const uint32_t genoarr_widx_end = NypCtToWordCt(raw_sample_ct);
# ifdef __arm__
#  error "Unaligned accesses in ParseOnebitUnsafe()."
# endif
  const Halfword* onebit_main_alias = R_CAST(const Halfword*, onebit_main_iter);
  for (; ; ++genoarr_widx) {
    uintptr_t ww;
    if (genoarr_widx >= genoarr_widx_trail) {
      // might want to modify to not go here if last read is an entire halfword
      if (genoarr_widx == genoarr_widx_end) {
        break;
      }
      // Final partial halfword: bounded load of only the bytes present.
      ww = ProperSubwordLoad(&(onebit_main_alias[genoarr_widx_trail]), 1 + (((raw_sample_ct - 1) % kBitsPerWordD2) / CHAR_BIT));
    } else {
      ww = onebit_main_alias[genoarr_widx];
    }
    // apply middle-out operation
    // 64-bit:
    //   const uintptr_t middle_out_result = (ww | (ww << 31)) & kMask5555;
    // 32-bit:
    //   *genoarr_iter++ = word_base + (ww & kMask5555) * common_code_delta;
    //   *genoarr_iter++ = word_base + ((ww >> 1) & kMask5555) * common_code_delta;
    // (scrapped since the time savings don't seem to be worth the extra
    // end-of-vector corner cases, apparently the extra operations here are
    // sufficiently cheap, or even negative-cost in AVX2 case)

    // Spread the halfword's bits to even positions; each set bit then adds
    // common_code_delta to its 2-bit slot.
    ww = UnpackHalfwordToWord(ww);
    genoarr[genoarr_widx] = word_base + ww * common_code_delta;
  }
  // Rare genotypes are stored as a trailing difflist.
  return ParseAndApplyDifflist(fread_end, fread_pp, pgrp, genoarr);
}
2304
2305 // vector-alignment preferred
Parse1or2bitGenoarrUnsafe(const unsigned char * fread_end,uint32_t vrtype,const unsigned char ** fread_pp,PgenReaderMain * pgrp,uintptr_t * __restrict genoarr)2306 PglErr Parse1or2bitGenoarrUnsafe(const unsigned char* fread_end, uint32_t vrtype, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
2307 // Side effect: may use pgrp->workspace_raregeno_tmp_loadbuf.
2308 // Does not update fp_vidx, does not rotate plink1-formatted data (since it's
2309 // better to do that post-subsetting)
2310 if (vrtype & 3) {
2311 return ParseOnebitUnsafe(fread_end, fread_pp, pgrp, genoarr);
2312 }
2313 // uncompressed storage
2314 const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
2315 const uint32_t genoarr_byte_ct = NypCtToByteCt(raw_sample_ct);
2316 const unsigned char* src_genodata = *fread_pp;
2317 if (PtrAddCk(fread_end, genoarr_byte_ct, fread_pp)) {
2318 return kPglRetMalformedInput;
2319 }
2320 memcpy(genoarr, src_genodata, genoarr_byte_ct);
2321 return kPglRetSuccess;
2322 }
2323
PglErr ParseNonLdGenovecSubsetUnsafe(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vrtype, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genovec) {
  // Decodes a non-LD-compressed main genotype track into a (possibly
  // sample-subsetted) genovec.
  // Side effects:
  //   may use pgrp->workspace_raregeno_tmp_loadbuf
  //   fills pgrp->ldbase_raw_genovec iff (!(vrtype & 4)) and
  //     subsetting_required (does not update ldbase_stypes, caller's
  //     responsibility to care)
  // See comments on Parse1or2bitGenoarrUnsafe().
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  if (!(vrtype & 4)) {
    // 1-bit or uncompressed 2-bit storage: decode at full raw-sample width
    // (into ldbase_raw_genovec when subsetting), then subset into genovec.
    const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
    uintptr_t* raw_genovec = subsetting_required? pgrp->ldbase_raw_genovec : genovec;
    PglErr reterr = Parse1or2bitGenoarrUnsafe(fread_end, vrtype, fread_pp, pgrp, raw_genovec);
    if ((!subsetting_required) || reterr) {
      return reterr;
    }
    CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
    return kPglRetSuccess;
  }
  // Difflist storage: fill genovec with the common genotype, then patch.
  const uint32_t vrtype_low2 = vrtype & 3;
  if (vrtype_low2 != 1) {
    const uint32_t vec_ct = NypCtToVecCt(sample_ct);

    // This memset is frequently the limiting operation. This suggests that we
    // should eventually make more use of the DifflistOrGenovec interface.
    vecset(genovec, vrtype_low2 * kMask5555, vec_ct);
    return ParseAndApplyDifflistSubset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, fread_pp, pgrp, genovec);
  }
  // all homozygous-ref special case
  ZeroWArr(NypCtToWordCt(sample_ct), genovec);
  return kPglRetSuccess;
}
2355
BoolErr InitReadPtrs(uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp) {
  // Points *fread_pp / *fread_endp at the raw record for variant vidx:
  // directly into the preloaded/mapped block when one exists, otherwise into
  // pgrp->fread_buf after seeking/reading from pgrp->ff.  Returns 1 on
  // seek or read failure.  Side effect: sets pgrp->fp_vidx = vidx + 1.
  const unsigned char* block_base = pgrp->fi.block_base;
  if (block_base != nullptr) {
    // possible todo: special handling of end of vblock
    const uint64_t block_offset = pgrp->fi.block_offset;
    *fread_pp = &(block_base[GetPgfiFpos(&(pgrp->fi), vidx) - block_offset]);
    *fread_endp = &(block_base[GetPgfiFpos(&(pgrp->fi), vidx + 1) - block_offset]);

    // still a useful hint to LdLoadNecessary()
    pgrp->fp_vidx = vidx + 1;

    return 0;
  }
  // File-backed path: only seek when the file position isn't already at the
  // start of record vidx.
  if (pgrp->fp_vidx != vidx) {
    if (unlikely(fseeko(pgrp->ff, GetPgfiFpos(&(pgrp->fi), vidx), SEEK_SET))) {
      return 1;
    }
  }
  const uintptr_t cur_vrec_width = GetPgfiVrecWidth(&(pgrp->fi), vidx);
#ifdef __LP64__
  if (unlikely(fread_checked(pgrp->fread_buf, cur_vrec_width, pgrp->ff))) {
    return 1;
  }
#else
  // cur_vrec_width < 2^31 since otherwise we error out on initialization
  if (unlikely(!fread_unlocked(pgrp->fread_buf, cur_vrec_width, 1, pgrp->ff))) {
    return 1;
  }
#endif
  *fread_pp = pgrp->fread_buf;
  *fread_endp = &(pgrp->fread_buf[cur_vrec_width]);
  pgrp->fp_vidx = vidx + 1;
  return 0;
}
2390
uint32_t LdLoadNecessary(uint32_t cur_vidx, PgenReaderMain* pgrp) {
  // Reports whether the LD base variant must be (re)loaded before decoding
  // cur_vidx's (possibly subsetted) hardcalls.  Side effect: refreshes
  // pgrp->ldbase_vidx when it has gone stale.
  // bugfix (22 May 2018): this only checked whether ldbase_stypes was nonzero;
  // there was an AllHets + cache-clear edge case where that's not good enough.
  // now that AllHets has been removed, though, it should be safe again.
  if ((cur_vidx == pgrp->fp_vidx) && pgrp->ldbase_stypes) {
    assert(pgrp->ldbase_stypes & (kfPgrLdcacheNyp | kfPgrLdcacheDifflist | kfPgrLdcacheRawNyp));
    // We just decoded the immediately preceding variant and the cache wasn't
    // cleared, so the cached ldbase contents are still current.
    return 0;
  }
  // Locate the last variant before cur_vidx whose vrtypes[] value has bit 1
  // unset or bit 2 set; that variant is the LD base.
  const uint32_t prev_ldbase_vidx = pgrp->ldbase_vidx;
  const uint32_t cur_ldbase_vidx = GetLdbaseVidx(pgrp->fi.vrtypes, cur_vidx);
  if (cur_ldbase_vidx == prev_ldbase_vidx) {
    return 0;
  }
  pgrp->ldbase_vidx = cur_ldbase_vidx;
  return 1;
}
2415
2416 // Fills dest with subsetted ldbase contents, and ensures ldcache is filled so
2417 // no explicit reload of ldbase is needed for next variant if we're extracting
2418 // the same sample subset. (Reload is occasionally needed if next variant is
2419 // multiallelic or phased, we only prevent that when convenient.)
PglErr LdLoadAndCopyGenovecSubset(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uintptr_t* dest) {
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  if (LdLoadNecessary(vidx, pgrp)) {
    // Cache miss: read and decode the LD base record from scratch.
    const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
    const unsigned char* fread_ptr;
    const unsigned char* fread_end;
    if (unlikely(InitReadPtrs(ldbase_vidx, pgrp, &fread_ptr, &fread_end))) {
      return kPglRetReadFail;
    }
    const uint32_t vrtype = pgrp->fi.vrtypes[ldbase_vidx];
    PglErr reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, dest);
    // ParseNonLdGenovecSubsetUnsafe() fills ldbase_raw_genovec only when
    // subsetting occurred and the record wasn't difflist-compressed; record
    // that in ldbase_stypes accordingly.
    pgrp->ldbase_stypes = ((sample_ct != raw_sample_ct) && (!(vrtype & 4)))? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp) : kfPgrLdcacheNyp;
    CopyNyparr(dest, sample_ct, pgrp->ldbase_genovec);
    return reterr;
  }
  if (pgrp->ldbase_stypes & kfPgrLdcacheNyp) {
    // Subsetted genovec already cached: plain copy.
    CopyNyparr(pgrp->ldbase_genovec, sample_ct, dest);
  } else {
    if ((pgrp->ldbase_stypes & kfPgrLdcacheRawNyp) && (sample_ct == raw_sample_ct)) {
      // No subsetting, so the cached raw genovec is usable as-is.
      CopyNyparr(pgrp->ldbase_raw_genovec, sample_ct, dest);
    } else if (pgrp->ldbase_stypes & kfPgrLdcacheDifflist) {
      // rematerialize-from-difflist is cheap.
      PgrDifflistToGenovecUnsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3, sample_ct, pgrp->ldbase_difflist_len, dest);
    } else {
      // Subset the cached raw genovec, and save the subsetted result so the
      // next query for the same subset skips this work.
      CopyNyparrNonemptySubset(pgrp->ldbase_raw_genovec, sample_include, pgrp->fi.raw_sample_ct, sample_ct, dest);
      CopyNyparr(dest, sample_ct, pgrp->ldbase_genovec);
      pgrp->ldbase_stypes |= kfPgrLdcacheNyp;
    }
  }
  return kPglRetSuccess;
}
2451
2452 // fread_pp should be non-null iff this is being called by an internal function
2453 // as part of a more complex read.
2454 // in multiallelic case:
2455 // hom-ref = 0
2456 // het-ref = 1
2457 // two nonref = 2
2458 // missing = 3
PglErr ReadGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec) {
  // Core hardcall-decoding routine: fills genovec with the (possibly
  // subsetted) 2-bit genotypes of variant vidx, maintaining the LD cache so
  // sequential access stays cheap.
  // Side effects:
  //   may use pgr.workspace_raregeno_tmp_loadbuf (any difflist)
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  const uint32_t maintrack_vrtype = vrtype & 7;
  if (VrtypeLdCompressed(maintrack_vrtype)) {
    // LD compression: start from the cached LD base genovec, then apply this
    // variant's difflist of differences from the base.
    PglErr reterr = LdLoadAndCopyGenovecSubset(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, genovec);
    if (unlikely(reterr)) {
      return reterr;
    }
    const unsigned char* fread_ptr;
    const unsigned char* fread_end;
    if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
      return kPglRetReadFail;
    }
    reterr = ParseAndApplyDifflistSubset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, &fread_ptr, pgrp, genovec);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (maintrack_vrtype == 3) {
      // vrtype 3 stores the variant LD-compressed against the inverse of the
      // base, so undo the inversion here.
      GenovecInvertUnsafe(sample_ct, genovec);
    }
    if (fread_pp) {
      *fread_pp = fread_ptr;
      *fread_endp = fread_end;
    }
    return kPglRetSuccess;
  }
  const unsigned char* fread_ptr;
  const unsigned char* fread_end = nullptr;  // maybe-uninitialized warning
  // tried inserting special-case code for the plink1 case to avoid a copy, and
  // it was actually slower
  if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
    return kPglRetReadFail;
  }
  // tried to add more sophisticated caching, but turns out it isn't worth it
  PglErr reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, maintrack_vrtype, &fread_ptr, pgrp, genovec);
  if (unlikely(reterr)) {
    return reterr;
  }
  if (vrtype == kPglVrtypePlink1) {
    PgrPlink1ToPlink2InplaceUnsafe(sample_ct, genovec);
  } else {
    // If the next variant is LD-compressed relative to this one, cache this
    // genovec as the LD base now, while it's in hand.
    const uint32_t is_ldbase = pgrp->fi.vrtypes && VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
    const uint32_t ldbase_raw_genovec_saved = (sample_ct != pgrp->fi.raw_sample_ct) && (!(maintrack_vrtype & 4));
    if (is_ldbase) {
      CopyNyparr(genovec, sample_ct, pgrp->ldbase_genovec);
      pgrp->ldbase_vidx = vidx;
      // may be better to just always set to kfPgrLdcacheNyp? this depends
      // on multiallelic code
      pgrp->ldbase_stypes = ldbase_raw_genovec_saved? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp) : kfPgrLdcacheNyp;
    } else if (ldbase_raw_genovec_saved) {
      // bugfix (22 Sep 2018): when accessing variants out of order, need to
      // note that we just clobbered the cache
      pgrp->ldbase_stypes &= ~kfPgrLdcacheRawNyp;
    }
  }
  if (fread_pp) {
    *fread_pp = fread_ptr;
    *fread_endp = fread_end;
  }
  return kPglRetSuccess;
}
2523
PgrGet(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict genovec)2524 PglErr PgrGet(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec) {
2525 if (!sample_ct) {
2526 return kPglRetSuccess;
2527 }
2528 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
2529 assert(vidx < pgrp->fi.raw_variant_ct);
2530 return ReadGenovecSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
2531 }
2532
2533 // Fills dest with ldbase contents, and ensures ldcache is filled so no
2534 // explicit reload of ldbase is needed for next variant.
PglErr LdLoadAndCopyRawGenovec(uint32_t subsetting_required, uint32_t vidx, PgenReaderMain* pgrp, uintptr_t* dest) {
  const uint32_t genovec_byte_ct = NypCtToVecCt(pgrp->fi.raw_sample_ct) * kBytesPerVec;
  // Reload when the cache is stale, or when subsetting is in effect but the
  // cache doesn't hold the raw (unsubsetted) genovec we need here.
  if (LdLoadNecessary(vidx, pgrp) || (subsetting_required && (!(pgrp->ldbase_stypes & kfPgrLdcacheRawNyp)))) {
    const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
    const unsigned char* fread_ptr;
    const unsigned char* fread_end;
    if (unlikely(InitReadPtrs(ldbase_vidx, pgrp, &fread_ptr, &fread_end))) {
      return kPglRetReadFail;
    }
    const uint32_t vrtype = pgrp->fi.vrtypes[ldbase_vidx];
    pgrp->ldbase_stypes = kfPgrLdcacheRawNyp;
    assert((vrtype & 7) != 5);  // all-hom-ref can't be ldbase
    uintptr_t* raw_genovec = pgrp->ldbase_raw_genovec;
    PglErr reterr;
    if (!(vrtype & 4)) {
      // 1-bit or uncompressed 2-bit storage.
      reterr = Parse1or2bitGenoarrUnsafe(fread_end, vrtype, &fread_ptr, pgrp, raw_genovec);
    } else {
      // Difflist storage: fill with the common genotype, then patch.
      const uint32_t vrtype_low2 = vrtype & 3;
      vecset(raw_genovec, vrtype_low2 * kMask5555, DivUp(genovec_byte_ct, kBytesPerVec));
      reterr = ParseAndApplyDifflist(fread_end, &fread_ptr, pgrp, raw_genovec);
    }
    memcpy(dest, raw_genovec, genovec_byte_ct);
    return reterr;
  }
  if (pgrp->ldbase_stypes & kfPgrLdcacheRawNyp) {
    memcpy(dest, pgrp->ldbase_raw_genovec, genovec_byte_ct);
  } else {
    // no subsetting, can use regular Ldcache entries
    const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
    if (pgrp->ldbase_stypes & kfPgrLdcacheNyp) {
      memcpy(dest, pgrp->ldbase_genovec, genovec_byte_ct);
    } else {
      // Rematerialize the genovec from the cached difflist.
      PgrDifflistToGenovecUnsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3, raw_sample_ct, pgrp->ldbase_difflist_len, dest);
    }
  }
  return kPglRetSuccess;
}
2572
2573 // Does not zero out trailing bits.
2574 // Requires fread_pp and fread_endp to be non-null for now.
PglErr ReadRawGenovec(uint32_t subsetting_required, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* raw_genovec) {
  // Fills raw_genovec with variant vidx's unsubsetted hardcalls, and leaves
  // *fread_pp / *fread_endp positioned on the variant record.
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  const uint32_t maintrack_vrtype = vrtype & 7;
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  if (VrtypeLdCompressed(maintrack_vrtype)) {
    // LD compression: copy the cached LD base genovec, then apply this
    // variant's difflist of differences.
    PglErr reterr = LdLoadAndCopyRawGenovec(subsetting_required, vidx, pgrp, raw_genovec);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (unlikely(InitReadPtrs(vidx, pgrp, fread_pp, fread_endp))) {
      return kPglRetReadFail;
    }
    reterr = ParseAndApplyDifflist(*fread_endp, fread_pp, pgrp, raw_genovec);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (maintrack_vrtype == 3) {
      // vrtype 3 is LD-compressed against the inverse of the base.
      GenovecInvertUnsafe(raw_sample_ct, raw_genovec);
    }
    return kPglRetSuccess;
  }
  if (unlikely(InitReadPtrs(vidx, pgrp, fread_pp, fread_endp))) {
    return kPglRetReadFail;
  }
  const unsigned char* fread_end = *fread_endp;
  PglErr reterr;
  if (!(vrtype & 4)) {
    // 1-bit or uncompressed 2-bit storage.
    reterr = Parse1or2bitGenoarrUnsafe(fread_end, vrtype, fread_pp, pgrp, raw_genovec);
  } else {
    const uint32_t vrtype_low2 = vrtype & 3;
    if (vrtype_low2 == 1) {
      ZeroWArr(NypCtToWordCt(raw_sample_ct), raw_genovec);
      // all-hom-ref can't be ldbase
      return kPglRetSuccess;
    }
    // Difflist storage: fill with the common genotype, then patch.
    const uint32_t vec_ct = NypCtToVecCt(raw_sample_ct);
    vecset(raw_genovec, vrtype_low2 * kMask5555, vec_ct);
    reterr = ParseAndApplyDifflist(fread_end, fread_pp, pgrp, raw_genovec);
  }
  if (vrtype == kPglVrtypePlink1) {
    PgrPlink1ToPlink2InplaceUnsafe(raw_sample_ct, raw_genovec);
  } else {
    // If the next variant is LD-compressed relative to this one, cache the
    // raw genovec as the LD base now.
    const uint32_t is_ldbase = pgrp->fi.vrtypes && VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
    if (is_ldbase) {
      CopyNyparr(raw_genovec, raw_sample_ct, pgrp->ldbase_raw_genovec);
      pgrp->ldbase_vidx = vidx;
      pgrp->ldbase_stypes = kfPgrLdcacheRawNyp;
    }
  }
  return reterr;
}
2627 /*
2628 void CopyAndSubsetDifflist(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_raregeno, const uint32_t* __restrict raw_difflist_sample_ids, uint32_t raw_difflist_len, uintptr_t* __restrict new_raregeno, uint32_t* __restrict new_difflist_sample_ids, uint32_t* __restrict new_difflist_len_ptr) {
2629 // Trailing bits of new_raregeno are zeroed out.
2630 if (!raw_difflist_len) {
2631 *new_difflist_len_ptr = 0;
2632 return;
2633 }
2634 const uintptr_t* raw_raregeno_incr = raw_raregeno;
2635 const uint32_t* raw_difflist_sample_ids_iter = raw_difflist_sample_ids;
2636 const uint32_t* raw_difflist_sample_ids_last = &(raw_difflist_sample_ids[RoundDownPow2(raw_difflist_len - 1, kBitsPerWordD2)]);
2637 uintptr_t* new_raregeno_incr = new_raregeno;
2638 uintptr_t new_raregeno_word = 0;
2639 uint32_t new_difflist_len = 0;
2640 uint32_t block_len_m1 = kBitsPerWordD2 - 1;
2641 while (1) {
2642 if (raw_difflist_sample_ids_iter >= raw_difflist_sample_ids_last) {
2643 if (raw_difflist_sample_ids_iter > raw_difflist_sample_ids_last) {
2644 if (new_difflist_len % kBitsPerWordD2) {
2645 *new_raregeno_incr = new_raregeno_word;
2646 }
2647 *new_difflist_len_ptr = new_difflist_len;
2648 return;
2649 }
2650 block_len_m1 &= raw_difflist_len - 1;
2651 }
2652 uintptr_t raw_raregeno_word = *raw_raregeno_incr++;
2653 uint32_t raw_difflist_idx_lowbits = 0;
2654 while (1) {
2655 const uint32_t raw_sample_idx = raw_difflist_sample_ids_iter[raw_difflist_idx_lowbits];
2656 if (IsSet(sample_include, raw_sample_idx)) {
2657 new_difflist_sample_ids[new_difflist_len] = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx);
2658 new_raregeno_word |= ((raw_raregeno_word >> (2 * raw_difflist_idx_lowbits)) & 3) << (2 * (new_difflist_len % kBitsPerWordD2));
2659 ++new_difflist_len;
2660 if (!(new_difflist_len % kBitsPerWordD2)) {
2661 *new_raregeno_incr++ = new_raregeno_word;
2662 new_raregeno_word = 0;
2663 }
2664 }
2665 if (raw_difflist_idx_lowbits == block_len_m1) {
2666 break;
2667 }
2668 ++raw_difflist_idx_lowbits;
2669 }
2670 raw_difflist_sample_ids_iter = &(raw_difflist_sample_ids_iter[kBitsPerWordD2]);
2671 }
2672 }
2673 */
2674
2675 // Populates pgrp->ldbase_genovec or
2676 // pgrp->ldbase_{raregeno,difflist_sample_ids,difflist_len}, depending on
2677 // storage type.
2678 // Currently just called by ReadDifflistOrGenovecSubsetUnsafe(), which isn't
2679 // exploited by plink2 yet.
PglErr LdLoadMinimalSubsetIfNecessary(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp) {
  // Ensures vidx's LD base variant is cached in its cheapest form: a full
  // genovec for 1-bit/uncompressed records, or raregeno + difflist_sample_ids
  // for difflist-compressed records.  No-op when LdLoadNecessary() reports
  // the cache is already current.
  if (!LdLoadNecessary(vidx, pgrp)) {
    return kPglRetSuccess;
  }
  const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
  const uint64_t cur_vidx_fpos = pgrp->fi.var_fpos[ldbase_vidx];
  const uint32_t ldbase_vrtype = pgrp->fi.vrtypes[ldbase_vidx];
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  uintptr_t* raw_genovec = subsetting_required? pgrp->ldbase_raw_genovec : pgrp->ldbase_genovec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  const unsigned char* block_base = pgrp->fi.block_base;
  PglErr reterr = kPglRetSuccess;
  if (block_base != nullptr) {
    // Preloaded/mapped block: point directly at the record, no file I/O.
    {
      const uint64_t block_offset = pgrp->fi.block_offset;
      fread_ptr = &(block_base[cur_vidx_fpos - block_offset]);
      fread_end = &(block_base[pgrp->fi.var_fpos[ldbase_vidx + 1] - block_offset]);
    }
    if (!(ldbase_vrtype & 4)) {
      // Non-difflist storage: decode the full genovec, then subset if needed.
      reterr = Parse1or2bitGenoarrUnsafe(fread_end, ldbase_vrtype, &fread_ptr, pgrp, raw_genovec);
    LdLoadMinimalSubsetIfNecessary_genovec_finish:
      // (also reached from the file-backed branches below)
      pgrp->ldbase_stypes = subsetting_required? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp) : kfPgrLdcacheNyp;
      if ((!subsetting_required) || reterr) {
        return reterr;
      }
      CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, pgrp->ldbase_genovec);
      return kPglRetSuccess;
    }
    pgrp->fp_vidx = ldbase_vidx + 1;
  } else {
    if (unlikely(fseeko(pgrp->ff, pgrp->fi.var_fpos[ldbase_vidx], SEEK_SET))) {
      return kPglRetReadFail;
    }
    const uintptr_t cur_vrec_width = pgrp->fi.var_fpos[ldbase_vidx + 1] - cur_vidx_fpos;
    pgrp->fp_vidx = ldbase_vidx + 1;
    if (!(ldbase_vrtype & 7)) {
      // don't actually need to fread the whole record in this case
      const uint32_t raw_sample_ct4 = NypCtToByteCt(raw_sample_ct);
      if (unlikely(!fread_unlocked(raw_genovec, raw_sample_ct4, 1, pgrp->ff))) {
        return kPglRetReadFail;
      }
      if (raw_sample_ct4 != cur_vrec_width) {
        // We read only part of the record, so the file position isn't at the
        // start of variant ldbase_vidx + 1; clear fp_vidx so sequential-read
        // checks don't incorrectly match.
        // ensure this doesn't match
        pgrp->fp_vidx = 0;
      }
      goto LdLoadMinimalSubsetIfNecessary_genovec_finish;
    }
    if (unlikely(!fread_unlocked(pgrp->fread_buf, cur_vrec_width, 1, pgrp->ff))) {
      return kPglRetReadFail;
    }
    fread_ptr = pgrp->fread_buf;
    fread_end = &(pgrp->fread_buf[cur_vrec_width]);
    if (!(ldbase_vrtype & 4)) {
      reterr = ParseOnebitUnsafe(fread_end, &fread_ptr, pgrp, raw_genovec);
      goto LdLoadMinimalSubsetIfNecessary_genovec_finish;
    }
  }
  // Difflist storage: cache in difflist form rather than expanding.
  uint32_t ldbase_difflist_len;
  if (!subsetting_required) {
    reterr = ParseAndSaveDifflist(fread_end, raw_sample_ct, &fread_ptr, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, &ldbase_difflist_len);
  } else {
    reterr = ParseAndSaveDifflistProperSubset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_sample_ct, &fread_ptr, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, &ldbase_difflist_len, pgrp->workspace_raregeno_tmp_loadbuf);
  }
  if (unlikely(reterr)) {
    return reterr;
  }
  pgrp->ldbase_difflist_len = ldbase_difflist_len;
  // Write a terminator entry (sample_ct) after the last real sample id.
  pgrp->ldbase_difflist_sample_ids[ldbase_difflist_len] = sample_ct;
  pgrp->ldbase_stypes = kfPgrLdcacheDifflist;
  return kPglRetSuccess;
}
2753
// Loads the main genotype track for variant vidx, subsetted to
// sample_include when sample_ct != raw_sample_ct.
// Two possible output representations:
// * If the stored (or LD-merged) difflist is short enough, the result is
//   returned sparsely: *difflist_common_geno_ptr is set to the common
//   genotype value, and main_raregeno / difflist_sample_ids /
//   *difflist_len_ptr describe the exceptions.
// * Otherwise *difflist_common_geno_ptr is set to UINT32_MAX and the dense
//   result is written to genovec.
// If fread_pp is non-null, *fread_pp/*fread_endp are set to just past the
// main track so the caller can parse auxiliary tracks (fread_endp is assumed
// to be non-null in that case).
// Also refreshes pgrp's LD cache when the next variant LD-compresses against
// this one.
PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
  assert(vidx < pgrp->fi.raw_variant_ct);
  assert(sample_ct);
  assert(max_simple_difflist_len < sample_ct);
  // Side effects:
  //   may use pgrp->workspace_raregeno_tmp_loadbuf
  // Trailing bits of genovec/main_raregeno may not be zeroed out.
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  const uint32_t maintrack_vrtype = vrtype & 7;
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  // const uint32_t multiallelic_hc_present = fread_pp && VrtypeMultiallelic(vrtype);
  if (VrtypeLdCompressed(maintrack_vrtype)) {
    // LD compression

    // note that this can currently load a difflist longer than
    // max_simple_difflist_len
    PglErr reterr = LdLoadMinimalSubsetIfNecessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
    if (unlikely(reterr)) {
      return reterr;
    }
    const unsigned char* fread_ptr;
    const unsigned char* fread_end;
    if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
      return kPglRetReadFail;
    }
    // maintrack_vrtype == 3 <-> LD-compressed with inversion.
    const uint32_t ld_invert = (maintrack_vrtype == 3);
    if (pgrp->ldbase_stypes & kfPgrLdcacheDifflist) {
      // LD base is cached in difflist form: merge the current variant's
      // deltas into it and return the merged difflist.
      const uint32_t ldbase_common_geno = pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3;
      // unnecessary for this to branch on LD difflist length, since that's
      // limited to 3/4 of the ldbase difflist length.
      *difflist_common_geno_ptr = ldbase_common_geno;
      reterr = ParseLdAndMergeDifflistSubset(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->ldbase_difflist_len, ldbase_common_geno, raw_sample_ct, sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, pgrp->workspace_raregeno_tmp_loadbuf);
      if (unlikely(reterr)) {
        return reterr;
      }
      if (ld_invert) {
        // (6 - g) & 3 swaps genotype codes 0 and 2, and leaves 1 and 3
        // unchanged.
        *difflist_common_geno_ptr = (6 - ldbase_common_geno) & 3;
        GenovecInvertUnsafe(*difflist_len_ptr, main_raregeno);
      }
      return kPglRetSuccess;
    }
    if (pgrp->ldbase_stypes & kfPgrLdcacheNyp) {
      CopyNyparr(pgrp->ldbase_genovec, sample_ct, genovec);
    } else {
      // Only the raw (unsubsetted) genovec is cached; subset it now, and save
      // the subsetted copy back into the cache for later queries.
      assert(pgrp->ldbase_stypes & kfPgrLdcacheRawNyp);
      CopyNyparrNonemptySubset(pgrp->ldbase_raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
      CopyNyparr(genovec, sample_ct, pgrp->ldbase_genovec);
      pgrp->ldbase_stypes |= kfPgrLdcacheNyp;
    }
    // Dense output path: apply the stored difflist of changes on top of the
    // LD base genovec.
    *difflist_common_geno_ptr = UINT32_MAX;
    reterr = ParseAndApplyDifflistSubset(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, &fread_ptr, pgrp, genovec);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (ld_invert) {
      GenovecInvertUnsafe(sample_ct, genovec);
    }
    if (fread_pp) {
      *fread_pp = fread_ptr;
      *fread_endp = fread_end;
    }
    return kPglRetSuccess;
  }
  const unsigned char* fread_ptr;
  const unsigned char* fread_end = nullptr;  // maybe-uninitialized warning
  if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
    return kPglRetReadFail;
  }
  // Does the next variant LD-compress against this one?  If so, the LD cache
  // must be refreshed with this variant's data below.
  const uint32_t is_ldbase = pgrp->fi.vrtypes && VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
  const uint32_t saved_difflist_len = VrtypeDifflist(vrtype)? PeekVint31(fread_ptr, fread_end) : raw_sample_ct;
  pgrp->ldbase_vidx = vidx;
  // no limit is slightly better than /16 but substantially worse than /32 on
  // the large test dataset (/64 is slightly worse than /32)
  // no limit is best on the small test dataset
  if (saved_difflist_len > max_simple_difflist_len) {
    // Too dense for the caller's difflist threshold: unpack to genovec.
    *difflist_common_geno_ptr = UINT32_MAX;
    PglErr reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, genovec);
    if (unlikely(reterr)) {
      return reterr;
    }
    // ParseNonLdGenovecSubsetUnsafe writes into pgrp->ldbase_raw_genovec in
    // exactly this (subsetting, non-difflist) case, clobbering any cached
    // raw genovec.
    const uint32_t ldbase_raw_genovec_saved = (subsetting_required && (!(vrtype & 4)));
    if (is_ldbase) {
      CopyNyparr(genovec, sample_ct, pgrp->ldbase_genovec);
      pgrp->ldbase_stypes = ldbase_raw_genovec_saved? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp) : kfPgrLdcacheNyp;
    } else if (ldbase_raw_genovec_saved) {
      // bugfix (22 Sep 2018)
      pgrp->ldbase_stypes &= ~kfPgrLdcacheRawNyp;
    }
    if (vrtype == kPglVrtypePlink1) {
      PgrPlink1ToPlink2InplaceUnsafe(sample_ct, genovec);
    }
    if (fread_pp) {
      *fread_pp = fread_ptr;
      *fread_endp = fread_end;
    }
    return kPglRetSuccess;
  }
  // Sparse output path: keep the stored difflist representation.
  *difflist_common_geno_ptr = vrtype & 3;
  PglErr reterr;
  if (!subsetting_required) {
    reterr = ParseAndSaveDifflist(fread_end, raw_sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
  } else {
    reterr = ParseAndSaveDifflistProperSubset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, pgrp->workspace_raregeno_tmp_loadbuf);
  }
  if (unlikely(reterr)) {
    // NOTE(review): this coerces any parse failure to kPglRetMalformedInput,
    // while the analogous path in LdLoadMinimalSubsetIfNecessary propagates
    // reterr unchanged -- confirm the coercion is intentional.
    return kPglRetMalformedInput;
  }
  if (is_ldbase) {
    // Save the freshly-parsed difflist as the LD cache for the next variant.
    const uint32_t difflist_len = *difflist_len_ptr;
    pgrp->ldbase_stypes = kfPgrLdcacheDifflist;
    pgrp->ldbase_difflist_len = difflist_len;
    CopyNyparr(main_raregeno, difflist_len, pgrp->ldbase_raregeno);
    memcpy(pgrp->ldbase_difflist_sample_ids, difflist_sample_ids, difflist_len * sizeof(int32_t));
    // Append sentinel entry (LdLoadMinimalSubsetIfNecessary does the same).
    pgrp->ldbase_difflist_sample_ids[difflist_len] = sample_ct;
  }
  if (fread_pp) {
    *fread_pp = fread_ptr;
    *fread_endp = fread_end;
  }
  return kPglRetSuccess;
}
2876
PgrGetDifflistOrGenovec(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t max_simple_difflist_len,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict genovec,uint32_t * difflist_common_geno_ptr,uintptr_t * __restrict main_raregeno,uint32_t * __restrict difflist_sample_ids,uint32_t * __restrict difflist_len_ptr)2877 PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
2878 if (!sample_ct) {
2879 *difflist_common_geno_ptr = UINT32_MAX;
2880 return kPglRetSuccess;
2881 }
2882 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
2883 assert(vidx < pgrp->fi.raw_variant_ct);
2884 return ReadDifflistOrGenovecSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, max_simple_difflist_len, vidx, pgrp, nullptr, nullptr, genovec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
2885 }
2886
// Adjusts genocounts[] (counts for the LD reference variant) by the current
// variant's difflist of changes, without unpacking the current variant.
PglErr LdSubsetAdjustGenocounts(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict ldbase_genovec, uint32_t raw_sample_ct, const unsigned char** fread_pp, STD_ARRAY_REF(uint32_t, 4) genocounts, uintptr_t* __restrict raregeno_workspace) {
  // * sample_include assumed to be nullptr if no subsetting required
  // * Assumes genocounts[] is initialized to the proper values for the LD
  //   reference variant (including subsetting).
  // * Tried a hybrid implementation which allowed the base variant to be saved
  //   as a difflist; turns out it's practically always better to unpack to a
  //   genovec first.
  // * There are two modes:
  //   1. If sample_include is nullptr, we're not selecting a sample subset.
  //   2. If sample_include and sample_include_cumulative_popcounts are both
  //      non-null, we're computing counts over a sample subset, and
  //      ldbase_genovec is assumed to be subsetted.
  //   Experimented with a third mode where ldbase_genovec was replaced with
  //   ldbase_raw_genovec in the subsetted case, but that didn't seem to pay
  //   off.
  // * This is the main frequency-counting bottleneck.
  uint32_t raw_difflist_len;
  const unsigned char* group_info_iter;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &raw_difflist_len);
  if (reterr || (!raw_difflist_len)) {
    // Empty difflist: the base counts are already correct.
    return reterr;
  }
  const uint32_t subgroup_idx_last = (raw_difflist_len - 1) / kBitsPerWordD2;
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
  uintptr_t raw_sample_idx = 0;
  // delta_counts[new_geno * 4 + old_geno] tallies how many samples changed
  // from old_geno (in the LD base) to new_geno (in this variant).
  STD_ARRAY_DECL(uint32_t, 16, delta_counts);
  STD_ARRAY_FILL0(delta_counts);
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        // All deltas consumed.  incrN = (# samples becoming N) - (# leaving
        // N); e.g. delta_counts[1..3] enter genotype 0 from old 1/2/3 while
        // delta_counts[4/8/12] leave genotype 0 for new 1/2/3.
        const int32_t incr0 = delta_counts[1] + delta_counts[2] + delta_counts[3] - delta_counts[4] - delta_counts[8] - delta_counts[12];
        const int32_t incr1 = delta_counts[4] + delta_counts[6] + delta_counts[7] - delta_counts[1] - delta_counts[9] - delta_counts[13];
        const int32_t incr2 = delta_counts[8] + delta_counts[9] + delta_counts[11] - delta_counts[2] - delta_counts[6] - delta_counts[14];
        genocounts[0] += incr0;
        genocounts[1] += incr1;
        genocounts[2] += incr2;
        // Total sample count is invariant, so genotype 3's change is minus
        // the sum of the others.
        genocounts[3] -= incr0 + incr1 + incr2;
        return kPglRetSuccess;
      }
      // Final (possibly partial) subgroup.
      remaining_deltas_in_subgroup &= raw_difflist_len - 1;
    }
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
      // Group boundary: sample index is stored absolutely, not as a delta.
#ifdef __LP64__
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    uintptr_t cur_raregeno_word = *raregeno_workspace_iter++;
    if (!sample_include) {
      // Mode 1: no subsetting; every delta applies.
      for (; ; --remaining_deltas_in_subgroup) {
#ifndef __LP64__
        if (unlikely(raw_sample_idx >= raw_sample_ct)) {
          return kPglRetMalformedInput;
        }
#endif
        const uintptr_t cur_geno = cur_raregeno_word & 3;
        delta_counts[cur_geno * 4 + GetNyparrEntry(ldbase_genovec, raw_sample_idx)] += 1;
        if (!remaining_deltas_in_subgroup) {
          break;
        }
        raw_sample_idx += GetVint31(fread_end, fread_pp);
        cur_raregeno_word >>= 2;
      }
    } else {
      // Mode 2: count only samples in the subset; ldbase_genovec is indexed
      // by subsetted position.
      for (; ; --remaining_deltas_in_subgroup) {
#ifndef __LP64__
        if (unlikely(raw_sample_idx >= raw_sample_ct)) {
          return kPglRetMalformedInput;
        }
#endif
        if (IsSet(sample_include, raw_sample_idx)) {
          const uintptr_t cur_geno = cur_raregeno_word & 3;
          const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx);
          delta_counts[cur_geno * 4 + GetNyparrEntry(ldbase_genovec, sample_idx)] += 1;
        }
        if (!remaining_deltas_in_subgroup) {
          break;
        }
        raw_sample_idx += GetVint31(fread_end, fread_pp);
        cur_raregeno_word >>= 2;
      }
    }
  }
}
2978
// Advances *fread_pp past a deltalist's sample-ID byte stream (and, when
// has_genotypes is set, the raregeno track) without decoding the IDs.
// Key observation: each variable-length integer in the ID stream ends with
// exactly one byte whose high bit is clear (value <= 127), so counting such
// bytes counts encoded IDs.
PglErr SkipDeltalistIds(const unsigned char* fread_end, const unsigned char* group_info, uint32_t difflist_len, uint32_t raw_sample_ct, uint32_t has_genotypes, const unsigned char** fread_pp) {
  assert(difflist_len);
  // fread_pp is a pure output parameter here
  const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize);
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  const unsigned char* extra_byte_cts = &(group_info[group_ct * sample_id_byte_ct]);
  const uint32_t extra_byte_tot = BytesumArr(extra_byte_cts, group_ct - 1);

  // (group_ct - 1) for extra_byte_cts
  // (difflist_len + 3) / 4 for raregeno
  // (group_ct - 1) * (kPglDifflistGroupSize - 1) + extra_byte_tot for
  //   all but last ID block
  // total = (group_ct - 1) * kPglDifflistGroupSize + extra_byte_tot +
  //         (difflist_len + 3) / 4
#ifdef __arm__
#  error "Unaligned accesses in SkipDeltalistIds()."
#endif
  // All fully-sized ID blocks can be skipped arithmetically via the stored
  // extra-byte counts; only the last (partial) block must be scanned.
  const unsigned char* iddiff_start = &(extra_byte_cts[(group_ct - 1) * kPglDifflistGroupSize + extra_byte_tot]);
  if (has_genotypes) {
    iddiff_start = &(iddiff_start[NypCtToByteCt(difflist_len)]);
  }
  const uintptr_t* fread_alias = R_CAST(const uintptr_t*, iddiff_start);
  const uintptr_t* fread_alias_stop = R_CAST(const uintptr_t*, &(fread_end[-S_CAST(int32_t, kBytesPerWord)]));
  uint32_t remaining_id_ct = (difflist_len - 1) % kPglDifflistGroupSize;
#ifdef __LP64__
  // Vector loop: count terminator bytes (high bit clear) a vector at a time.
  while (remaining_id_ct >= kBytesPerVec) {
    if (unlikely(fread_alias > fread_alias_stop)) {
      return kPglRetMalformedInput;
    }
    const VecW vv = vecw_loadu(R_CAST(const VecW*, fread_alias));
    fread_alias = &(fread_alias[kWordsPerVec]);
    const uint32_t highbits = vecw_movemask(vv);
    remaining_id_ct -= kBytesPerVec - PopcountVec8thUint(highbits);
  }
#endif
  while (remaining_id_ct >= kBytesPerWord) {
    // scan a word at a time, count number of high bits set
    if (unlikely(fread_alias > fread_alias_stop)) {
      return kPglRetMalformedInput;
    }
#ifdef USE_SSE42
    const uintptr_t ww = (*fread_alias++) & (0x80 * kMask0101);
    remaining_id_ct -= kBytesPerWord - PopcountWord(ww);
#else
    // No fast popcount: sum the per-byte high bits via multiply-shift.
    const uintptr_t ww = ((*fread_alias++) >> 7) & kMask0101;
    remaining_id_ct -= kBytesPerWord - ((ww * kMask0101) >> (kBitsPerWord - 8));
#endif
  }
  const unsigned char* fread_ptr = R_CAST(const unsigned char*, fread_alias);
  if (!remaining_id_ct) {
    *fread_pp = fread_ptr;
    return kPglRetSuccess;
  }
  // Byte-at-a-time scan for the last few IDs; each byte <= 127 terminates
  // one vint.
  --remaining_id_ct;
  while (likely(fread_ptr < fread_end)) {
    if ((*fread_ptr++) <= 127) {
      if (!remaining_id_ct) {
        *fread_pp = fread_ptr;
        return kPglRetSuccess;
      }
      --remaining_id_ct;
    }
  }
  // Ran off the end of the record: corrupt deltalist.
  return kPglRetMalformedInput;
}
3044
// Computes genotype counts for a difflist-encoded variant directly from the
// difflist, without unpacking to a genovec.  common_geno is the genotype
// value of every sample not present in the difflist.
PglErr CountparseDifflistSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t common_geno, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, STD_ARRAY_REF(uint32_t, 4) genocounts, uintptr_t* __restrict raregeno_workspace) {
  const unsigned char* group_info_iter;
  uint32_t difflist_len;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &difflist_len);
  STD_ARRAY_REF_FILL0(4, genocounts);
  if (reterr || (!difflist_len)) {
    // Empty difflist: every sample has the common genotype.
    genocounts[common_geno] = sample_ct;
    return reterr;
  }
  if (raw_sample_ct == sample_ct) {
    // No subsetting: count the packed raregeno values directly; sample IDs
    // are irrelevant and only need to be skipped.
    ZeroTrailingNyps(difflist_len, raregeno_workspace);
    GenoarrCountFreqsUnsafe(raregeno_workspace, difflist_len, genocounts);
    genocounts[common_geno] = sample_ct - difflist_len;
    // bugfix (26 Mar 2019): forgot to advance fread_pp
    return SkipDeltalistIds(fread_end, group_info_iter, difflist_len, raw_sample_ct, 1, fread_pp);
  }
  // Subsetting: must walk the sample IDs and count only included samples.
  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
  uintptr_t raw_sample_idx = 0;
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        // Samples not visited (and not counted above) carry the common
        // genotype.
        genocounts[common_geno] = sample_ct - genocounts[0] - genocounts[1] - genocounts[2] - genocounts[3];
        return kPglRetSuccess;
      }
      // Final (possibly partial) subgroup.
      remaining_deltas_in_subgroup &= difflist_len - 1;
    }
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
      // Group boundary: sample index is stored absolutely.
#ifdef __LP64__
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    uintptr_t cur_raregeno_word = *raregeno_workspace_iter++;
    for (; ; --remaining_deltas_in_subgroup) {
#ifndef __LP64__
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      if (IsSet(sample_include, raw_sample_idx)) {
        const uintptr_t cur_geno = cur_raregeno_word & 3;
        genocounts[cur_geno] += 1;
      }
      if (!remaining_deltas_in_subgroup) {
        break;
      }
      raw_sample_idx += GetVint31(fread_end, fread_pp);
      cur_raregeno_word >>= 2;
    }
  }
}
3104
// 1-bit, unsubsetted: count 1-bit array, then count raregeno
// 1-bit, subsetted: count [1-bit array AND sample_include], iterate through
//   difflist
// Computes genotype counts for a 1-bit-encoded variant: a bitarray selects
// between two common genotype codes, and a trailing difflist overrides
// individual samples.
PglErr CountparseOnebitSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, STD_ARRAY_REF(uint32_t, 4) genocounts, uintptr_t* __restrict raregeno_workspace) {
  const uint32_t initial_bitarray_byte_ct = DivUp(raw_sample_ct, CHAR_BIT);
  const unsigned char* onebit_main_iter = *fread_pp;
  // +1 covers the leading common2_code byte.
  if (PtrAddCk(fread_end, initial_bitarray_byte_ct + 1, fread_pp)) {
    return kPglRetMalformedInput;
  }
  // First byte packs the two common genotype codes: high bits = low code,
  // low 2 bits = (high code - low code).
  const uint32_t common2_code = *onebit_main_iter++;
  const uint32_t geno_code_low = common2_code / 4;
  const uint32_t geno_code_high = (common2_code & 3) + geno_code_low;
#ifdef __arm__
#  error "Unaligned accesses in CountparseOnebitSubset()."
#endif
  uint32_t high_geno_ct;
  if (raw_sample_ct == sample_ct) {
    high_geno_ct = PopcountBytes(onebit_main_iter, initial_bitarray_byte_ct);
  } else {
    high_geno_ct = PopcountBytesMasked(onebit_main_iter, sample_include, initial_bitarray_byte_ct);
  }
  const unsigned char* group_info_iter;
  uint32_t difflist_len;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, raregeno_workspace, &group_info_iter, &difflist_len);
  STD_ARRAY_REF_FILL0(4, genocounts);
  if (reterr || (!difflist_len)) {
    // No overrides: counts come straight from the bitarray popcount.
    genocounts[geno_code_low] = sample_ct - high_geno_ct;
    genocounts[geno_code_high] = high_geno_ct;
    return reterr;
  }
  if (raw_sample_ct == sample_ct) {
    // No subsetting: count the override genotypes in bulk, then skip the
    // sample-ID stream.
    ZeroTrailingNyps(difflist_len, raregeno_workspace);
    GenoarrCountFreqsUnsafe(raregeno_workspace, difflist_len, genocounts);
    genocounts[geno_code_low] = sample_ct - difflist_len - high_geno_ct;
    genocounts[geno_code_high] = high_geno_ct;
    // bugfix (26 Mar 2019): forgot to advance fread_pp
    return SkipDeltalistIds(fread_end, group_info_iter, difflist_len, raw_sample_ct, 1, fread_pp);
  }
  // Subsetting: walk the difflist; every overridden sample is removed from
  // whichever common-code bucket its bitarray bit placed it in.
  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  const uintptr_t* onebitarr = R_CAST(const uintptr_t*, onebit_main_iter);
  uintptr_t* raregeno_workspace_iter = raregeno_workspace;
  uintptr_t raw_sample_idx = 0;
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        // avoid read-after-write dependency?
        genocounts[geno_code_low] = sample_ct - high_geno_ct - genocounts[0] - genocounts[1] - genocounts[2] - genocounts[3];
        genocounts[geno_code_high] = high_geno_ct;
        return kPglRetSuccess;
      }
      // Final (possibly partial) subgroup.
      remaining_deltas_in_subgroup &= difflist_len - 1;
    }
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
      // Group boundary: sample index is stored absolutely.
#ifdef __LP64__
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    uintptr_t cur_raregeno_word = *raregeno_workspace_iter++;
    for (; ; --remaining_deltas_in_subgroup) {
#ifndef __LP64__
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      if (IsSet(sample_include, raw_sample_idx)) {
        const uintptr_t cur_geno = cur_raregeno_word & 3;
        genocounts[cur_geno] += 1;
        // If this sample's bitarray bit was set, it was counted in
        // high_geno_ct above; undo that.
        high_geno_ct -= IsSet(onebitarr, raw_sample_idx);
      }
      if (!remaining_deltas_in_subgroup) {
        break;
      }
      raw_sample_idx += GetVint31(fread_end, fread_pp);
      cur_raregeno_word >>= 2;
    }
  }
}
3190
// loads ldbase variant if necessary, guarantees pgrp->ldbase_genovec is filled
// on return
// only called by GetBasicGenotypeCounts(), usually LdLoadAndCopy... is better
PglErr LdLoadGenovecSubsetIfNecessary(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp) {
  if (LdLoadNecessary(vidx, pgrp)) {
    // Cache miss: read and unpack the LD base variant from the file/mmap.
    const uint32_t ldbase_vidx = pgrp->ldbase_vidx;
    const unsigned char* fread_ptr;
    const unsigned char* fread_end;
    if (unlikely(InitReadPtrs(ldbase_vidx, pgrp, &fread_ptr, &fread_end))) {
      return kPglRetReadFail;
    }
    const uint32_t vrtype = pgrp->fi.vrtypes[ldbase_vidx];
    // bugfix (6 Mar 2019): ldbase_raw_genovec is only filled in (!difflist) &&
    // subsetting_required case; (!difflist) isn't enough
    pgrp->ldbase_stypes = ((vrtype & 4) || (sample_ct == pgrp->fi.raw_sample_ct))? kfPgrLdcacheNyp : (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp);
    return ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, pgrp->ldbase_genovec);
  }
  // Cache hit, but possibly in the wrong representation: convert the cached
  // difflist or raw genovec into the (subsetted) genovec form.
  if (!(pgrp->ldbase_stypes & kfPgrLdcacheNyp)) {
    if (pgrp->ldbase_stypes & kfPgrLdcacheDifflist) {
      PgrDifflistToGenovecUnsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3, sample_ct, pgrp->ldbase_difflist_len, pgrp->ldbase_genovec);
    } else {
      assert(pgrp->ldbase_stypes & kfPgrLdcacheRawNyp);
      CopyNyparrNonemptySubset(pgrp->ldbase_raw_genovec, sample_include, pgrp->fi.raw_sample_ct, sample_ct, pgrp->ldbase_genovec);
    }
    pgrp->ldbase_stypes |= kfPgrLdcacheNyp;
  }
  return kPglRetSuccess;
}
3219
// Computes the four basic genotype counts for variant vidx, dispatching on
// the variant record type (LD-compressed, difflist, 1-bit, or dense), and
// optionally derives the unphased-het count from the phase track.
PglErr GetBasicGenotypeCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uint32_t* unphased_het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // genocounts[0] := ref/ref, genocounts[1] := ref/altx,
  // genocounts[2] := altx/alty, genocounts[3] := missing
  // If unphased_het_ctp is non-null, this assumes multiallelic hardcalls are
  // not present, phased hardcalls are present, we aren't subsetting, and
  // unphased_het_ct is initialized to zero.
  assert(vidx < pgrp->fi.raw_variant_ct);
  assert(sample_ct);
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  const unsigned char* fread_ptr;
  const unsigned char* fread_end = nullptr;  // maybe-uninitialized warning
  PglErr reterr;
  if (VrtypeLdCompressed(vrtype)) {
    // LD compression: count the LD base once (cached), then adjust those
    // counts by the difflist of changes.
    reterr = LdLoadGenovecSubsetIfNecessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
      return kPglRetReadFail;
    }
    if (!(pgrp->ldbase_stypes & kfPgrLdcacheBasicGenocounts)) {
      ZeroTrailingNyps(sample_ct, pgrp->ldbase_genovec);
      GenoarrCountFreqsUnsafe(pgrp->ldbase_genovec, sample_ct, pgrp->ldbase_basic_genocounts);
      pgrp->ldbase_stypes |= kfPgrLdcacheBasicGenocounts;
    }
    STD_ARRAY_COPY(pgrp->ldbase_basic_genocounts, 4, genocounts);
    reterr = LdSubsetAdjustGenocounts(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_genovec, raw_sample_ct, &fread_ptr, genocounts, pgrp->workspace_raregeno_tmp_loadbuf);
    if (vrtype & 1) {
      // inverted: genotype codes 0 and 2 are swapped relative to the base.
      const uint32_t tmpval = genocounts[0];
      genocounts[0] = genocounts[2];
      genocounts[2] = tmpval;
    }
  } else {
    if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
      return kPglRetReadFail;
    }
    const uint32_t is_ldbase = pgrp->fi.vrtypes && VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
    if (is_ldbase) {
      // difflists are very efficient to count directly when not subsetting
      // (since we can entirely ignore the sample IDs), but it's often better
      // to unpack them first when subsetting.

      // ...er, the statement above is a lie, unpack-first almost always seems
      // to be better.

      // This variant is the LD base for the next one: unpack into the cache
      // and count from there.
      pgrp->ldbase_vidx = vidx;
      // this may be slowed down by the LD caching change.
      reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, pgrp->ldbase_genovec);
      ZeroTrailingNyps(sample_ct, pgrp->ldbase_genovec);
      GenoarrCountFreqsUnsafe(pgrp->ldbase_genovec, sample_ct, genocounts);
      STD_ARRAY_COPY(genocounts, 4, pgrp->ldbase_basic_genocounts);
      pgrp->ldbase_stypes = (subsetting_required && (!(vrtype & 4)))? (kfPgrLdcacheNyp | kfPgrLdcacheRawNyp | kfPgrLdcacheBasicGenocounts) : (kfPgrLdcacheNyp | kfPgrLdcacheBasicGenocounts);
    } else if (vrtype & 4) {
      // Difflist-encoded record.
      const uint32_t vrtype_low2 = vrtype & 3;
      if (vrtype_low2 != 1) {
        reterr = CountparseDifflistSubset(fread_end, sample_include, vrtype & 3, raw_sample_ct, sample_ct, &fread_ptr, genocounts, pgrp->workspace_raregeno_tmp_loadbuf);
      } else {
        // All-homozygous-ref special case: no difflist body to parse.
        genocounts[0] = sample_ct;
        genocounts[1] = 0;
        genocounts[2] = 0;
        genocounts[3] = 0;
        reterr = kPglRetSuccess;
      }
    } else if (vrtype & 1) {
      // 1-bit encoding.
      reterr = CountparseOnebitSubset(fread_end, sample_include, raw_sample_ct, sample_ct, &fread_ptr, genocounts, pgrp->workspace_raregeno_tmp_loadbuf);
    } else {
      // Dense 2-bit genovec.
      const uint32_t genovec_byte_ct = NypCtToByteCt(raw_sample_ct);
      const unsigned char* genoarrb = fread_ptr;
      if (PtrAddCk(fread_end, genovec_byte_ct, &fread_ptr)) {
        return kPglRetMalformedInput;
      }
      const uint32_t genoarrb_is_unaligned = R_CAST(uintptr_t, genoarrb) % kBytesPerVec;
      if (!subsetting_required) {
        if (genoarrb_is_unaligned) {
          GenoarrbCountFreqs(genoarrb, raw_sample_ct, genocounts);
        } else {
          GenoarrCountFreqs(R_CAST(const uintptr_t*, genoarrb), raw_sample_ct, genocounts);
        }
      } else {
        GenoarrbCountSubsetFreqs(genoarrb, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
      }
      if (vrtype == kPglVrtypePlink1) {
        // Remap PLINK 1 genotype codes to PLINK 2 order:
        // [3] -> [0]
        // [2] -> [1]
        // [1] -> [3]
        // [0] -> [2]
        const uint32_t save2 = genocounts[0];
        const uint32_t save3 = genocounts[1];
        genocounts[0] = genocounts[3];
        genocounts[1] = genocounts[2];
        genocounts[2] = save2;
        genocounts[3] = save3;
      }
      reterr = kPglRetSuccess;
    }
  }
  if ((!unphased_het_ctp) || reterr) {
    return reterr;
  }
  // Unphased-het path (see function contract above): read the start of the
  // phase track (aux2).
  assert((!subsetting_required) && ((vrtype & 0x18) == 0x10));
  const uint32_t het_ct = genocounts[1];
  // 1 leading mode bit + het_ct phasepresent bits.
  const uint32_t aux2_first_part_byte_ct = 1 + (het_ct / CHAR_BIT);
  if (PtrCheck(fread_end, fread_ptr, aux2_first_part_byte_ct)) {
    return kPglRetMalformedInput;
  }
  const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
  if (explicit_phasepresent) {
    // otherwise initial value of 0 is correct
    // PopcountBytes includes the mode bit itself, hence the +1.
    *unphased_het_ctp = het_ct + 1 - PopcountBytes(fread_ptr, aux2_first_part_byte_ct);
  }
  return kPglRetSuccess;
}
3335
3336 PglErr PgrGetCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts) {
3337 if (!sample_ct) {
3338 STD_ARRAY_REF_FILL0(4, genocounts);
3339 return kPglRetSuccess;
3340 }
3341 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
3342 assert(vidx < pgrp->fi.raw_variant_ct);
3343 return GetBasicGenotypeCounts(sample_include, sample_include_interleaved_vec, GetSicp(pssi), sample_ct, vidx, pgrp, nullptr, genocounts);
3344 }
3345
// Ok for nyp_vvec to be unaligned.
// Returns the number of 2-bit entries in nyp_vvec[0..vec_ct) equal to the
// nyp value replicated throughout nyp_word.  vec_ct must be a multiple of 6
// (the unrolled inner loop consumes 6 vectors per pass).
uint32_t CountNypVec6(const VecW* nyp_vvec, uintptr_t nyp_word, uint32_t vec_ct) {
  assert(!(vec_ct % 6));
  const VecW m0 = vecw_setzero();
  const VecW m1 = VCONST_W(kMask5555);
  const VecW m2 = VCONST_W(kMask3333);
  const VecW m4 = VCONST_W(kMask0F0F);
  // XORing with the target pattern zeroes out every matching nyp.
  const VecW xor_vvec = vecw_set1(nyp_word);
  const VecW* nyp_vvec_iter = nyp_vvec;
  VecW prev_sad_result = vecw_setzero();
  VecW acc = vecw_setzero();
  // Up to 60 vectors per outer pass, keeping the per-byte accumulator
  // (inner_acc) from overflowing before it's widened by vecw_bytesum.
  uintptr_t cur_incr = 60;
  for (; ; vec_ct -= cur_incr) {
    if (vec_ct < 60) {
      if (!vec_ct) {
        acc = acc + prev_sad_result;
        return HsumW(acc);
      }
      cur_incr = vec_ct;
    }
    VecW inner_acc = vecw_setzero();
    const VecW* nyp_vvec_stop = &(nyp_vvec_iter[cur_incr]);
    do {
      // After the XOR, a nyp matches iff both of its bits are zero, so
      // ~(x | (x >> 1)) & kMask5555 yields one match bit per nyp.
      VecW loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      VecW loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      VecW count1 = vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, m1);
      VecW count2 = vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, m1);

      loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      count1 = count1 + vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, m1);
      count2 = count2 + vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, m1);

      loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
      count1 = count1 + vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, m1);
      count2 = count2 + vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, m1);

      // Widen 2-bit partial sums -> 4-bit -> byte lanes of inner_acc.
      count1 = (count1 & m2) + (vecw_srli(count1, 2) & m2);
      count1 = count1 + (count2 & m2) + (vecw_srli(count2, 2) & m2);
      inner_acc = inner_acc + (count1 & m4) + (vecw_srli(count1, 4) & m4);
    } while (nyp_vvec_iter < nyp_vvec_stop);
    // The bytesum from the previous pass is folded in one iteration late,
    // presumably to shorten the loop-carried dependency chain.
    acc = acc + prev_sad_result;
    prev_sad_result = vecw_bytesum(inner_acc, m0);
  }
}
3392
3393 // Ok for nyparr to be unaligned. Ok if unsafe to read trailing bytes of
3394 // nyparr.
// Returns the number of nyps (2-bit entries) in nyparr[0..nyp_ct) equal to
// the nyp replicated through nyp_word.  Ok for nyparr to be unaligned; never
// reads bytes past the last valid nyp.
uint32_t CountNyp(const void* nyparr, uintptr_t nyp_word, uint32_t nyp_ct) {
  const uint32_t fullword_ct = nyp_ct / kBitsPerWordD2;
  // Vector pass over the largest prefix whose vector count is a multiple of
  // 6 (required by CountNypVec6).
  uint32_t word_idx = fullword_ct - (fullword_ct % (6 * kWordsPerVec));
  uint32_t tot = CountNypVec6(S_CAST(const VecW*, nyparr), nyp_word, word_idx / kWordsPerVec);
  const uintptr_t* nypvec = S_CAST(const uintptr_t*, nyparr);
  // Scalar pass for the remaining full words.
  for (; word_idx != fullword_ct; ++word_idx) {
    // XOR makes matching nyps 00; Word00() marks each 00-nyp's low bit.
    const uintptr_t cur_word = nypvec[word_idx] ^ nyp_word;
    tot += Popcount01Word(Word00(cur_word));
  }
  const uint32_t trailing_nyp_ct = nyp_ct % kBitsPerWordD2;
  if (trailing_nyp_ct) {
    // Partial-word load avoids reading past the end of the array.
    const uint32_t trailing_byte_ct = DivUp(trailing_nyp_ct, (CHAR_BIT / 2));
    uintptr_t cur_word = SubwordLoad(&(nypvec[fullword_ct]), trailing_byte_ct) ^ nyp_word;
    // bzhi clears garbage bits past the last valid nyp before counting.
    cur_word = bzhi(Word00(cur_word), trailing_nyp_ct * 2);
    tot += Popcount01Word(cur_word);
  }
  return tot;
}
3413
3414 /*
3415 uint32_t CountNypSubsetVec6(const VecW* __restrict nyp_vvec, const VecW* __restrict interleaved_mask_vvec, uintptr_t nyp_word, uint32_t vec_ct) {
3416 assert(!(vec_ct % 6));
3417 const VecW m0 = vecw_setzero();
3418 const VecW m1 = VCONST_W(kMask5555);
3419 const VecW m2 = VCONST_W(kMask3333);
3420 const VecW m4 = VCONST_W(kMask0F0F);
3421 const VecW xor_vvec = vecw_set1(nyp_word);
3422 const VecW* nyp_vvec_iter = nyp_vvec;
3423 const VecW* interleaved_mask_vvec_iter = interleaved_mask_vvec;
3424 VecW prev_sad_result = vecw_setzero();
3425 VecW acc = vecw_setzero();
3426 uintptr_t cur_incr = 60;
3427 while (1) {
3428 if (vec_ct < 60) {
3429 if (!vec_ct) {
3430 acc = acc + prev_sad_result;
3431 return HsumW(acc);
3432 }
3433 cur_incr = vec_ct;
3434 }
3435 VecW inner_acc = vecw_setzero();
3436 const VecW* nyp_vvec_stop = &(nyp_vvec_iter[cur_incr]);
3437 vec_ct -= cur_incr;
3438 do {
3439 VecW mask1 = *interleaved_mask_vvec_iter++;
3440 VecW loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3441 VecW mask2 = vecw_srli(mask1, 1) & m1;
3442 VecW loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3443 mask1 = mask1 & m1;
3444 VecW count1 = vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, mask1);
3445 VecW count2 = vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, mask2);
3446
3447 mask1 = *interleaved_mask_vvec_iter++;
3448 loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3449 mask2 = vecw_srli(mask1, 1) & m1;
3450 loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3451 mask1 = mask1 & m1;
3452 count1 = count1 + vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, mask1);
3453 count2 = count2 + vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, mask2);
3454
3455 mask1 = *interleaved_mask_vvec_iter++;
3456 loader1 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3457 mask2 = vecw_srli(mask2, 1) & m1;
3458 loader2 = vecw_loadu(nyp_vvec_iter++) ^ xor_vvec;
3459 mask1 = mask1 & m1;
3460 count1 = count1 + vecw_and_notfirst(vecw_srli(loader1, 1) | loader1, mask1);
3461 count2 = count2 + vecw_and_notfirst(vecw_srli(loader2, 1) | loader2, mask2);
3462
3463 count1 = (count1 & m2) + (vecw_srli(count1, 2) & m2);
3464 count1 = count1 + (count2 & m2) + (vecw_srli(count2, 2) & m2);
3465 inner_acc = inner_acc + (count1 & m4) + (vecw_srli(count1, 4) & m4);
3466 } while (nyp_vvec_iter < nyp_vvec_stop);
3467 acc = acc + prev_sad_result;
3468 prev_sad_result = vecw_bytesum(inner_acc, m0);
3469 }
3470 }
3471
3472 uint32_t CountNypSubset(const uintptr_t* __restrict nypvec, const uintptr_t* __restrict interleaved_vec, uintptr_t nyp_word, uint32_t raw_nyp_ct) {
3473 // simplified GenoarrCountSubsetFreqs()
3474 const uint32_t raw_nyp_ctv2 = NypCtToVecCt(raw_nyp_ct);
3475 #ifdef __LP64__
3476 uint32_t vec_idx = raw_nyp_ctv2 - (raw_nyp_ctv2 % 6);
3477 uint32_t tot = CountNypSubsetVec6(R_CAST(const VecW*, nypvec), R_CAST(const VecW*, interleaved_vec), nyp_word, vec_idx);
3478 const uintptr_t* nypvec_iter = &(nypvec[kWordsPerVec * vec_idx]);
3479 const uintptr_t* interleaved_mask_iter = &(interleaved_vec[(kWordsPerVec / 2) * vec_idx]);
3480 # ifdef USE_AVX2
3481 uintptr_t mask_base1 = 0;
3482 uintptr_t mask_base2 = 0;
3483 uintptr_t mask_base3 = 0;
3484 uintptr_t mask_base4 = 0;
3485 for (; vec_idx != raw_nyp_ctv2; ++vec_idx) {
3486 uintptr_t mask_word1;
3487 uintptr_t mask_word2;
3488 uintptr_t mask_word3;
3489 uintptr_t mask_word4;
3490 if (!(vec_idx % 2)) {
3491 mask_base1 = *interleaved_mask_iter++;
3492 mask_base2 = *interleaved_mask_iter++;
3493 mask_base3 = *interleaved_mask_iter++;
3494 mask_base4 = *interleaved_mask_iter++;
3495 mask_word1 = mask_base1 & kMask5555;
3496 mask_word2 = mask_base2 & kMask5555;
3497 mask_word3 = mask_base3 & kMask5555;
3498 mask_word4 = mask_base4 & kMask5555;
3499 } else {
3500 mask_word1 = (mask_base1 >> 1) & kMask5555;
3501 mask_word2 = (mask_base2 >> 1) & kMask5555;
3502 mask_word3 = (mask_base3 >> 1) & kMask5555;
3503 mask_word4 = (mask_base4 >> 1) & kMask5555;
3504 }
3505 uint32_t uii = 0;
3506 while (1) {
3507 const uintptr_t cur_geno_word1 = (*nypvec_iter++) ^ nyp_word;
3508 const uintptr_t cur_geno_word2 = (*nypvec_iter++) ^ nyp_word;
3509 const uintptr_t masked1 = mask_word1 & (~(cur_geno_word1 | (cur_geno_word1 >> 1)));
3510 const uintptr_t masked2 = mask_word2 & (~(cur_geno_word2 | (cur_geno_word2 >> 1)));
3511 tot += PopcountWord((masked1 << 1) | masked2);
3512 if (uii) {
3513 break;
3514 }
3515 ++uii;
3516 mask_word1 = mask_word3;
3517 mask_word2 = mask_word4;
3518 }
3519 }
3520 # else // not USE_AVX2
3521 uintptr_t mask_base1 = 0;
3522 uintptr_t mask_base2 = 0;
3523 for (; vec_idx != raw_nyp_ctv2; ++vec_idx) {
3524 uintptr_t mask_word1;
3525 uintptr_t mask_word2;
3526 if (!(vec_idx % 2)) {
3527 mask_base1 = *interleaved_mask_iter++;
3528 mask_base2 = *interleaved_mask_iter++;
3529 mask_word1 = mask_base1 & kMask5555;
3530 mask_word2 = mask_base2 & kMask5555;
3531 } else {
3532 mask_word1 = (mask_base1 >> 1) & kMask5555;
3533 mask_word2 = (mask_base2 >> 1) & kMask5555;
3534 }
3535 const uintptr_t cur_geno_word1 = (*nypvec_iter++) ^ nyp_word;
3536 const uintptr_t cur_geno_word2 = (*nypvec_iter++) ^ nyp_word;
3537 const uintptr_t masked1 = mask_word1 & (~(cur_geno_word1 | (cur_geno_word1 >> 1)));
3538 const uintptr_t masked2 = mask_word2 & (~(cur_geno_word2 | (cur_geno_word2 >> 1)));
3539 # ifdef USE_SSE42
3540 tot += PopcountWord((masked1 << 1) | masked2);
3541 # else
3542 tot += NypsumWord(masked1 + masked2);
3543 # endif
3544 }
3545 # endif // not USE_AVX2
3546 #else // not __LP64__
3547 uint32_t word_idx = raw_nyp_ctv2 - (raw_nyp_ctv2 % 6);
3548 uint32_t tot = CountNypSubsetVec6(R_CAST(const VecW*, nypvec), R_CAST(const VecW*, interleaved_vec), nyp_word, word_idx);
3549 const uintptr_t* interleaved_mask_iter = &(interleaved_vec[word_idx / 2]);
3550 uintptr_t mask_base = 0;
3551 for (; word_idx != raw_nyp_ctv2; ++word_idx) {
3552 uintptr_t mask_word;
3553 if (!(word_idx % 2)) {
3554 mask_base = *interleaved_mask_iter++;
3555 mask_word = mask_base & kMask5555;
3556 } else {
3557 mask_word = (mask_base >> 1) & kMask5555;
3558 }
3559 const uintptr_t cur_geno_word = nypvec[word_idx] ^ nyp_word;
3560 const uintptr_t masked = mask_word & (~(cur_geno_word | (cur_geno_word >> 1)));
3561 tot += Popcount01Word(masked);
3562 }
3563 #endif
3564 return tot;
3565 }
3566 */
3567
3568 // Ok for nybble_vvec to be unaligned.
// Returns the number of nybbles (4-bit values) in nybble_vvec[0..vec_ct)
// equal to the nybble replicated through nybble_word.  Ok for nybble_vvec to
// be unaligned.
uint32_t CountNybbleVec(const VecW* nybble_vvec, uintptr_t nybble_word, uint32_t vec_ct) {
  const VecW m0 = vecw_setzero();
  const VecW alld15 = VCONST_W(kMask1111);
  const VecW m4 = VCONST_W(kMask0F0F);
  // XOR with the replicated target turns matching nybbles into 0000.
  const VecW xor_vvec = vecw_set1(nybble_word);
  const VecW* nybble_vvec_iter = nybble_vvec;
  VecW prev_sad_result = vecw_setzero();
  VecW acc = vecw_setzero();
  // Each 4-bit inner_acc lane grows by at most 1 per vector, so up to 15
  // vectors can be accumulated before the lanes could overflow.
  uintptr_t cur_incr = 15;
  for (; ; vec_ct -= cur_incr) {
    if (vec_ct < 15) {
      if (!vec_ct) {
        // Fold in the last deferred byte-sum and reduce to a scalar.
        acc = acc + prev_sad_result;
        return HsumW(acc);
      }
      cur_incr = vec_ct;
    }
    VecW inner_acc = vecw_setzero();
    const VecW* nybble_vvec_stop = &(nybble_vvec_iter[cur_incr]);
    do {
      VecW loader = vecw_loadu(nybble_vvec_iter++) ^ xor_vvec;
      // DetectAllZeroNybbles() followed by right-shift-3 is the same number of
      // operations, can see if that's any faster in practice
      // OR-fold each nybble into its low bit: the low bit ends up clear iff
      // all four bits were zero, i.e. the nybble matched.
      loader = vecw_srli(loader, 1) | loader;
      loader = vecw_srli(loader, 2) | loader;
      inner_acc = inner_acc + vecw_and_notfirst(loader, alld15);
    } while (nybble_vvec_iter < nybble_vvec_stop);
    // Widen the 4-bit lane counts to bytes, then defer the byte-sum by one
    // outer iteration (folded in before returning above).
    inner_acc = (inner_acc & m4) + (vecw_srli(inner_acc, 4) & m4);
    acc = acc + prev_sad_result;
    prev_sad_result = vecw_bytesum(inner_acc, m0);
  }
}
3601
// Returns the number of nybbles in nybblearr[0..nybble_ct) equal to the
// nybble replicated through nybble_word.  Ok for nybblearr to be unaligned;
// never reads bytes past the last valid nybble.
uint32_t CountNybble(const void* nybblearr, uintptr_t nybble_word, uintptr_t nybble_ct) {
  const uint32_t fullword_ct = nybble_ct / kBitsPerWordD4;
  uint32_t tot = CountNybbleVec(S_CAST(const VecW*, nybblearr), nybble_word, fullword_ct / kWordsPerVec);
  const uintptr_t* nybblevec = S_CAST(const uintptr_t*, nybblearr);
#ifdef __LP64__
  // Scalar cleanup for full words beyond the vector pass.  (32-bit builds
  // skip this -- presumably kWordsPerVec == 1 there, so the vector pass
  // already covered every full word.)
  for (uint32_t word_idx = RoundDownPow2(fullword_ct, kWordsPerVec); word_idx != fullword_ct; ++word_idx) {
    uintptr_t cur_word = nybblevec[word_idx] ^ nybble_word;
    // OR-fold each nybble into its low bit; low bit clear <=> nybble matched.
    cur_word = cur_word | (cur_word >> 1);
    cur_word = cur_word | (cur_word >> 2);
    tot += Popcount0001Word((~cur_word) & kMask1111);
  }
#endif
  const uint32_t trailing_nybble_ct = nybble_ct % kBitsPerWordD4;
  if (trailing_nybble_ct) {
    // Partial-word load avoids reading past the end of the array.
    const uint32_t trailing_byte_ct = DivUp(trailing_nybble_ct, (CHAR_BIT / 4));
    uintptr_t cur_word = SubwordLoad(&(nybblevec[fullword_ct]), trailing_byte_ct) ^ nybble_word;
    cur_word = cur_word | (cur_word >> 1);
    cur_word = cur_word | (cur_word >> 2);
    // Mask off bits past the last valid nybble before counting.
    cur_word = bzhi((~cur_word) & kMask1111, trailing_nybble_ct * 4);
#if defined(USE_SSE42) || !defined(__LP64__)
    tot += Popcount0001Word(cur_word);
#else
    // minor optimization, can't overflow
    tot += (cur_word * kMask1111) >> 60;
#endif
  }
  return tot;
}
3630
3631 // similar to ParseAndSaveDifflist()
// Parses a deltalist (ascending sample indexes, stored as per-group absolute
// starting indexes followed by vint-encoded forward deltas) from *fread_pp
// into deltalist[], storing the entry count in *deltalist_len_ptr and
// advancing *fread_pp past the parsed data.
PglErr ParseAndSaveDeltalist(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uint32_t* __restrict deltalist, uint32_t* __restrict deltalist_len_ptr) {
  const unsigned char* group_info_iter;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, deltalist_len_ptr);
  const uint32_t deltalist_len = *deltalist_len_ptr;
  if (reterr || (!deltalist_len)) {
    return reterr;
  }
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  const uint32_t group_idx_last = (deltalist_len - 1) / kPglDifflistGroupSize;
  uint32_t* deltalist_iter = deltalist;
  // Every group except possibly the last has exactly kPglDifflistGroupSize
  // entries.
  uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
  for (uint32_t group_idx = 0; ; ++group_idx) {
    if (group_idx >= group_idx_last) {
      if (group_idx > group_idx_last) {
        return kPglRetSuccess;
      }
      // Final group may be shorter; the mask computes
      // (deltalist_len - 1) % kPglDifflistGroupSize (power-of-2 group size).
      group_len_m1 &= deltalist_len - 1;
    }
    // Each group begins with an absolute sample index...
    uintptr_t raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
    group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    for (uint32_t raw_deltalist_idx_lowbits = 0; ; ++raw_deltalist_idx_lowbits) {
      // always check, otherwise we may scribble over arbitrary memory
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
      deltalist_iter[raw_deltalist_idx_lowbits] = raw_sample_idx;
      if (raw_deltalist_idx_lowbits == group_len_m1) {
        break;
      }
      // ...followed by vint-encoded positive deltas.
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
    deltalist_iter = &(deltalist_iter[group_len_m1 + 1]);
  }
}
3666
// Like ParseAndSaveDeltalist(), but instead of storing the entries, counts
// how many of them are set in sample_include.  Stores the raw deltalist
// length in *raw_deltalist_len_ptr and the intersection size in
// *intersect_ctp; advances *fread_pp past the deltalist.
PglErr CountDeltalistIntersect(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t raw_sample_ct, const unsigned char** fread_pp, uint32_t* __restrict intersect_ctp, uint32_t* __restrict raw_deltalist_len_ptr) {
  // Requires a PROPER subset.
  const unsigned char* group_info_iter;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, raw_deltalist_len_ptr);
  const uint32_t raw_deltalist_len = *raw_deltalist_len_ptr;
  if (reterr || (!raw_deltalist_len)) {
    *intersect_ctp = 0;
    return reterr;
  }
  const uint32_t group_idx_last = (raw_deltalist_len - 1) / kPglDifflistGroupSize;
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  uintptr_t intersect_ct = 0;

  // technically doesn't need to be initialized, but I have principles
  uintptr_t raw_sample_idx = 0;

  uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
  for (uint32_t group_idx = 0; ; ++group_idx) {
    if (group_idx >= group_idx_last) {
      if (group_idx > group_idx_last) {
        *intersect_ctp = intersect_ct;
        return kPglRetSuccess;
      }
      // Final group may be shorter (power-of-2 group size mask trick).
      group_len_m1 &= raw_deltalist_len - 1;
    }
    // We need to pull a raw sample index from the deltalist header every 64
    // entries.
#ifdef __LP64__
    // 64-bit builds only bounds-check once per group.  NOTE(review):
    // mid-group indexes reach IsSet() unchecked before this fires on the
    // next group -- this appears to rely on trailing allocation padding of
    // sample_include; confirm.
    if (unlikely(raw_sample_idx >= raw_sample_ct)) {
      return kPglRetMalformedInput;
    }
#endif
    raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
    group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    for (uint32_t raw_deltalist_idx_lowbits = 0; ; ++raw_deltalist_idx_lowbits) {
#ifndef __LP64__
      // 32-bit builds must check every entry, since raw_sample_idx could
      // wrap within a group.
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
#endif
      intersect_ct += IsSet(sample_include, raw_sample_idx);
      if (raw_deltalist_idx_lowbits == group_len_m1) {
        break;
      }
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
  }
}
3715
CountAux1aDense(const void * patch_01_fvals,uint32_t allele_ct,uint32_t allele_idx,uint32_t raw_01_ct,uint32_t rare01_ct)3716 uint32_t CountAux1aDense(const void* patch_01_fvals, uint32_t allele_ct, uint32_t allele_idx, uint32_t raw_01_ct, uint32_t rare01_ct) {
3717 // The 'f' in patch_01_fset/patch_01_fvals is to distinguish the in-file
3718 // representation from the returned AlleleCode*-based representation.
3719 if (allele_idx == 1) {
3720 // safe to ignore allele codes
3721 return raw_01_ct - rare01_ct;
3722 }
3723 if (allele_ct < 5) {
3724 if (allele_ct == 3) {
3725 return rare01_ct;
3726 }
3727 // need to count matches
3728 const uint32_t allele_code_byte_ct = DivUp(rare01_ct, 8);
3729 const uint32_t alt3_ct = PopcountBytes(patch_01_fvals, allele_code_byte_ct);
3730 if (allele_idx == 3) {
3731 return alt3_ct;
3732 }
3733 return rare01_ct - alt3_ct;
3734 }
3735 if (allele_ct < 19) {
3736 if (allele_ct < 7) {
3737 return CountNyp(patch_01_fvals, (allele_idx - 2) * kMask5555, rare01_ct);
3738 }
3739 return CountNybble(patch_01_fvals, (allele_idx - 2) * kMask1111, rare01_ct);
3740 }
3741 return CountByte(patch_01_fvals, allele_idx - 2, rare01_ct);
3742 }
3743
// Returns the number of bits per aux1a allele code for the given allele
// count: 0 for allele_ct == 3 (codes are implicit), 1 for 4, 2 for 5-6,
// 4 for 7-18, and 8 beyond that.
uint32_t GetAux1aWidth(uint32_t allele_ct) {
  if (allele_ct >= 19) {
    return 8;
  }
  if (allele_ct >= 7) {
    return 4;
  }
  return (allele_ct < 5)? (allele_ct - 3) : 2;
}
3756
3757 // Returns allele_code_width. Other return values are inaccurate for allele_ct
3758 // == 3, since it's assumed that they're unused in that case.
GetAux1aConsts(uint32_t allele_ct,uintptr_t * detect_mask_hi_ptr,uintptr_t * detect_mask_lo_ptr,uint32_t * allele_code_logwidth_ptr)3759 uint32_t GetAux1aConsts(uint32_t allele_ct, uintptr_t* detect_mask_hi_ptr, uintptr_t* detect_mask_lo_ptr, uint32_t* allele_code_logwidth_ptr) {
3760 if (allele_ct < 7) {
3761 if (allele_ct < 5) {
3762 *detect_mask_hi_ptr = ~k0LU;
3763 *detect_mask_lo_ptr = ~k0LU;
3764 *allele_code_logwidth_ptr = 0;
3765 return allele_ct - 3;
3766 }
3767 *detect_mask_hi_ptr = kMaskAAAA;
3768 *detect_mask_lo_ptr = kMask5555;
3769 *allele_code_logwidth_ptr = 1;
3770 return 2;
3771 }
3772 if (allele_ct < 19) {
3773 *detect_mask_hi_ptr = kMask1111 * 8;
3774 *detect_mask_lo_ptr = kMask1111;
3775 *allele_code_logwidth_ptr = 2;
3776 return 4;
3777 }
3778 *detect_mask_hi_ptr = kMask0101 * 0x80;
3779 *detect_mask_lo_ptr = kMask0101;
3780 *allele_code_logwidth_ptr = 3;
3781 return 8;
3782 }
3783
3784 // Advances *fread_pp past aux1a, and sets *het_ctp to the number of ref-altx
3785 // hets where x == allele_idx in sample_include. (If allele_idx == 1, *het_ctp
3786 // is raw_01_ct - [# of aux1a entries] when there's no subsetting.)
3787 // Note that raw_01_ct must be an un-subsetted count.
3788 // Ok for subsetted_01_ct to be uninitialized if not subsetting, or allele_idx
3789 // != 1.
3790 // sample_include assumed to be nullptr if no subsetting required
PglErr CountAux1a(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx, uint32_t raw_01_ct, uint32_t subsetted_01_ct, const unsigned char** fread_pp, uint32_t* __restrict het_ctp, uint32_t* __restrict deltalist_workspace) {
  if (aux1a_mode == 15) {
    // Mode 15: aux1a absent, so every 01 genotype is an ordinary ref-alt1
    // het.
    if (allele_idx == 1) {
      if (sample_include) {
        *het_ctp = subsetted_01_ct;
      } else {
        *het_ctp = raw_01_ct;
      }
    } else {
      *het_ctp = 0;
    }
    return kPglRetSuccess;
  }
  // When allele_idx == 1 or allele_ct == 3, aux1a membership alone determines
  // the answer; the stored allele codes don't matter.
  const uint32_t ignore_01_fvals = (allele_idx == 1) || (allele_ct == 3);
  uintptr_t detect_mask_hi;
  uintptr_t detect_mask_lo;
  uint32_t allele_code_logwidth;
  const uint32_t allele_code_width = GetAux1aConsts(allele_ct, &detect_mask_hi, &detect_mask_lo, &allele_code_logwidth);
  // XORing each packed code with this zeroes out codes equal to allele_idx.
  const uintptr_t xor_word = (allele_idx - 2) * detect_mask_lo;
  if (!aux1a_mode) {
    // Mode 0: 01-collapsed bitarray (fset) followed by packed allele codes
    // (fvals), one code per set fset bit.
    const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
    const uint32_t rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
#ifdef __arm__
# error "Unaligned accesses in CountAux1a()."
#endif
    const uintptr_t* patch_01_fsetw = R_CAST(const uintptr_t*, *fread_pp);
    *fread_pp += fset_byte_ct;
    const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
    const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
    if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    if (!sample_include) {
      *het_ctp = CountAux1aDense(patch_01_fvalsw, allele_ct, allele_idx, raw_01_ct, rare01_ct);
      return kPglRetSuccess;
    }
    const Halfword* sample_include_hw = R_CAST(const Halfword*, sample_include);
    uintptr_t sample_hwidx = 0;
    // Bits of cur_raw_genoarr_hets mark the 01 genotypes in the current
    // genoarr word; fset bits correspond to these in order.
    uintptr_t cur_raw_genoarr_hets = Word01(raw_genoarr[0]);
    const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
    const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
    uintptr_t fvals_bits = 0;
    uint32_t fvals_widx = 0;
    uint32_t subsetted_hetx_ct = 0;
    uint32_t loop_len = kBitsPerWord;
    // kBitsPerWord sentinel forces a refill of fvals_bits on first use.
    uint32_t rare01_lowbits = kBitsPerWord;
    for (uint32_t fset_widx = 0; ; ++fset_widx) {
      uintptr_t fset_bits;
      if (fset_widx >= fset_word_ct_m1) {
        if (fset_widx > fset_word_ct_m1) {
          break;
        }
        // Last fset word may be partial.
        fset_bits = SubwordLoad(&(patch_01_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
        loop_len = ModNz(raw_01_ct, kBitsPerWord);
      } else {
        fset_bits = patch_01_fsetw[fset_widx];
      }
      // format 0, sample_include non-null
      if (ignore_01_fvals) {
        for (uint32_t uii = 0; uii != loop_len; ++uii) {
          // Advance to the next 01 genotype in raw_genoarr.
          while (!cur_raw_genoarr_hets) {
            cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
          }
          if (fset_bits & 1) {
            // Considered replacing cur_raw_genoarr_hets with the result of
            // two PackWordToHalfword() operations, since that keeps all
            // the sample word-indexes aligned. Couldn't justify it given
            // the expected sparsity of this case, though.
            const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
            subsetted_hetx_ct += (sample_include_hw[sample_hwidx] >> sample_uidx_lowbits) & 1;
          }
          cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
          fset_bits = fset_bits >> 1;
        }
      } else {
        for (uint32_t uii = 0; uii != loop_len; ++uii) {
          while (!cur_raw_genoarr_hets) {
            cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
          }
          if (fset_bits & 1) {
            if (rare01_lowbits == kBitsPerWord) {
              // Refill fvals_bits with the next word of packed codes, then
              // reduce it to one match-indicator bit per code (SWAR
              // zero-field detection on the XORed word).
              if (fvals_widx == fvals_word_ct_m1) {
                fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
              } else {
                fvals_bits = patch_01_fvalsw[fvals_widx];
              }
              fvals_bits = fvals_bits ^ xor_word;
              fvals_bits = (detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)))) >> (allele_code_width - 1);
              // unnecessary to apply bzhi here
              ++fvals_widx;
              rare01_lowbits = 0;
            }
            if (fvals_bits & (k1LU << rare01_lowbits)) {
              const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
              subsetted_hetx_ct += (sample_include_hw[sample_hwidx] >> sample_uidx_lowbits) & 1;
            }
            rare01_lowbits += allele_code_width;
          }
          cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
          fset_bits = fset_bits >> 1;
        }
      }
    }
    if (allele_idx == 1) {
      // For alt1, aux1a entries are the 01 genotypes that are NOT ref-alt1.
      *het_ctp = subsetted_01_ct - subsetted_hetx_ct;
    } else {
      *het_ctp = subsetted_hetx_ct;
    }
    return kPglRetSuccess;
  }
  // mode 1: difflist.
  if (!sample_include) {
    const unsigned char* group_info_iter;
    uint32_t rare01_ct;
    PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare01_ct);
    // rare01_ct == 0 should be impossible
    if (unlikely(reterr)) {
      return reterr;
    }
    // Sample IDs aren't needed without subsetting; just skip past them.
    reterr = SkipDeltalistIds(fread_end, group_info_iter, rare01_ct, raw_sample_ct, 1, fread_pp);
    if (unlikely(reterr)) {
      return reterr;
    }
    const unsigned char* patch_01_fvals = *fread_pp;
    const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
    if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }

    *het_ctp = CountAux1aDense(patch_01_fvals, allele_ct, allele_idx, raw_01_ct, rare01_ct);
    return kPglRetSuccess;
  }
  if (ignore_01_fvals) {
    // Don't need to save deltalist contents in this case.
    uint32_t subsetted_hetx_ct;
    uint32_t rare01_ct;
    PglErr reterr = CountDeltalistIntersect(fread_end, sample_include, raw_sample_ct, fread_pp, &subsetted_hetx_ct, &rare01_ct);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (allele_idx == 1) {
      *het_ctp = subsetted_01_ct - subsetted_hetx_ct;
      // Still need to advance *fread_pp past the unread allele codes.
      const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
      if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
        return kPglRetMalformedInput;
      }
    } else {
      *het_ctp = subsetted_hetx_ct;
    }
    return kPglRetSuccess;
  }
  // Save deltalist elements, iterate.
  uint32_t rare01_ct;
  PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare01_ct);
  if (unlikely(reterr)) {
    return reterr;
  }
  const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
  const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
  if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
    return kPglRetMalformedInput;
  }
  const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
  uint32_t subsetted_hetx_ct = 0;
  for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
    uintptr_t fvals_bits;
    if (fvals_widx >= fvals_word_ct_m1) {
      if (fvals_widx > fvals_word_ct_m1) {
        break;
      }
      fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
    } else {
      fvals_bits = patch_01_fvalsw[fvals_widx];
    }
    // SWAR zero-field detection: the high bit of each code is set iff the
    // XORed code is zero, i.e. the original code equals allele_idx.
    fvals_bits = fvals_bits ^ xor_word;
    fvals_bits = detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)));
    if (fvals_widx == fvals_word_ct_m1) {
      // Clear garbage bits in the final, possibly-partial word.
      fvals_bits = bzhi_max(fvals_bits, ModNz(rare01_ct << allele_code_logwidth, kBitsPerWord));
    }
    if (!fvals_bits) {
      continue;
    }
    // Map each surviving code back to its saved deltalist sample index.
    const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - allele_code_logwidth)]);
    do {
      const uint32_t rare01_idx_lowbits = ctzw(fvals_bits) >> allele_code_logwidth;
      const uint32_t sample_uidx = cur_deltalist_base[rare01_idx_lowbits];
      subsetted_hetx_ct += IsSet(sample_include, sample_uidx);
      fvals_bits &= fvals_bits - 1;
    } while (fvals_bits);
  }
  *het_ctp = subsetted_hetx_ct;
  return kPglRetSuccess;
}
3985
CountAux1bDense(const void * patch_10_fvals,uint32_t allele_ct,uint32_t allele_idx_m1,uint32_t raw_10_ct,uint32_t rare10_ct,uint32_t * __restrict het_ctp,uint32_t * __restrict hom_ctp)3986 void CountAux1bDense(const void* patch_10_fvals, uint32_t allele_ct, uint32_t allele_idx_m1, uint32_t raw_10_ct, uint32_t rare10_ct, uint32_t* __restrict het_ctp, uint32_t* __restrict hom_ctp) {
3987 uint32_t matching_hom_ct = 0;
3988 uint32_t het_incr;
3989 if (allele_ct < 6) {
3990 if (allele_ct == 3) {
3991 const uint32_t allele_code_byte_ct = DivUp(rare10_ct, 8);
3992 matching_hom_ct = PopcountBytes(patch_10_fvals, allele_code_byte_ct);
3993 het_incr = rare10_ct - matching_hom_ct;
3994 } else {
3995 // 2+2 bits
3996 het_incr = CountNyp(patch_10_fvals, allele_idx_m1 * kMask5555, rare10_ct * 2);
3997 if (allele_idx_m1) {
3998 matching_hom_ct = CountNybble(patch_10_fvals, allele_idx_m1 * kMask5555, rare10_ct);
3999 }
4000 }
4001 } else {
4002 if (allele_ct < 18) {
4003 // 4+4 bits
4004 het_incr = CountNybble(patch_10_fvals, allele_idx_m1 * kMask1111, rare10_ct * 2);
4005 if (allele_idx_m1) {
4006 matching_hom_ct = CountByte(patch_10_fvals, allele_idx_m1 * 0x11, rare10_ct);
4007 }
4008 } else {
4009 // 8+8 bits
4010 het_incr = CountByte(patch_10_fvals, allele_idx_m1 * 0x11, rare10_ct * 2);
4011 if (allele_idx_m1) {
4012 matching_hom_ct = CountU16(patch_10_fvals, allele_idx_m1 * 0x1111, rare10_ct);
4013 }
4014 }
4015 }
4016 if (!allele_idx_m1) {
4017 *hom_ctp = raw_10_ct - rare10_ct;
4018 } else {
4019 het_incr -= 2 * matching_hom_ct;
4020 *hom_ctp = matching_hom_ct;
4021 }
4022 *het_ctp += het_incr;
4023 }
4024
4025 // Returns allele_code_logwidth.
GetAux1bConsts(uint32_t allele_ct,uintptr_t * detect_hom_mask_lo_ptr)4026 uint32_t GetAux1bConsts(uint32_t allele_ct, uintptr_t* detect_hom_mask_lo_ptr) {
4027 if (allele_ct < 6) {
4028 if (allele_ct == 3) {
4029 *detect_hom_mask_lo_ptr = ~k0LU;
4030 return 0;
4031 }
4032 *detect_hom_mask_lo_ptr = kMask1111;
4033 return 1;
4034 }
4035 if (allele_ct < 18) {
4036 *detect_hom_mask_lo_ptr = kMask0101;
4037 return 2;
4038 }
4039 *detect_hom_mask_lo_ptr = kMask0001;
4040 return 3;
4041 }
4042
4043 // Advances *fread_pp past aux1b; increments *het_ctp by the number of
4044 // altx-alty genotypes in aux1b and sample_include with one allele ==
4045 // allele_idx; and sets *hom_ctp to the number of such hom-allele_idx genotypes
4046 // present. (For allele_idx == 1, *hom_ctp is equal to raw_10_ct -
4047 // <# of aux1b entries> when there's no subsetting.)
4048 // Trailing bits of raw_genoarr must be cleared.
4049 // Ok for subsetted_10_ct to be uninitialized if not subsetting, or allele_idx
4050 // != 1.
4051 // sample_include assumed to be nullptr if no subsetting required
// Counts occurrences of allele (1-based index allele_idx) within the aux1b
// "patch 10" track of a multiallelic variant: sets *hom_ctp to the number of
// (possibly subsetted) samples homozygous for that allele among the raw 0b10
// genotypes, and adds the corresponding het count to *het_ctp.
// sample_include == nullptr means no subsetting.  *fread_pp is advanced past
// the aux1b track; deltalist_workspace is scratch for mode-1 decoding.
PglErr CountAux1b(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx, uint32_t raw_10_ct, uint32_t subsetted_10_ct, const unsigned char** fread_pp, uint32_t* __restrict het_ctp, uint32_t* __restrict hom_ctp, uint32_t* __restrict deltalist_workspace) {
  if (aux1b_mode == 15) {
    // Mode 15: aux1b track is empty, so every 0b10 genotype is an ordinary
    // hom-ALT1; other alleles get a hom count of zero.
    if (allele_idx == 1) {
      if (sample_include) {
        *hom_ctp = subsetted_10_ct;
      } else {
        *hom_ctp = raw_10_ct;
      }
    } else {
      *hom_ctp = 0;
    }
    return kPglRetSuccess;
  }
  uintptr_t detect_hom_mask_lo;
  const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
  const uint32_t allele_code_width = 1U << allele_code_logwidth;
  // Each patch entry stores two allele codes (or 1 bit total when
  // allele_ct == 3).
  const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
  const uint32_t code10_width = 1U << code10_logwidth;
  const uint32_t allele_idx_m1 = allele_idx - 1;
  // Forces a fresh fvals word-load on first use.
  uint32_t rare10_lowbits = kBitsPerWord;
  if (!aux1b_mode) {
    // Mode 0: fset is a bitarray over the raw 0b10 genotypes ("10-collapsed
    // bitarray"); set bits mark entries with an explicit patch value.
    const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
    const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
#ifdef __arm__
#  error "Unaligned accesses in CountAux1b()."
#endif
    const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
    *fread_pp += fset_byte_ct;
    const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
    const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, 8);
    if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    if (!sample_include) {
      CountAux1bDense(patch_10_fvalsw, allele_ct, allele_idx_m1, raw_10_ct, rare10_ct, het_ctp, hom_ctp);
      return kPglRetSuccess;
    }
    const Halfword* sample_include_hw = R_CAST(const Halfword*, sample_include);
    uintptr_t sample_hwidx = 0;
    uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
    const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
    const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
    uintptr_t fvals_bits = 0;
    uint32_t fvals_widx = 0;
    uint32_t loop_len = kBitsPerWord;
    if ((!allele_idx_m1) || (allele_ct == 3)) {
      // bugfix (29 Dec 2019)
      const uintptr_t detect_alt1_mask_hi = detect_hom_mask_lo << (allele_code_width - 1);
      uint32_t subsetted_rare10_ct = 0;
      uint32_t het_1x_ct = 0;
      for (uint32_t fset_widx = 0; ; ++fset_widx) {
        uintptr_t fset_bits;
        if (fset_widx >= fset_word_ct_m1) {
          if (fset_widx > fset_word_ct_m1) {
            break;
          }
          // Final (possibly partial) fset word.
          fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
          loop_len = ModNz(raw_10_ct, kBitsPerWord);
        } else {
          fset_bits = patch_10_fsetw[fset_widx];
        }
        for (uint32_t uii = 0; uii != loop_len; ++uii) {
          // Advance to the next raw 0b10 genotype.
          while (!cur_raw_genoarr_xys) {
            cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
          }
          if (fset_bits & 1) {
            if (rare10_lowbits == kBitsPerWord) {
              // Exhausted the current fvals word; load the next one.
              if (fvals_widx == fvals_word_ct_m1) {
                fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
              } else {
                fvals_bits = patch_10_fvalsw[fvals_widx];
              }
              // This sets each fvals_bits entry to 1 iff the patch genotype is
              // ALT1-ALTx, i.e. the original low bits were zero.
              fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
              // unnecessary to apply bzhi here
              ++fvals_widx;
              rare10_lowbits = 0;
            }
            const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
            if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
              ++subsetted_rare10_ct;
              het_1x_ct += (fvals_bits >> rare10_lowbits) & 1;
            }
            rare10_lowbits += code10_width;
          }
          cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
          fset_bits = fset_bits >> 1;
        }
      }
      if (allele_ct == 3) {
        if (allele_idx_m1) {
          // allele_idx == 2: patched entries not marked het-1/2 are hom-2/2.
          *hom_ctp = subsetted_rare10_ct - het_1x_ct;
          *het_ctp += het_1x_ct;
          return kPglRetSuccess;
        }
      }
      // allele_idx == 1: unpatched 0b10 entries are hom-ALT1.
      *hom_ctp = subsetted_10_ct - subsetted_rare10_ct;
      *het_ctp += het_1x_ct;
      return kPglRetSuccess;
    }
    // allele_idx > 1, allele_ct > 3
    const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
    const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
    const uintptr_t xor_word = allele_idx_m1 * detect_all_mask_lo;
    uint32_t matching_allele_ct = 0;  // 2x hom + 1x het
    uint32_t matching_het_or_hom_ct = 0;
    for (uint32_t fset_widx = 0; ; ++fset_widx) {
      uintptr_t fset_bits;
      if (fset_widx >= fset_word_ct_m1) {
        if (fset_widx > fset_word_ct_m1) {
          break;
        }
        fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
        loop_len = ModNz(raw_10_ct, kBitsPerWord);
      } else {
        fset_bits = patch_10_fsetw[fset_widx];
      }
      for (uint32_t uii = 0; uii != loop_len; ++uii) {
        while (!cur_raw_genoarr_xys) {
          cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
        }
        if (fset_bits & 1) {
          if (rare10_lowbits == kBitsPerWord) {
            if (fvals_widx == fvals_word_ct_m1) {
              fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
            } else {
              fvals_bits = patch_10_fvalsw[fvals_widx];
            }
            // After the xor + mask-subtraction trick, each allele code slot
            // holds 1 iff it equals allele_idx; the following add collapses
            // the two codes of each entry into a 0..2 per-entry hit count.
            fvals_bits ^= xor_word;
            fvals_bits = (detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)))) >> (allele_code_width - 1);
            // unnecessary to apply bzhi or detect_hom_mask_lo here
            fvals_bits = fvals_bits + (fvals_bits >> allele_code_width);
            ++fvals_widx;
            rare10_lowbits = 0;
          }
          const uintptr_t cur_hit_ct = (fvals_bits >> rare10_lowbits) & 3;
          rare10_lowbits += code10_width;
          if (cur_hit_ct) {
            const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
            if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
              ++matching_het_or_hom_ct;
              matching_allele_ct += cur_hit_ct;
            }
          }
        }
        cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
        fset_bits = fset_bits >> 1;
      }
    }
    // hit_ct == 2 for a hom entry, 1 for a het entry, so:
    const uint32_t matching_hom_ct = matching_allele_ct - matching_het_or_hom_ct;
    *hom_ctp = matching_hom_ct;
    *het_ctp += matching_het_or_hom_ct - matching_hom_ct;
    return kPglRetSuccess;
  }
  // mode 1: difflist.
  if (!sample_include) {
    // No subsetting: skip the sample-id deltalist, count directly from fvals.
    const unsigned char* group_info_iter;
    uint32_t rare10_ct;
    PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare10_ct);
    // rare10_ct == 0 should be impossible
    if (unlikely(reterr)) {
      return reterr;
    }
    reterr = SkipDeltalistIds(fread_end, group_info_iter, rare10_ct, raw_sample_ct, 0, fread_pp);
    if (unlikely(reterr)) {
      return reterr;
    }
    const unsigned char* patch_10_fvals = *fread_pp;
    const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
    if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    CountAux1bDense(patch_10_fvals, allele_ct, allele_idx_m1, raw_10_ct, rare10_ct, het_ctp, hom_ctp);
    return kPglRetSuccess;
  }
  // Save deltalist elements, iterate.
  uint32_t rare10_ct;
  PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
  if (unlikely(reterr)) {
    return reterr;
  }
  const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
  const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
  if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
    return kPglRetMalformedInput;
  }
  const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
  if ((!allele_idx_m1) || (allele_ct == 3)) {
    const uintptr_t detect_alt1_mask_hi = detect_hom_mask_lo << (allele_code_width - 1);
    uint32_t subsetted_rare10_ct = 0;
    uint32_t het_1x_ct = 0;
    // Number of patch entries packed into one word.
    uint32_t loop_len = kBitsPerWord >> code10_logwidth;
    for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
      uintptr_t fvals_bits;
      if (fvals_widx >= fvals_word_ct_m1) {
        if (fvals_widx > fvals_word_ct_m1) {
          break;
        }
        fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
        loop_len = 1 + ((rare10_ct - 1) & ((kBitsPerWord >> code10_logwidth) - 1));
      } else {
        fvals_bits = patch_10_fvalsw[fvals_widx];
      }
      // Entry -> 1 iff patch genotype is het ALT1/ALTx (low code zero).
      fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
      const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
      for (uint32_t uii = 0; uii != loop_len; ++uii) {
        const uint32_t sample_uidx = cur_deltalist_base[uii];
        if (IsSet(sample_include, sample_uidx)) {
          ++subsetted_rare10_ct;
          het_1x_ct += (fvals_bits >> (uii << code10_logwidth)) & 1;
        }
      }
    }
    if (allele_ct == 3) {
      if (allele_idx_m1) {
        *hom_ctp = subsetted_rare10_ct - het_1x_ct;
        *het_ctp += het_1x_ct;
        return kPglRetSuccess;
      }
    }
    *hom_ctp = subsetted_10_ct - subsetted_rare10_ct;
    *het_ctp += het_1x_ct;
    return kPglRetSuccess;
  }
  // allele_idx > 1, allele_ct > 3
  const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
  const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
  detect_hom_mask_lo = detect_hom_mask_lo * 3;
  const uintptr_t xor_word = allele_idx_m1 * detect_all_mask_lo;
  uint32_t matching_het_or_hom_ct = 0;
  uint32_t matching_hom_ct = 0;
  for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
    uintptr_t fvals_bits;
    if (fvals_widx >= fvals_word_ct_m1) {
      if (fvals_widx > fvals_word_ct_m1) {
        break;
      }
      fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
    } else {
      fvals_bits = patch_10_fvalsw[fvals_widx];
    }
    // Mark allele codes equal to allele_idx.
    fvals_bits = fvals_bits ^ xor_word;
    fvals_bits = detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)));
    if (fvals_widx == fvals_word_ct_m1) {
      // Mask out garbage beyond the final partial word's valid entries.
      fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
    }
    if (!fvals_bits) {
      continue;
    }
    // Per entry: bit 0 set iff >=1 matching code, bit 1 set iff hom.
    fvals_bits = fvals_bits >> (allele_code_width - 1);
    fvals_bits = (fvals_bits + (fvals_bits >> allele_code_width)) & detect_hom_mask_lo;
    const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
    do {
      const uint32_t bit_idx = ctzw(fvals_bits);
      const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
      if (IsSet(sample_include, sample_uidx)) {
        ++matching_het_or_hom_ct;
        matching_hom_ct += bit_idx & 1;
      }
      fvals_bits &= fvals_bits - 1;
    } while (fvals_bits);
  }
  *hom_ctp = matching_hom_ct;
  *het_ctp += matching_het_or_hom_ct - matching_hom_ct;
  return kPglRetSuccess;
}
4320
// Fills genocounts with, for the requested allele of variant vidx:
// [0] = hom-allele count, [1] = het count, [2] = remaining non-missing count,
// [3] = missing count (written by the Genoarr*Freqs call below), over the
// sample_ct samples selected by sample_include.
PglErr PgrGetInv1Counts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts) {
  // May use workspace_vec and workspace_difflist_sample_ids.
  if (!sample_ct) {
    STD_ARRAY_REF_FILL0(4, genocounts);
    return kPglRetSuccess;
  }
  PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
  const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
  PglErr reterr;
  if ((!allele_idx) || (!allele_idx_offsets)) {
    // Biallelic (or REF-allele) fast path; allele_ct == 2 jumps here too.
  PgrGetInv1Counts_biallelic:
    reterr = GetBasicGenotypeCounts(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, genocounts);
    if (allele_idx) {
      // Basic counts are REF-oriented; swap hom-REF and hom-ALT1 when the
      // caller asked about ALT1.
      const uint32_t homref_ct = genocounts[0];
      genocounts[0] = genocounts[2];
      genocounts[2] = homref_ct;
    }
    return reterr;
  }
  const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
  if (allele_ct == 2) {
    goto PgrGetInv1Counts_biallelic;
  }
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  uintptr_t* tmp_genovec = pgrp->workspace_vec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, tmp_genovec);
  if (unlikely(reterr)) {
    return reterr;
  }
  ZeroTrailingNyps(raw_sample_ct, tmp_genovec);
  // First aux1 byte packs the aux1a (low nybble) and aux1b (high nybble)
  // track modes.
  const uint32_t aux1_first_byte = *fread_ptr++;
  const uint32_t aux1a_mode = aux1_first_byte & 15;
  const uint32_t aux1b_mode = aux1_first_byte >> 4;
  // raw_01_ct not needed when aux1a uses difflist form and subsetting is
  // occurring; same applies to raw_10_ct.
  uint32_t raw_01_ct = 0;
  uint32_t raw_10_ct = 0;
  if ((!subsetting_required) || (!aux1a_mode) || (!aux1b_mode)) {
    GenoarrCountFreqsUnsafe(tmp_genovec, raw_sample_ct, genocounts);
    raw_01_ct = genocounts[1];
    raw_10_ct = genocounts[2];
  }
  uint32_t subsetted_01_ct = 0;
  uint32_t subsetted_10_ct = 0;
  if (subsetting_required) {
    // need accurate subsetted missing count for allele_idx > 1 case
    GenoarrCountSubsetFreqs(tmp_genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
    subsetted_01_ct = genocounts[1];
    subsetted_10_ct = genocounts[2];
  } else {
    // Downstream helpers treat nullptr as "no subsetting".
    sample_include = nullptr;
  }
  uint32_t het_ct;
  reterr = CountAux1a(fread_end, sample_include, tmp_genovec, aux1a_mode, raw_sample_ct, allele_ct, allele_idx, raw_01_ct, subsetted_01_ct, &fread_ptr, &het_ct, pgrp->workspace_difflist_sample_ids);
  if (unlikely(reterr)) {
    return reterr;
  }
  uint32_t hom_ct;
  reterr = CountAux1b(fread_end, sample_include, tmp_genovec, aux1b_mode, raw_sample_ct, allele_ct, allele_idx, raw_10_ct, subsetted_10_ct, &fread_ptr, &het_ct, &hom_ct, pgrp->workspace_difflist_sample_ids);
  genocounts[0] = hom_ct;
  genocounts[1] = het_ct;
  // genocounts[3] still holds the missing count from the frequency scan above.
  genocounts[2] = sample_ct - genocounts[3] - hom_ct - het_ct;
  return reterr;
}
4389
// sample_include assumed to be nullptr if no subsetting required
//
// Applies the aux1a ("patch 01") track to target_genoarr in place: for each
// raw 0b01 genotype whose patch record matches allele_idx, the sample's 2-bit
// slot in target_genoarr is xored with lshifted_bit.  *fread_pp is advanced
// past the aux1a track; deltalist_workspace is scratch for mode-1 decoding.
PglErr GenoarrAux1aUpdate(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx, uintptr_t lshifted_bit, uint32_t raw_01_ct, const unsigned char** fread_pp, uintptr_t* __restrict target_genoarr, uint32_t* __restrict deltalist_workspace) {
  if (aux1a_mode == 15) {
    // Mode 15: no aux1a patches; nothing to do.
    return kPglRetSuccess;
  }
  // When allele_idx == 1 or allele_ct == 3, set membership in fset alone
  // determines the update, so fvals can be skipped.
  const uint32_t ignore_01_fvals = (allele_idx == 1) || (allele_ct == 3);
  uintptr_t detect_mask_hi;
  uintptr_t detect_mask_lo;
  uint32_t allele_code_logwidth;
  const uint32_t allele_code_width = GetAux1aConsts(allele_ct, &detect_mask_hi, &detect_mask_lo, &allele_code_logwidth);
  const uintptr_t xor_word = (allele_idx - 2) * detect_mask_lo;
  if (!aux1a_mode) {
    // Mode 0: fset is a bitarray over the raw 0b01 genotypes.
#ifdef __arm__
#  error "Unaligned accesses in GenoarrAux1aUpdate()."
#endif
    const uintptr_t* patch_01_fsetw = R_CAST(const uintptr_t*, *fread_pp);
    const uint32_t fset_byte_ct = DivUp(raw_01_ct, 8);
    uint32_t rare01_ct = 0;
    if (allele_ct > 3) {
      // fvals only exists when allele_ct > 3.
      rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
    }
    *fread_pp += fset_byte_ct;
    const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
    uintptr_t sample_hwidx = 0;
    uintptr_t cur_raw_genoarr_hets = Word01(raw_genoarr[0]);
    uint32_t loop_len = kBitsPerWord;
    const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
    if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
    const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
    const uint32_t lshift = lshifted_bit - 1;
    uintptr_t fvals_bits = 0;
    uint32_t fvals_widx = 0;
    // Forces a fresh fvals word-load on first use.
    uint32_t rare01_lowbits = kBitsPerWord;
    for (uint32_t fset_widx = 0; ; ++fset_widx) {
      uintptr_t fset_bits;
      if (fset_widx >= fset_word_ct_m1) {
        if (fset_widx > fset_word_ct_m1) {
          return kPglRetSuccess;
        }
        // Final (possibly partial) fset word.
        fset_bits = SubwordLoad(&(patch_01_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
        loop_len = ModNz(raw_01_ct, kBitsPerWord);
      } else {
        fset_bits = patch_01_fsetw[fset_widx];
      }
      if (!sample_include) {
        if (ignore_01_fvals) {
          for (uint32_t uii = 0; uii != loop_len; ++uii) {
            while (!cur_raw_genoarr_hets) {
              cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
            }
            if (fset_bits & 1) {
              // ref/altx present for x>1. Change genovec entry from 01 to 11
              // (or 11 -> 01 in allele_idx == 2, allele_ct == 3 case; same xor
              // operation works for that)
              const uintptr_t lowbit = cur_raw_genoarr_hets & (-cur_raw_genoarr_hets);
              target_genoarr[sample_hwidx] ^= lowbit << lshift;
            }
            cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
            fset_bits = fset_bits >> 1;
          }
        } else {
          for (uint32_t uii = 0; uii != loop_len; ++uii) {
            while (!cur_raw_genoarr_hets) {
              cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
            }
            if (fset_bits & 1) {
              if (rare01_lowbits == kBitsPerWord) {
                if (fvals_widx == fvals_word_ct_m1) {
                  fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
                } else {
                  fvals_bits = patch_01_fvalsw[fvals_widx];
                }
                // Mark allele codes equal to allele_idx.
                fvals_bits = fvals_bits ^ xor_word;
                fvals_bits = (detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)))) >> (allele_code_width - 1);
                // unnecessary to apply bzhi here
                ++fvals_widx;
                rare01_lowbits = 0;
              }
              if (fvals_bits & (k1LU << rare01_lowbits)) {
                const uintptr_t lowbit = cur_raw_genoarr_hets & (-cur_raw_genoarr_hets);
                target_genoarr[sample_hwidx] ^= lowbit << lshift;
              }
              rare01_lowbits += allele_code_width;
            }
            cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
            fset_bits = fset_bits >> 1;
          }
        }
      } else {
        // format 0, sample_include non-null
        if (ignore_01_fvals) {
          for (uint32_t uii = 0; uii != loop_len; ++uii) {
            while (!cur_raw_genoarr_hets) {
              cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
            }
            if (fset_bits & 1) {
              // Considered replacing cur_raw_genoarr_hets with the result of
              // two PackWordToHalfword() operations, since that keeps all
              // the sample word-indexes aligned. Couldn't justify it given
              // the expected sparsity of this case, though.
              const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
              if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
                const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
                target_genoarr[sample_idx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_idx % kBitsPerWordD2));
              }
            }
            cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
            fset_bits = fset_bits >> 1;
          }
        } else {
          for (uint32_t uii = 0; uii != loop_len; ++uii) {
            while (!cur_raw_genoarr_hets) {
              cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
            }
            if (fset_bits & 1) {
              if (rare01_lowbits == kBitsPerWord) {
                if (fvals_widx == fvals_word_ct_m1) {
                  fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
                } else {
                  fvals_bits = patch_01_fvalsw[fvals_widx];
                }
                fvals_bits = fvals_bits ^ xor_word;
                fvals_bits = (detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)))) >> (allele_code_width - 1);
                // unnecessary to apply bzhi here
                ++fvals_widx;
                rare01_lowbits = 0;
              }
              if (fvals_bits & (k1LU << rare01_lowbits)) {
                const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
                if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
                  const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
                  target_genoarr[sample_idx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_idx % kBitsPerWordD2));
                }
              }
              rare01_lowbits += allele_code_width;
            }
            cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
            fset_bits = fset_bits >> 1;
          }
        }
      }
    }
  }
  // aux1a_mode == 1
  uint32_t rare01_ct;
  // Might hardcode the ParseAndSaveDeltalist logic later, but lets get
  // this working first.
  PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare01_ct);
  if (unlikely(reterr)) {
    return reterr;
  }
  const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
  const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
  if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
    return kPglRetMalformedInput;
  }
  if (ignore_01_fvals) {
    if (!sample_include) {
      for (uint32_t rare01_idx = 0; rare01_idx != rare01_ct; ++rare01_idx) {
        const uint32_t sample_uidx = deltalist_workspace[rare01_idx];
        // todo: benchmark against k1LU << (lshift + ...)
        target_genoarr[sample_uidx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_uidx % kBitsPerWordD2));
      }
      return kPglRetSuccess;
    }
    for (uint32_t rare01_idx = 0; rare01_idx != rare01_ct; ++rare01_idx) {
      const uint32_t sample_uidx = deltalist_workspace[rare01_idx];
      // could wrap this boilerplate
      const uint32_t sample_widx = sample_uidx / kBitsPerWord;
      const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
      const uintptr_t sample_include_word = sample_include[sample_widx];
      if (sample_include_word & lowbit) {
        const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
        target_genoarr[sample_idx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_idx % kBitsPerWordD2));
      }
    }
    return kPglRetSuccess;
  }
  const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
  for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
    uintptr_t fvals_bits;
    if (fvals_widx >= fvals_word_ct_m1) {
      if (fvals_widx > fvals_word_ct_m1) {
        return kPglRetSuccess;
      }
      fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
    } else {
      fvals_bits = patch_01_fvalsw[fvals_widx];
    }
    // Mark allele codes equal to allele_idx.
    fvals_bits = fvals_bits ^ xor_word;
    fvals_bits = detect_mask_hi & (~(fvals_bits | ((fvals_bits | detect_mask_hi) - detect_mask_lo)));
    if (fvals_widx == fvals_word_ct_m1) {
      // Mask out garbage beyond the final partial word's valid entries.
      fvals_bits = bzhi_max(fvals_bits, ModNz(rare01_ct << allele_code_logwidth, kBitsPerWord));
    }
    if (!fvals_bits) {
      continue;
    }
    const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - allele_code_logwidth)]);
    if (!sample_include) {
      do {
        const uint32_t rare01_idx_lowbits = ctzw(fvals_bits) >> allele_code_logwidth;
        const uint32_t sample_uidx = cur_deltalist_base[rare01_idx_lowbits];
        target_genoarr[sample_uidx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_uidx % kBitsPerWordD2));
        fvals_bits &= fvals_bits - 1;
      } while (fvals_bits);
    } else {
      do {
        const uint32_t rare01_idx_lowbits = ctzw(fvals_bits) >> allele_code_logwidth;
        const uint32_t sample_uidx = cur_deltalist_base[rare01_idx_lowbits];
        const uint32_t sample_widx = sample_uidx / kBitsPerWord;
        const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
        const uintptr_t sample_include_word = sample_include[sample_widx];
        if (sample_include_word & lowbit) {
          const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
          target_genoarr[sample_idx / kBitsPerWordD2] ^= lshifted_bit << (2 * (sample_idx % kBitsPerWordD2));
        }
        fvals_bits &= fvals_bits - 1;
      } while (fvals_bits);
    }
  }
}
4614
4615 // sample_include assumed to be nullptr if no subsetting required
GenoarrAux1bStandardUpdate(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t allele_idx,uint32_t raw_10_ct,const unsigned char ** fread_pp,uintptr_t * __restrict target_genoarr,uint32_t * __restrict deltalist_workspace)4616 PglErr GenoarrAux1bStandardUpdate(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict target_genoarr, uint32_t* __restrict deltalist_workspace) {
4617 if (aux1b_mode == 15) {
4618 return kPglRetSuccess;
4619 }
4620 const uint32_t allele_idx_m1 = allele_idx - 1;
4621 uintptr_t detect_hom_mask_lo;
4622 const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
4623 const uint32_t allele_code_width = 1U << allele_code_logwidth;
4624 const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
4625 const uint32_t code10_width = 1U << code10_logwidth;
4626 uint32_t rare10_lowbits = kBitsPerWord;
4627 if (!aux1b_mode) {
4628 #ifdef __arm__
4629 # error "Unaligned accesses in GenoarrAux1bStandardUpdate()."
4630 #endif
4631 const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
4632 const uint32_t fset_byte_ct = DivUp(raw_10_ct, 8);
4633 const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
4634 const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
4635 *fread_pp += fset_byte_ct;
4636 uintptr_t sample_hwidx = 0;
4637 uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
4638 const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
4639 const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, CHAR_BIT);
4640 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
4641 return kPglRetMalformedInput;
4642 }
4643 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
4644 uintptr_t fvals_bits = 0;
4645 uint32_t fvals_widx = 0;
4646 uint32_t loop_len = kBitsPerWord;
4647 if ((!allele_idx_m1) || (allele_ct == 3)) {
4648 // bugfix (29 Dec 2019)
4649 const uintptr_t detect_alt1_mask_hi = detect_hom_mask_lo << (allele_code_width - 1);
4650 // If allele_ct == 3:
4651 // code10_width = 1
4652 // 0 -> 1/2, 1 -> 2/2
4653 // if allele_idx == 1:
4654 // we want to convert 2 -> 1 for 1/2 genotypes, and 2 -> 0 for 2/2.
4655 // if allele_idx == 2:
4656 // we want to convert 0 -> 1 for 1/2 genotypes, and 0 -> 2 for 2/2.
4657 // If allele_ct == 4 (allele_idx == 1 forced):
4658 // allele_code_width = 2
4659 // code10_width = 4
4660 // we want to convert 2 -> 1 for 1/x genotypes, and 2 -> 0 otherwise.
4661 const uint32_t lowcode_add = 2 - allele_idx_m1;
4662 for (uint32_t fset_widx = 0; ; ++fset_widx) {
4663 uintptr_t fset_bits;
4664 if (fset_widx >= fset_word_ct_m1) {
4665 if (fset_widx > fset_word_ct_m1) {
4666 break;
4667 }
4668 fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
4669 loop_len = ModNz(raw_10_ct, kBitsPerWord);
4670 } else {
4671 fset_bits = patch_10_fsetw[fset_widx];
4672 }
4673 if (!sample_include) {
4674 for (uint32_t uii = 0; uii != loop_len; ++uii) {
4675 while (!cur_raw_genoarr_xys) {
4676 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
4677 }
4678 if (fset_bits & 1) {
4679 if (rare10_lowbits == kBitsPerWord) {
4680 if (fvals_widx == fvals_word_ct_m1) {
4681 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4682 } else {
4683 fvals_bits = patch_10_fvalsw[fvals_widx];
4684 }
4685 // modify to het 1/x = 1, otherwise 0, except in allele_idx ==
4686 // 2 special case.
4687 if (!allele_idx_m1) {
4688 fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
4689 }
4690 // unnecessary to apply bzhi here
4691 ++fvals_widx;
4692 rare10_lowbits = 0;
4693 }
4694 const uint32_t cur_lowcode0 = (fvals_bits >> rare10_lowbits) & 1;
4695 rare10_lowbits += code10_width;
4696 const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
4697 target_genoarr[sample_hwidx] ^= lowbit * (lowcode_add + cur_lowcode0);
4698 }
4699 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
4700 fset_bits = fset_bits >> 1;
4701 }
4702 } else {
4703 // sample_include non-null
4704 for (uint32_t uii = 0; uii != loop_len; ++uii) {
4705 while (!cur_raw_genoarr_xys) {
4706 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
4707 }
4708 if (fset_bits & 1) {
4709 if (rare10_lowbits == kBitsPerWord) {
4710 if (fvals_widx == fvals_word_ct_m1) {
4711 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4712 } else {
4713 fvals_bits = patch_10_fvalsw[fvals_widx];
4714 }
4715 // modify to het 1/x = 1, otherwise 0, except in allele_idx ==
4716 // 2 special case
4717 if (!allele_idx_m1) {
4718 fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
4719 }
4720 // unnecessary to apply bzhi here
4721 ++fvals_widx;
4722 rare10_lowbits = 0;
4723 }
4724 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
4725 if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
4726 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
4727 const uintptr_t cur_lowcode0 = (fvals_bits >> rare10_lowbits) & 1;
4728 const uintptr_t shifted_xor_mult = (lowcode_add + cur_lowcode0) << (2 * (sample_idx % kBitsPerWordD2));
4729 target_genoarr[sample_idx / kBitsPerWordD2] ^= shifted_xor_mult;
4730 }
4731 rare10_lowbits += code10_width;
4732 }
4733 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
4734 fset_bits = fset_bits >> 1;
4735 }
4736 }
4737 }
4738 return kPglRetSuccess;
4739 }
4740 // allele_idx > 1, allele_ct > 3
4741 const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
4742 const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
4743 const uintptr_t xor_word = allele_idx_m1 * detect_all_mask_lo;
4744 for (uint32_t fset_widx = 0; ; ++fset_widx) {
4745 uintptr_t fset_bits;
4746 if (fset_widx >= fset_word_ct_m1) {
4747 if (fset_widx > fset_word_ct_m1) {
4748 break;
4749 }
4750 fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
4751 loop_len = ModNz(raw_10_ct, kBitsPerWord);
4752 } else {
4753 fset_bits = patch_10_fsetw[fset_widx];
4754 }
4755 if (!sample_include) {
4756 for (uint32_t uii = 0; uii != loop_len; ++uii) {
4757 while (!cur_raw_genoarr_xys) {
4758 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
4759 }
4760 if (fset_bits & 1) {
4761 if (rare10_lowbits == kBitsPerWord) {
4762 if (fvals_widx == fvals_word_ct_m1) {
4763 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4764 } else {
4765 fvals_bits = patch_10_fvalsw[fvals_widx];
4766 }
4767 // modify to hom = 2, het = 1, neither = 0
4768 fvals_bits = fvals_bits ^ xor_word;
4769 fvals_bits = (detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)))) >> (allele_code_width - 1);
4770 // unnecessary to apply bzhi or detect_hom_mask_lo here
4771 fvals_bits = fvals_bits + (fvals_bits >> allele_code_width);
4772 ++fvals_widx;
4773 rare10_lowbits = 0;
4774 }
4775 const uintptr_t cur_hit_ct = (fvals_bits >> rare10_lowbits) & 3;
4776 rare10_lowbits += code10_width;
4777 if (cur_hit_ct) {
4778 const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
4779 target_genoarr[sample_hwidx] ^= lowbit * cur_hit_ct;
4780 }
4781 }
4782 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
4783 fset_bits = fset_bits >> 1;
4784 }
4785 } else {
4786 for (uint32_t uii = 0; uii != loop_len; ++uii) {
4787 while (!cur_raw_genoarr_xys) {
4788 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
4789 }
4790 if (fset_bits & 1) {
4791 if (rare10_lowbits == kBitsPerWord) {
4792 if (fvals_widx == fvals_word_ct_m1) {
4793 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4794 } else {
4795 fvals_bits = patch_10_fvalsw[fvals_widx];
4796 }
4797 // modify to hom = 2, het = 1, neither = 0
4798 fvals_bits = fvals_bits ^ xor_word;
4799 fvals_bits = (detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)))) >> (allele_code_width - 1);
4800 if (fvals_widx == fvals_word_ct_m1) {
4801 fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct * code10_width, kBitsPerWord));
4802 }
4803 fvals_bits = fvals_bits + (fvals_bits >> allele_code_width);
4804 ++fvals_widx;
4805 rare10_lowbits = 0;
4806 }
4807 const uintptr_t cur_hit_ct = (fvals_bits >> rare10_lowbits) & 3;
4808 rare10_lowbits += code10_width;
4809 if (cur_hit_ct) {
4810 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
4811 if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
4812 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
4813 target_genoarr[sample_idx / kBitsPerWordD2] ^= cur_hit_ct << (2 * (sample_idx % kBitsPerWordD2));
4814 }
4815 }
4816 }
4817 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
4818 fset_bits = fset_bits >> 1;
4819 }
4820 }
4821 }
4822 return kPglRetSuccess;
4823 }
4824 // aux1b_mode == 1
4825 uint32_t rare10_ct;
4826 PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
4827 if (unlikely(reterr)) {
4828 return reterr;
4829 }
4830 const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
4831 const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
4832 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
4833 return kPglRetMalformedInput;
4834 }
4835 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
4836 if ((!allele_idx_m1) || (allele_ct == 3)) {
4837 // bugfix (29 Dec 2019)
4838 const uintptr_t detect_alt1_mask_hi = detect_hom_mask_lo << (allele_code_width - 1);
4839 const uintptr_t lowcode_add = 2 - allele_idx_m1;
4840 uint32_t loop_len = kBitsPerWord >> code10_logwidth;
4841 for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
4842 uintptr_t fvals_bits;
4843 if (fvals_widx >= fvals_word_ct_m1) {
4844 if (fvals_widx > fvals_word_ct_m1) {
4845 break;
4846 }
4847 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4848 loop_len = 1 + ((rare10_ct - 1) & ((kBitsPerWord >> code10_logwidth) - 1));
4849 } else {
4850 fvals_bits = patch_10_fvalsw[fvals_widx];
4851 }
4852 if (!allele_idx_m1) {
4853 fvals_bits = (detect_alt1_mask_hi & (~(fvals_bits | ((fvals_bits | detect_alt1_mask_hi) - detect_hom_mask_lo)))) >> (allele_code_width - 1);
4854 }
4855 const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
4856 if (!sample_include) {
4857 for (uint32_t uii = 0; uii != loop_len; ++uii) {
4858 const uint32_t sample_uidx = cur_deltalist_base[uii];
4859 const uintptr_t cur_lowcode0 = fvals_bits & 1;
4860 const uintptr_t shifted_xor_mult = (lowcode_add + cur_lowcode0) << (2 * (sample_uidx % kBitsPerWordD2));
4861 target_genoarr[sample_uidx / kBitsPerWordD2] ^= shifted_xor_mult;
4862 fvals_bits = fvals_bits >> code10_width;
4863 }
4864 } else {
4865 for (uint32_t uii = 0; uii != loop_len; ++uii) {
4866 const uint32_t sample_uidx = cur_deltalist_base[uii];
4867 const uint32_t sample_widx = sample_uidx / kBitsPerWord;
4868 const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
4869 const uintptr_t sample_include_word = sample_include[sample_widx];
4870 if (sample_include_word & lowbit) {
4871 const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
4872 const uintptr_t cur_lowcode0 = fvals_bits & 1;
4873 const uintptr_t shifted_xor_mult = (lowcode_add + cur_lowcode0) << (2 * (sample_idx % kBitsPerWordD2));
4874 target_genoarr[sample_idx / kBitsPerWordD2] ^= shifted_xor_mult;
4875 }
4876 fvals_bits = fvals_bits >> code10_width;
4877 }
4878 }
4879 }
4880 return kPglRetSuccess;
4881 }
4882 // allele_idx > 1, allele_ct > 3
4883 const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
4884 const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
4885 detect_hom_mask_lo = detect_hom_mask_lo * 3;
4886 const uintptr_t xor_word = allele_idx_m1 * detect_all_mask_lo;
4887 for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
4888 uintptr_t fvals_bits;
4889 if (fvals_widx >= fvals_word_ct_m1) {
4890 if (fvals_widx > fvals_word_ct_m1) {
4891 break;
4892 }
4893 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
4894 } else {
4895 fvals_bits = patch_10_fvalsw[fvals_widx];
4896 }
4897 fvals_bits = fvals_bits ^ xor_word;
4898 fvals_bits = detect_all_mask_hi & (~(fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)));
4899 if (fvals_widx == fvals_word_ct_m1) {
4900 fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
4901 }
4902 if (!fvals_bits) {
4903 continue;
4904 }
4905 fvals_bits = fvals_bits >> (allele_code_width - 1);
4906 fvals_bits = (fvals_bits + (fvals_bits >> allele_code_width)) & detect_hom_mask_lo;
4907 const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
4908 if (!sample_include) {
4909 do {
4910 const uint32_t bit_idx = ctzw(fvals_bits);
4911 const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
4912 target_genoarr[sample_uidx / kBitsPerWordD2] ^= k1LU << ((bit_idx % 2) + 2 * (sample_uidx % kBitsPerWordD2));
4913 fvals_bits &= fvals_bits - 1;
4914 } while (fvals_bits);
4915 } else {
4916 do {
4917 const uint32_t bit_idx = ctzw(fvals_bits);
4918 const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
4919 const uint32_t sample_widx = sample_uidx / kBitsPerWord;
4920 const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
4921 const uintptr_t sample_include_word = sample_include[sample_widx];
4922 if (sample_include_word & lowbit) {
4923 const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
4924 target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << ((bit_idx % 2) + 2 * (sample_idx % kBitsPerWordD2));
4925 }
4926 fvals_bits &= fvals_bits - 1;
4927 } while (fvals_bits);
4928 }
4929 }
4930 return kPglRetSuccess;
4931 }
4932
// if aux1b_het_present is true, aux1b_hets becomes a 1-bit-per-sample bitarray
// with the positions of altx/alty hets in aux1b.
//
// Scans the aux1b (patch-10) track and extracts which rare-allele 10-coded
// genotypes are actually heterozygous (the two stored allele codes differ).
// Parameters:
//   fread_end: one past the last readable byte of the variant record.
//   raw_genoarr: unsubsetted 2-bit hardcall array; only its 10-coded entries
//     are walked (via Word10()).
//   aux1b_mode: 0 = dense bitset-over-10s representation, 1 = deltalist
//     representation, 15 = track absent.
//   raw_10_ct: number of 10-coded hardcalls; only used by the dense mode to
//     size the fset bitarray.
//   fread_pp: in/out read cursor; advanced past the aux1b track on success.
//   aux1b_hets: output bitarray; only zero-initialized (lazily) and written
//     when at least one het is found.
//   aux1b_het_presentp: output flag; 0 means aux1b_hets was never touched.
//   deltalist_workspace: scratch for mode-1 sample-ID decoding.
PglErr GetAux1bHets(const unsigned char* fread_end, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict aux1b_hets, uint32_t* __restrict aux1b_het_presentp, uint32_t* __restrict deltalist_workspace) {
  if (aux1b_mode == 15) {
    // Track absent: every 10-coded hardcall is an ordinary alt1/alt1 hom.
    *aux1b_het_presentp = 0;
    return kPglRetSuccess;
  }
  uintptr_t detect_hom_mask_lo;
  const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
  // Each aux1b entry stores two allele codes of allele_code_width bits each
  // (except allele_ct == 3, where a single bit suffices).
  const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
  const uint32_t code10_width = 1U << code10_logwidth;
  const uint32_t allele_code_width = 1U << allele_code_logwidth;
  const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
  const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
  Halfword* aux1b_hets_alias = R_CAST(Halfword*, aux1b_hets);
  // rare10_lowbits == kBitsPerWord forces a refill of fvals_bits on first use.
  uint32_t rare10_lowbits = kBitsPerWord;
  uint32_t aux1b_het_present = 0;
  if (!aux1b_mode) {
#ifdef __arm__
#  error "Unaligned accesses in GetAux1bHets()."
#endif
    // Dense mode: a bitarray (fset) with one bit per 10-coded hardcall marks
    // which of them carry an aux1b entry; packed entries (fvals) follow.
    const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
    const uint32_t fset_byte_ct = DivUp(raw_10_ct, 8);
    const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
    const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
    *fread_pp += fset_byte_ct;
    uintptr_t sample_hwidx = 0;
    uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
    const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
    const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, CHAR_BIT);
    if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
    uintptr_t fvals_bits = 0;
    uint32_t fvals_widx = 0;
    uint32_t loop_len = kBitsPerWord;
    for (uint32_t fset_widx = 0; ; ++fset_widx) {
      uintptr_t fset_bits;
      if (fset_widx >= fset_word_ct_m1) {
        if (fset_widx > fset_word_ct_m1) {
          break;
        }
        // Final (possibly partial) fset word.
        fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
        loop_len = ModNz(raw_10_ct, kBitsPerWord);
      } else {
        fset_bits = patch_10_fsetw[fset_widx];
      }
      for (uint32_t uii = 0; uii != loop_len; ++uii) {
        // Advance to the next 10-coded hardcall in raw_genoarr.
        while (!cur_raw_genoarr_xys) {
          cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
        }
        if (fset_bits & 1) {
          if (rare10_lowbits == kBitsPerWord) {
            // Refill: load next fvals word and transform it so that bit
            // (entry_idx * code10_width) is set iff the entry is a het.
            if (fvals_widx == fvals_word_ct_m1) {
              fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
            } else {
              fvals_bits = patch_10_fvalsw[fvals_widx];
            }
            // allele_ct == 3: just invert raw fvals_bits
            // allele_ct > 3: shift by allele_code_width, xor with self so that
            // 0 == hom, detect nonzero by inverting the usual check
            if (allele_ct == 3) {
              fvals_bits = ~fvals_bits;
            } else {
              fvals_bits = fvals_bits ^ (fvals_bits << allele_code_width);
              // conveniently, removing a ~ here is equivalent to inverting the
              // relevant bits of the final result
              fvals_bits = detect_hom_mask_lo & ((fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)) >> (code10_width - 1));
            }
            // bzhi only relevant for detecting if there are any hets at all
            if (!aux1b_het_present) {
              if (fvals_widx == fvals_word_ct_m1) {
                // Mask out garbage past the last real entry.
                fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct * code10_width, kBitsPerWord));
              }
              if (fvals_bits) {
                // lazy-initialize
                aux1b_het_present = 1;
                ZeroHwArr(2 * BitCtToWordCt(raw_sample_ct), aux1b_hets_alias);
              }
            }
            ++fvals_widx;
            rare10_lowbits = 0;
          }
          if (fvals_bits & (k1LU << rare10_lowbits)) {
            // Current entry is a het; record the sample's position.
            const uint32_t bit_idx = ctzw(cur_raw_genoarr_xys) / 2;
            aux1b_hets_alias[sample_hwidx] |= 1U << bit_idx;
          }
          rare10_lowbits += code10_width;
        }
        cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
        fset_bits = fset_bits >> 1;
      }
    }
    *aux1b_het_presentp = aux1b_het_present;
    return kPglRetSuccess;
  }
  // aux1b_mode == 1
  // Sparse mode: explicit deltalist of sample IDs plus packed entries.
  uint32_t rare10_ct;
  PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
  if (unlikely(reterr)) {
    return reterr;
  }
  const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
  const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
  if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
    return kPglRetMalformedInput;
  }
  const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
  for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
    uintptr_t fvals_bits;
    if (fvals_widx >= fvals_word_ct_m1) {
      if (fvals_widx > fvals_word_ct_m1) {
        break;
      }
      fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
    } else {
      fvals_bits = patch_10_fvalsw[fvals_widx];
    }
    // Same het-detection transform as the dense branch above.
    if (allele_ct == 3) {
      fvals_bits = ~fvals_bits;
    } else {
      fvals_bits = fvals_bits ^ (fvals_bits << allele_code_width);
      fvals_bits = detect_hom_mask_lo & ((fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo)) >> (code10_width - 1));
    }
    if (fvals_widx == fvals_word_ct_m1) {
      fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
    }
    if (!fvals_bits) {
      continue;
    }
    if (!aux1b_het_present) {
      // lazy-initialize output bitarray on first het
      aux1b_het_present = 1;
      ZeroHwArr(2 * BitCtToWordCt(raw_sample_ct), aux1b_hets_alias);
    }
    const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
    do {
      const uint32_t bit_idx = ctzw(fvals_bits);
      const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
      aux1b_hets_alias[sample_uidx / kBitsPerWordD2] |= 1U << (sample_uidx % kBitsPerWordD2);
      fvals_bits &= fvals_bits - 1;
    } while (fvals_bits);
  }
  *aux1b_het_presentp = aux1b_het_present;
  return kPglRetSuccess;
}
5079
// Loads a single-allele count vector (0/1/2/missing per sample) for
// allele_idx at variant vidx, applying the multiallelic patch tracks
// (aux1a for 01-coded hardcalls, aux1b for 10-coded hardcalls).
// Parameters:
//   sample_include / sample_include_cumulative_popcounts: subsetting info;
//     sample_include is internally nulled out when no subsetting is needed.
//   fread_pp / fread_endp: optional; when non-null, the read cursor and
//     record end are exported so the caller can continue parsing (e.g.
//     phase/dosage tracks).
//   all_hets: optional; when non-null (requires fread_pp), receives the
//     unsubsetted het bitarray including aux1b altx/alty hets.
//   allele_countvec: output 2-bit count vector, subsetted.
//   subsetted_10hetp: assumed pre-initialized to nullptr by the caller; set
//     to a bitarray of subsetted aux1b hets when any exist.
PglErr Get1Multiallelic(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict all_hets, uintptr_t* __restrict allele_countvec, uintptr_t** subsetted_10hetp) {
  // sample_ct > 0; either allele_idx > 1 or ((allele_idx == 1) &&
  // multiallelic_hc_present)
  // subsetted_10het assumed to be initialized to nullptr, if present at all
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  uintptr_t* raw_genovec = pgrp->workspace_vec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
  if (unlikely(reterr)) {
    return reterr;
  }

  const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
  const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
  CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, allele_countvec);
  ZeroTrailingNyps(raw_sample_ct, raw_genovec);
  if (fread_pp) {
    *fread_endp = fread_end;
    if (all_hets) {
      // Hets from the basic hardcalls; aux1b hets are OR'd in at the end.
      PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
    }
  }
  if (allele_idx != 1) {
    // For rarealt queries, start every non-missing count at 0; the patch
    // passes below add the matching counts back in.
    GenovecNonmissingToZeroUnsafe(sample_ct, allele_countvec);
    if (!multiallelic_hc_present) {
      if (fread_pp) {
        *fread_pp = fread_ptr;
      }
      return kPglRetSuccess;
    }
  }
  // aux1 header byte: low nybble = aux1a mode, high nybble = aux1b mode.
  const uint32_t aux1_first_byte = *fread_ptr++;
  const uint32_t aux1a_mode = aux1_first_byte & 15;
  const uint32_t aux1b_mode = aux1_first_byte >> 4;
  // only need to initialize these in dense modes
  uint32_t raw_01_ct = 0;
  uint32_t raw_10_ct = 0;
  if ((!aux1a_mode) || (!aux1b_mode)) {
    GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
  }

  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
  const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
  if (!subsetting_required) {
    // Downstream helpers treat null sample_include as "no subsetting".
    sample_include = nullptr;
  }
  // allele_idx == 1 case:
  //   allele_countvec currently contains ALT counts; we want to reduce them to
  //   ALT1 counts.  This can be done with the following steps:
  //   1. For every element of patch_01_fset, reduce the value from 1 to 0.  We
  //      don't actually need to look at patch_01_fvals.
  //   2. For every element of patch_10_fset, reduce the value from 2 depending
  //      on the low bit(s) of the patch_01_fvals entry (reduce to 0 unless low
  //      bit(s) are all zero).
  // allele_idx > 1 case:
  //   1. For every element of patch_01_fset, set a 1 for each matching value
  //      of patch_01_fvals.
  //   2. For every element of patch_10_fset, set a 1 for each het-matching
  //      value of patch_10_fvals, and a 2 for each hom-match.
  uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
  // Two cases:
  // - If allele_idx == 1, convert all aux1a entries from 01 to 00.
  // - Otherwise, for each matching aux1a entry, convert from 00 to 01.
  reterr = GenoarrAux1aUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, allele_idx, 1, raw_01_ct, &fread_ptr, allele_countvec, deltalist_workspace);
  if (unlikely(reterr)) {
    return reterr;
  }
  // Remember aux1b start so GetAux1bHets() can re-scan it below.
  const unsigned char* aux1b_start = fread_ptr;
  reterr = GenoarrAux1bStandardUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, allele_idx, raw_10_ct, &fread_ptr, allele_countvec, deltalist_workspace);
  if ((!fread_pp) || reterr) {
    return reterr;
  }
  *fread_pp = fread_ptr;
  if (all_hets) {
    // can merge this with GenovecAux1bStandardUpdate if this is ever a
    // significant bottleneck
    uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
    uint32_t aux1b_het_present;
    reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &aux1b_start, aux1b_hets, &aux1b_het_present, deltalist_workspace);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (aux1b_het_present) {
      BitvecOr(aux1b_hets, BitCtToWordCt(raw_sample_ct), all_hets);
      if (!sample_include) {
        *subsetted_10hetp = aux1b_hets;
      } else {
        // Don't need raw_genovec any more.
        CopyBitarrSubset(aux1b_hets, sample_include, sample_ct, raw_genovec);
        *subsetted_10hetp = raw_genovec;
      }
    }
  }
  return kPglRetSuccess;
}
5177
IMPLPgrGet1(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReaderMain * pgrp,uintptr_t * __restrict allele_countvec)5178 PglErr IMPLPgrGet1(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_countvec) {
5179 if (!sample_ct) {
5180 return kPglRetSuccess;
5181 }
5182 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
5183 const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
5184 if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
5185 PglErr reterr = ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, allele_countvec);
5186 if (unlikely(reterr)) {
5187 return reterr;
5188 }
5189 if (!allele_idx) {
5190 GenovecInvertUnsafe(sample_ct, allele_countvec);
5191 }
5192 return kPglRetSuccess;
5193 }
5194 return Get1Multiallelic(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, nullptr, nullptr, nullptr, allele_countvec, nullptr);
5195 }
5196
IMPLPgrGetInv1(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReaderMain * pgrp,uintptr_t * __restrict allele_invcountvec)5197 PglErr IMPLPgrGetInv1(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec) {
5198 if (!sample_ct) {
5199 return kPglRetSuccess;
5200 }
5201 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
5202 const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
5203 if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
5204 PglErr reterr = ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, allele_invcountvec);
5205 if (unlikely(reterr)) {
5206 return reterr;
5207 }
5208 if (allele_idx) {
5209 GenovecInvertUnsafe(sample_ct, allele_invcountvec);
5210 }
5211 return kPglRetSuccess;
5212 }
5213 PglErr reterr = Get1Multiallelic(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, nullptr, nullptr, nullptr, allele_invcountvec, nullptr);
5214 GenovecInvertUnsafe(sample_ct, allele_invcountvec);
5215 return reterr;
5216 }
5217
5218 // Assumes allele_idx0 < allele_idx1, and allele_idx0 < 2. Rotates hardcalls
5219 // such that, if no multiallelic hardcalls are present, 0 = 0/0, 1 = 0/1,
5220 // 2 = 1/1, and 3 = anything else.
Rotate2(uint32_t allele_idx0,uint32_t allele_idx1,uint32_t sample_ct,uintptr_t * genovec)5221 void Rotate2(uint32_t allele_idx0, uint32_t allele_idx1, uint32_t sample_ct, uintptr_t* genovec) {
5222 if (!allele_idx0) {
5223 if (allele_idx1 > 1) {
5224 GenovecNonzeroToMissingUnsafe(sample_ct, genovec);
5225 }
5226 } else {
5227 GenovecInvertThenNonzeroToMissingUnsafe(sample_ct, genovec);
5228 }
5229 }
5230
SkipAux1a(const unsigned char * fread_end,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_01_ct,const unsigned char ** fread_pp)5231 PglErr SkipAux1a(const unsigned char* fread_end, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_01_ct, const unsigned char** fread_pp) {
5232 if (aux1a_mode == 15) {
5233 return kPglRetSuccess;
5234 }
5235 uint32_t rare01_ct;
5236 if (!aux1a_mode) {
5237 const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
5238 rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
5239 *fread_pp += fset_byte_ct;
5240 } else {
5241 const unsigned char* group_info_iter;
5242 PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare01_ct);
5243 if (unlikely(reterr)) {
5244 return reterr;
5245 }
5246 reterr = SkipDeltalistIds(fread_end, group_info_iter, rare01_ct, raw_sample_ct, 0, fread_pp);
5247 if (unlikely(reterr)) {
5248 return reterr;
5249 }
5250 }
5251 const uint32_t fvals_byte_ct = GetAux1aAlleleEntryByteCt(allele_ct, rare01_ct);
5252 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
5253 return kPglRetMalformedInput;
5254 }
5255 return kPglRetSuccess;
5256 }
5257
5258 // sample_include assumed to be nullptr if no subsetting required
GenoarrAux1bUpdate2(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t allele_idx0,uint32_t allele_idx1,uint32_t raw_10_ct,const unsigned char ** fread_pp,uintptr_t * __restrict target_genoarr,uint32_t * __restrict deltalist_workspace)5259 PglErr GenoarrAux1bUpdate2(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t allele_idx0, uint32_t allele_idx1, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict target_genoarr, uint32_t* __restrict deltalist_workspace) {
5260 // Possible aux1b updates:
5261 // - allele_idx0 == 0:
5262 // allele_idx1 == 1: all altx/alty including a rarealt from 10 to 11
5263 // allele_idx1 > 1: set one rarealtx/rarealtx from 11 to 10
5264 //
5265 // - allele_idx0 == 1: change all alt1/rarealtx from 00 to 01,
5266 // rarealtx/rarealtx from 00 to 10, and all other aux1b entries to missing.
5267 // This can use the same driver as Get1Multiallelic.
5268 //
5269 // - allele_idx0 > 1: change all rarealtx/rarealtx from missing to 00,
5270 // rarealtx/rarealty to 01, and rarealty/rarealty to 10.
5271 if (aux1b_mode == 15) {
5272 return kPglRetSuccess;
5273 }
5274 if (allele_idx0 == 1) {
5275 return GenoarrAux1bStandardUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genoarr, aux1b_mode, raw_sample_ct, allele_ct, allele_idx1, raw_10_ct, fread_pp, target_genoarr, deltalist_workspace);
5276 }
5277 uintptr_t detect_hom_mask_lo;
5278 const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
5279 const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
5280 const uint32_t code10_width = 1U << code10_logwidth;
5281 const uintptr_t detect_hom_mask_hi = detect_hom_mask_lo << (code10_width - 1);
5282 uintptr_t xor_word2 = allele_idx1 - 1;
5283 // fortunately, this sequence of operations happens to work for allele_ct ==
5284 // 3
5285 xor_word2 = xor_word2 | (xor_word2 << (code10_width / 2));
5286 xor_word2 = xor_word2 * detect_hom_mask_lo;
5287 uint32_t rare10_lowbits = kBitsPerWord;
5288 if (!aux1b_mode) {
5289 #ifdef __arm__
5290 # error "Unaligned accesses in GenoarrAux1bUpdate2()."
5291 #endif
5292 const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
5293 const uint32_t fset_byte_ct = DivUp(raw_10_ct, 8);
5294 const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
5295 const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
5296 *fread_pp += fset_byte_ct;
5297 uintptr_t sample_hwidx = 0;
5298 uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
5299 const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
5300 const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, CHAR_BIT);
5301 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
5302 return kPglRetMalformedInput;
5303 }
5304 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
5305 uintptr_t fvals_bits = 0;
5306 uint32_t fvals_widx = 0;
5307 uint32_t loop_len = kBitsPerWord;
5308 if (!allele_idx0) {
5309 for (uint32_t fset_widx = 0; ; ++fset_widx) {
5310 uintptr_t fset_bits;
5311 if (fset_widx >= fset_word_ct_m1) {
5312 if (fset_widx > fset_word_ct_m1) {
5313 return kPglRetSuccess;
5314 }
5315 fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
5316 loop_len = ModNz(raw_10_ct, kBitsPerWord);
5317 } else {
5318 fset_bits = patch_10_fsetw[fset_widx];
5319 }
5320 if (!sample_include) {
5321 if (allele_idx1 == 1) {
5322 // All aux1b 10 -> 11. Ignore aux1b_fvals.
5323 for (uint32_t uii = 0; uii != loop_len; ++uii) {
5324 while (!cur_raw_genoarr_xys) {
5325 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5326 }
5327 if (fset_bits & 1) {
5328 const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
5329 target_genoarr[sample_hwidx] ^= lowbit;
5330 }
5331 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5332 fset_bits = fset_bits >> 1;
5333 }
5334 } else {
5335 // hom-altx 11 -> 10.
5336 for (uint32_t uii = 0; uii != loop_len; ++uii) {
5337 while (!cur_raw_genoarr_xys) {
5338 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5339 }
5340 if (fset_bits & 1) {
5341 if (rare10_lowbits == kBitsPerWord) {
5342 if (fvals_widx == fvals_word_ct_m1) {
5343 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5344 } else {
5345 fvals_bits = patch_10_fvalsw[fvals_widx];
5346 }
5347 fvals_bits = fvals_bits ^ xor_word2;
5348 fvals_bits = (detect_hom_mask_hi & (~(fvals_bits | ((fvals_bits | detect_hom_mask_hi) - detect_hom_mask_lo)))) >> (code10_width - 1);
5349 // unnecessary to apply bzhi here
5350 ++fvals_widx;
5351 rare10_lowbits = 0;
5352 }
5353 if (fvals_bits & (k1LU << rare10_lowbits)) {
5354 const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
5355 target_genoarr[sample_hwidx] ^= lowbit;
5356 }
5357 rare10_lowbits += code10_width;
5358 }
5359 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5360 fset_bits = fset_bits >> 1;
5361 }
5362 }
5363 } else {
5364 // sample_include non-null
5365 if (allele_idx1 == 1) {
5366 for (uint32_t uii = 0; uii != loop_len; ++uii) {
5367 while (!cur_raw_genoarr_xys) {
5368 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5369 }
5370 if (fset_bits & 1) {
5371 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
5372 if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
5373 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
5374 target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << (2 * (sample_idx % kBitsPerWordD2));
5375 }
5376 rare10_lowbits += code10_width;
5377 }
5378 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5379 fset_bits = fset_bits >> 1;
5380 }
5381 } else {
5382 for (uint32_t uii = 0; uii != loop_len; ++uii) {
5383 while (!cur_raw_genoarr_xys) {
5384 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5385 }
5386 if (fset_bits & 1) {
5387 if (rare10_lowbits == kBitsPerWord) {
5388 if (fvals_widx == fvals_word_ct_m1) {
5389 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5390 } else {
5391 fvals_bits = patch_10_fvalsw[fvals_widx];
5392 }
5393 fvals_bits = fvals_bits ^ xor_word2;
5394 fvals_bits = (detect_hom_mask_hi & (~(fvals_bits | ((fvals_bits | detect_hom_mask_hi) - detect_hom_mask_lo)))) >> (code10_width - 1);
5395 // unnecessary to apply bzhi here
5396 ++fvals_widx;
5397 rare10_lowbits = 0;
5398 }
5399 if (fvals_bits & (k1LU << rare10_lowbits)) {
5400 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
5401 if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
5402 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
5403 target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << (2 * (sample_idx % kBitsPerWordD2));
5404 }
5405 }
5406 rare10_lowbits += code10_width;
5407 }
5408 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5409 fset_bits = fset_bits >> 1;
5410 }
5411 }
5412 }
5413 }
5414 }
5415 // 2 <= allele_idx0 < allele_idx1 (so allele_ct > 3 guaranteed)
5416 uintptr_t xor_word1 = allele_idx1 - 1;
5417 uintptr_t xor_word0 = allele_idx0 - 1;
5418 xor_word1 = xor_word0 | (xor_word1 << (code10_width / 2));
5419 xor_word0 = xor_word0 | (xor_word0 << (code10_width / 2));
5420 xor_word1 *= detect_hom_mask_lo;
5421 xor_word0 *= detect_hom_mask_lo;
5422 for (uint32_t fset_widx = 0; ; ++fset_widx) {
5423 uintptr_t fset_bits;
5424 if (fset_widx >= fset_word_ct_m1) {
5425 if (fset_widx > fset_word_ct_m1) {
5426 return kPglRetSuccess;
5427 }
5428 fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
5429 loop_len = ModNz(raw_10_ct, kBitsPerWord);
5430 } else {
5431 fset_bits = patch_10_fsetw[fset_widx];
5432 }
5433 if (!sample_include) {
5434 for (uint32_t uii = 0; uii != loop_len; ++uii) {
5435 while (!cur_raw_genoarr_xys) {
5436 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5437 }
5438 if (fset_bits & 1) {
5439 if (rare10_lowbits == kBitsPerWord) {
5440 if (fvals_widx == fvals_word_ct_m1) {
5441 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5442 } else {
5443 fvals_bits = patch_10_fvalsw[fvals_widx];
5444 }
5445 uintptr_t match0 = fvals_bits ^ xor_word0;
5446 uintptr_t match1 = fvals_bits ^ xor_word1;
5447 uintptr_t match2 = fvals_bits ^ xor_word2;
5448 match0 = detect_hom_mask_hi & (~(match0 | ((match0 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5449 match1 = detect_hom_mask_hi & (~(match1 | ((match1 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5450 match2 = detect_hom_mask_hi & (~(match2 | ((match2 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5451 // Now want match0 -> 11, match1 -> 10, and match2 -> 01.
5452 fvals_bits = ((match0 | match1) >> (code10_width - 2)) | ((match0 | match2) >> (code10_width - 1));
5453 // unnecessary to apply bzhi here
5454 ++fvals_widx;
5455 rare10_lowbits = 0;
5456 }
5457 const uintptr_t xor_val = (fvals_bits >> rare10_lowbits) & 3;
5458 if (xor_val) {
5459 const uintptr_t lowbit = cur_raw_genoarr_xys & (-cur_raw_genoarr_xys);
5460 target_genoarr[sample_hwidx] ^= lowbit * xor_val;
5461 }
5462 rare10_lowbits += code10_width;
5463 }
5464 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5465 fset_bits = fset_bits >> 1;
5466 }
5467 } else {
5468 // sample_include non-null
5469 for (uint32_t uii = 0; uii != loop_len; ++uii) {
5470 while (!cur_raw_genoarr_xys) {
5471 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
5472 }
5473 if (fset_bits & 1) {
5474 if (rare10_lowbits == kBitsPerWord) {
5475 if (fvals_widx == fvals_word_ct_m1) {
5476 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5477 } else {
5478 fvals_bits = patch_10_fvalsw[fvals_widx];
5479 }
5480 uintptr_t match0 = fvals_bits ^ xor_word0;
5481 uintptr_t match1 = fvals_bits ^ xor_word1;
5482 uintptr_t match2 = fvals_bits ^ xor_word2;
5483 match0 = detect_hom_mask_hi & (~(match0 | ((match0 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5484 match1 = detect_hom_mask_hi & (~(match1 | ((match1 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5485 match2 = detect_hom_mask_hi & (~(match2 | ((match2 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5486 fvals_bits = ((match0 | match1) >> (code10_width - 2)) | ((match0 | match2) >> (code10_width - 1));
5487 // unnecessary to apply bzhi here
5488 ++fvals_widx;
5489 rare10_lowbits = 0;
5490 }
5491 const uintptr_t xor_val = (fvals_bits >> rare10_lowbits) & 3;
5492 if (xor_val) {
5493 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
5494 if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
5495 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
5496 target_genoarr[sample_idx / kBitsPerWordD2] ^= xor_val << (2 * (sample_idx % kBitsPerWordD2));
5497 }
5498 }
5499 rare10_lowbits += code10_width;
5500 }
5501 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
5502 fset_bits = fset_bits >> 1;
5503 }
5504 }
5505 }
5506 }
5507 // aux1b_mode == 1
5508 uint32_t rare10_ct;
5509 PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
5510 if (unlikely(reterr)) {
5511 return reterr;
5512 }
5513 const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
5514 const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
5515 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
5516 return kPglRetMalformedInput;
5517 }
5518 if (allele_idx1 == 1) {
5519 if (!sample_include) {
5520 for (uint32_t rare10_idx = 0; rare10_idx != rare10_ct; ++rare10_idx) {
5521 const uint32_t sample_uidx = deltalist_workspace[rare10_idx];
5522 target_genoarr[sample_uidx / kBitsPerWordD2] ^= k1LU << (2 * (sample_uidx % kBitsPerWordD2));
5523 }
5524 return kPglRetSuccess;
5525 }
5526 for (uint32_t rare10_idx = 0; rare10_idx != rare10_ct; ++rare10_idx) {
5527 const uint32_t sample_uidx = deltalist_workspace[rare10_idx];
5528 const uint32_t sample_widx = sample_uidx / kBitsPerWord;
5529 const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
5530 const uintptr_t sample_include_word = sample_include[sample_widx];
5531 if (sample_include_word & lowbit) {
5532 const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
5533 target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << (2 * (sample_idx % kBitsPerWordD2));
5534 }
5535 }
5536 return kPglRetSuccess;
5537 }
5538 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
5539 if (!allele_idx0) {
5540 for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
5541 uintptr_t fvals_bits;
5542 if (fvals_widx >= fvals_word_ct_m1) {
5543 if (fvals_widx > fvals_word_ct_m1) {
5544 return kPglRetSuccess;
5545 }
5546 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5547 } else {
5548 fvals_bits = patch_10_fvalsw[fvals_widx];
5549 }
5550 fvals_bits = fvals_bits ^ xor_word2;
5551 fvals_bits = detect_hom_mask_hi & (~(fvals_bits | ((fvals_bits | detect_hom_mask_hi) - detect_hom_mask_lo)));
5552 if (fvals_widx == fvals_word_ct_m1) {
5553 fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
5554 }
5555 if (!fvals_bits) {
5556 continue;
5557 }
5558 const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
5559 if (!sample_include) {
5560 do {
5561 const uint32_t bit_idx = ctzw(fvals_bits);
5562 const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
5563 target_genoarr[sample_uidx / kBitsPerWordD2] ^= k1LU << (2 * (sample_uidx % kBitsPerWordD2));
5564 fvals_bits &= fvals_bits - 1;
5565 } while (fvals_bits);
5566 } else {
5567 do {
5568 const uint32_t bit_idx = ctzw(fvals_bits);
5569 const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
5570 const uint32_t sample_widx = sample_uidx / kBitsPerWord;
5571 const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
5572 const uintptr_t sample_include_word = sample_include[sample_widx];
5573 if (sample_include_word & lowbit) {
5574 const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
5575 target_genoarr[sample_idx / kBitsPerWordD2] ^= k1LU << (2 * (sample_idx % kBitsPerWordD2));
5576 }
5577 fvals_bits &= fvals_bits - 1;
5578 } while (fvals_bits);
5579 }
5580 }
5581 }
5582 // 2 <= allele_idx0 < allele_idx1
5583 uintptr_t xor_word1 = allele_idx1 - 1;
5584 uintptr_t xor_word0 = allele_idx0 - 1;
5585 xor_word1 = xor_word0 | (xor_word1 << (code10_width / 2));
5586 xor_word0 = xor_word0 | (xor_word0 << (code10_width / 2));
5587 xor_word1 *= detect_hom_mask_lo;
5588 xor_word0 *= detect_hom_mask_lo;
5589 for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
5590 uintptr_t fvals_bits;
5591 if (fvals_widx >= fvals_word_ct_m1) {
5592 if (fvals_widx > fvals_word_ct_m1) {
5593 return kPglRetSuccess;
5594 }
5595 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
5596 } else {
5597 fvals_bits = patch_10_fvalsw[fvals_widx];
5598 }
5599 uintptr_t match0 = fvals_bits ^ xor_word0;
5600 uintptr_t match1 = fvals_bits ^ xor_word1;
5601 uintptr_t match2 = fvals_bits ^ xor_word2;
5602 match0 = detect_hom_mask_hi & (~(match0 | ((match0 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5603 match1 = detect_hom_mask_hi & (~(match1 | ((match1 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5604 match2 = detect_hom_mask_hi & (~(match2 | ((match2 | detect_hom_mask_hi) - detect_hom_mask_lo)));
5605 // since code10_width >= 4, we can use match0 == 3 (mod 4), match1 == 2
5606 // (mod 4), match2 == 1 (mod 4) representation.
5607 fvals_bits = (match0 >> (code10_width - 4)) | (match1 >> (code10_width - 3)) | (match2 >> (code10_width - 2));
5608 if (fvals_widx == fvals_word_ct_m1) {
5609 fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
5610 }
5611 if (!fvals_bits) {
5612 continue;
5613 }
5614 const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
5615 if (!sample_include) {
5616 do {
5617 const uintptr_t bit_idx = ctzw(fvals_bits);
5618 const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
5619 target_genoarr[sample_uidx / kBitsPerWordD2] ^= (bit_idx & 3) << (2 * (sample_uidx % kBitsPerWordD2));
5620 fvals_bits &= fvals_bits - 1;
5621 } while (fvals_bits);
5622 } else {
5623 do {
5624 const uintptr_t bit_idx = ctzw(fvals_bits);
5625 const uint32_t sample_uidx = cur_deltalist_base[bit_idx >> code10_logwidth];
5626 const uint32_t sample_widx = sample_uidx / kBitsPerWord;
5627 const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
5628 const uintptr_t sample_include_word = sample_include[sample_widx];
5629 if (sample_include_word & lowbit) {
5630 const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
5631 target_genoarr[sample_idx / kBitsPerWordD2] ^= (bit_idx & 3) << (2 * (sample_idx % kBitsPerWordD2));
5632 }
5633 fvals_bits &= fvals_bits - 1;
5634 } while (fvals_bits);
5635 }
5636 }
5637 }
5638
// Fills genovec[] with 2-bit codes describing, for each requested sample, its
// hardcall genotype restricted to the allele pair {allele_idx0, allele_idx1}:
//   0 = hom allele_idx0, 1 = het, 2 = hom allele_idx1, 3 = other/missing.
// sample_include + sample_include_cumulative_popcounts select the sample
// subset (ignored when sample_ct == raw_sample_ct); vidx is the variant index.
// Returns kPglRetSuccess, or an error code from the underlying reads.
PglErr IMPLPgrGet2(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReaderMain* pgrp, uintptr_t* __restrict genovec) {
  assert(allele_idx0 != allele_idx1);
  if (!sample_ct) {
    return kPglRetSuccess;
  }
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
  if (!multiallelic_hc_present) {
    // No multiallelic-hardcall track: only alleles 0 and 1 can occur.
    if ((allele_idx0 > 1) && (allele_idx1 > 1)) {
      // Trivial all-missing case.
      SetAllBits(2 * sample_ct, genovec);
      return kPglRetSuccess;
    }
    PglErr reterr = ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
    if (unlikely(reterr)) {
      return reterr;
    }
    // Remap the biallelic codes to the requested allele pair; at least one of
    // allele_idx0/allele_idx1 is <= 1 here.
    if (allele_idx0 < allele_idx1) {
      Rotate2(allele_idx0, allele_idx1, sample_ct, genovec);
      return kPglRetSuccess;
    }
    // allele_idx0 > allele_idx1 from here on.
    if (allele_idx0 == 1) {
      // (allele_idx0, allele_idx1) == (1, 0): just swap the hom codes.
      GenovecInvertUnsafe(sample_ct, genovec);
      return kPglRetSuccess;
    }
    if (!allele_idx1) {
      // allele_idx0 > 1, allele_idx1 == 0: allele_idx0 never occurs, so only
      // hom-ref survives (mapped to code 2 by the final inversion).
      GenovecNonzeroToMissingThenInvertUnsafe(sample_ct, genovec);
      return kPglRetSuccess;
    }
    // allele_idx0 > 1, allele_idx1 == 1: only hom-alt1 (code 2) survives.
    GenovecNontwoToMissingUnsafe(sample_ct, genovec);
    return kPglRetSuccess;
  }
  // Multiallelic path: read the raw (unsubsetted) biallelic codes, then patch
  // in the aux1a (het ref/altx) and aux1b (altx/alty) records.
  uintptr_t* raw_genovec = pgrp->workspace_vec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
  if (unlikely(reterr)) {
    return reterr;
  }
  ZeroTrailingNyps(raw_sample_ct, raw_genovec);

  // Canonicalize to allele_idx0 < allele_idx1; undone by a final inversion.
  uint32_t invert = 0;
  if (allele_idx0 > allele_idx1) {
    const uint32_t swap = allele_idx0;
    allele_idx0 = allele_idx1;
    allele_idx1 = swap;
    invert = 1;
  }
  if (allele_idx0 > 1) {
    // Neither requested allele appears in the main track; start from
    // all-missing and let the aux1b pass fill in matches.
    SetAllBits(2 * sample_ct, genovec);
  } else {
    CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
    Rotate2(allele_idx0, allele_idx1, sample_ct, genovec);
  }
  // aux1 header byte: low nybble = aux1a mode, high nybble = aux1b mode.
  const uint32_t aux1_first_byte = *fread_ptr++;
  const uint32_t aux1a_mode = aux1_first_byte & 15;
  const uint32_t aux1b_mode = aux1_first_byte >> 4;
  // Mode 0 stores a bitmask over the raw 01 (het) / 10 (altxy) genotypes, so
  // those counts are needed to size the records.
  uint32_t raw_01_ct = 0;
  uint32_t raw_10_ct = 0;
  if ((!aux1a_mode) || (!aux1b_mode)) {
    GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
  }
  // Downstream helpers interpret sample_include == nullptr as "no subsetting".
  if (!subsetting_required) {
    sample_include = nullptr;
  }
  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
  const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
  uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
  if (!allele_idx0) {
    // Two cases:
    // - If allele_idx == 1, convert all aux1a entries from 01 to 11.
    // - Otherwise, for each matching aux1a entry, convert from 11 to 01.
    reterr = GenoarrAux1aUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, allele_idx1, 2, raw_01_ct, &fread_ptr, genovec, deltalist_workspace);
  } else {
    // allele_idx0 >= 2: ref/altx hets are irrelevant; just skip the record so
    // fread_ptr lands on the aux1b record.
    reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr);
  }
  if (unlikely(reterr)) {
    return reterr;
  }
  reterr = GenoarrAux1bUpdate2(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, allele_idx0, allele_idx1, raw_10_ct, &fread_ptr, genovec, deltalist_workspace);
  if (unlikely(reterr)) {
    return reterr;
  }
  if (invert) {
    GenovecInvertUnsafe(sample_ct, genovec);
  }
  return kPglRetSuccess;
}
5729
PreinitPgv(PgenVariant * pgvp)5730 void PreinitPgv(PgenVariant* pgvp) {
5731 pgvp->genovec = nullptr;
5732 pgvp->patch_01_set = nullptr;
5733 pgvp->patch_01_vals = nullptr;
5734 pgvp->patch_10_set = nullptr;
5735 pgvp->patch_10_vals = nullptr;
5736 pgvp->phasepresent = nullptr;
5737 pgvp->phaseinfo = nullptr;
5738 pgvp->dosage_present = nullptr;
5739 pgvp->dosage_main = nullptr;
5740 pgvp->multidosage_present = nullptr;
5741 pgvp->multidosage_cts = nullptr;
5742 pgvp->multidosage_codes = nullptr;
5743 pgvp->multidosage_vals = nullptr;
5744 pgvp->dphase_present = nullptr;
5745 pgvp->dphase_delta = nullptr;
5746 pgvp->multidphase_present = nullptr;
5747 pgvp->multidphase_cts = nullptr;
5748 pgvp->multidphase_codes = nullptr;
5749 pgvp->multidphase_delta = nullptr;
5750
5751 pgvp->patch_01_ct = 0;
5752 pgvp->patch_10_ct = 0;
5753 pgvp->phasepresent_ct = 0;
5754 pgvp->dosage_ct = 0;
5755 pgvp->multidosage_sample_ct = 0;
5756 pgvp->dphase_ct = 0;
5757 pgvp->multidphase_sample_ct = 0;
5758 }
5759
5760 // similar to ParseAndSaveDifflist()
// Parses a deltalist (a difflist without genotype payloads) from
// [*fread_pp, fread_end), and saves the member sample indices as a bitarray.
// - deltalist_include must have room for BitCtToWordCt(raw_sample_ct) words;
//   it is zero-filled before any bits are set.
// - *deltalist_len_ptr is set to the number of list entries.
// - Storage layout: entries come in groups of kPglDifflistGroupSize; each
//   group leads with a full sample_id_byte_ct-byte sample index, and the
//   remaining entries are vint31-encoded deltas from the previous index.
// Returns kPglRetMalformedInput if any decoded index is >= raw_sample_ct,
// otherwise whatever ParseDifflistHeader returned.
PglErr ParseAndSaveDeltalistAsBitarr(const unsigned char* fread_end, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* deltalist_include, uint32_t* deltalist_len_ptr) {
  const unsigned char* group_info_iter;
  PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, deltalist_len_ptr);
  const uint32_t deltalist_len = *deltalist_len_ptr;
  if (reterr || (!deltalist_len)) {
    return reterr;
  }
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
  const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
  const uint32_t group_idx_last = (deltalist_len - 1) / kPglDifflistGroupSize;
  ZeroWArr(raw_sample_ctl, deltalist_include);
  uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
  for (uint32_t group_idx = 0; ; ++group_idx) {
    if (group_idx >= group_idx_last) {
      if (group_idx > group_idx_last) {
        return kPglRetSuccess;
      }
      // Last group is usually partial; this mask yields
      // (deltalist_len - 1) % kPglDifflistGroupSize (relies on
      // kPglDifflistGroupSize being a power of 2).
      group_len_m1 &= deltalist_len - 1;
    }
    // Group leads with an absolute sample index...
    uintptr_t raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
    group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    for (uint32_t raw_deltalist_idx_lowbits = 0; ; ++raw_deltalist_idx_lowbits) {
      // always check, otherwise we may scribble over arbitrary memory
      if (unlikely(raw_sample_idx >= raw_sample_ct)) {
        return kPglRetMalformedInput;
      }
      SetBit(raw_sample_idx, deltalist_include);
      if (raw_deltalist_idx_lowbits == group_len_m1) {
        break;
      }
      // ...followed by vint-encoded positive deltas.
      raw_sample_idx += GetVint31(fread_end, fread_pp);
    }
  }
}
5795
5796 // These functions do not overread, but may write extra bytes up to the word
5797 // boundary.
// Unpacks each 2-bit value in bytearr into one byte of dst, adding incr to
// every unpacked value.  Reads DivUp(input_nyp_ct, 4) bytes from bytearr; may
// write extra bytes in dst up to the next word boundary.
void Expand2bitTo8(const void* __restrict bytearr, uint32_t input_nyp_ct, uint32_t incr, uintptr_t* __restrict dst) {
  const unsigned char* src_iter = S_CAST(const unsigned char*, bytearr);
  const uint32_t input_byte_ct = DivUp(input_nyp_ct, 4);
#ifdef __arm__
#  error "Unaligned accesses in Expand2bitTo8()."
#endif
#ifdef __LP64__
  const uint32_t input_vec_ct = input_byte_ct / kBytesPerVec;
  unsigned char* dst_iter = R_CAST(unsigned char*, dst);
  if (input_vec_ct) {
    const VecW mincr = R_CAST(VecW, vecuc_set1(incr));
    const VecW m03 = VCONST_W(kMask0303);
    for (uint32_t vec_idx = 0; vec_idx != input_vec_ct; ++vec_idx) {
      VecW cur_vec = vecw_loadu(src_iter);
      src_iter = &(src_iter[kBytesPerVec]);
#  ifdef USE_AVX2
      // (todo: benchmark against just reading 8 bytes at a time and
      // broadcasting.)
      // midswapped_vec contains {0-1-2-3, 4-5-6-7, ..., 12-13-14-15,
      //                          32-33-34-35, ..., 44-45-46-47,
      //                          16-17-18-19, ..., 28-29-30-31,
      //                          48-49-50-51, ..., 60-61-62-63,
      //                          64-65-66-67, ..., 76-77-78-79,
      //                          96-97-98-99, ..., 108-109-110-111,
      //                          80-81-82-83, ..., 92-93-94-95,
      //                          112-113-114-115, ..., 124-125-126-127}
      // 0xd8: {0, 2, 1, 3}
      const __m256i midswapped_vec = _mm256_shuffle_epi32(R_CAST(__m256i, cur_vec), 0xd8);
      // This operation is also used in FillInterleavedMaskVec().
      // cur_vec now contains {0-1-2-3, 4-5-6-7, 8-9-10-11, 12-13-14-15,
      //                       32-33-34-35, ..., 44-45-46-47,
      //                       64-65-66-67, ..., 76-77-78-79,
      //                       96-97-98-99, ..., 108-109-110-111,
      //                       16-17-18-19, ..., 28-29-30-31,
      //                       48-49-50-51, ..., 60-61-62-63,
      //                       80-81-82-83, ..., 92-93-94-95,
      //                       112-113-114-115, ..., 124-125-126-127}
      cur_vec = vecw_permute0xd8_if_avx2(R_CAST(VecW, midswapped_vec));
#  endif
      // AVX2:
      //   vec_even contains {0-1, 4-5, 8-9, 12-13, 32-33, ..., 44-45,
      //                      64-65, ..., 76-77, 96-97, ..., 108-109,
      //                      16-17, ..., 28-29, 48-49, ..., 60-61,
      //                      80-81, ..., 92-93, 112-113, ..., 124-125}
      //   vec_odd contains {2-3, 6-7, 10-11, 14-15, 34-35, ..., 46-47,
      //                     66-67, ..., 78-79, 98-99, ..., 110-111,
      //                     18-19, ..., 30-31, 50-51, ..., 62-63,
      //                     82-83, ..., 94-95, 114-115, ..., 126-127}
      // SSE2:
      //   vec_even contains {0-1, 4-5, 8-9, ..., 60-61}
      //   vec_odd contains {2-3, 6-7, 10-11, ..., 62-63}
      const VecW vec_even = cur_vec;
      const VecW vec_odd = vecw_srli(cur_vec, 4);

      // AVX2:
      //   vec01 contains {0-1, 2-3, 4-5, ..., 14-15, 32-33, ..., 46-47,
      //                   16-17, ..., 30-31, 48-49, ..., 62-63}
      //   vec23 contains {64-65, 66-67, ..., 78-79, 96-97, ..., 110-111,
      //                   80-81, ..., 94-95, 112-113, ..., 126-127}
      // SSE2:
      //   vec01 contains {0-1, 2-3, 4-5, 6-7, ..., 30-31}
      //   vec23 contains {32-33, 34-35, 36-37, 38-39, ..., 62-63}
      const VecW vec01 = vecw_unpacklo8(vec_even, vec_odd);
      const VecW vec23 = vecw_unpackhi8(vec_even, vec_odd);

      // AVX2:
      //   vec01_even contains {0, 2, 4, ..., 14, 32, 34, ..., 46,
      //                        16, 18, ..., 30, 48, 50, ..., 62}
      //   vec01_odd contains {1, 3, 5, ..., 15, 33, 35, ..., 47,
      //                       17, 19, ..., 31, 49, 51, ..., 63}
      // SSE2:
      //   vec01_even contains {0, 2, 4, 6, ..., 30}
      //   vec01_odd contains {1, 3, 5, 7, ..., 31}
      const VecW vec01_even = vec01 & m03;
      const VecW vec01_odd = vecw_srli(vec01, 2) & m03;

      // AVX2:
      //   vecw_unpacklo8() contains {0, 1, ..., 15, 16, ..., 31}
      //   vecw_unpackhi8() contains {32, 33, ..., 47, 48, ..., 63}
      // SSE2:
      //   vecw_unpacklo8() contains {0, 1, ..., 15}
      //   vecw_unpackhi8() contains {16, 17, ..., 31}
      vecw_storeu(dst_iter, mincr + vecw_unpacklo8(vec01_even, vec01_odd));
      dst_iter = &(dst_iter[kBytesPerVec]);
      vecw_storeu(dst_iter, mincr + vecw_unpackhi8(vec01_even, vec01_odd));
      dst_iter = &(dst_iter[kBytesPerVec]);
      const VecW vec23_odd = vecw_srli(vec23, 2) & m03;
      const VecW vec23_even = vec23 & m03;
      vecw_storeu(dst_iter, mincr + vecw_unpacklo8(vec23_even, vec23_odd));
      dst_iter = &(dst_iter[kBytesPerVec]);
      vecw_storeu(dst_iter, mincr + vecw_unpackhi8(vec23_even, vec23_odd));
      dst_iter = &(dst_iter[kBytesPerVec]);
    }
  }
  // Scalar cleanup: expand the remaining sub-vector input 2 bytes (one
  // Quarterword -> one word of output) at a time.
  const uint32_t remainder = input_byte_ct % kBytesPerVec;
  if (remainder) {
    const uint32_t full_qw_ct = remainder / sizeof(Quarterword);
    const Quarterword* src_alias = R_CAST(const Quarterword*, src_iter);
    const uintptr_t incr_word = kMask0101 * incr;
    uintptr_t* dstw = R_CAST(uintptr_t*, dst_iter);
    for (uint32_t uii = 0; uii != full_qw_ct; ++uii) {
      const uintptr_t cur_2byte = src_alias[uii];
      dstw[uii] = incr_word + Unpack0303(cur_2byte);
    }
    if (input_byte_ct % 2) {
      // One trailing input byte: scatter its four 2-bit fields into the low
      // halfword's byte lanes.
      uintptr_t cur_byte = src_iter[remainder - 1];
#  ifdef USE_AVX2
      cur_byte = _pdep_u64(cur_byte, kMask0303);
#  else
      cur_byte = cur_byte | (cur_byte << 12);
      cur_byte = (cur_byte | (cur_byte << 6)) & kMask0303;
#  endif
      dstw[full_qw_ct] = incr_word + cur_byte;
    }
  }
#else  // !__LP64__
  const Quarterword* src_alias = R_CAST(const Quarterword*, src_iter);
  const uintptr_t incr_word = kMask0101 * incr;
  uintptr_t* dstw = R_CAST(uintptr_t*, dst);
  for (uint32_t uii = 0; uii != input_byte_ct; ++uii) {
    const uintptr_t cur_2byte = src_alias[uii];
    dstw[uii] = incr_word + Unpack0303(cur_2byte);
  }
#endif
}
5923
Expand4bitTo8(const void * __restrict bytearr,uint32_t input_nybble_ct,uint32_t incr,uintptr_t * __restrict dst)5924 void Expand4bitTo8(const void* __restrict bytearr, uint32_t input_nybble_ct, uint32_t incr, uintptr_t* __restrict dst) {
5925 const unsigned char* src_iter = R_CAST(const unsigned char*, bytearr);
5926 const uint32_t input_byte_ct = DivUp(input_nybble_ct, 2);
5927 #ifdef __LP64__
5928 const uint32_t input_vec_ct = input_byte_ct / kBytesPerVec;
5929 unsigned char* dst_iter = R_CAST(unsigned char*, dst);
5930 if (input_vec_ct) {
5931 const VecW mincr = R_CAST(VecW, vecuc_set1(incr));
5932 const VecW m4 = VCONST_W(kMask0F0F);
5933 for (uint32_t vec_idx = 0; vec_idx != input_vec_ct; ++vec_idx) {
5934 VecW cur_vec = vecw_loadu(src_iter);
5935 src_iter = &(src_iter[kBytesPerVec]);
5936 cur_vec = vecw_permute0xd8_if_avx2(cur_vec);
5937 // AVX2:
5938 // vec_even contains {0, 2, 4, ..., 14, 32, 34, ..., 46,
5939 // 16, 18, ..., 30, 48, ... 62}
5940 // vec_odd contains {1, 3, 5, ..., 15, 33, 35, ..., 47,
5941 // 17, 19, ..., 31, 49, ..., 63}
5942 // SSE2:
5943 // vec_even contains {0, 2, 4, ..., 30}
5944 // vec_odd contains {1, 3, 5, ..., 31}
5945 const VecW vec_even = cur_vec & m4;
5946 const VecW vec_odd = vecw_srli(cur_vec, 4) & m4;
5947
5948 // AVX2:
5949 // vec_lo contains {0, 1, ..., 31}
5950 // vec_hi contains {32, 33, ..., 63}
5951 // SSE2:
5952 // vec_lo contains {0, 1, 2, ..., 15}
5953 // vec_hi contains {16, 17, 18, ..., 31}
5954 const VecW vec_lo = vecw_unpacklo8(vec_even, vec_odd);
5955 const VecW vec_hi = vecw_unpackhi8(vec_even, vec_odd);
5956 vecw_storeu(dst_iter, mincr + vec_lo);
5957 dst_iter = &(dst_iter[kBytesPerVec]);
5958 vecw_storeu(dst_iter, mincr + vec_hi);
5959 dst_iter = &(dst_iter[kBytesPerVec]);
5960 }
5961 }
5962 const uint32_t remainder = input_byte_ct % kBytesPerVec;
5963 if (remainder) {
5964 const Halfword* src_alias = R_CAST(const Halfword*, src_iter);
5965 uintptr_t incr_word = kMask0101 * incr;
5966 const uint32_t hw_ct_m1 = (remainder - 1) / sizeof(Halfword);
5967 uintptr_t* dstw = R_CAST(uintptr_t*, dst_iter);
5968 for (uint32_t hwidx = 0; ; ++hwidx) {
5969 uint32_t cur_4byte;
5970 if (hwidx >= hw_ct_m1) {
5971 if (hwidx > hw_ct_m1) {
5972 break;
5973 }
5974 cur_4byte = SubU32Load(&(src_alias[hwidx]), ModNz(remainder, 4));
5975 } else {
5976 cur_4byte = src_alias[hwidx];
5977 }
5978 dstw[hwidx] = incr_word + Unpack0F0F(cur_4byte);
5979 }
5980 }
5981 #else
5982 unsigned char* dst_iter = R_CAST(unsigned char*, dst);
5983 for (uint32_t uii = 0; uii < input_byte_ct; ++uii) {
5984 uint32_t cur_byte = src_iter[uii];
5985 *dst_iter++ = (cur_byte & 15) + incr;
5986 *dst_iter++ = (cur_byte >> 4) + incr;
5987 }
5988 #endif
5989 }
5990
5991 static_assert(sizeof(AlleleCode) == 1, "GetAux1aCodes() must be updated.");
// Decodes the rare01_ct packed rare-allele codes of an aux1a record from
// [*fread_pp, fread_end) into one explicit AlleleCode each.  Stored values
// are (packed code + 2), since patch entries always refer to alleles >= 2.
// Per-entry width is determined by how many distinct rare alleles
// (allele_ct - 2) must be representable:
//   allele_ct == 3 -> 0 bits (all codes are 2, nothing to read)
//   allele_ct == 4 -> 1 bit
//   allele_ct 5..6 -> 2 bits
//   allele_ct 7..18 -> 4 bits
//   allele_ct >= 19 -> 1 byte
// Advances *fread_pp past the consumed bytes; returns kPglRetMalformedInput
// if the record would extend past fread_end.
PglErr GetAux1aCodes(const unsigned char* fread_end, uint32_t rare01_ct, uint32_t allele_ct, const unsigned char** fread_pp, AlleleCode* __restrict patch_01_vals) {
  if (allele_ct == 3) {
    memset(patch_01_vals, 2, rare01_ct);
    return kPglRetSuccess;
  }
  const unsigned char* patch_01_fvals = *fread_pp;
  if (allele_ct == 4) {
    const uint32_t patch_01_fvals_byte_ct = DivUp(rare01_ct, CHAR_BIT);
    if (PtrAddCk(fread_end, patch_01_fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    Expand1bitTo8(patch_01_fvals, rare01_ct, 2, R_CAST(uintptr_t*, patch_01_vals));
    return kPglRetSuccess;
  }
  if (allele_ct < 7) {
    const uint32_t patch_01_fvals_byte_ct = DivUp(rare01_ct, 4);
    if (PtrAddCk(fread_end, patch_01_fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    Expand2bitTo8(patch_01_fvals, rare01_ct, 2, R_CAST(uintptr_t*, patch_01_vals));
    return kPglRetSuccess;
  }
  if (allele_ct < 19) {
    const uint32_t patch_01_fvals_byte_ct = DivUp(rare01_ct, 2);
    if (PtrAddCk(fread_end, patch_01_fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    Expand4bitTo8(patch_01_fvals, rare01_ct, 2, R_CAST(uintptr_t*, patch_01_vals));
    return kPglRetSuccess;
  }
  // Full-byte codes.
  if (PtrAddCk(fread_end, rare01_ct, fread_pp)) {
    return kPglRetMalformedInput;
  }
  // todo: verify the compiler recognizes this
  for (uint32_t uii = 0; uii < rare01_ct; ++uii) {
    patch_01_vals[uii] = patch_01_fvals[uii] + 2;
  }
  return kPglRetSuccess;
}
6031
6032 // Assumes aux1a_mode != 15.
ExportAux1a(const unsigned char * fread_end,const uintptr_t * __restrict raw_genoarr,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_01_ct,const unsigned char ** fread_pp,uintptr_t * __restrict patch_01_set,AlleleCode * __restrict patch_01_vals,uint32_t * __restrict rare01_ctp)6033 PglErr ExportAux1a(const unsigned char* fread_end, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_01_ct, const unsigned char** fread_pp, uintptr_t* __restrict patch_01_set, AlleleCode* __restrict patch_01_vals, uint32_t* __restrict rare01_ctp) {
6034 uint32_t rare01_ct;
6035 if (!aux1a_mode) {
6036 const unsigned char* patch_01_fset = *fread_pp;
6037 const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
6038 if (PtrAddCk(fread_end, fset_byte_ct, fread_pp)) {
6039 return kPglRetMalformedInput;
6040 }
6041 rare01_ct = PopcountBytes(patch_01_fset, fset_byte_ct);
6042 ExpandBytearrFromGenoarr(patch_01_fset, raw_genoarr, kMask5555, NypCtToWordCt(raw_sample_ct), raw_01_ct, 0, patch_01_set);
6043 } else {
6044 if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, fread_pp, patch_01_set, &rare01_ct))) {
6045 return kPglRetMalformedInput;
6046 }
6047 }
6048 *rare01_ctp = rare01_ct;
6049 return GetAux1aCodes(fread_end, rare01_ct, allele_ct, fread_pp, patch_01_vals);
6050 }
6051
ExportAux1aProperSubset(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,const uintptr_t * __restrict raw_genoarr,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t sample_ct,uint32_t allele_ct,uint32_t raw_01_ct,const unsigned char ** fread_pp,uintptr_t * __restrict dst_01_set,AlleleCode * __restrict dst_01_vals,uint32_t * __restrict rare01_ctp,uint32_t * __restrict deltalist_workspace)6052 PglErr ExportAux1aProperSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t allele_ct, uint32_t raw_01_ct, const unsigned char** fread_pp, uintptr_t* __restrict dst_01_set, AlleleCode* __restrict dst_01_vals, uint32_t* __restrict rare01_ctp, uint32_t* __restrict deltalist_workspace) {
6053 const uint32_t allele_code_width = GetAux1aWidth(allele_ct);
6054 const uintptr_t allele_code_mask = (1U << allele_code_width) - 1;
6055 memset(dst_01_set, 0, BitCtToWordCt(sample_ct) * sizeof(intptr_t));
6056 AlleleCode* dst_01_vals_iter = dst_01_vals;
6057 if (!aux1a_mode) {
6058 #ifdef __arm__
6059 # error "Unaligned accesses in ExportAux1aProperSubset()."
6060 #endif
6061 // similar to GenoarrAux1aUpdate()
6062 const uintptr_t* patch_01_fsetw = R_CAST(const uintptr_t*, *fread_pp);
6063 const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
6064 const uint32_t rare01_ct = PopcountBytes(patch_01_fsetw, fset_byte_ct);
6065 if (PtrAddCk(fread_end, fset_byte_ct, fread_pp)) {
6066 return kPglRetMalformedInput;
6067 }
6068 const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
6069 uintptr_t sample_hwidx = 0;
6070 uintptr_t cur_raw_genoarr_hets = Word01(raw_genoarr[0]);
6071 uint32_t loop_len = kBitsPerWord;
6072 const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
6073 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
6074 return kPglRetMalformedInput;
6075 }
6076 const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
6077 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
6078 uintptr_t fvals_bits = 0;
6079 uint32_t fvals_widx = 0;
6080 uint32_t rare01_lowbits = kBitsPerWord;
6081 for (uint32_t fset_widx = 0; ; ++fset_widx) {
6082 uintptr_t fset_bits;
6083 if (fset_widx >= fset_word_ct_m1) {
6084 if (fset_widx > fset_word_ct_m1) {
6085 break;
6086 }
6087 fset_bits = SubwordLoad(&(patch_01_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
6088 loop_len = ModNz(raw_01_ct, kBitsPerWord);
6089 } else {
6090 fset_bits = patch_01_fsetw[fset_widx];
6091 }
6092 if (allele_ct == 3) {
6093 for (uint32_t uii = 0; uii != loop_len; ++uii) {
6094 while (!cur_raw_genoarr_hets) {
6095 cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
6096 }
6097 if (fset_bits & 1) {
6098 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
6099 if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
6100 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
6101 SetBit(sample_idx, dst_01_set);
6102 *dst_01_vals_iter++ = 2;
6103 }
6104 }
6105 cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
6106 fset_bits = fset_bits >> 1;
6107 }
6108 } else {
6109 for (uint32_t uii = 0; uii != loop_len; ++uii) {
6110 while (!cur_raw_genoarr_hets) {
6111 cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
6112 }
6113 if (fset_bits & 1) {
6114 if (rare01_lowbits == kBitsPerWord) {
6115 if (fvals_widx == fvals_word_ct_m1) {
6116 fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
6117 } else {
6118 fvals_bits = patch_01_fvalsw[fvals_widx];
6119 }
6120 // unnecessary to apply bzhi here
6121 ++fvals_widx;
6122 rare01_lowbits = 0;
6123 }
6124 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
6125 if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
6126 const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
6127 SetBit(sample_idx, dst_01_set);
6128 *dst_01_vals_iter++ = 2 + ((fvals_bits >> rare01_lowbits) & allele_code_mask);
6129 }
6130 rare01_lowbits += allele_code_width;
6131 }
6132 cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
6133 fset_bits = fset_bits >> 1;
6134 }
6135 }
6136 }
6137 *rare01_ctp = dst_01_vals_iter - dst_01_vals;
6138 return kPglRetSuccess;
6139 }
6140 // aux1a_mode == 1
6141 uint32_t rare01_ct;
6142 PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare01_ct);
6143 if (unlikely(reterr)) {
6144 return reterr;
6145 }
6146 const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
6147 const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
6148 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
6149 return kPglRetMalformedInput;
6150 }
6151 if (allele_ct == 3) {
6152 for (uint32_t rare01_idx = 0; rare01_idx != rare01_ct; ++rare01_idx) {
6153 const uint32_t sample_uidx = deltalist_workspace[rare01_idx];
6154 // could wrap this boilerplate
6155 const uint32_t sample_widx = sample_uidx / kBitsPerWord;
6156 const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
6157 const uintptr_t sample_include_word = sample_include[sample_widx];
6158 if (sample_include_word & lowbit) {
6159 const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
6160 SetBit(sample_idx, dst_01_set);
6161 *dst_01_vals_iter++ = 2;
6162 }
6163 }
6164 *rare01_ctp = dst_01_vals_iter - dst_01_vals;
6165 return kPglRetSuccess;
6166 }
6167 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
6168 const uint32_t allele_code_logwidth = ctzu32(allele_code_width);
6169 uint32_t loop_len = kBitsPerWord >> allele_code_logwidth;
6170 for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
6171 uintptr_t fvals_bits;
6172 if (fvals_widx >= fvals_word_ct_m1) {
6173 if (fvals_widx > fvals_word_ct_m1) {
6174 break;
6175 }
6176 fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
6177 loop_len = 1 + ((rare01_ct - 1) & (loop_len - 1));
6178 } else {
6179 fvals_bits = patch_01_fvalsw[fvals_widx];
6180 }
6181 const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - allele_code_logwidth)]);
6182 for (uint32_t uii = 0; uii != loop_len; ++uii) {
6183 const uint32_t sample_uidx = cur_deltalist_base[uii];
6184 const uint32_t sample_widx = sample_uidx / kBitsPerWord;
6185 const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
6186 const uintptr_t sample_include_word = sample_include[sample_widx];
6187 if (sample_include_word & lowbit) {
6188 const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
6189 SetBit(sample_idx, dst_01_set);
6190 *dst_01_vals_iter++ = 2 + ((fvals_bits >> (uii << allele_code_logwidth)) & allele_code_mask);
6191 }
6192 }
6193 }
6194 *rare01_ctp = dst_01_vals_iter - dst_01_vals;
6195 return kPglRetSuccess;
6196 }
6197
6198 static_assert(sizeof(AlleleCode) == 1, "GetAux1bCodes() must be updated.");
GetAux1bCodes(const unsigned char * fread_end,uint32_t rare10_ct,uint32_t allele_ct,const unsigned char ** fread_pp,AlleleCode * __restrict patch_10_vals)6199 PglErr GetAux1bCodes(const unsigned char* fread_end, uint32_t rare10_ct, uint32_t allele_ct, const unsigned char** fread_pp, AlleleCode* __restrict patch_10_vals) {
6200 const unsigned char* patch_10_fvals = *fread_pp;
6201 if (allele_ct == 3) {
6202 // 1 bit, distinguishes between 0x0201 and 0x0202
6203 const uint32_t patch_10_fvals_byte_ct = DivUp(rare10_ct, CHAR_BIT);
6204 if (PtrAddCk(fread_end, patch_10_fvals_byte_ct, fread_pp)) {
6205 return kPglRetMalformedInput;
6206 }
6207 Expand1bitTo16(patch_10_fvals, rare10_ct, 0x0201, R_CAST(uintptr_t*, patch_10_vals));
6208 return kPglRetSuccess;
6209 }
6210 const uint32_t rare10_ct_x2 = rare10_ct * 2;
6211 if (allele_ct < 6) {
6212 // 2+2 bits, add 1
6213 const uint32_t patch_10_fvals_byte_ct = DivUp(rare10_ct, 2);
6214 if (PtrAddCk(fread_end, patch_10_fvals_byte_ct, fread_pp)) {
6215 return kPglRetMalformedInput;
6216 }
6217 Expand2bitTo8(patch_10_fvals, rare10_ct_x2, 1, R_CAST(uintptr_t*, patch_10_vals));
6218 return kPglRetSuccess;
6219 }
6220 if (allele_ct < 18) {
6221 // 4+4 bits
6222 if (PtrAddCk(fread_end, rare10_ct, fread_pp)) {
6223 return kPglRetMalformedInput;
6224 }
6225 Expand4bitTo8(patch_10_fvals, rare10_ct_x2, 1, R_CAST(uintptr_t*, patch_10_vals));
6226 return kPglRetSuccess;
6227 }
6228 if (PtrAddCk(fread_end, rare10_ct_x2, fread_pp)) {
6229 return kPglRetMalformedInput;
6230 }
6231 // todo: verify the compiler recognizes this
6232 for (uint32_t uii = 0; uii < rare10_ct_x2; ++uii) {
6233 patch_10_vals[uii] = patch_10_fvals[uii] + 1;
6234 }
6235 return kPglRetSuccess;
6236 }
6237
6238 // Assumes aux1b_mode != 15.
ExportAux1b(const unsigned char * fread_end,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_10_ct,const unsigned char ** fread_pp,uintptr_t * __restrict patch_10_set,AlleleCode * __restrict patch_10_vals,uint32_t * __restrict rare10_ctp)6239 PglErr ExportAux1b(const unsigned char* fread_end, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict patch_10_set, AlleleCode* __restrict patch_10_vals, uint32_t* __restrict rare10_ctp) {
6240 uint32_t rare10_ct;
6241 if (!aux1b_mode) {
6242 const unsigned char* patch_10_fset = *fread_pp;
6243 const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
6244 if (PtrAddCk(fread_end, fset_byte_ct, fread_pp)) {
6245 return kPglRetMalformedInput;
6246 }
6247 rare10_ct = PopcountBytes(patch_10_fset, fset_byte_ct);
6248 ExpandBytearrFromGenoarr(patch_10_fset, raw_genoarr, kMaskAAAA, NypCtToWordCt(raw_sample_ct), raw_10_ct, 0, patch_10_set);
6249 } else {
6250 if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, fread_pp, patch_10_set, &rare10_ct))) {
6251 return kPglRetMalformedInput;
6252 }
6253 }
6254 *rare10_ctp = rare10_ct;
6255 return GetAux1bCodes(fread_end, rare10_ct, allele_ct, fread_pp, patch_10_vals);
6256 }
6257
// Variant of ExportAux1b() for when only a proper subset of the samples is
// requested.  Writes a subsetted bitarray of patched 0b10 genotypes to
// dst_10_set, the corresponding (low, high) allele-code pairs to dst_10_vals,
// and the within-subset patch count to *rare10_ctp.
PglErr ExportAux1bProperSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uintptr_t* __restrict dst_10_set, AlleleCode* __restrict dst_10_vals, uint32_t* __restrict rare10_ctp, uint32_t* __restrict deltalist_workspace) {
  uintptr_t detect_hom_mask_lo;  // unused
  const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
  const uint32_t allele_code_width = 1U << allele_code_logwidth;
  const uintptr_t allele_code_mask = (1U << allele_code_width) - 1;
  // fvals entries are (low, high) code pairs, so they are twice as wide as a
  // single allele code, except when allele_ct == 3 (1-bit entries).
  const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
  const uint32_t code10_width = 1U << code10_logwidth;
  memset(dst_10_set, 0, BitCtToWordCt(sample_ct) * sizeof(intptr_t));
  AlleleCode* dst_10_vals_iter = dst_10_vals;
  if (!aux1b_mode) {
#ifdef __arm__
#  error "Unaligned accesses in ExportAux1bProperSubset()."
#endif
    // Mode 0: aux1b starts with a bitarray ("fset") over the raw_10_ct 0b10
    // hardcalls, marking which of them carry patch entries.
    const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
    const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
    const uint32_t rare10_ct = PopcountBytes(patch_10_fsetw, fset_byte_ct);
    if (PtrAddCk(fread_end, fset_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    // The fset bitarray is followed by the packed code-pair stream ("fvals"),
    // one entry per set fset bit.
    const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
    uintptr_t sample_hwidx = 0;
    uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
    uint32_t loop_len = kBitsPerWord;
    const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) * code10_width, 8);
    if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
      return kPglRetMalformedInput;
    }
    const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
    const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
    uintptr_t fvals_bits = 0;
    uint32_t fvals_widx = 0;
    // Bit position of the next unread entry within fvals_bits; initialized to
    // kBitsPerWord so the first use forces a word load.
    uint32_t rare10_lowbits = kBitsPerWord;
    for (uint32_t fset_widx = 0; ; ++fset_widx) {
      uintptr_t fset_bits;
      if (fset_widx >= fset_word_ct_m1) {
        if (fset_widx > fset_word_ct_m1) {
          break;
        }
        // Final (possibly partial) fset word.
        fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
        loop_len = ModNz(raw_10_ct, kBitsPerWord);
      } else {
        fset_bits = patch_10_fsetw[fset_widx];
      }
      if (allele_ct == 3) {
        // 1-bit fvals entries: patched genotype is 1/2 or 2/2.
        for (uint32_t uii = 0; uii != loop_len; ++uii) {
          // Advance to the next genoarr word containing a 0b10 call.
          while (!cur_raw_genoarr_xys) {
            cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
          }
          if (fset_bits & 1) {
            if (rare10_lowbits == kBitsPerWord) {
              // Refill fvals_bits from the next fvals word.
              if (fvals_widx == fvals_word_ct_m1) {
                fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
              } else {
                fvals_bits = patch_10_fvalsw[fvals_widx];
              }
              // unnecessary to apply bzhi here
              ++fvals_widx;
              rare10_lowbits = 0;
            }
            const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
            if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
              // Sample is in the requested subset; emit its patch entry.
              const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
              SetBit(sample_idx, dst_10_set);
              *dst_10_vals_iter++ = 1 + ((fvals_bits >> rare10_lowbits) & 1);
              *dst_10_vals_iter++ = 2;
            }
            // Consume the fvals entry even when the sample is excluded.
            ++rare10_lowbits;
          }
          cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
          fset_bits = fset_bits >> 1;
        }
      } else {
        for (uint32_t uii = 0; uii != loop_len; ++uii) {
          while (!cur_raw_genoarr_xys) {
            cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
          }
          if (fset_bits & 1) {
            if (rare10_lowbits == kBitsPerWord) {
              if (fvals_widx == fvals_word_ct_m1) {
                fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
              } else {
                fvals_bits = patch_10_fvalsw[fvals_widx];
              }
              // unnecessary to apply bzhi here
              ++fvals_widx;
              rare10_lowbits = 0;
            }
            const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
            if ((R_CAST(const Halfword*, sample_include)[sample_hwidx]) & (1U << sample_uidx_lowbits)) {
              const uint32_t sample_idx = RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, sample_hwidx * kBitsPerWordD2 + sample_uidx_lowbits);
              SetBit(sample_idx, dst_10_set);
              // Stored codes are offset by 1; low code occupies the low bits
              // of the pair.
              const uintptr_t cur_code_pair = fvals_bits >> rare10_lowbits;
              const uint32_t cur_code_hi = (cur_code_pair >> allele_code_width) & allele_code_mask;
              const uint32_t cur_code_lo = cur_code_pair & allele_code_mask;
              *dst_10_vals_iter++ = 1 + cur_code_lo;
              *dst_10_vals_iter++ = 1 + cur_code_hi;
            }
            rare10_lowbits += code10_width;
          }
          cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
          fset_bits = fset_bits >> 1;
        }
      }
    }
    // Each patch produced two AlleleCode writes.
    *rare10_ctp = S_CAST(uintptr_t, dst_10_vals_iter - dst_10_vals) / 2;
    return kPglRetSuccess;
  }
  // aux1b_mode == 1
  uint32_t rare10_ct;
  PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
  if (unlikely(reterr)) {
    return reterr;
  }
  const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
  const uintptr_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, 8);
  if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
    return kPglRetMalformedInput;
  }
  const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
  // Number of fvals entries per word (last word may hold fewer).
  uint32_t loop_len = kBitsPerWord >> code10_logwidth;
  for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
    uintptr_t fvals_bits;
    if (fvals_widx >= fvals_word_ct_m1) {
      if (fvals_widx > fvals_word_ct_m1) {
        break;
      }
      fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
      loop_len = 1 + ((rare10_ct - 1) & (loop_len - 1));
    } else {
      fvals_bits = patch_10_fvalsw[fvals_widx];
    }
    // Deltalist entries corresponding to this fvals word.
    const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
    if (allele_ct == 3) {
      for (uint32_t uii = 0; uii != loop_len; ++uii) {
        const uint32_t sample_uidx = cur_deltalist_base[uii];
        const uint32_t sample_widx = sample_uidx / kBitsPerWord;
        const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
        const uintptr_t sample_include_word = sample_include[sample_widx];
        if (sample_include_word & lowbit) {
          const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
          SetBit(sample_idx, dst_10_set);
          *dst_10_vals_iter++ = 1 + ((fvals_bits >> uii) & 1);
          *dst_10_vals_iter++ = 2;
        }
      }
    } else {
      for (uint32_t uii = 0; uii != loop_len; ++uii) {
        const uint32_t sample_uidx = cur_deltalist_base[uii];
        const uint32_t sample_widx = sample_uidx / kBitsPerWord;
        const uintptr_t lowbit = k1LU << (sample_uidx % kBitsPerWord);
        const uintptr_t sample_include_word = sample_include[sample_widx];
        if (sample_include_word & lowbit) {
          const uint32_t sample_idx = sample_include_cumulative_popcounts[sample_widx] + PopcountWord(sample_include_word & (lowbit - 1));
          SetBit(sample_idx, dst_10_set);
          const uintptr_t cur_code_pair = fvals_bits >> (uii << code10_logwidth);
          const uint32_t cur_code_hi = (cur_code_pair >> allele_code_width) & allele_code_mask;
          const uint32_t cur_code_lo = cur_code_pair & allele_code_mask;
          *dst_10_vals_iter++ = 1 + cur_code_lo;
          *dst_10_vals_iter++ = 1 + cur_code_hi;
        }
      }
    }
  }
  *rare10_ctp = S_CAST(uintptr_t, dst_10_vals_iter - dst_10_vals) / 2;
  return kPglRetSuccess;
}
6424
6425 // Assumes sample_ct > 0, multiallelic-hc track is present, and patch_01_ct and
6426 // patch_10_ct are zero-initialized.
PglErr GetMultiallelicCodes(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict all_hets, PgenVariant* pgvp) {
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  uintptr_t* raw_genovec = pgrp->workspace_vec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
  if (unlikely(reterr)) {
    return reterr;
  }
  // Copy the subsetted genovec into the caller's buffer, and clear trailing
  // bits of the raw copy before counting genotypes below.
  CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, pgvp->genovec);
  ZeroTrailingNyps(raw_sample_ct, raw_genovec);
  // aux1 header byte: low nybble selects the aux1a (0b01-patch) encoding
  // mode, high nybble the aux1b (0b10-patch) mode; 15 = track absent.
  const uint32_t aux1_first_byte = *fread_ptr++;
  const uint32_t aux1a_mode = aux1_first_byte & 15;
  const uint32_t aux1b_mode = aux1_first_byte >> 4;
  uint32_t raw_01_ct = 0;
  uint32_t raw_10_ct = 0;
  if ((!aux1a_mode) || (!aux1b_mode)) {
    // Mode-0 tracks are bitarrays over the 0b01/0b10 hardcalls, so those
    // counts are needed to parse them.
    GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
  }
  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
  const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
  uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
  if (aux1a_mode != 15) {
    if (!subsetting_required) {
      reterr = ExportAux1a(fread_end, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr, pgvp->patch_01_set, pgvp->patch_01_vals, &(pgvp->patch_01_ct));
    } else {
      reterr = ExportAux1aProperSubset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1a_mode, raw_sample_ct, sample_ct, allele_ct, raw_01_ct, &fread_ptr, pgvp->patch_01_set, pgvp->patch_01_vals, &(pgvp->patch_01_ct), deltalist_workspace);
    }
    if (unlikely(reterr)) {
      return reterr;
    }
  }
  // Remember where aux1b starts, in case it must be re-parsed for het
  // detection below.
  const unsigned char* aux1b_start = fread_ptr;
  if (aux1b_mode != 15) {
    if (!subsetting_required) {
      reterr = ExportAux1b(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, pgvp->patch_10_set, pgvp->patch_10_vals, &(pgvp->patch_10_ct));
    } else {
      reterr = ExportAux1bProperSubset(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1b_mode, raw_sample_ct, sample_ct, allele_ct, raw_10_ct, &fread_ptr, pgvp->patch_10_set, pgvp->patch_10_vals, &(pgvp->patch_10_ct), deltalist_workspace);
    }
    if (unlikely(reterr)) {
      return reterr;
    }
  }
  if (fread_pp) {
    // Caller wants the read pointer advanced past the multiallelic track.
    *fread_pp = fread_ptr;
    *fread_endp = fread_end;
    if (all_hets) {
      PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
      if (aux1b_mode != 15) {
        // can merge this with ExportAux1b functions later
        uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
        uint32_t aux1b_het_present;
        reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &aux1b_start, aux1b_hets, &aux1b_het_present, deltalist_workspace);
        if (unlikely(reterr)) {
          return reterr;
        }
        if (aux1b_het_present) {
          // Fold altx/alty hets from aux1b into the het bitarray.
          BitvecOr(aux1b_hets, BitCtToWordCt(raw_sample_ct), all_hets);
        }
      }
    }
  }
  return kPglRetSuccess;
}
6492
PgrGetM(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)6493 PglErr PgrGetM(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
6494 pgvp->patch_01_ct = 0;
6495 pgvp->patch_10_ct = 0;
6496 if (!sample_ct) {
6497 return kPglRetSuccess;
6498 }
6499 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
6500 const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
6501 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
6502 const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
6503 if (!multiallelic_hc_present) {
6504 return ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, pgvp->genovec);
6505 }
6506 return GetMultiallelicCodes(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, nullptr, pgvp);
6507 }
6508
DetectGenoarrHetsHw(const uintptr_t * __restrict genoarr,uint32_t raw_sample_ctl2,Halfword * all_hets_hw)6509 void DetectGenoarrHetsHw(const uintptr_t*__restrict genoarr, uint32_t raw_sample_ctl2, Halfword* all_hets_hw) {
6510 // requires trailing bits of genoarr to be zeroed out. does not update last
6511 // all_hets[] halfword if raw_sample_ctl2 is odd.
6512 for (uint32_t widx = 0; widx != raw_sample_ctl2; ++widx) {
6513 const uintptr_t cur_word = genoarr[widx];
6514 uintptr_t ww = (~(cur_word >> 1)) & cur_word; // low 1, high 0
6515 all_hets_hw[widx] = PackWordToHalfwordMask5555(ww);
6516 }
6517 }
6518
PgrDetectGenoarrHetsMultiallelic(const uintptr_t * __restrict genoarr,const uintptr_t * __restrict patch_10_set,const AlleleCode * __restrict patch_10_vals,uint32_t raw_sample_ct,uintptr_t * __restrict all_hets)6519 void PgrDetectGenoarrHetsMultiallelic(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t raw_sample_ct, uintptr_t* __restrict all_hets) {
6520 const Halfword* patch_10_set_alias = R_CAST(const Halfword*, patch_10_set);
6521 const AlleleCode* patch_10_vals_iter = patch_10_vals;
6522 const uint32_t word_ct_m1 = (raw_sample_ct - 1) / kBitsPerWordD2;
6523 Halfword* all_hets_hw = R_CAST(Halfword*, all_hets);
6524 for (uint32_t widx = 0; ; ++widx) {
6525 uintptr_t cur_geno_word;
6526 if (widx >= word_ct_m1) {
6527 if (widx > word_ct_m1) {
6528 if (widx % 2) {
6529 all_hets_hw[widx] = 0;
6530 }
6531 return;
6532 }
6533 const uint32_t final_ct = ModNz(raw_sample_ct, kBitsPerWordD2);
6534 cur_geno_word = bzhi_max(genoarr[widx], 2 * final_ct);
6535 } else {
6536 cur_geno_word = genoarr[widx];
6537 }
6538 uint32_t patch_10_hw = patch_10_set_alias[widx];
6539 uint32_t cur_hets = Pack01ToHalfword(cur_geno_word);
6540 while (patch_10_hw) {
6541 const AlleleCode code1 = *patch_10_vals_iter++;
6542 const AlleleCode code2 = *patch_10_vals_iter++;
6543 const uint32_t lowbit = patch_10_hw & (-patch_10_hw);
6544 if (code1 != code2) {
6545 cur_hets |= lowbit;
6546 }
6547 patch_10_hw ^= lowbit;
6548 }
6549 all_hets_hw[widx] = cur_hets;
6550 }
6551 }
6552
SkipAux1b(const unsigned char * fread_end,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_10_ct,const unsigned char ** fread_pp)6553 PglErr SkipAux1b(const unsigned char* fread_end, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp) {
6554 if (aux1b_mode == 15) {
6555 return kPglRetSuccess;
6556 }
6557 uint32_t rare10_ct;
6558 if (!aux1b_mode) {
6559 const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
6560 rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
6561 *fread_pp += fset_byte_ct;
6562 } else {
6563 const unsigned char* group_info_iter;
6564 PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare10_ct);
6565 if (unlikely(reterr)) {
6566 return reterr;
6567 }
6568 reterr = SkipDeltalistIds(fread_end, group_info_iter, rare10_ct, raw_sample_ct, 0, fread_pp);
6569 if (unlikely(reterr)) {
6570 return reterr;
6571 }
6572 }
6573 const uint32_t fvals_byte_ct = GetAux1bAlleleEntryByteCt(allele_ct, rare10_ct);
6574 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
6575 return kPglRetMalformedInput;
6576 }
6577 return kPglRetSuccess;
6578 }
6579
SkipAux1(const unsigned char * fread_end,const uintptr_t * __restrict raw_genovec,uint32_t raw_sample_ct,uint32_t allele_ct,const unsigned char ** fread_pp)6580 PglErr SkipAux1(const unsigned char* fread_end, const uintptr_t* __restrict raw_genovec, uint32_t raw_sample_ct, uint32_t allele_ct, const unsigned char** fread_pp) {
6581 const uint32_t aux1_first_byte = **fread_pp;
6582 (*fread_pp) += 1;
6583 const uint32_t aux1a_mode = aux1_first_byte & 15;
6584 const uint32_t aux1b_mode = aux1_first_byte >> 4;
6585 uint32_t raw_01_ct = 0;
6586 uint32_t raw_10_ct = 0;
6587 if ((!aux1a_mode) || (!aux1b_mode)) {
6588 GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
6589 }
6590 PglErr reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, fread_pp);
6591 if (unlikely(reterr)) {
6592 return reterr;
6593 }
6594 return SkipAux1b(fread_end, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, fread_pp);
6595 }
6596
6597 // sample_include assumed to be nullptr if no subsetting required
6598 // subsetted_10het should only be provided when you explicitly want to exclude
6599 // those phase entries
6600 // set phasepresent == phaseinfo == nullptr if you want to skip the entire
6601 // track; ok for phasepresent_ct_ptr to be nullptr too in that case
6602 // (also see SkipAux2() and GetPhasepresentAndSkipPhaseinfo() below)
PglErr ParseAux2Subset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict all_hets, const uintptr_t* __restrict subsetted_10het, uint32_t raw_sample_ct, uint32_t sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr, uintptr_t* __restrict workspace_subset) {
  const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
  const uint32_t het_ct = PopcountWords(all_hets, raw_sample_ctl);
  if (unlikely(!het_ct)) {
    // there shouldn't be a hphase track at all in this case, het_ct is not
    // computed off a subset
    return kPglRetMalformedInput;
  }
  const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
  const unsigned char* aux2_start = *fread_pp;
  // Low bit of the first aux2 byte selects the format: 0 = phase present for
  // every het, 1 = explicit phasepresent bitvector followed by phaseinfo.
  if (!(aux2_start[0] & 1)) {
    // phase always present
    if (PtrAddCk(fread_end, 1 + (het_ct / CHAR_BIT), fread_pp)) {
      return kPglRetMalformedInput;
    }
    if (!phaseinfo) {
      // for internal callers which just want to skip aux2
      return kPglRetSuccess;
    }
    if (!sample_include) {
      // No subsetting: every het is phased.
      memcpy(phasepresent, all_hets, raw_sample_ctl * kBytesPerWord);
      ExpandBytearr(aux2_start, all_hets, raw_sample_ctl, het_ct, 1, phaseinfo);
      if (!subsetted_10het) {
        *phasepresent_ct_ptr = het_ct;
        return kPglRetSuccess;
      }
    } else {
      CopyBitarrSubset(all_hets, sample_include, sample_ct, phasepresent);
      if (AllWordsAreZero(phasepresent, sample_ctl)) {
        *phasepresent_ct_ptr = 0;
        // bugfix (7 Dec 2017): clear sample_ctl words here, not raw_sample_ctl
        ZeroWArr(sample_ctl, phaseinfo);
        return kPglRetSuccess;
      }
      ExpandThenSubsetBytearr(aux2_start, all_hets, sample_include, het_ct, sample_ct, 1, phaseinfo);
    }
    // bugfix (25 Feb 2020): forgot to mask out subsetted_10het here
  } else {
    const uint32_t het_ctdl = het_ct / kBitsPerWord;

    // explicit phasepresent
    const uintptr_t* aux2_first_part = R_CAST(const uintptr_t*, aux2_start);
    // Copy to a word-aligned buffer with the trailing word zeroed, so
    // whole-word popcounts below are safe.
    uintptr_t* aux2_first_part_copy = workspace_subset;
    aux2_first_part_copy[het_ctdl] = 0;
    memcpy(aux2_first_part_copy, aux2_first_part, 1 + (het_ct / CHAR_BIT));
    // Subtract 1 for the format bit, which is set in this branch.
    const uint32_t raw_phasepresent_ct = PopcountWords(aux2_first_part_copy, het_ctdl + 1) - 1;
    if (unlikely(!raw_phasepresent_ct)) {
      // there shouldn't be a hphase track at all in this case
      return kPglRetMalformedInput;
    }
    // Second part: one phaseinfo bit per set phasepresent bit.
    const unsigned char* aux2_second_part = &(aux2_start[1 + (het_ct / CHAR_BIT)]);
    *fread_pp = aux2_second_part;
    if (PtrAddCk(fread_end, DivUp(raw_phasepresent_ct, CHAR_BIT), fread_pp)) {
      return kPglRetMalformedInput;
    }
    if (!phaseinfo) {
      return kPglRetSuccess;
    }
    if (!sample_include) {
      ExpandBytearrNested(aux2_second_part, aux2_first_part_copy, all_hets, sample_ctl, raw_phasepresent_ct, 1, phasepresent, phaseinfo);
      if (!subsetted_10het) {
        *phasepresent_ct_ptr = raw_phasepresent_ct;
        return kPglRetSuccess;
      }
    } else {
      // could skip if intersection of phasepresent with sample_include is
      // empty, but this function call should be fast enough there anyway?
      ExpandThenSubsetBytearrNested(aux2_second_part, aux2_first_part_copy, all_hets, sample_include, sample_ct, raw_phasepresent_ct, 1, phasepresent, phaseinfo);
    }
  }
  if (subsetted_10het) {
    // Exclude altx/alty het patches from phasepresent when requested.
    BitvecInvmask(subsetted_10het, sample_ctl, phasepresent);
  }
  *phasepresent_ct_ptr = PopcountWords(phasepresent, sample_ctl);
  return kPglRetSuccess;
}
6679
SkipAux2(const unsigned char * fread_end,uint32_t het_ct,const unsigned char ** fread_pp,uint32_t * __restrict phasepresent_ctp)6680 PglErr SkipAux2(const unsigned char* fread_end, uint32_t het_ct, const unsigned char** fread_pp, uint32_t* __restrict phasepresent_ctp) {
6681 const unsigned char* aux2_start = *fread_pp;
6682 const uint32_t aux2_first_part_byte_ct = 1 + (het_ct / CHAR_BIT);
6683 if (PtrAddCk(fread_end, aux2_first_part_byte_ct, fread_pp)) {
6684 return kPglRetMalformedInput;
6685 }
6686 if (!(aux2_start[0] & 1)) {
6687 if (phasepresent_ctp) {
6688 *phasepresent_ctp = het_ct;
6689 }
6690 return kPglRetSuccess;
6691 }
6692 const uint32_t phasepresent_ct = PopcountBytes(aux2_start, aux2_first_part_byte_ct) - 1;
6693 if (phasepresent_ctp) {
6694 *phasepresent_ctp = phasepresent_ct;
6695 }
6696 if (PtrAddCk(fread_end, DivUp(phasepresent_ct, CHAR_BIT), fread_pp)) {
6697 return kPglRetMalformedInput;
6698 }
6699 return kPglRetSuccess;
6700 }
6701
6702 // If fread_pp/fread_endp are non-null, this always moves fread_ptr to the end
6703 // of aux2. Set phasepresent/phaseinfo to nullptr when you don't actually care
6704 // about the contents of aux2.
6705 // In multiallelic case, this guarantees phasepresent bits are only set at
6706 // ref/altx hets, not at altx/alty hets. (We don't currently guarantee this
6707 // for phaseinfo, since popcounts on that array are meaningless.) Yes, this is
6708 // mildly annoying, but the code would be messier if the ordering of
6709 // multiallelic-hardcall and hardcall-phase info were swapped.
PglErr ReadGenovecHphaseSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* phasepresent_ct_ptr) {
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  // 0x18 masks the multiallelic-hardcall and hardcall-phase track bits (see
  // VrtypeMultiallelicHc()/VrtypeHphase() usage below).  If neither track is
  // present, or phase is absent and the caller doesn't need the read cursor
  // advanced, a plain genovec load suffices.
  if ((!(vrtype & 0x18)) || ((!fread_pp) && (!VrtypeHphase(vrtype)))) {
    *phasepresent_ct_ptr = 0;
    return ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, fread_pp, fread_endp, genovec);
  }
  // Either hphase track is present; or if it's absent, multiallelic track is
  // present and we were asked to advance fread_ptr to the end of aux2.
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  // The unsubsetted genovec is still needed below (het detection, aux1
  // parsing), so read into workspace_vec unless we can decode straight into
  // the caller's buffer.
  uintptr_t* raw_genovec = (subsetting_required || VrtypeMultiallelicHc(vrtype))? pgrp->workspace_vec : genovec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
  if (unlikely(reterr)) {
    return reterr;
  }
  ZeroTrailingNyps(raw_sample_ct, raw_genovec);
  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
  const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
  if (raw_genovec != genovec) {
    CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
    if (!VrtypeHphase(vrtype)) {
      // only possible if multiallelic track present and fread_ptr must be
      // advanced to end of aux2 (so fread_pp is known to be non-null here)
      *fread_pp = fread_ptr;
      *fread_endp = fread_end;
      return SkipAux1(fread_end, raw_genovec, raw_sample_ct, allele_ct, fread_pp);
    }
  }
  // Het detection must be performed on the full (unsubsetted) genovec, since
  // aux2's phase entries are keyed to raw het positions.
  uintptr_t* all_hets = pgrp->workspace_all_hets;
  PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
  uintptr_t* subsetted_10het = nullptr;
  if (VrtypeMultiallelicHc(vrtype)) {
    // aux1 header byte: low nybble = aux1a (0/1 het) mode, high nybble =
    // aux1b (1/1 hardcall patch) mode.
    const uint32_t aux1_first_byte = *fread_ptr++;
    const uint32_t aux1a_mode = aux1_first_byte & 15;
    const uint32_t aux1b_mode = aux1_first_byte >> 4;
    uint32_t raw_01_ct = 0;
    uint32_t raw_10_ct = 0;
    if ((!aux1a_mode) || (!aux1b_mode)) {
      // mode 0 stores a dense bitarray whose length depends on these counts.
      GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
    }
    reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr);
    if (unlikely(reterr)) {
      return reterr;
    }
    // 1. fill workspace_aux1x_present with aux1b
    // 2. clear bit for each hom-altx call in aux1b
    // 3. bitvec-or to set new workspace_all_hets bits
    // 4. if not subsetting, set subsetted_10het := aux1b_hets
    //    if subsetting, copy-subset to raw_genovec (reused as scratch) and
    //      set to that
    //    if no aux1b hets at all, keep as nullptr
    uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
    uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
    uint32_t aux1b_het_present;
    reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, aux1b_hets, &aux1b_het_present, deltalist_workspace);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (aux1b_het_present) {
      BitvecOr(aux1b_hets, BitCtToWordCt(raw_sample_ct), all_hets);
      if (!subsetting_required) {
        subsetted_10het = aux1b_hets;
      } else {
        // Don't need raw_genovec any more.
        CopyBitarrSubset(aux1b_hets, sample_include, sample_ct, raw_genovec);
        subsetted_10het = raw_genovec;
      }
    }
  }
  reterr = ParseAux2Subset(fread_end, subsetting_required? sample_include : nullptr, all_hets, subsetted_10het, raw_sample_ct, sample_ct, &fread_ptr, phasepresent, phaseinfo, phasepresent_ct_ptr, pgrp->workspace_subset);
  // Report the final cursor position even on ParseAux2Subset() failure, so
  // the caller's contract (fread_pp advanced to end of aux2) is best-effort.
  if (fread_pp) {
    *fread_pp = fread_ptr;
    *fread_endp = fread_end;
  }
  return reterr;
}
6787
PgrGetP(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict genovec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)6788 PglErr PgrGetP(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
6789 if (!sample_ct) {
6790 *phasepresent_ct_ptr = 0;
6791 return kPglRetSuccess;
6792 }
6793 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
6794 assert(vidx < pgrp->fi.raw_variant_ct);
6795 return ReadGenovecHphaseSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, vidx, pgrp, nullptr, nullptr, genovec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6796 }
6797
6798 // eventually want to return fread_ptr/fread_end, but not relevant until
6799 // multiallelic dosage working
PglErr Get1MP(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
  // Preconditions: sample_ct > 0; either allele_idx > 1 or ((allele_idx == 1)
  // && multiallelic_hc_present).
  const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
  if (!VrtypeHphase(vrtype)) {
    // No phase track: reduce to the unphased single-allele loader.
    *phasepresent_ct_ptr = 0;
    return IMPLPgrGet1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_countvec);
  }
  uintptr_t* all_hets = pgrp->workspace_all_hets;
  uintptr_t* subsetted_10het = nullptr;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  // Get1Multiallelic() fills allele_countvec and leaves fread_ptr at the
  // start of aux2 (the phase track).
  PglErr reterr = Get1Multiallelic(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, &fread_ptr, &fread_end, all_hets, allele_countvec, &subsetted_10het);
  if (unlikely(reterr)) {
    return reterr;
  }
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  reterr = ParseAux2Subset(fread_end, (sample_ct != raw_sample_ct)? sample_include : nullptr, all_hets, subsetted_10het, raw_sample_ct, sample_ct, &fread_ptr, phasepresent, phaseinfo, phasepresent_ct_ptr, pgrp->workspace_subset);
  // bugfix (7 Sep 2018): Need to postprocess phasepresent when collapsing
  // multiple alleles.
  if (reterr || (!(*phasepresent_ct_ptr))) {
    return reterr;
  }

  // Restrict phasepresent to genotypes that remain hets (countvec value 1)
  // after the multiallelic collapse, then recount.
  // Might want to make this its own function.
  const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
  Halfword* phasepresent_alias = R_CAST(Halfword*, phasepresent);
  for (uint32_t hwidx = 0; hwidx != sample_ctl2; ++hwidx) {
    phasepresent_alias[hwidx] &= Pack01ToHalfword(allele_countvec[hwidx]);
  }
  *phasepresent_ct_ptr = PopcountWords(phasepresent, BitCtToWordCt(sample_ct));

  return kPglRetSuccess;
}
6834
PgrGet1P(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_countvec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)6835 PglErr PgrGet1P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
6836 if (!sample_ct) {
6837 *phasepresent_ct_ptr = 0;
6838 return kPglRetSuccess;
6839 }
6840 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
6841 const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
6842 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
6843 const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
6844 if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
6845 PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, allele_countvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6846 if (allele_idx) {
6847 GenovecInvertUnsafe(sample_ct, allele_countvec);
6848 if (*phasepresent_ct_ptr) {
6849 BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
6850 }
6851 }
6852 return reterr;
6853 }
6854 return Get1MP(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_countvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6855 }
6856
IMPLPgrGetInv1P(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,uint32_t allele_idx,PgenReaderMain * pgrp,uintptr_t * __restrict allele_invcountvec,uintptr_t * __restrict phasepresent,uintptr_t * __restrict phaseinfo,uint32_t * __restrict phasepresent_ct_ptr)6857 PglErr IMPLPgrGetInv1P(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
6858 if (!sample_ct) {
6859 *phasepresent_ct_ptr = 0;
6860 return kPglRetSuccess;
6861 }
6862 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
6863 const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
6864 if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
6865 PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, allele_invcountvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6866 if (!allele_idx) {
6867 GenovecInvertUnsafe(sample_ct, allele_invcountvec);
6868 if (*phasepresent_ct_ptr) {
6869 BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
6870 }
6871 }
6872 return reterr;
6873 }
6874 PglErr reterr = Get1MP(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec, phasepresent, phaseinfo, phasepresent_ct_ptr);
6875 if (unlikely(reterr)) {
6876 return reterr;
6877 }
6878 GenovecInvertUnsafe(sample_ct, allele_invcountvec);
6879 if (*phasepresent_ct_ptr) {
6880 BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
6881 }
6882 return kPglRetSuccess;
6883 }
6884
// Loads a genovec restricted to the (allele_idx0, allele_idx1) allele pair
// (0 = hom allele_idx0, 1 = het, 2 = hom allele_idx1, 3 = other/missing),
// plus hardcall-phase info for the surviving hets.
PglErr PgrGet2P(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict phasepresent, uintptr_t* __restrict phaseinfo, uint32_t* __restrict phasepresent_ct_ptr) {
  PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
  const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  if (!VrtypeHphase(vrtype)) {
    // No phase track: defer to the unphased two-allele loader.
    *phasepresent_ct_ptr = 0;
    return IMPLPgrGet2(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx0, allele_idx1, pgrp, genovec);
  }
  if (!sample_ct) {
    *phasepresent_ct_ptr = 0;
    return kPglRetSuccess;
  }
  if (allele_idx0 + allele_idx1 == 1) {
    // {0,1} pair: this is just the ordinary biallelic phased read, inverted
    // when the alleles are requested in (1, 0) order.
    PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genovec, phasepresent, phaseinfo, phasepresent_ct_ptr);
    if (allele_idx0) {
      GenovecInvertUnsafe(sample_ct, genovec);
      if (*phasepresent_ct_ptr) {
        BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
      }
    }
    return reterr;
  }
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  uintptr_t* raw_genovec = pgrp->workspace_vec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
  if (unlikely(reterr)) {
    return reterr;
  }
  ZeroTrailingNyps(raw_sample_ct, raw_genovec);

  // Normalize to allele_idx0 < allele_idx1; remember to invert the result
  // (and phaseinfo) at the end if the caller's order was reversed.
  uint32_t invert = 0;
  if (allele_idx0 > allele_idx1) {
    const uint32_t swap = allele_idx0;
    allele_idx0 = allele_idx1;
    allele_idx1 = swap;
    invert = 1;
  }
  if (allele_idx0 > 1) {
    // Neither allele appears in the main genovec track; start from
    // all-missing and let the aux1 patches fill in matches.
    SetAllBits(2 * sample_ct, genovec);
  } else {
    CopyNyparrNonemptySubset(raw_genovec, sample_include, raw_sample_ct, sample_ct, genovec);
    // allele_idx1 > 1 guaranteed
    if (!allele_idx0) {
      GenovecNonzeroToMissingUnsafe(sample_ct, genovec);
    } else {
      GenovecInvertThenNonzeroToMissingUnsafe(sample_ct, genovec);
    }
  }
  uintptr_t* all_hets = pgrp->workspace_all_hets;
  PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
  uintptr_t* subsetted_10het = nullptr;
  if (!subsetting_required) {
    sample_include = nullptr;
  }

  // allele_idx1 >= 2 here, so the variant is multiallelic and
  // allele_idx_offsets is non-null.
  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
  const uint32_t allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
  if (VrtypeMultiallelicHc(vrtype)) {
    // This combines ReadGenovecHphaseSubsetUnsafe() and Get2()'s logic.
    const uint32_t aux1_first_byte = *fread_ptr++;
    const uint32_t aux1a_mode = aux1_first_byte & 15;
    const uint32_t aux1b_mode = aux1_first_byte >> 4;
    uint32_t raw_01_ct = 0;
    uint32_t raw_10_ct = 0;
    if ((!aux1a_mode) || (!aux1b_mode)) {
      GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
    }
    uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
    if (!allele_idx0) {
      // Two cases:
      // - If allele_idx == 1, convert all aux1a entries from 01 to 11.
      // - Otherwise, for each matching aux1a entry, convert from 11 to 01.
      reterr = GenoarrAux1aUpdate(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, allele_idx1, 2, raw_01_ct, &fread_ptr, genovec, deltalist_workspace);
    } else {
      reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr);
    }
    if (unlikely(reterr)) {
      return reterr;
    }
    // Save the aux1b start so it can be re-parsed below for het detection.
    const unsigned char* aux1b_start = fread_ptr;
    reterr = GenoarrAux1bUpdate2(fread_end, sample_include, sample_include_cumulative_popcounts, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, allele_idx0, allele_idx1, raw_10_ct, &fread_ptr, genovec, deltalist_workspace);
    if (unlikely(reterr)) {
      return reterr;
    }
    // Can have a modified version of GenoarrAux1bUpdate2() which only requires
    // one pass, but let's keep the logic simpler for now since I don't expect
    // this function to be used frequently.
    uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
    uint32_t aux1b_het_present;
    reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &aux1b_start, aux1b_hets, &aux1b_het_present, deltalist_workspace);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (aux1b_het_present) {
      BitvecOr(aux1b_hets, BitCtToWordCt(raw_sample_ct), all_hets);
      if (!subsetting_required) {
        subsetted_10het = aux1b_hets;
      } else {
        // Don't need raw_genovec any more.
        CopyBitarrSubset(aux1b_hets, sample_include, sample_ct, raw_genovec);
        subsetted_10het = raw_genovec;
      }
    }
  }
  reterr = ParseAux2Subset(fread_end, sample_include, all_hets, subsetted_10het, raw_sample_ct, sample_ct, &fread_ptr, phasepresent, phaseinfo, phasepresent_ct_ptr, pgrp->workspace_subset);
  if (unlikely(reterr)) {
    return reterr;
  }
  if (VrtypeMultiallelicHc(vrtype) && (*phasepresent_ct_ptr)) {
    // Keep phasepresent bits only for genotypes that are hets (value 1) in
    // the collapsed two-allele encoding, then recount.
    const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
    Halfword* phasepresent_alias = R_CAST(Halfword*, phasepresent);
    for (uint32_t hwidx = 0; hwidx != sample_ctl2; ++hwidx) {
      phasepresent_alias[hwidx] &= Pack01ToHalfword(genovec[hwidx]);
    }
    *phasepresent_ct_ptr = PopcountWords(phasepresent, BitCtToWordCt(sample_ct));
  }
  if (invert) {
    GenovecInvertUnsafe(sample_ct, genovec);
    if (*phasepresent_ct_ptr) {
      BitvecInvert(BitCtToWordCt(sample_ct), phaseinfo);
    }
  }
  return kPglRetSuccess;
}
7012
PgrGetMP(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)7013 PglErr PgrGetMP(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
7014 pgvp->patch_01_ct = 0;
7015 pgvp->patch_10_ct = 0;
7016 if (!sample_ct) {
7017 pgvp->phasepresent_ct = 0;
7018 return kPglRetSuccess;
7019 }
7020 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
7021 const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
7022 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
7023 const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
7024 if (!multiallelic_hc_present) {
7025 return ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, pgvp->genovec, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct));
7026 }
7027 const unsigned char* fread_ptr;
7028 const unsigned char* fread_end;
7029 uintptr_t* all_hets = VrtypeHphase(vrtype)? pgrp->workspace_all_hets : nullptr;
7030 PglErr reterr = GetMultiallelicCodes(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, all_hets? (&fread_ptr) : nullptr, all_hets? (&fread_end) : nullptr, all_hets, pgvp);
7031 if (reterr || (!all_hets)) {
7032 return reterr;
7033 }
7034 const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
7035 return ParseAux2Subset(fread_end, (sample_ct != raw_sample_ct)? sample_include : nullptr, all_hets, nullptr, raw_sample_ct, sample_ct, &fread_ptr, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct), pgrp->workspace_subset);
7036 }
7037
7038 // ok for sample_include to be nullptr if not subsetting, though this is not
7039 // required
// Parses the 16-bit dosage track (and, when dphase_present is non-null and
// the variant has one, the phased-dosage track) starting at fread_ptr.
// Results are subsetted to sample_include when sample_ct != raw_sample_ct.
// dosage_ct_ptr/dphase_ct_ptr may be nullptr when the caller doesn't need
// the counts.
PglErr ParseDosage16(const unsigned char* fread_ptr, const unsigned char* fread_end, const uintptr_t* __restrict sample_include, uint32_t sample_ct, uint32_t vidx, uint32_t allele_ct, PgenReaderMain* pgrp, uint32_t* __restrict dosage_ct_ptr, uintptr_t* __restrict dphase_present, int16_t* dphase_delta, uint32_t* __restrict dphase_ct_ptr, uintptr_t* __restrict dosage_present, uint16_t* dosage_main) {
  // Side effect: may use pgrp->workspace_dosage_present and
  // pgrp->workspace_dphase_present
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  uintptr_t* raw_dosage_present = subsetting_required? pgrp->workspace_dosage_present : dosage_present;
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  // vrtype bits 0x60 select the dosage storage mode: 0x20 = deltalist,
  // 0x40 = unconditional (one value per sample), otherwise dense bitarray.
  const uint32_t is_unconditional_dosage = ((vrtype & 0x60) == 0x40);
  uint32_t raw_dosage_ct;
  if ((vrtype & 0x60) == 0x20) {
    // case 1: dosage list
    if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, &fread_ptr, raw_dosage_present, &raw_dosage_ct))) {
      return kPglRetMalformedInput;
    }
  } else if (is_unconditional_dosage) {
    // case 2: unconditional dosage. handle separately from other two cases
    // since missing values may be present.
    SetAllBits(raw_sample_ct, raw_dosage_present);
    raw_dosage_ct = raw_sample_ct;
  } else {
    // case 3: dosage bitarray
    // Zero the trailing word first since the memcpy below only covers
    // ceil(raw_sample_ct / 8) bytes.
    raw_dosage_present[raw_sample_ctl - 1] = 0;
    const uint32_t raw_sample_ctb = DivUp(raw_sample_ct, CHAR_BIT);
    memcpy(raw_dosage_present, fread_ptr, raw_sample_ctb);
    fread_ptr = &(fread_ptr[raw_sample_ctb]);
    raw_dosage_ct = PopcountWords(raw_dosage_present, raw_sample_ctl);
  }
  const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
  uint32_t dosage_ct;
  if (subsetting_required) {
    CopyBitarrSubset(raw_dosage_present, sample_include, sample_ct, dosage_present);
    dosage_ct = PopcountWords(dosage_present, sample_ctl);
  } else {
    dosage_ct = raw_dosage_ct;
  }
  if (dosage_ct_ptr) {
    *dosage_ct_ptr = dosage_ct;
  }
  if (!dosage_ct) {
    if (dphase_ct_ptr) {
      *dphase_ct_ptr = 0;
    }
    return kPglRetSuccess;
  }
#ifdef __arm__
#  error "Unaligned accesses in ParseDosage16()."
#endif
  const uint16_t* dosage_main_read_iter = R_CAST(const uint16_t*, fread_ptr);
  uint16_t* dosage_main_write_iter = dosage_main;
  uint32_t raw_dphase_ct = 0;
  uint32_t dphase_ct = 0;
  uintptr_t* raw_dphase_present = nullptr;
  // vrtype bit 0x80 indicates a phased-dosage track follows the dosage
  // values; only parse it when the caller supplied dphase_present.
  if (dphase_present && (vrtype & 0x80)) {
    fread_ptr = &(fread_ptr[raw_dosage_ct * 2]);
    if (!is_unconditional_dosage) {
      // dphase-present is stored as a bitarray over the dosage entries;
      // expand it to sample space.
      const uintptr_t* file_dphase_present = R_CAST(const uintptr_t*, fread_ptr);
      fread_ptr = &(fread_ptr[DivUp(raw_dosage_ct, CHAR_BIT)]);
      raw_dphase_present = subsetting_required? pgrp->workspace_dphase_present : dphase_present;
      ExpandBytearr(file_dphase_present, raw_dosage_present, raw_sample_ctl, raw_dosage_ct, 0, raw_dphase_present);
      raw_dphase_ct = PopcountWords(raw_dphase_present, raw_sample_ctl);
      dphase_ct = raw_dphase_ct;
      if (subsetting_required) {
        CopyBitarrSubset(raw_dphase_present, sample_include, sample_ct, dphase_present);
        dphase_ct = PopcountWords(dphase_present, sample_ctl);
      }
    } else {
      // raw_dphase_present = raw_dosage_present;
      dphase_ct = dosage_ct;
      SetAllBits(sample_ct, dphase_present);
    }
  }
  if (!dphase_ct) {
    // --- unphased dosage only ---
    if (allele_ct == 2) {
      if (!is_unconditional_dosage) {
        if (dosage_ct == raw_dosage_ct) {
          memcpy(dosage_main_write_iter, dosage_main_read_iter, dosage_ct * sizeof(int16_t));
        } else {
          // bugfix (22 May 2017): dosage_entry_idx needs to iterate up to
          // raw_dosage_ct, not dosage_ct
          uintptr_t widx = ~k0LU;
          uint32_t dosage_entry_idx = 0;
          do {
            uintptr_t cur_bits;
            do {
              cur_bits = raw_dosage_present[++widx];
            } while (!cur_bits);
            const uintptr_t sample_include_word = sample_include[widx];
            do {
              const uintptr_t low_bit = cur_bits & (-cur_bits);
              if (sample_include_word & low_bit) {
                *dosage_main_write_iter++ = dosage_main_read_iter[dosage_entry_idx];
              }
              ++dosage_entry_idx;
              cur_bits ^= low_bit;
            } while (cur_bits);
          } while (dosage_entry_idx != raw_dosage_ct);
        }
      } else {
        // Unconditional storage: 65535 is the missing-dosage sentinel, so
        // the present-bitvector and count must be patched up while copying.
        if (!subsetting_required) {
          for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
            const uint16_t cur_dosage = *dosage_main_read_iter++;
            if (cur_dosage != 65535) {
              *dosage_main_write_iter++ = cur_dosage;
            } else {
              ClearBit(sample_idx, dosage_present);
            }
          }
        } else {
          uintptr_t widx = ~k0LU;
          uint32_t sample_idx = 0;
          do {
            uintptr_t cur_bits;
            do {
              cur_bits = sample_include[++widx];
            } while (!cur_bits);
            const uintptr_t sample_uidx_base = widx * kBitsPerWord;
            const uint16_t* dosage_main_readp = &(dosage_main_read_iter[sample_uidx_base]);
            do {
              const uint32_t sample_uidx_lowbits = ctzw(cur_bits);
              const uint16_t cur_dosage = dosage_main_readp[sample_uidx_lowbits];
              if (cur_dosage != 65535) {
                *dosage_main_write_iter++ = cur_dosage;
              } else {
                ClearBit(sample_idx, dosage_present);
              }
              ++sample_idx;
              cur_bits &= cur_bits - 1;
            } while (cur_bits);
          } while (sample_idx != sample_ct);
        }
        if (dosage_ct_ptr) {
          *dosage_ct_ptr = dosage_main_write_iter - dosage_main;
        }
      }
    } else {
      // todo: multiallelic dosage
      // need to support downcode to ref/nonref as well as raw load
      // (dosage_ct_ptr should be nullptr iff we're doing a raw load)
      fputs("multiallelic variants not yet supported by ParseDosage16()\n", stderr);
      exit(S_CAST(int32_t, kPglRetNotYetSupported));
      return kPglRetSuccess;
    }
    if (dphase_ct_ptr) {
      *dphase_ct_ptr = 0;
    }
  } else {
    // phased dosage
    if (allele_ct == 2) {
      if (!is_unconditional_dosage) {
        if (dphase_ct == raw_dphase_ct) {
          // No subsetting losses: straight copies of both tracks.
          memcpy(dosage_main_write_iter, dosage_main_read_iter, dosage_ct * sizeof(int16_t));
          memcpy(dphase_delta, fread_ptr, dphase_ct * sizeof(int16_t));
          if (dphase_ct_ptr) {
            *dphase_ct_ptr = dphase_ct;
          }
        } else {
          // Subset-filter the dosage values...
          uintptr_t widx = ~k0LU;
          uint32_t dosage_entry_idx = 0;
          do {
            uintptr_t cur_bits;
            do {
              cur_bits = raw_dosage_present[++widx];
            } while (!cur_bits);
            const uintptr_t sample_include_word = sample_include[widx];
            do {
              const uintptr_t low_bit = cur_bits & (-cur_bits);
              if (sample_include_word & low_bit) {
                *dosage_main_write_iter++ = dosage_main_read_iter[dosage_entry_idx];
              }
              ++dosage_entry_idx;
              cur_bits ^= low_bit;
            } while (cur_bits);
          } while (dosage_entry_idx != raw_dosage_ct);
          // ...then subset-filter the dphase deltas the same way.
          widx = ~k0LU;
          uint32_t dphase_entry_idx = 0;
          const int16_t* dphase_delta_read_alias = R_CAST(const int16_t*, fread_ptr);
          int16_t* dphase_delta_write_iter = dphase_delta;
          do {
            uintptr_t cur_bits;
            do {
              cur_bits = raw_dphase_present[++widx];
            } while (!cur_bits);
            const uintptr_t sample_include_word = sample_include[widx];
            do {
              const uintptr_t low_bit = cur_bits & (-cur_bits);
              if (sample_include_word & low_bit) {
                *dphase_delta_write_iter++ = dphase_delta_read_alias[dphase_entry_idx];
              }
              ++dphase_entry_idx;
              cur_bits ^= low_bit;
            } while (cur_bits);
          } while (dphase_entry_idx != raw_dphase_ct);
          if (dphase_ct_ptr) {
            *dphase_ct_ptr = dphase_delta_write_iter - dphase_delta;
          }
        }
      } else {
        // Unconditional phased dosage: 65535 = missing dosage, and a zero
        // dphase_delta entry means "dosage present but unphased".
        const int16_t* dphase_delta_read = R_CAST(const int16_t*, fread_ptr);
        int16_t* dphase_delta_write_iter = dphase_delta;
        if (!subsetting_required) {
          for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
            const uint16_t cur_dosage = *dosage_main_read_iter++;
            if (cur_dosage != 65535) {
              *dosage_main_write_iter++ = cur_dosage;
              const int16_t dphase_delta_val = dphase_delta_read[sample_idx];
              if (dphase_delta_val) {
                *dphase_delta_write_iter++ = dphase_delta_val;
              } else {
                ClearBit(sample_idx, dphase_present);
              }
            } else {
              // assert(dphase_delta_read[sample_idx] == -32768);
              ClearBit(sample_idx, dosage_present);
            }
          }
        } else {
          uintptr_t sample_uidx_base = 0;
          uintptr_t sample_include_bits = sample_include[0];
          for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
            const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
            const uint16_t cur_dosage = dosage_main_read_iter[sample_uidx];
            if (cur_dosage != 65535) {
              *dosage_main_write_iter++ = cur_dosage;
              const int16_t dphase_delta_val = dphase_delta_read[sample_uidx];
              if (dphase_delta_val) {
                *dphase_delta_write_iter++ = dphase_delta_val;
              } else {
                ClearBit(sample_idx, dphase_present);
              }
            } else {
              // assert(dphase_delta_read[sample_uidx] == -32768);
              ClearBit(sample_idx, dosage_present);
            }
          }
        }
        dosage_ct = dosage_main_write_iter - dosage_main;
        if (dosage_ct != sample_ct) {
          // Some dosages were missing; dphase_present (initialized to
          // all-set above) must be masked down to the surviving entries.
          BitvecAnd(dosage_present, sample_ctl, dphase_present);
        }
        if (dosage_ct_ptr) {
          *dosage_ct_ptr = dosage_ct;
        }
        if (dphase_ct_ptr) {
          *dphase_ct_ptr = dphase_delta_write_iter - dphase_delta;
        }
      }
    } else {
      // multiallelic subcase
      fputs("multiallelic variants not yet supported by ParseDosage16()\n", stderr);
      exit(S_CAST(int32_t, kPglRetNotYetSupported));
      return kPglRetSuccess;
    }
  }
  return kPglRetSuccess;
}
7296
IMPLPgrGetD(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,uintptr_t * __restrict genovec,uintptr_t * __restrict dosage_present,uint16_t * dosage_main,uint32_t * dosage_ct_ptr)7297 PglErr IMPLPgrGetD(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) {
7298 assert(vidx < pgrp->fi.raw_variant_ct);
7299 if (!sample_ct) {
7300 *dosage_ct_ptr = 0;
7301 return kPglRetSuccess;
7302 }
7303 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
7304 if ((!VrtypeDosage(vrtype)) || (!dosage_present)) {
7305 *dosage_ct_ptr = 0;
7306 return ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, nullptr, nullptr, genovec);
7307 }
7308 const unsigned char* fread_ptr = nullptr;
7309 const unsigned char* fread_end = nullptr;
7310 uint32_t phasepresent_ct;
7311 PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec, nullptr, nullptr, &phasepresent_ct);
7312 if (unlikely(reterr)) {
7313 return reterr;
7314 }
7315 const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
7316 const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
7317 return ParseDosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, allele_ct, pgrp, dosage_ct_ptr, nullptr, nullptr, nullptr, dosage_present, dosage_main);
7318 }
7319
PgrGet1D(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,AlleleCode allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_countvec,uintptr_t * __restrict dosage_present,uint16_t * dosage_main,uint32_t * dosage_ct_ptr)7320 PglErr PgrGet1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) {
7321 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
7322 const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
7323 const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
7324 const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
7325 if ((allele_ct == 2) || (!allele_idx)) {
7326 uint32_t dosage_ct;
7327 PglErr reterr = IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, allele_countvec, dosage_present, dosage_main, &dosage_ct);
7328 if (!allele_idx) {
7329 GenovecInvertUnsafe(sample_ct, allele_countvec);
7330 if (dosage_ct) {
7331 BiallelicDosage16Invert(dosage_ct, dosage_main);
7332 }
7333 }
7334 *dosage_ct_ptr = dosage_ct;
7335 return reterr;
7336 }
7337 const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
7338 if (!VrtypeDosage(vrtype)) {
7339 *dosage_ct_ptr = 0;
7340 return IMPLPgrGet1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_countvec);
7341 }
7342 fputs("multiallelic variants not yet supported by PgrGet1D()\n", stderr);
7343 exit(S_CAST(int32_t, kPglRetNotYetSupported));
7344 return kPglRetSuccess;
7345 }
7346
PgrGetInv1D(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,AlleleCode allele_idx,PgenReader * pgr_ptr,uintptr_t * __restrict allele_invcountvec,uintptr_t * __restrict dosage_present,uint16_t * dosage_main,uint32_t * dosage_ct_ptr)7347 PglErr PgrGetInv1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) {
7348 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
7349 const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
7350 const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
7351 const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
7352 if ((allele_ct == 2) || (!allele_idx)) {
7353 uint32_t dosage_ct;
7354 PglErr reterr = IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, allele_invcountvec, dosage_present, dosage_main, &dosage_ct);
7355 if (allele_idx) {
7356 GenovecInvertUnsafe(sample_ct, allele_invcountvec);
7357 if (dosage_ct) {
7358 BiallelicDosage16Invert(dosage_ct, dosage_main);
7359 }
7360 }
7361 *dosage_ct_ptr = dosage_ct;
7362 return reterr;
7363 }
7364 const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
7365 if (!VrtypeDosage(vrtype)) {
7366 *dosage_ct_ptr = 0;
7367 return IMPLPgrGetInv1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec);
7368 }
7369 fputs("multiallelic variants not yet supported by PgrGetInv1D()\n", stderr);
7370 exit(S_CAST(int32_t, kPglRetNotYetSupported));
7371 return kPglRetSuccess;
7372 }
7373
// Adds to *raw_het_ctp the number of heterozygous entries hidden in the aux1b
// track (the patch describing which "10" hardcalls involve rare alts), and
// advances *fread_pp past the track.
// - aux1b_mode == 15 means the variant has no aux1b track; nothing is read.
// - aux1b_mode == 0: the patched entries are stored as a raw_10_ct-bit set.
// - aux1b_mode == 1: the patched entries are stored as a difflist/deltalist.
// Returns kPglRetSuccess, or kPglRetMalformedInput if the track would run
// past fread_end.
PglErr GetAux1bHetIncr(const unsigned char* fread_end, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uint32_t* __restrict raw_het_ctp) {
  if (aux1b_mode == 15) {
    return kPglRetSuccess;
  }
  uint32_t rare10_ct;
  if (!aux1b_mode) {
    // Mode 0: bitset over the raw_10_ct "10" hardcalls; popcount gives the
    // number of patched entries.
    const uint32_t fset_byte_ct = DivUp(raw_10_ct, 8);
    rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
    *fread_pp += fset_byte_ct;
  } else {
    // aux1b_mode == 1
    // Mode 1: parse the difflist header for the entry count, then skip the
    // sample-ID deltalist (only the count matters here).
    const unsigned char* group_info_iter;
    PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare10_ct);
    if (unlikely(reterr)) {
      return reterr;
    }
    reterr = SkipDeltalistIds(fread_end, group_info_iter, rare10_ct, raw_sample_ct, 0, fread_pp);
    if (unlikely(reterr)) {
      return reterr;
    }
  }
  // Each patched entry stores one or two allele codes of width
  // (1 << allele_code_logwidth) bits; code10_logwidth is the log2 width of a
  // whole entry.
  uintptr_t detect_hom_mask_lo;
  const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
  const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
#ifdef __arm__
#  error "Unaligned accesses in GetAux1bHetIncr()."
#endif
  const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
  const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, CHAR_BIT);
  if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
    return kPglRetMalformedInput;
  }
  if (allele_ct == 3) {
    // One bit per entry; set bits mark alt2/alt2 homozygotes, so the
    // remaining entries are het.
    const uint32_t hom22_ct = PopcountBytes(patch_10_fvalsw, fvals_byte_ct);
    *raw_het_ctp += rare10_ct - hom22_ct;
    return kPglRetSuccess;
  }
  // possible todo: vectorized het-counter, analogous to CountAux1bDense()
  const uint32_t code10_width = 1U << code10_logwidth;
  const uint32_t allele_code_width = 1U << allele_code_logwidth;
  const uintptr_t detect_all_mask_lo = detect_hom_mask_lo | (detect_hom_mask_lo << allele_code_width);
  const uintptr_t detect_all_mask_hi = detect_all_mask_lo << (allele_code_width - 1);
  const uintptr_t detect_hom_mask_hi = detect_hom_mask_lo << (code10_width - 1);
  const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
  uint32_t het_incr = 0;
  for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
    uintptr_t fvals_bits;
    if (fvals_widx >= fvals_word_ct_m1) {
      if (fvals_widx > fvals_word_ct_m1) {
        break;
      }
      // Final (possibly partial) word: load only the remaining bytes.
      fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
    } else {
      fvals_bits = patch_10_fvalsw[fvals_widx];
    }
    // allele_ct > 3 guaranteed
    // XOR each entry's low allele code against its high allele code; a het
    // entry leaves a nonzero field.  The masked subtraction then propagates
    // "field is nonzero" into the field's top bit (carry/borrow trick), so
    // the popcount of the surviving detect_hom_mask_hi bits is the het count.
    fvals_bits = fvals_bits ^ (fvals_bits << allele_code_width);
    fvals_bits = detect_hom_mask_hi & (fvals_bits | ((fvals_bits | detect_all_mask_hi) - detect_all_mask_lo));
    if (fvals_widx == fvals_word_ct_m1) {
      // Zero out garbage bits beyond the last real entry.
      fvals_bits = bzhi_max(fvals_bits, ModNz(rare10_ct << code10_logwidth, kBitsPerWord));
    }
    het_incr += PopcountWord(fvals_bits);
  }
  *raw_het_ctp += het_incr;
  return kPglRetSuccess;
}
7440
// Returns the sum of the first entry_ct values in uint16_vec, as a uint64_t.
// On 64-bit builds this uses SIMD: even- and odd-indexed uint16s are
// accumulated into separate vectors of 32-bit lanes, which are flushed into
// the 64-bit scalar sum often enough that the 32-bit lanes cannot overflow.
// NOTE(review): caller is presumably responsible for uint16_vec being
// vector-aligned on the __LP64__ path — confirm at call sites.
uint64_t U16VecSum(const uint16_t* __restrict uint16_vec, uint32_t entry_ct) {
#ifdef __LP64__
  // UniVecHsum32() could overflow once we exceed this
  // (each 32-bit lane grows by at most 65535 per iteration, so the lane
  // total stays below 2^32 as long as a single inner loop runs fewer than
  // ~65536 iterations).
  const uint32_t max_loop_len = (131072 / kInt32PerVec) - 1;

  const VecW m16 = VCONST_W(kMask0000FFFF);
  const VecW* uint16_vvec_iter = R_CAST(const VecW*, uint16_vec);
  uint64_t sum = 0;
  for (uint32_t full_vecs_remaining = entry_ct / (kBytesPerVec / sizeof(int16_t)); ; ) {
    // Fresh accumulators for each bounded inner loop.
    UniVec acc_even;
    UniVec acc_odd;
    acc_even.vw = vecw_setzero();
    acc_odd.vw = vecw_setzero();
    const VecW* uint16_vvec_stop;
    if (full_vecs_remaining < max_loop_len) {
      if (!full_vecs_remaining) {
        // All full vectors consumed: add the trailing scalar entries and
        // return.
        const uint32_t trail_ct = entry_ct % (kBytesPerVec / sizeof(int16_t));
        uint16_vec = R_CAST(const uint16_t*, uint16_vvec_iter);
        for (uint32_t uii = 0; uii != trail_ct; ++uii) {
          sum += uint16_vec[uii];
        }
        return sum;
      }
      uint16_vvec_stop = &(uint16_vvec_iter[full_vecs_remaining]);
      full_vecs_remaining = 0;
    } else {
      uint16_vvec_stop = &(uint16_vvec_iter[max_loop_len]);
      full_vecs_remaining -= max_loop_len;
    }
    do {
      // Split each vector into its even uint16s (masked) and odd uint16s
      // (shifted then masked) so each lands in its own 32-bit lane.
      const VecW cur_vec = *uint16_vvec_iter++;
      acc_even.vw = acc_even.vw + (cur_vec & m16);
      acc_odd.vw = acc_odd.vw + (vecw_srli(cur_vec, 16) & m16);
    } while (uint16_vvec_iter < uint16_vvec_stop);
    // Flush 32-bit lane totals into the 64-bit running sum.
    sum += UniVecHsum32(acc_even);
    sum += UniVecHsum32(acc_odd);
  }
#else
  // Scalar fallback for 32-bit builds.
  uint64_t sum = 0;
  for (uint32_t uii = 0; uii != entry_ct; ++uii) {
    sum += uint16_vec[uii];
  }
  return sum;
#endif
}
7486
GetPhasepresentAndSkipPhaseinfo(const unsigned char * fread_end,const uintptr_t * __restrict all_hets,uint32_t raw_sample_ct,uint32_t het_ct,const unsigned char ** fread_pp,uintptr_t * __restrict phasepresent,uint32_t * __restrict phasepresent_ctp)7487 PglErr GetPhasepresentAndSkipPhaseinfo(const unsigned char* fread_end, const uintptr_t* __restrict all_hets, uint32_t raw_sample_ct, uint32_t het_ct, const unsigned char** fread_pp, uintptr_t* __restrict phasepresent, uint32_t* __restrict phasepresent_ctp) {
7488 const unsigned char* aux2_start = *fread_pp;
7489 const uint32_t aux2_first_part_byte_ct = 1 + (het_ct / CHAR_BIT);
7490 if (PtrAddCk(fread_end, aux2_first_part_byte_ct, fread_pp)) {
7491 return kPglRetMalformedInput;
7492 }
7493 const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
7494 if (!(aux2_start[0] & 1)) {
7495 memcpy(phasepresent, all_hets, raw_sample_ctl * kBytesPerWord);
7496 *phasepresent_ctp = het_ct;
7497 return kPglRetSuccess;
7498 }
7499 const uint32_t phasepresent_ct = PopcountBytes(aux2_start, aux2_first_part_byte_ct) - 1;
7500 if (PtrAddCk(fread_end, DivUp(phasepresent_ct, CHAR_BIT), fread_pp)) {
7501 return kPglRetMalformedInput;
7502 }
7503 *phasepresent_ctp = phasepresent_ct;
7504 ExpandBytearr(aux2_start, all_hets, raw_sample_ctl, het_ct, 1, phasepresent);
7505 return kPglRetSuccess;
7506 }
7507
// Computes the number of UNPHASED het calls among the selected samples, for
// minimac3-r2 computation, without disturbing fread_ptr.
// - sample_include == nullptr signals "no subsetting"; then subsetted_het_ct
//   is already the raw het count.
// - Caller is expected to have initialized *unphased_het_ctp to 0 (see the
//   early-return comment below); it is only written on the later paths.
// Returns kPglRetMalformedInput if the aux2 first part runs past fread_end.
PglErr GetUnphasedBiallelicHetCt(const uintptr_t* __restrict sample_include, const uintptr_t* raw_genoarr, const unsigned char* fread_ptr, const unsigned char* fread_end, uint32_t subsetted_het_ct, PgenReaderMain* pgrp, uint32_t* unphased_het_ctp) {
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  uint32_t raw_het_ct;
  if (!sample_include) {
    raw_het_ct = subsetted_het_ct;
  } else {
    // Subsetting in effect: recount hets over ALL samples, since aux2 is
    // indexed by raw het position.
    raw_het_ct = CountNyp(raw_genoarr, kMask5555, raw_sample_ct);
  }
  // 1 flag bit + raw_het_ct phasepresent bits.
  const uint32_t aux2_first_part_byte_ct = 1 + (raw_het_ct / CHAR_BIT);
  if (PtrCheck(fread_end, fread_ptr, aux2_first_part_byte_ct)) {
    return kPglRetMalformedInput;
  }
  const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
  if (!explicit_phasepresent) {
    // All hets phased -> zero unphased hets.
    // initial value of 0 is correct
    return kPglRetSuccess;
  }
  if (raw_het_ct == subsetted_het_ct) {
    // No subsetting effect on hets: unphased = total hets - phased hets,
    // where phased = popcount(first part) - 1 (discarding the flag bit).
    *unphased_het_ctp = raw_het_ct + 1 - PopcountBytes(fread_ptr, aux2_first_part_byte_ct);
    return kPglRetSuccess;
  }
  // A dedicated counting function would be faster, but this case
  // should rarely come up.
  // Expand the per-het phasepresent bits to sample positions, then subtract
  // the phased-and-included count from the subsetted het count.
  uintptr_t* all_hets = pgrp->workspace_all_hets;
  PgrDetectGenoarrHets(raw_genoarr, raw_sample_ct, all_hets);
  uintptr_t* raw_phasepresent = pgrp->workspace_subset;
  const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
  // todo: compare against ExpandThenSubsetBytearr followed by simple popcount
  ExpandBytearr(fread_ptr, all_hets, raw_sample_ctl, raw_het_ct, 1, raw_phasepresent);
  *unphased_het_ctp = subsetted_het_ct - PopcountWordsIntersect(raw_phasepresent, sample_include, raw_sample_ctl);
  return kPglRetSuccess;
}
7540
7541 PglErr GetPhasedBiallelicGenotypeSubsetCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, uint32_t* unphased_het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts) {
7542 // Currently much less optimized than the other count functions. (This case
7543 // shouldn't come up much, the user has to be computing minimac-r2 on a file
7544 // with no dosages...)
7545 uintptr_t* raw_genovec = pgrp->workspace_vec;
7546 const unsigned char* fread_ptr;
7547 const unsigned char* fread_end;
7548 PglErr reterr = ReadRawGenovec(1, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
7549 if (unlikely(reterr)) {
7550 return reterr;
7551 }
7552 const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
7553 ZeroTrailingNyps(raw_sample_ct, raw_genovec);
7554 GenoarrCountSubsetFreqs(raw_genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
7555 return GetUnphasedBiallelicHetCt(sample_include, raw_genovec, fread_ptr, fread_end, genocounts[1], pgrp, unphased_het_ctp);
7556 }
7557
7558 // Imputation r^2 computation:
7559 // * This function assumes the biallelic diploid case. Divide by two to get
7560 // the biallelic haploid value, for whatever that's worth.
7561 // * chrX requires sex information, so that's handled directly in
7562 // LoadAlleleAndGenoCountsThread()... er, actually, we just give up on that
7563 // for now.
7564 // * See PgrGetMDCounts() support functions below for multiallelic-diploid
7565 // notes.
7566 PglErr GetBasicGenotypeCountsAndDosage16s(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReaderMain* pgrp, double* imp_r2_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages) {
7567 // genocounts[0] := ref/ref, genocounts[1] := ref/altx,
7568 // genocounts[2] := altx/alty, genocounts[3] := missing
7569 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
7570 const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
7571 const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
7572 uint32_t unphased_het_ct = 0;
7573 // To avoid LD cache thrashing, we try to either always keep a subsetted
7574 // cache, or never do so. (Always, when only hardcalls are present;
7575 // otherwise never.)
7576 if ((!(pgrp->fi.gflags & kfPgenGlobalDosagePresent)) ||
7577 ((!(vrtype & 0x60)) && (!subsetting_required))) {
7578 {
7579 const uint32_t need_unphased_het_ct = is_minimac3_r2 && VrtypeHphase(vrtype);
7580 PglErr reterr;
7581 if (!(subsetting_required && need_unphased_het_ct)) {
7582 reterr = GetBasicGenotypeCounts(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, need_unphased_het_ct? (&unphased_het_ct) : nullptr, genocounts);
7583 } else {
7584 reterr = GetPhasedBiallelicGenotypeSubsetCounts(sample_include, sample_include_interleaved_vec, sample_ct, vidx, pgrp, &unphased_het_ct, genocounts);
7585 }
7586 if (unlikely(reterr)) {
7587 return reterr;
7588 }
7589 }
7590 GetBasicGenotypeCountsAndDosage16s_basic_finish:
7591 all_dosages[0] = (genocounts[0] * 2 + genocounts[1]) * 16384LLU;
7592 all_dosages[1] = (genocounts[2] * 2 + genocounts[1]) * 16384LLU;
7593 if (!imp_r2_ptr) {
7594 return kPglRetSuccess;
7595 }
7596 // yeah, it's sinful to implement imputation r2 here...
7597 const uint32_t nm_sample_ct = sample_ct - genocounts[3];
7598 const uint64_t alt1_dosage = genocounts[2] * 0x8000LLU + genocounts[1] * 0x4000LLU;
7599 uint64_t hap_alt1_ssq_x2 = genocounts[2] * 0x40000000LLU + genocounts[1] * 0x10000000LLU;
7600 if (is_minimac3_r2) {
7601 if (!VrtypeHphase(vrtype)) {
7602 unphased_het_ct = genocounts[1];
7603 }
7604 hap_alt1_ssq_x2 += (genocounts[1] - unphased_het_ct) * 0x10000000LLU;
7605 }
7606 *imp_r2_ptr = BiallelicDiploidMinimac3R2(alt1_dosage, hap_alt1_ssq_x2, nm_sample_ct);
7607 if (!is_minimac3_r2) {
7608 *imp_r2_ptr *= 2;
7609 }
7610 return kPglRetSuccess;
7611 }
7612 uintptr_t* raw_genovec = pgrp->workspace_vec;
7613 const unsigned char* fread_ptr;
7614 const unsigned char* fread_end;
7615 PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
7616 if (unlikely(reterr)) {
7617 return reterr;
7618 }
7619 ZeroTrailingNyps(raw_sample_ct, raw_genovec);
7620 if (!subsetting_required) {
7621 GenoarrCountFreqsUnsafe(raw_genovec, raw_sample_ct, genocounts);
7622 } else {
7623 GenoarrCountSubsetFreqs(raw_genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
7624 }
7625 if (!(vrtype & 0x60)) {
7626 if (is_minimac3_r2 && VrtypeHphase(vrtype)) {
7627 assert(!VrtypeMultiallelicHc(vrtype));
7628 reterr = GetUnphasedBiallelicHetCt(subsetting_required? sample_include : nullptr, raw_genovec, fread_ptr, fread_end, genocounts[1], pgrp, &unphased_het_ct);
7629 if (unlikely(reterr)) {
7630 return reterr;
7631 }
7632 }
7633 goto GetBasicGenotypeCountsAndDosage16s_basic_finish;
7634 }
7635 const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
7636 const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
7637 const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
7638 uintptr_t* raw_phasepresent = pgrp->workspace_subset;
7639 uint32_t raw_phasepresent_ct = 0;
7640 if (VrtypeHphase(vrtype)) {
7641 uint32_t raw_het_ct = genocounts[1]; // inaccurate if subsetting_required
7642 if (!is_minimac3_r2) {
7643 if (VrtypeMultiallelicHc(vrtype)) {
7644 const uint32_t aux1_first_byte = *fread_ptr++;
7645 const uint32_t aux1a_mode = aux1_first_byte & 15;
7646 const uint32_t aux1b_mode = aux1_first_byte >> 4;
7647 uint32_t raw_10_ct = 0;
7648 if ((!aux1a_mode) || (!aux1b_mode) || subsetting_required) {
7649 GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_het_ct, &raw_10_ct);
7650 }
7651 reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_het_ct, &fread_ptr);
7652 if (unlikely(reterr)) {
7653 return reterr;
7654 }
7655 reterr = GetAux1bHetIncr(fread_end, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, &raw_het_ct);
7656 if (unlikely(reterr)) {
7657 return reterr;
7658 }
7659 } else if (subsetting_required) {
7660 raw_het_ct = CountNyp(raw_genovec, kMask5555, raw_sample_ct);
7661 }
7662 reterr = SkipAux2(fread_end, raw_het_ct, &fread_ptr, nullptr);
7663 if (unlikely(reterr)) {
7664 return reterr;
7665 }
7666 } else {
7667 assert(!VrtypeMultiallelicHc(vrtype));
7668 uintptr_t* all_hets = pgrp->workspace_all_hets;
7669 PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
7670 if (subsetting_required) {
7671 raw_het_ct = PopcountWords(all_hets, raw_sample_ctl);
7672 }
7673 const uint32_t first_half_byte_ct = 1 + (raw_het_ct / CHAR_BIT);
7674 const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
7675 if (explicit_phasepresent) {
7676 ExpandBytearr(fread_ptr, all_hets, raw_sample_ctl, raw_het_ct, 1, raw_phasepresent);
7677 raw_phasepresent_ct = PopcountBytes(fread_ptr, first_half_byte_ct) - 1;
7678 const uint32_t second_half_byte_ct = DivUp(raw_phasepresent_ct, CHAR_BIT);
7679 fread_ptr = &(fread_ptr[first_half_byte_ct + second_half_byte_ct]);
7680 } else {
7681 raw_phasepresent_ct = raw_het_ct;
7682 memcpy(raw_phasepresent, all_hets, raw_sample_ctl * sizeof(intptr_t));
7683 fread_ptr = &(fread_ptr[first_half_byte_ct]);
7684 }
7685 }
7686 } else if (VrtypeMultiallelicHc(vrtype)) {
7687 reterr = SkipAux1(fread_end, raw_genovec, raw_sample_ct, allele_ct, &fread_ptr);
7688 if (unlikely(reterr)) {
7689 return reterr;
7690 }
7691 }
7692 if (allele_ct != 2) {
7693 // Maybe make this an invalid function call? If that happens, the
7694 // VrtypeMultiallelicHc() branch above can be removed.
7695 fputs("multiallelic dosages not yet supported by GetBasicGenotypeCountsAndDosage16s()\n", stderr);
7696 exit(S_CAST(int32_t, kPglRetNotYetSupported));
7697 return kPglRetSuccess;
7698 }
7699
7700 const uint32_t is_unconditional_dosage = ((vrtype & 0x60) == 0x40);
7701 uint64_t alt1_dosage = 0;
7702 uint32_t dosage_ct = 0;
7703 STD_ARRAY_DECL(uint32_t, 4, replaced_genocounts);
7704 if ((!is_minimac3_r2) || (!(vrtype & 0x90))) {
7705 uint64_t alt1_dosage_sq_sum = 0;
7706 if (is_unconditional_dosage) {
7707 // needs to be handled separately from the other cases due to possible
7708 // presence of missing values.
7709 // note that this code will also need to be adjusted when multiallelic
7710 // support is added.
7711 #ifdef __arm__
7712 # error "Unaligned accesses in GetBasicGenotypeCountsAndDosage16s()."
7713 #endif
7714 STD_ARRAY_FILL0(replaced_genocounts);
7715 const uint16_t* dosage_main = R_CAST(const uint16_t*, fread_ptr);
7716 if (PtrAddCk(fread_end, raw_sample_ct * sizeof(int16_t), &fread_ptr)) {
7717 return kPglRetMalformedInput;
7718 }
7719 if (subsetting_required) {
7720 uintptr_t sample_uidx_base = 0;
7721 uintptr_t sample_include_bits = sample_include[0];
7722 for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
7723 const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
7724 const uintptr_t cur_dosage_val = dosage_main[sample_uidx];
7725 if (cur_dosage_val != 65535) {
7726 alt1_dosage += cur_dosage_val;
7727
7728 // todo: check if this is slow enough to justify removing it from
7729 // the main loop
7730 alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
7731 ++dosage_ct;
7732 }
7733 }
7734 } else {
7735 for (uint32_t sample_uidx = 0; sample_uidx != sample_ct; ++sample_uidx) {
7736 const uintptr_t cur_dosage_val = dosage_main[sample_uidx];
7737 if (cur_dosage_val != 65535) {
7738 alt1_dosage += cur_dosage_val;
7739 alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
7740 ++dosage_ct;
7741 }
7742 }
7743 }
7744 // update (20 Mar 2019): .pgen specification tightened to remove the need
7745 // to update replaced_genocounts in the main loops above.
7746 STD_ARRAY_COPY(genocounts, 4, replaced_genocounts);
7747 replaced_genocounts[3] = replaced_genocounts[3] + dosage_ct - sample_ct;
7748 } else {
7749 uintptr_t* raw_dosage_present = pgrp->workspace_dosage_present;
7750 uint32_t raw_dosage_ct;
7751 if (!(vrtype & 0x40)) {
7752 // dosage list
7753 if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, &fread_ptr, raw_dosage_present, &raw_dosage_ct))) {
7754 return kPglRetMalformedInput;
7755 }
7756 } else {
7757 // dosage bitarray
7758 raw_dosage_present[raw_sample_ctl - 1] = 0;
7759 const uint32_t raw_sample_ctb = DivUp(raw_sample_ct, CHAR_BIT);
7760 memcpy(raw_dosage_present, fread_ptr, raw_sample_ctb);
7761 fread_ptr = &(fread_ptr[raw_sample_ctb]);
7762 raw_dosage_ct = PopcountWords(raw_dosage_present, raw_sample_ctl);
7763 }
7764 const uint16_t* dosage_main_iter = R_CAST(const uint16_t*, fread_ptr);
7765 if (PtrAddCk(fread_end, raw_dosage_ct * sizeof(int16_t), &fread_ptr)) {
7766 return kPglRetMalformedInput;
7767 }
7768 if (subsetting_required) {
7769 uintptr_t sample_widx = 0;
7770 uintptr_t dosage_present_bits = raw_dosage_present[0];
7771 for (uint32_t dosage_idx = 0; dosage_idx != raw_dosage_ct; ++dosage_idx) {
7772 const uintptr_t lowbit = BitIter1y(raw_dosage_present, &sample_widx, &dosage_present_bits);
7773 if (sample_include[sample_widx] & lowbit) {
7774 const uintptr_t cur_dosage_val = dosage_main_iter[dosage_idx];
7775 alt1_dosage += cur_dosage_val;
7776 alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
7777 ++dosage_ct;
7778 }
7779 }
7780 GenoarrCountSubsetIntersectFreqs(raw_genovec, raw_dosage_present, sample_include, raw_sample_ct, replaced_genocounts);
7781 } else {
7782 if (!imp_r2_ptr) {
7783 for (uint32_t dosage_idx = 0; dosage_idx != raw_dosage_ct; ++dosage_idx) {
7784 alt1_dosage += dosage_main_iter[dosage_idx];
7785 }
7786 } else {
7787 for (uint32_t dosage_idx = 0; dosage_idx != raw_dosage_ct; ++dosage_idx) {
7788 const uintptr_t cur_dosage_val = dosage_main_iter[dosage_idx];
7789 alt1_dosage += cur_dosage_val;
7790 alt1_dosage_sq_sum += cur_dosage_val * cur_dosage_val;
7791 }
7792 }
7793 dosage_ct = raw_dosage_ct;
7794 GenoarrCountSubsetFreqs2(raw_genovec, raw_dosage_present, raw_sample_ct, raw_dosage_ct, replaced_genocounts);
7795 }
7796 }
7797 const uint32_t replaced_ct = replaced_genocounts[0] + replaced_genocounts[1] + replaced_genocounts[2];
7798 const uint32_t remaining_het_ct = genocounts[1] - replaced_genocounts[1];
7799 const uint32_t remaining_hom_alt_ct = genocounts[2] - replaced_genocounts[2];
7800 const uint32_t alt1_ct = 2 * remaining_hom_alt_ct + remaining_het_ct;
7801 alt1_dosage += alt1_ct * 16384LLU;
7802 all_dosages[1] = alt1_dosage;
7803 const uint32_t nondosage_nm_ct = sample_ct - genocounts[3] - replaced_ct;
7804 const uint32_t new_sample_nm_ct = dosage_ct + nondosage_nm_ct;
7805 all_dosages[0] = new_sample_nm_ct * 32768LLU - alt1_dosage;
7806 if (!imp_r2_ptr) {
7807 return kPglRetSuccess;
7808 }
7809 // possible todo: also move all-hardcall-phase-present, no-dosage
7810 // is_minimac3_r2 case under this branch, since we can just set imp_r2 to
7811 // NaN or 1.
7812 // 16384^2, 32768^2
7813 alt1_dosage_sq_sum += remaining_het_ct * 0x10000000LLU + remaining_hom_alt_ct * 0x40000000LLU;
7814 *imp_r2_ptr = BiallelicDiploidMinimac3R2(alt1_dosage, alt1_dosage_sq_sum, new_sample_nm_ct);
7815 if (!is_minimac3_r2) {
7816 *imp_r2_ptr *= 2;
7817 }
7818 return kPglRetSuccess;
7819 }
7820 // Need to deal with implicitly phased dosages. Best to have raw_genovec,
7821 // raw_phasepresent, dosage_present, and dosage_main all available, then loop
7822 // over everything at once.
7823 // (phaseinfo is irrelevant since only absolute value of (left - right)
7824 // matters.)
7825
7826 // We have the following 2x2x3 cases to deal with:
7827 // - Subsetted vs. un-subsetted. Un-subsetted comes up a lot, so we have an
7828 // optimized code path for it.
7829 // - Unconditional vs. conditional dosage. Unconditional should not come up
7830 // much, so we just mock up raw_dosage_present... er, actually, that
7831 // doesn't work because dosage_main would also need to be collapsed. Sigh.
7832 // Ok, it's still handled separately.
7833 // - Only hardcall-phase, vs. only dosage-phase, vs. both. At least we can
7834 // merge the "only dosage-phase" and "both" cases.
7835 // So we end up with 8 primary code paths.
7836 // This is kind of a nightmare; it would obviously be nicer to move this
7837 // out of pgenlib_internal, and that may eventually happen. But we don't
7838 // want users to be discouraged from running --minimac3-r2-filter when it's
7839 // appropriate just because it's a lot slower than other standard filters;
7840 // and this also serves as a testing ground for efficient phased-dosage
7841 // handling strategies.
7842 if (!VrtypeHphase(vrtype)) {
7843 ZeroWArr(raw_sample_ctl, raw_phasepresent);
7844 }
7845 uintptr_t* raw_dosage_present = nullptr;
7846 const uint16_t* dosage_main;
7847 uint32_t raw_dosage_ct = 0;
7848 if (is_unconditional_dosage) {
7849 dosage_main = R_CAST(const uint16_t*, fread_ptr);
7850 if (PtrAddCk(fread_end, raw_sample_ct * sizeof(int16_t), &fread_ptr)) {
7851 return kPglRetMalformedInput;
7852 }
7853 // raw_dosage_ct unused in this case.
7854 } else {
7855 // could move some duplicate code before the big branch
7856 raw_dosage_present = pgrp->workspace_dosage_present;
7857 if (!(vrtype & 0x40)) {
7858 // dosage list
7859 if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, &fread_ptr, raw_dosage_present, &raw_dosage_ct))) {
7860 return kPglRetMalformedInput;
7861 }
7862 } else {
7863 // dosage bitarray
7864 raw_dosage_present[raw_sample_ctl - 1] = 0;
7865 const uint32_t raw_sample_ctb = DivUp(raw_sample_ct, CHAR_BIT);
7866 memcpy(raw_dosage_present, fread_ptr, raw_sample_ctb);
7867 fread_ptr = &(fread_ptr[raw_sample_ctb]);
7868 raw_dosage_ct = PopcountWords(raw_dosage_present, raw_sample_ctl);
7869 }
7870 dosage_main = R_CAST(const uint16_t*, fread_ptr);
7871 if (PtrAddCk(fread_end, raw_dosage_ct * sizeof(int16_t), &fread_ptr)) {
7872 return kPglRetMalformedInput;
7873 }
7874 }
7875 const uint16_t* dosage_main_iter = dosage_main;
7876 uint64_t hap_ssq_x2 = 0;
7877 uint32_t phased_hc_het_ct = 0;
7878 if (!(vrtype & 0x80)) {
7879 if (is_unconditional_dosage) {
7880 if (!subsetting_required) {
7881 const uint32_t raw_sample_ctl_m1 = raw_sample_ctl - 1;
7882 uint32_t loop_len = kBitsPerWord;
7883 for (uint32_t widx = 0; ; ++widx) {
7884 if (widx >= raw_sample_ctl_m1) {
7885 if (widx > raw_sample_ctl_m1) {
7886 break;
7887 }
7888 loop_len = ModNz(raw_sample_ct, kBitsPerWord);
7889 }
7890 uintptr_t phasepresent_word = raw_phasepresent[widx];
7891 for (uint32_t uii = 0; uii != loop_len; ++uii) {
7892 const uintptr_t cur_dosage_val = *dosage_main_iter++;
7893 if (cur_dosage_val != 65535) {
7894 alt1_dosage += cur_dosage_val;
7895 hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7896 ++dosage_ct;
7897 if (phasepresent_word & 1) {
7898 // For each dosage, when phasepresent bit is set, implicit
7899 // dphase_delta value is 16384 - |16384 - x|.
7900 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
7901 hap_ssq_x2 += homdist * homdist;
7902 }
7903 }
7904 phasepresent_word = phasepresent_word >> 1;
7905 }
7906 }
7907 } else {
7908 for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
7909 uintptr_t sample_include_word = sample_include[widx];
7910 if (!sample_include_word) {
7911 continue;
7912 }
7913 const uintptr_t phasepresent_word = raw_phasepresent[widx];
7914 const uint16_t* cur_dosage_main = &(dosage_main[widx * kBitsPerWord]);
7915 do {
7916 const uint32_t sample_idx_lowbits = ctzw(sample_include_word);
7917 const uintptr_t cur_dosage_val = cur_dosage_main[sample_idx_lowbits];
7918 const uintptr_t lowbit = sample_include_word & (-sample_include_word);
7919 if (cur_dosage_val != 65535) {
7920 alt1_dosage += cur_dosage_val;
7921 hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7922 ++dosage_ct;
7923 if (lowbit & phasepresent_word) {
7924 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
7925 hap_ssq_x2 += homdist * homdist;
7926 }
7927 }
7928 sample_include_word ^= lowbit;
7929 } while (sample_include_word);
7930 }
7931 }
7932 STD_ARRAY_COPY(genocounts, 4, replaced_genocounts);
7933 replaced_genocounts[3] = replaced_genocounts[3] + dosage_ct - sample_ct;
7934 } else { // !is_unconditional_dosage
7935 if (!subsetting_required) {
7936 // phased_hc_het_ct := popcount(phasepresent & (~dosage_present))
7937 phased_hc_het_ct = raw_phasepresent_ct - PopcountWordsIntersect(raw_phasepresent, raw_dosage_present, raw_sample_ctl);
7938
7939 for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
7940 uintptr_t dosage_present_word = raw_dosage_present[widx];
7941 if (dosage_present_word) {
7942 const uintptr_t phasepresent_word = raw_phasepresent[widx];
7943 do {
7944 const uintptr_t cur_dosage_val = *dosage_main_iter++;
7945 alt1_dosage += cur_dosage_val;
7946 const uintptr_t lowbit = dosage_present_word & (-dosage_present_word);
7947 hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7948 if (lowbit & phasepresent_word) {
7949 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
7950 hap_ssq_x2 += homdist * homdist;
7951 }
7952 dosage_present_word ^= lowbit;
7953 } while (dosage_present_word);
7954 }
7955 }
7956 dosage_ct = raw_dosage_ct;
7957 GenoarrCountSubsetFreqs2(raw_genovec, raw_dosage_present, raw_sample_ct, raw_dosage_ct, replaced_genocounts);
7958 } else {
7959 for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
7960 const uintptr_t sample_include_word = sample_include[widx];
7961 uintptr_t dosage_present_word = raw_dosage_present[widx];
7962 if (!sample_include_word) {
7963 dosage_main_iter = &(dosage_main_iter[PopcountWord(dosage_present_word)]);
7964 continue;
7965 }
7966 const uintptr_t phasepresent_word = raw_phasepresent[widx];
7967 phased_hc_het_ct += PopcountWord(sample_include_word & phasepresent_word & (~dosage_present_word));
7968 while (dosage_present_word) {
7969 const uintptr_t lowbit = dosage_present_word & (-dosage_present_word);
7970 if (lowbit & sample_include_word) {
7971 const uintptr_t cur_dosage_val = *dosage_main_iter;
7972 alt1_dosage += cur_dosage_val;
7973 hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7974 ++dosage_ct;
7975 if (lowbit & phasepresent_word) {
7976 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
7977 hap_ssq_x2 += homdist * homdist;
7978 }
7979 }
7980 dosage_present_word ^= lowbit;
7981 ++dosage_main_iter;
7982 }
7983 }
7984 GenoarrCountSubsetIntersectFreqs(raw_genovec, raw_dosage_present, sample_include, raw_sample_ct, replaced_genocounts);
7985 }
7986 }
7987 } else {
7988 if (is_unconditional_dosage) {
7989 if (PtrCheck(fread_end, fread_ptr, raw_sample_ct * sizeof(int16_t))) {
7990 return kPglRetMalformedInput;
7991 }
7992 const int16_t* dphase_delta = R_CAST(const int16_t*, fread_ptr);
7993 if (!subsetting_required) {
7994 for (uint32_t sample_uidx = 0; sample_uidx != raw_sample_ct; ++sample_uidx) {
7995 const uintptr_t cur_dosage_val = dosage_main[sample_uidx];
7996 if (cur_dosage_val != 65535) {
7997 alt1_dosage += cur_dosage_val;
7998 hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
7999 ++dosage_ct;
8000 // .pgen specification now requires this value to never be missing.
8001 const intptr_t dphase_delta_val = dphase_delta[sample_uidx];
8002 hap_ssq_x2 += dphase_delta_val * dphase_delta_val;
8003 }
8004 }
8005 } else {
8006 for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
8007 uintptr_t sample_include_word = sample_include[widx];
8008 if (!sample_include_word) {
8009 continue;
8010 }
8011 const uint16_t* cur_dosage_main = &(dosage_main[widx * kBitsPerWord]);
8012 const int16_t* cur_dphase_delta = &(dphase_delta[widx * kBitsPerWord]);
8013 do {
8014 const uint32_t sample_idx_lowbits = ctzw(sample_include_word);
8015 const uintptr_t cur_dosage_val = cur_dosage_main[sample_idx_lowbits];
8016 if (cur_dosage_val != 65535) {
8017 alt1_dosage += cur_dosage_val;
8018 hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
8019 ++dosage_ct;
8020 const intptr_t dphase_delta_val = cur_dphase_delta[sample_idx_lowbits];
8021 hap_ssq_x2 += dphase_delta_val * dphase_delta_val;
8022 }
8023 sample_include_word &= sample_include_word - 1;
8024 } while (sample_include_word);
8025 }
8026 }
8027 STD_ARRAY_COPY(genocounts, 4, replaced_genocounts);
8028 replaced_genocounts[3] = replaced_genocounts[3] + dosage_ct - sample_ct;
8029 } else {
8030 const uintptr_t* file_dphase_present = R_CAST(const uintptr_t*, fread_ptr);
8031 const uint32_t raw_dosage_ctb = DivUp(raw_dosage_ct, CHAR_BIT);
8032 if (PtrAddCk(fread_end, raw_dosage_ctb, &fread_ptr)) {
8033 return kPglRetMalformedInput;
8034 }
8035 const uint32_t raw_dphase_ct = PopcountBytes(file_dphase_present, raw_dosage_ctb);
8036 if (PtrCheck(fread_end, fread_ptr, raw_dphase_ct * sizeof(int16_t))) {
8037 return kPglRetMalformedInput;
8038 }
8039 uintptr_t* raw_dphase_present = pgrp->workspace_dphase_present;
8040 ExpandBytearr(file_dphase_present, raw_dosage_present, raw_sample_ctl, raw_dosage_ct, 0, raw_dphase_present);
8041 const int16_t* dphase_delta_iter = R_CAST(const int16_t*, fread_ptr);
8042 if (!subsetting_required) {
8043 phased_hc_het_ct = raw_phasepresent_ct - PopcountWordsIntersect(raw_phasepresent, raw_dosage_present, raw_sample_ctl);
8044
8045 for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
8046 uintptr_t dosage_present_word = raw_dosage_present[widx];
8047 if (dosage_present_word) {
8048 const uintptr_t phasepresent_word = raw_phasepresent[widx];
8049 const uintptr_t dphase_present_word = raw_dphase_present[widx];
8050 do {
8051 const uintptr_t cur_dosage_val = *dosage_main_iter++;
8052 alt1_dosage += cur_dosage_val;
8053 const uintptr_t lowbit = dosage_present_word & (-dosage_present_word);
8054 hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
8055 if (lowbit & dphase_present_word) {
8056 const intptr_t dphase_delta_val = *dphase_delta_iter++;
8057 hap_ssq_x2 += dphase_delta_val * dphase_delta_val;
8058 } else if (lowbit & phasepresent_word) {
8059 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
8060 hap_ssq_x2 += homdist * homdist;
8061 }
8062 dosage_present_word ^= lowbit;
8063 } while (dosage_present_word);
8064 }
8065 }
8066 dosage_ct = raw_dosage_ct;
8067 GenoarrCountSubsetFreqs2(raw_genovec, raw_dosage_present, raw_sample_ct, raw_dosage_ct, replaced_genocounts);
8068 } else {
8069 for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
8070 const uintptr_t sample_include_word = sample_include[widx];
8071 const uintptr_t dphase_present_word = raw_dphase_present[widx];
8072 uintptr_t dosage_present_word = raw_dosage_present[widx];
8073 if (!sample_include_word) {
8074 dosage_main_iter = &(dosage_main_iter[PopcountWord(dosage_present_word)]);
8075 dphase_delta_iter = &(dphase_delta_iter[PopcountWord(dphase_present_word)]);
8076 continue;
8077 }
8078 const uintptr_t phasepresent_word = raw_phasepresent[widx];
8079 phased_hc_het_ct += PopcountWord(sample_include_word & phasepresent_word & (~dosage_present_word));
8080 while (dosage_present_word) {
8081 const uintptr_t lowbit = dosage_present_word & (-dosage_present_word);
8082 const uintptr_t dphase_here = lowbit & dphase_present_word;
8083 if (lowbit & sample_include_word) {
8084 const uintptr_t cur_dosage_val = *dosage_main_iter;
8085 alt1_dosage += cur_dosage_val;
8086 hap_ssq_x2 += cur_dosage_val * cur_dosage_val;
8087 ++dosage_ct;
8088 if (dphase_here) {
8089 const intptr_t dphase_delta_val = *dphase_delta_iter;
8090 hap_ssq_x2 += dphase_delta_val * dphase_delta_val;
8091 } else if (lowbit & phasepresent_word) {
8092 const uintptr_t homdist = 16384 - abs_i32(16384 - cur_dosage_val);
8093 hap_ssq_x2 += homdist * homdist;
8094 }
8095 }
8096 dphase_delta_iter += (dphase_here != 0);
8097 dosage_present_word ^= lowbit;
8098 ++dosage_main_iter;
8099 }
8100 }
8101 GenoarrCountSubsetIntersectFreqs(raw_genovec, raw_dosage_present, sample_include, raw_sample_ct, replaced_genocounts);
8102 }
8103 }
8104 }
8105 const uint32_t replaced_ct = replaced_genocounts[0] + replaced_genocounts[1] + replaced_genocounts[2];
8106 const uint32_t remaining_het_ct = genocounts[1] - replaced_genocounts[1];
8107 const uint32_t remaining_hom_alt_ct = genocounts[2] - replaced_genocounts[2];
8108 const uint32_t alt1_ct = 2 * remaining_hom_alt_ct + remaining_het_ct;
8109 alt1_dosage += alt1_ct * 16384LLU;
8110 all_dosages[1] = alt1_dosage;
8111 const uint32_t nondosage_nm_ct = sample_ct - genocounts[3] - replaced_ct;
8112 const uint32_t new_sample_nm_ct = dosage_ct + nondosage_nm_ct;
8113 all_dosages[0] = new_sample_nm_ct * 32768LLU - alt1_dosage;
8114 hap_ssq_x2 += (remaining_het_ct + phased_hc_het_ct) * 0x10000000LLU + remaining_hom_alt_ct * 0x40000000LLU;
8115 *imp_r2_ptr = BiallelicDiploidMinimac3R2(alt1_dosage, hap_ssq_x2, new_sample_nm_ct);
8116 return kPglRetSuccess;
8117 }
8118
8119 PglErr PgrGetDCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReader* pgr_ptr, double* imp_r2_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages) {
8120 if (!sample_ct) {
8121 STD_ARRAY_REF_FILL0(4, genocounts);
8122 all_dosages[0] = 0;
8123 all_dosages[1] = 0;
8124 if (imp_r2_ptr) {
8125 *imp_r2_ptr = 0.0 / 0.0;
8126 }
8127 return kPglRetSuccess;
8128 }
8129 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8130 assert(vidx < pgrp->fi.raw_variant_ct);
8131 return GetBasicGenotypeCountsAndDosage16s(sample_include, sample_include_interleaved_vec, GetSicp(pssi), sample_ct, vidx, is_minimac3_r2, pgrp, imp_r2_ptr, genocounts, all_dosages);
8132 }
8133
8134 // Does not zero-initialize results[].
CountAllBytes64(const void * bytearr,uintptr_t byte_ct,uint64_t * __restrict results)8135 void CountAllBytes64(const void* bytearr, uintptr_t byte_ct, uint64_t* __restrict results) {
8136 const unsigned char* bytearr_uc = S_CAST(const unsigned char*, bytearr);
8137 for (uintptr_t ulii = 0; ulii != byte_ct; ++ulii) {
8138 results[bytearr_uc[ulii]] += 1;
8139 }
8140 }
8141
8142 // Does not zero-initialize results[].
CountAllNybbles64(const void * nybblearr,uintptr_t nybble_ct,uint64_t * __restrict results)8143 void CountAllNybbles64(const void* nybblearr, uintptr_t nybble_ct, uint64_t* __restrict results) {
8144 // possible todo: for sufficiently large nybble_ct, use CountAllBytes and
8145 // then postprocess
8146 const uintptr_t fullbyte_ct = nybble_ct / 2;
8147 const unsigned char* nybblearr_uc = S_CAST(const unsigned char*, nybblearr);
8148 for (uintptr_t ulii = 0; ulii != fullbyte_ct; ++ulii) {
8149 const uint32_t uii = nybblearr_uc[ulii];
8150 results[uii & 15] += 1;
8151 results[uii >> 4] += 1;
8152 }
8153 if (nybble_ct % 2) {
8154 results[nybblearr_uc[fullbyte_ct] & 15] += 1;
8155 }
8156 }
8157
CountAllAux1aDense(const void * patch_01_fvals,uint32_t allele_ct,uint32_t rare01_ct,uint64_t * __restrict one_cts)8158 void CountAllAux1aDense(const void* patch_01_fvals, uint32_t allele_ct, uint32_t rare01_ct, uint64_t* __restrict one_cts) {
8159 one_cts[1] -= rare01_ct;
8160 if (allele_ct < 5) {
8161 if (allele_ct == 3) {
8162 // all entries are 0/1 -> 0/2
8163 one_cts[2] = rare01_ct;
8164 return;
8165 }
8166 const uint32_t allele_code_byte_ct = DivUp(rare01_ct, 8);
8167 const uint32_t alt3_ct = PopcountBytes(patch_01_fvals, allele_code_byte_ct);
8168 one_cts[2] = rare01_ct - alt3_ct;
8169 one_cts[3] = alt3_ct;
8170 return;
8171 }
8172 if (allele_ct < 19) {
8173 if (allele_ct < 7) {
8174 STD_ARRAY_DECL(uint32_t, 4, rare0het_counts);
8175 GenoarrCountFreqs(R_CAST(const uintptr_t*, patch_01_fvals), rare01_ct, rare0het_counts);
8176 for (uint32_t allele_idx_p2 = 2; allele_idx_p2 != allele_ct; ++allele_idx_p2) {
8177 one_cts[allele_idx_p2] = rare0het_counts[allele_idx_p2 - 2];
8178 }
8179 return;
8180 }
8181 CountAllNybbles64(patch_01_fvals, rare01_ct, &(one_cts[2]));
8182 return;
8183 }
8184 CountAllBytes64(patch_01_fvals, rare01_ct, &(one_cts[2]));
8185 }
8186
8187 // assumes one_cts[1] initialized to genocounts[1]
8188 // sample_include should be nullptr if we aren't subsetting
CountAllAux1a(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uintptr_t * __restrict raw_genoarr,uint32_t aux1a_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_01_ct,const unsigned char ** fread_pp,uint64_t * __restrict one_cts,uint32_t * __restrict deltalist_workspace)8189 PglErr CountAllAux1a(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raw_genoarr, uint32_t aux1a_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_01_ct, const unsigned char** fread_pp, uint64_t* __restrict one_cts, uint32_t* __restrict deltalist_workspace) {
8190 if (aux1a_mode == 15) {
8191 return kPglRetSuccess;
8192 }
8193 if (!sample_include) {
8194 uint32_t rare01_ct;
8195 if (!aux1a_mode) {
8196 const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
8197 rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
8198 *fread_pp += fset_byte_ct;
8199 } else {
8200 const unsigned char* group_info_iter;
8201 PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare01_ct);
8202 if (unlikely(reterr)) {
8203 return reterr;
8204 }
8205 reterr = SkipDeltalistIds(fread_end, group_info_iter, rare01_ct, raw_sample_ct, 0, fread_pp);
8206 if (unlikely(reterr)) {
8207 return reterr;
8208 }
8209 }
8210 const unsigned char* patch_01_fvals = *fread_pp;
8211 const uint32_t fvals_byte_ct = GetAux1aAlleleEntryByteCt(allele_ct, rare01_ct);
8212 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8213 return kPglRetMalformedInput;
8214 }
8215 CountAllAux1aDense(patch_01_fvals, allele_ct, rare01_ct, one_cts);
8216 return kPglRetSuccess;
8217 }
8218 const uint32_t allele_code_width = GetAux1aWidth(allele_ct);
8219 const uintptr_t allele_code_mask = (1U << allele_code_width) - 1;
8220 uint64_t* one_cts_offset2 = &(one_cts[2]);
8221 if (!aux1a_mode) {
8222 const uint32_t fset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
8223 const uint32_t rare01_ct = PopcountBytes(*fread_pp, fset_byte_ct);
8224 #ifdef __arm__
8225 # error "Unaligned accesses in CountAllAux1a()."
8226 #endif
8227 const uintptr_t* patch_01_fsetw = R_CAST(const uintptr_t*, *fread_pp);
8228 *fread_pp += fset_byte_ct;
8229 const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
8230 const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
8231 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8232 return kPglRetMalformedInput;
8233 }
8234 const Halfword* sample_include_hw = R_CAST(const Halfword*, sample_include);
8235 uintptr_t sample_hwidx = 0;
8236 uintptr_t cur_raw_genoarr_hets = Word01(raw_genoarr[0]);
8237 const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
8238 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
8239 uintptr_t fvals_bits = 0;
8240 uint32_t fvals_widx = 0;
8241 uint32_t subsetted_rare01_ct = 0;
8242 uint32_t loop_len = kBitsPerWord;
8243 uint32_t rare01_lowbits = kBitsPerWord;
8244 for (uint32_t fset_widx = 0; ; ++fset_widx) {
8245 uintptr_t fset_bits;
8246 if (fset_widx >= fset_word_ct_m1) {
8247 if (fset_widx > fset_word_ct_m1) {
8248 break;
8249 }
8250 fset_bits = SubwordLoad(&(patch_01_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
8251 loop_len = ModNz(raw_01_ct, kBitsPerWord);
8252 } else {
8253 fset_bits = patch_01_fsetw[fset_widx];
8254 }
8255 if (allele_ct == 3) {
8256 for (uint32_t uii = 0; uii != loop_len; ++uii) {
8257 while (!cur_raw_genoarr_hets) {
8258 cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
8259 }
8260 if (fset_bits & 1) {
8261 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
8262 subsetted_rare01_ct += (sample_include_hw[sample_hwidx] >> sample_uidx_lowbits) & 1;
8263 }
8264 cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
8265 fset_bits = fset_bits >> 1;
8266 }
8267 } else {
8268 for (uint32_t uii = 0; uii != loop_len; ++uii) {
8269 while (!cur_raw_genoarr_hets) {
8270 cur_raw_genoarr_hets = Word01(raw_genoarr[++sample_hwidx]);
8271 }
8272 if (fset_bits & 1) {
8273 if (rare01_lowbits == kBitsPerWord) {
8274 if (fvals_widx == fvals_word_ct_m1) {
8275 fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8276 } else {
8277 fvals_bits = patch_01_fvalsw[fvals_widx];
8278 }
8279 // unnecessary to apply bzhi here
8280 ++fvals_widx;
8281 rare01_lowbits = 0;
8282 }
8283 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_hets) / 2;
8284 if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
8285 ++subsetted_rare01_ct;
8286 one_cts_offset2[(fvals_bits >> rare01_lowbits) & allele_code_mask] += 1;
8287 }
8288 rare01_lowbits += allele_code_width;
8289 }
8290 cur_raw_genoarr_hets &= cur_raw_genoarr_hets - 1;
8291 fset_bits = fset_bits >> 1;
8292 }
8293 }
8294 }
8295 one_cts_offset2[-1] -= subsetted_rare01_ct;
8296 if (allele_ct == 3) {
8297 one_cts_offset2[0] = subsetted_rare01_ct;
8298 }
8299 return kPglRetSuccess;
8300 }
8301 // mode 1: difflist.
8302 if (allele_ct == 3) {
8303 // Use CountDeltalistIntersect shortcut here.
8304 uint32_t subsetted_02_ct;
8305 uint32_t rare01_ct;
8306 PglErr reterr = CountDeltalistIntersect(fread_end, sample_include, raw_sample_ct, fread_pp, &subsetted_02_ct, &rare01_ct);
8307 if (unlikely(reterr)) {
8308 return reterr;
8309 }
8310 one_cts_offset2[-1] -= subsetted_02_ct;
8311 one_cts_offset2[0] = subsetted_02_ct;
8312 return kPglRetSuccess;
8313 }
8314 // Save deltalist elements, iterate.
8315 uint32_t rare01_ct;
8316 PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare01_ct);
8317 if (unlikely(reterr)) {
8318 return reterr;
8319 }
8320 const uintptr_t* patch_01_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
8321 const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare01_ct) * allele_code_width, 8);
8322 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8323 return kPglRetMalformedInput;
8324 }
8325 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
8326 const uint32_t allele_code_logwidth = ctzu32(allele_code_width);
8327 uint32_t subsetted_rare01_ct = 0;
8328 uint32_t loop_len = kBitsPerWord >> allele_code_logwidth;
8329 for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
8330 uintptr_t fvals_bits;
8331 if (fvals_widx >= fvals_word_ct_m1) {
8332 if (fvals_widx > fvals_word_ct_m1) {
8333 break;
8334 }
8335 fvals_bits = SubwordLoad(&(patch_01_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8336 loop_len = 1 + ((rare01_ct - 1) & (loop_len - 1));
8337 } else {
8338 fvals_bits = patch_01_fvalsw[fvals_widx];
8339 }
8340 const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - allele_code_logwidth)]);
8341 for (uint32_t uii = 0; uii != loop_len; ++uii) {
8342 const uint32_t sample_uidx = cur_deltalist_base[uii];
8343 if (IsSet(sample_include, sample_uidx)) {
8344 ++subsetted_rare01_ct;
8345 one_cts_offset2[(fvals_bits >> (uii << allele_code_logwidth)) & allele_code_mask] += 1;
8346 }
8347 }
8348 }
8349 one_cts_offset2[-1] -= subsetted_rare01_ct;
8350 return kPglRetSuccess;
8351 }
8352
CountAllAux1bDense(const void * __restrict patch_10_fvals,uint32_t allele_ct,uint32_t rare10_ct,uint64_t * __restrict one_cts_offset1,uint64_t * __restrict two_cts_offset1)8353 void CountAllAux1bDense(const void* __restrict patch_10_fvals, uint32_t allele_ct, uint32_t rare10_ct, uint64_t* __restrict one_cts_offset1, uint64_t* __restrict two_cts_offset1) {
8354 // probable todo: faster path if two_cts_offset1 == nullptr
8355 const uint32_t allele_ct_m1 = allele_ct - 1;
8356 two_cts_offset1[0] -= rare10_ct;
8357 if (allele_ct_m1 < 5) {
8358 if (allele_ct_m1 == 2) {
8359 const uint32_t allele_code_byte_ct = DivUp(rare10_ct, 8);
8360 const uint32_t hom22_ct = PopcountBytes(patch_10_fvals, allele_code_byte_ct);
8361 const uint32_t het12_ct = rare10_ct - hom22_ct;
8362 one_cts_offset1[0] += het12_ct;
8363 one_cts_offset1[1] += het12_ct;
8364 two_cts_offset1[1] = hom22_ct;
8365 return;
8366 }
8367 STD_ARRAY_DECL(uint32_t, 4, alt_counts);
8368 GenoarrCountFreqs(R_CAST(const uintptr_t*, patch_10_fvals), rare10_ct * 2, alt_counts);
8369 one_cts_offset1[0] += alt_counts[0];
8370 for (uint32_t allele_idx_m1 = 1; allele_idx_m1 != allele_ct_m1; ++allele_idx_m1) {
8371 const uint32_t homxx_ct = CountNybble(patch_10_fvals, allele_idx_m1 * kMask5555, rare10_ct);
8372 one_cts_offset1[allele_idx_m1] += alt_counts[allele_idx_m1] - 2 * homxx_ct;
8373 two_cts_offset1[allele_idx_m1] = homxx_ct;
8374 }
8375 return;
8376 }
8377 const unsigned char* patch_10_fvals_uc = S_CAST(const unsigned char*, patch_10_fvals);
8378 if (allele_ct_m1 < 17) {
8379 // for larger rare10_ct, this should use a byte counter
8380 for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
8381 const uint32_t cur_byte = patch_10_fvals_uc[uii];
8382 const uint32_t cur_byte_hi = cur_byte >> 4;
8383 const uint32_t cur_byte_lo = cur_byte & 15;
8384 if (cur_byte_hi == cur_byte_lo) {
8385 two_cts_offset1[cur_byte_lo] += 1;
8386 } else {
8387 one_cts_offset1[cur_byte_lo] += 1;
8388 one_cts_offset1[cur_byte_hi] += 1;
8389 }
8390 }
8391 return;
8392 }
8393 for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
8394 const uint32_t cur_byte_lo = patch_10_fvals_uc[2 * uii];
8395 const uint32_t cur_byte_hi = patch_10_fvals_uc[2 * uii + 1];
8396 if (cur_byte_hi == cur_byte_lo) {
8397 two_cts_offset1[cur_byte_lo] += 1;
8398 } else {
8399 one_cts_offset1[cur_byte_lo] += 1;
8400 one_cts_offset1[cur_byte_hi] += 1;
8401 }
8402 }
8403 }
8404
CountAllAux1b(const unsigned char * fread_end,const uintptr_t * __restrict sample_include,const uintptr_t * __restrict raw_genoarr,uint32_t aux1b_mode,uint32_t raw_sample_ct,uint32_t allele_ct,uint32_t raw_10_ct,const unsigned char ** fread_pp,uint64_t * __restrict one_cts,uint64_t * __restrict two_cts,uint32_t * __restrict deltalist_workspace)8405 PglErr CountAllAux1b(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uintptr_t* __restrict raw_genoarr, uint32_t aux1b_mode, uint32_t raw_sample_ct, uint32_t allele_ct, uint32_t raw_10_ct, const unsigned char** fread_pp, uint64_t* __restrict one_cts, uint64_t* __restrict two_cts, uint32_t* __restrict deltalist_workspace) {
8406 if (aux1b_mode == 15) {
8407 return kPglRetSuccess;
8408 }
8409 uint64_t* one_cts_offset1 = &(one_cts[1]);
8410 uint64_t* two_cts_offset1 = &(two_cts[1]);
8411 if (!sample_include) {
8412 uint32_t rare10_ct;
8413 if (!aux1b_mode) {
8414 const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
8415 rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
8416 *fread_pp += fset_byte_ct;
8417 } else {
8418 const unsigned char* group_info_iter;
8419 PglErr reterr = ParseDifflistHeader(fread_end, raw_sample_ct, fread_pp, nullptr, &group_info_iter, &rare10_ct);
8420 if (unlikely(reterr)) {
8421 return reterr;
8422 }
8423 reterr = SkipDeltalistIds(fread_end, group_info_iter, rare10_ct, raw_sample_ct, 0, fread_pp);
8424 if (unlikely(reterr)) {
8425 return reterr;
8426 }
8427 }
8428 const unsigned char* patch_10_fvals = *fread_pp;
8429 const uint32_t fvals_byte_ct = GetAux1bAlleleEntryByteCt(allele_ct, rare10_ct);
8430 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8431 return kPglRetMalformedInput;
8432 }
8433 CountAllAux1bDense(patch_10_fvals, allele_ct, rare10_ct, one_cts_offset1, two_cts_offset1);
8434 return kPglRetSuccess;
8435 }
8436 uintptr_t detect_hom_mask_lo; // unused
8437 const uint32_t allele_code_logwidth = GetAux1bConsts(allele_ct, &detect_hom_mask_lo);
8438 const uint32_t code10_logwidth = allele_code_logwidth + (allele_code_logwidth != 0);
8439 const uint32_t allele_code_width = 1U << allele_code_logwidth;
8440 const uint32_t allele_code_mask = (1U << allele_code_width) - 1;
8441 const uint32_t allele_ct_m1 = allele_ct - 1;
8442 uint32_t rare10_lowbits = kBitsPerWord;
8443 // probable todo: faster paths when two_cts_offset1 == nullptr
8444 if (!aux1b_mode) {
8445 const uint32_t fset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
8446 const uint32_t rare10_ct = PopcountBytes(*fread_pp, fset_byte_ct);
8447 #ifdef __arm__
8448 # error "Unaligned accesses in CountAllAux1b()."
8449 #endif
8450 const uintptr_t* patch_10_fsetw = R_CAST(const uintptr_t*, *fread_pp);
8451 *fread_pp += fset_byte_ct;
8452 const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
8453 const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, 8);
8454 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8455 return kPglRetMalformedInput;
8456 }
8457 const Halfword* sample_include_hw = R_CAST(const Halfword*, sample_include);
8458 uintptr_t sample_hwidx = 0;
8459 uintptr_t cur_raw_genoarr_xys = Word10(raw_genoarr[0]);
8460 const uint32_t fset_word_ct_m1 = (fset_byte_ct - 1) / kBytesPerWord;
8461 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
8462 const uint32_t code10_width = 1U << code10_logwidth;
8463 uintptr_t fvals_bits = 0;
8464 uint32_t fvals_widx = 0;
8465 uint32_t subsetted_rare10_ct = 0;
8466 uint32_t loop_len = kBitsPerWord;
8467 for (uint32_t fset_widx = 0; ; ++fset_widx) {
8468 uintptr_t fset_bits;
8469 if (fset_widx >= fset_word_ct_m1) {
8470 if (fset_widx > fset_word_ct_m1) {
8471 break;
8472 }
8473 fset_bits = SubwordLoad(&(patch_10_fsetw[fset_word_ct_m1]), ModNz(fset_byte_ct, kBytesPerWord));
8474 loop_len = ModNz(raw_10_ct, kBitsPerWord);
8475 } else {
8476 fset_bits = patch_10_fsetw[fset_widx];
8477 }
8478 if (allele_ct_m1 == 2) {
8479 for (uint32_t uii = 0; uii != loop_len; ++uii) {
8480 while (!cur_raw_genoarr_xys) {
8481 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
8482 }
8483 if (fset_bits & 1) {
8484 if (rare10_lowbits == kBitsPerWord) {
8485 if (fvals_widx == fvals_word_ct_m1) {
8486 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8487 } else {
8488 fvals_bits = patch_10_fvalsw[fvals_widx];
8489 }
8490 // unnecessary to apply bzhi here
8491 ++fvals_widx;
8492 rare10_lowbits = 0;
8493 }
8494 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
8495 if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
8496 ++subsetted_rare10_ct;
8497 two_cts_offset1[1] += (fvals_bits >> rare10_lowbits) & 1;
8498 }
8499 ++rare10_lowbits;
8500 }
8501 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
8502 fset_bits = fset_bits >> 1;
8503 }
8504 } else {
8505 for (uint32_t uii = 0; uii != loop_len; ++uii) {
8506 while (!cur_raw_genoarr_xys) {
8507 cur_raw_genoarr_xys = Word10(raw_genoarr[++sample_hwidx]);
8508 }
8509 if (fset_bits & 1) {
8510 if (rare10_lowbits == kBitsPerWord) {
8511 if (fvals_widx == fvals_word_ct_m1) {
8512 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8513 } else {
8514 fvals_bits = patch_10_fvalsw[fvals_widx];
8515 }
8516 // unnecessary to apply bzhi here
8517 ++fvals_widx;
8518 rare10_lowbits = 0;
8519 }
8520 const uint32_t sample_uidx_lowbits = ctzw(cur_raw_genoarr_xys) / 2;
8521 if (sample_include_hw[sample_hwidx] & (1U << sample_uidx_lowbits)) {
8522 ++subsetted_rare10_ct;
8523 const uintptr_t cur_code_pair = fvals_bits >> rare10_lowbits;
8524 const uint32_t cur_code_hi = (cur_code_pair >> allele_code_width) & allele_code_mask;
8525 const uint32_t cur_code_lo = cur_code_pair & allele_code_mask;
8526 if (cur_code_hi == cur_code_lo) {
8527 two_cts_offset1[cur_code_lo] += 1;
8528 } else {
8529 one_cts_offset1[cur_code_lo] += 1;
8530 one_cts_offset1[cur_code_hi] += 1;
8531 }
8532 }
8533 rare10_lowbits += code10_width;
8534 }
8535 cur_raw_genoarr_xys &= cur_raw_genoarr_xys - 1;
8536 fset_bits = fset_bits >> 1;
8537 }
8538 }
8539 }
8540 two_cts_offset1[0] -= subsetted_rare10_ct;
8541 if (allele_ct == 3) {
8542 const uint32_t subsetted_het12_ct = subsetted_rare10_ct - two_cts_offset1[1];
8543 one_cts_offset1[0] += subsetted_het12_ct;
8544 one_cts_offset1[1] += subsetted_het12_ct;
8545 }
8546 return kPglRetSuccess;
8547 }
8548 // Save deltalist elements, iterate.
8549 uint32_t rare10_ct;
8550 PglErr reterr = ParseAndSaveDeltalist(fread_end, raw_sample_ct, fread_pp, deltalist_workspace, &rare10_ct);
8551 if (unlikely(reterr)) {
8552 return reterr;
8553 }
8554 const uintptr_t* patch_10_fvalsw = R_CAST(const uintptr_t*, *fread_pp);
8555 const uint32_t fvals_byte_ct = DivUpU64(S_CAST(uint64_t, rare10_ct) << code10_logwidth, 8);
8556 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
8557 return kPglRetMalformedInput;
8558 }
8559 const uint32_t fvals_word_ct_m1 = (fvals_byte_ct - 1) / kBytesPerWord;
8560 uint32_t subsetted_rare10_ct = 0;
8561 uint32_t loop_len = kBitsPerWord >> code10_logwidth;
8562 for (uint32_t fvals_widx = 0; ; ++fvals_widx) {
8563 uintptr_t fvals_bits;
8564 if (fvals_widx >= fvals_word_ct_m1) {
8565 if (fvals_widx > fvals_word_ct_m1) {
8566 break;
8567 }
8568 fvals_bits = SubwordLoad(&(patch_10_fvalsw[fvals_widx]), ModNz(fvals_byte_ct, kBytesPerWord));
8569 loop_len = 1 + ((rare10_ct - 1) & (loop_len - 1));
8570 } else {
8571 fvals_bits = patch_10_fvalsw[fvals_widx];
8572 }
8573 const uint32_t* cur_deltalist_base = &(deltalist_workspace[fvals_widx << (kBitsPerWordLog2 - code10_logwidth)]);
8574 if (allele_ct == 3) {
8575 for (uint32_t uii = 0; uii != loop_len; ++uii) {
8576 const uint32_t sample_uidx = cur_deltalist_base[uii];
8577 if (IsSet(sample_include, sample_uidx)) {
8578 ++subsetted_rare10_ct;
8579 two_cts_offset1[1] += (fvals_bits >> uii) & 1;
8580 }
8581 }
8582 } else {
8583 for (uint32_t uii = 0; uii != loop_len; ++uii) {
8584 const uint32_t sample_uidx = cur_deltalist_base[uii];
8585 if (IsSet(sample_include, sample_uidx)) {
8586 ++subsetted_rare10_ct;
8587 const uintptr_t cur_code_pair = fvals_bits >> (uii << code10_logwidth);
8588 const uint32_t cur_code_hi = (cur_code_pair >> allele_code_width) & allele_code_mask;
8589 const uint32_t cur_code_lo = cur_code_pair & allele_code_mask;
8590 if (cur_code_hi == cur_code_lo) {
8591 two_cts_offset1[cur_code_lo] += 1;
8592 } else {
8593 one_cts_offset1[cur_code_lo] += 1;
8594 one_cts_offset1[cur_code_hi] += 1;
8595 }
8596 }
8597 }
8598 }
8599 }
8600 two_cts_offset1[0] -= subsetted_rare10_ct;
8601 if (allele_ct == 3) {
8602 const uint32_t subsetted_het12_ct = subsetted_rare10_ct - two_cts_offset1[1];
8603 one_cts_offset1[0] += subsetted_het12_ct;
8604 one_cts_offset1[1] += subsetted_het12_ct;
8605 }
8606 return kPglRetSuccess;
8607 }
8608
// Computes, for one multiallelic variant restricted to the given sample
// subset: genotype counts (genocounts), per-allele dosage sums
// (all_dosages), het count (*het_ctp), and optionally an imputation-r2
// (*imp_r2_ptr).  one_cts[a]/two_cts[a] track how many included samples
// carry exactly one/two copies of allele a; hardcall-phase information is
// only parsed when the r2 computation or aux-track navigation requires it.
// Dosage tracks (vrtype & 0x60) are not yet supported and abort.
PglErr GetMultiallelicCountsAndDosage16s(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, uint32_t sample_ct, uint32_t vidx, uint32_t allele_ct, __maybe_unused uint32_t is_minimac3_r2, PgenReaderMain* pgrp, double* __restrict imp_r2_ptr, uint32_t* __restrict het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* all_dosages) {
  // only called on multiallelic variants
  // no dosages for now
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  uintptr_t* raw_genovec = pgrp->workspace_vec;
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, raw_genovec);
  if (unlikely(reterr)) {
    return reterr;
  }
  ZeroTrailingNyps(raw_sample_ct, raw_genovec);
  if (!subsetting_required) {
    GenoarrCountFreqsUnsafe(raw_genovec, raw_sample_ct, genocounts);
    // nullptr sample_include signals "no subsetting" to the helpers below.
    sample_include = nullptr;
  } else {
    GenoarrCountSubsetFreqs(raw_genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
  }
  uint64_t* one_cts = pgrp->workspace_imp_r2;
  uint64_t* two_cts = &(one_cts[allele_ct]);
  // Initialize from the biallelic hardcall counts; the aux1a/aux1b passes
  // below move patched entries into the correct per-allele buckets.
  one_cts[0] = genocounts[1];
  one_cts[1] = genocounts[1];
  ZeroU64Arr(allele_ct - 2, &(one_cts[2]));
  two_cts[0] = genocounts[0];
  two_cts[1] = genocounts[2];
  ZeroU64Arr(allele_ct - 2, &(two_cts[2]));
  // Cases:
  // - No hardcall-phase present.  Then we don't need to know raw_het_ct.
  // - No multiallelic dosages present, not computing minimac3-r2.  Then we
  //   still don't need to know raw_het_ct.
  // - Otherwise, we need to know raw_het_ct, either for the minimac3-r2
  //   computation or to locate the beginning of aux3/aux4.
  //   If we're computing minimac3-r2, AND
  //   (i) we're subsetting, or
  //   (ii) multiallelic dosages are present,
  //   it's also necessary to compute all_hets, either to compute correct
  //   subsetted minimac3-r2 or to know how many phased-hardcalls are
  //   overridden by phased dosages.
  const uint32_t raw_het_ct_needed = VrtypeHphase(vrtype) && (is_minimac3_r2 || (vrtype & 0x60));
  uintptr_t* all_hets = nullptr;
  const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
  uint32_t raw_het_ct = genocounts[1];  // inaccurate, corrected later if needed
  if (VrtypeMultiallelicHc(vrtype)) {
    // aux1 header byte: low nybble = aux1a mode, high nybble = aux1b mode.
    const uint32_t aux1_first_byte = *fread_ptr++;
    const uint32_t aux1a_mode = aux1_first_byte & 15;
    const uint32_t aux1b_mode = aux1_first_byte >> 4;
    uint32_t raw_10_ct = 0;
    if ((!aux1a_mode) || (!aux1b_mode) || sample_include) {
      // Mode-0 bitmask lengths and the subsetted iteration both need the raw
      // 0/1 and 1/0 entry counts.
      GenovecCount12Unsafe(raw_genovec, raw_sample_ct, &raw_het_ct, &raw_10_ct);
    }
    uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
    reterr = CountAllAux1a(fread_end, sample_include, raw_genovec, aux1a_mode, raw_sample_ct, allele_ct, raw_het_ct, &fread_ptr, one_cts, deltalist_workspace);
    if (unlikely(reterr)) {
      return reterr;
    }
    // Saved in case GetAux1bHets() must re-scan the aux1b track below.
    const unsigned char* aux1b_start = fread_ptr;
    reterr = CountAllAux1b(fread_end, sample_include, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, one_cts, two_cts, deltalist_workspace);
    if (unlikely(reterr)) {
      return reterr;
    }
    if (raw_het_ct_needed) {
      if (!sample_include) {
        // With no subsetting, the true raw het count is the number of
        // non-hom, non-missing genotypes: add stored hom-alt1 entries, then
        // subtract every genuine homozygote.
        raw_het_ct += genocounts[2];
        for (uint32_t aidx = 1; aidx != allele_ct; ++aidx) {
          raw_het_ct -= two_cts[aidx];
        }
      }
      if (sample_include || (is_minimac3_r2 && (vrtype & 0x60))) {
        // Need the explicit het bitvector.
        all_hets = pgrp->workspace_all_hets;
        PgrDetectGenoarrHets(raw_genovec, raw_sample_ct, all_hets);
        if (aux1b_mode != 15) {
          // aux1b patches can turn stored 1/1 entries into hets (e.g. 1/2).
          uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
          uint32_t aux1b_het_present;
          reterr = GetAux1bHets(fread_end, raw_genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &aux1b_start, aux1b_hets, &aux1b_het_present, deltalist_workspace);
          if (unlikely(reterr)) {
            return reterr;
          }
          if (aux1b_het_present) {
            BitvecOr(aux1b_hets, raw_sample_ctl, all_hets);
          }
        }
        if (sample_include) {
          raw_het_ct = PopcountWords(all_hets, raw_sample_ctl);
        }
      }
    }
  }
  uintptr_t* raw_phasepresent = nullptr;
  uint32_t extra_phased_het_ct = 0;
  if (raw_het_ct_needed) {
    // Advance past aux2 (hardcall-phase), harvesting the phased-het count
    // when the r2 computation needs it.
    if (!all_hets) {
      reterr = SkipAux2(fread_end, raw_het_ct, &fread_ptr, is_minimac3_r2? (&extra_phased_het_ct) : nullptr);
      if (unlikely(reterr)) {
        return reterr;
      }
    } else {
      raw_phasepresent = pgrp->workspace_subset;
      reterr = GetPhasepresentAndSkipPhaseinfo(fread_end, all_hets, raw_sample_ct, raw_het_ct, &fread_ptr, raw_phasepresent, &extra_phased_het_ct);
      if (unlikely(reterr)) {
        return reterr;
      }
      if (sample_include) {
        // Restrict the phased-het count to the sample subset.
        extra_phased_het_ct = PopcountWordsIntersect(raw_phasepresent, sample_include, raw_sample_ctl);
      }
    }
  }
  if (!(vrtype & 0x60)) {
    // No dosage track: hardcall counts are final.  Dosage units are
    // 1/32768 = 0x8000 per two copies, 0x4000 per single copy.
    uint32_t hom_hc_ct = 0;
    for (uint32_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
      const uint64_t cur_hom_ct = two_cts[allele_idx];
      hom_hc_ct += cur_hom_ct;
      const uint64_t two_dosage = cur_hom_ct * 0x8000LLU;
      const uint64_t dosage_sum = one_cts[allele_idx] * 0x4000LLU + two_dosage;
      all_dosages[allele_idx] = dosage_sum;
      // Repurpose two_cts[] to store ssqs.
      two_cts[allele_idx] = (dosage_sum + two_dosage) * 0x4000LLU;
    }
    const uint32_t nm_sample_ct = sample_ct - genocounts[3];
    *het_ctp = nm_sample_ct - hom_hc_ct;
    if (!imp_r2_ptr) {
      return kPglRetSuccess;
    }
    *imp_r2_ptr = MultiallelicDiploidMinimac3R2(all_dosages, two_cts, nm_sample_ct, allele_ct, extra_phased_het_ct);
    if (!is_minimac3_r2) {
      // Caller wants mach-r2; minimac3-r2 is half of it here.
      *imp_r2_ptr *= 2;
    }
    return kPglRetSuccess;
  }
  fputs("dosages not yet supported by GetMultiallelicCountsAndDosage16s()\n", stderr);
  exit(S_CAST(int32_t, kPglRetNotYetSupported));
  return kPglRetNotYetSupported;
}
8743
8744 PglErr PgrGetMDCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t is_minimac3_r2, PgenReader* pgr_ptr, double* __restrict imp_r2_ptr, uint32_t* __restrict het_ctp, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict all_dosages) {
8745 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8746 assert(vidx < pgrp->fi.raw_variant_ct);
8747 const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8748 const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8749 if (!sample_ct) {
8750 STD_ARRAY_REF_FILL0(4, genocounts);
8751 ZeroU64Arr(allele_ct, all_dosages);
8752 if (imp_r2_ptr) {
8753 *imp_r2_ptr = 0.0 / 0.0;
8754 }
8755 return kPglRetSuccess;
8756 }
8757 const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
8758 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
8759 if ((allele_ct == 2) || (!(vrtype & 0x68))) {
8760 PglErr reterr = GetBasicGenotypeCountsAndDosage16s(sample_include, sample_include_interleaved_vec, sample_include_cumulative_popcounts, sample_ct, vidx, is_minimac3_r2, pgrp, imp_r2_ptr, genocounts, all_dosages);
8761 *het_ctp = genocounts[1];
8762 ZeroU64Arr(allele_ct - 2, &(all_dosages[2]));
8763 return reterr;
8764 }
8765 return GetMultiallelicCountsAndDosage16s(sample_include, sample_include_interleaved_vec, sample_ct, vidx, allele_ct, is_minimac3_r2, pgrp, imp_r2_ptr, het_ctp, genocounts, all_dosages);
8766 }
8767
PgrGetMD(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)8768 PglErr PgrGetMD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
8769 pgvp->patch_01_ct = 0;
8770 pgvp->patch_10_ct = 0;
8771 pgvp->dosage_ct = 0;
8772 pgvp->multidosage_sample_ct = 0;
8773 if (!sample_ct) {
8774 return kPglRetSuccess;
8775 }
8776 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8777 const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
8778 const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8779 const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8780 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
8781 if ((allele_ct == 2) || (!(vrtype & 0x68))) {
8782 return IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, pgvp->genovec, pgvp->dosage_present, pgvp->dosage_main, &(pgvp->dosage_ct));
8783 }
8784 const unsigned char* fread_ptr;
8785 const unsigned char* fread_end;
8786 uintptr_t* all_hets = VrtypeHphase(vrtype)? pgrp->workspace_all_hets : nullptr;
8787 if (VrtypeMultiallelicHc(vrtype)) {
8788 PglErr reterr = GetMultiallelicCodes(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, all_hets? (&fread_ptr) : nullptr, all_hets? (&fread_end) : nullptr, all_hets, pgvp);
8789 if (!(vrtype & 0x60)) {
8790 return reterr;
8791 }
8792 } else {
8793 // todo: ReadRawGenovec, etc.
8794 }
8795 fputs("true multiallelic dosages not yet supported by PgrGetMD()\n", stderr);
8796 exit(S_CAST(int32_t, kPglRetNotYetSupported));
8797 return kPglRetSuccess;
8798 }
8799
IMPLPgrGetDp(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,PgenVariant * pgvp)8800 PglErr IMPLPgrGetDp(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, PgenVariant* pgvp) {
8801 assert(vidx < pgrp->fi.raw_variant_ct);
8802 if (!sample_ct) {
8803 pgvp->phasepresent_ct = 0;
8804 pgvp->dosage_ct = 0;
8805 pgvp->dphase_ct = 0;
8806 return kPglRetSuccess;
8807 }
8808 const unsigned char* fread_ptr = nullptr;
8809 const unsigned char* fread_end = nullptr;
8810 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
8811 const uint32_t dosage_is_present = VrtypeDosage(vrtype);
8812 PglErr reterr = ReadGenovecHphaseSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, dosage_is_present? (&fread_ptr) : nullptr, dosage_is_present? (&fread_end) : nullptr, pgvp->genovec, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct));
8813 if (reterr || (!dosage_is_present)) {
8814 pgvp->dosage_ct = 0;
8815 pgvp->dphase_ct = 0;
8816 return reterr;
8817 }
8818 const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8819 const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8820 return ParseDosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, allele_ct, pgrp, &(pgvp->dosage_ct), pgvp->dphase_present, pgvp->dphase_delta, &(pgvp->dphase_ct), pgvp->dosage_present, pgvp->dosage_main);
8821 }
8822
PgrGetInv1Dp(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,AlleleCode allele_idx,PgenReader * pgr_ptr,PgenVariant * pgvp)8823 PglErr PgrGetInv1Dp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
8824 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8825 const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
8826 const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8827 const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8828 if ((allele_ct == 2) || (!allele_idx)) {
8829 PglErr reterr = IMPLPgrGetDp(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, pgvp);
8830 if (allele_idx) {
8831 GenovecInvertUnsafe(sample_ct, pgvp->genovec);
8832 if (pgvp->phasepresent_ct) {
8833 BitvecInvert(BitCtToWordCt(sample_ct), pgvp->phaseinfo);
8834 }
8835 if (pgvp->dosage_ct) {
8836 BiallelicDosage16Invert(pgvp->dosage_ct, pgvp->dosage_main);
8837 if (pgvp->dphase_ct) {
8838 BiallelicDphase16Invert(pgvp->dphase_ct, pgvp->dphase_delta);
8839 }
8840 }
8841 }
8842 return reterr;
8843 }
8844 const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
8845 if (!VrtypeDosage(vrtype)) {
8846 pgvp->dosage_ct = 0;
8847 pgvp->dphase_ct = 0;
8848 return IMPLPgrGetInv1P(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, pgvp->genovec, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct));
8849 }
8850 fputs("multiallelic dosage not yet supported by GetInv1Dp()\n", stderr);
8851 exit(S_CAST(int32_t, kPglRetNotYetSupported));
8852 return kPglRetSuccess;
8853 }
8854
PgrGetMDp(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,PgenVariant * pgvp)8855 PglErr PgrGetMDp(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, PgenVariant* pgvp) {
8856 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
8857 const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
8858 pgvp->patch_01_ct = 0;
8859 pgvp->patch_10_ct = 0;
8860 pgvp->phasepresent_ct = 0;
8861 pgvp->dosage_ct = 0;
8862 pgvp->multidosage_sample_ct = 0;
8863 pgvp->dphase_ct = 0;
8864 pgvp->multidphase_sample_ct = 0;
8865 if (!sample_ct) {
8866 return kPglRetSuccess;
8867 }
8868 const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
8869 const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
8870 const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
8871 if ((allele_ct == 2) || (!(vrtype & 0x68))) {
8872 return IMPLPgrGetDp(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, pgvp);
8873 }
8874 const unsigned char* fread_ptr;
8875 const unsigned char* fread_end;
8876 uintptr_t* all_hets = VrtypeHphase(vrtype)? pgrp->workspace_all_hets : nullptr;
8877 if (VrtypeMultiallelicHc(vrtype)) {
8878 PglErr reterr = GetMultiallelicCodes(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, all_hets? (&fread_ptr) : nullptr, all_hets? (&fread_end) : nullptr, all_hets, pgvp);
8879 if (reterr || (!all_hets)) {
8880 return reterr;
8881 }
8882 if (!(vrtype & 0x60)) {
8883 const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
8884 return ParseAux2Subset(fread_end, (sample_ct != raw_sample_ct)? sample_include : nullptr, all_hets, nullptr, raw_sample_ct, sample_ct, &fread_ptr, pgvp->phasepresent, pgvp->phaseinfo, &(pgvp->phasepresent_ct), pgrp->workspace_subset);
8885 }
8886 } else {
8887 // todo: ReadRawGenovec, etc.
8888 }
8889 fputs("true multiallelic dosages not yet supported by PgrGetMDp()\n", stderr);
8890 fprintf(stderr, "%u\n", vidx);
8891 exit(S_CAST(int32_t, kPglRetNotYetSupported));
8892 return kPglRetSuccess;
8893
8894 }
8895
static_assert(sizeof(AlleleCode) == 1, "CountAux1bHets() must be updated.");
// Returns the number of heterozygous entries in patch_10_vals, which stores
// rare10_ct allele-code pairs (2 bytes per entry since AlleleCode is 1 byte);
// an entry is het iff its two codes differ.
uintptr_t CountAux1bHets(const AlleleCode* patch_10_vals, uintptr_t rare10_ct) {
  // Similar to CountByte().
  uintptr_t byte_ct = rare10_ct * 2;
#ifdef __LP64__
  if (byte_ct < kBytesPerVec) {
#endif
    // Scalar path (always used on 32-bit builds): compare the two bytes of
    // each pair directly.
    uintptr_t tot = 0;
    for (uintptr_t offset = 0; offset < byte_ct; offset += 2) {
      tot += (patch_10_vals[offset] != patch_10_vals[offset + 1]);
    }
    return tot;
#ifdef __LP64__
  }
  // Vector path counts *matching* (hom) pairs instead, then subtracts from
  // rare10_ct at the end.  (cur_vvec == shifted_vvec) marks byte positions
  // that equal their partner byte; masking with m8 (kMask00FF) keeps only
  // even byte offsets, i.e. exactly one comparison per 2-byte entry.
  const unsigned char* bytearr_uc_iter = R_CAST(const unsigned char*, patch_10_vals);
  const VecW m0 = vecw_setzero();
  const VecW m8 = VCONST_W(kMask00FF);
  VecW acc = vecw_setzero();
  // Inner loop is capped at 255 iterations so the per-byte accumulator can't
  // overflow before vecw_sad() widens the partial sums.
  while (byte_ct > 255 * kBytesPerVec) {
    VecUc inner_acc = vecuc_setzero();
    for (uint32_t uii = 0; uii != 255; ++uii) {
      const VecUc cur_vvec = vecuc_loadu(bytearr_uc_iter);
      bytearr_uc_iter = &(bytearr_uc_iter[kBytesPerVec]);
      const VecUc shifted_vvec = R_CAST(VecUc, vecw_srli(R_CAST(VecW, cur_vvec), 8));
      inner_acc = inner_acc - (cur_vvec == shifted_vvec);
    }
    const VecW partial_sums = R_CAST(VecW, inner_acc) & m8;
    acc = acc + vecw_sad(partial_sums, m0);
    byte_ct -= 255 * kBytesPerVec;
  }
  // Remaining full vectors, then one final vector positioned at the array end
  // (it may overlap bytes already processed; kLeadMask zeroes the overlap so
  // nothing is double-counted).
  const unsigned char* bytearr_uc_final = &(bytearr_uc_iter[byte_ct - kBytesPerVec]);
  VecUc inner_acc = vecuc_setzero();
  while (bytearr_uc_iter < bytearr_uc_final) {
    const VecUc cur_vvec = vecuc_loadu(bytearr_uc_iter);
    bytearr_uc_iter = &(bytearr_uc_iter[kBytesPerVec]);
    const VecUc shifted_vvec = R_CAST(VecUc, vecw_srli(R_CAST(VecW, cur_vvec), 8));
    inner_acc = inner_acc - (cur_vvec == shifted_vvec);
  }
  VecUc cur_vvec = vecuc_loadu(bytearr_uc_final);
  const uintptr_t overlap_byte_ct = bytearr_uc_iter - bytearr_uc_final;
  const VecUc shifted_vvec = R_CAST(VecUc, vecw_srli(R_CAST(VecW, cur_vvec), 8));
  const VecUc mask_vvec = vecuc_loadu(&(kLeadMask[kBytesPerVec - overlap_byte_ct]));
  cur_vvec = (cur_vvec == shifted_vvec) & mask_vvec;
  inner_acc = inner_acc - cur_vvec;
  const VecW partial_sums = R_CAST(VecW, inner_acc) & m8;
  acc = acc + vecw_sad(partial_sums, m0);
  // acc now holds the hom-pair count; hets are the remainder.
  const uintptr_t tot = HsumW(acc);
  return rare10_ct - tot;
#endif
}
8946
// Reads the raw (unsubsetted) record for variant vidx into *loadbuf_iter_ptr
// in the packed layout documented inline below, advancing the cursor past
// everything written.  Which optional tracks are saved is the intersection of
// what the record contains and what read_gflags requests; the saved set is
// reported via *loaded_vrtype_ptr (if non-null) using vrtype bit positions.
PglErr PgrGetRaw(uint32_t vidx, PgenGlobalFlags read_gflags, PgenReader* pgr_ptr, uintptr_t** loadbuf_iter_ptr, unsigned char* loaded_vrtype_ptr) {
  PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
  // currently handles multiallelic hardcalls, hardcall phase, and biallelic
  // dosage (both unphased and phased)
  // todo: multiallelic dosage
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  uintptr_t* genovec = (*loadbuf_iter_ptr);
  uintptr_t* loadbuf_iter = &(genovec[NypCtToAlignedWordCt(raw_sample_ct)]);
  // vrtype bit 3 = multiallelic hardcalls, bit 4 = hardcall phase,
  // bits 5-6 = dosage encoding, bit 7 = dosage phase.
  const uint32_t multiallelic_hc_present = (vrtype / 8) & 1;
  const uint32_t save_multiallelic_hc = multiallelic_hc_present && (read_gflags & kfPgenGlobalMultiallelicHardcallFound);
  const uint32_t hphase_is_present = (vrtype / 0x10) & 1;
  const uint32_t save_hphase = hphase_is_present && (read_gflags & kfPgenGlobalHardcallPhasePresent);
  const uint32_t dosage_is_present = (vrtype & 0x60)? 1 : 0;
  const uint32_t save_dosage = dosage_is_present && (read_gflags & kfPgenGlobalDosagePresent);

  const uint32_t save_dphase = (vrtype & 0x80) && (read_gflags & kfPgenGlobalDosagePhasePresent);
  // Phased dosage can only be saved alongside the dosage track itself.
  assert(save_dosage || (!save_dphase));

  if (loaded_vrtype_ptr) {
    *loaded_vrtype_ptr = save_multiallelic_hc * 8 + save_hphase * 0x10 + save_dosage * 0x60 + save_dphase * 0x80;
  }
  const unsigned char* fread_ptr;
  const unsigned char* fread_end;
  PglErr reterr = ReadRawGenovec(0, vidx, pgrp, &fread_ptr, &fread_end, genovec);
  // Hardcalls only (or read failure): nothing else to parse/copy.
  if ((!(multiallelic_hc_present || save_hphase || save_dosage)) || reterr) {
    *loadbuf_iter_ptr = loadbuf_iter;
    return reterr;
  }

  const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
  ZeroTrailingNyps(raw_sample_ct, genovec);
  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
  const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
  uint32_t het_ct = 0;
  if (multiallelic_hc_present) {
    if (!save_multiallelic_hc) {
      // todo: erase-alt2+ fast path
      // mostly mirror PgrGet2P(0, 1), but a bit of extra logic is needed to
      // throw out phased-10het entries
      return kPglRetNotYetSupported;
    }
    // assume we always save multiallelic info
    // raw format:
    //   rare01_ct, padded out to a word
    //   rare10_ct, padded out to a word
    //   [round up to vector boundary, for patch_01_set]
    //   aux1a, if not mode 15:
    //     patch_01_set as bitarray, raw_sample_ctl words
    //     patch_01_vals, round up to word boundary
    //   [round up to vector boundary, for patch_10_set]
    //   aux1b, if not mode 15:
    //     patch_10_set as bitarray, raw_sample_ctl words
    //     patch_10_vals, round up to word boundary
    // round up to vector boundary at end
    const uint32_t aux1_first_byte = *fread_ptr++;
    const uint32_t aux1a_mode = aux1_first_byte & 15;
    const uint32_t aux1b_mode = aux1_first_byte >> 4;
    uint32_t raw_10_ct = 0;
    // Precompute only the genotype-category counts the aux1 parsers and the
    // hphase track actually need for this mode combination.
    if ((!aux1a_mode) || hphase_is_present) {
      if (!aux1b_mode) {
        GenovecCount12Unsafe(genovec, raw_sample_ct, &het_ct, &raw_10_ct);
      } else {
        het_ct = CountNyp(genovec, kMask5555, raw_sample_ct);
      }
    } else if (!aux1b_mode) {
      raw_10_ct = CountNyp(genovec, kMaskAAAA, raw_sample_ct);
    }
    uintptr_t* multihc_raw = loadbuf_iter;
    loadbuf_iter = &(loadbuf_iter[RoundUpPow2(2, kWordsPerVec)]);
    uint32_t rare01_ct = 0;
    if (aux1a_mode != 15) {
      uintptr_t* patch_01_set = loadbuf_iter;
      loadbuf_iter = &(loadbuf_iter[raw_sample_ctl]);
      // (could decide to vector-align patch_01_vals later)
      AlleleCode* patch_01_vals = R_CAST(AlleleCode*, loadbuf_iter);
      reterr = ExportAux1a(fread_end, genovec, aux1a_mode, raw_sample_ct, allele_ct, het_ct, &fread_ptr, patch_01_set, patch_01_vals, &rare01_ct);
      if (unlikely(reterr)) {
        return reterr;
      }
      loadbuf_iter = &(loadbuf_iter[DivUp(rare01_ct, kBytesPerWord / sizeof(AlleleCode))]);
      VecAlignUp64(&loadbuf_iter);
    }
    uint32_t rare10_ct = 0;
    if (aux1b_mode != 15) {
      uintptr_t* patch_10_set = loadbuf_iter;
      loadbuf_iter = &(loadbuf_iter[raw_sample_ctl]);
      AlleleCode* patch_10_vals = R_CAST(AlleleCode*, loadbuf_iter);
      reterr = ExportAux1b(fread_end, genovec, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, patch_10_set, patch_10_vals, &rare10_ct);
      if (unlikely(reterr)) {
        return reterr;
      }
      loadbuf_iter = &(loadbuf_iter[DivUp(rare10_ct, kBytesPerWord / (2 * sizeof(AlleleCode)))]);
      VecAlignUp64(&loadbuf_iter);
      if (hphase_is_present) {
        // Het patch_10 entries (two distinct rare alleles) also carry phase.
        het_ct += CountAux1bHets(patch_10_vals, rare10_ct);
      }
    }
    multihc_raw[0] = rare01_ct;
    multihc_raw[1] = rare10_ct;
  } else if (hphase_is_present) {
    het_ct = CountNyp(genovec, kMask5555, raw_sample_ct);
  }

  if (hphase_is_present) {
    if (unlikely(!het_ct)) {
      // there shouldn't be a hphase track at all in this case
      return kPglRetMalformedInput;
    }
    const uint32_t het_ctdl = het_ct / kBitsPerWord;
    uintptr_t* phaseraw = loadbuf_iter;
    // First half of aux2: 1 leading bit + one bit per het.
    const uint32_t first_half_byte_ct = 1 + (het_ct / CHAR_BIT);
    if (save_hphase) {
      // this needs to be synced with MakePgenThread()
#ifdef __LP64__
      // save het_ct later so we can use PopcountWords() below
      phaseraw[0] = 0;
#else
      phaseraw[0] = het_ct;
      phaseraw[1] = 0;
#endif
      loadbuf_iter = &(loadbuf_iter[8 / kBytesPerWord]);
      // Zero the last word before the partial-byte memcpy so trailing bits
      // are clean for the PopcountWords() call below.
      loadbuf_iter[het_ctdl] = 0;
      memcpy(loadbuf_iter, fread_ptr, first_half_byte_ct);
      loadbuf_iter = &(loadbuf_iter[1 + het_ctdl]);
    }
    // Low bit of aux2's first byte selects the two-part explicit-phasepresent
    // encoding.
    const uint32_t explicit_phasepresent = fread_ptr[0] & 1;
    const unsigned char* aux2_start = fread_ptr;
    fread_ptr = &(fread_ptr[first_half_byte_ct]);
    if (explicit_phasepresent) {
      uint32_t raw_phasepresent_ct;
      if (save_hphase) {
#ifdef __LP64__
        raw_phasepresent_ct = PopcountWords(phaseraw, het_ctdl + 2);
#else
        raw_phasepresent_ct = PopcountWords(&(phaseraw[2]), het_ctdl + 1);
#endif
      } else {
        // bugfix (11 Apr 2018): not copied to phaseraw in this case
        raw_phasepresent_ct = PopcountBytes(aux2_start, first_half_byte_ct);
      }
      // Subtract the leading mode bit, which was included in the popcount.
      --raw_phasepresent_ct;
      if (unlikely(!raw_phasepresent_ct)) {
        // there shouldn't be a hphase track at all in this case, either
        return kPglRetMalformedInput;
      }
      const uint32_t second_half_byte_ct = DivUp(raw_phasepresent_ct, CHAR_BIT);
      if (save_hphase) {
#ifdef __LP64__
        phaseraw[0] = het_ct | (S_CAST(uint64_t, raw_phasepresent_ct) << 32);
#else
        phaseraw[1] = raw_phasepresent_ct;
#endif
        memcpy(loadbuf_iter, fread_ptr, second_half_byte_ct);
        loadbuf_iter = &(loadbuf_iter[BitCtToWordCt(raw_phasepresent_ct)]);
      }
      fread_ptr = &(fread_ptr[second_half_byte_ct]);
    }
#ifdef __LP64__
    if (save_hphase) {
      if (!explicit_phasepresent) {
        phaseraw[0] = het_ct;
      }
      VecAlignUp(&loadbuf_iter);
    }
#endif
  }
  if (!save_dosage) {
    *loadbuf_iter_ptr = loadbuf_iter;
    return kPglRetSuccess;
  }
  uintptr_t* dosage_present = loadbuf_iter;
  const uint32_t raw_sample_ctaw = BitCtToAlignedWordCt(raw_sample_ct);
  loadbuf_iter = &(loadbuf_iter[raw_sample_ctaw]);
  uint16_t* dosage_main = R_CAST(uint16_t*, loadbuf_iter);
  // probable todo: pack this more tightly in the future
  const uintptr_t dosage_main_aligned_wordct = kWordsPerVec * DivUp(raw_sample_ct, (kBytesPerVec / sizeof(int16_t)));
  loadbuf_iter = &(loadbuf_iter[dosage_main_aligned_wordct]);
  uintptr_t* dphase_present = nullptr;
  int16_t* dphase_delta = nullptr;
  if (save_dphase) {
    dphase_present = loadbuf_iter;
    loadbuf_iter = &(loadbuf_iter[raw_sample_ctaw]);
    dphase_delta = R_CAST(int16_t*, loadbuf_iter);
    loadbuf_iter = &(loadbuf_iter[dosage_main_aligned_wordct]);
  }
  *loadbuf_iter_ptr = loadbuf_iter;
  return ParseDosage16(fread_ptr, fread_end, nullptr, raw_sample_ct, vidx, allele_ct, pgrp, nullptr, dphase_present, dphase_delta, nullptr, dosage_present, dosage_main);
}
9136
9137
9138 // Currently assumes no phase or multiallelic hardcalls.
9139 // tried to have more custom code, turned out to not be worth it
ReadMissingness(const uintptr_t * __restrict sample_include,const uint32_t * __restrict sample_include_cumulative_popcounts,uint32_t sample_ct,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,const unsigned char ** fread_endp,uintptr_t * __restrict missingness,uintptr_t * __restrict hets,uintptr_t * __restrict genovec_buf)9140 PglErr ReadMissingness(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict missingness, uintptr_t* __restrict hets, uintptr_t* __restrict genovec_buf) {
9141 const unsigned char* fread_ptr;
9142 const unsigned char* fread_end;
9143 PglErr reterr = ReadGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec_buf);
9144 ZeroTrailingNyps(sample_ct, genovec_buf);
9145 GenoarrToMissingnessUnsafe(genovec_buf, sample_ct, missingness);
9146 if (hets) {
9147 PgrDetectGenoarrHetsUnsafe(genovec_buf, NypCtToWordCt(sample_ct), hets);
9148 }
9149 if (fread_pp) {
9150 *fread_pp = fread_ptr;
9151 *fread_endp = fread_end;
9152 }
9153 return reterr;
9154 }
9155
PgrGetMissingness(const uintptr_t * __restrict sample_include,PgrSampleSubsetIndex pssi,uint32_t sample_ct,uint32_t vidx,PgenReader * pgr_ptr,uintptr_t * __restrict missingness,uintptr_t * __restrict genovec_buf)9156 PglErr PgrGetMissingness(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict missingness, uintptr_t* __restrict genovec_buf) {
9157 if (!sample_ct) {
9158 return kPglRetSuccess;
9159 }
9160 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
9161 // may as well add a hets parameter?
9162 assert(vidx < pgrp->fi.raw_variant_ct);
9163 return ReadMissingness(sample_include, GetSicp(pssi), sample_ct, vidx, pgrp, nullptr, nullptr, missingness, nullptr, genovec_buf);
9164 }
9165
// Computes hardcall missingness (missingness_hc) and/or dosage-aware
// missingness (missingness_dosage) for variant vidx; a sample is
// dosage-missing only when its hardcall is missing AND it has no dosage
// entry.  Optionally also reports per-sample het status via hets.
PglErr PgrGetMissingnessD(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict missingness_hc, uintptr_t* __restrict missingness_dosage, uintptr_t* __restrict hets, uintptr_t* __restrict genovec_buf) {
  if (!sample_ct) {
    return kPglRetSuccess;
  }
  PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
  // sample_include can't be null
  // either missingness_hc or missingness_dosage must be non-null
  assert(vidx < pgrp->fi.raw_variant_ct);
  const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
  const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
  const uint32_t dosage_is_relevant = missingness_dosage && VrtypeDosage(vrtype);
  // vrtype bits 3/4: aux1 (multiallelic) / aux2 (hphase) tracks sit between
  // the hardcalls and the dosage track, so they must be skipped first.
  const uint32_t need_to_skip_aux1or2 = dosage_is_relevant && (vrtype & 0x18);
  const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
  const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
  const unsigned char* fread_ptr = nullptr;
  const unsigned char* fread_end = nullptr;
  uintptr_t* missingness_base = missingness_hc? missingness_hc : missingness_dosage;
  if (!need_to_skip_aux1or2) {
    // Fast path: hardcall missingness from the subsetted read; the dosage
    // cursors are requested only when we'll need them below.
    PglErr reterr = ReadMissingness(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, dosage_is_relevant? (&fread_ptr) : nullptr, dosage_is_relevant? (&fread_end) : nullptr, missingness_base, hets, genovec_buf);
    if (missingness_dosage && missingness_hc) {
      memcpy(missingness_dosage, missingness_hc, BitCtToWordCt(sample_ct) * sizeof(intptr_t));
    }
    if (reterr || (!dosage_is_relevant)) {
      return reterr;
    }
  } else {
    // Slow path: read the raw (unsubsetted) genovec so aux1/aux2 can be
    // parsed/skipped, then subset the hardcalls for missingness.
    PglErr reterr = ReadRawGenovec(subsetting_required, vidx, pgrp, &fread_ptr, &fread_end, genovec_buf);
    if (unlikely(reterr)) {
      return reterr;
    }
    ZeroTrailingNyps(raw_sample_ct, genovec_buf);
    uintptr_t* subsetted_genovec = pgrp->workspace_vec;
    CopyNyparrNonemptySubset(genovec_buf, sample_include, raw_sample_ct, sample_ct, subsetted_genovec);
    GenoarrToMissingnessUnsafe(subsetted_genovec, sample_ct, missingness_base);
    if (missingness_hc) {
      memcpy(missingness_dosage, missingness_hc, BitCtToWordCt(sample_ct) * sizeof(intptr_t));
    }

    const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
    const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
    if (VrtypeHphase(vrtype) || hets) {
      // Need the full het set: raw hardcall hets, plus aux1b patch entries
      // with two distinct rare alleles.
      uintptr_t* all_hets = pgrp->workspace_all_hets;
      PgrDetectGenoarrHets(genovec_buf, raw_sample_ct, all_hets);
      if (VrtypeMultiallelicHc(vrtype)) {
        // see analogous branch in ReadGenovecHphaseSubsetUnsafe()
        // probable todo: make this a separate function
        const uint32_t aux1_first_byte = *fread_ptr++;
        const uint32_t aux1a_mode = aux1_first_byte & 15;
        const uint32_t aux1b_mode = aux1_first_byte >> 4;
        uint32_t raw_01_ct = 0;
        uint32_t raw_10_ct = 0;
        if ((!aux1a_mode) || (!aux1b_mode)) {
          GenovecCount12Unsafe(genovec_buf, raw_sample_ct, &raw_01_ct, &raw_10_ct);
        }
        reterr = SkipAux1a(fread_end, aux1a_mode, raw_sample_ct, allele_ct, raw_01_ct, &fread_ptr);
        if (unlikely(reterr)) {
          return reterr;
        }
        uintptr_t* aux1b_hets = pgrp->workspace_aux1x_present;
        uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
        uint32_t aux1b_het_present;
        reterr = GetAux1bHets(fread_end, genovec_buf, aux1b_mode, raw_sample_ct, allele_ct, raw_10_ct, &fread_ptr, aux1b_hets, &aux1b_het_present, deltalist_workspace);
        if (unlikely(reterr)) {
          return reterr;
        }
        if (aux1b_het_present) {
          BitvecOr(aux1b_hets, raw_sample_ctl, all_hets);
        }
      }
      if (hets) {
        CopyBitarrSubset(all_hets, sample_include, sample_ct, hets);
      }
      if (VrtypeHphase(vrtype)) {
        reterr = SkipAux2(fread_end, PopcountWords(all_hets, raw_sample_ctl), &fread_ptr, nullptr);
        if (unlikely(reterr)) {
          return reterr;
        }
      }
    } else {
      // NOTE(review): SkipAux1's error return is discarded here, unlike the
      // checked SkipAux1a/SkipAux2 calls above -- confirm intentional.
      SkipAux1(fread_end, genovec_buf, raw_sample_ct, allele_ct, &fread_ptr);
    }
  }
  // now perform bitwise andnot with dosage_present
  if ((vrtype & 0x60) == 0x40) {
    // unconditional dosage.  spot-check the appropriate entries for equality
    // to 65535 (the missing-dosage sentinel).
#ifdef __arm__
#  error "Unaligned accesses in PgrGetMissingnessPD()."
#endif
    const uint16_t* dosage_main = R_CAST(const uint16_t*, fread_ptr);
    // bugfix (18 Feb 2019): sample_include is permitted to be nullptr here
    if (!subsetting_required) {
      // probable todo: faster iteration over set bits
      for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
        uintptr_t missing_dosage_bits = missingness_dosage[widx];
        if (missing_dosage_bits) {
          const uint16_t* cur_dosage_main = &(dosage_main[widx * kBitsPerWord]);
          do {
            uint32_t sample_idx_lowbits = ctzw(missing_dosage_bits);
            if (cur_dosage_main[sample_idx_lowbits] != 65535) {
              // Dosage present: clear the corresponding missingness bit.
              missingness_dosage[widx] ^= missing_dosage_bits & (-missing_dosage_bits);
            }
            missing_dosage_bits &= missing_dosage_bits - 1;
          } while (missing_dosage_bits);
        }
      }
    } else {
      uintptr_t sample_uidx_base = 0;
      uintptr_t sample_include_bits = sample_include[0];
      for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
        const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &sample_include_bits);
        if (!IsSet(missingness_dosage, sample_idx)) {
          continue;
        }
        if (dosage_main[sample_uidx] != 65535) {
          ClearBit(sample_idx, missingness_dosage);
        }
      }
    }
    return kPglRetSuccess;
  }
  uintptr_t* dosage_present = pgrp->workspace_dosage_present;
  if ((vrtype & 0x60) == 0x20) {
    // dosage list
    uint32_t dummy;
    if (unlikely(ParseAndSaveDeltalistAsBitarr(fread_end, raw_sample_ct, &fread_ptr, dosage_present, &dummy))) {
      return kPglRetMalformedInput;
    }
  } else {
    // dosage bitarray
    // Zero the final word first since the memcpy below may not cover it.
    dosage_present[raw_sample_ctl - 1] = 0;
    const uint32_t raw_sample_ctb = DivUp(raw_sample_ct, CHAR_BIT);
    memcpy(dosage_present, fread_ptr, raw_sample_ctb);
  }
  if (subsetting_required) {
    CopyBitarrSubset(dosage_present, sample_include, sample_ct, pgrp->workspace_vec);
    dosage_present = pgrp->workspace_vec;
  }
  // Samples with a dosage entry are not dosage-missing.
  BitvecInvmask(dosage_present, BitCtToWordCt(sample_ct), missingness_dosage);
  return kPglRetSuccess;
}
9308
ValidateVint31(const unsigned char * buf_end,const unsigned char ** bufpp,uint32_t * val_ptr)9309 static inline BoolErr ValidateVint31(const unsigned char* buf_end, const unsigned char** bufpp, uint32_t* val_ptr) {
9310 if (unlikely(buf_end <= (*bufpp))) {
9311 return 1;
9312 }
9313 uint32_t vint32 = *((*bufpp)++);
9314 if (vint32 <= 127) {
9315 *val_ptr = vint32;
9316 return 0;
9317 }
9318 vint32 &= 127;
9319 for (uint32_t shift = 7; shift != 28; shift += 7) {
9320 if (unlikely(buf_end == (*bufpp))) {
9321 return 1;
9322 }
9323 uint32_t uii = *((*bufpp)++);
9324 vint32 |= (uii & 127) << shift;
9325 if (uii <= 127) {
9326 *val_ptr = vint32;
9327 return 0;
9328 }
9329 }
9330 if (unlikely(buf_end == (*bufpp))) {
9331 return 1;
9332 }
9333 uint32_t uii = *((*bufpp)++);
9334 if (unlikely(uii > 7)) {
9335 return 1;
9336 }
9337 vint32 |= uii << 28;
9338 *val_ptr = vint32;
9339 return 0;
9340 }
9341
// Validates and partially parses a difflist/deltalist header: reads the list
// length, bounds-checks the group-index bytes, and (for difflists) copies the
// raregeno track into raregeno_buf, checking that its trailing bits are zero.
// Returns 1 on malformed input.
BoolErr ValidateDifflistHeader(const unsigned char* fread_end, uint32_t sample_ct, const unsigned char** fread_pp, uintptr_t* raregeno_buf, const unsigned char** difflist_group_info_ptr, uint32_t* difflist_len_ptr) {
  // can be used for deltalists: pass raregeno_buf == nullptr.
  if (unlikely(ValidateVint31(fread_end, fread_pp, difflist_len_ptr))) {
    // todo: ensure fread_pp points to a problematic byte whenever a validate_
    // function returns an error, so the error message can provide an accurate
    // byte offset.
    return 1;
  }
  const uint32_t difflist_len = *difflist_len_ptr;
  *difflist_group_info_ptr = *fread_pp;
  if (!difflist_len) {
    return 0;
  }
  // Difflists are only written when short; anything longer is malformed.
  if (unlikely(difflist_len > sample_ct / kPglMaxDifflistLenDivisor)) {
    return 1;
  }
  // Bounds-check and skip the group index (sample-ID starts plus per-group
  // extra bytes; group_ct * (sample_id_byte_ct + 1) - 1 bytes total).
  const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize);
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(sample_ct);
  const uint32_t difflist_index_byte_ct = group_ct * (sample_id_byte_ct + 1) - 1;
  if (PtrAddCk(fread_end, difflist_index_byte_ct, fread_pp)) {
    return 1;
  }
  if (!raregeno_buf) {
    // Deltalist mode: no genotype track follows.
    return 0;
  }
  const uint32_t raregeno_byte_ct = NypCtToByteCt(difflist_len);
  const unsigned char* raregeno_start = *fread_pp;
  if (PtrAddCk(fread_end, raregeno_byte_ct, fread_pp)) {
    return 1;
  }
  memcpy(raregeno_buf, raregeno_start, raregeno_byte_ct);
  // Unused high bits of the last raregeno byte must be zero.
  const uint32_t difflist_len_mod4 = difflist_len % 4;
  if (difflist_len_mod4) {
    const uint32_t last_raregeno_byte = (*fread_pp)[-1];
    if (unlikely(last_raregeno_byte >> (2 * difflist_len_mod4))) {
      return 1;
    }
  }
  return 0;
}
9382
BoolErr ValidateAndApplyDifflist(const unsigned char* fread_end, uint32_t common2_code, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
  // Validates a genotype difflist and applies its entries to genoarr,
  // advancing *fread_pp past the record.  Returns 1 on any malformed input.
  // Side effects: uses pgr.workspace_raregeno_tmp_loadbuf.
  // Similar to ParseAndApplyDifflist(), but with exhaustive input
  // validation.
  const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
  uintptr_t* cur_raregeno_iter = pgrp->workspace_raregeno_tmp_loadbuf;
  const unsigned char* group_info_iter;
  uint32_t difflist_len;
  if (unlikely(ValidateDifflistHeader(fread_end, sample_ct, fread_pp, cur_raregeno_iter, &group_info_iter, &difflist_len))) {
    return 1;
  }
  if (!difflist_len) {
    return 0;
  }
  const uint32_t subgroup_idx_last = (difflist_len - 1) / kBitsPerWordD2;
  if (common2_code) {
    // 1-bit format + list of exceptions. In this case,
    // (i) the length of the exception list must be < (sample_ct / 16)
    // (ii) every raregeno entry must either be one of the two rare genotype
    //      values, or involve a rare alt allele.
    if (unlikely(difflist_len >= (sample_ct / (2 * kPglMaxDifflistLenDivisor)))) {
      return 1;
    }
    const uintptr_t common_code_delta = common2_code & 3;
    const uintptr_t inv_common_word1 = (3 - common2_code / 4) * kMask5555;
    const uintptr_t inv_common_word2 = inv_common_word1 - (common_code_delta * kMask5555);
    for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
      uintptr_t cur_raregeno_word = cur_raregeno_iter[subgroup_idx];
      // matchN is nonzero iff some 2-bit slot equals common genotype value N;
      // difflist entries must differ from both common values.
      const uintptr_t match1 = Word11(cur_raregeno_word ^ inv_common_word1);
      const uintptr_t match2 = Word11(cur_raregeno_word ^ inv_common_word2);
      if (subgroup_idx == subgroup_idx_last) {
        // ignore trailing bits
        const uint32_t lshift = ((-difflist_len) % kBitsPerWordD2) * 2;
        if (unlikely((match1 << lshift) || (match2 << lshift))) {
          return 1;
        }
        break;
      }
      if (unlikely(match1 || match2)) {
        // todo: if (multiallelic_hc_present && (!inv_common_word2)), record
        // might be fine; but we need to verify these are actually rare alt
        // alleles.
        // (er, above comment is obsolete)
        return 1;
      }
    }
  }
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(sample_ct);
  const unsigned char* group_byte_cts_iter = &(group_info_iter[DivUp(difflist_len, kPglDifflistGroupSize) * sample_id_byte_ct]);
  const unsigned char* prev_group_start = *fread_pp;

  uintptr_t sample_idx = 0;
  for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
    uint32_t remaining_deltas_in_subgroup = kBitsPerWordD2 - 1;
    if (subgroup_idx >= subgroup_idx_last) {
      if (subgroup_idx > subgroup_idx_last) {
        return 0;
      }
      // Final subgroup is usually shorter than kBitsPerWordD2 entries.
      remaining_deltas_in_subgroup &= difflist_len - 1;
    }
    if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
      // Start of a new group: the sample index restarts from an absolute
      // value stored in the header index.
      uintptr_t new_sample_idx_start = SubU32Load(group_info_iter, sample_id_byte_ct);
      if (subgroup_idx) {
        // Indices must be strictly increasing across group boundaries, and
        // the previous group's compressed size must match its recorded
        // byte count.
        if (unlikely(sample_idx >= new_sample_idx_start)) {
          return 1;
        }
        const uint32_t group_byte_ct = S_CAST(uint32_t, *group_byte_cts_iter++) + 63;
        if (unlikely(S_CAST(uintptr_t, (*fread_pp) - prev_group_start) != group_byte_ct)) {
          return 1;
        }
        prev_group_start = *fread_pp;
      }
      sample_idx = new_sample_idx_start;
      group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    } else {
      // Within a group, each sample index is a nonzero vint delta from the
      // previous one.
      uint32_t sample_idx_incr;
      if (unlikely(ValidateVint31(fread_end, fread_pp, &sample_idx_incr) || (!sample_idx_incr))) {
        return 1;
      }
      sample_idx += sample_idx_incr;
    }
    uintptr_t cur_raregeno_word = *cur_raregeno_iter++;
    for (; ; --remaining_deltas_in_subgroup) {
      if (unlikely(sample_idx >= sample_ct)) {
        return 1;
      }
      const uintptr_t cur_geno = cur_raregeno_word & 3;
      AssignNyparrEntry(sample_idx, cur_geno, genoarr);
      if (!remaining_deltas_in_subgroup) {
        break;
      }
      uint32_t sample_idx_incr;
      if (unlikely(ValidateVint31(fread_end, fread_pp, &sample_idx_incr) || (!sample_idx_incr))) {
        return 1;
      }
      sample_idx += sample_idx_incr;
      cur_raregeno_word >>= 2;
    }
  }
}
9483
BoolErr ValidateOnebit(const unsigned char* fread_end, const unsigned char** fread_pp, PgenReaderMain* pgrp, uintptr_t* __restrict genoarr) {
  // Validates a 1-bit-encoded genotype record and expands it into genoarr,
  // advancing *fread_pp.
  // ParseOnebitUnsafe() with exhaustive input validation.
  const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
  // One leading byte encodes the two common genotype values, followed by a
  // sample_ct-bit array selecting between them.
  const uint32_t common2_and_bitarray_byte_ct = (sample_ct + 15) / CHAR_BIT;
  const unsigned char* onebit_main_iter = *fread_pp;
  if (PtrAddCk(fread_end, common2_and_bitarray_byte_ct, fread_pp)) {
    return 1;
  }
  const uintptr_t common2_code = *onebit_main_iter++;
  const uintptr_t common_code_delta = common2_code & 3;
  uintptr_t word_base = common2_code / 4;
  // The two common values are word_base and (word_base + common_code_delta);
  // they must be distinct and both in [0, 3].
  if (unlikely((!common_code_delta) || (word_base + common_code_delta > 3))) {
    return 1;
  }
  word_base *= kMask5555;
  const uint32_t genoarr_widx_trail = (sample_ct + 7) / kBitsPerWordD2;
  const uint32_t genoarr_widx_end = NypCtToWordCt(sample_ct);
#ifdef __arm__
# error "Unaligned accesses in ValidateOnebit()."
#endif
  const Halfword* onebit_main = R_CAST(const Halfword*, onebit_main_iter);
  for (uint32_t genoarr_widx = 0; ; ++genoarr_widx) {
    uintptr_t ww;
    if (genoarr_widx >= genoarr_widx_trail) {
      if (genoarr_widx == genoarr_widx_end) {
        break;
      }
      // Trailing halfword: load only the bytes that exist, and verify the
      // unused high bits of the last byte are zero.
      const uint32_t nontrail_byte_ct = ((sample_ct - 1) % kBitsPerWordD2) / CHAR_BIT;
      ww = ProperSubwordLoad(&(onebit_main[genoarr_widx_trail]), 1 + nontrail_byte_ct);
      const uint32_t sample_ct_mod8 = sample_ct % 8;
      if (sample_ct_mod8) {
        if (unlikely(ww >> (nontrail_byte_ct * 8 + sample_ct_mod8))) {
          return 1;
        }
      }
    } else {
      ww = onebit_main[genoarr_widx];
    }
    // Expand each selector bit into a 2-bit genotype entry.
    ww = UnpackHalfwordToWord(ww);
    genoarr[genoarr_widx] = word_base + ww * common_code_delta;
  }
  // A trailing difflist overrides the entries holding neither common value.
  return ValidateAndApplyDifflist(fread_end, common2_code, fread_pp, pgrp, genoarr);
}
9527
9528 // assumes that we aren't dealing with the trivial fixed-width case.
9529 // saves main genotype array to genovec. does not zero out trailing bits.
ValidateGeno(const unsigned char * fread_end,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,uintptr_t * genovec,char * errstr_buf)9530 BoolErr ValidateGeno(const unsigned char* fread_end, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, uintptr_t* genovec, char* errstr_buf) {
9531 const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
9532 const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
9533 if (VrtypeLdCompressed(vrtype)) {
9534 CopyNyparr(pgrp->ldbase_genovec, sample_ct, genovec);
9535 if (unlikely(ValidateAndApplyDifflist(fread_end, 0, fread_pp, pgrp, genovec))) {
9536 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid LD difflist for (0-based) variant #%u.\n", vidx);
9537 return 1;
9538 }
9539 if (vrtype & 1) {
9540 GenovecInvertUnsafe(sample_ct, genovec);
9541 }
9542 return 0;
9543 }
9544 const uint32_t is_ldbase = VrtypeLdCompressed(pgrp->fi.vrtypes[vidx + 1]);
9545 if (!(vrtype & 4)) {
9546 if (vrtype & 1) {
9547 if (unlikely(ValidateOnebit(fread_end, fread_pp, pgrp, genovec))) {
9548 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid 1-bit genotype record for (0-based) variant #%u.\n", vidx);
9549 return 1;
9550 }
9551 } else {
9552 const uint32_t genovec_byte_ct = DivUp(sample_ct, 4);
9553 const unsigned char* src_genodata = *fread_pp;
9554 if (PtrAddCk(fread_end, genovec_byte_ct, fread_pp)) {
9555 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid 2-bit genotype record for (0-based) variant #%u\n", vidx);
9556 return 1;
9557 }
9558 memcpy(genovec, src_genodata, genovec_byte_ct);
9559 const uint32_t sample_ct_mod4 = sample_ct % 4;
9560 if (sample_ct_mod4) {
9561 const uint32_t last_geno_byte = (*fread_pp)[-1];
9562 if (unlikely(last_geno_byte >> (2 * sample_ct_mod4))) {
9563 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Last genotype byte for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9564 return 1;
9565 }
9566 }
9567 }
9568 } else {
9569 const uint32_t vrtype_low2 = vrtype & 3;
9570 if (vrtype_low2 != 1) {
9571 const uint32_t vec_ct = NypCtToVecCt(sample_ct);
9572 vecset(genovec, vrtype_low2 * kMask5555, vec_ct);
9573 if (unlikely(ValidateAndApplyDifflist(fread_end, 0, fread_pp, pgrp, genovec))) {
9574 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid genotype difflist for (0-based) variant #%u.\n", vidx);
9575 return 1;
9576 }
9577 } else {
9578 if (is_ldbase) {
9579 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid LD back-reference from variant #%u to all-hom-ref variant #%u.\n", vidx + 1, vidx);
9580 return 1;
9581 }
9582 ZeroWArr(NypCtToWordCt(sample_ct), genovec);
9583 }
9584 }
9585 if (is_ldbase) {
9586 CopyNyparr(genovec, sample_ct, pgrp->ldbase_genovec);
9587 }
9588 return 0;
9589 }
9590
BoolErr ValidateAndCountDeltalist(const unsigned char* fread_end, uint32_t sample_ct, const unsigned char** fread_pp, uint32_t* __restrict deltalist, uint32_t* deltalist_len_ptr) {
  // Validates a deltalist (strictly increasing sample-index list), advancing
  // *fread_pp past it and storing its length in *deltalist_len_ptr.
  // pass deltalist == nullptr when actual bit positions aren't needed
  const unsigned char* group_info_iter;
  if (unlikely(ValidateDifflistHeader(fread_end, sample_ct, fread_pp, nullptr, &group_info_iter, deltalist_len_ptr))) {
    return 1;
  }
  const uint32_t deltalist_len = *deltalist_len_ptr;
  if (!deltalist_len) {
    return 0;
  }
  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(sample_ct);
  const uint32_t group_idx_last = (deltalist_len - 1) / kPglDifflistGroupSize;
  const unsigned char* group_byte_cts_iter = &(group_info_iter[DivUp(deltalist_len, kPglDifflistGroupSize) * sample_id_byte_ct]);
  const unsigned char* prev_group_start = *fread_pp;
  uint32_t* deltalist_iter = deltalist;
  uint32_t group_len_m1 = kPglDifflistGroupSize - 1;
  uintptr_t sample_idx = 0;
  for (uint32_t group_idx = 0; ; ++group_idx) {
    if (group_idx >= group_idx_last) {
      if (group_idx > group_idx_last) {
        return 0;
      }
      // Final group is usually shorter than kPglDifflistGroupSize.
      group_len_m1 &= deltalist_len - 1;
    }
    // Each group starts from an absolute sample index stored in the header.
    uintptr_t new_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
    if (group_idx) {
      // Indices must be strictly increasing across group boundaries.
      if (unlikely(sample_idx >= new_sample_idx)) {
        return 1;
      }
      // Previous group's compressed size must match its recorded byte count.
      const uint32_t group_byte_ct = S_CAST(uint32_t, *group_byte_cts_iter++) + 63;
      if (unlikely(S_CAST(uintptr_t, (*fread_pp) - prev_group_start) != group_byte_ct)) {
        return 1;
      }
      prev_group_start = *fread_pp;
    }
    sample_idx = new_sample_idx;
    group_info_iter = &(group_info_iter[sample_id_byte_ct]);
    for (uint32_t deltalist_idx_lowbits = 0; ; ++deltalist_idx_lowbits) {
      if (unlikely(sample_idx >= sample_ct)) {
        return 1;
      }
      if (deltalist_iter) {
        *deltalist_iter++ = sample_idx;
      }
      if (deltalist_idx_lowbits == group_len_m1) {
        break;
      }
      // Remaining entries in the group are nonzero vint deltas.
      uint32_t sample_idx_incr;
      if (unlikely(ValidateVint31(fread_end, fread_pp, &sample_idx_incr) || (!sample_idx_incr))) {
        return 1;
      }
      sample_idx += sample_idx_incr;
    }
  }
}
9646
ValidateMultiallelicHc(const unsigned char * fread_end,const uintptr_t * __restrict raw_genovec,uint32_t vidx,uint32_t allele_ct,PgenReaderMain * pgrp,const unsigned char ** fread_pp,uint32_t * __restrict het_ctp,char * __restrict errstr_buf)9647 BoolErr ValidateMultiallelicHc(const unsigned char* fread_end, const uintptr_t* __restrict raw_genovec, uint32_t vidx, uint32_t allele_ct, PgenReaderMain* pgrp, const unsigned char** fread_pp, uint32_t* __restrict het_ctp, char* __restrict errstr_buf) {
9648 if (unlikely(allele_ct <= 2)) {
9649 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic hardcall track present for (0-based) variant #%u, but it apparently has only %u allele%s.\n", vidx, allele_ct, (allele_ct == 1)? "" : "s");
9650 return 1;
9651 }
9652 const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
9653 const uint32_t aux1_first_byte = **fread_pp;
9654 *fread_pp += 1;
9655 if (unlikely(
9656 aux1_first_byte &&
9657 (aux1_first_byte != 1) &&
9658 (aux1_first_byte != 15) &&
9659 (aux1_first_byte != 16) &&
9660 (aux1_first_byte != 17) &&
9661 (aux1_first_byte != 31) &&
9662 (aux1_first_byte != 240) &&
9663 (aux1_first_byte != 241))) {
9664 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic hardcall track mode byte (%u; must be in {0, 1, 15, 16, 17, 31, 240, 241}) in (0-based) variant #%u.\n", aux1_first_byte, vidx);
9665 return 1;
9666 }
9667 const uint32_t aux1a_mode = aux1_first_byte & 15;
9668 const uint32_t aux1b_mode = aux1_first_byte >> 4;
9669 uint32_t raw_01_ct;
9670 uint32_t raw_10_ct;
9671 GenovecCount12Unsafe(raw_genovec, sample_ct, &raw_01_ct, &raw_10_ct);
9672 uint32_t* deltalist_workspace = pgrp->workspace_difflist_sample_ids;
9673 if (aux1a_mode != 15) {
9674 if (unlikely(!raw_01_ct)) {
9675 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic het-ref hardcall track present for (0-based) variant #%u, but no het-ref calls exist.\n", vidx);
9676 return 1;
9677 }
9678 uint32_t rare01_ct;
9679 if (!aux1a_mode) {
9680 const uint32_t subset_byte_ct = DivUp(raw_01_ct, CHAR_BIT);
9681 if (PtrCheck(fread_end, *fread_pp, subset_byte_ct)) {
9682 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall bitarray-subset for (0-based) variant #%u.\n", vidx);
9683 return 1;
9684 }
9685 rare01_ct = PopcountBytes(*fread_pp, subset_byte_ct);
9686 if (unlikely(!rare01_ct)) {
9687 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Empty multiallelic het-ref hardcall bitarray-subset for (0-based) variant #%u.\n", vidx);
9688 return 1;
9689 }
9690 *fread_pp += subset_byte_ct;
9691 const uint32_t raw_01_ct_mod8 = raw_01_ct % 8;
9692 if (raw_01_ct_mod8) {
9693 if (unlikely((*fread_pp)[-1] >> raw_01_ct_mod8)) {
9694 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic het-ref hardcall bitarray-subset for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9695 return 1;
9696 }
9697 }
9698 } else {
9699 if (unlikely(ValidateAndCountDeltalist(fread_end, sample_ct, fread_pp, deltalist_workspace, &rare01_ct) || (!rare01_ct))) {
9700 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall deltalist-subset for (0-based) variant #%u.\n", vidx);
9701 return 1;
9702 }
9703 for (uint32_t uii = 0; uii != rare01_ct; ++uii) {
9704 if (unlikely(GetNyparrEntry(raw_genovec, deltalist_workspace[uii]) != 1)) {
9705 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall deltalist-subset for (0-based) variant #%u (an index doesn't correspond to a het-ref call).\n", vidx);
9706 return 1;
9707 }
9708 }
9709 }
9710 if (allele_ct < 5) {
9711 // Nothing to do for allele_ct == 3.
9712 if (allele_ct == 4) {
9713 // 1-bit entries. Contents must be in range, so just validate trailing
9714 // bits.
9715 const uint32_t fvals_byte_ct = DivUp(rare01_ct, 8);
9716 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9717 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9718 return 1;
9719 }
9720 const uint32_t rare01_ct_mod8 = rare01_ct % 8;
9721 if (rare01_ct_mod8) {
9722 if (unlikely((*fread_pp)[-1] >> rare01_ct_mod8)) {
9723 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9724 return 1;
9725 }
9726 }
9727 }
9728 } else {
9729 const unsigned char* fvals = *fread_pp;
9730 if (allele_ct < 19) {
9731 if (allele_ct < 7) {
9732 // 2-bit entries.
9733 const uint32_t fvals_byte_ct = DivUp(rare01_ct, 4);
9734 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9735 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9736 return 1;
9737 }
9738 if (allele_ct == 5) {
9739 // Contents may be out-of-range.
9740 const uint32_t fullword_ct = fvals_byte_ct / kBytesPerWord;
9741 uint32_t widx = 0;
9742 if (fullword_ct) {
9743 const uintptr_t* fvals_alias = R_CAST(const uintptr_t*, fvals);
9744 for (; widx != fullword_ct; ++widx) {
9745 const uintptr_t cur_word = fvals_alias[widx];
9746 if (unlikely(cur_word & (cur_word >> 1) & kMask5555)) {
9747 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (out-of-range allele code).\n", vidx);
9748 return 1;
9749 }
9750 }
9751 }
9752 for (uint32_t uii = widx * kBytesPerWord; uii != fvals_byte_ct; ++uii) {
9753 const uint32_t cur_byte = fvals[uii];
9754 if (unlikely(cur_byte & (cur_byte >> 1) & 0x55)) {
9755 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (out-of-range allele code).\n", vidx);
9756 return 1;
9757 }
9758 }
9759 }
9760 // Validate trailing bits.
9761 const uint32_t rare01_ct_mod4 = rare01_ct % 4;
9762 if (rare01_ct_mod4) {
9763 if (unlikely((*fread_pp)[-1] >> (2 * rare01_ct_mod4))) {
9764 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9765 return 1;
9766 }
9767 }
9768 } else {
9769 // 4-bit entries.
9770 const uint32_t fvals_byte_ct = DivUp(rare01_ct, 2);
9771 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9772 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9773 return 1;
9774 }
9775 if (allele_ct != 18) {
9776 // Contents may be out-of-range.
9777 // (Can optimize this loop later.)
9778 const uint32_t max_code = allele_ct - 3;
9779 for (uint32_t uii = 0; uii != fvals_byte_ct; ++uii) {
9780 const uint32_t cur_byte = fvals[uii];
9781 if (unlikely(((cur_byte & 15) > max_code) || ((cur_byte >> 4) > max_code))) {
9782 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (out-of-range allele code).\n", vidx);
9783 return 1;
9784 }
9785 }
9786 }
9787 // Validate trailing bits.
9788 const uint32_t rare01_ct_mod2 = rare01_ct % 2;
9789 if (rare01_ct_mod2) {
9790 if (unlikely((*fread_pp)[-1] >> 4)) {
9791 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9792 return 1;
9793 }
9794 }
9795 }
9796 } else {
9797 // 8-bit entries.
9798 if (PtrAddCk(fread_end, rare01_ct, fread_pp)) {
9799 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9800 return 1;
9801 }
9802 // Can optimize this loop later.
9803 const uint32_t max_code = allele_ct - 3;
9804 for (uint32_t uii = 0; uii != rare01_ct; ++uii) {
9805 const uint32_t cur_byte = fvals[uii];
9806 if (unlikely(cur_byte > max_code)) {
9807 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic het-ref hardcall track for (0-based) variant #%u (out-of-range allele code).\n", vidx);
9808 return 1;
9809 }
9810 }
9811 }
9812 }
9813 }
9814 if (aux1b_mode != 15) {
9815 if (unlikely(!raw_10_ct)) {
9816 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic altxy hardcall track present for (0-based) variant #%u, but no altxy calls exist.\n", vidx);
9817 return 1;
9818 }
9819 uint32_t rare10_ct;
9820 if (!aux1b_mode) {
9821 const uint32_t subset_byte_ct = DivUp(raw_10_ct, CHAR_BIT);
9822 if (PtrCheck(fread_end, *fread_pp, subset_byte_ct)) {
9823 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall bitarray-subset for (0-based) variant #%u.\n", vidx);
9824 return 1;
9825 }
9826 rare10_ct = PopcountBytes(*fread_pp, subset_byte_ct);
9827 if (unlikely(!rare10_ct)) {
9828 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Empty multiallelic altxy hardcall bitarray-subset for (0-based) variant #%u.\n", vidx);
9829 return 1;
9830 }
9831 *fread_pp += subset_byte_ct;
9832 const uint32_t raw_10_ct_mod8 = raw_10_ct % 8;
9833 if (raw_10_ct_mod8) {
9834 if (unlikely((*fread_pp)[-1] >> raw_10_ct_mod8)) {
9835 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Multiallelic altxy hardcall bitarray-subset for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9836 return 1;
9837 }
9838 }
9839 } else {
9840 if (unlikely(ValidateAndCountDeltalist(fread_end, sample_ct, fread_pp, deltalist_workspace, &rare10_ct) || (!rare10_ct))) {
9841 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall deltalist-subset for (0-based) variant #%u.\n", vidx);
9842 return 1;
9843 }
9844 for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
9845 if (unlikely(GetNyparrEntry(raw_genovec, deltalist_workspace[uii]) != 2)) {
9846 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall deltalist-subset for (0-based) variant #%u (an index doesn't correspond to an altxy call).\n", vidx);
9847 return 1;
9848 }
9849 }
9850 }
9851 const unsigned char* fvals = *fread_pp;
9852 uint32_t het_incr;
9853 if (allele_ct < 6) {
9854 if (allele_ct == 3) {
9855 // 1-bit entries.
9856 const uint32_t fvals_byte_ct = DivUp(rare10_ct, 8);
9857 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9858 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9859 return 1;
9860 }
9861 const uint32_t rare10_ct_mod8 = rare10_ct % 8;
9862 if (rare10_ct_mod8) {
9863 if (unlikely((*fread_pp)[-1] >> rare10_ct_mod8)) {
9864 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9865 return 1;
9866 }
9867 }
9868 het_incr = rare10_ct - PopcountBytes(fvals, fvals_byte_ct);
9869 } else {
9870 // 2+2 bit entries.
9871 const uint32_t fvals_byte_ct = DivUp(rare10_ct, 2);
9872 if (PtrAddCk(fread_end, fvals_byte_ct, fread_pp)) {
9873 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9874 return 1;
9875 }
9876 // Can optimize this later.
9877 uint64_t nybble_cts[16];
9878 ZeroU64Arr(16, nybble_cts);
9879 CountAllNybbles64(fvals, rare10_ct, nybble_cts);
9880 // 1/1 is invalid here
9881 if (nybble_cts[0]) {
9882 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (out-of-range allele code pair).\n", vidx);
9883 return 1;
9884 }
9885 const uint32_t max_code = allele_ct - 2;
9886 for (uint32_t hi_code = 0; hi_code != 4; ++hi_code) {
9887 uint32_t lo_code = hi_code + 1;
9888 if (hi_code > max_code) {
9889 lo_code = 0;
9890 }
9891 const uint64_t* nybble_cts_offset = &(nybble_cts[hi_code * 4]);
9892 for (; lo_code != 4; ++lo_code) {
9893 if (nybble_cts_offset[lo_code]) {
9894 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (out-of-range allele code pair).\n", vidx);
9895 return 1;
9896 }
9897 }
9898 }
9899 const uintptr_t rarehom_ct = nybble_cts[5] + nybble_cts[10] + nybble_cts[15];
9900 het_incr = rare10_ct - rarehom_ct;
9901 const uint32_t rare10_ct_mod2 = rare10_ct % 2;
9902 if (rare10_ct_mod2) {
9903 if (unlikely((*fread_pp)[-1] >> 4)) {
9904 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (nonzero trailing bits).\n", vidx);
9905 return 1;
9906 }
9907 }
9908 }
9909 } else {
9910 if (allele_ct < 18) {
9911 // 4+4 bit entries.
9912 if (PtrAddCk(fread_end, rare10_ct, fread_pp)) {
9913 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9914 return 1;
9915 }
9916 const uint32_t max_code = allele_ct - 2;
9917 het_incr = 0;
9918 for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
9919 const uint32_t cur_byte = fvals[uii];
9920 const uint32_t lo_code = cur_byte & 15;
9921 const uint32_t hi_code = cur_byte >> 4;
9922 if (unlikely((!hi_code) || (hi_code > max_code) || (lo_code > hi_code))) {
9923 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (out-of-range or misordered allele code pair).\n", vidx);
9924 return 1;
9925 }
9926 het_incr += (lo_code != hi_code);
9927 }
9928 } else {
9929 // 8+8 bit entries
9930 if (PtrAddCk(fread_end, 2 * rare10_ct, fread_pp)) {
9931 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (shorter than expected).\n", vidx);
9932 return 1;
9933 }
9934 const uint32_t max_code = allele_ct - 2;
9935 het_incr = 0;
9936 for (uint32_t uii = 0; uii != rare10_ct; ++uii) {
9937 const AlleleCode lo_code = fvals[2 * uii];
9938 const AlleleCode hi_code = fvals[2 * uii + 1];
9939 if (unlikely((!hi_code) || (hi_code > max_code) || (lo_code > hi_code))) {
9940 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid multiallelic altxy hardcall track for (0-based) variant #%u (out-of-range or misordered allele code pair).\n", vidx);
9941 return 1;
9942 }
9943 het_incr += (lo_code != hi_code);
9944 }
9945 }
9946 }
9947 *het_ctp += het_incr;
9948 }
9949 return 0;
9950 }
9951
ValidateHphase(const unsigned char * fread_end,uint32_t vidx,uint32_t het_ct,const unsigned char ** fread_pp,char * errstr_buf)9952 BoolErr ValidateHphase(const unsigned char* fread_end, uint32_t vidx, uint32_t het_ct, const unsigned char** fread_pp, char* errstr_buf) {
9953 if (unlikely(!het_ct)) {
9954 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Hardcall phase track present for (0-based) variant #%u, but there were no heterozygous calls.\n", vidx);
9955 return 1;
9956 }
9957 const uint32_t aux2_first_part_byte_ct = 1 + (het_ct / CHAR_BIT);
9958 const unsigned char* aux2_first_part = *fread_pp;
9959 if (PtrAddCk(fread_end, aux2_first_part_byte_ct, fread_pp)) {
9960 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid hardcall phase track present for (0-based) variant #%u.\n", vidx);
9961 return 1;
9962 }
9963 const uint32_t het_ct_p1_mod8 = (het_ct + 1) % CHAR_BIT;
9964 if (het_ct_p1_mod8) {
9965 // verify trailing bits are zero
9966 if (unlikely((*fread_pp)[-1] >> het_ct_p1_mod8)) {
9967 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Hardcall phase track for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9968 return 1;
9969 }
9970 }
9971 if (!((*aux2_first_part) & 1)) {
9972 // phase always present, "first part" is only part
9973 return 0;
9974 }
9975 const uint32_t phasepresent_ct = PopcountBytes(aux2_first_part, aux2_first_part_byte_ct) - 1;
9976 if (unlikely(!phasepresent_ct)) {
9977 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Hardcall phase track for (0-based) variant #%u does not have any actual phase information.\n", vidx);
9978 return 1;
9979 }
9980 const uint32_t phaseinfo_byte_ct = DivUp(phasepresent_ct, CHAR_BIT);
9981 if (PtrAddCk(fread_end, phaseinfo_byte_ct, fread_pp)) {
9982 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid hardcall phase track present for (0-based) variant #%u.\n", vidx);
9983 return 1;
9984 }
9985 const uint32_t phasepresent_ct_mod8 = phasepresent_ct % 8;
9986 if (phasepresent_ct_mod8) {
9987 if (unlikely((*fread_pp)[-1] >> phasepresent_ct_mod8)) {
9988 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Hardcall phase track for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
9989 return 1;
9990 }
9991 }
9992 return 0;
9993 }
9994
ValidateDosage16(const unsigned char * fread_end,uint32_t vidx,PgenReaderMain * pgrp,const unsigned char ** fread_pp,char * errstr_buf)9995 PglErr ValidateDosage16(const unsigned char* fread_end, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, char* errstr_buf) {
9996 // similar to ParseDosage16(). doesn't support multiallelic data yet.
9997 const uint32_t vrtype = pgrp->fi.vrtypes[vidx];
9998 const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
9999 if ((vrtype & 0x60) == 0x40) {
10000 // unconditional dosage. handle separately from other two cases since
10001 // 65535 is valid.
10002 #ifdef __arm__
10003 # error "Unaligned accesses in ValidateDosage16()."
10004 #endif
10005 const uint16_t* dosage_main = R_CAST(const uint16_t*, *fread_pp);
10006 if (PtrAddCk(fread_end, sample_ct * sizeof(int16_t), fread_pp)) {
10007 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid unconditional dosage track for (0-based) variant #%u.\n", vidx);
10008 return kPglRetMalformedInput;
10009 }
10010 // todo: verify genotype and dosage are consistent
10011 for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
10012 uint16_t cur_dosage_val_p1 = dosage_main[sample_idx];
10013 cur_dosage_val_p1 += 1; // intentional overflow on 65535
10014 if (unlikely(cur_dosage_val_p1 > 32769)) {
10015 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid unconditional dosage track for (0-based) variant #%u (dosage is greater than 2).\n", vidx);
10016 return kPglRetMalformedInput;
10017 }
10018 }
10019 if (vrtype & 0x80) {
10020 const int16_t* dphase_delta = R_CAST(const int16_t*, *fread_pp);
10021 if (PtrAddCk(fread_end, sample_ct * sizeof(int16_t), fread_pp)) {
10022 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid unconditional phased-dosages for (0-based) variant #%u.\n", vidx);
10023 return kPglRetMalformedInput;
10024 }
10025 for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
10026 const uint16_t dosage_val = dosage_main[sample_idx];
10027 const int16_t dphase_delta_val = dphase_delta[sample_idx];
10028 const uint16_t dpiece0_x2 = dosage_val + dphase_delta_val;
10029 const uint16_t dpiece1_x2 = dosage_val - dphase_delta_val;
10030 // Update (11 May 2018): parity condition removed.
10031 if ((dpiece0_x2 > 32768) || (dpiece1_x2 > 32768)) {
10032 if (unlikely((dphase_delta_val != -32768) || (dosage_val != 65535))) {
10033 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid unconditional phased-dosages for (0-based) variant #%u.\n", vidx);
10034 return kPglRetMalformedInput;
10035 }
10036 }
10037 }
10038 }
10039 return kPglRetSuccess;
10040 }
10041 uint32_t dosage_ct;
10042 if ((vrtype & 0x60) == 0x20) {
10043 // dosage list
10044 if (unlikely(ValidateAndCountDeltalist(fread_end, sample_ct, fread_pp, nullptr, &dosage_ct))) {
10045 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid dosage list for (0-based) variant #%u.\n", vidx);
10046 return kPglRetMalformedInput;
10047 }
10048 } else {
10049 const uint32_t sample_ctb = DivUp(sample_ct, CHAR_BIT);
10050 if (PtrCheck(fread_end, *fread_pp, sample_ctb)) {
10051 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid dosage subset for (0-based) variant #%u.\n", vidx);
10052 return kPglRetMalformedInput;
10053 }
10054 dosage_ct = PopcountBytes(*fread_pp, sample_ctb);
10055 *fread_pp += sample_ctb;
10056 const uint32_t sample_ct_mod8 = sample_ct % 8;
10057 if (sample_ct_mod8) {
10058 if (unlikely((*fread_pp)[-1] >> sample_ct_mod8)) {
10059 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Dosage subset bitarray for (0-based) variant #%u has nonzero trailing bits.\n", vidx);
10060 return kPglRetMalformedInput;
10061 }
10062 }
10063 }
10064 const uint16_t* dosage_main = R_CAST(const uint16_t*, *fread_pp);
10065 if (PtrAddCk(fread_end, dosage_ct * sizeof(int16_t), fread_pp)) {
10066 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid dosage track for (0-based) variant #%u.\n", vidx);
10067 return kPglRetMalformedInput;
10068 }
10069 for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
10070 if (unlikely(dosage_main[dosage_idx] > 32768)) {
10071 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid dosage track for (0-based) variant #%u (dosage is greater than 2).\n", vidx);
10072 return kPglRetMalformedInput;
10073 }
10074 }
10075 if (vrtype & 0x80) {
10076 const uintptr_t* file_dphase_present = R_CAST(const uintptr_t*, *fread_pp);
10077 const uint32_t dphase_present_byte_ct = DivUp(dosage_ct, CHAR_BIT);
10078 if (PtrAddCk(fread_end, dphase_present_byte_ct, fread_pp)) {
10079 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10080 return kPglRetMalformedInput;
10081 }
10082 const uint32_t trailing_bit_ct = dosage_ct % CHAR_BIT;
10083 if (unlikely(trailing_bit_ct && ((*fread_pp)[-1] & (255 << trailing_bit_ct)))) {
10084 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10085 return kPglRetMalformedInput;
10086 }
10087 const uint16_t* dosage_main_read_iter = dosage_main;
10088 const int16_t* dphase_delta_read_iter = R_CAST(const int16_t*, *fread_pp);
10089 const uint32_t dphase_widx_last = (dphase_present_byte_ct - 1) / kBytesPerWord;
10090 uint32_t loop_end = kBitsPerWord;
10091 for (uint32_t dphase_widx = 0; ; ++dphase_widx) {
10092 uintptr_t ww;
10093 if (dphase_widx >= dphase_widx_last) {
10094 if (dphase_widx > dphase_widx_last) {
10095 break;
10096 }
10097 loop_end = 1 + ((dosage_ct - 1) % kBitsPerWord);
10098 const uint32_t final_byte_ct = DivUp(loop_end, CHAR_BIT);
10099 ww = SubwordLoad(&(file_dphase_present[dphase_widx]), final_byte_ct);
10100 } else {
10101 ww = file_dphase_present[dphase_widx];
10102 }
10103 for (uint32_t dphase_lowbits = 0; dphase_lowbits != loop_end; ++dphase_lowbits, ++dosage_main_read_iter) {
10104 if (!((ww >> dphase_lowbits) & 1)) {
10105 continue;
10106 }
10107 const uint16_t dosage_val = *dosage_main_read_iter;
10108 const int16_t dphase_delta_val = *dphase_delta_read_iter++;
10109 const uint16_t dpiece0_x2 = dosage_val + dphase_delta_val;
10110 const uint16_t dpiece1_x2 = dosage_val - dphase_delta_val;
10111 if (unlikely((dpiece0_x2 > 32768) || (dpiece1_x2 > 32768))) {
10112 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10113 return kPglRetMalformedInput;
10114 }
10115 }
10116 }
10117 if (unlikely(dphase_delta_read_iter == R_CAST(const int16_t*, *fread_pp))) {
10118 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10119 return kPglRetMalformedInput;
10120 }
10121 *fread_pp = R_CAST(const unsigned char*, dphase_delta_read_iter);
10122 if (PtrCheck(fread_end, *fread_pp, 0)) {
10123 snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid phased-dosage track for (0-based) variant #%u.\n", vidx);
10124 return kPglRetMalformedInput;
10125 }
10126 }
10127 return kPglRetSuccess;
10128 }
10129
10130 static_assert(kPglVblockSize == 65536, "PgrValidate() needs to have an error message updated.");
// Exhaustive validation of the .pgen body beyond what the init functions
// check: trailing genotype bits, header random-access index consistency,
// total file size, and a full decode of every variant record (genotypes,
// multiallelic hardcalls, hardcall phase, dosages).
//   genovec_buf: caller-allocated scratch holding one variant's genotypes
//     (presumably sized for raw_sample_ct 2-bit entries -- TODO confirm
//     against callers).
//   errstr_buf: at least kPglErrstrBufBlen bytes; receives a
//     null-terminated error message on failure.
// Returns kPglRetSuccess, kPglRetMalformedInput, kPglRetReadFail,
// kPglRetInconsistentInput, or kPglRetNotYetSupported.
PglErr PgrValidate(PgenReader* pgr_ptr, uintptr_t* genovec_buf, char* errstr_buf) {
  PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
  // Performs all validation which isn't done by pgfi_init_phase{1,2}() and
  // PgrInit().
  const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
  const uint32_t variant_ct = pgrp->fi.raw_variant_ct;
  const uint32_t sample_ct = pgrp->fi.raw_sample_ct;
  const uint32_t const_vrtype = pgrp->fi.const_vrtype;
  if (const_vrtype != UINT32_MAX) {
    // Fixed-width-record case (.bed, or fixed-vrtype .pgen).
    if (unlikely(allele_idx_offsets && (allele_idx_offsets[variant_ct] != 2 * variant_ct))) {
      snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pvar file contains multiallelic variant(s), but .%s file does not.\n", (const_vrtype == kPglVrtypePlink1)? "bed" : "pgen");
      return kPglRetInconsistentInput;
    }
    // const uintptr_t const_vrec_width = pgrp->fi.const_vrec_width;
    if ((!const_vrtype) || (const_vrtype == kPglVrtypePlink1)) {
      // only thing that can go wrong is nonzero trailing bits
      const uint32_t dbl_sample_ct_mod4 = 2 * (sample_ct % 4);
      if (!dbl_sample_ct_mod4) {
        // Records end exactly on a byte boundary; nothing to check.
        return kPglRetSuccess;
      }
      for (uint32_t vidx = 0; vidx != variant_ct; ++vidx) {
        const unsigned char* fread_ptr;
        const unsigned char* fread_end = nullptr;
        if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
          snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
          return kPglRetReadFail;
        }
        // Only the bits above the last sample's genotype can be nonzero.
        const uint32_t last_byte_in_record = fread_end[-1];
        if (unlikely(last_byte_in_record >> dbl_sample_ct_mod4)) {
          snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Last byte of (0-based) variant #%u has nonzero trailing bits.\n", vidx);
          return kPglRetMalformedInput;
        }
      }
      return kPglRetSuccess;
    }
    // todo: 16-bit dosage entries can't be in [32769,65534]
    snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Validation of fixed-width dosage formats is not implemented yet.\n");
    return kPglRetNotYetSupported;
  }
  const unsigned char* vrtypes = pgrp->fi.vrtypes;
  // The first variant of each vblock has no earlier in-block variant to act
  // as an LD reference, so LD compression is prohibited there.
  for (uint32_t vidx = 0; vidx < variant_ct; vidx += kPglVblockSize) {
    if (unlikely(VrtypeLdCompressed(vrtypes[vidx]))) {
      snprintf(errstr_buf, kPglErrstrBufBlen, "Error: (0-based) variant #%u is LD-compressed; this is prohibited when the variant index is a multiple of 65536.\n", vidx);
      return kPglRetMalformedInput;
    }
  }
  // file size may not be validated yet.
  uint64_t fsize;
  FILE* ff = pgrp->ff;
#ifndef NO_MMAP
  if (ff == nullptr) {
    // mmap case
    fsize = pgrp->fi.file_size;
  } else {
#endif
    if (unlikely(fseeko(ff, 0, SEEK_END))) {
      snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
      return kPglRetReadFail;
    }
    fsize = ftello(ff);
    pgrp->fp_vidx = 1; // force fseek when loading first variant
#ifndef NO_MMAP
  }
#endif
  // todo: modify this check when phase sets are implemented
  const uint64_t expected_fsize = pgrp->fi.var_fpos[variant_ct];
  if (unlikely(expected_fsize != fsize)) {
    snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen header indicates that file size should be %" PRIu64 " bytes, but actual file size is %" PRIu64 " bytes.\n", expected_fsize, fsize);
    return kPglRetMalformedInput;
  }
  const uint32_t vblock_ct = DivUp(variant_ct, kPglVblockSize);
  uint32_t header_ctrl = 0;
#ifndef NO_MMAP
  if (ff == nullptr) {
# ifdef __arm__
#  error "Unaligned accesses in PgrValidate()."
# endif
    // Header-control byte lives at file offset 11 in both read paths.
    memcpy(&header_ctrl, &(pgrp->fi.block_base[11]), 1);
    // validate the random-access index.
    const uint64_t* fpos_index = R_CAST(const uint64_t*, &(pgrp->fi.block_base[12]));
    for (uint32_t vblock_idx = 0; vblock_idx != vblock_ct; ++vblock_idx) {
      // Each vblock-start fpos stored in the header must match the position
      // reconstructed from the cumulative record lengths.
      if (unlikely(fpos_index[vblock_idx] != pgrp->fi.var_fpos[vblock_idx * kPglVblockSize])) {
        snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen header vblock-start index is inconsistent with variant record length index.\n");
        return kPglRetMalformedInput;
      }
    }
  } else {
#endif
    if (unlikely(fseeko(ff, 11, SEEK_SET))) {
      snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
      return kPglRetReadFail;
    }
    header_ctrl = getc_unlocked(ff);
    // getc_unlocked() returns the int EOF sentinel on failure; after the
    // implicit conversion to uint32_t that exceeds 255, so this detects
    // read errors/EOF, not an out-of-range byte.
    if (unlikely(header_ctrl > 255)) {
      snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
      return kPglRetReadFail;
    }
    for (uint32_t vblock_idx = 0; vblock_idx != vblock_ct; ++vblock_idx) {
      uint64_t vblock_start_fpos;
      if (unlikely(!fread_unlocked(&vblock_start_fpos, sizeof(int64_t), 1, ff))) {
        snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
        return kPglRetReadFail;
      }
      if (unlikely(vblock_start_fpos != pgrp->fi.var_fpos[vblock_idx * kPglVblockSize])) {
        snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen header vblock-start index is inconsistent with variant record length index.\n");
        return kPglRetMalformedInput;
      }
    }
#ifndef NO_MMAP
  }
#endif
  // Decode header-control fields: low 4 bits = vrtype/fpos storage mode,
  // bits 4-5 = alt-allele-count byte width, bits 6-7 = nonref_flags mode
  // (3 means explicitly stored).
  const uint32_t vrtype_and_fpos_storage = header_ctrl & 15;
  const uint32_t alt_allele_ct_byte_ct = (header_ctrl >> 4) & 3;
  const uint32_t nonref_flags_stored = ((header_ctrl >> 6) == 3);

  // does not include vrtypes yet
  uint64_t vblock_index_byte_ct = kPglVblockSize * (1 + (vrtype_and_fpos_storage & 3) + alt_allele_ct_byte_ct);
  if (nonref_flags_stored) {
    vblock_index_byte_ct += kPglVblockSize / CHAR_BIT;
  }
  // If the vrtype index packs multiple entries per byte, the final byte of
  // the index may have unused high bits; compute where that byte lives and
  // how many of its low bits are meaningful.
  uint64_t last_vrtype_byte_offset = 0;
  uint32_t trailing_shift = 4;
  if (vrtype_and_fpos_storage & 8) {
    // Mode 8: 2-bit vrtype entries (4/byte); mode 9: 4-bit entries (2/byte).
    vblock_index_byte_ct += kPglVblockSize >> (10 - vrtype_and_fpos_storage);
    if (vrtype_and_fpos_storage == 8) {
      const uint32_t variant_ct_mod4 = variant_ct % 4;
      if (variant_ct_mod4) {
        last_vrtype_byte_offset = 20 + (vblock_ct - 1) * (vblock_index_byte_ct + sizeof(int64_t)) + ((variant_ct % kPglVblockSize) / 4);
        trailing_shift = variant_ct_mod4 * 2;
      }
    } else {
      assert(vrtype_and_fpos_storage == 9);
      if (variant_ct % 2) {
        last_vrtype_byte_offset = 20 + (vblock_ct - 1) * (vblock_index_byte_ct + sizeof(int64_t)) + ((variant_ct % kPglVblockSize) / 2);
      }
    }
  } else if (!(vrtype_and_fpos_storage & 4)) {
    // Default: 4-bit vrtype entries (2/byte); trailing_shift stays 4.
    vblock_index_byte_ct += kPglVblockSize / 2;
    if (variant_ct % 2) {
      // bugfix (22 Nov 2017): forgot to add offset in last block
      last_vrtype_byte_offset = 20 + (vblock_ct - 1) * (vblock_index_byte_ct + sizeof(int64_t)) + ((variant_ct % kPglVblockSize) / 2);
    }
    /*
  } else {
    vblock_index_byte_ct += kPglVblockSize;
    */
  }
  if (last_vrtype_byte_offset) {
    uint32_t last_vrtype_byte = 0;
#ifndef NO_MMAP
    if (ff == nullptr) {
      memcpy(&last_vrtype_byte, &(pgrp->fi.block_base[last_vrtype_byte_offset]), 1);
    } else {
#endif
      if (unlikely(fseeko(ff, last_vrtype_byte_offset, SEEK_SET))) {
        snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
        return kPglRetReadFail;
      }
      last_vrtype_byte = getc_unlocked(ff);
      // Same EOF-sentinel trick as the header_ctrl read above.
      if (unlikely(last_vrtype_byte > 255)) {
        snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
        return kPglRetReadFail;
      }
#ifndef NO_MMAP
    }
#endif
    if (unlikely(last_vrtype_byte >> trailing_shift)) {
      snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Nonzero trailing bits in last vrtype index byte.\n");
      return kPglRetMalformedInput;
    }
  }
  const uintptr_t* nonref_flags = pgrp->fi.nonref_flags;
  if (nonref_flags) {
    // Trailing bits only need checking when the file's last nonref_flags
    // byte is partially filled; bits past that byte are in-memory only.
    const uint32_t variant_ct_modl = variant_ct % kBitsPerWord;
    if (variant_ct % CHAR_BIT) {
      if (unlikely(nonref_flags[variant_ct / kBitsPerWord] >> variant_ct_modl)) {
        snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Nonzero trailing bits in last nonref_flags byte.\n");
        return kPglRetMalformedInput;
      }
    }
  }

  // could move most of this into plink2_common and make it multithreaded, if
  // speed is ever an issue.
  uint32_t allele_ct = 2;
  for (uint32_t vidx = 0; vidx != variant_ct; ++vidx) {
    const unsigned char* fread_ptr;
    const unsigned char* fread_end;
    if (unlikely(InitReadPtrs(vidx, pgrp, &fread_ptr, &fread_end))) {
      snprintf(errstr_buf, kPglErrstrBufBlen, "Error: .pgen read failure: %s.\n", strerror(errno));
      return kPglRetReadFail;
    }
    const unsigned char* fread_ptr_start = fread_ptr;
    // Decode/validate tracks in on-disk order: main genotypes, multiallelic
    // hardcalls, hardcall phase, then dosages.
    if (unlikely(ValidateGeno(fread_end, vidx, pgrp, &fread_ptr, genovec_buf, errstr_buf))) {
      return kPglRetMalformedInput;
    }
    ZeroTrailingNyps(sample_ct, genovec_buf);
    const uint32_t vrtype = vrtypes[vidx];
    // het_ct is needed to size the hardcall-phase track.
    uint32_t het_ct = CountNyp(genovec_buf, kMask5555, sample_ct);
    if (allele_idx_offsets) {
      allele_ct = allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx];
    }
    if (VrtypeMultiallelicHc(vrtype)) {
      // May adjust het_ct for het calls involving non-first alt alleles.
      if (unlikely(ValidateMultiallelicHc(fread_end, genovec_buf, vidx, allele_ct, pgrp, &fread_ptr, &het_ct, errstr_buf))) {
        return kPglRetMalformedInput;
      }
    }
    // don't need genovec_buf to store main genotypes past this point.
    if (VrtypeHphase(vrtype)) {
      if (unlikely(ValidateHphase(fread_end, vidx, het_ct, &fread_ptr, errstr_buf))) {
        return kPglRetMalformedInput;
      }
    }
    if (vrtype & 0xe0) {
      // Phased-dosage bit (0x80) requires at least one of the main dosage
      // bits (0x60) to also be set.
      if (unlikely((vrtype & 0xe0) == 0x80)) {
        snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Invalid record type for (0-based) variant #%u (phased dosage bit set, but main dosage bits unset).\n", vidx);
        return kPglRetMalformedInput;
      }
      PglErr reterr = ValidateDosage16(fread_end, vidx, pgrp, &fread_ptr, errstr_buf);
      if (unlikely(reterr)) {
        return reterr;
      }
    }
    // Every byte of the record must have been consumed.
    if (unlikely(fread_ptr != fread_end)) {
      // possible todo: tolerate this at the end of a vblock.
      snprintf(errstr_buf, kPglErrstrBufBlen, "Error: Extra byte(s) in (0-based) variant record #%u. (record type = %u; expected length = %" PRIuPTR ", actual = %" PRIuPTR ")\n", vidx, vrtype, S_CAST(uintptr_t, fread_ptr - fread_ptr_start), S_CAST(uintptr_t, fread_end - fread_ptr_start));
      return kPglRetMalformedInput;
    }
  }
  return kPglRetSuccess;
}
10362
10363
CleanupPgfi(PgenFileInfo * pgfip,PglErr * reterrp)10364 BoolErr CleanupPgfi(PgenFileInfo* pgfip, PglErr* reterrp) {
10365 // memory is the responsibility of the caller
10366 if (pgfip->shared_ff) {
10367 if (unlikely(fclose_null(&pgfip->shared_ff))) {
10368 if (*reterrp == kPglRetSuccess) {
10369 *reterrp = kPglRetReadFail;
10370 return 1;
10371 }
10372 }
10373 #ifndef NO_MMAP
10374 } else if (pgfip->block_base != nullptr) {
10375 munmap(K_CAST(unsigned char*, pgfip->block_base), pgfip->file_size);
10376 #endif
10377 }
10378 return 0;
10379 }
10380
CleanupPgr(PgenReader * pgr_ptr,PglErr * reterrp)10381 BoolErr CleanupPgr(PgenReader* pgr_ptr, PglErr* reterrp) {
10382 PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
10383 // assume file is open if pgr.ff is not null
10384 // memory is the responsibility of the caller for now
10385 if (!pgrp->ff) {
10386 return 0;
10387 }
10388 if (fclose_null(&(pgrp->ff))) {
10389 if (*reterrp == kPglRetSuccess) {
10390 *reterrp = kPglRetReadFail;
10391 return 1;
10392 }
10393 }
10394 return 0;
10395 }
10396
10397 #ifdef __cplusplus
10398 } // namespace plink2
10399 #endif
10400