1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "plink2_common.h"
19 
20 #ifdef __cplusplus
21 namespace plink2 {
22 #endif
23 
InitPedigreeIdInfo(MiscFlags misc_flags,PedigreeIdInfo * piip)24 void InitPedigreeIdInfo(MiscFlags misc_flags, PedigreeIdInfo* piip) {
25   piip->sii.sample_ids = nullptr;
26   piip->sii.sids = nullptr;
27   piip->sii.max_sample_id_blen = 4;
28   piip->sii.max_sid_blen = 0;
29   piip->sii.flags = kfSampleId0;
30   if (misc_flags & kfMiscNoIdHeader) {
31     piip->sii.flags |= kfSampleIdNoIdHeader;
32     if (misc_flags & kfMiscNoIdHeaderIidOnly) {
33       piip->sii.flags |= kfSampleIdNoIdHeaderIidOnly;
34     }
35   }
36   if (misc_flags & kfMiscStrictSid0) {
37     piip->sii.flags |= kfSampleIdStrictSid0;
38   }
39   piip->parental_id_info.paternal_ids = nullptr;
40   piip->parental_id_info.maternal_ids = nullptr;
41   piip->parental_id_info.max_paternal_id_blen = 2;
42   piip->parental_id_info.max_maternal_id_blen = 2;
43 }
44 
BigstackAllocPgv(uint32_t sample_ct,uint32_t multiallelic_needed,PgenGlobalFlags gflags,PgenVariant * pgvp)45 BoolErr BigstackAllocPgv(uint32_t sample_ct, uint32_t multiallelic_needed, PgenGlobalFlags gflags, PgenVariant* pgvp) {
46   const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
47   if (unlikely(
48           bigstack_alloc_w(sample_ctl2, &(pgvp->genovec)))) {
49     return 1;
50   }
51   const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
52   if (multiallelic_needed) {
53     if (unlikely(
54             bigstack_alloc_w(sample_ctl, &(pgvp->patch_01_set)) ||
55             bigstack_alloc_ac(sample_ct, &(pgvp->patch_01_vals)) ||
56             bigstack_alloc_w(sample_ctl, &(pgvp->patch_10_set)) ||
57             bigstack_alloc_ac(sample_ct * 2, &(pgvp->patch_10_vals)))) {
58       return 1;
59     }
60   } else {
61     // defensive
62     pgvp->patch_01_set = nullptr;
63     pgvp->patch_01_vals = nullptr;
64     pgvp->patch_10_set = nullptr;
65     pgvp->patch_10_vals = nullptr;
66   }
67   if (gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent)) {
68     if (unlikely(
69             bigstack_alloc_w(sample_ctl, &(pgvp->phasepresent)) ||
70             bigstack_alloc_w(sample_ctl, &(pgvp->phaseinfo)))) {
71       return 1;
72     }
73   } else {
74     pgvp->phasepresent = nullptr;
75     pgvp->phaseinfo = nullptr;
76   }
77   if (gflags & kfPgenGlobalDosagePresent) {
78     if (unlikely(
79             bigstack_alloc_w(sample_ctl, &(pgvp->dosage_present)) ||
80             bigstack_alloc_dosage(sample_ct, &(pgvp->dosage_main)))) {
81       return 1;
82     }
83     if (multiallelic_needed) {
84       // todo
85     }
86     if (gflags & kfPgenGlobalDosagePhasePresent) {
87       if (unlikely(
88               bigstack_alloc_w(sample_ctl, &(pgvp->dphase_present)) ||
89               bigstack_alloc_dphase(sample_ct, &(pgvp->dphase_delta)))) {
90         return 1;
91       }
92       if (multiallelic_needed) {
93         // todo
94       }
95     }
96   } else {
97     pgvp->dosage_present = nullptr;
98     pgvp->dosage_main = nullptr;
99     pgvp->dphase_present = nullptr;
100     pgvp->dphase_delta = nullptr;
101     // todo: multiallelic-dosage buffers
102   }
103   return 0;
104 }
105 
106 static_assert(kDosageMid == 16384, "PrintDosageDecimal() needs to be updated.");
PrintDosageDecimal(uint32_t remainder,char * start)107 char* PrintDosageDecimal(uint32_t remainder, char* start) {
108   // Instead of constant 5-digit precision, we print fewer digits whenever that
109   // doesn't interfere with proper round-tripping.  I.e. we search for the
110   // shortest string in
111   //   ((n - 0.5)/16384, (n + 0.5)/16384).
112   // E.g. 3277/16384 is 0.20001 when printed with 5-digit precision, but we'd
113   // print that as 0.2 since that's still in (3276.5/16384, 3277.5/16384).
114   *start++ = '.';
115   // (remainder * 2) is in 32768ths
116   // 32768 * 625 = 20480k, smallest common denominator with 10^4
117 
118   const uint32_t range_top_20480k = (remainder * 2 + 1) * 625;
119   // this is technically checking a half-open rather than a fully-open
120   // interval, but that's fine since we never hit the boundary points
121   if ((range_top_20480k % 2048) < 1250) {
122     // when this is true, the four-decimal-place approximation is in the range
123     // which round-trips back to our original number.
124     const uint32_t four_decimal_places = range_top_20480k / 2048;
125     return u32toa_trunc4(four_decimal_places, start);
126   }
127 
128   // we wish to print (100000 * remainder + 8192) / 16384, left-0-padded.  and
129   // may as well banker's round too.
130   //
131   // banker's rounding yields a different result than regular rounding for n/64
132   // when n is congruent to 1 mod 4:
133   //   1/64 = .015625 -> print 0.01562
134   //   3/64 = .046875 -> print 0.04688
135   //   5/64 = .078125 -> print 0.07812
136   const uint32_t five_decimal_places = ((3125 * remainder + 256) / 512) - ((remainder % 1024) == 256);
137   const uint32_t first_decimal_place = five_decimal_places / 10000;
138   *start++ = '0' + first_decimal_place;
139   const uint32_t last_four_digits = five_decimal_places - first_decimal_place * 10000;
140   if (last_four_digits) {
141     return u32toa_trunc4(last_four_digits, start);
142   }
143   return start;
144 }
145 
146 static_assert(kDosageMax == 32768, "ddosagetoa() needs to be updated.");
ddosagetoa(uint64_t dosage,char * start)147 char* ddosagetoa(uint64_t dosage, char* start) {
148   // 3 digit precision seems like the best compromise between accuracy and
149   // avoidance of rounding ugliness
150   // (Rounding ugliness is not actually hidden for e.g. 1000 Genomes phase 1,
151   // since there are lots of 0.05 and 0.1 dosages which all get rounded in the
152   // same direction; oh well.)
153 
154   // +16 since we need to round .99951 up to 1
155   const uint64_t dosage_p16 = dosage + 16;
156   start = u32toa(dosage_p16 / kDosageMax, start);
157   const uint32_t remainder_p16 = S_CAST(uint32_t, dosage_p16) % kDosageMax;
158   if (remainder_p16 < 33) {
159     return start;
160   }
161   // (1000 * remainder + 16384) / 32768
162   //   1/16 = .0625 -> print 0.062
163   //   3/16 = .1875 -> print 0.188
164   //   5/16 = .3125 -> print 0.312
165   // const uint32_t three_decimal_places = ((125 * remainder + 2048) / 4096) - ((remainder % 8192) == 2048);
166   const uint32_t three_decimal_places = ((125 * remainder_p16 + 48) / 4096) - ((remainder_p16 % 8192) == 4048);
167   // three_decimal_places guaranteed to be nonzero here
168   *start++ = '.';
169   const uint32_t first_decimal_place = three_decimal_places / 100;
170   *start++ = '0' + first_decimal_place;
171   const uint32_t last_two_digits = three_decimal_places - first_decimal_place * 100;
172   if (last_two_digits) {
173     memcpy_k(start, &(kDigitPair[last_two_digits]), 2);
174     return &(start[1 + (start[1] != '0')]);
175   }
176   return start;
177 }
178 
179 static_assert(kDosageMax == 32768, "PrintDdosageDecimal() needs to be updated.");
PrintDdosageDecimal(uint32_t remainder,char * start)180 char* PrintDdosageDecimal(uint32_t remainder, char* start) {
181   // Instead of constant 5-digit precision, we print fewer digits whenever that
182   // doesn't interfere with proper round-tripping.  I.e. we search for the
183   // shortest string in
184   //   ((n - 0.5)/32768, (n + 0.5)/32768).
185   // E.g. 6554/32768 is 0.20001 when printed with 5-digit precision, but we'd
186   // print that as 0.2 since that's still in (6553.5/32768, 6554.5/32768).
187   // (This is more precise than necessary for most chromosomes, but it's
188   // necessary for chrX.)
189   *start++ = '.';
190   // (remainder * 2) is in 65536ths
191   // 65536 * 625 = 40960k, smallest common denominator with 10^4
192 
193   const uint32_t range_top_40960k = (remainder * 2 + 1) * 625;
194   // this is technically checking a half-open rather than a fully-open
195   // interval, but that's fine since we never hit the boundary points
196   if ((range_top_40960k % 4096) < 1250) {
197     // when this is true, the four-decimal-place approximation is in the range
198     // which round-trips back to our original number.
199     const uint32_t four_decimal_places = range_top_40960k / 4096;
200     return u32toa_trunc4(four_decimal_places, start);
201   }
202 
203   // we wish to print (100000 * remainder + 16384) / 32768, left-0-padded.  and
204   // may as well banker's round too.
205   //
206   // banker's rounding yields a different result than regular rounding for n/64
207   // when n is congruent to 1 mod 4:
208   //   1/64 = .015625 -> print 0.01562
209   //   3/64 = .046875 -> print 0.04688
210   //   5/64 = .078125 -> print 0.07812
211   const uint32_t five_decimal_places = ((3125 * remainder + 512) / 1024) - ((remainder % 2048) == 512);
212   const uint32_t first_decimal_place = five_decimal_places / 10000;
213   *start++ = '0' + first_decimal_place;
214   const uint32_t last_four_digits = five_decimal_places - first_decimal_place * 10000;
215   if (last_four_digits) {
216     return u32toa_trunc4(last_four_digits, start);
217   }
218   return start;
219 }
220 
221 static const uint16_t kHcToDosage[1024] = QUAD_TABLE256(0, kDosageMid, kDosageMax, kDosageMissing);
222 
PopulateDenseDosage(const uintptr_t * genoarr,const uintptr_t * dosage_present,const Dosage * dosage_main,uint32_t sample_ct,uint32_t dosage_ct,Dosage * dense_dosage)223 void PopulateDenseDosage(const uintptr_t* genoarr, const uintptr_t* dosage_present, const Dosage* dosage_main, uint32_t sample_ct, uint32_t dosage_ct, Dosage* dense_dosage) {
224   GenoarrLookup256x2bx4(genoarr, kHcToDosage, sample_ct, dense_dosage);
225   // fill trailing bits with missing values so vector operations work
226   const uint32_t trailing_entry_ct = (-sample_ct) % kDosagePerVec;
227   if (trailing_entry_ct) {
228     Dosage* dense_dosage_end = &(dense_dosage[sample_ct]);
229     for (uint32_t uii = 0; uii != trailing_entry_ct; ++uii) {
230       dense_dosage_end[uii] = kDosageMissing;
231     }
232   }
233   if (dosage_ct) {
234     uintptr_t sample_idx_base = 0;
235     uintptr_t cur_bits = dosage_present[0];
236     for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
237       const uintptr_t sample_idx = BitIter1(dosage_present, &sample_idx_base, &cur_bits);
238       dense_dosage[sample_idx] = dosage_main[dosage_idx];
239     }
240   }
241 }
242 
PopulateRescaledDosage(const uintptr_t * genoarr,const uintptr_t * dosage_present,const Dosage * dosage_main,double slope,double intercept,double missing_val,uint32_t sample_ct,uint32_t dosage_ct,double * expanded_dosages)243 void PopulateRescaledDosage(const uintptr_t* genoarr, const uintptr_t* dosage_present, const Dosage* dosage_main, double slope, double intercept, double missing_val, uint32_t sample_ct, uint32_t dosage_ct, double* expanded_dosages) {
244   double lookup_vals[32] ALIGNV16;
245   lookup_vals[0] = intercept;
246   lookup_vals[2] = intercept + slope;
247   lookup_vals[4] = intercept + 2 * slope;
248   lookup_vals[6] = missing_val;
249   InitLookup16x8bx2(lookup_vals);
250   GenoarrLookup16x8bx2(genoarr, lookup_vals, sample_ct, expanded_dosages);
251   if (!dosage_ct) {
252     return;
253   }
254   slope *= kRecipDosageMid;
255   uintptr_t sample_uidx_base = 0;
256   uintptr_t cur_bits = dosage_present[0];
257   for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
258     const uintptr_t sample_uidx = BitIter1(dosage_present, &sample_uidx_base, &cur_bits);
259     expanded_dosages[sample_uidx] = dosage_main[dosage_idx] * slope + intercept;
260   }
261 }
262 
263 static_assert(sizeof(AlleleCode) == 1, "AtLeastOneMultiallelicHet() needs to be updated.");
AtLeastOneMultiallelicHet(const PgenVariant * pgvp,uint32_t sample_ct)264 uint32_t AtLeastOneMultiallelicHet(const PgenVariant* pgvp, uint32_t sample_ct) {
265   if (pgvp->patch_01_ct) {
266     return 1;
267   }
268   {
269     const uintptr_t* genoarr = pgvp->genovec;
270     const uint32_t fullvec_ct = sample_ct / kBitsPerWordD2;
271     for (uint32_t uii = 0; uii != fullvec_ct; ++uii) {
272       const uintptr_t geno_word = genoarr[uii];
273       if (Word01(geno_word)) {
274         return 1;
275       }
276     }
277     const uint32_t remainder = sample_ct % kBitsPerWordD2;
278     if (remainder) {
279       if (Word01(bzhi(genoarr[fullvec_ct], 2 * remainder))) {
280         return 1;
281       }
282     }
283   }
284   const uint32_t patch_10_ct = pgvp->patch_10_ct;
285   if (patch_10_ct) {
286     const AlleleCode* patch_10_vals = pgvp->patch_10_vals;
287 #ifdef __LP64__
288     const uint32_t fullvec_ct = patch_10_ct / (kBytesPerVec / 2);
289     const VecU16* patch_10_valias = R_CAST(const VecU16*, patch_10_vals);
290     const VecU16 m8 = vecu16_set1(0xff);
291     for (uint32_t vidx = 0; vidx != fullvec_ct; ++vidx) {
292       const VecU16 vv_orig = vecu16_loadu(&(patch_10_valias[vidx]));
293       const VecU16 vv_hi = vecu16_srli(vv_orig, 8);
294       const VecU16 vv_lo = vv_orig & m8;
295       const VecU16 vv_eq = (vv_hi == vv_lo);
296       if (vecu16_movemask(vv_eq) != kVec8thUintMax) {
297         return 1;
298       }
299     }
300     uint32_t uii = fullvec_ct * (kBytesPerVec / 2);
301 #else
302     uint32_t uii = 0;
303 #endif
304     for (; uii != patch_10_ct; ++uii) {
305       if (patch_10_vals[2 * uii] != patch_10_vals[2 * uii + 1]) {
306         return 1;
307       }
308     }
309   }
310   return 0;
311 }
312 
SetHetMissing(uintptr_t word_ct,uintptr_t * genovec)313 void SetHetMissing(uintptr_t word_ct, uintptr_t* genovec) {
314   // 01 -> 11, nothing else changes
315 #ifdef __LP64__
316   const VecW m1 = VCONST_W(kMask5555);
317   VecW* geno_vvec_iter = R_CAST(VecW*, genovec);
318   const uintptr_t full_vec_ct = word_ct / kWordsPerVec;
319   if (full_vec_ct & 1) {
320     const VecW cur_geno_vword = *geno_vvec_iter;
321     const VecW cur_geno_vword_low_lshifted = vecw_slli(cur_geno_vword & m1, 1);
322     *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
323   }
324   if (full_vec_ct & 2) {
325     VecW cur_geno_vword = *geno_vvec_iter;
326     VecW cur_geno_vword_low_lshifted = vecw_slli(cur_geno_vword & m1, 1);
327     *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
328     cur_geno_vword = *geno_vvec_iter;
329     cur_geno_vword_low_lshifted = vecw_slli(cur_geno_vword & m1, 1);
330     *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
331   }
332   for (uintptr_t ulii = 3; ulii < full_vec_ct; ulii += 4) {
333     VecW cur_geno_vword = *geno_vvec_iter;
334     VecW cur_geno_vword_low_lshifted = vecw_slli(cur_geno_vword & m1, 1);
335     *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
336     cur_geno_vword = *geno_vvec_iter;
337     cur_geno_vword_low_lshifted = vecw_slli(cur_geno_vword & m1, 1);
338     *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
339     cur_geno_vword = *geno_vvec_iter;
340     cur_geno_vword_low_lshifted = vecw_slli(cur_geno_vword & m1, 1);
341     *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
342     cur_geno_vword = *geno_vvec_iter;
343     cur_geno_vword_low_lshifted = vecw_slli(cur_geno_vword & m1, 1);
344     *geno_vvec_iter++ = cur_geno_vword | cur_geno_vword_low_lshifted;
345   }
346 #  ifdef USE_AVX2
347   if (word_ct & 2) {
348     const uintptr_t base_idx = full_vec_ct * kWordsPerVec;
349     uintptr_t geno_word = genovec[base_idx];
350     genovec[base_idx] = geno_word | ((geno_word & kMask5555) << 1);
351     geno_word = genovec[base_idx + 1];
352     genovec[base_idx + 1] = geno_word | ((geno_word & kMask5555) << 1);
353   }
354 #  endif
355   if (word_ct & 1) {
356     const uintptr_t geno_word = genovec[word_ct - 1];
357     genovec[word_ct - 1] = geno_word | ((geno_word & kMask5555) << 1);
358   }
359 #else
360   for (uintptr_t widx = 0; widx != word_ct; ++widx) {
361     const uintptr_t geno_word = genovec[widx];
362     genovec[widx] = geno_word | ((geno_word & kMask5555) << 1);
363   }
364 #endif
365 }
366 
SetHetMissingCleardosage(uintptr_t word_ct,uintptr_t * __restrict genovec,uint32_t * write_dosage_ct_ptr,uintptr_t * __restrict dosagepresent,Dosage * dosage_main)367 void SetHetMissingCleardosage(uintptr_t word_ct, uintptr_t* __restrict genovec, uint32_t* write_dosage_ct_ptr, uintptr_t* __restrict dosagepresent, Dosage* dosage_main) {
368   const uint32_t orig_write_dosage_ct = *write_dosage_ct_ptr;
369   if (orig_write_dosage_ct) {
370     uintptr_t sample_uidx_base = 0;
371     uintptr_t cur_bits = dosagepresent[0];
372     for (uint32_t dosage_read_idx = 0; dosage_read_idx != orig_write_dosage_ct; ++dosage_read_idx) {
373       const uintptr_t sample_uidx = BitIter1(dosagepresent, &sample_uidx_base, &cur_bits);
374       if (GetNyparrEntry(genovec, sample_uidx) == 1) {
375         ClearBit(sample_uidx, dosagepresent);
376         uint32_t dosage_write_idx = dosage_read_idx++;
377         for (; dosage_read_idx == orig_write_dosage_ct; ++dosage_read_idx) {
378           const uintptr_t sample_uidx2 = BitIter1(dosagepresent, &sample_uidx_base, &cur_bits);
379           if (GetNyparrEntry(genovec, sample_uidx2) == 1) {
380             ClearBit(sample_uidx2, dosagepresent);
381           } else {
382             dosage_main[dosage_write_idx++] = dosage_main[dosage_read_idx];
383           }
384         }
385         *write_dosage_ct_ptr = dosage_write_idx;
386         break;
387       }
388     }
389   }
390   SetHetMissing(word_ct, genovec);
391 }
392 
SetHetMissingKeepdosage(uintptr_t word_ct,uintptr_t * __restrict genovec,uint32_t * write_dosage_ct_ptr,uintptr_t * __restrict dosagepresent,Dosage * dosage_main)393 void SetHetMissingKeepdosage(uintptr_t word_ct, uintptr_t* __restrict genovec, uint32_t* write_dosage_ct_ptr, uintptr_t* __restrict dosagepresent, Dosage* dosage_main) {
394   // count # of 1.00000 dosages we need to insert, and then rewrite dosage_main
395   // from back to front so we don't need temporary buffers.
396 
397   const uint32_t orig_dosage_ct = *write_dosage_ct_ptr;
398   // can't assume dosagepresent is initialized in this case
399   if (!orig_dosage_ct) {
400     ZeroWArr(DivUp(word_ct, 2), dosagepresent);
401   }
402   Halfword* dosagepresent_alias = R_CAST(Halfword*, dosagepresent);
403   uint32_t new_dosage_ct = 0;
404   // todo: try vectorizing this loop
405   for (uintptr_t widx = 0; widx != word_ct; ++widx) {
406     const uintptr_t geno_hets = Word01(genovec[widx]);
407     const uintptr_t dosagepresent_word = UnpackHalfwordToWord(dosagepresent_alias[widx]);
408     new_dosage_ct += Popcount01Word(geno_hets & (~dosagepresent_word));
409   }
410   if (!new_dosage_ct) {
411     SetHetMissing(word_ct, genovec);
412     return;
413   }
414   uint32_t dosage_write_idx = orig_dosage_ct + new_dosage_ct;
415   uint32_t dosage_read_idx = orig_dosage_ct;
416   uint32_t widx = word_ct;
417   *write_dosage_ct_ptr = dosage_write_idx;
418   do {
419     --widx;
420     const uintptr_t geno_word = genovec[widx];
421     const uintptr_t dosagepresent_word = UnpackHalfwordToWord(dosagepresent_alias[widx]);
422     const uintptr_t geno_hets = Word01(geno_word);
423     uintptr_t new_dosagepresent_word = dosagepresent_word | geno_hets;
424     if (new_dosagepresent_word) {
425       dosagepresent_alias[widx] = PackWordToHalfword(new_dosagepresent_word);
426       do {
427         const uint32_t top_set_bit = bsrw(new_dosagepresent_word);
428         const uintptr_t cur_bit_word = k1LU << top_set_bit;
429         Dosage cur_dosage = kDosageMid;
430         if (cur_bit_word & dosagepresent_word) {
431           cur_dosage = dosage_main[--dosage_read_idx];
432         }
433         dosage_main[--dosage_write_idx] = cur_dosage;
434         new_dosagepresent_word -= cur_bit_word;
435       } while (new_dosagepresent_word);
436       genovec[widx] = geno_word | (geno_hets << 1);
437     }
438   } while (widx);
439 }
440 
441 // todo: try vectorizing this
GenoarrToNonmissing(const uintptr_t * genoarr,uint32_t sample_ct,uintptr_t * nonmissing_bitarr)442 void GenoarrToNonmissing(const uintptr_t* genoarr, uint32_t sample_ct, uintptr_t* nonmissing_bitarr) {
443   const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
444   const uintptr_t* genoarr_iter = genoarr;
445   Halfword* nonmissing_bitarr_iter = R_CAST(Halfword*, nonmissing_bitarr);
446   for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
447     uintptr_t ww = ~(*genoarr_iter++);
448     ww = ww | (ww >> 1);
449     *nonmissing_bitarr_iter++ = PackWordToHalfwordMask5555(ww);
450   }
451   // zero trailing bits up to word boundary, in a way that doesn't create
452   // aliasing issues
453   // (if zeroing is needed up to vector boundary, that's the caller's
454   // responsibility)
455   const uint32_t trail_ct = sample_ct % kBitsPerWordD2;
456   if (trail_ct) {
457     nonmissing_bitarr_iter[-1] &= (1U << trail_ct) - 1;
458   }
459   if (sample_ctl2 % 2) {
460     *nonmissing_bitarr_iter = 0;
461   }
462 }
463 
464 // These functions may move to pgenlib_misc later.
CountMissingVec6(const VecW * __restrict geno_vvec,uint32_t vec_ct)465 uint32_t CountMissingVec6(const VecW* __restrict geno_vvec, uint32_t vec_ct) {
466   assert(!(vec_ct % 6));
467   const VecW m1 = VCONST_W(kMask5555);
468   const VecW m2 = VCONST_W(kMask3333);
469   const VecW m4 = VCONST_W(kMask0F0F);
470   const VecW* geno_vvec_iter = geno_vvec;
471   VecW acc_bothset = vecw_setzero();
472   uintptr_t cur_incr = 60;
473   for (; ; vec_ct -= cur_incr) {
474     if (vec_ct < 60) {
475       if (!vec_ct) {
476         return HsumW(acc_bothset);
477       }
478       cur_incr = vec_ct;
479     }
480     VecW inner_acc_bothset = vecw_setzero();
481     const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]);
482     do {
483       VecW cur_geno_vword = vecw_loadu(geno_vvec_iter);
484       ++geno_vvec_iter;
485       VecW bothset1 = m1 & cur_geno_vword & vecw_srli(cur_geno_vword, 1);
486 
487       cur_geno_vword = vecw_loadu(geno_vvec_iter);
488       ++geno_vvec_iter;
489       bothset1 = bothset1 + (m1 & cur_geno_vword & vecw_srli(cur_geno_vword, 1));
490 
491       cur_geno_vword = vecw_loadu(geno_vvec_iter);
492       ++geno_vvec_iter;
493       bothset1 = bothset1 + (m1 & cur_geno_vword & vecw_srli(cur_geno_vword, 1));
494       bothset1 = (bothset1 & m2) + (vecw_srli(bothset1, 2) & m2);
495 
496       cur_geno_vword = vecw_loadu(geno_vvec_iter);
497       ++geno_vvec_iter;
498       VecW bothset2 = m1 & cur_geno_vword & vecw_srli(cur_geno_vword, 1);
499 
500       cur_geno_vword = vecw_loadu(geno_vvec_iter);
501       ++geno_vvec_iter;
502       bothset2 = bothset2 + (m1 & cur_geno_vword & vecw_srli(cur_geno_vword, 1));
503 
504       cur_geno_vword = vecw_loadu(geno_vvec_iter);
505       ++geno_vvec_iter;
506       bothset2 = bothset2 + (m1 & cur_geno_vword & vecw_srli(cur_geno_vword, 1));
507 
508       bothset1 = bothset1 + (bothset2 & m2) + (vecw_srli(bothset2, 2) & m2);
509       // these now contain 4-bit values from 0-12
510 
511       inner_acc_bothset = inner_acc_bothset + (bothset1 & m4) + (vecw_srli(bothset1, 4) & m4);
512     } while (geno_vvec_iter < geno_vvec_stop);
513     const VecW m0 = vecw_setzero();
514     acc_bothset = acc_bothset + vecw_bytesum(inner_acc_bothset, m0);
515   }
516 }
517 
GenoarrCountMissingUnsafe(const uintptr_t * genoarr,uint32_t sample_ct)518 uint32_t GenoarrCountMissingUnsafe(const uintptr_t* genoarr, uint32_t sample_ct) {
519   const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
520   uint32_t word_idx = sample_ctl2 - (sample_ctl2 % (6 * kWordsPerVec));
521   uint32_t missing_ct = CountMissingVec6(R_CAST(const VecW*, genoarr), word_idx / kWordsPerVec);
522   for (; word_idx != sample_ctl2; ++word_idx) {
523     uintptr_t ww = genoarr[word_idx];
524     ww = ww & (ww >> 1);
525     missing_ct += Popcount01Word(ww);
526   }
527   return missing_ct;
528 }
529 
530 // geno_vvec allowed to be unaligned.
CountMissingMaskedVec6(const VecW * __restrict geno_vvec,const VecW * __restrict interleaved_mask_vvec,uint32_t vec_ct)531 uint32_t CountMissingMaskedVec6(const VecW* __restrict geno_vvec, const VecW* __restrict interleaved_mask_vvec, uint32_t vec_ct) {
532   assert(!(vec_ct % 6));
533   const VecW m1 = VCONST_W(kMask5555);
534   const VecW m2 = VCONST_W(kMask3333);
535   const VecW m4 = VCONST_W(kMask0F0F);
536   const VecW* geno_vvec_iter = geno_vvec;
537   const VecW* interleaved_mask_vvec_iter = interleaved_mask_vvec;
538   VecW acc_bothset = vecw_setzero();
539   uintptr_t cur_incr = 60;
540   for (; ; vec_ct -= cur_incr) {
541     if (vec_ct < 60) {
542       if (!vec_ct) {
543         return HsumW(acc_bothset);
544       }
545       cur_incr = vec_ct;
546     }
547     VecW inner_acc_bothset = vecw_setzero();
548     const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]);
549     do {
550       VecW interleaved_mask_vword = *interleaved_mask_vvec_iter++;
551       VecW cur_geno_vword = vecw_loadu(geno_vvec_iter);
552       ++geno_vvec_iter;
553       VecW cur_mask = interleaved_mask_vword & m1;
554       VecW bothset1 = cur_mask & cur_geno_vword & vecw_srli(cur_geno_vword, 1);
555 
556       cur_mask = vecw_srli(interleaved_mask_vword, 1) & m1;
557       cur_geno_vword = vecw_loadu(geno_vvec_iter);
558       ++geno_vvec_iter;
559       bothset1 = bothset1 + (cur_mask & cur_geno_vword & vecw_srli(cur_geno_vword, 1));
560 
561       interleaved_mask_vword = *interleaved_mask_vvec_iter++;
562       cur_mask = interleaved_mask_vword & m1;
563       cur_geno_vword = vecw_loadu(geno_vvec_iter);
564       ++geno_vvec_iter;
565       bothset1 = bothset1 + (cur_mask & cur_geno_vword & vecw_srli(cur_geno_vword, 1));
566       bothset1 = (bothset1 & m2) + (vecw_srli(bothset1, 2) & m2);
567 
568       cur_mask = vecw_srli(interleaved_mask_vword, 1) & m1;
569       cur_geno_vword = vecw_loadu(geno_vvec_iter);
570       ++geno_vvec_iter;
571       VecW bothset2 = cur_mask & cur_geno_vword & vecw_srli(cur_geno_vword, 1);
572 
573       interleaved_mask_vword = *interleaved_mask_vvec_iter++;
574       cur_mask = interleaved_mask_vword & m1;
575       cur_geno_vword = vecw_loadu(geno_vvec_iter);
576       ++geno_vvec_iter;
577       bothset2 = bothset2 + (cur_mask & cur_geno_vword & vecw_srli(cur_geno_vword, 1));
578 
579       cur_mask = vecw_srli(interleaved_mask_vword, 1) & m1;
580       cur_geno_vword = vecw_loadu(geno_vvec_iter);
581       ++geno_vvec_iter;
582       bothset2 = bothset2 + (cur_mask & cur_geno_vword & vecw_srli(cur_geno_vword, 1));
583 
584       bothset1 = bothset1 + (bothset2 & m2) + (vecw_srli(bothset2, 2) & m2);
585       // these now contain 4-bit values from 0-12
586 
587       inner_acc_bothset = inner_acc_bothset + (bothset1 & m4) + (vecw_srli(bothset1, 4) & m4);
588     } while (geno_vvec_iter < geno_vvec_stop);
589     const VecW m0 = vecw_setzero();
590     acc_bothset = acc_bothset + vecw_bytesum(inner_acc_bothset, m0);
591   }
592 }
593 
GenoarrCountMissingSubset(const uintptr_t * genoarr,const uintptr_t * interleaved_vec,uint32_t sample_ct)594 uint32_t GenoarrCountMissingSubset(const uintptr_t* genoarr, const uintptr_t* interleaved_vec, uint32_t sample_ct) {
595   // See GenoarrCountSubsetFreqs().
596   const uint32_t sample_ctv2 = NypCtToVecCt(sample_ct);
597   uint32_t missing_ct;
598 #ifdef __LP64__
599   uint32_t vec_idx = sample_ctv2 - (sample_ctv2 % 6);
600   missing_ct = CountMissingMaskedVec6(R_CAST(const VecW*, genoarr), R_CAST(const VecW*, interleaved_vec), vec_idx);
601   const uintptr_t* genoarr_iter = &(genoarr[kWordsPerVec * vec_idx]);
602   const uintptr_t* interleaved_mask_iter = &(interleaved_vec[(kWordsPerVec / 2) * vec_idx]);
603 #  ifdef USE_AVX2
604   uintptr_t mask_base1 = 0;
605   uintptr_t mask_base2 = 0;
606   uintptr_t mask_base3 = 0;
607   uintptr_t mask_base4 = 0;
608   for (; vec_idx != sample_ctv2; ++vec_idx) {
609     uintptr_t mask_word1;
610     uintptr_t mask_word2;
611     uintptr_t mask_word3;
612     uintptr_t mask_word4;
613     if (!(vec_idx % 2)) {
614       mask_base1 = *interleaved_mask_iter++;
615       mask_base2 = *interleaved_mask_iter++;
616       mask_base3 = *interleaved_mask_iter++;
617       mask_base4 = *interleaved_mask_iter++;
618       mask_word1 = mask_base1 & kMask5555;
619       mask_word2 = mask_base2 & kMask5555;
620       mask_word3 = mask_base3 & kMask5555;
621       mask_word4 = mask_base4 & kMask5555;
622     } else {
623       mask_word1 = (mask_base1 >> 1) & kMask5555;
624       mask_word2 = (mask_base2 >> 1) & kMask5555;
625       mask_word3 = (mask_base3 >> 1) & kMask5555;
626       mask_word4 = (mask_base4 >> 1) & kMask5555;
627     }
628     for (uint32_t vechalf_idx = 0; ; ++vechalf_idx) {
629       const uintptr_t cur_geno_word1 = *genoarr_iter++;
630       const uintptr_t cur_geno_word2 = *genoarr_iter++;
631       const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
632       const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
633       missing_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
634       if (vechalf_idx) {
635         break;
636       }
637       mask_word1 = mask_word3;
638       mask_word2 = mask_word4;
639     }
640   }
641 #  else  // not USE_AVX2
642   uintptr_t mask_base1 = 0;
643   uintptr_t mask_base2 = 0;
644   for (; vec_idx != sample_ctv2; ++vec_idx) {
645     uintptr_t mask_word1;
646     uintptr_t mask_word2;
647     if (!(vec_idx % 2)) {
648       mask_base1 = *interleaved_mask_iter++;
649       mask_base2 = *interleaved_mask_iter++;
650       mask_word1 = mask_base1 & kMask5555;
651       mask_word2 = mask_base2 & kMask5555;
652     } else {
653       mask_word1 = (mask_base1 >> 1) & kMask5555;
654       mask_word2 = (mask_base2 >> 1) & kMask5555;
655     }
656     const uintptr_t cur_geno_word1 = *genoarr_iter++;
657     const uintptr_t cur_geno_word2 = *genoarr_iter++;
658     const uintptr_t cur_geno_word1_high_masked = mask_word1 & (cur_geno_word1 >> 1);
659     const uintptr_t cur_geno_word2_high_masked = mask_word2 & (cur_geno_word2 >> 1);
660 #    ifdef USE_SSE42
661     missing_ct += PopcountWord(((cur_geno_word1 & cur_geno_word1_high_masked) << 1) | (cur_geno_word2 & cur_geno_word2_high_masked));
662 #    else
663     missing_ct += NypsumWord((cur_geno_word1 & cur_geno_word1_high_masked) + (cur_geno_word2 & cur_geno_word2_high_masked));
664 #    endif
665   }
666 #  endif  // not USE_AVX2
667 #else  // not __LP64__
668   uint32_t word_idx = sample_ctv2 - (sample_ctv2 % 6);
669   missing_ct = CountMissingMaskedVec6(R_CAST(const VecW*, genoarr), R_CAST(const VecW*, interleaved_vec), word_idx);
670   const uintptr_t* interleaved_mask_iter = &(interleaved_vec[word_idx / 2]);
671   uintptr_t mask_base = 0;
672   for (; word_idx != sample_ctv2; ++word_idx) {
673     uintptr_t mask_word;
674     if (!(word_idx % 2)) {
675       mask_base = *interleaved_mask_iter++;
676       mask_word = mask_base & kMask5555;
677     } else {
678       mask_word = (mask_base >> 1) & kMask5555;
679     }
680     const uintptr_t cur_geno_word = genoarr[word_idx];
681     const uintptr_t cur_geno_word_high_masked = mask_word & (cur_geno_word >> 1);
682     missing_ct += Popcount01Word(cur_geno_word & cur_geno_word_high_masked);
683   }
684 #endif
685   return missing_ct;
686 }
687 
688 // counts post-dosage missing, for which we don't have an interleaved mask
GenoarrCountMissingInvsubsetUnsafe(const uintptr_t * genoarr,const uintptr_t * exclude_mask,uint32_t sample_ct)689 uint32_t GenoarrCountMissingInvsubsetUnsafe(const uintptr_t* genoarr, const uintptr_t* exclude_mask, uint32_t sample_ct) {
690   const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
691   const uintptr_t* genoarr_iter = genoarr;
692   const Halfword* exclude_alias_iter = R_CAST(const Halfword*, exclude_mask);
693   uint32_t missing_ct = 0;
694   for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
695     uintptr_t ww = *genoarr_iter++;
696     ww = ww & (ww >> 1);
697     const uint32_t include_mask = ~(*exclude_alias_iter++);
698     missing_ct += Popcount01Word(ww & UnpackHalfwordToWord(include_mask));
699   }
700   return missing_ct;
701 }
702 
703 
CollapsedSampleFmtidInit(const uintptr_t * sample_include,const SampleIdInfo * siip,uint32_t sample_ct,uint32_t include_fid,uint32_t include_sid,uintptr_t max_sample_fmtid_blen,char * collapsed_sample_fmtids_iter)704 void CollapsedSampleFmtidInit(const uintptr_t* sample_include, const SampleIdInfo* siip, uint32_t sample_ct, uint32_t include_fid, uint32_t include_sid, uintptr_t max_sample_fmtid_blen, char* collapsed_sample_fmtids_iter) {
705   const char* sample_ids = siip->sample_ids;
706   const char* sids = siip->sids;
707   const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
708   uintptr_t max_sid_blen = 0;
709   if (include_sid) {
710     max_sid_blen = sids? siip->max_sid_blen : 2;
711   }
712   uintptr_t sample_uidx_base = 0;
713   uintptr_t cur_bits = sample_include[0];
714   for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
715     const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
716     const char* cur_sample_id = &(sample_ids[sample_uidx * max_sample_id_blen]);
717     if (!include_fid) {
718       cur_sample_id = AdvPastDelim(cur_sample_id, '\t');
719     }
720     char* write_iter = strcpya(collapsed_sample_fmtids_iter, cur_sample_id);
721     if (include_sid) {
722       *write_iter++ = '\t';
723       if (sids) {
724         strcpy(write_iter, &(sids[sample_uidx * max_sid_blen]));
725       } else {
726         strcpy_k(write_iter, "0");
727       }
728     } else {
729       *write_iter = '\0';
730     }
731     collapsed_sample_fmtids_iter = &(collapsed_sample_fmtids_iter[max_sample_fmtid_blen]);
732   }
733 }
734 
CollapsedSampleFmtidInitAlloc(const uintptr_t * sample_include,const SampleIdInfo * siip,uint32_t sample_ct,uint32_t include_fid,uint32_t include_sid,char ** collapsed_sample_fmtids_ptr,uintptr_t * max_sample_fmtid_blen_ptr)735 BoolErr CollapsedSampleFmtidInitAlloc(const uintptr_t* sample_include, const SampleIdInfo* siip, uint32_t sample_ct, uint32_t include_fid, uint32_t include_sid, char** collapsed_sample_fmtids_ptr, uintptr_t* max_sample_fmtid_blen_ptr) {
736   const uintptr_t max_sample_fmtid_blen = GetMaxSampleFmtidBlen(siip, include_fid, include_sid);
737   *max_sample_fmtid_blen_ptr = max_sample_fmtid_blen;
738   // might add sample_fmtid_map later
739   if (unlikely(bigstack_alloc_c(max_sample_fmtid_blen * sample_ct, collapsed_sample_fmtids_ptr))) {
740     return 1;
741   }
742   CollapsedSampleFmtidInit(sample_include, siip, sample_ct, include_fid, include_sid, max_sample_fmtid_blen, *collapsed_sample_fmtids_ptr);
743   return 0;
744 }
745 
OnlyOneFid(const uintptr_t * sample_include,const SampleIdInfo * siip,uint32_t sample_ct)746 uint32_t OnlyOneFid(const uintptr_t* sample_include, const SampleIdInfo* siip, uint32_t sample_ct) {
747   if (!(siip->flags & kfSampleIdFidPresent)) {
748     return 1;
749   }
750   const char* sample_ids = siip->sample_ids;
751   const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
752   uintptr_t sample_uidx_base = 0;
753   uintptr_t cur_bits = sample_include[0];
754   const uintptr_t first_sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
755   const char* fid_start = &(sample_ids[first_sample_uidx * max_sample_id_blen]);
756   const uintptr_t fid_blen = AdvPastDelim(fid_start, '\t') - fid_start;
757   for (uint32_t sample_idx = 1; sample_idx != sample_ct; ++sample_idx) {
758     const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
759     const char* cur_fid_start = &(sample_ids[sample_uidx * max_sample_id_blen]);
760     if (!memequal(fid_start, cur_fid_start, fid_blen)) {
761       return 0;
762     }
763   }
764   return 1;
765 }
766 
GetMajIdxMulti(const double * cur_allele_freqs,uint32_t cur_allele_ct)767 uint32_t GetMajIdxMulti(const double* cur_allele_freqs, uint32_t cur_allele_ct) {
768   assert(cur_allele_ct > 2);
769   double max_freq = cur_allele_freqs[1];
770   if (max_freq >= 0.5) {
771     return 1;
772   }
773   double tot_nonlast_freq = cur_allele_freqs[0];
774   uint32_t maj_allele_idx = 1;
775   if (tot_nonlast_freq >= max_freq) {
776     maj_allele_idx = 0;
777     max_freq = tot_nonlast_freq;
778   }
779   tot_nonlast_freq += max_freq;
780   const uint32_t cur_allele_ct_m1 = cur_allele_ct - 1;
781   for (uint32_t allele_idx = 2; allele_idx != cur_allele_ct_m1; ++allele_idx) {
782     const double cur_freq = cur_allele_freqs[allele_idx];
783     if (cur_freq > max_freq) {
784       maj_allele_idx = allele_idx;
785       max_freq = cur_freq;
786     }
787     tot_nonlast_freq += cur_freq;
788   }
789   if (max_freq + tot_nonlast_freq < 1.0 - kSmallEpsilon) {
790     return cur_allele_ct_m1;
791   }
792   return maj_allele_idx;
793 }
794 
795 // forced SID '0' if sids == nullptr
796 // ok for sample_augid_map_ptr == nullptr
AugidInitAlloc(const uintptr_t * sample_include,const SampleIdInfo * siip,uint32_t sample_ct,uint32_t ** sample_augid_map_ptr,char ** sample_augids_ptr,uintptr_t * max_sample_augid_blen_ptr)797 PglErr AugidInitAlloc(const uintptr_t* sample_include, const SampleIdInfo* siip, uint32_t sample_ct, uint32_t** sample_augid_map_ptr, char** sample_augids_ptr, uintptr_t* max_sample_augid_blen_ptr) {
798   const char* sample_ids = siip->sample_ids;
799   const char* sids = siip->sids;
800   const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
801   uintptr_t max_sid_blen = siip->max_sid_blen;
802   if (!sids) {
803     max_sid_blen = 2;
804   }
805   const uintptr_t max_sample_augid_blen = max_sample_id_blen + max_sid_blen;
806   *max_sample_augid_blen_ptr = max_sample_augid_blen;
807   uint32_t* sample_augid_map = nullptr;
808   if (sample_augid_map_ptr) {
809     if (unlikely(bigstack_alloc_u32(sample_ct, sample_augid_map_ptr))) {
810       return kPglRetNomem;
811     }
812     sample_augid_map = *sample_augid_map_ptr;
813   }
814   if (unlikely(bigstack_alloc_c(max_sample_augid_blen * sample_ct, sample_augids_ptr))) {
815     return kPglRetNomem;
816   }
817   char* sample_augids_iter = *sample_augids_ptr;
818   uintptr_t sample_uidx_base = 0;
819   uintptr_t cur_bits = sample_include[0];
820   for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
821     const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
822     char* write_iter = strcpyax(sample_augids_iter, &(sample_ids[sample_uidx * max_sample_id_blen]), '\t');
823     if (sids) {
824       strcpy(write_iter, &(sids[sample_uidx * max_sid_blen]));
825     } else {
826       strcpy_k(write_iter, "0");
827     }
828     sample_augids_iter = &(sample_augids_iter[max_sample_augid_blen]);
829     if (sample_augid_map) {
830       sample_augid_map[sample_idx] = sample_uidx;
831     }
832   }
833   return kPglRetSuccess;
834 }
835 
SortedXidboxInitAlloc(const uintptr_t * sample_include,const SampleIdInfo * siip,uint32_t sample_ct,uint32_t allow_dups,XidMode xid_mode,uint32_t use_nsort,char ** sorted_xidbox_ptr,uint32_t ** xid_map_ptr,uintptr_t * max_xid_blen_ptr)836 PglErr SortedXidboxInitAlloc(const uintptr_t* sample_include, const SampleIdInfo* siip, uint32_t sample_ct, uint32_t allow_dups, XidMode xid_mode, uint32_t use_nsort, char** sorted_xidbox_ptr, uint32_t** xid_map_ptr, uintptr_t* max_xid_blen_ptr) {
837   if (!(xid_mode & kfXidModeFlagSid)) {
838     // two fields
839     *max_xid_blen_ptr = siip->max_sample_id_blen;
840     return CopySortStrboxSubset(sample_include, siip->sample_ids, sample_ct, siip->max_sample_id_blen, allow_dups, 0, use_nsort, sorted_xidbox_ptr, xid_map_ptr);
841   }
842   // three fields
843   if (unlikely(AugidInitAlloc(sample_include, siip, sample_ct, xid_map_ptr, sorted_xidbox_ptr, max_xid_blen_ptr))) {
844     return kPglRetNomem;
845   }
846   if (unlikely(SortStrboxIndexed(sample_ct, *max_xid_blen_ptr, use_nsort, *sorted_xidbox_ptr, *xid_map_ptr))) {
847     return kPglRetNomem;
848   }
849   if (!allow_dups) {
850     char* dup_id = ScanForDuplicateIds(*sorted_xidbox_ptr, sample_ct, *max_xid_blen_ptr);
851     if (unlikely(dup_id)) {
852       char* tab_iter = AdvToDelim(dup_id, '\t');
853       *tab_iter = ' ';
854       tab_iter = AdvToDelim(&(tab_iter[1]), '\t');
855       *tab_iter = ' ';
856       logerrprintfww("Error: Duplicate ID '%s'.\n", dup_id);
857       return kPglRetMalformedInput;
858     }
859   }
860   return kPglRetSuccess;
861 }
862 
XidRead(uintptr_t max_xid_blen,uint32_t comma_delim,XidMode xid_mode,const char ** read_pp,char * __restrict idbuf)863 uint32_t XidRead(uintptr_t max_xid_blen, uint32_t comma_delim, XidMode xid_mode, const char** read_pp, char* __restrict idbuf) {
864   // Saves normalized tokens pointed to by *read_pp to idbuf, and advances
865   // *read_pp.
866   //
867   // Input *read_pp must point to beginning of sample ID; this is a change from
868   // plink 1.9.
869   //
870   // *read_pp is now set to point to the end of the last parsed token instead
871   // of the beginning of the next; this is another change from plink 1.9.
872   //
873   // Returns 0 on missing token *or* guaranteed sample ID mismatch (longer than
874   // max_xid_blen); cases can be distinguished by checking whether *read_pp ==
875   // nullptr.  Otherwise, returns slen of the parsed ID saved to idbuf.  (idbuf
876   // is not null-terminated.)
877   //
878   // Alpha 2 behavior change: If there's no FID column, it's now set to 0
879   // instead of the IID value.
880   const char* first_token_start = *read_pp;
881   uintptr_t slen_fid = 0;  // now zero if no FID column
882   uintptr_t blen_sid = 0;
883   const char* sid_ptr = nullptr;
884   const char* token_iter;
885   const char* iid_ptr;
886   uintptr_t slen_iid;
887   if (comma_delim) {
888     token_iter = first_token_start;
889     unsigned char ucc = *token_iter;
890     while (ucc != ',') {
891       if (ucc < 32) {
892         if (unlikely(!(xid_mode & kfXidModeFlagOneTokenOk))) {
893           *read_pp = nullptr;
894           return 0;
895         }
896         goto XidRead_comma_single_token;
897       }
898       ucc = *(++token_iter);
899     }
900     if (xid_mode & kfXidModeFlagNeverFid) {
901     XidRead_comma_single_token:
902       iid_ptr = first_token_start;
903       slen_iid = token_iter - first_token_start;
904     } else {
905       slen_fid = token_iter - first_token_start;
906       do {
907         ucc = *(++token_iter);
908       } while ((ucc == ' ') || (ucc == '\t'));
909       iid_ptr = token_iter;
910       while ((ucc >= 32) && (ucc != ',')) {
911         ucc = *(++token_iter);
912       }
913       slen_iid = token_iter - iid_ptr;
914     }
915     // token_iter now points to comma/eoln at end of IID
916     if (xid_mode & kfXidModeFlagSid) {
917       if (*token_iter != ',') {
918         return 0;
919       }
920       do {
921         ucc = *(++token_iter);
922       } while ((ucc == ' ') || (ucc == '\t'));
923       sid_ptr = token_iter;
924       while ((ucc >= 32) && (ucc != ',')) {
925         ucc = *(++token_iter);
926       }
927       blen_sid = 1 + S_CAST(uintptr_t, token_iter - sid_ptr);
928       if (token_iter == sid_ptr) {
929         // special case: treat missing SID as '0'
930         blen_sid = 2;
931         sid_ptr = &(g_one_char_strs[96]);
932       }
933     }
934   } else {
935     assert(!IsEolnKns(*first_token_start));
936     token_iter = CurTokenEnd(first_token_start);
937     if (xid_mode & kfXidModeFlagNeverFid) {
938     XidRead_space_single_token:
939       iid_ptr = first_token_start;
940       slen_iid = token_iter - first_token_start;
941     } else {
942       slen_fid = token_iter - first_token_start;
943       token_iter = FirstNonTspace(token_iter);
944       if (IsEolnKns(*token_iter)) {
945         if (unlikely(!(xid_mode & kfXidModeFlagOneTokenOk))) {
946           *read_pp = nullptr;
947           return 0;
948         }
949         // need to backtrack
950         token_iter = &(first_token_start[slen_fid]);
951         // bugfix (26 Feb 2018): forgot to zero this
952         slen_fid = 0;
953         goto XidRead_space_single_token;
954       }
955       iid_ptr = token_iter;
956       token_iter = CurTokenEnd(token_iter);
957       slen_iid = token_iter - iid_ptr;
958     }
959     // token_iter now points to space/eoln at end of IID
960     if (xid_mode & kfXidModeFlagSid) {
961       token_iter = FirstNonTspace(token_iter);
962       if (unlikely(IsEolnKns(*token_iter))) {
963         *read_pp = nullptr;
964         return 0;
965       }
966       sid_ptr = token_iter;
967       token_iter = CurTokenEnd(token_iter);
968       blen_sid = 1 + S_CAST(uintptr_t, token_iter - sid_ptr);
969     }
970   }
971   *read_pp = token_iter;
972   const uintptr_t slen_final = slen_fid + (slen_fid == 0) + slen_iid + blen_sid + 1;
973   if (slen_final >= max_xid_blen) {
974     // avoid buffer overflow
975     return 0;
976   }
977   char* idbuf_iter = idbuf;
978   if (slen_fid) {
979     idbuf_iter = memcpya(idbuf, first_token_start, slen_fid);
980   } else {
981     *idbuf_iter++ = '0';
982   }
983   *idbuf_iter++ = '\t';
984   idbuf_iter = memcpya(idbuf_iter, iid_ptr, slen_iid);
985   if (blen_sid) {
986     *idbuf_iter++ = '\t';
987     memcpy(idbuf_iter, sid_ptr, blen_sid - 1);
988   }
989   return slen_final;
990 }
991 
SortedXidboxReadMultifind(const char * __restrict sorted_xidbox,uintptr_t max_xid_blen,uintptr_t xid_ct,uint32_t comma_delim,XidMode xid_mode,const char ** read_pp,uint32_t * __restrict xid_idx_start_ptr,uint32_t * __restrict xid_idx_end_ptr,char * __restrict idbuf)992 BoolErr SortedXidboxReadMultifind(const char* __restrict sorted_xidbox, uintptr_t max_xid_blen, uintptr_t xid_ct, uint32_t comma_delim, XidMode xid_mode, const char** read_pp, uint32_t* __restrict xid_idx_start_ptr, uint32_t* __restrict xid_idx_end_ptr, char* __restrict idbuf) {
993   // sorted_xidbox = packed, sorted list of ID strings to search over.
994   const uint32_t slen_final = XidRead(max_xid_blen, comma_delim, xid_mode, read_pp, idbuf);
995   if (!slen_final) {
996     return 1;
997   }
998   const uint32_t lb_idx = bsearch_str_lb(idbuf, sorted_xidbox, slen_final, max_xid_blen, xid_ct);
999   idbuf[slen_final] = ' ';
1000   const uint32_t ub_idx = ExpsearchStrLb(idbuf, sorted_xidbox, slen_final + 1, max_xid_blen, xid_ct, lb_idx);
1001   if (lb_idx == ub_idx) {
1002     return 1;
1003   }
1004   *xid_idx_start_ptr = lb_idx;
1005   *xid_idx_end_ptr = ub_idx;
1006   return 0;
1007 }
1008 
LoadXidHeader(const char * flag_name,XidHeaderFlags xid_header_flags,uintptr_t * line_idx_ptr,TextStream * txsp,XidMode * xid_mode_ptr,char ** line_startp,char ** line_iterp)1009 PglErr LoadXidHeader(const char* flag_name, XidHeaderFlags xid_header_flags, uintptr_t* line_idx_ptr, TextStream* txsp, XidMode* xid_mode_ptr, char** line_startp, char** line_iterp) {
1010   // possible todo: support comma delimiter
1011   uintptr_t line_idx = *line_idx_ptr;
1012   char* line_iter;
1013   uint32_t is_header_line;
1014   do {
1015     ++line_idx;
1016     line_iter = TextGet(txsp);
1017     // eof may be ok
1018     if (!line_iter) {
1019       return TextStreamRawErrcode(txsp);
1020     }
1021     is_header_line = (line_iter[0] == '#');
1022   } while (is_header_line && (!tokequal_k(&(line_iter[1]), "FID")) && (!tokequal_k(&(line_iter[1]), "IID")));
1023   if (line_startp) {
1024     *line_startp = line_iter;
1025   }
1026   XidMode xid_mode = kfXidMode0;
1027   if (is_header_line) {
1028     // The following header leading columns are supported:
1029     // #FID IID
1030     // #FID IID SID (SID ignored if kfXidHeaderIgnoreSid set)
1031     // #IID
1032     // #IID SID
1033     char* linebuf_iter = &(line_iter[4]);
1034     if (line_iter[1] == 'I') {
1035       xid_mode = kfXidModeFlagNeverFid;
1036     } else {
1037       linebuf_iter = FirstNonTspace(linebuf_iter);
1038       if (unlikely(!tokequal_k(linebuf_iter, "IID"))) {
1039         logerrprintf("Error: No IID column on line %" PRIuPTR " of --%s file.\n", line_idx, flag_name);
1040         return kPglRetMalformedInput;
1041       }
1042       linebuf_iter = &(linebuf_iter[3]);
1043     }
1044     linebuf_iter = FirstNonTspace(linebuf_iter);
1045     if (tokequal_k(linebuf_iter, "SID")) {
1046       if (!(xid_header_flags & kfXidHeaderIgnoreSid)) {
1047         xid_mode |= kfXidModeFlagSid;
1048       }
1049       linebuf_iter = FirstNonTspace(&(linebuf_iter[3]));
1050     } else if (xid_mode == kfXidModeFlagNeverFid) {
1051       xid_mode = kfXidModeIid;
1052     }
1053     line_iter = linebuf_iter;
1054   } else {
1055     xid_mode = (xid_header_flags & kfXidHeaderFixedWidth)? kfXidModeFidIid : kfXidModeFidIidOrIid;
1056   }
1057   if (line_iterp) {
1058     *line_iterp = line_iter;
1059   }
1060   *line_idx_ptr = line_idx;
1061   *xid_mode_ptr = xid_mode;
1062   return kPglRetSuccess;
1063 }
1064 
OpenAndLoadXidHeader(const char * fname,const char * flag_name,XidHeaderFlags xid_header_flags,uint32_t max_line_blen,TextStream * txsp,XidMode * xid_mode_ptr,uintptr_t * line_idx_ptr,char ** line_startp,char ** line_iterp)1065 PglErr OpenAndLoadXidHeader(const char* fname, const char* flag_name, XidHeaderFlags xid_header_flags, uint32_t max_line_blen, TextStream* txsp, XidMode* xid_mode_ptr, uintptr_t* line_idx_ptr, char** line_startp, char** line_iterp) {
1066   PglErr reterr = InitTextStream(fname, max_line_blen, 1, txsp);
1067   // remove unlikely() if any caller treats eof as ok
1068   if (unlikely(reterr)) {
1069     return reterr;
1070   }
1071   *line_idx_ptr = 0;
1072   return LoadXidHeader(flag_name, xid_header_flags, line_idx_ptr, txsp, xid_mode_ptr, line_startp, line_iterp);
1073 }
1074 
LoadXidHeaderPair(const char * flag_name,uint32_t sid_over_fid,uintptr_t * line_idx_ptr,TextStream * txsp,XidMode * xid_mode_ptr,char ** line_startp,char ** line_iterp)1075 PglErr LoadXidHeaderPair(const char* flag_name, uint32_t sid_over_fid, uintptr_t* line_idx_ptr, TextStream* txsp, XidMode* xid_mode_ptr, char** line_startp, char** line_iterp) {
1076   uintptr_t line_idx = *line_idx_ptr;
1077   char* line_iter;
1078   uint32_t is_header_line;
1079   do {
1080     ++line_idx;
1081     line_iter = TextGet(txsp);
1082     // eof may be ok
1083     if (!line_iter) {
1084       return TextStreamRawErrcode(txsp);
1085     }
1086     is_header_line = (line_iter[0] == '#');
1087   } while (is_header_line && (!tokequal_k(&(line_iter[1]), "FID1")) && (!tokequal_k(&(line_iter[1]), "ID1")) && (!tokequal_k(&(line_iter[1]), "IID1")));
1088   if (line_startp) {
1089     *line_startp = line_iter;
1090   }
1091   XidMode xid_mode = kfXidMode0;
1092   if (is_header_line) {
1093     // The following header leading columns are supported:
1094     // #FID1 {I}ID1 FID2 {I}ID2
1095     // #FID1 {I}ID1 SID1 FID2 {I}ID2 SID2
1096     // #{I}ID1 {I}ID2
1097     // #{I}ID1 SID1 {I}ID2 SID2
1098     char* linebuf_iter = &(line_iter[4]);
1099     if (line_iter[1] == 'I') {
1100       xid_mode = kfXidModeFlagNeverFid;
1101     } else {
1102       linebuf_iter = FirstNonTspace(linebuf_iter);
1103       if (tokequal_k(linebuf_iter, "ID1")) {
1104         linebuf_iter = &(linebuf_iter[3]);
1105       } else if (likely(tokequal_k(linebuf_iter, "IID1"))) {
1106         linebuf_iter = &(linebuf_iter[4]);
1107       } else {
1108         logerrprintf("Error: No [I]ID1 column on line %" PRIuPTR " of --%s file.\n", line_idx, flag_name);
1109         return kPglRetMalformedInput;
1110       }
1111     }
1112     linebuf_iter = FirstNonTspace(linebuf_iter);
1113     if (tokequal_k(linebuf_iter, "SID1")) {
1114       xid_mode |= kfXidModeFlagSid;
1115       linebuf_iter = FirstNonTspace(&(linebuf_iter[4]));
1116     }
1117     // Now verify the expected ID2 token sequence follows.
1118     if (!(xid_mode & kfXidModeFlagNeverFid)) {
1119       if (unlikely(!tokequal_k(linebuf_iter, "FID2"))) {
1120         logerrprintf("Error: No FID2 column on line %" PRIuPTR " of --%s file.\n", line_idx, flag_name);
1121         return kPglRetMalformedInput;
1122       }
1123       linebuf_iter = FirstNonTspace(&(linebuf_iter[4]));
1124     }
1125     if (tokequal_k(linebuf_iter, "ID2")) {
1126       linebuf_iter = &(linebuf_iter[3]);
1127     } else if (likely(tokequal_k(linebuf_iter, "IID2"))) {
1128       linebuf_iter = &(linebuf_iter[4]);
1129     } else {
1130       logerrprintf("Error: No [I]ID2 column on line %" PRIuPTR " of --%s file.\n", line_idx, flag_name);
1131       return kPglRetMalformedInput;
1132     }
1133     if (xid_mode & kfXidModeFlagSid) {
1134       linebuf_iter = FirstNonTspace(linebuf_iter);
1135       if (unlikely(!tokequal_k(linebuf_iter, "SID2"))) {
1136         logerrprintf("Error: No SID2 column on line %" PRIuPTR " of --%s file.\n", line_idx, flag_name);
1137         return kPglRetMalformedInput;
1138       }
1139       linebuf_iter = &(linebuf_iter[4]);
1140     }
1141     line_iter = linebuf_iter;
1142   } else {
1143     // Assume the file contains nothing but ID pairs.
1144     const uint32_t token_ct = CountTokens(line_iter);
1145     if (token_ct == 2) {
1146       xid_mode = kfXidModeIid;
1147     } else if (token_ct == 4) {
1148       xid_mode = sid_over_fid? kfXidModeIidSid : kfXidModeFidIid;
1149     } else if (likely(token_ct == 6)) {
1150       xid_mode = kfXidModeFidIidSid;
1151     } else {
1152       logerrprintf("Error: Unexpected token count on line %" PRIuPTR " of --%s file.\n", line_idx, flag_name);
1153       return kPglRetMalformedInput;
1154     }
1155   }
1156   if (line_iter) {
1157     *line_iterp = line_iter;
1158   }
1159   *line_idx_ptr = line_idx;
1160   *xid_mode_ptr = xid_mode;
1161   return kPglRetSuccess;
1162 }
1163 
1164 
1165 const char g_xymt_log_names[kChrOffsetCt][5] = {"chrX", "chrY", "XY", "chrM", "PAR1", "PAR2"};
1166 
1167 static_assert(!(kChrRawEnd % kBytesPerVec), "kChrRawEnd must be a multiple of kBytesPerVec.");
InitChrInfo(ChrInfo * cip)1168 PglErr InitChrInfo(ChrInfo* cip) {
1169   // "constructor".  initializes with maximum capacity.  doesn't use bigstack.
1170   // chr_mask, haploid_mask: bits
1171   // chr_file_order, chr_idx_to_foidx: uint32s
1172   // chr_fo_vidx_start: uint32s, with an extra trailing element
1173   // nonstd_names: intptr_ts
1174   // nonstd_id_htable: kChrHtableSize uint32s
1175 
1176   // this assumes kChrRawEnd is divisible by kBytesPerVec
1177   const uintptr_t vecs_required = 2 * BitCtToVecCt(kChrRawEnd) + 3 * (kChrRawEnd / kInt32PerVec) + 1 + (kChrRawEnd / kWordsPerVec) + Int32CtToVecCt(kChrHtableSize);
1178 
1179   // needed for proper cleanup
1180   cip->name_ct = 0;
1181   cip->incl_excl_name_stack = nullptr;
1182   if (unlikely(vecaligned_malloc(vecs_required * kBytesPerVec, &(cip->chr_mask)))) {
1183     return kPglRetNomem;
1184   }
1185   uintptr_t* alloc_iter = &(cip->chr_mask[BitCtToVecCt(kChrRawEnd) * kWordsPerVec]);
1186   cip->haploid_mask = alloc_iter;
1187   alloc_iter = &(alloc_iter[BitCtToVecCt(kChrRawEnd) * kWordsPerVec]);
1188   cip->chr_file_order = R_CAST(uint32_t*, alloc_iter);
1189   alloc_iter = &(alloc_iter[(kChrRawEnd / kInt32PerVec) * kWordsPerVec]);
1190   cip->chr_fo_vidx_start = R_CAST(uint32_t*, alloc_iter);
1191   alloc_iter = &(alloc_iter[((kChrRawEnd / kInt32PerVec) + 1) * kWordsPerVec]);
1192   cip->chr_idx_to_foidx = R_CAST(uint32_t*, alloc_iter);
1193   alloc_iter = &(alloc_iter[(kChrRawEnd / kInt32PerVec) * kWordsPerVec]);
1194   cip->nonstd_names = R_CAST(const char**, alloc_iter);
1195   alloc_iter = &(alloc_iter[kChrRawEnd]);
1196   cip->nonstd_id_htable = R_CAST(uint32_t*, alloc_iter);
1197   // alloc_iter = &(alloc_iter[((kChrHtableSize + (kInt32PerVec - 1)) / kInt32PerVec) * kWordsPerVec]);
1198   // SetAllU32Arr(kChrHtableSize, cip->nonstd_id_htable);
1199 
1200   ZeroWArr(kChrMaskWords, cip->chr_mask);
1201   ZeroWArr(kChrExcludeWords, cip->chr_exclude);
1202 
1203   // This is a change from plink 1.x.  MT > M since the former matches Ensembl,
1204   // while the latter doesn't match any major resource.  no "chr" to reduce
1205   // file sizes and reduce the impact of this change.
1206   cip->output_encoding = kfChrOutputMT;
1207 
1208   cip->zero_extra_chrs = 0;
1209   cip->is_include_stack = 0;
1210   cip->chrset_source = kChrsetSourceDefault;
1211   cip->autosome_ct = 22;
1212   for (uint32_t xymt_idx = 0; xymt_idx != kChrOffsetCt; ++xymt_idx) {
1213     cip->xymt_codes[xymt_idx] = 23 + xymt_idx;
1214   }
1215   // Now includes MT again, as of alpha 2.
1216   cip->haploid_mask[0] = 0x5800000;
1217   ZeroWArr(kChrMaskWords - 1, &(cip->haploid_mask[1]));
1218   return kPglRetSuccess;
1219 }
1220 
1221 // explicit plink 1.07 species (now initialized by command line parser):
1222 // human: 22, X, Y, XY, MT, PAR1, PAR2 (PAR1/PAR2 added, XY deprecated in plink
1223 //   2.0)
1224 // cow: 29, X, Y, MT
1225 // dog: 38, X, Y, XY, MT
1226 // horse: 31, X, Y
1227 // mouse: 19, X, Y
1228 // rice: 12
1229 // sheep: 26, X, Y
1230 
1231 // must be safe to call this twice.
FinalizeChrset(MiscFlags misc_flags,ChrInfo * cip)1232 void FinalizeChrset(MiscFlags misc_flags, ChrInfo* cip) {
1233   uint32_t autosome_ct = cip->autosome_ct;
1234   uint32_t max_code = autosome_ct;
1235   for (uint32_t xymt_idx_p1 = kChrOffsetCt; xymt_idx_p1; --xymt_idx_p1) {
1236     if (!IsI32Neg(cip->xymt_codes[xymt_idx_p1 - 1])) {
1237       max_code = autosome_ct + xymt_idx_p1;
1238       break;
1239     }
1240   }
1241 
1242   // could initialize haploid_mask bits (after the first) here, instead of
1243   // earlier...
1244 
1245   cip->max_numeric_code = MINV(max_code, autosome_ct + 4);
1246   cip->max_code = max_code;
1247   uintptr_t* chr_mask = cip->chr_mask;
1248   uintptr_t last_chr_mask_word = chr_mask[kChrMaskWords - 1];
1249   STD_ARRAY_REF(uint32_t, kChrOffsetCt) xymt_codes = cip->xymt_codes;
1250   if (last_chr_mask_word) {
1251     // avoids repeating some work if this is called twice
1252     chr_mask[kChrMaskWords - 1] = 0;
1253 
1254     uint32_t xymt_include = last_chr_mask_word >> (kBitsPerWord - kChrOffsetCt);
1255     do {
1256       const uint32_t xymt_idx = ctzu32(xymt_include);
1257       const uint32_t cur_chr_code = xymt_codes[xymt_idx];
1258       if (!IsI32Neg(cur_chr_code)) {
1259         SetBit(cur_chr_code, chr_mask);
1260       }
1261       xymt_include &= xymt_include - 1;
1262     } while (xymt_include);
1263   } else if (AllWordsAreZero(chr_mask, kChrExcludeWords) && (!cip->is_include_stack)) {
1264     // init_default_chr_mask()
1265     SetAllBits(cip->autosome_ct + 1, chr_mask);
1266     for (uint32_t xymt_idx = 0; xymt_idx != kChrOffsetCt; ++xymt_idx) {
1267       const uint32_t cur_chr_code = cip->xymt_codes[xymt_idx];
1268       if (!IsI32Neg(cur_chr_code)) {
1269         SetBit(cur_chr_code, chr_mask);
1270       }
1271     }
1272   } else if (misc_flags & (kfMiscAutosomePar | kfMiscAutosomeOnly)) {
1273     FillBitsNz(1, cip->autosome_ct + 1, chr_mask);
1274     ClearBitsNz(cip->autosome_ct + 1, kChrExcludeWords * kBitsPerWord, chr_mask);
1275     if (misc_flags & kfMiscAutosomePar) {
1276       uint32_t par_chr_code = cip->xymt_codes[kChrOffsetXY];
1277       if (!IsI32Neg(par_chr_code)) {
1278         SetBit(par_chr_code, chr_mask);
1279       }
1280       par_chr_code = cip->xymt_codes[kChrOffsetPAR1];
1281       if (!IsI32Neg(par_chr_code)) {
1282         SetBit(par_chr_code, chr_mask);
1283       }
1284       par_chr_code = cip->xymt_codes[kChrOffsetPAR2];
1285       if (!IsI32Neg(par_chr_code)) {
1286         SetBit(par_chr_code, chr_mask);
1287       }
1288     }
1289   }
1290 
1291   uintptr_t* chr_exclude = cip->chr_exclude;
1292   uintptr_t last_chr_exclude_word = chr_exclude[kChrExcludeWords - 1];
1293   uint32_t xymt_exclude = last_chr_exclude_word >> (kBitsPerWord - kChrOffsetCt);
1294   last_chr_exclude_word = bzhi(last_chr_exclude_word, kBitsPerWord - kChrOffsetCt);
1295   for (uint32_t widx = 0; widx != kChrExcludeWords - 1; ++widx) {
1296     chr_mask[widx] &= ~chr_exclude[widx];
1297   }
1298   chr_mask[kChrExcludeWords - 1] &= ~last_chr_exclude_word;
1299   if (xymt_exclude) {
1300     do {
1301       const uint32_t xymt_idx = ctzu32(xymt_exclude);
1302       const uint32_t cur_chr_code = xymt_codes[xymt_idx];
1303       if (!IsI32Neg(cur_chr_code)) {
1304         ClearBit(cur_chr_code, chr_mask);
1305       }
1306       xymt_exclude &= xymt_exclude - 1;
1307     } while (xymt_exclude);
1308   }
1309   SetAllU32Arr(max_code + 1, cip->chr_idx_to_foidx);
1310 }
1311 
ForgetExtraChrNames(uint32_t reinitialize,ChrInfo * cip)1312 void ForgetExtraChrNames(uint32_t reinitialize, ChrInfo* cip) {
1313   const uint32_t name_ct = cip->name_ct;
1314   if (name_ct) {
1315     const char** nonstd_names = cip->nonstd_names;
1316     const uint32_t chr_idx_end = cip->max_code + 1 + name_ct;
1317     for (uint32_t chr_idx = cip->max_code + 1; chr_idx != chr_idx_end; ++chr_idx) {
1318       free_const(nonstd_names[chr_idx]);
1319       nonstd_names[chr_idx] = nullptr;
1320     }
1321     if (reinitialize) {
1322       // SetAllU32Arr(kChrHtableSize, cip->nonstd_id_htable);
1323       cip->name_ct = 0;
1324     }
1325   }
1326 }
1327 
1328 // not currently called.  might want to do so in the future.
FinalizeChrInfo(ChrInfo * cip)1329 PglErr FinalizeChrInfo(ChrInfo* cip) {
1330   const uint32_t chr_ct = cip->chr_ct;
1331   const uint32_t name_ct = cip->name_ct;
1332   const uint32_t chr_code_end = cip->max_code + 1 + name_ct;
1333   const uint32_t chr_code_bitvec_ct = BitCtToVecCt(chr_code_end);
1334   const uint32_t chr_ct_int32vec_ct = Int32CtToVecCt(chr_ct);
1335   const uint32_t chr_ct_p1_int32vec_ct = 1 + (chr_ct / kInt32PerVec);
1336   const uint32_t chr_code_end_int32vec_ct = Int32CtToVecCt(chr_code_end);
1337   const uint32_t chr_code_end_wordvec_ct = WordCtToVecCt(chr_code_end);
1338   uint32_t final_vecs_required = 2 * chr_code_bitvec_ct + chr_ct_int32vec_ct + chr_ct_p1_int32vec_ct + chr_code_end_int32vec_ct;
1339   if (name_ct) {
1340     final_vecs_required += chr_code_end_wordvec_ct + Int32CtToVecCt(kChrHtableSize);
1341   }
1342   uintptr_t* new_alloc;
1343   if (unlikely(vecaligned_malloc(final_vecs_required * kBytesPerVec, &new_alloc))) {
1344     return kPglRetNomem;
1345   }
1346   uintptr_t* old_alloc = cip->chr_mask;
1347   uintptr_t* new_alloc_iter = new_alloc;
1348 
1349   memcpy(new_alloc_iter, cip->chr_mask, chr_code_bitvec_ct * kBytesPerVec);
1350   cip->chr_mask = new_alloc_iter;
1351   new_alloc_iter = &(new_alloc_iter[chr_code_bitvec_ct * kWordsPerVec]);
1352 
1353   memcpy(new_alloc_iter, cip->haploid_mask, chr_code_bitvec_ct * kBytesPerVec);
1354   cip->haploid_mask = new_alloc_iter;
1355   new_alloc_iter = &(new_alloc_iter[chr_code_bitvec_ct * kWordsPerVec]);
1356 
1357   memcpy(new_alloc_iter, cip->chr_file_order, chr_ct_int32vec_ct * kBytesPerVec);
1358   cip->chr_file_order = R_CAST(uint32_t*, new_alloc_iter);
1359   new_alloc_iter = &(new_alloc_iter[chr_ct_int32vec_ct * kWordsPerVec]);
1360 
1361   memcpy(new_alloc_iter, cip->chr_fo_vidx_start, chr_ct_p1_int32vec_ct * kBytesPerVec);
1362   cip->chr_fo_vidx_start = R_CAST(uint32_t*, new_alloc_iter);
1363   new_alloc_iter = &(new_alloc_iter[chr_ct_p1_int32vec_ct * kWordsPerVec]);
1364 
1365   memcpy(new_alloc_iter, cip->chr_idx_to_foidx, chr_code_end_int32vec_ct * kBytesPerVec);
1366   cip->chr_idx_to_foidx = R_CAST(uint32_t*, new_alloc_iter);
1367 
1368   if (!name_ct) {
1369     cip->nonstd_names = nullptr;
1370     cip->nonstd_id_htable = nullptr;
1371   } else {
1372     new_alloc_iter = &(new_alloc_iter[chr_code_end_int32vec_ct * kWordsPerVec]);
1373 
1374     memcpy(new_alloc_iter, cip->nonstd_names, chr_code_end_wordvec_ct * kBytesPerVec);
1375     cip->nonstd_names = R_CAST(const char**, new_alloc_iter);
1376     new_alloc_iter = &(new_alloc_iter[chr_code_end_wordvec_ct * kWordsPerVec]);
1377 
1378     memcpy(new_alloc_iter, cip->nonstd_id_htable, kChrHtableSize * sizeof(int32_t));
1379     cip->nonstd_id_htable = R_CAST(uint32_t*, new_alloc_iter);
1380   }
1381   vecaligned_free(old_alloc);
1382   return kPglRetSuccess;
1383 }
1384 
CleanupChrInfo(ChrInfo * cip)1385 void CleanupChrInfo(ChrInfo* cip) {
1386   if (cip->chr_mask) {
1387     ForgetExtraChrNames(0, cip);
1388     vecaligned_free(cip->chr_mask);
1389     cip->chr_mask = nullptr;
1390   }
1391   LlStr* llstr_ptr = cip->incl_excl_name_stack;
1392   while (llstr_ptr) {
1393     LlStr* next_ptr = llstr_ptr->next;
1394     free(llstr_ptr);
1395     llstr_ptr = next_ptr;
1396   }
1397   cip->incl_excl_name_stack = nullptr;
1398 }
1399 
ChrNameStd(const ChrInfo * cip,uint32_t chr_idx,char * buf)1400 char* ChrNameStd(const ChrInfo* cip, uint32_t chr_idx, char* buf) {
1401   const uint32_t output_encoding = cip->output_encoding;
1402   if (chr_idx > cip->max_numeric_code) {
1403     // This is usually encoding-independent; no real numeric representation of
1404     // PAR1/PAR2 is defined.  However, since there'd otherwise be no way to
1405     // include the pseudoautosomal regions in e.g. ADMIXTURE, we render them
1406     // them as 25 (in the human case) when "--output-chr 26" is specified.
1407     if (output_encoding) {
1408       const uint32_t parx = 'P' + ('A' << 8) + ('R' << 16) + ('0' << 24) + ((chr_idx - cip->max_numeric_code) << 24);
1409       return memcpya(buf, &parx, 4);
1410     }
1411     return u32toa(cip->autosome_ct + (kChrOffsetXY + 1), buf);
1412   }
1413   if (output_encoding & (kfChrOutputPrefix | kfChrOutput0M)) {
1414     if (output_encoding == kfChrOutput0M) {
1415       // force two chars
1416       if (chr_idx <= cip->autosome_ct) {
1417         buf = memcpya_k(buf, &(kDigitPair[chr_idx]), 2);
1418       } else if (chr_idx == cip->xymt_codes[kChrOffsetY]) {
1419         buf = strcpya_k(buf, "XY");
1420       } else {
1421         *buf++ = '0';
1422         if (chr_idx == cip->xymt_codes[kChrOffsetX]) {
1423           *buf++ = 'X';
1424         } else {
1425           // assumes only X/Y/XY/MT defined
1426           *buf++ = (chr_idx == cip->xymt_codes[kChrOffsetY])? 'Y' : 'M';
1427         }
1428       }
1429       return buf;
1430     }
1431     buf = strcpya_k(buf, "chr");
1432   }
1433   if ((!(output_encoding & (kfChrOutputM | kfChrOutputMT))) || (chr_idx <= cip->autosome_ct)) {
1434     return u32toa(chr_idx, buf);
1435   }
1436   if (chr_idx == cip->xymt_codes[kChrOffsetX]) {
1437     *buf++ = 'X';
1438   } else if (chr_idx == cip->xymt_codes[kChrOffsetY]) {
1439     *buf++ = 'Y';
1440   } else if (chr_idx == cip->xymt_codes[kChrOffsetXY]) {
1441     buf = strcpya_k(buf, "XY");
1442   } else {
1443     *buf++ = 'M';
1444     if (output_encoding & kfChrOutputMT) {
1445       *buf++ = 'T';
1446     }
1447   }
1448   return buf;
1449 }
1450 
chrtoa(const ChrInfo * cip,uint32_t chr_idx,char * buf)1451 char* chrtoa(const ChrInfo* cip, uint32_t chr_idx, char* buf) {
1452   // assumes chr_idx is valid
1453   if (!chr_idx) {
1454     // Note that this never has a 'chr' prefix.  Which is probably fine, it
1455     // isn't a real chromosome.
1456     *buf++ = '0';
1457     return buf;
1458   }
1459   if (chr_idx <= cip->max_code) {
1460     return ChrNameStd(cip, chr_idx, buf);
1461   }
1462   if (cip->zero_extra_chrs) {
1463     *buf++ = '0';
1464     return buf;
1465   }
1466   return strcpya(buf, cip->nonstd_names[chr_idx]);
1467 }
1468 
GetMaxChrSlen(const ChrInfo * cip)1469 uint32_t GetMaxChrSlen(const ChrInfo* cip) {
1470   // does not include trailing null
1471   // can be overestimate
1472   // if more functions start calling this, it should just be built into
1473   // load_bim() instead
1474   if (cip->zero_extra_chrs) {
1475     return 3 + kMaxChrTextnum;
1476   }
1477   const uint32_t chr_ct = cip->chr_ct;
1478   const uint32_t max_code = cip->max_code;
1479   uint32_t max_chr_slen = 3 + kMaxChrTextnum;
1480   for (uint32_t chr_fo_idx = 0; chr_fo_idx != chr_ct; ++chr_fo_idx) {
1481     const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
1482     if (!IsSet(cip->chr_mask, chr_idx)) {
1483       continue;
1484     }
1485     if (chr_idx > max_code) {
1486       const uint32_t name_slen = strlen(cip->nonstd_names[chr_idx]);
1487       if (name_slen > max_chr_slen) {
1488         max_chr_slen = name_slen;
1489       }
1490     }
1491   }
1492   return max_chr_slen;
1493 }
1494 
IsHaploidChrPresent(const ChrInfo * cip)1495 uint32_t IsHaploidChrPresent(const ChrInfo* cip) {
1496   const uintptr_t* chr_mask = cip->chr_mask;
1497   const uintptr_t* haploid_mask = cip->haploid_mask;
1498   // since we don't load haploid vs. diploid info from ##contig header lines,
1499   // this is sufficient
1500   for (uint32_t widx = 0; widx != kChrExcludeWords; ++widx) {
1501     if (chr_mask[widx] & haploid_mask[widx]) {
1502       return 1;
1503     }
1504   }
1505   return 0;
1506 }
1507 
IsAutosomalDiploidChrPresent(const ChrInfo * cip)1508 uint32_t IsAutosomalDiploidChrPresent(const ChrInfo* cip) {
1509   const uintptr_t* haploid_mask = cip->haploid_mask;
1510   if (haploid_mask[0] & 1) {
1511     return 0;
1512   }
1513   const uint32_t max_code_p1 = cip->max_code + 1;
1514   const uint32_t name_ct = cip->name_ct;
1515   const uint32_t chr_code_end = max_code_p1 + name_ct;
1516   const uint32_t word_ct = BitCtToWordCt(chr_code_end);
1517   const uintptr_t* chr_mask = cip->chr_mask;
1518   for (uint32_t widx = 0; widx != word_ct; ++widx) {
1519     if (chr_mask[widx] & (~haploid_mask[widx])) {
1520       return 1;
1521     }
1522   }
1523   return 0;
1524 }
1525 
SingleCapLetterChrCode(uint32_t cap_letter)1526 static inline uint32_t SingleCapLetterChrCode(uint32_t cap_letter) {
1527   if (cap_letter == 'X') {
1528     return kChrRawX;
1529   }
1530   if (cap_letter == 'Y') {
1531     return kChrRawY;
1532   }
1533   if (cap_letter == 'M') {
1534     return kChrRawMT;
1535   }
1536   return UINT32_MAX;
1537 }
1538 
1539 static_assert(kMaxChrTextnumSlen == 2, "GetChrCodeRaw() must be updated.");
GetChrCodeRaw(const char * str_iter)1540 uint32_t GetChrCodeRaw(const char* str_iter) {
1541   // any character <= ' ' is considered a terminator
1542   // note that char arithmetic tends to be compiled to uint32 operations, so we
1543   // mostly work with ints here
1544   uint32_t first_char_code = ctou32(str_iter[0]);
1545   uint32_t first_char_toi;
1546   if (first_char_code < 58) {
1547   GetChrCodeRaw_digits:
1548     first_char_toi = first_char_code - '0';
1549     if (first_char_toi < 10) {
1550       const uint32_t second_char_code = ctou32(str_iter[1]);
1551       if (second_char_code <= ' ') {
1552         return first_char_toi;
1553       }
1554       if (ctou32(str_iter[2]) <= ' ') {
1555         const uint32_t second_char_toi = second_char_code - '0';
1556         if (second_char_toi < 10) {
1557           return first_char_toi * 10 + second_char_toi;
1558         }
1559         if (!first_char_toi) {
1560           // accept '0X', '0Y', '0M' emitted by Oxford software
1561           return SingleCapLetterChrCode(second_char_code & 0xdf);
1562         }
1563       }
1564     }
1565     return UINT32_MAX;
1566   }
1567   first_char_code &= 0xdf;
1568   uint32_t second_char_code = ctou32(str_iter[1]);
1569   if (first_char_code == 'P') {
1570     // chrPAR1 *not* supported; has to be PAR1 by itself.
1571     // can't do uint16_t compare of multiple characters, since we could be
1572     // dealing with a length-1 null-terminated string; that IS faster when it's
1573     // safe, though
1574     if (((second_char_code & 0xdf) == 'A') && ((ctou32(str_iter[2]) & 0xdf) == 'R')) {
1575       const uint32_t par_idx_m1 = ctou32(str_iter[3]) - '1';
1576       if ((par_idx_m1 < 2) && (ctou32(str_iter[4]) <= ' ')) {
1577         return kChrRawPAR1 + par_idx_m1;
1578       }
1579     }
1580     return UINT32_MAX;
1581   }
1582   if (first_char_code == 'C') {
1583     if (((second_char_code & 0xdf) != 'H') || ((ctou32(str_iter[2]) & 0xdf) != 'R')) {
1584       return UINT32_MAX;
1585     }
1586     str_iter = &(str_iter[3]);
1587     first_char_code = ctou32(str_iter[0]);
1588     if (first_char_code < 58) {
1589       goto GetChrCodeRaw_digits;
1590     }
1591     first_char_code &= 0xdf;
1592     second_char_code = ctou32(str_iter[1]);
1593   }
1594   if (second_char_code <= ' ') {
1595     return SingleCapLetterChrCode(first_char_code);
1596   }
1597   if (ctou32(str_iter[2]) <= ' ') {
1598     second_char_code &= 0xdf;
1599     if ((first_char_code == 'X') && (second_char_code == 'Y')) {
1600       return kChrRawXY;
1601     } else if ((first_char_code == 'M') && (second_char_code == 'T')) {
1602       return kChrRawMT;
1603     }
1604   }
1605   return UINT32_MAX;
1606 }
1607 
GetChrCode(const char * chr_name,const ChrInfo * cip,uint32_t name_slen)1608 uint32_t GetChrCode(const char* chr_name, const ChrInfo* cip, uint32_t name_slen) {
1609   // requires chr_name to be null-terminated
1610   // in practice, name_slen will usually already be known, may as well avoid
1611   // redundant strlen() calls even though this uglifies the interface
1612   // does not perform exhaustive error-checking
1613   // UINT32_MAX = --allow-extra-chr ok, UINT32_MAXM1 = total fail
1614   uint32_t chr_code_raw = GetChrCodeRaw(chr_name);
1615   if (chr_code_raw <= cip->max_code) {
1616     return chr_code_raw;
1617   }
1618   if (chr_code_raw != UINT32_MAX) {
1619     if (chr_code_raw >= kMaxContigs) {
1620       return cip->xymt_codes[chr_code_raw - kMaxContigs];
1621     }
1622     return UINT32_MAXM1;
1623   }
1624   if (!cip->name_ct) {
1625     return UINT32_MAX;
1626   }
1627   // note that IdHtableFind returns UINT32_MAX if name not found
1628   // can't overread, nonstd_names not in main workspace
1629   return IdHtableFind(chr_name, cip->nonstd_names, cip->nonstd_id_htable, name_slen, kChrHtableSize);
1630 }
1631 
GetChrCodeCounted(const ChrInfo * cip,uint32_t name_slen,char * chr_name)1632 uint32_t GetChrCodeCounted(const ChrInfo* cip, uint32_t name_slen, char* chr_name) {
1633   // When the chromosome name isn't null-terminated.
1634   // Yeah, probably want to revise this so that chr_name doesn't need to be
1635   // mutable here.  However, that currently requires new substitutes for both
1636   // GetChrCodeRaw AND IdHtableFind (IdHtableFindNnt doesn't work due to
1637   // overread); when either of those are needed for some other reason, that's a
1638   // good time to revise this function.
1639   char* s_end = &(chr_name[name_slen]);
1640   const char tmpc = *s_end;
1641   *s_end = '\0';
1642   const uint32_t chr_code = GetChrCode(chr_name, cip, name_slen);
1643   *s_end = tmpc;
1644   return chr_code;
1645 }
1646 
ChrError(const char * chr_name,const char * file_descrip,const ChrInfo * cip,uintptr_t line_idx,uint32_t error_code)1647 void ChrError(const char* chr_name, const char* file_descrip, const ChrInfo* cip, uintptr_t line_idx, uint32_t error_code) {
1648   // assumes chr_name is null-terminated
1649   const uint32_t raw_code = GetChrCodeRaw(chr_name);
1650   logputs("\n");
1651   if (line_idx) {
1652     logerrprintfww("Error: Invalid chromosome code '%s' on line %" PRIuPTR " of %s.\n", chr_name, line_idx, file_descrip);
1653   } else {
1654     logerrprintfww("Error: Invalid chromosome code '%s' in %s.\n", chr_name, file_descrip);
1655   }
1656   if ((S_CAST(int32_t, raw_code) > S_CAST(int32_t, cip->max_code)) && ((raw_code <= kMaxChrTextnum + kChrOffsetCt) || (raw_code >= kMaxContigs))) {
1657     // numeric code or X/Y/MT/PAR
1658     if (cip->chrset_source == kChrsetSourceDefault) {
1659       logerrputs("(This is disallowed for humans.  Check if the problem is with your data, or if\nyou forgot to define a different chromosome set with e.g. --chr-set.).\n");
1660     } else if (cip->chrset_source == kChrsetSourceCmdline) {
1661       logerrputs("(This is disallowed by your command-line flags.)\n");
1662     } else {
1663       // kChrsetSourceFile
1664       logerrputs("(This is disallowed by the file's own ##chrSet header line.)\n");
1665     }
1666     // maybe want to print message(s) depending on whether chromosome set was
1667     // defined on the command line or by the input file?
1668   } else if (error_code == UINT32_MAX) {
1669     logerrputs("(Use --allow-extra-chr to force it to be accepted.)\n");
1670   }
1671 }
1672 
TryToAddChrName(const char * chr_name,const char * file_descrip,uintptr_t line_idx,uint32_t name_slen,uint32_t allow_extra_chrs,uint32_t * chr_idx_ptr,ChrInfo * cip)1673 PglErr TryToAddChrName(const char* chr_name, const char* file_descrip, uintptr_t line_idx, uint32_t name_slen, uint32_t allow_extra_chrs, uint32_t* chr_idx_ptr, ChrInfo* cip) {
1674   // assumes chr_name is either nonstandard (i.e. not "2", "chr2", "chrX",
1675   // etc.), or a rejected xymt.
1676   // requires chr_name to be null-terminated
1677   // assumes chr_idx currently has the return value of GetChrCode()
1678   if (unlikely((!allow_extra_chrs) || ((*chr_idx_ptr) == UINT32_MAXM1))) {
1679     ChrError(chr_name, file_descrip, cip, line_idx, *chr_idx_ptr);
1680     return kPglRetMalformedInput;
1681   }
1682 
1683   // quasi-bugfix: remove redundant hash table check
1684 
1685   if (unlikely(chr_name[0] == '#')) {
1686     // redundant with some of the comment-skipping loaders, but this isn't
1687     // performance-critical
1688     logputs("\n");
1689     logerrputs("Error: Chromosome/contig names may not begin with '#'.\n");
1690     return kPglRetMalformedInput;
1691   }
1692   if (unlikely(name_slen > kMaxIdSlen)) {
1693     logputs("\n");
1694     if (line_idx) {
1695       logerrprintfww("Error: Line %" PRIuPTR " of %s has an excessively long chromosome/contig name. (The " PROG_NAME_STR " limit is " MAX_ID_SLEN_STR " characters.)\n", line_idx, file_descrip);
1696     } else {
1697       logerrprintfww("Error: Excessively long chromosome/contig name in %s. (The " PROG_NAME_STR " limit is " MAX_ID_SLEN_STR " characters.)\n", file_descrip);
1698     }
1699     return kPglRetMalformedInput;
1700   }
1701   const uint32_t max_code_p1 = cip->max_code + 1;
1702   const uint32_t name_ct = cip->name_ct;
1703   const uint32_t chr_code_end = max_code_p1 + name_ct;
1704   if (unlikely(chr_code_end == kMaxContigs)) {
1705     logputs("\n");
1706     logerrputs("Error: Too many distinct nonstandard chromosome/contig names.\n");
1707     return kPglRetMalformedInput;
1708   }
1709   if (!name_ct) {
1710     // lazy initialization
1711     SetAllU32Arr(kChrHtableSize, cip->nonstd_id_htable);
1712   }
1713   char* new_nonstd_name;
1714   if (unlikely(pgl_malloc(name_slen + 1, &new_nonstd_name))) {
1715     return kPglRetNomem;
1716   }
1717   LlStr* name_stack_ptr = cip->incl_excl_name_stack;
1718   uint32_t in_name_stack = 0;
1719   while (name_stack_ptr) {
1720     // there shouldn't be many of these, so sorting is unimportant
1721     if (!strcmp(chr_name, name_stack_ptr->str)) {
1722       in_name_stack = 1;
1723       break;
1724     }
1725     name_stack_ptr = name_stack_ptr->next;
1726   }
1727   if ((in_name_stack && cip->is_include_stack) || ((!in_name_stack) && (!cip->is_include_stack))) {
1728     SetBit(chr_code_end, cip->chr_mask);
1729     if (cip->haploid_mask[0] & 1) {
1730       SetBit(chr_code_end, cip->haploid_mask);
1731     }
1732   }
1733   memcpy(new_nonstd_name, chr_name, name_slen + 1);
1734   cip->nonstd_names[chr_code_end] = new_nonstd_name;
1735   *chr_idx_ptr = chr_code_end;
1736   cip->name_ct = name_ct + 1;
1737   uint32_t* id_htable = cip->nonstd_id_htable;
1738   for (uint32_t hashval = Hashceil(chr_name, name_slen, kChrHtableSize); ; ) {
1739     if (id_htable[hashval] == UINT32_MAX) {
1740       id_htable[hashval] = chr_code_end;
1741       return kPglRetSuccess;
1742     }
1743     if (++hashval == kChrHtableSize) {
1744       hashval = 0;
1745     }
1746   }
1747 }
1748 
1749 
1750 /*
1751 uintptr_t count_11_vecs(const VecW* geno_vvec, uintptr_t vec_ct) {
1752   // Counts number of aligned 11s in vptr[0..(vec_ct-1)].  Assumes vec_ct is a
1753   // multiple of 6 (0 ok).
1754   assert(!(vec_ct % 6));
1755   const VecW m1 = VCONST_W(kMask5555);
1756   const VecW m2 = VCONST_W(kMask3333);
1757   const VecW m4 = VCONST_W(kMask0F0F);
1758   const VecW m8 = VCONST_W(kMask00FF);
1759   const VecW* geno_vvec_iter = geno_vvec;
1760   const VecW* geno_vvec_end = &(geno_vvec[vec_ct]);
1761   uintptr_t tot = 0;
1762 
1763   while (1) {
1764     const VecW* geno_vvec_stop = &(geno_vvec_iter[60]);
1765 
1766     UniVec acc;
1767     acc.vw = vecw_setzero();
1768 
1769     if (geno_vvec_stop > geno_vvec_end) {
1770       if (geno_vvec_iter == geno_vvec_end) {
1771         return tot;
1772       }
1773       geno_vvec_stop = geno_vvec_end;
1774     }
1775     do {
1776       VecW cur_geno_vword = *geno_vvec_iter++;
1777       VecW count1 = cur_geno_vword & m1;
1778       count1 = count1 & vecw_srli(cur_geno_vword, 1);
1779 
1780       cur_geno_vword = *geno_vvec_iter++;
1781       VecW cur_11 = cur_geno_vword & m1;
1782       cur_11 = cur_11 & vecw_srli(cur_geno_vword, 1);
1783       count1 = count1 + cur_11;
1784 
1785       cur_geno_vword = *geno_vvec_iter++;
1786       cur_11 = cur_geno_vword & m1;
1787       cur_11 = cur_11 & vecw_srli(cur_geno_vword, 1);
1788       count1 = count1 + cur_11;
1789       count1 = (count1 & m2) + (vecw_srli(count1, 2) & m2);
1790 
1791       cur_geno_vword = *geno_vvec_iter++;
1792       VecW count2 = cur_geno_vword & m1;
1793       count2 = count2 & vecw_srli(cur_geno_vword, 1);
1794 
1795       cur_geno_vword = *geno_vvec_iter++;
1796       VecW cur_11 = cur_geno_vword & m1;
1797       cur_11 = cur_11 & vecw_srli(cur_geno_vword, 1);
1798       count2 = count2 + cur_11;
1799 
1800       cur_geno_vword = *geno_vvec_iter++;
1801       cur_11 = cur_geno_vword & m1;
1802       cur_11 = cur_11 & vecw_srli(cur_geno_vword, 1);
1803       count2 = count2 + cur_11;
1804       count1 = count1 + (count2 & m2) + (vecw_srli(count2, 2) & m2);
1805 
1806       acc.vw = acc.vw + (count1 & m4) + (vecw_srli(count1, 4) & m4);
1807     } while (geno_vvec_iter < geno_vvec_stop);
1808     acc.vw = (acc.vw & m8) + (vecw_srli(acc.vw, 8) & m8);
1809     tot += UniVecHsum16(acc);
1810   }
1811 }
1812 
1813 uintptr_t count_11_longs(const uintptr_t* genovec, uintptr_t word_ct) {
1814   uintptr_t tot = 0;
1815   if (word_ct >= (6 * kWordsPerVec)) {
1816     assert(VecIsAligned(genovec));
1817     const uintptr_t remainder = word_ct % (6 * kWordsPerVec);
1818     const uintptr_t main_block_word_ct = word_ct - remainder;
1819     tot = count_11_vecs((const VecW*)genovec, main_block_word_ct / kWordsPerVec);
1820     word_ct = remainder;
1821     genovec = &(genovec[main_block_word_ct]);
1822   }
1823   for (uintptr_t trailing_word_idx = 0; trailing_word_idx != word_ct; ++trailing_word_idx) {
1824     const uintptr_t cur_geno_word = genovec[trailing_word_idx];
1825     tot += Popcount01Word(Word11(cur_geno_word));
1826   }
1827 }
1828 */
1829 
AllGenoEqual(const uintptr_t * genoarr,uint32_t sample_ct)1830 uint32_t AllGenoEqual(const uintptr_t* genoarr, uint32_t sample_ct) {
1831   const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
1832   const uintptr_t match_word = (genoarr[0] & 3) * kMask5555;
1833   for (uint32_t widx = 0; widx != word_ct_m1; ++widx) {
1834     if (genoarr[widx] != match_word) {
1835       return 0;
1836     }
1837   }
1838   const uint32_t remainder = ModNz(sample_ct, kBitsPerWordD2);
1839   return !bzhi_max(genoarr[word_ct_m1] ^ match_word, 2 * remainder);
1840 }
1841 
InterleavedMaskZero(const uintptr_t * __restrict interleaved_mask,uintptr_t vec_ct,uintptr_t * __restrict genovec)1842 void InterleavedMaskZero(const uintptr_t* __restrict interleaved_mask, uintptr_t vec_ct, uintptr_t* __restrict genovec) {
1843   const uintptr_t twovec_ct = vec_ct / 2;
1844 #ifdef __LP64__
1845   const VecW m1 = VCONST_W(kMask5555);
1846   const VecW* interleaved_mask_iter = R_CAST(const VecW*, interleaved_mask);
1847   VecW* genovvec_iter = R_CAST(VecW*, genovec);
1848   for (uintptr_t twovec_idx = 0; twovec_idx != twovec_ct; ++twovec_idx) {
1849     const VecW mask_vvec = *interleaved_mask_iter++;
1850     VecW mask_first = mask_vvec & m1;
1851     mask_first = mask_first | vecw_slli(mask_first, 1);
1852     VecW mask_second = vecw_and_notfirst(m1, mask_vvec);
1853     mask_second = mask_second | vecw_srli(mask_second, 1);
1854     *genovvec_iter = (*genovvec_iter) & mask_first;
1855     ++genovvec_iter;
1856     *genovvec_iter = (*genovvec_iter) & mask_second;
1857     ++genovvec_iter;
1858   }
1859   if (vec_ct & 1) {
1860     VecW mask_first = *interleaved_mask_iter;
1861     mask_first = mask_first | vecw_slli(mask_first, 1);
1862     *genovvec_iter = (*genovvec_iter) & mask_first;
1863   }
1864 #else
1865   const uintptr_t* interleaved_mask_iter = interleaved_mask;
1866   uintptr_t* genovec_iter = genovec;
1867   for (uintptr_t twovec_idx = 0; twovec_idx != twovec_ct; ++twovec_idx) {
1868     const uintptr_t mask_word = *interleaved_mask_iter++;
1869     *genovec_iter &= (mask_word & kMask5555) * 3;
1870     ++genovec_iter;
1871     *genovec_iter &= ((mask_word >> 1) & kMask5555) * 3;
1872     ++genovec_iter;
1873   }
1874   if (vec_ct & 1) {
1875     const uintptr_t mask_word = *interleaved_mask_iter;
1876     *genovec_iter &= mask_word * 3;
1877   }
1878 #endif
1879 }
1880 
InterleavedMaskMissing(const uintptr_t * __restrict interleaved_mask,uintptr_t vec_ct,uintptr_t * __restrict genovec)1881 void InterleavedMaskMissing(const uintptr_t* __restrict interleaved_mask, uintptr_t vec_ct, uintptr_t* __restrict genovec) {
1882   const uintptr_t twovec_ct = vec_ct / 2;
1883 #ifdef __LP64__
1884   const VecW m1 = VCONST_W(kMask5555);
1885   const VecW inv_m1 = VCONST_W(kMaskAAAA);
1886   const VecW* interleaved_mask_iter = R_CAST(const VecW*, interleaved_mask);
1887   VecW* genovvec_iter = R_CAST(VecW*, genovec);
1888   for (uintptr_t twovec_idx = 0; twovec_idx != twovec_ct; ++twovec_idx) {
1889     const VecW mask_vvec = *interleaved_mask_iter++;
1890     VecW set_first = vecw_and_notfirst(mask_vvec, m1);
1891     set_first = set_first | vecw_slli(set_first, 1);
1892     VecW set_second = vecw_and_notfirst(mask_vvec, inv_m1);
1893     set_second = set_second | vecw_srli(set_second, 1);
1894     *genovvec_iter = (*genovvec_iter) | set_first;
1895     ++genovvec_iter;
1896     *genovvec_iter = (*genovvec_iter) | set_second;
1897     ++genovvec_iter;
1898   }
1899   if (vec_ct & 1) {
1900     VecW set_first = vecw_and_notfirst(*interleaved_mask_iter, m1);
1901     set_first = set_first | vecw_slli(set_first, 1);
1902     *genovvec_iter = (*genovvec_iter) | set_first;
1903   }
1904 #else
1905   const uintptr_t* interleaved_mask_iter = interleaved_mask;
1906   uintptr_t* genovec_iter = genovec;
1907   for (uintptr_t twovec_idx = 0; twovec_idx != twovec_ct; ++twovec_idx) {
1908     const uintptr_t set_invword = *interleaved_mask_iter++;
1909     *genovec_iter |= ((~set_invword) & kMask5555) * 3;
1910     ++genovec_iter;
1911     *genovec_iter |= (((~set_invword) >> 1) & kMask5555) * 3;
1912     ++genovec_iter;
1913   }
1914   if (vec_ct & 1) {
1915     const uintptr_t set_invword = *interleaved_mask_iter;
1916     *genovec_iter |= ((~set_invword) & kMask5555) * 3;
1917   }
1918 #endif
1919 }
1920 
InterleavedSetMissing(const uintptr_t * __restrict interleaved_set,uintptr_t vec_ct,uintptr_t * __restrict genovec)1921 void InterleavedSetMissing(const uintptr_t* __restrict interleaved_set, uintptr_t vec_ct, uintptr_t* __restrict genovec) {
1922   const uintptr_t twovec_ct = vec_ct / 2;
1923 #ifdef __LP64__
1924   const VecW m1 = VCONST_W(kMask5555);
1925   const VecW* interleaved_set_iter = R_CAST(const VecW*, interleaved_set);
1926   VecW* genovvec_iter = R_CAST(VecW*, genovec);
1927   for (uintptr_t twovec_idx = 0; twovec_idx != twovec_ct; ++twovec_idx) {
1928     const VecW set_vvec = *interleaved_set_iter++;
1929     VecW set_first = set_vvec & m1;
1930     set_first = set_first | vecw_slli(set_first, 1);
1931     VecW set_second = vecw_and_notfirst(m1, set_vvec);
1932     set_second = set_second | vecw_srli(set_second, 1);
1933     *genovvec_iter = (*genovvec_iter) | set_first;
1934     ++genovvec_iter;
1935     *genovvec_iter = (*genovvec_iter) | set_second;
1936     ++genovvec_iter;
1937   }
1938   if (vec_ct & 1) {
1939     VecW set_first = *interleaved_set_iter;
1940     set_first = set_first | vecw_slli(set_first, 1);
1941     *genovvec_iter = (*genovvec_iter) | set_first;
1942   }
1943 #else
1944   const uintptr_t* interleaved_set_iter = interleaved_set;
1945   uintptr_t* genovec_iter = genovec;
1946   for (uintptr_t twovec_idx = 0; twovec_idx != twovec_ct; ++twovec_idx) {
1947     const uintptr_t set_word = *interleaved_set_iter++;
1948     *genovec_iter |= (set_word & kMask5555) * 3;
1949     ++genovec_iter;
1950     *genovec_iter |= ((set_word >> 1) & kMask5555) * 3;
1951     ++genovec_iter;
1952   }
1953   if (vec_ct & 1) {
1954     const uintptr_t set_word = *interleaved_set_iter;
1955     *genovec_iter |= set_word * 3;
1956   }
1957 #endif
1958 }
1959 
InterleavedSetMissingCleardosage(const uintptr_t * __restrict orig_set,const uintptr_t * __restrict interleaved_set,uintptr_t vec_ct,uintptr_t * __restrict genovec,uint32_t * __restrict write_dosage_ct_ptr,uintptr_t * __restrict dosagepresent,Dosage * dosage_main)1960 void InterleavedSetMissingCleardosage(const uintptr_t* __restrict orig_set, const uintptr_t* __restrict interleaved_set, uintptr_t vec_ct, uintptr_t* __restrict genovec, uint32_t* __restrict write_dosage_ct_ptr, uintptr_t* __restrict dosagepresent, Dosage* dosage_main) {
1961   const uint32_t orig_write_dosage_ct = *write_dosage_ct_ptr;
1962   if (orig_write_dosage_ct) {
1963     uintptr_t sample_widx = 0;
1964     uintptr_t cur_bits = dosagepresent[0];
1965     for (uint32_t dosage_read_idx = 0; dosage_read_idx != orig_write_dosage_ct; ++dosage_read_idx) {
1966       const uintptr_t lowbit = BitIter1y(dosagepresent, &sample_widx, &cur_bits);
1967       if (orig_set[sample_widx] & lowbit) {
1968         dosagepresent[sample_widx] ^= lowbit;
1969         uint32_t dosage_write_idx = dosage_read_idx++;
1970         for (; dosage_read_idx == orig_write_dosage_ct; ++dosage_read_idx) {
1971           const uintptr_t lowbit2 = BitIter1y(dosagepresent, &sample_widx, &cur_bits);
1972           if (orig_set[sample_widx] & lowbit2) {
1973             dosagepresent[sample_widx] ^= lowbit2;
1974           } else {
1975             dosage_main[dosage_write_idx++] = dosage_main[dosage_read_idx];
1976           }
1977         }
1978         *write_dosage_ct_ptr = dosage_write_idx;
1979         break;
1980       }
1981     }
1982   }
1983   InterleavedSetMissing(interleaved_set, vec_ct, genovec);
1984 }
1985 
SetMaleHetMissing(const uintptr_t * __restrict sex_male_interleaved,uint32_t vec_ct,uintptr_t * __restrict genovec)1986 void SetMaleHetMissing(const uintptr_t* __restrict sex_male_interleaved, uint32_t vec_ct, uintptr_t* __restrict genovec) {
1987   const uint32_t twovec_ct = vec_ct / 2;
1988 #ifdef __LP64__
1989   const VecW m1 = VCONST_W(kMask5555);
1990   const VecW* sex_male_interleaved_iter = R_CAST(const VecW*, sex_male_interleaved);
1991   VecW* genovvec_iter = R_CAST(VecW*, genovec);
1992   for (uint32_t twovec_idx = 0; twovec_idx != twovec_ct; ++twovec_idx) {
1993     const VecW sex_male_vvec = *sex_male_interleaved_iter++;
1994     // we wish to bitwise-or with (sex_male_nypvec_01 & genovec) << 1
1995     const VecW sex_male_first = sex_male_vvec & m1;
1996     const VecW sex_male_second_shifted = vecw_and_notfirst(m1, sex_male_vvec);
1997     VecW cur_geno_vword = *genovvec_iter;
1998 
1999     const VecW missing_male_vword = sex_male_first & cur_geno_vword;
2000 
2001     *genovvec_iter++ = cur_geno_vword | vecw_slli(missing_male_vword, 1);
2002     cur_geno_vword = *genovvec_iter;
2003     *genovvec_iter++ = cur_geno_vword | (sex_male_second_shifted & vecw_slli(cur_geno_vword, 1));
2004   }
2005   if (vec_ct & 1) {
2006     const VecW sex_male_first = (*sex_male_interleaved_iter) & m1;
2007     const VecW cur_geno_vword = *genovvec_iter;
2008     const VecW missing_male_vword = sex_male_first & cur_geno_vword;
2009     *genovvec_iter = cur_geno_vword | vecw_slli(missing_male_vword, 1);
2010   }
2011 #else
2012   const uintptr_t* sex_male_interleaved_iter = sex_male_interleaved;
2013   uintptr_t* genovec_iter = genovec;
2014   for (uint32_t twovec_idx = 0; twovec_idx != twovec_ct; ++twovec_idx) {
2015     const uintptr_t sex_male_word = *sex_male_interleaved_iter++;
2016     uintptr_t cur_geno_word = *genovec_iter;
2017     *genovec_iter++ = cur_geno_word | ((sex_male_word & kMask5555 & cur_geno_word) << 1);
2018     cur_geno_word = *genovec_iter;
2019     *genovec_iter++ = cur_geno_word | (sex_male_word & kMaskAAAA & (cur_geno_word << 1));
2020   }
2021   if (vec_ct & 1) {
2022     const uintptr_t sex_male_word = *sex_male_interleaved_iter;
2023     uintptr_t cur_geno_word = *genovec_iter;
2024     *genovec_iter = cur_geno_word | ((sex_male_word & kMask5555 & cur_geno_word) << 1);
2025   }
2026 #endif
2027 }
2028 
EraseMaleHetDosages(const uintptr_t * __restrict sex_male,const uintptr_t * __restrict genoarr,uint32_t * __restrict write_dosage_ct_ptr,uintptr_t * __restrict dosagepresent,Dosage * dosage_main)2029 void EraseMaleHetDosages(const uintptr_t* __restrict sex_male, const uintptr_t* __restrict genoarr, uint32_t* __restrict write_dosage_ct_ptr, uintptr_t* __restrict dosagepresent, Dosage* dosage_main) {
2030   const uint32_t orig_write_dosage_ct = *write_dosage_ct_ptr;
2031   if (!orig_write_dosage_ct) {
2032     return;
2033   }
2034   uintptr_t sample_uidx_base = 0;
2035   uintptr_t cur_bits = dosagepresent[0];
2036   for (uint32_t dosage_read_idx = 0; dosage_read_idx != orig_write_dosage_ct; ++dosage_read_idx) {
2037     const uintptr_t sample_uidx = BitIter1(dosagepresent, &sample_uidx_base, &cur_bits);
2038     if (IsSet(sex_male, sample_uidx) && (GetNyparrEntry(genoarr, sample_uidx) == 1)) {
2039       ClearBit(sample_uidx, dosagepresent);
2040       uint32_t dosage_write_idx = dosage_read_idx++;
2041       for (; dosage_read_idx != orig_write_dosage_ct; ++dosage_read_idx) {
2042         const uintptr_t sample_uidx2 = BitIter1(dosagepresent, &sample_uidx_base, &cur_bits);
2043         if (IsSet(sex_male, sample_uidx2) && (GetNyparrEntry(genoarr, sample_uidx2) == 1)) {
2044           ClearBit(sample_uidx2, dosagepresent);
2045         } else {
2046           dosage_main[dosage_write_idx++] = dosage_main[dosage_read_idx];
2047         }
2048       }
2049       *write_dosage_ct_ptr = dosage_write_idx;
2050       return;
2051     }
2052   }
2053 }
2054 
EraseMaleDphases(const uintptr_t * __restrict sex_male,uint32_t * __restrict write_dphase_ct_ptr,uintptr_t * __restrict dphasepresent,SDosage * dphase_delta)2055 void EraseMaleDphases(const uintptr_t* __restrict sex_male, uint32_t* __restrict write_dphase_ct_ptr, uintptr_t* __restrict dphasepresent, SDosage* dphase_delta) {
2056   const uint32_t orig_write_dphase_ct = *write_dphase_ct_ptr;
2057   if (!orig_write_dphase_ct) {
2058     return;
2059   }
2060   uintptr_t sample_widx = 0;
2061   uintptr_t cur_bits = dphasepresent[0];
2062   for (uint32_t dphase_read_idx = 0; dphase_read_idx != orig_write_dphase_ct; ++dphase_read_idx) {
2063     const uintptr_t lowbit = BitIter1y(dphasepresent, &sample_widx, &cur_bits);
2064     if (sex_male[sample_widx] & lowbit) {
2065       dphasepresent[sample_widx] ^= lowbit;
2066       uint32_t dphase_write_idx = dphase_read_idx++;
2067       for (; dphase_read_idx == orig_write_dphase_ct; ++dphase_read_idx) {
2068         const uintptr_t lowbit2 = BitIter1y(dphasepresent, &sample_widx, &cur_bits);
2069         if (sex_male[sample_widx] & lowbit2) {
2070           dphasepresent[sample_widx] ^= lowbit2;
2071         } else {
2072           dphase_delta[dphase_write_idx++] = dphase_delta[dphase_read_idx];
2073         }
2074       }
2075       *write_dphase_ct_ptr = dphase_write_idx;
2076       return;
2077     }
2078   }
2079 }
2080 
SetMaleHetMissingCleardosage(const uintptr_t * __restrict sex_male,const uintptr_t * __restrict sex_male_interleaved,uint32_t vec_ct,uintptr_t * __restrict genovec,uint32_t * __restrict write_dosage_ct_ptr,uintptr_t * __restrict dosagepresent,Dosage * dosage_main)2081 void SetMaleHetMissingCleardosage(const uintptr_t* __restrict sex_male, const uintptr_t* __restrict sex_male_interleaved, uint32_t vec_ct, uintptr_t* __restrict genovec, uint32_t* __restrict write_dosage_ct_ptr, uintptr_t* __restrict dosagepresent, Dosage* dosage_main) {
2082   if (*write_dosage_ct_ptr) {
2083     EraseMaleHetDosages(sex_male, genovec, write_dosage_ct_ptr, dosagepresent, dosage_main);
2084   }
2085   SetMaleHetMissing(sex_male_interleaved, vec_ct, genovec);
2086 }
2087 
SetMaleHetMissingKeepdosage(const uintptr_t * __restrict sex_male,const uintptr_t * __restrict sex_male_interleaved,uint32_t word_ct,uintptr_t * __restrict genovec,uint32_t * __restrict write_dosage_ct_ptr,uintptr_t * __restrict dosagepresent,Dosage * dosage_main)2088 void SetMaleHetMissingKeepdosage(const uintptr_t* __restrict sex_male, const uintptr_t* __restrict sex_male_interleaved, uint32_t word_ct, uintptr_t* __restrict genovec, uint32_t* __restrict write_dosage_ct_ptr, uintptr_t* __restrict dosagepresent, Dosage* dosage_main) {
2089   // count # of 1.00000 dosages we need to insert, and then rewrite dosage_main
2090   // from back to front so we don't need temporary buffers.
2091   const uint32_t orig_dosage_ct = *write_dosage_ct_ptr;
2092   if (!orig_dosage_ct) {
2093     // can't assume dosagepresent is initialized in this case
2094     ZeroWArr(DivUp(word_ct, 2), dosagepresent);
2095   }
2096   const Halfword* sex_male_alias = R_CAST(const Halfword*, sex_male);
2097   Halfword* dosagepresent_alias = R_CAST(Halfword*, dosagepresent);
2098   uint32_t new_dosage_ct = 0;
2099   // todo: try vectorizing this loop
2100   for (uintptr_t widx = 0; widx != word_ct; ++widx) {
2101     const uintptr_t geno_hets = Word01(genovec[widx]);
2102     const uintptr_t male_nodosage_word = UnpackHalfwordToWord(sex_male_alias[widx] & (~dosagepresent_alias[widx]));
2103     new_dosage_ct += Popcount01Word(geno_hets & male_nodosage_word);
2104   }
2105   if (!new_dosage_ct) {
2106     SetMaleHetMissing(sex_male_interleaved, WordCtToVecCt(word_ct), genovec);
2107     return;
2108   }
2109   uint32_t dosage_write_idx = orig_dosage_ct + new_dosage_ct;
2110   uint32_t dosage_read_idx = orig_dosage_ct;
2111   uint32_t widx = word_ct;
2112   *write_dosage_ct_ptr = dosage_write_idx;
2113   do {
2114     --widx;
2115     const uintptr_t geno_word = genovec[widx];
2116     const uintptr_t male_geno_hets = geno_word & (~(geno_word >> 1)) & UnpackHalfwordToWord(sex_male_alias[widx]);
2117     const uintptr_t dosagepresent_word = UnpackHalfwordToWord(dosagepresent_alias[widx]);
2118     uintptr_t new_dosagepresent_word = dosagepresent_word | male_geno_hets;
2119     if (new_dosagepresent_word) {
2120       dosagepresent_alias[widx] = PackWordToHalfword(new_dosagepresent_word);
2121       do {
2122         const uint32_t top_set_bit = bsrw(new_dosagepresent_word);
2123         const uintptr_t cur_bit_word = k1LU << top_set_bit;
2124         Dosage cur_dosage = kDosageMid;
2125         if (cur_bit_word & dosagepresent_word) {
2126           cur_dosage = dosage_main[--dosage_read_idx];
2127         }
2128         dosage_main[--dosage_write_idx] = cur_dosage;
2129         new_dosagepresent_word -= cur_bit_word;
2130       } while (new_dosagepresent_word);
2131       genovec[widx] = geno_word | (male_geno_hets << 1);
2132     }
2133   } while (widx);
2134 }
2135 
2136 // Clears each bit in bitarr which doesn't correspond to a genovec het.
2137 // Assumes that either trailing bits of bitarr are already zero, or trailing
2138 // bits of genovec are zero.
2139 //
2140 // Similar to PgrDetectGenoarrHetsUnsafe().
2141 //
2142 // todo: try vectorizing this
MaskGenoarrHetsUnsafe(const uintptr_t * __restrict genoarr,uint32_t raw_sample_ctl2,uintptr_t * __restrict bitarr)2143 void MaskGenoarrHetsUnsafe(const uintptr_t* __restrict genoarr, uint32_t raw_sample_ctl2, uintptr_t* __restrict bitarr) {
2144   Halfword* bitarr_alias = R_CAST(Halfword*, bitarr);
2145   for (uint32_t widx = 0; widx != raw_sample_ctl2; ++widx) {
2146     const uintptr_t cur_word = genoarr[widx];
2147     uintptr_t ww = (~(cur_word >> 1)) & cur_word;  // low 1, high 0
2148     bitarr_alias[widx] &= PackWordToHalfwordMask5555(ww);
2149   }
2150 }
2151 
MaskGenoarrHetsMultiallelicUnsafe(const uintptr_t * __restrict genoarr,const uintptr_t * __restrict patch_10_set,const AlleleCode * __restrict patch_10_vals,uint32_t raw_sample_ctl2,uintptr_t * __restrict bitarr)2152 void MaskGenoarrHetsMultiallelicUnsafe(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t raw_sample_ctl2, uintptr_t* __restrict bitarr) {
2153   // Related to PgrDetectGenoarrHetsMultiallelic().
2154   const Halfword* patch_10_set_alias = R_CAST(const Halfword*, patch_10_set);
2155   const AlleleCode* patch_10_vals_iter = patch_10_vals;
2156   Halfword* bitarr_alias = R_CAST(Halfword*, bitarr);
2157   for (uint32_t widx = 0; widx != raw_sample_ctl2; ++widx) {
2158     const uintptr_t cur_word = genoarr[widx];
2159     uint32_t patch_10_hw = patch_10_set_alias[widx];
2160     uint32_t cur_hets = Pack01ToHalfword(cur_word);
2161     while (patch_10_hw) {
2162       const AlleleCode code1 = *patch_10_vals_iter++;
2163       const AlleleCode code2 = *patch_10_vals_iter++;
2164       if (code1 != code2) {
2165         cur_hets |= ctzu32(patch_10_hw);
2166       }
2167       patch_10_hw &= patch_10_hw - 1;
2168     }
2169     bitarr_alias[widx] &= cur_hets;
2170   }
2171 }
2172 
2173 /*
2174 uint32_t chr_window_max(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bp, uint32_t chr_fo_idx, uint32_t ct_max, uint32_t bp_max, uint32_t cur_window_max) {
2175   if (cur_window_max >= ct_max) {
2176     return ct_max;
2177   }
2178   const uint32_t chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
2179   uint32_t variant_uidx = AdvBoundedTo1Bit(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], chr_end);
2180   const uint32_t variant_ct = PopcountBitRange(variant_include, variant_uidx, chr_end);
2181   if (variant_ct <= cur_window_max) {
2182     return cur_window_max;
2183   }
2184   uint32_t window_idx_first = 0;
2185   uint32_t window_uidx_first = variant_uidx;
2186   uint32_t window_bp_first = variant_bp[variant_uidx];
2187   for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_uidx, ++variant_idx) {
2188     MovU32To1Bit(variant_include, &variant_uidx);
2189     uint32_t variant_bp_thresh = variant_bp[variant_uidx];
2190     if (variant_bp_thresh < bp_max) {
2191       variant_bp_thresh = 0;
2192     } else {
2193       variant_bp_thresh -= bp_max;
2194     }
2195     if (variant_bp_thresh > window_bp_first) {
2196       do {
2197         ++window_uidx_first;
2198         MovU32To1Bit(variant_include, &window_uidx_first);
2199         window_bp_first = variant_bp[window_uidx_first];
2200         ++window_idx_first;
2201       } while (variant_bp_thresh > window_bp_first);
2202     } else if (variant_idx - window_idx_first == cur_window_max) {
2203       if (++cur_window_max == ct_max) {
2204         return cur_window_max;
2205       }
2206     }
2207   }
2208   return cur_window_max;
2209 }
2210 */
2211 
NotOnlyXymt(const uintptr_t * variant_include,const ChrInfo * cip,uint32_t raw_variant_ct,uint32_t xymt_offset)2212 uint32_t NotOnlyXymt(const uintptr_t* variant_include, const ChrInfo* cip, uint32_t raw_variant_ct, uint32_t xymt_offset) {
2213   const uint32_t xymt_code = cip->xymt_codes[xymt_offset];
2214   assert(!IsI32Neg(xymt_code));
2215   const uint32_t cur_chr_fo_idx = cip->chr_idx_to_foidx[xymt_code];
2216   const uint32_t chr_start = cip->chr_fo_vidx_start[cur_chr_fo_idx];
2217   if (chr_start) {
2218     const uint32_t first_uidx = AdvTo1Bit(variant_include, 0);
2219     if (first_uidx < chr_start) {
2220       return 1;
2221     }
2222   }
2223   const uint32_t chr_end = cip->chr_fo_vidx_start[cur_chr_fo_idx + 1];
2224   return (chr_end < raw_variant_ct) && (!AllBitsAreZero(variant_include, chr_end, raw_variant_ct));
2225 }
2226 
CountNonAutosomalVariants(const uintptr_t * variant_include,const ChrInfo * cip,uint32_t count_x,uint32_t count_mt)2227 uint32_t CountNonAutosomalVariants(const uintptr_t* variant_include, const ChrInfo* cip, uint32_t count_x, uint32_t count_mt) {
2228   // for backward compatibility, unplaced markers are considered to be
2229   // autosomal here
2230   uint32_t ct = 0;
2231   if (count_x) {
2232     uint32_t x_code;
2233     if (XymtExists(cip, kChrOffsetX, &x_code)) {
2234       ct += CountChrVariantsUnsafe(variant_include, cip, x_code);
2235     }
2236   }
2237   uint32_t y_code;
2238   if (XymtExists(cip, kChrOffsetY, &y_code)) {
2239     ct += CountChrVariantsUnsafe(variant_include, cip, y_code);
2240   }
2241   if (count_mt) {
2242     uint32_t mt_code;
2243     if (XymtExists(cip, kChrOffsetMT, &mt_code)) {
2244       ct += CountChrVariantsUnsafe(variant_include, cip, mt_code);
2245     }
2246   }
2247   return ct;
2248 }
2249 
ExcludeNonAutosomalVariants(const ChrInfo * cip,uintptr_t * variant_include)2250 void ExcludeNonAutosomalVariants(const ChrInfo* cip, uintptr_t* variant_include) {
2251   uint32_t x_code;
2252   if (XymtExists(cip, kChrOffsetX, &x_code)) {
2253     const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[x_code];
2254     ClearBitsNz(cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1], variant_include);
2255   }
2256   uint32_t y_code;
2257   if (XymtExists(cip, kChrOffsetY, &y_code)) {
2258     const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[y_code];
2259     ClearBitsNz(cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1], variant_include);
2260   }
2261   uint32_t mt_code;
2262   if (XymtExists(cip, kChrOffsetMT, &mt_code)) {
2263     const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[mt_code];
2264     ClearBitsNz(cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1], variant_include);
2265   }
2266 }
2267 
ConditionalAllocateNonAutosomalVariants(const ChrInfo * cip,const char * calc_descrip,uint32_t raw_variant_ct,const uintptr_t ** variant_include_ptr,uint32_t * variant_ct_ptr)2268 PglErr ConditionalAllocateNonAutosomalVariants(const ChrInfo* cip, const char* calc_descrip, uint32_t raw_variant_ct, const uintptr_t** variant_include_ptr, uint32_t* variant_ct_ptr) {
2269   const uint32_t non_autosomal_variant_ct = CountNonAutosomalVariants(*variant_include_ptr, cip, 1, 1);
2270   if (!non_autosomal_variant_ct) {
2271     return kPglRetSuccess;
2272   }
2273   logprintf("Excluding %u variant%s on non-autosomes from %s.\n", non_autosomal_variant_ct, (non_autosomal_variant_ct == 1)? "" : "s", calc_descrip);
2274   *variant_ct_ptr -= non_autosomal_variant_ct;
2275   if (unlikely(!(*variant_ct_ptr))) {
2276     // this may not always be an error condition, probably add a flag later to
2277     // control printing of this error message, etc.
2278     logerrprintf("Error: No variants remaining for %s.\n", calc_descrip);
2279     return kPglRetDegenerateData;
2280   }
2281   const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
2282   uintptr_t* working_variant_include;
2283   if (unlikely(bigstack_alloc_w(raw_variant_ctl, &working_variant_include))) {
2284     return kPglRetNomem;
2285   }
2286   memcpy(working_variant_include, *variant_include_ptr, raw_variant_ctl * sizeof(intptr_t));
2287   ExcludeNonAutosomalVariants(cip, working_variant_include);
2288   *variant_include_ptr = working_variant_include;
2289   return kPglRetSuccess;
2290 }
2291 
FillSubsetChrFoVidxStart(const uintptr_t * variant_include,const ChrInfo * cip,uint32_t * subset_chr_fo_vidx_start)2292 void FillSubsetChrFoVidxStart(const uintptr_t* variant_include, const ChrInfo* cip, uint32_t* subset_chr_fo_vidx_start) {
2293   const uint32_t chr_ct = cip->chr_ct;
2294   subset_chr_fo_vidx_start[0] = 0;
2295   uint32_t variant_uidx = 0;
2296   uint32_t variant_idx = 0;
2297   for (uint32_t chr_fo_idx = 1; chr_fo_idx <= chr_ct; ++chr_fo_idx) {
2298     const uint32_t chr_end_variant_uidx = cip->chr_fo_vidx_start[chr_fo_idx];
2299     variant_idx += PopcountBitRange(variant_include, variant_uidx, chr_end_variant_uidx);
2300     subset_chr_fo_vidx_start[chr_fo_idx] = variant_idx;
2301     variant_uidx = chr_end_variant_uidx;
2302   }
2303 }
2304 
2305 /*
2306 BoolErr allele_set(const char* newval, uint32_t allele_slen, char** allele_ptr) {
2307   char* newptr;
2308   if (allele_slen == 1) {
2309     // const_cast
2310     newptr = (char*)((uintptr_t)(&(g_one_char_strs[((unsigned char)(*newval)) * 2])));
2311   } else {
2312     char* new_alloc;
2313     if (pgl_malloc(allele_slen + 1, &new_alloc)) {
2314       return 1;
2315     }
2316     memcpyx(new_alloc, newval, allele_slen, '\0');
2317     newptr = new_alloc;
2318   }
2319   *allele_ptr = newptr;
2320   return 0;
2321 }
2322 
2323 BoolErr allele_reset(const char* newval, uint32_t allele_slen, char** allele_ptr) {
2324   char* newptr;
2325   if (allele_slen == 1) {
2326     // const_cast
2327     newptr = (char*)((uintptr_t)(&(g_one_char_strs[((unsigned char)(*newval)) * 2])));
2328   } else {
2329     char* new_alloc;
2330     if (pgl_malloc(allele_slen + 1, &new_alloc)) {
2331       return 1;
2332     }
2333     memcpyx(new_alloc, newval, allele_slen, '\0');
2334     newptr = new_alloc;
2335   }
2336   const uintptr_t bigstack_end_addr = (uintptr_t)g_bigstack_end;
2337   const uintptr_t maxdiff = ((uintptr_t)(&(g_one_char_strs[512]))) - bigstack_end_addr;
2338   // take advantage of unsigned wraparound
2339   if ((((uintptr_t)(*allele_ptr)) - bigstack_end_addr) >= maxdiff) {
2340     free(*allele_ptr);
2341   }
2342   *allele_ptr = newptr;
2343   return 0;
2344 }
2345 
2346 void cleanup_allele_storage(uint32_t max_allele_slen, uintptr_t allele_storage_entry_ct, const char** allele_storage) {
2347   // Now doesn't improperly free bigstack allocations (as long as they aren't
2348   // past g_bigstack_end), and doesn't need to be called at all most of the
2349   // time.
2350 
2351   // An alternative representation: have a separate bitarray which indicates
2352   // whether the allele_storage[] element should be interpreted as a heap
2353   // pointer or an in-place zero-terminated string (i.e. string length can be
2354   // up to 7 on 64-bit systems).  I expect that to be more efficient for new
2355   // datasets, but let's get the simple (and 1.9-codebase-compatible)
2356   // implementation working first, and then benchmark the fancier code later.
2357   if (allele_storage && (max_allele_slen > 1)) {
2358     const uintptr_t bigstack_end_addr = (uintptr_t)g_bigstack_end;
2359     const uintptr_t maxdiff = ((uintptr_t)(&(g_one_char_strs[512]))) - bigstack_end_addr;
2360     for (uintptr_t idx = 0; idx != allele_storage_entry_ct; ++idx) {
2361       const char* cur_entry = allele_storage[idx];
2362       assert(cur_entry);
2363       // take advantage of unsigned wraparound
2364       if ((((uintptr_t)cur_entry) - bigstack_end_addr) >= maxdiff) {
2365         free_const(cur_entry);
2366       }
2367     }
2368   }
2369 }
2370 */
2371 
2372 char g_missing_catname[kMaxMissingPhenostrBlen];
2373 char g_output_missing_pheno[kMaxMissingPhenostrBlen];
2374 char g_legacy_output_missing_pheno[kMaxMissingPhenostrBlen];
2375 
InitPheno()2376 void InitPheno() {
2377   snprintf(g_missing_catname, kMaxMissingPhenostrBlen, "NONE");
2378   snprintf(g_output_missing_pheno, kMaxMissingPhenostrBlen, "NA");
2379   snprintf(g_legacy_output_missing_pheno, kMaxMissingPhenostrBlen, "-9");
2380 }
2381 
IsCategoricalPhenostr(const char * phenostr_iter)2382 uint32_t IsCategoricalPhenostr(const char* phenostr_iter) {
2383   uint32_t first_char_code = ctou32(*phenostr_iter++);
2384   // allow leading +/-
2385   if ((first_char_code == 43) || (first_char_code == 45)) {
2386     first_char_code = ctou32(*phenostr_iter++);
2387   }
2388   if (((first_char_code - 48) < 10) || (first_char_code == 44) || (first_char_code < 32)) {
2389     // the last two conditions are for detecting CSV empty strings
2390     return 0;
2391   }
2392   if (first_char_code == 46) {
2393     // decimal point.  classify based on whether next character is a digit.
2394     const uint32_t second_char_code = ctou32(phenostr_iter[0]);
2395     return ((second_char_code - 48) >= 10);
2396   }
2397   // allow any capitalization of "NA"/"nan", but not "inf"
2398   if ((first_char_code & 0xdf) != 78) {
2399     return 1;
2400   }
2401   const uint32_t second_char_code = ctou32(phenostr_iter[0]);
2402   if ((second_char_code & 0xdf) != 65) {
2403     return 1;
2404   }
2405   const uint32_t third_char_code = ctou32(phenostr_iter[1]);
2406   if ((third_char_code & 0xdf) == 78) {
2407     return (ctou32(phenostr_iter[2]) > ' ');
2408   }
2409   return (third_char_code > 32);
2410 }
2411 
IsCategoricalPhenostrNocsv(const char * phenostr_iter)2412 uint32_t IsCategoricalPhenostrNocsv(const char* phenostr_iter) {
2413   uint32_t first_char_code = ctou32(*phenostr_iter++);
2414   // allow leading +/-
2415   if ((first_char_code == 43) || (first_char_code == 45)) {
2416     first_char_code = ctou32(*phenostr_iter++);
2417   }
2418   if ((first_char_code - 48) < 10) {
2419     return 0;
2420   }
2421   if (first_char_code == 46) {
2422     // decimal point.  classify based on whether next character is a digit.
2423     const uint32_t second_char_code = ctou32(phenostr_iter[0]);
2424     return ((second_char_code - 48) >= 10);
2425   }
2426   // allow any capitalization of "NA"/"nan", but not "inf"
2427   if ((first_char_code & 0xdf) != 78) {
2428     return 1;
2429   }
2430   const uint32_t second_char_code = ctou32(phenostr_iter[0]);
2431   if ((second_char_code & 0xdf) != 65) {
2432     return 1;
2433   }
2434   const uint32_t third_char_code = ctou32(phenostr_iter[1]);
2435   if ((third_char_code & 0xdf) == 78) {
2436     return (ctou32(phenostr_iter[2]) > ' ');
2437   }
2438   return (third_char_code > 32);
2439 }
2440 
FirstCcOrQtPhenoIdx(const PhenoCol * pheno_cols,uint32_t pheno_ct)2441 uint32_t FirstCcOrQtPhenoIdx(const PhenoCol* pheno_cols, uint32_t pheno_ct) {
2442   for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
2443     if (pheno_cols[pheno_idx].type_code < kPhenoDtypeCat) {
2444       return pheno_idx;
2445     }
2446   }
2447   return UINT32_MAX;
2448 }
2449 
IsConstCovar(const PhenoCol * covar_col,const uintptr_t * sample_include,uint32_t sample_ct)2450 uint32_t IsConstCovar(const PhenoCol* covar_col, const uintptr_t* sample_include, uint32_t sample_ct) {
2451   if (sample_ct < 2) {
2452     return 1;
2453   }
2454   uintptr_t sample_uidx_base = 0;
2455   uintptr_t cur_bits = sample_include[0];
2456   const uintptr_t initial_sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
2457   if (covar_col->type_code == kPhenoDtypeQt) {
2458     const double* covar_vals = covar_col->data.qt;
2459     const double first_covar_val = covar_vals[initial_sample_uidx];
2460     for (uint32_t sample_idx = 1; sample_idx != sample_ct; ++sample_idx) {
2461       const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
2462       if (covar_vals[sample_uidx] != first_covar_val) {
2463         return 0;
2464       }
2465     }
2466     return 1;
2467   }
2468   assert(covar_col->type_code == kPhenoDtypeCat);
2469   const uint32_t* covar_cats = covar_col->data.cat;
2470   const uint32_t first_covar_cat = covar_cats[initial_sample_uidx];
2471   for (uint32_t sample_idx = 1; sample_idx != sample_ct; ++sample_idx) {
2472     const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
2473     if (covar_cats[sample_uidx] != first_covar_cat) {
2474       return 0;
2475     }
2476   }
2477   return 1;
2478 }
2479 
IdentifyRemainingCats(const uintptr_t * sample_include,const PhenoCol * covar_col,uint32_t sample_ct,uintptr_t * observed_cat_bitarr)2480 uint32_t IdentifyRemainingCats(const uintptr_t* sample_include, const PhenoCol* covar_col, uint32_t sample_ct, uintptr_t* observed_cat_bitarr) {
2481   // assumes covar_col->type_code == kPhenoTypeCat
2482   const uint32_t nonnull_cat_ct = covar_col->nonnull_category_ct;
2483   const uint32_t* covar_cats = covar_col->data.cat;
2484   const uint32_t word_ct = 1 + (nonnull_cat_ct / kBitsPerWord);
2485   ZeroWArr(word_ct, observed_cat_bitarr);
2486   uintptr_t sample_uidx_base = 0;
2487   uintptr_t cur_bits = sample_include[0];
2488   for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
2489     const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
2490     SetBit(covar_cats[sample_uidx], observed_cat_bitarr);
2491   }
2492   return PopcountWords(observed_cat_bitarr, word_ct);
2493 }
2494 
2495 // returns index of most common category
IdentifyRemainingCatsAndMostCommon(const uintptr_t * sample_include,const PhenoCol * covar_col,uint32_t sample_ct,uintptr_t * observed_cat_bitarr,uint32_t * cat_obs_buf)2496 uint32_t IdentifyRemainingCatsAndMostCommon(const uintptr_t* sample_include, const PhenoCol* covar_col, uint32_t sample_ct, uintptr_t* observed_cat_bitarr, uint32_t* cat_obs_buf) {
2497   // assumes covar_col->type_code == kPhenoTypeCat
2498   const uint32_t nonnull_cat_ct = covar_col->nonnull_category_ct;
2499   const uint32_t* covar_cats = covar_col->data.cat;
2500   const uint32_t word_ct = 1 + (nonnull_cat_ct / kBitsPerWord);
2501   ZeroWArr(word_ct, observed_cat_bitarr);
2502   ZeroU32Arr(nonnull_cat_ct + 1, cat_obs_buf);
2503   uintptr_t sample_uidx_base = 0;
2504   uintptr_t cur_bits = sample_include[0];
2505   for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
2506     const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
2507     cat_obs_buf[covar_cats[sample_uidx]] += 1;
2508     SetBit(covar_cats[sample_uidx], observed_cat_bitarr);
2509   }
2510   if (cat_obs_buf[0]) {
2511     // don't actually need this for now since we're only calling this with the
2512     // missing category excluded, but let's be consistent.
2513     SetBit(0, observed_cat_bitarr);
2514   }
2515   uint32_t best_cat_idx = 0;
2516   uint32_t best_cat_obs_ct = 0;
2517   for (uint32_t cat_idx = 1; cat_idx <= nonnull_cat_ct; ++cat_idx) {
2518     const uint32_t cat_obs_ct = cat_obs_buf[cat_idx];
2519     if (cat_obs_ct) {
2520       SetBit(cat_idx, observed_cat_bitarr);
2521       if (cat_obs_ct > best_cat_obs_ct) {
2522         best_cat_idx = cat_idx;
2523         best_cat_obs_ct = cat_obs_ct;
2524       }
2525     }
2526   }
2527   return best_cat_idx;
2528 }
2529 
GetCatSamples(const uintptr_t * sample_include_base,const PhenoCol * cat_pheno_col,uint32_t raw_sample_ctl,uint32_t sample_ct,uint32_t cat_uidx,uintptr_t * cur_cat_samples)2530 uint32_t GetCatSamples(const uintptr_t* sample_include_base, const PhenoCol* cat_pheno_col, uint32_t raw_sample_ctl, uint32_t sample_ct, uint32_t cat_uidx, uintptr_t* cur_cat_samples) {
2531   ZeroWArr(raw_sample_ctl, cur_cat_samples);
2532   const uint32_t* cat_vals = cat_pheno_col->data.cat;
2533   uintptr_t sample_uidx_base = 0;
2534   uintptr_t cur_bits = sample_include_base[0];
2535   for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
2536     const uintptr_t sample_uidx = BitIter1(sample_include_base, &sample_uidx_base, &cur_bits);
2537     if (cat_vals[sample_uidx] == cat_uidx) {
2538       SetBit(sample_uidx, cur_cat_samples);
2539     }
2540   }
2541   return PopcountWords(cur_cat_samples, raw_sample_ctl);
2542 }
2543 
CleanupPhenoCols(uint32_t pheno_ct,PhenoCol * pheno_cols)2544 void CleanupPhenoCols(uint32_t pheno_ct, PhenoCol* pheno_cols) {
2545   if (pheno_cols) {
2546     for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
2547       vecaligned_free_cond(pheno_cols[pheno_idx].nonmiss);
2548     }
2549     free(pheno_cols);
2550   }
2551 }
2552 
ParseChrRanges(const char * const * argvk,const char * flagname_p,const char * errstr_append,uint32_t param_ct,uint32_t allow_extra_chrs,uint32_t xymt_subtract,char range_delim,ChrInfo * cip,uintptr_t * chr_mask)2553 PglErr ParseChrRanges(const char* const* argvk, const char* flagname_p, const char* errstr_append, uint32_t param_ct, uint32_t allow_extra_chrs, uint32_t xymt_subtract, char range_delim, ChrInfo* cip, uintptr_t* chr_mask) {
2554   PglErr reterr = kPglRetSuccess;
2555   {
2556     const char* cur_arg_ptr = argvk[1];
2557     char* token_buf = g_textbuf;
2558     const char* range_end = nullptr;
2559     uint32_t cur_param_idx = 1;
2560     uint32_t rs_len = 0;
2561     uint32_t re_len = 0;
2562     while (1) {
2563       const char* range_start;
2564       if (unlikely(ParseNextRange(argvk, param_ct, range_delim, &cur_param_idx, &cur_arg_ptr, &range_start, &rs_len, &range_end, &re_len))) {
2565         snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s parameter '%s'.\n", flagname_p, argvk[cur_param_idx]);
2566         goto ParseChrRanges_ret_INVALID_CMDLINE_WWA;
2567       }
2568       if (!range_start) {
2569         break;
2570       }
2571       // Could avoid this copy, but only at the cost of mutating argv.  Which
2572       // isn't actually problematic--it's not like we have multiple threads
2573       // parsing the command line--but it's not like this is a performance
2574       // bottleneck either.
2575       // ParseNextRange() prevents buffer overflow.
2576       memcpyx(token_buf, range_start, rs_len, '\0');
2577       uint32_t chr_code_start = GetChrCodeRaw(token_buf);
2578       if (IsI32Neg(chr_code_start)) {
2579         if (unlikely(!allow_extra_chrs)) {
2580           snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s chromosome code '%s'.\n", flagname_p, token_buf);
2581           goto ParseChrRanges_ret_INVALID_CMDLINE_WWA;
2582         }
2583         if (unlikely(range_end)) {
2584           goto ParseChrRanges_ret_INVALID_CMDLINE_NONSTD;
2585         }
2586         if (unlikely(PushLlStr(token_buf, &(cip->incl_excl_name_stack)))) {
2587           goto ParseChrRanges_ret_NOMEM;
2588         }
2589       } else {
2590         if (chr_code_start >= kMaxContigs) {
2591           chr_code_start -= xymt_subtract;
2592         }
2593         if (range_end) {
2594           memcpyx(token_buf, range_end, re_len, '\0');
2595           uint32_t chr_code_end = GetChrCodeRaw(token_buf);
2596           if (unlikely(IsI32Neg(chr_code_end))) {
2597             if (!allow_extra_chrs) {
2598               snprintf(g_logbuf, kLogbufSize, "Error: Invalid --%s chromosome code '%s'.\n", flagname_p, range_end);
2599               goto ParseChrRanges_ret_INVALID_CMDLINE_WWA;
2600             }
2601             goto ParseChrRanges_ret_INVALID_CMDLINE_NONSTD;
2602           }
2603           if (unlikely(chr_code_end >= kMaxContigs)) {
2604             // prohibit stuff like "--chr par1-par2", "--chr x-y", "--chr x-26"
2605             snprintf(g_logbuf, kLogbufSize, "Error: --%s chromosome code '%s' cannot be the end of a range.\n", flagname_p, range_end);
2606             goto ParseChrRanges_ret_INVALID_CMDLINE_WWA;
2607           }
2608           if (unlikely(chr_code_end <= chr_code_start)) {
2609             snprintf(g_logbuf, kLogbufSize, "Error: --%s chromosome code '%s' is not greater than '%s'.\n", flagname_p, range_end, range_start);
2610             goto ParseChrRanges_ret_INVALID_CMDLINE_WWA;
2611           }
2612           FillBitsNz(chr_code_start, chr_code_end + 1, chr_mask);
2613         } else {
2614           SetBit(chr_code_start, chr_mask);
2615         }
2616       }
2617     }
2618     // no compelling reason to prohibit "--not-chr ,"
2619   }
2620   while (0) {
2621   ParseChrRanges_ret_NOMEM:
2622     reterr = kPglRetNomem;
2623     break;
2624   ParseChrRanges_ret_INVALID_CMDLINE_NONSTD:
2625     logerrputs("Error: Chromosome ranges cannot include nonstandard names.\n");
2626     reterr = kPglRetInvalidCmdline;
2627     break;
2628   ParseChrRanges_ret_INVALID_CMDLINE_WWA:
2629     WordWrapB(0);
2630     logerrputsb();
2631     logerrputs(errstr_append);
2632     reterr = kPglRetInvalidCmdline;
2633     break;
2634   }
2635   return reterr;
2636 }
2637 
2638 /*
2639 uint32_t MultiallelicVariantPresent(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t variant_ct) {
2640   if (!allele_idx_offsets) {
2641     return 0;
2642   }
2643   uintptr_t variant_uidx_base = 0;
2644   uintptr_t cur_bits = variant_include[0];
2645   for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
2646     const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
2647     if (allele_idx_offsets[variant_uidx + 1] != allele_idx_offsets[variant_uidx] + 2) {
2648       return 1;
2649     }
2650   }
2651   return 0;
2652 }
2653 */
2654 
CountBiallelicVariants(const uintptr_t * variant_include,const uintptr_t * allele_idx_offsets,uint32_t variant_ct)2655 uint32_t CountBiallelicVariants(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t variant_ct) {
2656   if (!allele_idx_offsets) {
2657     return variant_ct;
2658   }
2659   uint32_t biallelic_variant_ct = 0;
2660   uintptr_t variant_uidx_base = 0;
2661   uintptr_t cur_bits = variant_include[0];
2662   for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
2663     const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
2664     biallelic_variant_ct += (allele_idx_offsets[variant_uidx + 1] == allele_idx_offsets[variant_uidx] + 2);
2665   }
2666   return biallelic_variant_ct;
2667 }
2668 
CountAlleles(const uintptr_t * variant_include,const uintptr_t * allele_idx_offsets,uint32_t raw_variant_ct,uint32_t variant_ct)2669 uintptr_t CountAlleles(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, uint32_t raw_variant_ct, uint32_t variant_ct) {
2670   if (!allele_idx_offsets) {
2671     return variant_ct * 2;
2672   }
2673   if (raw_variant_ct == variant_ct) {
2674     return allele_idx_offsets[raw_variant_ct];
2675   }
2676   uintptr_t allele_ct = 0;
2677   uint32_t variant_uidx = 0;
2678   while (variant_ct) {
2679     const uint32_t variant_uidx_start = AdvTo1Bit(variant_include, variant_uidx);
2680     variant_uidx = AdvBoundedTo0Bit(variant_include, variant_uidx_start + 1, raw_variant_ct);
2681     allele_ct += allele_idx_offsets[variant_uidx] - allele_idx_offsets[variant_uidx_start];
2682     variant_ct -= variant_uidx - variant_uidx_start;
2683   }
2684   return allele_ct;
2685 }
2686 
2687 /*
2688 uintptr_t GetMaxAlleleBlockSize(const uintptr_t* __restrict variant_include, const uintptr_t* __restrict allele_idx_offsets, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t read_block_size) {
2689   uintptr_t max_allele_block_size = 2 * read_block_size;
2690   if (!allele_idx_offsets) {
2691     return max_allele_block_size;
2692   }
2693   uint32_t variant_idx = 0;
2694   uint32_t variant_uidx_end = 0;
2695   while (1) {
2696     uint32_t variant_uidx = variant_uidx_end;
2697     variant_uidx_end += read_block_size;
2698     if (variant_uidx_end > raw_variant_ct) {
2699       variant_uidx_end = raw_variant_ct;
2700       read_block_size = raw_variant_ct - variant_uidx;
2701     }
2702     uint32_t cur_variant_ct = PopcountBitRange(variant_include, variant_uidx, variant_uidx_end);
2703     if (cur_variant_ct) {
2704       const uint32_t omitted_variant_ct = read_block_size - cur_variant_ct;
2705       const uintptr_t block_size_ubound = allele_idx_offsets[variant_uidx_end] - allele_idx_offsets[variant_uidx] - (2 * omitted_variant_ct);
2706       variant_idx += cur_variant_ct;
2707       if (block_size_ubound > max_allele_block_size) {
2708         // subtract at beginning of each variant_include bitblock, add at end.
2709         uintptr_t cur_allele_block_size = 0;
2710         do {
2711           const uint32_t cur_bitblock_start = AdvTo1Bit(variant_include, variant_uidx);
2712           // deliberate underflow
2713           cur_allele_block_size -= allele_idx_offsets[cur_bitblock_start];
2714           if (variant_uidx_end - cur_bitblock_start == cur_variant_ct) {
2715             cur_allele_block_size += allele_idx_offsets[variant_uidx_end];
2716             break;
2717           }
2718           variant_uidx = AdvTo0Bit(variant_include, cur_bitblock_start);
2719           cur_allele_block_size += allele_idx_offsets[variant_uidx];
2720           cur_variant_ct -= variant_uidx - cur_bitblock_start;
2721         } while (cur_variant_ct);
2722         if (cur_allele_block_size > max_allele_block_size) {
2723           max_allele_block_size = cur_allele_block_size;
2724         }
2725       }
2726       if (variant_idx == variant_ct) {
2727         return max_allele_block_size;
2728       }
2729     }
2730   }
2731 }
2732 */
2733 
GetMaxAltAlleleBlockSize(const uintptr_t * __restrict variant_include,const uintptr_t * __restrict allele_idx_offsets,uint32_t raw_variant_ct,uint32_t variant_ct,uint32_t read_block_size)2734 uintptr_t GetMaxAltAlleleBlockSize(const uintptr_t* __restrict variant_include, const uintptr_t* __restrict allele_idx_offsets, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t read_block_size) {
2735   uintptr_t max_alt_allele_block_size = read_block_size;
2736   if (!allele_idx_offsets) {
2737     return max_alt_allele_block_size;
2738   }
2739   uint32_t variant_idx = 0;
2740   for (uint32_t variant_uidx_end = 0; ; ) {
2741     uint32_t variant_uidx = variant_uidx_end;
2742     variant_uidx_end += read_block_size;
2743     if (variant_uidx_end > raw_variant_ct) {
2744       variant_uidx_end = raw_variant_ct;
2745       read_block_size = raw_variant_ct - variant_uidx;
2746     }
2747     uint32_t cur_variant_ct = PopcountBitRange(variant_include, variant_uidx, variant_uidx_end);
2748     if (cur_variant_ct) {
2749       const uint32_t omitted_variant_ct = read_block_size - cur_variant_ct;
2750       const uintptr_t cur_ubound = allele_idx_offsets[variant_uidx_end] - allele_idx_offsets[variant_uidx] - omitted_variant_ct - read_block_size;
2751       variant_idx += cur_variant_ct;
2752       if (cur_ubound > max_alt_allele_block_size) {
2753         // subtract at beginning of each variant_include bitblock, add at end.
2754         // deliberate underflow
2755         uintptr_t cur_alt_allele_block_size = -S_CAST(uintptr_t, cur_variant_ct);
2756         do {
2757           const uint32_t cur_bitblock_start = AdvTo1Bit(variant_include, variant_uidx);
2758           // deliberate underflow
2759           cur_alt_allele_block_size -= allele_idx_offsets[cur_bitblock_start];
2760           if (variant_uidx_end - cur_bitblock_start == cur_variant_ct) {
2761             cur_alt_allele_block_size += allele_idx_offsets[variant_uidx_end];
2762             break;
2763           }
2764           variant_uidx = AdvTo0Bit(variant_include, cur_bitblock_start);
2765           cur_alt_allele_block_size += allele_idx_offsets[variant_uidx];
2766           cur_variant_ct -= variant_uidx - cur_bitblock_start;
2767         } while (cur_variant_ct);
2768         if (cur_alt_allele_block_size > max_alt_allele_block_size) {
2769           max_alt_allele_block_size = cur_alt_allele_block_size;
2770         }
2771       }
2772       if (variant_idx == variant_ct) {
2773         return max_alt_allele_block_size;
2774       }
2775     }
2776   }
2777 }
2778 
GetMhcWordCt(uintptr_t sample_ct)2779 uintptr_t GetMhcWordCt(uintptr_t sample_ct) {
2780   const uintptr_t sample_ctl = BitCtToWordCt(sample_ct);
2781   const uintptr_t patch_01_max_word_ct = sample_ctl + DivUp(sizeof(AlleleCode) * sample_ct, kBytesPerWord);
2782   const uintptr_t patch_10_max_word_ct = sample_ctl + DivUp(2 * sizeof(AlleleCode) * sample_ct, kBytesPerWord);
2783   return RoundUpPow2(patch_01_max_word_ct, kWordsPerVec) + RoundUpPow2(patch_10_max_word_ct, kWordsPerVec);
2784 }
2785 
2786 PglErr PgenMtLoadInit(const uintptr_t* variant_include, uint32_t sample_ct, uint32_t variant_ct, uintptr_t bytes_avail, uintptr_t pgr_alloc_cacheline_ct, uintptr_t thread_xalloc_cacheline_ct, uintptr_t per_variant_xalloc_byte_ct, uintptr_t per_alt_allele_xalloc_byte_ct, PgenFileInfo* pgfip, uint32_t* calc_thread_ct_ptr, uintptr_t*** genovecs_ptr, uintptr_t*** mhc_ptr, uintptr_t*** phasepresent_ptr, uintptr_t*** phaseinfo_ptr, uintptr_t*** dosage_present_ptr, Dosage*** dosage_mains_ptr, uintptr_t*** dphase_present_ptr, SDosage*** dphase_delta_ptr, uint32_t* read_block_size_ptr, uintptr_t* max_alt_allele_block_size_ptr, STD_ARRAY_REF(unsigned char*, 2) main_loadbufs, PgenReader*** pgr_pps, uint32_t** read_variant_uidx_starts_ptr) {
2787   uintptr_t cachelines_avail = bytes_avail / kCacheline;
2788   uint32_t read_block_size = kPglVblockSize;
2789   uint64_t multiread_cacheline_ct;
2790   for (; ; read_block_size /= 2) {
2791     multiread_cacheline_ct = PgfiMultireadGetCachelineReq(variant_include, pgfip, variant_ct, read_block_size);
2792     // limit each raw load buffer to 1/4 of remaining workspace
2793     // if there's an additional per-variant allocation, put it in the same bin
2794     // as the load buffers
2795     if ((multiread_cacheline_ct + (S_CAST(uint64_t, per_variant_xalloc_byte_ct) * read_block_size) / kCacheline) * 4 <= cachelines_avail) {
2796       break;
2797     }
2798     // lots of callers require read_block_size to be either raw_variant_ct or a
2799     // multiple of kBitsPerVec
2800 #ifdef __LP64__
2801     if (read_block_size <= kBitsPerVec) {
2802       return kPglRetNomem;
2803     }
2804 #else
2805     if (read_block_size <= kCacheline) {
2806       return kPglRetNomem;
2807     }
2808 #endif
2809   }
2810 #ifndef __LP64__
2811   if (multiread_cacheline_ct > (kMaxBytesPerIO / kCacheline)) {
2812     return kPglRetNomem;
2813   }
2814 #endif
2815   main_loadbufs[0] = S_CAST(unsigned char*, bigstack_alloc_raw(multiread_cacheline_ct * kCacheline));
2816   main_loadbufs[1] = S_CAST(unsigned char*, bigstack_alloc_raw(multiread_cacheline_ct * kCacheline));
2817   pgfip->block_base = main_loadbufs[0];
2818   *read_block_size_ptr = read_block_size;
2819   cachelines_avail -= 2 * (multiread_cacheline_ct + (S_CAST(uint64_t, per_variant_xalloc_byte_ct) * read_block_size) / kCacheline);
2820   if (per_alt_allele_xalloc_byte_ct) {
2821     const uintptr_t max_alt_allele_block_size = GetMaxAltAlleleBlockSize(variant_include, pgfip->allele_idx_offsets, pgfip->raw_variant_ct, variant_ct, read_block_size);
2822 
2823     cachelines_avail -= DivUpU64(S_CAST(uint64_t, per_alt_allele_xalloc_byte_ct) * read_block_size, kCacheline);
2824     // we assume it's unnecessary to return this if
2825     // per_alt_allele_xalloc_byte_ct is zero
2826     *max_alt_allele_block_size_ptr = max_alt_allele_block_size;
2827   }
2828   // reduce calc_thread_ct if necessary
2829   uint32_t calc_thread_ct = *calc_thread_ct_ptr;
2830   if (calc_thread_ct > read_block_size) {
2831     calc_thread_ct = read_block_size;
2832     *calc_thread_ct_ptr = calc_thread_ct;
2833   }
2834 
2835   // pgr_pps, read_variant_uidx_starts_ptr, (*pgr_pps)[tidx], pgr_alloc;
2836   //   deliberately a slight overestimate
2837   const uintptr_t pgr_struct_alloc = RoundUpPow2(sizeof(PgenReader), kCacheline);
2838   uintptr_t thread_alloc_cacheline_ct = 1 + 1 + (pgr_struct_alloc / kCacheline) + pgr_alloc_cacheline_ct + thread_xalloc_cacheline_ct;
2839 
2840   const uint32_t sample_ctcl2 = NypCtToCachelineCt(sample_ct);
2841   const uint32_t sample_ctcl = BitCtToCachelineCt(sample_ct);
2842 
2843   // todo: multiallelic dosage
2844   const uintptr_t dosage_main_cl = DivUp(sample_ct, (kCacheline / sizeof(Dosage)));
2845   uintptr_t mhc_cl = 0;
2846   if (genovecs_ptr) {
2847     thread_alloc_cacheline_ct += 1 + sample_ctcl2;
2848     if (mhc_ptr) {
2849       mhc_cl = DivUp(GetMhcWordCt(sample_ct), kWordsPerCacheline);
2850       thread_alloc_cacheline_ct += 1 + mhc_cl;
2851     }
2852     if (phasepresent_ptr) {
2853       assert(phaseinfo_ptr);
2854       thread_alloc_cacheline_ct += 2 + sample_ctcl;
2855     }
2856     if (dosage_present_ptr) {
2857       assert(dosage_mains_ptr);
2858       thread_alloc_cacheline_ct += 2 + sample_ctcl + dosage_main_cl;
2859       if (dphase_present_ptr) {
2860         assert(dphase_delta_ptr);
2861         thread_alloc_cacheline_ct += 2 + sample_ctcl + dosage_main_cl;
2862       }
2863     }
2864   }
2865   if (thread_alloc_cacheline_ct * calc_thread_ct > cachelines_avail) {
2866     if (thread_alloc_cacheline_ct > cachelines_avail) {
2867       return kPglRetNomem;
2868     }
2869     calc_thread_ct = cachelines_avail / thread_alloc_cacheline_ct;
2870     *calc_thread_ct_ptr = calc_thread_ct;
2871   }
2872 
2873   const uint32_t array_of_ptrs_alloc = RoundUpPow2(calc_thread_ct * sizeof(intptr_t), kCacheline);
2874   *pgr_pps = S_CAST(PgenReader**, bigstack_alloc_raw(array_of_ptrs_alloc));
2875   *read_variant_uidx_starts_ptr = S_CAST(uint32_t*, bigstack_alloc_raw_rd(calc_thread_ct * sizeof(int32_t)));
2876   for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
2877     (*pgr_pps)[tidx] = S_CAST(PgenReader*, bigstack_alloc_raw(pgr_struct_alloc));
2878     // PreinitPgr(g_pgr_ptrs[tidx]);
2879     unsigned char* pgr_alloc = S_CAST(unsigned char*, bigstack_alloc_raw(pgr_alloc_cacheline_ct * kCacheline));
2880 
2881     // shouldn't be possible for this to fail
2882     PgrInit(nullptr, 0, pgfip, (*pgr_pps)[tidx], pgr_alloc);
2883   }
2884   if (genovecs_ptr) {
2885     *genovecs_ptr = S_CAST(uintptr_t**, bigstack_alloc_raw(array_of_ptrs_alloc));
2886     if (mhc_ptr) {
2887       *mhc_ptr = S_CAST(uintptr_t**, bigstack_alloc_raw(array_of_ptrs_alloc));
2888     }
2889     if (phasepresent_ptr) {
2890       *phasepresent_ptr = S_CAST(uintptr_t**, bigstack_alloc_raw(array_of_ptrs_alloc));
2891       *phaseinfo_ptr = S_CAST(uintptr_t**, bigstack_alloc_raw(array_of_ptrs_alloc));
2892     }
2893     if (dosage_present_ptr) {
2894       *dosage_present_ptr = S_CAST(uintptr_t**, bigstack_alloc_raw(array_of_ptrs_alloc));
2895       *dosage_mains_ptr = S_CAST(Dosage**, bigstack_alloc_raw(array_of_ptrs_alloc));
2896       if (dphase_present_ptr) {
2897         *dphase_present_ptr = S_CAST(uintptr_t**, bigstack_alloc_raw(array_of_ptrs_alloc));
2898         *dphase_delta_ptr = S_CAST(SDosage**, bigstack_alloc_raw(array_of_ptrs_alloc));
2899       }
2900     }
2901     const uintptr_t genovec_alloc = sample_ctcl2 * kCacheline;
2902     const uintptr_t bitarray_alloc = sample_ctcl * kCacheline;
2903     const uintptr_t dosage_main_alloc = dosage_main_cl * kCacheline;
2904     for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
2905       (*genovecs_ptr)[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(genovec_alloc));
2906       if (mhc_ptr) {
2907         (*mhc_ptr)[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(mhc_cl * kCacheline));
2908       }
2909       if (phasepresent_ptr) {
2910         (*phasepresent_ptr)[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitarray_alloc));
2911         (*phaseinfo_ptr)[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitarray_alloc));
2912       }
2913       if (dosage_present_ptr) {
2914         (*dosage_present_ptr)[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitarray_alloc));
2915         (*dosage_mains_ptr)[tidx] = S_CAST(Dosage*, bigstack_alloc_raw(dosage_main_alloc));
2916         if (dphase_present_ptr) {
2917           (*dphase_present_ptr)[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitarray_alloc));
2918           (*dphase_delta_ptr)[tidx] = S_CAST(SDosage*, bigstack_alloc_raw(dosage_main_alloc));
2919         }
2920       }
2921     }
2922   }
2923   return kPglRetSuccess;
2924 }
2925 
MultireadNonempty(const uintptr_t * variant_include,const ThreadGroup * tgp,uint32_t raw_variant_ct,uint32_t read_block_size,PgenFileInfo * pgfip,uint32_t * read_block_idxp,PglErr * reterrp)2926 uint32_t MultireadNonempty(const uintptr_t* variant_include, const ThreadGroup* tgp, uint32_t raw_variant_ct, uint32_t read_block_size, PgenFileInfo* pgfip, uint32_t* read_block_idxp, PglErr* reterrp) {
2927   if (IsLastBlock(tgp)) {
2928     return 0;
2929   }
2930   // This condition ensures the PopcountWords() calls are valid.  If it's
2931   // ever inconvenient, PopcountBitRange() can be used instead.
2932   assert((!(read_block_size % kBitsPerVec)) || (raw_variant_ct <= read_block_size));
2933 
2934   const uint32_t read_block_sizel = read_block_size / kBitsPerWord;
2935   uint32_t read_block_idx = *read_block_idxp;
2936   uint32_t offset = read_block_idx * read_block_size;
2937   uint32_t cur_read_block_size = read_block_size;
2938   uint32_t cur_block_write_ct;
2939   for (; ; ++read_block_idx, offset += read_block_size) {
2940     if (offset + read_block_size >= raw_variant_ct) {
2941       cur_read_block_size = raw_variant_ct - offset;
2942       cur_block_write_ct = PopcountWords(&(variant_include[read_block_idx * read_block_sizel]), BitCtToWordCt(cur_read_block_size));
2943       assert(cur_block_write_ct);  // otherwise, IsLastBlock should be set
2944       break;
2945     }
2946     cur_block_write_ct = PopcountWords(&(variant_include[read_block_idx * read_block_sizel]), read_block_sizel);
2947     if (cur_block_write_ct) {
2948       break;
2949     }
2950   }
2951   *read_block_idxp = read_block_idx;
2952   *reterrp = PgfiMultiread(variant_include, offset, offset + cur_read_block_size, cur_block_write_ct, pgfip);
2953   return cur_block_write_ct;
2954 }
2955 
ExpandMhc(uint32_t sample_ct,uintptr_t * mhc,uintptr_t ** patch_01_set_ptr,AlleleCode ** patch_01_vals_ptr,uintptr_t ** patch_10_set_ptr,AlleleCode ** patch_10_vals_ptr)2956 void ExpandMhc(uint32_t sample_ct, uintptr_t* mhc, uintptr_t** patch_01_set_ptr, AlleleCode** patch_01_vals_ptr, uintptr_t** patch_10_set_ptr, AlleleCode** patch_10_vals_ptr) {
2957   const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
2958   *patch_01_set_ptr = mhc;
2959   AlleleCode* patch_01_vals = R_CAST(AlleleCode*, &(mhc[sample_ctl]));
2960   *patch_01_vals_ptr = patch_01_vals;
2961   AlleleCode* patch_01_vals_end = &(patch_01_vals[sample_ct]);
2962   VecAlignUp(&patch_01_vals_end);
2963   uintptr_t* patch_10_set = R_CAST(uintptr_t*, patch_01_vals_end);
2964   *patch_10_set_ptr = patch_10_set;
2965   *patch_10_vals_ptr = R_CAST(AlleleCode*, &(patch_10_set[sample_ctl]));
2966 }
2967 
WriteSampleIdsOverride(const uintptr_t * sample_include,const SampleIdInfo * siip,const char * outname,uint32_t sample_ct,SampleIdFlags override_flags)2968 PglErr WriteSampleIdsOverride(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* outname, uint32_t sample_ct, SampleIdFlags override_flags) {
2969   FILE* outfile = nullptr;
2970   PglErr reterr = kPglRetSuccess;
2971   {
2972     if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
2973       goto WriteSampleIdsOverride_ret_OPEN_FAIL;
2974     }
2975     const char* sample_ids = siip->sample_ids;
2976     const char* sids = siip->sids;
2977     const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
2978     const uintptr_t max_sid_blen = siip->max_sid_blen;
2979     char* write_iter = g_textbuf;
2980     char* textbuf_flush = &(write_iter[kMaxMediumLine]);
2981     if (!(override_flags & kfSampleIdNoIdHeader)) {
2982       *write_iter++ = '#';
2983       if (override_flags & kfSampleIdFidPresent) {
2984         write_iter = strcpya_k(write_iter, "FID\t");
2985       } else {
2986         // every single row starts with "0\t", so this causes only IIDs to be
2987         // reported
2988         sample_ids = &(sample_ids[2]);
2989       }
2990       write_iter = strcpya_k(write_iter, "IID");
2991       if (sids) {
2992         write_iter = strcpya_k(write_iter, "\tSID");
2993       }
2994       AppendBinaryEoln(&write_iter);
2995     } else {
2996       if (override_flags & kfSampleIdNoIdHeaderIidOnly) {
2997         // We've previously verified that all FIDs are in fact '0'.
2998         sample_ids = &(sample_ids[2]);
2999       }
3000       sids = nullptr;
3001     }
3002     uintptr_t sample_uidx_base = 0;
3003     uintptr_t cur_bits = sample_include[0];
3004     for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
3005       const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
3006       write_iter = strcpya(write_iter, &(sample_ids[sample_uidx * max_sample_id_blen]));
3007       if (sids) {
3008         *write_iter++ = '\t';
3009         write_iter = strcpya(write_iter, &(sids[sample_uidx * max_sid_blen]));
3010       }
3011       AppendBinaryEoln(&write_iter);
3012       if (unlikely(fwrite_ck(textbuf_flush, outfile, &write_iter))) {
3013         goto WriteSampleIdsOverride_ret_WRITE_FAIL;
3014       }
3015     }
3016     if (unlikely(fclose_flush_null(textbuf_flush, write_iter, &outfile))) {
3017       goto WriteSampleIdsOverride_ret_WRITE_FAIL;
3018     }
3019   }
3020   while (0) {
3021   WriteSampleIdsOverride_ret_OPEN_FAIL:
3022     reterr = kPglRetOpenFail;
3023     break;
3024   WriteSampleIdsOverride_ret_WRITE_FAIL:
3025     reterr = kPglRetWriteFail;
3026     break;
3027   }
3028   fclose_cond(outfile);
3029   return reterr;
3030 }
3031 
RealpathIdentical(const char * outname,const char * read_realpath,char * write_realpath_buf)3032 uint32_t RealpathIdentical(const char* outname, const char* read_realpath, char* write_realpath_buf) {
3033 #ifdef _WIN32
3034   const uint32_t fname_slen = GetFullPathName(outname, kPglFnamesize, write_realpath_buf, nullptr);
3035   return (fname_slen && (fname_slen <= kPglFnamesize) && memequal(read_realpath, write_realpath_buf, fname_slen + 1));
3036 #else
3037   return (realpath(outname, write_realpath_buf) && strequal_overread(read_realpath, write_realpath_buf));
3038 #endif
3039 }
3040 
3041 // assumes rawval is in [1, 32767]
3042 static_assert(kDosageMax == 32768, "PrintHaploidNonintDosage() needs to be updated.");
PrintHaploidNonintDosage(uint32_t rawval,char * start)3043 char* PrintHaploidNonintDosage(uint32_t rawval, char* start) {
3044   // Instead of constant 5-digit precision, we print fewer digits whenever that
3045   // doesn't interfere with proper round-tripping.  I.e. we search for the
3046   // shortest string in
3047   //   ((n - 0.5)/32768, (n + 0.5)/32768).
3048   assert(rawval - 1 < 32767);
3049   start = strcpya_k(start, "0.");
3050 
3051   // (rawval * 2) is in 65536ths
3052   // 65536 * 625 = 40960k
3053   const uint32_t range_top_40960k = rawval * 1250 + 625;
3054   // ok to check half-open interval since we never hit boundary
3055   if ((range_top_40960k % 4096) < 1250) {
3056     // when this is true, the four-decimal-place approximation is in the range
3057     // which round-trips back to our original number.
3058     const uint32_t four_decimal_places = range_top_40960k / 4096;
3059     return u32toa_trunc4(four_decimal_places, start);
3060   }
3061 
3062   // we wish to print (100000 * remainder + 16384) / 32768, left-0-padded.  and
3063   // may as well banker's round too.
3064   //
3065   // banker's rounding yields a different result than regular rounding for n/64
3066   // when n is congruent to 1 mod 4.  32768/64 = 512.
3067   const uint32_t five_decimal_places = ((3125 * rawval + 512) / 1024) - ((rawval % 2048) == 512);
3068   const uint32_t first_decimal_place = five_decimal_places / 10000;
3069   *start++ = '0' + first_decimal_place;
3070   const uint32_t last_four_digits = five_decimal_places - first_decimal_place * 10000;
3071   if (last_four_digits) {
3072     return u32toa_trunc4(last_four_digits, start);
3073   }
3074   return start;
3075 }
3076 
PrintMultiallelicHcAsDs(uint32_t hc1,uint32_t hc2,uint32_t allele_ct,char * start)3077 char* PrintMultiallelicHcAsDs(uint32_t hc1, uint32_t hc2, uint32_t allele_ct, char* start) {
3078   if (hc1 == kMissingAlleleCode) {
3079     *start++ = '.';
3080     return start;
3081   }
3082   for (uint32_t uii = 1; uii < hc1; ++uii) {
3083     start = strcpya_k(start, "0,");
3084   }
3085   if (hc1 == hc2) {
3086     if (hc1) {
3087       start = strcpya_k(start, "2,");
3088     }
3089     for (uint32_t uii = hc1 + 1; uii != allele_ct; ++uii) {
3090       start = strcpya_k(start, "0,");
3091     }
3092     return &(start[-1]);
3093   }
3094   if (hc1) {
3095     start = strcpya_k(start, "1,");
3096   }
3097   for (uint32_t uii = hc1 + 1; uii != hc2; ++uii) {
3098     start = strcpya_k(start, "0,");
3099   }
3100   start = strcpya_k(start, "1,");
3101   for (uint32_t uii = hc2 + 1; uii != allele_ct; ++uii) {
3102     start = strcpya_k(start, "0,");
3103   }
3104   return &(start[-1]);
3105 }
3106 
PrintMultiallelicHcAsHaploidDs(uint32_t hc1,uint32_t hc2,uint32_t allele_ct,char * start)3107 char* PrintMultiallelicHcAsHaploidDs(uint32_t hc1, uint32_t hc2, uint32_t allele_ct, char* start) {
3108   if (hc1 == kMissingAlleleCode) {
3109     *start++ = '.';
3110     return start;
3111   }
3112   for (uint32_t uii = 1; uii < hc1; ++uii) {
3113     start = strcpya_k(start, "0,");
3114   }
3115   if (hc1 == hc2) {
3116     if (hc1) {
3117       start = strcpya_k(start, "1,");
3118     }
3119     for (uint32_t uii = hc1 + 1; uii != allele_ct; ++uii) {
3120       start = strcpya_k(start, "0,");
3121     }
3122     return &(start[-1]);
3123   }
3124   if (hc1) {
3125     start = strcpya_k(start, "0.5,");
3126   }
3127   for (uint32_t uii = hc1 + 1; uii != hc2; ++uii) {
3128     start = strcpya_k(start, "0,");
3129   }
3130   start = strcpya_k(start, "0.5,");
3131   for (uint32_t uii = hc2 + 1; uii != allele_ct; ++uii) {
3132     start = strcpya_k(start, "0,");
3133   }
3134   return &(start[-1]);
3135 }
3136 
3137 const char g_vft_names[3][18] = {"extract", "extract-intersect", "exclude"};
3138 
3139 #ifdef __cplusplus
3140 }  // namespace plink2
3141 #endif
3142