1 // This file is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation, either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "include/pgenlib_write.h"
19 #include "plink2_compress_stream.h"
20 #include "plink2_data.h"
21 #include "plink2_pvar.h"
22 
23 #include <time.h>
24 
25 #ifdef __cplusplus
26 namespace plink2 {
27 #endif
28 
29 PglErr WriteMapOrBim(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const double* variant_cms, uint32_t variant_ct, uint32_t max_allele_slen, char delim, uint32_t output_zst, uint32_t thread_ct) {
30   // - Normally generates a .bim file.  Set max_allele_slen to zero to generate
31   //   a .map.
32   // - allele_presents must be nullptr unless we're trimming alt alleles.
33   // - Errors out when writing .bim if any remaining variant is multiallelic
34   //   and refalt1_select is nullptr.
35   // - Multiallelic-split case is handled by WriteBimSplit().
36   unsigned char* bigstack_mark = g_bigstack_base;
37   char* cswritep = nullptr;
38   CompressStreamState css;
39   PglErr reterr = kPglRetSuccess;
40   PreinitCstream(&css);
41   {
42     const uint32_t max_chr_blen = GetMaxChrSlen(cip) + 1;
43     // includes trailing tab
44     char* chr_buf;
45     if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
46       goto WriteMapOrBim_ret_NOMEM;
47     }
48     const uintptr_t overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen;
49     reterr = InitCstreamAlloc(outname, 0, output_zst, thread_ct, overflow_buf_size, &css, &cswritep);
50     if (unlikely(reterr)) {
51       goto WriteMapOrBim_ret_1;
52     }
53 
54     const char output_missing_geno_char = *g_output_missing_geno_ptr;
55     uintptr_t variant_uidx_base = 0;
56     uintptr_t cur_bits = variant_include[0];
57     uint32_t chr_fo_idx = UINT32_MAX;
58     uint32_t chr_end = 0;
59     uint32_t chr_buf_blen = 0;
60     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
61       const uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
62       if (variant_uidx >= chr_end) {
63         do {
64           ++chr_fo_idx;
65           chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
66         } while (variant_uidx >= chr_end);
67         char* chr_name_end = chrtoa(cip, cip->chr_file_order[chr_fo_idx], chr_buf);
68         *chr_name_end = delim;
69         chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
70       }
71       cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
72       cswritep = strcpyax(cswritep, variant_ids[variant_uidx], delim);
73       if (!variant_cms) {
74         *cswritep++ = '0';
75       } else {
76         cswritep = dtoa_g_p8(variant_cms[variant_uidx], cswritep);
77       }
78       *cswritep++ = delim;
79       cswritep = u32toa(variant_bps[variant_uidx], cswritep);
80       if (max_allele_slen) {
81         *cswritep++ = delim;
82         uintptr_t allele_idx_offset_base = variant_uidx * 2;
83         if (allele_idx_offsets) {
84           allele_idx_offset_base = allele_idx_offsets[variant_uidx];
85           if (!refalt1_select) {
86             const uintptr_t allele_idx_offset_end = allele_idx_offsets[variant_uidx + 1];
87             if (allele_idx_offset_end != allele_idx_offset_base + 2) {
88               // not actually unlikely at this point, but simplest to stay
89               // consistent
90               if (unlikely((!allele_presents) || (!AllBitsAreZero(allele_presents, 2 + allele_idx_offset_base, allele_idx_offset_end)))) {
91                 logputs("\n");
92                 logerrprintfww("Error: %s cannot contain multiallelic variants.\n", outname);
93                 goto WriteMapOrBim_ret_INCONSISTENT_INPUT;
94               }
95             }
96           }
97         }
98         const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
99         // note that VCF ref allele corresponds to A2, not A1
100         if (!refalt1_select) {
101           if ((!allele_presents) || IsSet(allele_presents, 1 + allele_idx_offset_base)) {
102             cswritep = strcpya(cswritep, cur_alleles[1]);
103           } else {
104             *cswritep++ = output_missing_geno_char;
105           }
106           *cswritep++ = delim;
107           cswritep = strcpya(cswritep, cur_alleles[0]);
108         } else {
109           STD_ARRAY_KREF(AlleleCode, 2) cur_refalt1_select = refalt1_select[variant_uidx];
110           if ((!allele_presents) || IsSet(allele_presents, cur_refalt1_select[1] + allele_idx_offset_base)) {
111             cswritep = strcpya(cswritep, cur_alleles[cur_refalt1_select[1]]);
112           } else {
113             *cswritep++ = output_missing_geno_char;
114           }
115           *cswritep++ = delim;
116           cswritep = strcpya(cswritep, cur_alleles[cur_refalt1_select[0]]);
117         }
118       }
119       AppendBinaryEoln(&cswritep);
120       if (unlikely(Cswrite(&css, &cswritep))) {
121         goto WriteMapOrBim_ret_WRITE_FAIL;
122       }
123     }
124     if (unlikely(CswriteCloseNull(&css, cswritep))) {
125       goto WriteMapOrBim_ret_WRITE_FAIL;
126     }
127   }
128   while (0) {
129   WriteMapOrBim_ret_NOMEM:
130     reterr = kPglRetNomem;
131     break;
132   WriteMapOrBim_ret_WRITE_FAIL:
133     reterr = kPglRetWriteFail;
134     break;
135   WriteMapOrBim_ret_INCONSISTENT_INPUT:
136     reterr = kPglRetInconsistentInput;
137     break;
138   }
139  WriteMapOrBim_ret_1:
140   CswriteCloseCond(&css, cswritep);
141   BigstackReset(bigstack_mark);
142   return reterr;
143 }
144 
PvarInfoReloadHeader(TextStream * pvar_reload_txsp,char ** line_iterp,uint32_t * info_col_idx_ptr)145 PglErr PvarInfoReloadHeader(TextStream* pvar_reload_txsp, char** line_iterp, uint32_t* info_col_idx_ptr) {
146   char* line_iter;
147   do {
148     PglErr reterr = TextNextLineLstrip(pvar_reload_txsp, &line_iter);
149     if (unlikely(reterr)) {
150       return reterr;
151     }
152   } while (!StrStartsWithUnsafe(line_iter, "#CHROM"));
153   uint32_t info_col_idx = 0;
154   do {
155     line_iter = NextToken(line_iter);
156     ++info_col_idx;
157   } while (!tokequal_k(line_iter, "INFO"));
158   *line_iterp = line_iter;
159   *info_col_idx_ptr = info_col_idx;
160   return kPglRetSuccess;
161 }
162 
163 // May use all remaining workspace memory.
PvarInfoOpenAndReloadHeader(const char * pvar_info_reload,uint32_t calc_thread_ct,TextStream * pvar_reload_txsp,char ** line_iterp,uint32_t * info_col_idx_ptr)164 PglErr PvarInfoOpenAndReloadHeader(const char* pvar_info_reload, uint32_t calc_thread_ct, TextStream* pvar_reload_txsp, char** line_iterp, uint32_t* info_col_idx_ptr) {
165   PglErr reterr = SizeAndInitTextStream(pvar_info_reload, bigstack_left(), calc_thread_ct, pvar_reload_txsp);
166   if (unlikely(reterr)) {
167     return reterr;
168   }
169   return PvarInfoReloadHeader(pvar_reload_txsp, line_iterp, info_col_idx_ptr);
170 }
171 
PvarInfoWrite(uint32_t info_pr_flag_present,uint32_t is_pr,char * info_token,char ** write_iter_ptr)172 void PvarInfoWrite(uint32_t info_pr_flag_present, uint32_t is_pr, char* info_token, char** write_iter_ptr) {
173   char* info_token_end = CurTokenEnd(info_token);
174   uint32_t info_token_slen = info_token_end - info_token;
175   char* info_token_pr = nullptr;
176   if (info_pr_flag_present) {
177     info_token_pr = PrInInfoToken(info_token_slen, info_token);
178   }
179   char* write_iter = *write_iter_ptr;
180   if (is_pr || (!info_token_pr))  {
181     write_iter = memcpya(write_iter, info_token, info_token_slen);
182     if (is_pr && (!info_token_pr)) {
183       if ((info_token_slen == 1) && (info_token[0] == '.')) {
184         write_iter[-1] = 'P';
185         *write_iter++ = 'R';
186       } else {
187         write_iter = strcpya_k(write_iter, ";PR");
188       }
189     }
190   } else {
191     // possible with --real-ref-alleles/--ref-from-fa
192     if (info_token_pr == info_token) {
193       if (info_token_slen == 2) {
194         *write_iter++ = '.';
195       } else {
196         write_iter = memcpya(write_iter, &(info_token[3]), info_token_slen - 3);
197       }
198     } else {
199       write_iter = memcpya(write_iter, info_token, S_CAST(uintptr_t, info_token_pr - info_token) - 1);
200       const char* pr_end = &(info_token_pr[2]);
201       write_iter = memcpya(write_iter, pr_end, info_token_end - pr_end);
202     }
203   }
204   *write_iter_ptr = write_iter;
205 }
206 
PvarInfoReload(uint32_t info_col_idx,uint32_t variant_uidx,TextStream * pvar_reload_txsp,char ** line_iterp,uint32_t * trs_variant_uidx_ptr)207 PglErr PvarInfoReload(uint32_t info_col_idx, uint32_t variant_uidx, TextStream* pvar_reload_txsp, char** line_iterp, uint32_t* trs_variant_uidx_ptr) {
208   uint32_t trs_variant_uidx = *trs_variant_uidx_ptr;
209   char* line_iter = AdvPastDelim(*line_iterp, '\n');
210   if (trs_variant_uidx < variant_uidx) {
211     TextSetPos(line_iter, pvar_reload_txsp);
212     PglErr reterr = TextSkipNz(variant_uidx - trs_variant_uidx, pvar_reload_txsp);
213     if (unlikely(reterr)) {
214       return reterr;
215     }
216     line_iter = TextLineEnd(pvar_reload_txsp);
217     trs_variant_uidx = variant_uidx;
218   }
219   PglErr reterr = TextNextLineLstripUnsafe(pvar_reload_txsp, &line_iter);
220   if (unlikely(reterr)) {
221     return reterr;
222   }
223   *line_iterp = NextTokenMultFar(line_iter, info_col_idx);
224 
225   // index *after* just-loaded line.
226   *trs_variant_uidx_ptr = trs_variant_uidx + 1;
227   return kPglRetSuccess;
228 }
229 
PvarInfoReloadAndWrite(uint32_t info_pr_flag_present,uint32_t info_col_idx,uint32_t variant_uidx,uint32_t is_pr,TextStream * pvar_reload_txsp,char ** line_iterp,char ** write_iter_ptr,uint32_t * trs_variant_uidx_ptr)230 PglErr PvarInfoReloadAndWrite(uint32_t info_pr_flag_present, uint32_t info_col_idx, uint32_t variant_uidx, uint32_t is_pr, TextStream* pvar_reload_txsp, char** line_iterp, char** write_iter_ptr, uint32_t* trs_variant_uidx_ptr) {
231   PglErr reterr = PvarInfoReload(info_col_idx, variant_uidx, pvar_reload_txsp, line_iterp, trs_variant_uidx_ptr);
232   if (unlikely(reterr)) {
233     return reterr;
234   }
235   PvarInfoWrite(info_pr_flag_present, is_pr, *line_iterp, write_iter_ptr);
236   return kPglRetSuccess;
237 }
238 
AppendChrsetLine(const ChrInfo * cip,char ** write_iter_ptr)239 void AppendChrsetLine(const ChrInfo* cip, char** write_iter_ptr) {
240   char* write_iter = strcpya_k(*write_iter_ptr, "##chrSet=<");
241   if (!(cip->haploid_mask[0] & 1)) {
242     write_iter = strcpya_k(write_iter, "autosomePairCt=");
243     write_iter = u32toa(cip->autosome_ct, write_iter);
244     if (!IsI32Neg(cip->xymt_codes[kChrOffsetX])) {
245       write_iter = strcpya_k(write_iter, ",X");
246     }
247     if (!IsI32Neg(cip->xymt_codes[kChrOffsetY])) {
248       write_iter = strcpya_k(write_iter, ",Y");
249     }
250     if (!IsI32Neg(cip->xymt_codes[kChrOffsetXY])) {
251       write_iter = strcpya_k(write_iter, ",XY");
252     }
253     if (!IsI32Neg(cip->xymt_codes[kChrOffsetMT])) {
254       write_iter = strcpya_k(write_iter, ",M");
255     }
256     if (!IsI32Neg(cip->xymt_codes[kChrOffsetPAR1])) {
257       write_iter = strcpya_k(write_iter, ",PAR1");
258     }
259     if (!IsI32Neg(cip->xymt_codes[kChrOffsetPAR2])) {
260       write_iter = strcpya_k(write_iter, ",PAR2");
261     }
262   } else {
263     write_iter = strcpya_k(write_iter, "haploidAutosomeCt=");
264     write_iter = u32toa(cip->autosome_ct, write_iter);
265   }
266   *write_iter++ = '>';
267   *write_iter_ptr = write_iter;
268   AppendBinaryEoln(write_iter_ptr);
269 }
270 
271 // fileformat, fileDate, source
AppendVcfHeaderStart(uint32_t v43,char ** cswritepp)272 void AppendVcfHeaderStart(uint32_t v43, char** cswritepp) {
273   char* cswritep = *cswritepp;
274   cswritep = strcpya_k(cswritep, "##fileformat=VCFv4.");
275   *cswritep++ = v43 + '2';
276   cswritep = strcpya_k(cswritep, EOLN_STR "##fileDate=");
277   time_t rawtime;
278   time(&rawtime);
279   const struct tm* loctime = localtime(&rawtime);
280   cswritep += strftime(cswritep, kMaxMediumLine, "%Y%m%d", loctime);
281   cswritep = strcpya_k(cswritep, EOLN_STR "##source=PLINKv2.00" EOLN_STR);
282   *cswritepp = cswritep;
283   return;
284 }
285 
286 // Note that the order-of-operations page lists this as happening right after
287 // the filtering performed by LoadPvar().  Which is effectively true, since we
288 // ignore variant_include (this is safe since LoadPvar() always initializes
289 // all variant_bps[] and allele_storage[] entries appropriately).
290 // possible todo: ChrInfo can have a length field, which is initialized by the
291 // ##contig header line when possible, but when that doesn't exist LoadPvar()
292 // can conditionally detect INFO:END and take that into account.  (Or a reason
293 // to keep the entire info_end array in memory may emerge.)
ChrLenLbound(const ChrInfo * cip,const uint32_t * variant_bps,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const uint32_t * new_variant_idx_to_old,uint32_t chr_fo_idx,uint32_t max_allele_slen,UnsortedVar vpos_sortstatus)294 uint32_t ChrLenLbound(const ChrInfo* cip, const uint32_t* variant_bps, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uint32_t* new_variant_idx_to_old, uint32_t chr_fo_idx, uint32_t max_allele_slen, UnsortedVar vpos_sortstatus) {
295   const uint32_t vidx_start = cip->chr_fo_vidx_start[chr_fo_idx];
296   const uint32_t vidx_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
297   assert(vidx_start != vidx_end);
298   if (!(vpos_sortstatus & kfUnsortedVarBp)) {
299     if (!new_variant_idx_to_old) {
300       if (max_allele_slen == 1) {
301         return variant_bps[vidx_end - 1];
302       }
303       uint32_t bp_end = 0;
304       for (uint32_t vidx = vidx_end; vidx != vidx_start; ) {
305         --vidx;
306         const uint32_t cur_bp = variant_bps[vidx];
307         if (cur_bp + max_allele_slen <= bp_end) {
308           break;
309         }
310         uintptr_t allele_idx_offset_base = vidx * 2;
311         if (allele_idx_offsets) {
312           allele_idx_offset_base = allele_idx_offsets[vidx];
313         }
314         // We only care about reference-allele length.
315         const uint32_t cur_bp_end = cur_bp + strlen(allele_storage[allele_idx_offset_base]) - 1;
316         if (cur_bp_end > bp_end) {
317           bp_end = cur_bp_end;
318         }
319       }
320       return bp_end;
321     }
322     if (max_allele_slen == 1) {
323       return variant_bps[new_variant_idx_to_old[vidx_end - 1]];
324     }
325     uint32_t bp_end = 0;
326     for (uint32_t new_vidx = vidx_end; new_vidx != vidx_start; ) {
327       --new_vidx;
328       const uint32_t old_vidx = new_variant_idx_to_old[new_vidx];
329       const uint32_t cur_bp = variant_bps[old_vidx];
330       if (cur_bp + max_allele_slen <= bp_end) {
331         break;
332       }
333       uintptr_t allele_idx_offset_base = old_vidx * 2;
334       if (allele_idx_offsets) {
335         allele_idx_offset_base = allele_idx_offsets[old_vidx];
336       }
337       const uint32_t cur_bp_end = cur_bp + strlen(allele_storage[allele_idx_offset_base]) - 1;
338       if (cur_bp_end > bp_end) {
339         bp_end = cur_bp_end;
340       }
341     }
342     return bp_end;
343   }
344   uint32_t bp_end = U32ArrMax(&(variant_bps[vidx_start]), vidx_end - vidx_start);
345   if (max_allele_slen == 1) {
346     return bp_end;
347   }
348   uint32_t min_check_bp = 0;
349   if (bp_end >= max_allele_slen) {
350     min_check_bp = bp_end + 1 - max_allele_slen;
351   }
352   for (uint32_t vidx = vidx_start; vidx != vidx_end; ++vidx) {
353     const uint32_t cur_bp = variant_bps[vidx];
354     if (cur_bp < min_check_bp) {
355       continue;
356     }
357     uintptr_t allele_idx_offset_base = vidx * 2;
358     if (allele_idx_offsets) {
359       allele_idx_offset_base = allele_idx_offsets[vidx];
360     }
361     const uint32_t cur_bp_end = cur_bp + strlen(allele_storage[allele_idx_offset_base]) - 1;
362     if (cur_bp_end > bp_end) {
363       bp_end = cur_bp_end;
364     }
365   }
366   return bp_end;
367 }
368 
PvarXheaderWrite(const uintptr_t * variant_include,const ChrInfo * cip,const uint32_t * variant_bps,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const uint32_t * new_variant_idx_to_old,uintptr_t xheader_blen,uint32_t vcfheader,uint32_t write_filter,uint32_t write_info,uint32_t append_info_pr_header_line,uint32_t max_allele_slen,UnsortedVar vpos_sortstatus,char * xheader,CompressStreamState * css_ptr,char ** cswritepp)369 PglErr PvarXheaderWrite(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uint32_t* new_variant_idx_to_old, uintptr_t xheader_blen, uint32_t vcfheader, uint32_t write_filter, uint32_t write_info, uint32_t append_info_pr_header_line, uint32_t max_allele_slen, UnsortedVar vpos_sortstatus, char* xheader, CompressStreamState* css_ptr, char** cswritepp) {
370   unsigned char* bigstack_mark = g_bigstack_base;
371   PglErr reterr = kPglRetSuccess;
372   {
373     if (!vcfheader) {
374       if (write_filter && write_info) {
375         if (unlikely(CsputsStd(xheader, xheader_blen, css_ptr, cswritepp))) {
376           goto PvarXheaderWrite_ret_WRITE_FAIL;
377         }
378       } else {
379         // Filter out FILTER/INFO definitions iff the corresponding column has
380         // been removed.
381         const char* copy_start = xheader;
382         const char* xheader_end = &(xheader[xheader_blen]);
383         for (const char* xheader_iter = xheader; xheader_iter != xheader_end; ) {
384           const char* next_line_start = AdvPastDelim(xheader_iter, '\n');
385           if (((!write_filter) && StrStartsWithUnsafe(xheader_iter, "##FILTER=<ID=")) ||
386               ((!write_info) && StrStartsWithUnsafe(xheader_iter, "##INFO=<ID="))) {
387             if (copy_start != xheader_iter) {
388               if (unlikely(CsputsStd(copy_start, xheader_iter - copy_start, css_ptr, cswritepp))) {
389                 goto PvarXheaderWrite_ret_WRITE_FAIL;
390               }
391             }
392             copy_start = next_line_start;
393           }
394           xheader_iter = next_line_start;
395         }
396         if (copy_start != xheader_end) {
397           if (unlikely(CsputsStd(copy_start, xheader_end - copy_start, css_ptr, cswritepp))) {
398             goto PvarXheaderWrite_ret_WRITE_FAIL;
399           }
400         }
401       }
402     } else {
403       // See the start of ExportVcf().
404       AppendVcfHeaderStart(1, cswritepp);
405       const uint32_t chr_ctl = BitCtToWordCt(cip->chr_ct);
406       uintptr_t* written_contig_header_lines;
407       if (unlikely(bigstack_calloc_w(chr_ctl, &written_contig_header_lines))) {
408         goto PvarXheaderWrite_ret_NOMEM;
409       }
410       uint32_t contig_zero_written = 0;
411       char* cswritep = *cswritepp;
412       // ExportVcf() has to perform a customized --merge-par operation, so it
413       // has special handling of chrX/PAR1/PAR2 ##contig header lines.  We omit
414       // that here.
415       char* xheader_end = &(xheader[xheader_blen]);
416       for (char* line_end = xheader; line_end != xheader_end; ) {
417         char* line_start = line_end;
418         line_end = AdvPastDelim(line_start, '\n');
419         const uint32_t slen = line_end - line_start;
420         if ((slen > 14) && StrStartsWithUnsafe(line_start, "##contig=<ID=")) {
421           char* contig_name_start = &(line_start[13]);
422           char* contig_name_end = S_CAST(char*, memchr(contig_name_start, ',', slen - 14));
423           if (!contig_name_end) {
424             // if this line is technically well-formed (ends in '>'), it's
425             // useless anyway, throw it out
426             continue;
427           }
428           const uint32_t chr_idx = GetChrCodeCounted(cip, contig_name_end - contig_name_start, contig_name_start);
429           if (IsI32Neg(chr_idx) || (!IsSet(cip->chr_mask, chr_idx))) {
430             continue;
431           }
432           const uint32_t chr_fo_idx = cip->chr_idx_to_foidx[chr_idx];
433           if (unlikely(IsSet(written_contig_header_lines, chr_fo_idx))) {
434             logerrputs("Error: Duplicate ##contig line in .pvar file.\n");
435             goto PvarXheaderWrite_ret_MALFORMED_INPUT;
436           }
437           SetBit(chr_fo_idx, written_contig_header_lines);
438           // if --output-chr was used at some point, we need to sync the
439           // ##contig chromosome code with the code in the .pvar body.
440           char* chr_name_write_start = strcpya_k(cswritep, "##contig=<ID=");
441           char* chr_name_write_end = chrtoa(cip, chr_idx, chr_name_write_start);
442           if ((*chr_name_write_start == '0') && (chr_name_write_end == &(chr_name_write_start[1]))) {
443             // --allow-extra-chr 0 special case
444             // note that cswritep has *not* been advanced
445             contig_zero_written = 1;  // technically we write this a bit later
446             continue;
447           }
448           cswritep = chr_name_write_end;
449           if (unlikely(Cswrite(css_ptr, &cswritep))) {
450             goto PvarXheaderWrite_ret_WRITE_FAIL;
451           }
452           if (unlikely(CsputsStd(contig_name_end, line_end - contig_name_end, css_ptr, &cswritep))) {
453             goto PvarXheaderWrite_ret_WRITE_FAIL;
454           }
455         } else {
456           if (!write_filter) {
457             if (StrStartsWithUnsafe(line_start, "##FILTER=<ID=")) {
458               continue;
459             }
460           }
461           if (!write_info) {
462             if (StrStartsWithUnsafe(line_start, "##INFO=<ID=")) {
463               continue;
464             }
465           }
466           if (unlikely(CsputsStd(line_start, slen, css_ptr, &cswritep))) {
467             goto PvarXheaderWrite_ret_WRITE_FAIL;
468           }
469         }
470       }
471       // fill in the missing ##contig lines
472       if (contig_zero_written) {
473         cswritep = strcpya_k(cswritep, "##contig=<ID=0,length=2147483645>" EOLN_STR);
474       }
475       for (uint32_t chr_fo_idx = 0; chr_fo_idx != cip->chr_ct; ++chr_fo_idx) {
476         if (IsSet(written_contig_header_lines, chr_fo_idx)) {
477           continue;
478         }
479         const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
480         // AllBitsAreZero() doesn't do what we want in the --sort-vars case,
481         // but fortunately we don't need it there.
482         if ((!IsSet(cip->chr_mask, chr_idx)) || (variant_include && AllBitsAreZero(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], cip->chr_fo_vidx_start[chr_fo_idx + 1]))) {
483           continue;
484         }
485         char* chr_name_write_start = strcpya_k(cswritep, "##contig=<ID=");
486         char* chr_name_write_end = chrtoa(cip, chr_idx, chr_name_write_start);
487         if ((*chr_name_write_start == '0') && (chr_name_write_end == &(chr_name_write_start[1]))) {
488           // --allow-extra-chr 0 special case
489           if (contig_zero_written) {
490             continue;
491           }
492           contig_zero_written = 1;
493           cswritep = strcpya_k(chr_name_write_end, ",length=2147483645");
494         } else {
495           cswritep = strcpya_k(chr_name_write_end, ",length=");
496           const uint32_t pos_end = ChrLenLbound(cip, variant_bps, allele_idx_offsets, allele_storage, new_variant_idx_to_old, chr_fo_idx, max_allele_slen, vpos_sortstatus);
497           cswritep = u32toa(pos_end, cswritep);
498         }
499         *cswritep++ = '>';
500         AppendBinaryEoln(&cswritep);
501         if (unlikely(Cswrite(css_ptr, &cswritep))) {
502           goto PvarXheaderWrite_ret_WRITE_FAIL;
503         }
504       }
505       *cswritepp = cswritep;
506     }
507     if (append_info_pr_header_line) {
508       *cswritepp = strcpya_k(*cswritepp, "##INFO=<ID=PR,Number=0,Type=Flag,Description=\"Provisional reference allele, may not be based on real reference genome\">" EOLN_STR);
509     }
510   }
511   while (0) {
512   PvarXheaderWrite_ret_NOMEM:
513     reterr = kPglRetNomem;
514     break;
515   PvarXheaderWrite_ret_WRITE_FAIL:
516     reterr = kPglRetWriteFail;
517     break;
518   PvarXheaderWrite_ret_MALFORMED_INPUT:
519     reterr = kPglRetMalformedInput;
520     break;
521   }
522   BigstackReset(bigstack_mark);
523   return reterr;
524 }
525 
526 PglErr WritePvar(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct, char* xheader) {
527   // allele_presents must be nullptr unless we're trimming alt alleles
528   // split/join cases handled by WritePvarSplit() and WritePvarJoin()
529   unsigned char* bigstack_mark = g_bigstack_base;
530   char* cswritep = nullptr;
531   PglErr reterr = kPglRetSuccess;
532   CompressStreamState css;
533   TextStream pvar_reload_txs;
534   PreinitCstream(&css);
535   PreinitTextStream(&pvar_reload_txs);
536   {
537     const uint32_t max_chr_blen = GetMaxChrSlen(cip) + 1;
538     // includes trailing tab
539     char* chr_buf;
540 
541     if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
542       goto WritePvar_ret_NOMEM;
543     }
544     uintptr_t overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen + max_filter_slen + info_reload_slen;
545     if (overflow_buf_size < 2 * kCompressStreamBlock) {
546       overflow_buf_size = 2 * kCompressStreamBlock;
547     }
548     const uint32_t output_zst = (pvar_psam_flags / kfPvarZs) & 1;
549     reterr = InitCstreamAlloc(outname, 0, output_zst, thread_ct, overflow_buf_size, &css, &cswritep);
550     if (unlikely(reterr)) {
551       goto WritePvar_ret_1;
552     }
553     const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
554     const uint32_t all_nonref = (nonref_flags_storage == 2);
555     uint32_t write_info_pr = all_nonref;
556     uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
557     if (write_info && nonref_flags) {
558       write_info_pr = !IntersectionIsEmpty(variant_include, nonref_flags, raw_variant_ctl);
559     }
560     write_info_pr = write_info_pr && write_info;
561     if (unlikely(write_info_pr && (info_flags & kfInfoPrNonflagPresent))) {
562       logputs("\n");
563       logerrputs("Error: Conflicting INFO:PR definitions.  Either fix all REF alleles so that the\n'provisional reference' flag is no longer needed, or remove/rename the other\nuse of the INFO:PR key.\n");
564       goto WritePvar_ret_INCONSISTENT_INPUT;
565     }
566 
567     uint32_t write_filter = 0;
568     if (pvar_psam_flags & kfPvarColFilter) {
569       write_filter = 1;
570     } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
571       write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
572     }
573     char* pvar_info_line_iter = nullptr;
574     uint32_t info_col_idx = 0;  // could save this during first load instead
575     const uint32_t info_pr_flag_present = (info_flags / kfInfoPrFlagPresent) & 1;
576     if (pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) {
577       reterr = PvarXheaderWrite(variant_include, cip, variant_bps, allele_idx_offsets, allele_storage, nullptr, xheader_blen, (pvar_psam_flags / kfPvarColVcfheader) & 1, write_filter, write_info, write_info_pr && (!info_pr_flag_present), max_allele_slen, vpos_sortstatus, xheader, &css, &cswritep);
578       if (unlikely(reterr)) {
579         goto WritePvar_ret_1;
580       }
581     }
582     // bugfix (30 Jul 2017): may be necessary to reload INFO when no ## lines
583     // are in the header... er, should we still allow this?
584     if (pvar_info_reload) {
585       reterr = PvarInfoOpenAndReloadHeader(pvar_info_reload, 1 + (thread_ct > 1), &pvar_reload_txs, &pvar_info_line_iter, &info_col_idx);
586       if (unlikely(reterr)) {
587         goto WritePvar_ret_TSTREAM_FAIL;
588       }
589     }
590     if (cip->chrset_source) {
591       AppendChrsetLine(cip, &cswritep);
592     }
593     cswritep = strcpya_k(cswritep, "#CHROM\tPOS\tID\tREF\tALT");
594 
595     uint32_t write_qual = 0;
596     if (pvar_psam_flags & kfPvarColQual) {
597       write_qual = 1;
598     } else if ((pvar_psam_flags & kfPvarColMaybequal) && qual_present) {
599       write_qual = !IntersectionIsEmpty(variant_include, qual_present, raw_variant_ctl);
600     }
601     if (write_qual) {
602       cswritep = strcpya_k(cswritep, "\tQUAL");
603     }
604 
605     if (write_filter) {
606       cswritep = strcpya_k(cswritep, "\tFILTER");
607     }
608 
609     if (write_info) {
610       cswritep = strcpya_k(cswritep, "\tINFO");
611     }
612 
613     uint32_t write_cm = 0;
614     if (pvar_psam_flags & kfPvarColCm) {
615       write_cm = 1;
616     } else if ((pvar_psam_flags & kfPvarColMaybecm) && variant_cms) {
617       if (raw_variant_ct == variant_ct) {
618         // nonzero_cm_present check was performed
619         write_cm = 1;
620       } else {
621         uintptr_t variant_uidx_base = 0;
622         uintptr_t cur_bits = variant_include[0];
623         for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
624           const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
625           if (variant_cms[variant_uidx] != 0.0) {
626             write_cm = 1;
627             break;
628           }
629         }
630       }
631     }
632     if (write_cm) {
633       cswritep = strcpya_k(cswritep, "\tCM");
634     }
635     AppendBinaryEoln(&cswritep);
636 
637     const char output_missing_geno_char = *g_output_missing_geno_ptr;
638     uint32_t trs_variant_uidx = 0;
639     uintptr_t variant_uidx_base = 0;
640     uintptr_t cur_bits = variant_include[0];
641     uint32_t chr_fo_idx = UINT32_MAX;
642     uint32_t chr_end = 0;
643     uint32_t chr_buf_blen = 0;
644     uint32_t ref_allele_idx = 0;
645     uint32_t alt1_allele_idx = 1;
646     uint32_t cur_allele_ct = 2;
647     uint32_t pct = 0;
648     uint32_t next_print_variant_idx = variant_ct / 100;
649     fputs("0%", stdout);
650     fflush(stdout);
651     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
652       const uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
653       if (variant_uidx >= chr_end) {
654         do {
655           ++chr_fo_idx;
656           chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
657         } while (variant_uidx >= chr_end);
658         char* chr_name_end = chrtoa(cip, cip->chr_file_order[chr_fo_idx], chr_buf);
659         *chr_name_end = '\t';
660         chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
661       }
662       cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
663       cswritep = u32toa_x(variant_bps[variant_uidx], '\t', cswritep);
664       cswritep = strcpyax(cswritep, variant_ids[variant_uidx], '\t');
665       uintptr_t allele_idx_offset_base;
666       if (!allele_idx_offsets) {
667         allele_idx_offset_base = variant_uidx * 2;
668       } else {
669         allele_idx_offset_base = allele_idx_offsets[variant_uidx];
670         cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
671       }
672       const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
673       if (refalt1_select) {
674         ref_allele_idx = refalt1_select[variant_uidx][0];
675         alt1_allele_idx = refalt1_select[variant_uidx][1];
676       }
677       cswritep = strcpyax(cswritep, cur_alleles[ref_allele_idx], '\t');
678       uint32_t alt_allele_written = 0;
679       if ((!allele_presents) || IsSet(allele_presents, allele_idx_offset_base + alt1_allele_idx)) {
680         cswritep = strcpya(cswritep, cur_alleles[alt1_allele_idx]);
681         alt_allele_written = 1;
682       }
683       if (unlikely(Cswrite(&css, &cswritep))) {
684         goto WritePvar_ret_WRITE_FAIL;
685       }
686       if (cur_allele_ct > 2) {
687         for (uint32_t allele_idx = 0; allele_idx != cur_allele_ct; ++allele_idx) {
688           if ((allele_idx == ref_allele_idx) || (allele_idx == alt1_allele_idx) || (allele_presents && (!IsSet(allele_presents, allele_idx_offset_base + allele_idx)))) {
689             continue;
690           }
691           if (alt_allele_written) {
692             *cswritep++ = ',';
693           }
694           alt_allele_written = 1;
695           cswritep = strcpya(cswritep, cur_alleles[allele_idx]);
696           if (unlikely(Cswrite(&css, &cswritep))) {
697             goto WritePvar_ret_WRITE_FAIL;
698           }
699         }
700       }
701       if (!alt_allele_written) {
702         *cswritep++ = output_missing_geno_char;
703       }
704 
705       if (write_qual) {
706         *cswritep++ = '\t';
707         if ((!qual_present) || (!IsSet(qual_present, variant_uidx))) {
708           *cswritep++ = '.';
709         } else {
710           cswritep = ftoa_g(quals[variant_uidx], cswritep);
711         }
712       }
713 
714       if (write_filter) {
715         *cswritep++ = '\t';
716         if ((!filter_present) || (!IsSet(filter_present, variant_uidx))) {
717           *cswritep++ = '.';
718         } else if (!IsSet(filter_npass, variant_uidx)) {
719           cswritep = strcpya_k(cswritep, "PASS");
720         } else {
721           cswritep = strcpya(cswritep, filter_storage[variant_uidx]);
722         }
723       }
724 
725       if (write_info) {
726         *cswritep++ = '\t';
727         const uint32_t is_pr = all_nonref || (nonref_flags && IsSet(nonref_flags, variant_uidx));
728         if (pvar_info_line_iter) {
729           reterr = PvarInfoReloadAndWrite(info_pr_flag_present, info_col_idx, variant_uidx, is_pr, &pvar_reload_txs, &pvar_info_line_iter, &cswritep, &trs_variant_uidx);
730           if (unlikely(reterr)) {
731             goto WritePvar_ret_TSTREAM_FAIL;
732           }
733         } else {
734           if (is_pr) {
735             cswritep = strcpya_k(cswritep, "PR");
736           } else {
737             *cswritep++ = '.';
738           }
739         }
740       }
741 
742       if (write_cm) {
743         *cswritep++ = '\t';
744         if (!variant_cms) {
745           *cswritep++ = '0';
746         } else {
747           cswritep = dtoa_g_p8(variant_cms[variant_uidx], cswritep);
748         }
749       }
750       AppendBinaryEoln(&cswritep);
751       if (variant_idx >= next_print_variant_idx) {
752         if (pct > 10) {
753           putc_unlocked('\b', stdout);
754         }
755         pct = (variant_idx * 100LLU) / variant_ct;
756         printf("\b\b%u%%", pct++);
757         fflush(stdout);
758         next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
759       }
760     }
761     if (unlikely(CswriteCloseNull(&css, cswritep))) {
762       goto WritePvar_ret_WRITE_FAIL;
763     }
764     if (pct > 10) {
765       putc_unlocked('\b', stdout);
766     }
767     fputs("\b\b", stdout);
768   }
769   while (0) {
770   WritePvar_ret_NOMEM:
771     reterr = kPglRetNomem;
772     break;
773   WritePvar_ret_TSTREAM_FAIL:
774     TextStreamErrPrint(pvar_info_reload, &pvar_reload_txs);
775     break;
776   WritePvar_ret_WRITE_FAIL:
777     reterr = kPglRetWriteFail;
778     break;
779   WritePvar_ret_INCONSISTENT_INPUT:
780     reterr = kPglRetInconsistentInput;
781     break;
782   }
783  WritePvar_ret_1:
784   CswriteCloseCond(&css, cswritep);
785   CleanupTextStream2(pvar_info_reload, &pvar_reload_txs, &reterr);
786   BigstackReset(bigstack_mark);
787   return reterr;
788 }
789 
WriteFam(const char * outname,const uintptr_t * sample_include,const PedigreeIdInfo * piip,const uintptr_t * sex_nm,const uintptr_t * sex_male,const PhenoCol * pheno_cols,const uint32_t * new_sample_idx_to_old,uint32_t sample_ct,uint32_t pheno_ct,char delim)790 PglErr WriteFam(const char* outname, const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, uint32_t pheno_ct, char delim) {
791   FILE* outfile = nullptr;
792   PglErr reterr = kPglRetSuccess;
793   {
794     if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
795       goto WriteFam_ret_OPEN_FAIL;
796     }
797     uintptr_t* pheno_nm = nullptr;
798     uintptr_t* pheno_cc = nullptr;
799     double* pheno_qt = nullptr;
800     // .fam files don't support categorical phenotypes
801     const uint32_t pheno_idx = FirstCcOrQtPhenoIdx(pheno_cols, pheno_ct);
802     if (pheno_idx != UINT32_MAX) {
803       const PhenoDtype type_code = pheno_cols[pheno_idx].type_code;
804       pheno_nm = pheno_cols[pheno_idx].nonmiss;
805       if (type_code == kPhenoDtypeCc) {
806         pheno_cc = pheno_cols[pheno_idx].data.cc;
807       } else {
808         pheno_qt = pheno_cols[pheno_idx].data.qt;
809       }
810     }
811     const char* legacy_output_missing_pheno = g_legacy_output_missing_pheno;
812     const uint32_t lomp_slen = strlen(legacy_output_missing_pheno);
813 
814     // possible todo: warning if two sample IDs only differ in SID?  (check for
815     // this if any file is being exported that can't have a SID column)
816     const char* sample_ids = piip->sii.sample_ids;
817     const char* paternal_ids = piip->parental_id_info.paternal_ids;
818     const char* maternal_ids = piip->parental_id_info.maternal_ids;
819     const uintptr_t max_sample_id_blen = piip->sii.max_sample_id_blen;
820     const uintptr_t max_paternal_id_blen = piip->parental_id_info.max_paternal_id_blen;
821     const uintptr_t max_maternal_id_blen = piip->parental_id_info.max_maternal_id_blen;
822     uintptr_t sample_uidx_base = 0;
823     uintptr_t cur_bits = sample_include[0];
824     uint32_t sample_uidx2 = 0;
825     char* write_iter = g_textbuf;
826     char* textbuf_flush = &(write_iter[kMaxMediumLine]);
827     // not really necessary to make sample_uidx increment dependent on
828     // new_sample_idx_to_old == nullptr
829     for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
830       uintptr_t sample_uidx;
831       if (!new_sample_idx_to_old) {
832         sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
833       } else {
834         do {
835           sample_uidx = new_sample_idx_to_old[sample_uidx2++];
836         } while (!IsSet(sample_include, sample_uidx));
837       }
838       const char* cur_sample_id = &(sample_ids[max_sample_id_blen * sample_uidx]);
839       if (delim == '\t') {
840         write_iter = strcpya(write_iter, cur_sample_id);
841       } else {
842         const char* fid_end = AdvToDelim(cur_sample_id, '\t');
843         write_iter = memcpyax(write_iter, cur_sample_id, fid_end - cur_sample_id, delim);
844         write_iter = strcpya(write_iter, &(fid_end[1]));
845       }
846       *write_iter++ = delim;
847       write_iter = strcpyax(write_iter, &(paternal_ids[max_paternal_id_blen * sample_uidx]), delim);
848       write_iter = strcpyax(write_iter, &(maternal_ids[max_maternal_id_blen * sample_uidx]), delim);
849       *write_iter++ = Sexchar(sex_nm, sex_male, sample_uidx);
850       *write_iter++ = delim;
851       if ((!pheno_nm) || (!IsSet(pheno_nm, sample_uidx))) {
852         write_iter = memcpya(write_iter, legacy_output_missing_pheno, lomp_slen);
853       } else if (pheno_cc) {
854         // do we want to allow user to force 0/1 output?
855         *write_iter++ = '1' + IsSet(pheno_cc, sample_uidx);
856       } else {
857         write_iter = dtoa_g(pheno_qt[sample_uidx], write_iter);
858       }
859       AppendBinaryEoln(&write_iter);
860       if (unlikely(fwrite_ck(textbuf_flush, outfile, &write_iter))) {
861         goto WriteFam_ret_WRITE_FAIL;
862       }
863     }
864     if (unlikely(fclose_flush_null(textbuf_flush, write_iter, &outfile))) {
865       goto WriteFam_ret_WRITE_FAIL;
866     }
867   }
868   while (0) {
869   WriteFam_ret_OPEN_FAIL:
870     reterr = kPglRetOpenFail;
871     break;
872   WriteFam_ret_WRITE_FAIL:
873     reterr = kPglRetWriteFail;
874     break;
875   }
876   fclose_cond(outfile);
877   return reterr;
878 }
879 
DataFidColIsRequired(const uintptr_t * sample_include,const SampleIdInfo * siip,uint32_t sample_ct,uint32_t maybe_modifier)880 uint32_t DataFidColIsRequired(const uintptr_t* sample_include, const SampleIdInfo* siip, uint32_t sample_ct, uint32_t maybe_modifier) {
881   if (maybe_modifier & 2) {
882     return 1;
883   }
884   if ((!(maybe_modifier & 1)) || (!(siip->flags & kfSampleIdFidPresent))) {
885     return 0;
886   }
887   const char* sample_ids = siip->sample_ids;
888   const uintptr_t max_sample_id_blen = siip->max_sample_id_blen;
889   uintptr_t sample_uidx_base = 0;
890   uintptr_t cur_bits = sample_include[0];
891   for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
892     const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
893     if (!memequal_k(&(sample_ids[sample_uidx * max_sample_id_blen]), "0\t", 2)) {
894       return 1;
895     }
896   }
897   return 0;
898 }
899 
DataSidColIsRequired(const uintptr_t * sample_include,const char * sids,uint32_t sample_ct,uint32_t max_sid_blen,uint32_t maybe_modifier)900 uint32_t DataSidColIsRequired(const uintptr_t* sample_include, const char* sids, uint32_t sample_ct, uint32_t max_sid_blen, uint32_t maybe_modifier) {
901   // note that MAYBESID and SID can both be set
902   if (maybe_modifier & 2) {
903     return 1;
904   }
905   if (sids && (maybe_modifier & 1)) {
906     uintptr_t sample_uidx_base = 0;
907     uintptr_t cur_bits = sample_include[0];
908     for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
909       const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
910       if (!memequal_k(&(sids[sample_uidx * max_sid_blen]), "0", 2)) {
911         return 1;
912       }
913     }
914   }
915   return 0;
916 }
917 
DataParentalColsAreRequired(const uintptr_t * sample_include,const PedigreeIdInfo * piip,uint32_t sample_ct,uint32_t maybe_modifier)918 uint32_t DataParentalColsAreRequired(const uintptr_t* sample_include, const PedigreeIdInfo* piip, uint32_t sample_ct, uint32_t maybe_modifier) {
919   if (maybe_modifier & 2) {
920     return 1;
921   }
922   if ((!(maybe_modifier & 1)) || (!(piip->sii.flags & kfSampleIdParentsPresent))) {
923     return 0;
924   }
925   const char* paternal_ids = piip->parental_id_info.paternal_ids;
926   const char* maternal_ids = piip->parental_id_info.maternal_ids;
927   const uintptr_t max_paternal_id_blen = piip->parental_id_info.max_paternal_id_blen;
928   const uintptr_t max_maternal_id_blen = piip->parental_id_info.max_maternal_id_blen;
929   uintptr_t sample_uidx_base = 0;
930   uintptr_t cur_bits = sample_include[0];
931   for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
932     const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
933     if ((!strequal_k_unsafe(&(paternal_ids[sample_uidx * max_paternal_id_blen]), "0")) || (!strequal_k_unsafe(&(maternal_ids[sample_uidx * max_maternal_id_blen]), "0"))) {
934       return 1;
935     }
936   }
937   return 0;
938 }
939 
AppendPhenoStr(const PhenoCol * pheno_col,const char * output_missing_pheno,uint32_t omp_slen,uint32_t sample_uidx,char * write_iter)940 char* AppendPhenoStr(const PhenoCol* pheno_col, const char* output_missing_pheno, uint32_t omp_slen, uint32_t sample_uidx, char* write_iter) {
941   const PhenoDtype type_code = pheno_col->type_code;
942   if (type_code <= kPhenoDtypeQt) {
943     if (!IsSet(pheno_col->nonmiss, sample_uidx)) {
944       write_iter = memcpya(write_iter, output_missing_pheno, omp_slen);
945     } else if (type_code == kPhenoDtypeCc) {
946       *write_iter++ = '1' + IsSet(pheno_col->data.cc, sample_uidx);
947     } else {
948       write_iter = dtoa_g(pheno_col->data.qt[sample_uidx], write_iter);
949     }
950   } else {
951     write_iter = strcpya(write_iter, pheno_col->category_names[pheno_col->data.cat[sample_uidx]]);
952   }
953   return write_iter;
954 }
955 
WritePsam(const char * outname,const uintptr_t * sample_include,const PedigreeIdInfo * piip,const uintptr_t * sex_nm,const uintptr_t * sex_male,const PhenoCol * pheno_cols,const char * pheno_names,const uint32_t * new_sample_idx_to_old,uint32_t sample_ct,uint32_t pheno_ct,uintptr_t max_pheno_name_blen,PvarPsamFlags pvar_psam_flags)956 PglErr WritePsam(const char* outname, const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, PvarPsamFlags pvar_psam_flags) {
957   FILE* outfile = nullptr;
958   PglErr reterr = kPglRetSuccess;
959   {
960     if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
961       goto WritePsam_ret_OPEN_FAIL;
962     }
963     const char* output_missing_pheno = g_output_missing_pheno;
964     const uint32_t omp_slen = strlen(output_missing_pheno);
965 
966     char* textbuf_flush = &(g_textbuf[kMaxMediumLine]);
967 
968     const char* sample_ids = piip->sii.sample_ids;
969     const char* sids = piip->sii.sids;
970     const char* paternal_ids = piip->parental_id_info.paternal_ids;
971     const char* maternal_ids = piip->parental_id_info.maternal_ids;
972     const uintptr_t max_sample_id_blen = piip->sii.max_sample_id_blen;
973     const uintptr_t max_sid_blen = piip->sii.max_sid_blen;
974     const uintptr_t max_paternal_id_blen = piip->parental_id_info.max_paternal_id_blen;
975     const uintptr_t max_maternal_id_blen = piip->parental_id_info.max_maternal_id_blen;
976     const uint32_t write_fid = DataFidColIsRequired(sample_include, &(piip->sii), sample_ct, pvar_psam_flags / kfPsamColMaybefid);
977     const uint32_t write_sid = DataSidColIsRequired(sample_include, sids, sample_ct, max_sid_blen, pvar_psam_flags / kfPsamColMaybesid);
978     const uint32_t write_parents = DataParentalColsAreRequired(sample_include, piip, sample_ct, pvar_psam_flags / kfPsamColMaybeparents);
979     const uint32_t write_sex = (pvar_psam_flags / kfPsamColSex) & 1;
980     const uint32_t write_empty_pheno = (pvar_psam_flags & kfPsamColPheno1) && (!pheno_ct);
981     const uint32_t write_phenos = (pvar_psam_flags & (kfPsamColPheno1 | kfPsamColPhenos)) && pheno_ct;
982     if (write_phenos && (!(pvar_psam_flags & kfPsamColPhenos))) {
983       pheno_ct = 1;
984     }
985     char* write_iter = g_textbuf;
986     *write_iter++ = '#';
987     if (write_fid) {
988       write_iter = strcpya_k(write_iter, "FID\t");
989     }
990     write_iter = strcpya_k(write_iter, "IID");
991     if (write_sid) {
992       write_iter = strcpya_k(write_iter, "\tSID");
993     }
994     if (write_parents) {
995       write_iter = strcpya_k(write_iter, "\tPAT\tMAT");
996     }
997     if (write_sex) {
998       write_iter = strcpya_k(write_iter, "\tSEX");
999     }
1000     if (write_phenos) {
1001       for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
1002         *write_iter++ = '\t';
1003         const char* cur_pheno_name = &(pheno_names[pheno_idx * max_pheno_name_blen]);
1004         const uint32_t cur_pheno_name_slen = strlen(cur_pheno_name);
1005         if (strequal_k(cur_pheno_name, "SEX", cur_pheno_name_slen)) {
1006           if (unlikely(write_sex)) {
1007             logerrputs("Error: .psam file cannot have both a regular SEX column and a phenotype named\n'SEX'.  Exclude or rename one of these columns.\n");
1008             goto WritePsam_ret_INCONSISTENT_INPUT;
1009           }
1010           // does this phenotype column conform to the SEX column format?
1011           // case/control is always ok, but quantitative or categorical needs
1012           // to be checked
1013           const PhenoCol* sex_col = &(pheno_cols[pheno_idx]);
1014           if (sex_col->type_code != kPhenoDtypeCc) {
1015             // could bitwise-and sample_include and pheno_nm before the loop
1016             const uintptr_t* pheno_nm = sex_col->nonmiss;
1017             uintptr_t sample_uidx_base = 0;
1018             uintptr_t cur_bits = sample_include[0];
1019             if (sex_col->type_code == kPhenoDtypeQt) {
1020               const double* pheno_vals = sex_col->data.qt;
1021               for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
1022                 const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
1023                 if (IsSet(pheno_nm, sample_uidx)) {
1024                   const double dxx = pheno_vals[sample_uidx];
1025                   // tolerate '-9' and '0' as missing values, and anything in
1026                   // [1, 2] (could be reasonable to represent XXY, etc. with
1027                   // decimals).
1028                   if (unlikely(((dxx < 1.0) && (dxx != -9.0) && (dxx != 0.0)) || (dxx > 2.0))) {
1029                     logerrputs("Error: .psam numeric SEX values are expected to be in {-9, 0, 1, 2}.\n");
1030                     goto WritePsam_ret_INCONSISTENT_INPUT;
1031                   }
1032                 }
1033               }
1034             } else {
1035               assert(sex_col->type_code == kPhenoDtypeCat);
1036               const uint32_t nonnull_cat_ct = sex_col->nonnull_category_ct;
1037               if (nonnull_cat_ct) {
1038                 const char* const* cur_category_names = sex_col->category_names;
1039                 // tolerate 'M' and 'm' being present simultaneously, etc.
1040                 uint32_t male_cat_idx1 = 0;
1041                 uint32_t male_cat_idx2 = 0;
1042                 uint32_t female_cat_idx1 = 0;
1043                 uint32_t female_cat_idx2 = 0;
1044                 for (uint32_t cat_idx = 1; cat_idx <= nonnull_cat_ct; ++cat_idx) {
1045                   const char* cur_cat_name = cur_category_names[cat_idx];
1046                   if (!cur_cat_name[1]) {
1047                     uint32_t first_char_code = ctou32(cur_cat_name[0]);
1048                     first_char_code &= 0xdf;
1049                     if (first_char_code == 70) {
1050                       if (!female_cat_idx1) {
1051                         female_cat_idx1 = cat_idx;
1052                       } else {
1053                         female_cat_idx2 = cat_idx;
1054                       }
1055                     } else if (first_char_code == 77) {
1056                       if (!male_cat_idx1) {
1057                         male_cat_idx1 = cat_idx;
1058                       } else {
1059                         male_cat_idx2 = cat_idx;
1060                       }
1061                     }
1062                   }
1063                 }
1064                 if (S_CAST(uint32_t, (male_cat_idx1 != 0) + (male_cat_idx2 != 0) + (female_cat_idx1 != 0) + (female_cat_idx2 != 0)) < nonnull_cat_ct) {
1065                   const uint32_t* pheno_vals = sex_col->data.cat;
1066                   for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
1067                     const uintptr_t sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
1068                     if (IsSet(pheno_nm, sample_uidx)) {
1069                       const uint32_t cur_cat_idx = pheno_vals[sample_uidx];
1070                       if (unlikely((cur_cat_idx != male_cat_idx1) && (cur_cat_idx != female_cat_idx1) && (cur_cat_idx != male_cat_idx2) && (cur_cat_idx != female_cat_idx2))) {
1071                         logerrputs("Error: .psam alphabetic SEX values are expected to be in {'F', 'f', 'M', 'm'}.\n");
1072                         goto WritePsam_ret_INCONSISTENT_INPUT;
1073                       }
1074                     }
1075                   }
1076                 }
1077               }
1078             }
1079           }
1080         }
1081         write_iter = memcpya(write_iter, cur_pheno_name, cur_pheno_name_slen);
1082         if (unlikely(fwrite_ck(textbuf_flush, outfile, &write_iter))) {
1083           goto WritePsam_ret_WRITE_FAIL;
1084         }
1085       }
1086     } else if (write_empty_pheno) {
1087       write_iter = strcpya_k(write_iter, "\tPHENO1");
1088     }
1089     AppendBinaryEoln(&write_iter);
1090 
1091     uintptr_t sample_uidx_base = 0;
1092     uintptr_t cur_bits = sample_include[0];
1093     uint32_t sample_uidx2 = 0;
1094     // not really necessary to make sample_uidx increment dependent on
1095     // new_sample_idx_to_old == nullptr
1096     for (uint32_t sample_idx = 0; sample_idx != sample_ct; ++sample_idx) {
1097       uintptr_t sample_uidx;
1098       if (!new_sample_idx_to_old) {
1099         sample_uidx = BitIter1(sample_include, &sample_uidx_base, &cur_bits);
1100       } else {
1101         do {
1102           sample_uidx = new_sample_idx_to_old[sample_uidx2++];
1103         } while (!IsSet(sample_include, sample_uidx));
1104       }
1105       const char* cur_sample_id = &(sample_ids[max_sample_id_blen * sample_uidx]);
1106       if (!write_fid) {
1107         cur_sample_id = AdvPastDelim(cur_sample_id, '\t');
1108       }
1109       write_iter = strcpya(write_iter, cur_sample_id);
1110       if (write_sid) {
1111         *write_iter++ = '\t';
1112         if (sids) {
1113           write_iter = strcpya(write_iter, &(sids[max_sid_blen * sample_uidx]));
1114         } else {
1115           *write_iter++ = '0';
1116         }
1117       }
1118       if (write_parents) {
1119         *write_iter++ = '\t';
1120         write_iter = strcpyax(write_iter, &(paternal_ids[max_paternal_id_blen * sample_uidx]), '\t');
1121         write_iter = strcpya(write_iter, &(maternal_ids[max_maternal_id_blen * sample_uidx]));
1122       }
1123       if (write_sex) {
1124         *write_iter++ = '\t';
1125         if (IsSet(sex_nm, sample_uidx)) {
1126           *write_iter++ = '2' - IsSet(sex_male, sample_uidx);
1127         } else {
1128           // this is better than '0' since it allows the raw column to be used
1129           // as --covar input
1130           // (can't do this for .fam export, though: not worth the
1131           // compatibility issues)
1132           write_iter = strcpya_k(write_iter, "NA");
1133         }
1134       }
1135       if (write_phenos) {
1136         for (uint32_t pheno_idx = 0; pheno_idx != pheno_ct; ++pheno_idx) {
1137           *write_iter++ = '\t';
1138           write_iter = AppendPhenoStr(&(pheno_cols[pheno_idx]), output_missing_pheno, omp_slen, sample_uidx, write_iter);
1139           if (unlikely(fwrite_ck(textbuf_flush, outfile, &write_iter))) {
1140             goto WritePsam_ret_WRITE_FAIL;
1141           }
1142         }
1143       } else {
1144         if (write_empty_pheno) {
1145           *write_iter++ = '\t';
1146           write_iter = memcpya(write_iter, output_missing_pheno, omp_slen);
1147         }
1148         if (unlikely(fwrite_ck(textbuf_flush, outfile, &write_iter))) {
1149           goto WritePsam_ret_WRITE_FAIL;
1150         }
1151       }
1152       AppendBinaryEoln(&write_iter);
1153     }
1154     if (unlikely(fclose_flush_null(textbuf_flush, write_iter, &outfile))) {
1155       goto WritePsam_ret_WRITE_FAIL;
1156     }
1157   }
1158   while (0) {
1159   WritePsam_ret_OPEN_FAIL:
1160     reterr = kPglRetOpenFail;
1161     break;
1162   WritePsam_ret_WRITE_FAIL:
1163     reterr = kPglRetWriteFail;
1164     break;
1165   WritePsam_ret_INCONSISTENT_INPUT:
1166     reterr = kPglRetInconsistentInput;
1167     break;
1168   }
1169   fclose_cond(outfile);
1170   return reterr;
1171 }
1172 
1173 /*
1174 #ifdef __arm__
1175 #  error "Unaligned accesses in BitvecResort()."
1176 #endif
1177 void BitvecResort(const uintptr_t* bitvec, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, unsigned char* writebuf) {
1178   const uint32_t sample_ctl_m1 = BitCtToWordCt(sample_ct) - 1;
1179   uint32_t widx = 0;
1180   uint32_t cur_word_entry_ct = kBitsPerWord;
1181   const uint32_t* new_sample_idx_to_old_base = new_sample_idx_to_old;
1182   uintptr_t* writebuf_walias = (uintptr_t*)writebuf;
1183   while (1) {
1184     if (widx == sample_ctl_m1) {
1185       cur_word_entry_ct = 1 + ((sample_ct - 1) % kBitsPerWord);
1186     }
1187     uintptr_t cur_word = 0;
1188     for (uint32_t uii = 0; uii != cur_word_entry_ct; ++uii) {
1189       cur_word |= IsSet(bitvec, new_sample_idx_to_old_base[uii]) << uii;
1190     }
1191     if (widx == sample_ctl_m1) {
1192       memcpy(&(writebuf_walias[widx]), &cur_word, (cur_word_entry_ct + (CHAR_BIT - 1)) / CHAR_BIT);
1193       return;
1194     }
1195     writebuf_walias[widx++] = cur_word;
1196     new_sample_idx_to_old_base = &(new_sample_idx_to_old_base[kBitsPerWord]);
1197   }
1198 }
1199 */
1200 
1201 #ifdef __arm__
1202 #  error "Unaligned accesses in GenovecResort()."
1203 #endif
GenovecResort(const uintptr_t * genovec,const uint32_t * new_sample_idx_to_old,uint32_t sample_ct,void * writebuf)1204 void GenovecResort(const uintptr_t* genovec, const uint32_t* new_sample_idx_to_old, uint32_t sample_ct, void* writebuf) {
1205   // writebuf need not be word-aligned
1206   const uint32_t sample_ctl2_m1 = NypCtToWordCt(sample_ct) - 1;
1207   const uint32_t* new_sample_idx_to_old_iter = new_sample_idx_to_old;
1208   uintptr_t* writebuf_walias = S_CAST(uintptr_t*, writebuf);
1209   for (uint32_t widx = 0; widx != sample_ctl2_m1; ++widx) {
1210     uintptr_t cur_word = 0;
1211     // this is noticeably better than the ascending loop
1212     for (uint32_t uii = kBitsPerWordD2 - 1; ; --uii) {
1213       cur_word |= GetNyparrEntry(genovec, new_sample_idx_to_old_iter[uii]);
1214       if (!uii) {
1215         break;
1216       }
1217       cur_word = cur_word << 2;
1218     }
1219     writebuf_walias[widx] = cur_word;
1220     new_sample_idx_to_old_iter = &(new_sample_idx_to_old_iter[kBitsPerWordD2]);
1221   }
1222   const uint32_t cur_word_entry_ct = ModNz(sample_ct, kBitsPerWordD2);
1223   uintptr_t cur_word = 0;
1224   for (uint32_t uii = cur_word_entry_ct - 1; ; --uii) {
1225     cur_word |= GetNyparrEntry(genovec, new_sample_idx_to_old_iter[uii]);
1226     if (!uii) {
1227       break;
1228     }
1229     cur_word = cur_word << 2;
1230   }
1231   SubwordStore(cur_word, NypCtToByteCt(cur_word_entry_ct), &(writebuf_walias[sample_ctl2_m1]));
1232 }
1233 
1234 // Revised phaseraw:
1235 //   4 byte het_ct, 4 byte explicit_phasepresent_ct
1236 //   first half, up to (1 + (het_ct / kBitsPerWord)) words
1237 //   second half, rounded up to vector boundary
UnpackHphase(const uintptr_t * __restrict all_hets,const uintptr_t * __restrict phaseraw,uint32_t raw_sample_ct,uintptr_t ** phasepresent_ptr,uintptr_t * __restrict phaseinfo)1238 void UnpackHphase(const uintptr_t* __restrict all_hets, const uintptr_t* __restrict phaseraw, uint32_t raw_sample_ct, uintptr_t** phasepresent_ptr, uintptr_t* __restrict phaseinfo) {
1239   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
1240   const uint32_t het_ct = S_CAST(uint32_t, phaseraw[0]);
1241   const uintptr_t* aux2a = &(phaseraw[8 / kBytesPerWord]);
1242   if (!(aux2a[0] & 1)) {
1243     // phase always present
1244     *phasepresent_ptr = nullptr;
1245     ExpandBytearr(aux2a, all_hets, raw_sample_ctl, het_ct, 1, phaseinfo);
1246   } else {
1247     // bugfix (4 Mar 2018): need to pass raw_phasepresent_ct, not het_ct
1248 #ifdef __LP64__
1249     const uint32_t raw_phasepresent_ct = phaseraw[0] >> 32;
1250 #else
1251     const uint32_t raw_phasepresent_ct = phaseraw[1];
1252 #endif
1253     const uintptr_t* aux2b = &(aux2a[1 + (het_ct / kBitsPerWord)]);
1254     ExpandBytearrNested(aux2b, aux2a, all_hets, raw_sample_ctl, raw_phasepresent_ct, 1, *phasepresent_ptr, phaseinfo);
1255   }
1256 }
1257 
UnpackHphaseSubset(const uintptr_t * __restrict all_hets,const uintptr_t * __restrict phaseraw,const uintptr_t * __restrict sample_include,uint32_t sample_ct,uintptr_t ** phasepresent_ptr,uintptr_t * __restrict phaseinfo)1258 void UnpackHphaseSubset(const uintptr_t* __restrict all_hets, const uintptr_t* __restrict phaseraw, const uintptr_t* __restrict sample_include, uint32_t sample_ct, uintptr_t** phasepresent_ptr, uintptr_t* __restrict phaseinfo) {
1259   // const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
1260   // const uint32_t het_ct = PopcountWords(all_hets, raw_sample_ctl);
1261   const uint32_t het_ct = S_CAST(uint32_t, phaseraw[0]);
1262   const uintptr_t* aux2a = &(phaseraw[8 / kBytesPerWord]);
1263   if (!(aux2a[0] & 1)) {
1264     // phase always present
1265     *phasepresent_ptr = nullptr;
1266     ExpandThenSubsetBytearr(aux2a, all_hets, sample_include, het_ct, sample_ct, 1, phaseinfo);
1267   } else {
1268     const uint32_t first_half_word_ct = 1 + (het_ct / kBitsPerWord);
1269     // const uint32_t raw_phasepresent_ct = PopcountWords(phaseraw, first_half_word_ct) - 1;
1270 #ifdef __LP64__
1271     const uint32_t raw_phasepresent_ct = phaseraw[0] >> 32;
1272 #else
1273     const uint32_t raw_phasepresent_ct = phaseraw[1];
1274 #endif
1275     const uintptr_t* aux2b = &(aux2a[first_half_word_ct]);
1276 
1277     // see "if (explicit_phasepresent) {}" block in PgrGetRaw().  Could
1278     // change this convention.
1279     ExpandThenSubsetBytearrNested(aux2b, aux2a, all_hets, sample_include, sample_ct, raw_phasepresent_ct, 1, *phasepresent_ptr, phaseinfo);
1280   }
1281 }
1282 
UnpackAndResortHphase(const uintptr_t * __restrict all_hets,const uintptr_t * __restrict phaseraw,const uintptr_t * sample_include,const uint32_t * old_sample_idx_to_new,uint32_t raw_sample_ct,uint32_t sample_ct,uintptr_t ** phasepresent_ptr,uintptr_t * __restrict phaseinfo)1283 void UnpackAndResortHphase(const uintptr_t* __restrict all_hets, const uintptr_t* __restrict phaseraw, const uintptr_t* sample_include, const uint32_t* old_sample_idx_to_new, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t** phasepresent_ptr, uintptr_t* __restrict phaseinfo) {
1284   const uintptr_t* aux2a_iter = &(phaseraw[8 / kBytesPerWord]);
1285   const uint32_t* old_sample_idx_to_new_iter = old_sample_idx_to_new;
1286   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
1287   const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
1288   uintptr_t aux2a_word = *aux2a_iter++;
1289   uint32_t read_idx_lowbits = 1;
1290   ZeroWArr(sample_ctl, phaseinfo);
1291   if (!(aux2a_word & 1)) {
1292     // phase always present
1293     *phasepresent_ptr = nullptr;
1294     for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
1295       uintptr_t new_phasepresent_word = all_hets[widx];
1296       const uint32_t read_idx_lowbits_end = read_idx_lowbits + PopcountWord(new_phasepresent_word);
1297       uintptr_t tmp_phaseinfo_input_word = aux2a_word >> read_idx_lowbits;
1298       if (read_idx_lowbits_end >= kBitsPerWord) {
1299         // always safe to read an extra word off the end
1300         aux2a_word = *aux2a_iter++;
1301         if (read_idx_lowbits) {
1302           tmp_phaseinfo_input_word |= aux2a_word << (kBitsPerWord - read_idx_lowbits);
1303         }
1304       }
1305       // no need to mask off top bits of tmp_phaseinfo_input_word
1306       read_idx_lowbits = read_idx_lowbits_end % kBitsPerWord;
1307       if (!sample_include) {
1308 #ifdef USE_AVX2
1309         uintptr_t phaseinfo_bits_to_set = _pdep_u64(tmp_phaseinfo_input_word, new_phasepresent_word);
1310         while (phaseinfo_bits_to_set) {
1311           const uint32_t sample_uidx_lowbits = ctzw(phaseinfo_bits_to_set);
1312           SetBit(old_sample_idx_to_new_iter[sample_uidx_lowbits], phaseinfo);
1313           phaseinfo_bits_to_set &= phaseinfo_bits_to_set - 1;
1314         }
1315 #else
1316         while (new_phasepresent_word) {
1317           const uint32_t sample_uidx_lowbits = ctzw(new_phasepresent_word);
1318           if (tmp_phaseinfo_input_word & 1) {
1319             SetBit(old_sample_idx_to_new_iter[sample_uidx_lowbits], phaseinfo);
1320           }
1321           tmp_phaseinfo_input_word >>= 1;
1322           new_phasepresent_word &= new_phasepresent_word - 1;
1323         }
1324 #endif
1325       } else {
1326 #ifdef USE_AVX2
1327         uintptr_t phaseinfo_bits_to_set = _pdep_u64(tmp_phaseinfo_input_word, new_phasepresent_word) & sample_include[widx];
1328         while (phaseinfo_bits_to_set) {
1329           const uint32_t sample_uidx_lowbits = ctzw(phaseinfo_bits_to_set);
1330           SetBit(old_sample_idx_to_new_iter[sample_uidx_lowbits], phaseinfo);
1331           phaseinfo_bits_to_set &= phaseinfo_bits_to_set - 1;
1332         }
1333 #else
1334         uintptr_t masked_phasepresent_word = new_phasepresent_word & sample_include[widx];
1335         while (masked_phasepresent_word) {
1336           const uint32_t sample_uidx_lowbits = ctzw(masked_phasepresent_word);
1337           const uintptr_t lowmask = (k1LU << sample_uidx_lowbits) - k1LU;
1338           if ((tmp_phaseinfo_input_word >> PopcountWord(new_phasepresent_word & lowmask)) & 1) {
1339             SetBit(old_sample_idx_to_new_iter[sample_uidx_lowbits], phaseinfo);
1340           }
1341           masked_phasepresent_word &= masked_phasepresent_word - 1;
1342         }
1343 #endif
1344       }
1345       old_sample_idx_to_new_iter = &(old_sample_idx_to_new_iter[kBitsPerWord]);
1346     }
1347     return;
1348   }
1349   uintptr_t* phasepresent = *phasepresent_ptr;
1350   const uint32_t het_ct = S_CAST(uint32_t, phaseraw[0]);
1351   const uintptr_t* phaseinfo_read_iter = &(phaseraw[(8 / kBytesPerWord) + 1 + (het_ct / kBitsPerWord)]);
1352   uintptr_t phaseinfo_read_word = *phaseinfo_read_iter++;
1353   uint32_t phaseinfo_read_idx_lowbits = 0;
1354   ZeroWArr(sample_ctl, phasepresent);
1355   for (uint32_t widx = 0; widx != raw_sample_ctl; ++widx) {
1356     uintptr_t geno_hets = all_hets[widx];
1357     if (geno_hets) {
1358       const uint32_t read_idx_lowbits_end = read_idx_lowbits + PopcountWord(geno_hets);
1359       uintptr_t tmp_phasepresent_input_word = aux2a_word >> read_idx_lowbits;
1360       if (read_idx_lowbits_end >= kBitsPerWord) {
1361         // always safe to read an extra word off the end, when
1362         // read_idx_lowbits_end == kBitsPerWord and we're at the last word
1363         aux2a_word = *aux2a_iter++;
1364         if (read_idx_lowbits) {
1365           tmp_phasepresent_input_word |= aux2a_word << (kBitsPerWord - read_idx_lowbits);
1366         }
1367       }
1368       tmp_phasepresent_input_word = bzhi_max(tmp_phasepresent_input_word, read_idx_lowbits_end - read_idx_lowbits);
1369       read_idx_lowbits = read_idx_lowbits_end % kBitsPerWord;
1370       if (tmp_phasepresent_input_word) {
1371         const uint32_t read_phasepresent_ct = PopcountWord(tmp_phasepresent_input_word);
1372         uintptr_t tmp_phaseinfo_input_word;
1373         // avoid reading off end of phaseinfo here
1374         if (phaseinfo_read_idx_lowbits != kBitsPerWord) {
1375           const uint32_t phaseinfo_read_idx_lowbits_end = phaseinfo_read_idx_lowbits + read_phasepresent_ct;
1376           tmp_phaseinfo_input_word = phaseinfo_read_word >> phaseinfo_read_idx_lowbits;
1377           if (phaseinfo_read_idx_lowbits_end < kBitsPerWord) {
1378             phaseinfo_read_idx_lowbits = phaseinfo_read_idx_lowbits_end;
1379           } else {
1380             phaseinfo_read_word = *phaseinfo_read_iter++;
1381             tmp_phaseinfo_input_word |= phaseinfo_read_word << (kBitsPerWord - phaseinfo_read_idx_lowbits);
1382             phaseinfo_read_idx_lowbits = phaseinfo_read_idx_lowbits_end - kBitsPerWord;
1383           }
1384         } else {
1385           // special case, can't right-shift 64
1386           phaseinfo_read_word = *phaseinfo_read_iter++;
1387           phaseinfo_read_idx_lowbits = read_phasepresent_ct;
1388           tmp_phaseinfo_input_word = phaseinfo_read_word;
1389         }
1390         // no need to mask off top bits of tmp_phaseinfo_input_word
1391         if (!sample_include) {
1392 #ifdef USE_AVX2
1393           for (uintptr_t phasepresent_bits_to_set = _pdep_u64(tmp_phasepresent_input_word, geno_hets); ; ) {
1394             const uint32_t new_sample_idx = old_sample_idx_to_new_iter[ctzw(phasepresent_bits_to_set)];
1395             const uint32_t new_sample_widx = new_sample_idx / kBitsPerWord;
1396             const uint32_t new_sample_lowbits = new_sample_idx % kBitsPerWord;
1397             const uintptr_t shifted_bit = k1LU << new_sample_lowbits;
1398             phasepresent[new_sample_widx] |= shifted_bit;
1399             if (tmp_phaseinfo_input_word & 1) {
1400               phaseinfo[new_sample_widx] |= shifted_bit;
1401             }
1402             // branchless version doesn't seem to be any better here; probably
1403             // due to additional random memory access.
1404             // phaseinfo[new_sample_widx] |= (tmp_phaseinfo_input_word & 1) << new_sample_lowbits;
1405 
1406             phasepresent_bits_to_set &= phasepresent_bits_to_set - 1;
1407             if (!phasepresent_bits_to_set) {
1408               break;
1409             }
1410             tmp_phaseinfo_input_word >>= 1;
1411           }
1412 #else
1413           for (; ; tmp_phasepresent_input_word >>= 1) {
1414             if (tmp_phasepresent_input_word & 1) {
1415               const uint32_t new_sample_idx = old_sample_idx_to_new_iter[ctzw(geno_hets)];
1416               const uint32_t new_sample_widx = new_sample_idx / kBitsPerWord;
1417               const uint32_t new_sample_lowbits = new_sample_idx % kBitsPerWord;
1418               const uintptr_t shifted_bit = k1LU << new_sample_lowbits;
1419               phasepresent[new_sample_widx] |= shifted_bit;
1420               if (tmp_phaseinfo_input_word & 1) {
1421                 phaseinfo[new_sample_widx] |= shifted_bit;
1422               }
1423               if (tmp_phasepresent_input_word == 1) {
1424                 break;
1425               }
1426               tmp_phaseinfo_input_word >>= 1;
1427             }
1428             geno_hets &= geno_hets - 1;
1429           }
1430 #endif
1431         } else {
1432           const uintptr_t sample_include_word = sample_include[widx];
1433 #ifdef USE_AVX2
1434           const uintptr_t phasepresent_word_expanded = _pdep_u64(tmp_phasepresent_input_word, geno_hets);
1435           uintptr_t phasepresent_bits_to_set = phasepresent_word_expanded & sample_include_word;
1436           if (phasepresent_bits_to_set) {
1437             // tmp_phaseinfo_input_word gives us the phasing state of the
1438             // positions in phasepresent_word_expanded.
1439             // However, we're only iterating over the positions in
1440             // (phasepresent_word_expanded & sample_include_word).
1441             // (can replace sample_include_word with phasepresent_bits_to_set
1442             // in this expression)
1443             uintptr_t collapsed_phaseinfo_input_word = _pext_u64(tmp_phaseinfo_input_word, _pext_u64(sample_include_word, phasepresent_word_expanded));
1444             while (1) {
1445               const uint32_t new_sample_idx = old_sample_idx_to_new_iter[ctzw(phasepresent_bits_to_set)];
1446               const uint32_t new_sample_widx = new_sample_idx / kBitsPerWord;
1447               const uint32_t new_sample_lowbits = new_sample_idx % kBitsPerWord;
1448               const uintptr_t shifted_bit = k1LU << new_sample_lowbits;
1449               phasepresent[new_sample_widx] |= shifted_bit;
1450               if (collapsed_phaseinfo_input_word & 1) {
1451                 phaseinfo[new_sample_widx] |= shifted_bit;
1452               }
1453 
1454               phasepresent_bits_to_set &= phasepresent_bits_to_set - 1;
1455               if (!phasepresent_bits_to_set) {
1456                 break;
1457               }
1458               collapsed_phaseinfo_input_word >>= 1;
1459             }
1460           }
1461 #else
1462           for (; ; tmp_phasepresent_input_word >>= 1) {
1463             if (tmp_phasepresent_input_word & 1) {
1464               const uintptr_t geno_hets_lowbit = geno_hets & (-geno_hets);
1465               if (sample_include_word & geno_hets_lowbit) {
1466                 const uint32_t sample_uidx_lowbits = ctzw(geno_hets_lowbit);
1467                 const uint32_t new_sample_idx = old_sample_idx_to_new_iter[sample_uidx_lowbits];
1468                 const uint32_t new_sample_widx = new_sample_idx / kBitsPerWord;
1469                 const uint32_t new_sample_lowbits = new_sample_idx % kBitsPerWord;
1470                 const uintptr_t shifted_bit = k1LU << new_sample_lowbits;
1471                 phasepresent[new_sample_widx] |= shifted_bit;
1472                 if (tmp_phaseinfo_input_word & 1) {
1473                   phaseinfo[new_sample_widx] |= shifted_bit;
1474                 }
1475               }
1476               if (tmp_phasepresent_input_word == 1) {
1477                 break;
1478               }
1479               tmp_phaseinfo_input_word >>= 1;
1480             }
1481             geno_hets &= geno_hets - 1;
1482           }
1483 #endif
1484         }
1485       }
1486     }
1487     old_sample_idx_to_new_iter = &(old_sample_idx_to_new_iter[kBitsPerWord]);
1488   }
1489 }
1490 
1491 
1492 // these also work on dphaseraw
CopyDosage(const uintptr_t * __restrict read_dosagepresent,const Dosage * read_dosagevals,uint32_t raw_sample_ct,uint32_t dosage_ct,uintptr_t * __restrict write_dosagepresent,Dosage * write_dosagevals,uint32_t * write_dosage_ct_ptr)1493 void CopyDosage(const uintptr_t* __restrict read_dosagepresent, const Dosage* read_dosagevals, uint32_t raw_sample_ct, uint32_t dosage_ct, uintptr_t* __restrict write_dosagepresent, Dosage* write_dosagevals, uint32_t* write_dosage_ct_ptr) {
1494   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
1495   *write_dosage_ct_ptr = dosage_ct;
1496   memcpy(write_dosagepresent, read_dosagepresent, raw_sample_ctl * sizeof(intptr_t));
1497   memcpy(write_dosagevals, read_dosagevals, dosage_ct * sizeof(Dosage));
1498 }
1499 
CopyAndResort8bit(const uintptr_t * __restrict src_subset,const void * __restrict src_vals,const uint32_t * __restrict new_sample_idx_to_old,uint32_t raw_sample_ct,uint32_t sample_ct,uintptr_t * __restrict dst_subset,void * __restrict dst_vals,uint32_t * __restrict cumulative_popcount_buf)1500 uint32_t CopyAndResort8bit(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uint32_t* __restrict new_sample_idx_to_old, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals, uint32_t* __restrict cumulative_popcount_buf) {
1501   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
1502   FillCumulativePopcounts(src_subset, raw_sample_ctl, cumulative_popcount_buf);
1503   const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
1504   ZeroWArr(sample_ctl, dst_subset);
1505   const unsigned char* src_vals_uc = S_CAST(const unsigned char*, src_vals);
1506   unsigned char* dst_vals_uc = S_CAST(unsigned char*, dst_vals);
1507   unsigned char* dst_vals_iter = dst_vals_uc;
1508   // Tried word-based loop, was significantly worse
1509   for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
1510     const uint32_t old_sample_idx = new_sample_idx_to_old[new_sample_idx];
1511     if (IsSet(src_subset, old_sample_idx)) {
1512       SetBit(new_sample_idx, dst_subset);
1513       const uint32_t old_dosagevals_idx = RawToSubsettedPos(src_subset, cumulative_popcount_buf, old_sample_idx);
1514       *dst_vals_iter++ = src_vals_uc[old_dosagevals_idx];
1515     }
1516   }
1517   return dst_vals_iter - dst_vals_uc;
1518 }
1519 
CopyAndResort16bit(const uintptr_t * __restrict src_subset,const void * __restrict src_vals,const uint32_t * __restrict new_sample_idx_to_old,uint32_t raw_sample_ct,uint32_t sample_ct,uintptr_t * __restrict dst_subset,void * __restrict dst_vals,uint32_t * __restrict cumulative_popcount_buf)1520 uint32_t CopyAndResort16bit(const uintptr_t* __restrict src_subset, const void* __restrict src_vals, const uint32_t* __restrict new_sample_idx_to_old, uint32_t raw_sample_ct, uint32_t sample_ct, uintptr_t* __restrict dst_subset, void* __restrict dst_vals, uint32_t* __restrict cumulative_popcount_buf) {
1521   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
1522   FillCumulativePopcounts(src_subset, raw_sample_ctl, cumulative_popcount_buf);
1523   const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
1524   ZeroWArr(sample_ctl, dst_subset);
1525   const uint16_t* src_vals_u16 = S_CAST(const uint16_t*, src_vals);
1526   uint16_t* dst_vals_u16 = S_CAST(uint16_t*, dst_vals);
1527   uint16_t* dst_vals_iter = dst_vals_u16;
1528   // Tried word-based loop, was significantly worse
1529   for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
1530     const uint32_t old_sample_idx = new_sample_idx_to_old[new_sample_idx];
1531     if (IsSet(src_subset, old_sample_idx)) {
1532       SetBit(new_sample_idx, dst_subset);
1533       const uint32_t old_dosagevals_idx = RawToSubsettedPos(src_subset, cumulative_popcount_buf, old_sample_idx);
1534       *dst_vals_iter++ = src_vals_u16[old_dosagevals_idx];
1535     }
1536   }
1537   return dst_vals_iter - dst_vals_u16;
1538 }
1539 
1540 // Requires trailing bits of genovec to be zeroed out.
1541 // "Flat" = don't separate one_cts and two_cts.
1542 void GetMFlatCounts64(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const PgenVariant* pgvp, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t allele_ct, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* all_dosages) {
1543   if (sample_ct == raw_sample_ct) {
1544     GenoarrCountFreqsUnsafe(pgvp->genovec, sample_ct, genocounts);
1545   } else {
1546     GenoarrCountSubsetFreqs(pgvp->genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
1547   }
1548   all_dosages[0] = 2 * genocounts[0] + genocounts[1];
1549   all_dosages[1] = 2 * genocounts[2] + genocounts[1];
1550   ZeroU64Arr(allele_ct - 2, &(all_dosages[2]));
1551   const AlleleCode* patch_01_vals = pgvp->patch_01_vals;
1552   const AlleleCode* patch_10_vals = pgvp->patch_10_vals;
1553   const uint32_t patch_01_ct = pgvp->patch_01_ct;
1554   const uint32_t patch_10_ct = pgvp->patch_10_ct;
1555   if (sample_ct == raw_sample_ct) {
1556     all_dosages[1] -= patch_01_ct + 2 * patch_10_ct;
1557     for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
1558       all_dosages[patch_01_vals[uii]] += 1;
1559     }
1560     const uint32_t patch_10_ct_x2 = patch_10_ct * 2;
1561     for (uint32_t uii = 0; uii != patch_10_ct_x2; ++uii) {
1562       all_dosages[patch_10_vals[uii]] += 1;
1563     }
1564   } else {
1565     if (patch_01_ct) {
1566       const uintptr_t* patch_01_set = pgvp->patch_01_set;
1567       uintptr_t sample_widx = 0;
1568       uintptr_t patch_01_bits = patch_01_set[0];
1569       uint32_t subsetted_patch_01_ct = 0;
1570       for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
1571         const uintptr_t lowbit = BitIter1y(patch_01_set, &sample_widx, &patch_01_bits);
1572         if (sample_include[sample_widx] & lowbit) {
1573           all_dosages[patch_01_vals[uii]] += 1;
1574           ++subsetted_patch_01_ct;
1575         }
1576       }
1577       all_dosages[1] -= subsetted_patch_01_ct;
1578     }
1579     if (patch_10_ct) {
1580       const uintptr_t* patch_10_set = pgvp->patch_10_set;
1581       uintptr_t sample_widx = 0;
1582       uintptr_t patch_10_bits = patch_10_set[0];
1583       uint32_t subsetted_patch_10_ct = 0;
1584       for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
1585         const uintptr_t lowbit = BitIter1y(patch_10_set, &sample_widx, &patch_10_bits);
1586         if (sample_include[sample_widx] & lowbit) {
1587           all_dosages[patch_10_vals[2 * uii]] += 1;
1588           all_dosages[patch_10_vals[2 * uii + 1]] += 1;
1589           ++subsetted_patch_10_ct;
1590         }
1591       }
1592       all_dosages[1] -= 2 * subsetted_patch_10_ct;
1593     }
1594   }
1595 }
1596 
1597 void GetMCounts64(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, const PgenVariant* pgvp, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t allele_ct, STD_ARRAY_REF(uint32_t, 4) genocounts, uint64_t* __restrict one_cts, uint64_t* __restrict two_cts) {
1598   // This mirrors GetMultiallelicCountsAndDosage16s().
1599   if (sample_ct == raw_sample_ct) {
1600     GenoarrCountFreqsUnsafe(pgvp->genovec, sample_ct, genocounts);
1601   } else {
1602     GenoarrCountSubsetFreqs(pgvp->genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
1603   }
1604   one_cts[0] = genocounts[1];
1605   one_cts[1] = genocounts[1];
1606   ZeroU64Arr(allele_ct - 2, &(one_cts[2]));
1607   two_cts[0] = genocounts[0];
1608   two_cts[1] = genocounts[2];
1609   ZeroU64Arr(allele_ct - 2, &(two_cts[2]));
1610   const AlleleCode* patch_01_vals = pgvp->patch_01_vals;
1611   const AlleleCode* patch_10_vals = pgvp->patch_10_vals;
1612   const uint32_t patch_01_ct = pgvp->patch_01_ct;
1613   const uint32_t patch_10_ct = pgvp->patch_10_ct;
1614   if (sample_ct == raw_sample_ct) {
1615     one_cts[1] -= patch_01_ct;
1616     for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
1617       one_cts[patch_01_vals[uii]] += 1;
1618     }
1619     two_cts[1] -= patch_10_ct;
1620     const AlleleCode* patch_10_vals_iter = patch_10_vals;
1621     for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
1622       const AlleleCode code_lo = *patch_10_vals_iter++;
1623       const AlleleCode code_hi = *patch_10_vals_iter++;
1624       if (code_lo == code_hi) {
1625         two_cts[code_lo] += 1;
1626       } else {
1627         one_cts[code_lo] += 1;
1628         one_cts[code_hi] += 1;
1629       }
1630     }
1631   } else {
1632     if (patch_01_ct) {
1633       const uintptr_t* patch_01_set = pgvp->patch_01_set;
1634       uintptr_t sample_widx = 0;
1635       uintptr_t patch_01_bits = patch_01_set[0];
1636       uint32_t subsetted_patch_01_ct = 0;
1637       for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
1638         const uintptr_t lowbit = BitIter1y(patch_01_set, &sample_widx, &patch_01_bits);
1639         if (sample_include[sample_widx] & lowbit) {
1640           one_cts[patch_01_vals[uii]] += 1;
1641           ++subsetted_patch_01_ct;
1642         }
1643       }
1644       one_cts[1] -= subsetted_patch_01_ct;
1645     }
1646     if (patch_10_ct) {
1647       const uintptr_t* patch_10_set = pgvp->patch_10_set;
1648       uintptr_t sample_widx = 0;
1649       uintptr_t patch_10_bits = patch_10_set[0];
1650       uint32_t subsetted_patch_10_ct = 0;
1651       for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
1652         const uintptr_t lowbit = BitIter1y(patch_10_set, &sample_widx, &patch_10_bits);
1653         if (sample_include[sample_widx] & lowbit) {
1654           ++subsetted_patch_10_ct;
1655           const AlleleCode code_lo = patch_10_vals[2 * uii];
1656           const AlleleCode code_hi = patch_10_vals[2 * uii + 1];
1657           if (code_lo == code_hi) {
1658             two_cts[code_lo] += 1;
1659           } else {
1660             one_cts[code_lo] += 1;
1661             one_cts[code_hi] += 1;
1662           }
1663         }
1664       }
1665       two_cts[1] -= subsetted_patch_10_ct;
1666     }
1667   }
1668 }
1669 
// Shared context for LoadAlleleAndGenoCountsThread() workers.  Each worker
// obtains it through arg->sharedp->context and indexes the per-thread arrays
// below with its own tidx.
typedef struct LoadAlleleAndGenoCountsCtxStruct {
  // Variant selection and chromosome/allele layout.  When allele_idx_offsets
  // is nullptr, the worker uses 2 * variant_uidx as the allele index offset.
  const uintptr_t* variant_include;
  const ChrInfo* cip;
  const uintptr_t* allele_idx_offsets;

  // Sample subsets.  The *_interleaved_vec and *_cumulative_popcounts members
  // are passed alongside the matching bitarray to the counting functions and
  // to PgrSetSampleSubsetIndex() respectively.
  const uintptr_t* sample_include;
  uintptr_t* sample_include_interleaved_vec;
  uint32_t* sample_include_cumulative_popcounts;
  const uintptr_t* sex_male;
  uintptr_t* sex_male_interleaved_vec;
  uint32_t* sex_male_cumulative_popcounts;
  uintptr_t* nosex_interleaved_vec;
  // The worker performs an additional subset pass when founder_info is
  // non-null (subset_ct = (founder_info != nullptr) + 1).
  const uintptr_t* founder_info;
  uintptr_t* founder_info_interleaved_vec;
  uint32_t* founder_info_cumulative_popcounts;
  uintptr_t* founder_male;
  uintptr_t* founder_male_interleaved_vec;
  uint32_t* founder_male_cumulative_popcounts;
  uintptr_t* founder_nosex_interleaved_vec;
  uint32_t raw_sample_ct;
  uint32_t sample_ct;
  uint32_t founder_ct;
  uint32_t male_ct;
  uint32_t nosex_ct;
  uint32_t founder_male_ct;
  uint32_t founder_nosex_ct;
  uint32_t first_hap_uidx;
  // Forwarded to PgrGetDCounts(); when zero, the worker additionally halves
  // imp_r2_vals[] entries for haploid variants.
  uint32_t is_minimac3_r2;

  // Per-thread reader handles, indexed by tidx.
  PgenReader** pgr_ptrs;

  // Per-thread scratch buffers, indexed by tidx.  dosage_presents and
  // all_dosages may be nullptr; the worker checks before indexing them.
  uintptr_t** genovecs;
  uintptr_t** thread_read_mhc;
  uintptr_t** dosage_presents;
  Dosage** dosage_mains;
  uint64_t** all_dosages;
  // Per-thread starting variant_uidx for the current block.
  uint32_t* read_variant_uidx_starts;

  // shouldn't need array, or errno storage, since kPglRetMalformedInput is the
  // only possible error for now
  PglErr reterr;

  // Number of variants in the current block; each worker handles the slice
  // [tidx * cur_block_size / thread_ct, (tidx + 1) * cur_block_size / thread_ct).
  uint32_t cur_block_size;

  // Output arrays.  Any of these may be nullptr, in which case the worker
  // skips the corresponding computation.  allele_presents_bytearr and
  // allele_ddosages are indexed by allele index offset; imp_r2_vals by
  // variant_uidx.
  unsigned char* allele_presents_bytearr;
  uint64_t* allele_ddosages;
  STD_ARRAY_PTR_DECL(uint32_t, 3, raw_geno_cts);
  uint32_t* variant_missing_hc_cts;
  uint32_t* variant_missing_dosage_cts;
  uint32_t* variant_hethap_cts;
  uint64_t* founder_allele_ddosages;
  STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts);
  STD_ARRAY_PTR_DECL(uint32_t, 3, x_male_geno_cts);
  STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts);
  STD_ARRAY_PTR_DECL(uint32_t, 3, x_nosex_geno_cts);
  STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts);
  double* imp_r2_vals;
} LoadAlleleAndGenoCountsCtx;
1727 
LoadAlleleAndGenoCountsThread(void * raw_arg)1728 THREAD_FUNC_DECL LoadAlleleAndGenoCountsThread(void* raw_arg) {
1729   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
1730   const uintptr_t tidx = arg->tidx;
1731   LoadAlleleAndGenoCountsCtx* ctx = S_CAST(LoadAlleleAndGenoCountsCtx*, arg->sharedp->context);
1732 
1733   const uintptr_t* variant_include = ctx->variant_include;
1734   const ChrInfo* cip = ctx->cip;
1735   const uintptr_t* allele_idx_offsets = ctx->allele_idx_offsets;
1736   const uint32_t thread_ct = GetThreadCt(arg->sharedp);
1737   const uint32_t subset_ct = (ctx->founder_info != nullptr) + 1;
1738   const uint32_t raw_sample_ct = ctx->raw_sample_ct;
1739   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
1740   const uint32_t first_hap_uidx = ctx->first_hap_uidx;
1741   const uint32_t is_minimac3_r2 = ctx->is_minimac3_r2;
1742   const uint32_t y_code = cip->xymt_codes[kChrOffsetY];
1743   PgenReader* pgrp = ctx->pgr_ptrs[tidx];
1744   PgenVariant pgv;
1745   pgv.genovec = ctx->genovecs[tidx];
1746   SetPgvThreadMhcNull(raw_sample_ct, tidx, ctx->thread_read_mhc, &pgv);
1747   pgv.dosage_present = nullptr;
1748   pgv.dosage_main = nullptr;
1749   if (ctx->dosage_presents) {
1750     pgv.dosage_present = ctx->dosage_presents[tidx];
1751     pgv.dosage_main = ctx->dosage_mains[tidx];
1752   }
1753   uint64_t* all_dosages = nullptr;
1754   if (ctx->all_dosages) {
1755     all_dosages = ctx->all_dosages[tidx];
1756   }
1757   uint32_t is_y = 0;
1758   uint32_t is_nonxy_haploid = 0;
1759   uint32_t x_start = 0;
1760   uint32_t x_code;
1761   if (XymtExists(cip, kChrOffsetX, &x_code)) {
1762     const uint32_t x_chr_fo_idx = cip->chr_idx_to_foidx[x_code];
1763     x_start = cip->chr_fo_vidx_start[x_chr_fo_idx];
1764   }
1765   uint32_t allele_ct = 2;
1766   do {
1767     const uintptr_t cur_block_size = ctx->cur_block_size;
1768     // no overflow danger since cur_block_size <= 2^16, tidx < (2^16 - 1)
1769     const uint32_t cur_idx_end = ((tidx + 1) * cur_block_size) / thread_ct;
1770     const uintptr_t* sample_include = ctx->sample_include;
1771     const uintptr_t* sample_include_interleaved_vec = ctx->sample_include_interleaved_vec;
1772     const uint32_t* sample_include_cumulative_popcounts = ctx->sample_include_cumulative_popcounts;
1773     const uintptr_t* sex_male = ctx->sex_male;
1774     const uintptr_t* sex_male_interleaved_vec = ctx->sex_male_interleaved_vec;
1775     const uint32_t* sex_male_cumulative_popcounts = ctx->sex_male_cumulative_popcounts;
1776     const uintptr_t* nosex_interleaved_vec = ctx->nosex_interleaved_vec;
1777     uint32_t sample_ct = ctx->sample_ct;
1778     uint32_t male_ct = ctx->male_ct;
1779     uint32_t nosex_ct = ctx->nosex_ct;
1780     unsigned char* allele_presents_bytearr = ctx->allele_presents_bytearr;
1781     uint64_t* allele_ddosages = ctx->allele_ddosages;
1782     STD_ARRAY_PTR_DECL(uint32_t, 3, raw_geno_cts) = ctx->raw_geno_cts;
1783     uint32_t* variant_missing_hc_cts = ctx->variant_missing_hc_cts;
1784     uint32_t* variant_missing_dosage_cts = ctx->variant_missing_dosage_cts;
1785     uint32_t* variant_hethap_cts = ctx->variant_hethap_cts;
1786     STD_ARRAY_PTR_DECL(uint32_t, 3, x_male_geno_cts) = ctx->x_male_geno_cts;
1787     STD_ARRAY_PTR_DECL(uint32_t, 3, x_nosex_geno_cts) = ctx->x_nosex_geno_cts;
1788     double* imp_r2_vals = ctx->imp_r2_vals;
1789     pgv.dosage_ct = 0;
1790     for (uint32_t subset_idx = 0; ; ) {
1791       // bugfix (29 Dec 2019): this boolean can change with subset_idx
1792       const uint32_t no_multiallelic_branch = (!variant_hethap_cts) && (!allele_presents_bytearr) && (!allele_ddosages) && (!imp_r2_vals);
1793       PgrSampleSubsetIndex pssi;
1794       PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, pgrp, &pssi);
1795       uint32_t cur_idx = (tidx * cur_block_size) / thread_ct;
1796       uintptr_t variant_uidx_base;
1797       uintptr_t variant_include_bits;
1798       BitIter1Start(variant_include, ctx->read_variant_uidx_starts[tidx], &variant_uidx_base, &variant_include_bits);
1799       uint32_t chr_end = 0;
1800       uint32_t is_x_or_y = 0;
1801       PglErr reterr = kPglRetSuccess;
1802 
1803       STD_ARRAY_DECL(uint32_t, 4, genocounts);
1804       STD_ARRAY_DECL(uint32_t, 4, sex_specific_genocounts);
1805       for (; cur_idx != cur_idx_end; ++cur_idx) {
1806         const uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &variant_include_bits);
1807         if (variant_uidx >= chr_end) {
1808           const uint32_t chr_fo_idx = GetVariantChrFoIdx(cip, variant_uidx);
1809           const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
1810           chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
1811           is_y = 0;
1812           is_nonxy_haploid = 0;
1813           if (chr_idx == x_code) {
1814             is_x_or_y = 1;
1815             PgrClearSampleSubsetIndex(pgrp, &pssi);
1816           } else if (chr_idx == y_code) {
1817             is_x_or_y = 1;
1818             is_y = 1;
1819             // ugh
1820             if ((!allele_presents_bytearr) || (sample_ct == male_ct)) {
1821               PgrSetSampleSubsetIndex(sex_male_cumulative_popcounts, pgrp, &pssi);
1822             } else {
1823               PgrClearSampleSubsetIndex(pgrp, &pssi);
1824             }
1825           } else {
1826             if (is_x_or_y) {
1827               PgrSetSampleSubsetIndex(sample_include_cumulative_popcounts, pgrp, &pssi);
1828             }
1829             is_x_or_y = 0;
1830             // true for MT
1831             is_nonxy_haploid = IsSet(cip->haploid_mask, chr_idx);
1832           }
1833         }
1834         uintptr_t cur_allele_idx_offset;
1835         if (!allele_idx_offsets) {
1836           cur_allele_idx_offset = 2 * variant_uidx;
1837         } else {
1838           cur_allele_idx_offset = allele_idx_offsets[variant_uidx];
1839           allele_ct = allele_idx_offsets[variant_uidx + 1] - cur_allele_idx_offset;
1840         }
1841         uint32_t hethap_ct;
1842         if ((allele_ct == 2) || no_multiallelic_branch) {
1843           uint64_t cur_dosages[2];
1844           if (!is_x_or_y) {
1845             reterr = PgrGetDCounts(sample_include, sample_include_interleaved_vec, pssi, sample_ct, variant_uidx, is_minimac3_r2, pgrp, imp_r2_vals? (&(imp_r2_vals[variant_uidx])) : nullptr, genocounts, cur_dosages);
1846             if (unlikely(reterr)) {
1847               ctx->reterr = reterr;
1848               break;
1849             }
1850             if (allele_presents_bytearr) {
1851               if (cur_dosages[0]) {
1852                 allele_presents_bytearr[cur_allele_idx_offset] = 128;
1853               }
1854               if (cur_dosages[1]) {
1855                 allele_presents_bytearr[cur_allele_idx_offset + 1] = 128;
1856               }
1857             }
1858             if (!is_nonxy_haploid) {
1859               hethap_ct = 0;
1860               if (allele_ddosages) {
1861                 // ...but save all allele counts here.
1862                 allele_ddosages[cur_allele_idx_offset] = cur_dosages[0] * 2;
1863                 allele_ddosages[cur_allele_idx_offset + 1] = cur_dosages[1] * 2;
1864               }
1865             } else {
1866               // this hethap_ct can be inaccurate in multiallelic case
1867               hethap_ct = genocounts[1];
1868               if (imp_r2_vals && (!is_minimac3_r2)) {
1869                 // Assuming the input data isn't malformed "phased haploid",
1870                 // minimac3-r2 is independent of haploid/diploid state; only
1871                 // mach-r2 requires a haploid correction.
1872                 imp_r2_vals[variant_uidx] *= 0.5;
1873               }
1874               if (allele_ddosages) {
1875                 allele_ddosages[cur_allele_idx_offset] = cur_dosages[0];
1876                 allele_ddosages[cur_allele_idx_offset + 1] = cur_dosages[1];
1877               }
1878             }
1879           } else if (is_y) {
1880             if ((!allele_presents_bytearr) || (sample_ct == male_ct)) {
1881               reterr = PgrGetDCounts(sex_male, sex_male_interleaved_vec, pssi, male_ct, variant_uidx, 0, pgrp, imp_r2_vals? (&(imp_r2_vals[variant_uidx])) : nullptr, genocounts, cur_dosages);
1882               if (unlikely(reterr)) {
1883                 ctx->reterr = reterr;
1884                 break;
1885               }
1886               hethap_ct = genocounts[1];
1887               if (imp_r2_vals && (!is_minimac3_r2)) {
1888                 // note that female/unknown-sex are not counted here
1889                 imp_r2_vals[variant_uidx] *= 0.5;
1890               }
1891               if (allele_presents_bytearr) {
1892                 if (cur_dosages[0]) {
1893                   allele_presents_bytearr[cur_allele_idx_offset] = 128;
1894                 }
1895                 if (cur_dosages[1]) {
1896                   allele_presents_bytearr[cur_allele_idx_offset + 1] = 128;
1897                 }
1898               }
1899               if (allele_ddosages) {
1900                 allele_ddosages[cur_allele_idx_offset] = cur_dosages[0];
1901                 allele_ddosages[cur_allele_idx_offset + 1] = cur_dosages[1];
1902               }
1903             } else {
1904               // ugh, need to count female/unknown-sex for allele_presents and
1905               // ignore elsewhere
1906               reterr = PgrGetD(nullptr, pssi, raw_sample_ct, variant_uidx, pgrp, pgv.genovec, pgv.dosage_present, pgv.dosage_main, &pgv.dosage_ct);
1907               if (unlikely(reterr)) {
1908                 ctx->reterr = reterr;
1909                 break;
1910               }
1911               const uint32_t dosage_is_relevant = pgv.dosage_ct && ((sample_ct == raw_sample_ct) || (!IntersectionIsEmpty(sample_include, pgv.dosage_present, raw_sample_ctl)));
1912               if (dosage_is_relevant) {
1913                 // at least one dosage value is present, that's all we need to
1914                 // know
1915                 allele_presents_bytearr[cur_allele_idx_offset] = 128;
1916                 allele_presents_bytearr[cur_allele_idx_offset + 1] = 128;
1917               } else {
1918                 // only hardcalls matter
1919                 // bugfix (31 Jul 2018): forgot to initialize genocounts here
1920                 // possible todo: use a specialized function which just checks
1921                 // which alleles exist
1922                 if (sample_ct == raw_sample_ct) {
1923                   ZeroTrailingNyps(raw_sample_ct, pgv.genovec);
1924                   GenoarrCountFreqsUnsafe(pgv.genovec, sample_ct, genocounts);
1925                 } else {
1926                   GenoarrCountSubsetFreqs(pgv.genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
1927                 }
1928                 if (genocounts[0] || genocounts[1]) {
1929                   allele_presents_bytearr[cur_allele_idx_offset] = 128;
1930                 }
1931                 if (genocounts[1] || genocounts[2]) {
1932                   allele_presents_bytearr[cur_allele_idx_offset + 1] = 128;
1933                 }
1934               }
1935               GenoarrCountSubsetFreqs(pgv.genovec, sex_male_interleaved_vec, raw_sample_ct, male_ct, genocounts);
1936               hethap_ct = genocounts[1];
1937               // x2, x4 since this is haploid
1938               uintptr_t alt1_ct_x2 = genocounts[2] * 2 + hethap_ct;
1939               uintptr_t alt1_sq_sum_x4 = genocounts[2] * (4 * k1LU) + hethap_ct;
1940               uint64_t alt1_ddosage = 0;  // in 32768ths
1941               uint64_t alt1_ddosage_sq_sum = 0;
1942               uint32_t additional_dosage_ct = 0;
1943               if (dosage_is_relevant) {
1944                 uintptr_t sample_widx = 0;
1945                 uintptr_t dosage_present_bits = pgv.dosage_present[0];
1946                 uint32_t sample_uidx = 0;
1947                 for (uint32_t dosage_idx = 0; dosage_idx != pgv.dosage_ct; ++dosage_idx) {
1948                   const uintptr_t lowbit = BitIter1y(pgv.dosage_present, &sample_widx, &dosage_present_bits);
1949                   if (sample_include[sample_widx] & lowbit) {
1950                     const uintptr_t cur_dosage_val = pgv.dosage_main[dosage_idx];
1951                     alt1_ddosage += cur_dosage_val;
1952                     alt1_ddosage_sq_sum += cur_dosage_val * cur_dosage_val;
1953                     const uintptr_t hardcall_code = GetNyparrEntry(pgv.genovec, sample_uidx);
1954                     if (hardcall_code != 3) {
1955                       alt1_ct_x2 -= hardcall_code;
1956                       alt1_sq_sum_x4 -= hardcall_code * hardcall_code;
1957                     } else {
1958                       ++additional_dosage_ct;
1959                     }
1960                   }
1961                 }
1962               }
1963               const uintptr_t obs_ct = male_ct + additional_dosage_ct - genocounts[3];
1964               alt1_ddosage += alt1_ct_x2 * S_CAST(uint64_t, kDosageMid);
1965               alt1_ddosage_sq_sum += alt1_sq_sum_x4 * 0x10000000LLU;
1966               cur_dosages[0] = obs_ct * S_CAST(uint64_t, kDosageMax) - alt1_ddosage;
1967               cur_dosages[1] = alt1_ddosage;
1968               if (imp_r2_vals) {
1969                 // minimac3-r2 and mach-r2 are identical in haploid case
1970                 const double dosage_sumd = u63tod(alt1_ddosage);
1971                 const double dosage_avg = dosage_sumd / u31tod(obs_ct);
1972                 const double dosage_variance = u63tod(alt1_ddosage_sq_sum) - dosage_sumd * dosage_avg;
1973                 imp_r2_vals[variant_uidx] = dosage_variance / (dosage_sumd * (32768 - dosage_avg));
1974               }
1975               if (allele_ddosages) {
1976                 allele_ddosages[cur_allele_idx_offset] = cur_dosages[0];
1977                 allele_ddosages[cur_allele_idx_offset + 1] = alt1_ddosage;
1978               }
1979             }
1980           } else {
1981             // chrX
1982             reterr = PgrGetD(nullptr, pssi, raw_sample_ct, variant_uidx, pgrp, pgv.genovec, pgv.dosage_present, pgv.dosage_main, &pgv.dosage_ct);
1983             if (unlikely(reterr)) {
1984               ctx->reterr = reterr;
1985               break;
1986             }
1987             if (sample_ct == raw_sample_ct) {
1988               ZeroTrailingNyps(raw_sample_ct, pgv.genovec);
1989               GenoarrCountFreqsUnsafe(pgv.genovec, sample_ct, genocounts);
1990             } else {
1991               GenoarrCountSubsetFreqs(pgv.genovec, sample_include_interleaved_vec, raw_sample_ct, sample_ct, genocounts);
1992             }
1993             GenoarrCountSubsetFreqs(pgv.genovec, sex_male_interleaved_vec, raw_sample_ct, male_ct, sex_specific_genocounts);
1994             hethap_ct = sex_specific_genocounts[1];
1995             // Could compute imputation r2 iff there are no unknown-sex
1996             // samples, but probably not worth it since larger datasets could
1997             // have a small number of Klinefelter syndrome cases, etc. coded as
1998             // unknown-sex, and we don't want to discourage their inclusion;
1999             // let's delegate that chrX filter to other software for now.
2000 
2001             if (allele_presents_bytearr) {
2002               if (pgv.dosage_ct && ((sample_ct == raw_sample_ct) || (!IntersectionIsEmpty(sample_include, pgv.dosage_present, raw_sample_ctl)))) {
2003                 // at least one dosage value is present, that's all we need to
2004                 // know
2005                 allele_presents_bytearr[cur_allele_idx_offset] = 128;
2006                 allele_presents_bytearr[cur_allele_idx_offset + 1] = 128;
2007               } else {
2008                 // only hardcalls matter
2009                 if (genocounts[0] || genocounts[1]) {
2010                   allele_presents_bytearr[cur_allele_idx_offset] = 128;
2011                 }
2012                 if (genocounts[1] || genocounts[2]) {
2013                   allele_presents_bytearr[cur_allele_idx_offset + 1] = 128;
2014                 }
2015               }
2016             }
2017             if (allele_ddosages) {
2018               uintptr_t alt1_ct = 4 * genocounts[2] + 2 * genocounts[1] - 2 * sex_specific_genocounts[2] - hethap_ct;  // nonmales count twice
2019               uint64_t alt1_ddosage = 0;  // in 32768ths, nonmales count twice
2020               uint32_t additional_dosage_ct = 0;  // missing hardcalls only; nonmales count twice
2021               // bugfix (12 Jul 2018): dosage_present may be null if dosage_ct
2022               // == 0
2023               if (pgv.dosage_ct) {
2024                 uintptr_t sample_uidx_base = 0;
2025                 uintptr_t dosage_present_bits = pgv.dosage_present[0];
2026                 if (sample_ct == raw_sample_ct) {
2027                   for (uint32_t dosage_idx = 0; dosage_idx != pgv.dosage_ct; ++dosage_idx) {
2028                     const uintptr_t sample_uidx = BitIter1(pgv.dosage_present, &sample_uidx_base, &dosage_present_bits);
2029                     const uintptr_t cur_dosage_val = pgv.dosage_main[dosage_idx];
2030                     const uintptr_t sex_multiplier = 2 - IsSet(sex_male, sample_uidx);
2031                     alt1_ddosage += cur_dosage_val * sex_multiplier;
2032 
2033                     // could call GenoarrCountSubsetIntersectFreqs() twice
2034                     // instead, but since we've already manually extracted the
2035                     // sex bit it probably doesn't help?
2036                     const uintptr_t hardcall_code = GetNyparrEntry(pgv.genovec, sample_uidx);
2037                     if (hardcall_code != 3) {
2038                       alt1_ct -= hardcall_code * sex_multiplier;
2039                     } else {
2040                       additional_dosage_ct += sex_multiplier;
2041                     }
2042                   }
2043                 } else {
2044                   for (uint32_t dosage_idx = 0; dosage_idx != pgv.dosage_ct; ++dosage_idx) {
2045                     const uintptr_t sample_uidx = BitIter1(pgv.dosage_present, &sample_uidx_base, &dosage_present_bits);
2046                     if (IsSet(sample_include, sample_uidx)) {
2047                       const uintptr_t cur_dosage_val = pgv.dosage_main[dosage_idx];
2048                       const uintptr_t sex_multiplier = 2 - IsSet(sex_male, sample_uidx);
2049                       alt1_ddosage += cur_dosage_val * sex_multiplier;
2050                       const uintptr_t hardcall_code = GetNyparrEntry(pgv.genovec, sample_uidx);
2051                       if (hardcall_code != 3) {
2052                         alt1_ct -= hardcall_code * sex_multiplier;
2053                       } else {
2054                         additional_dosage_ct += sex_multiplier;
2055                       }
2056                     }
2057                   }
2058                 }
2059               }
2060               alt1_ddosage += alt1_ct * S_CAST(uint64_t, kDosageMid);
2061 
2062               // bugfix (14 May 2018): this didn't correctly distinguish
2063               // between missing vs. 'replaced' hardcalls
2064               const uintptr_t weighted_obs_ct = (2 * (sample_ct - genocounts[3]) - male_ct + sex_specific_genocounts[3] + additional_dosage_ct) * (2 * k1LU);
2065 
2066               allele_ddosages[cur_allele_idx_offset] = weighted_obs_ct * S_CAST(uint64_t, kDosageMid) - alt1_ddosage;
2067               allele_ddosages[cur_allele_idx_offset + 1] = alt1_ddosage;
2068             }
2069             if (x_male_geno_cts) {
2070               STD_ARRAY_REF(uint32_t, 3) cur_x_male_geno_cts = x_male_geno_cts[variant_uidx - x_start];
2071               cur_x_male_geno_cts[0] = sex_specific_genocounts[0];
2072               cur_x_male_geno_cts[1] = sex_specific_genocounts[1];
2073               cur_x_male_geno_cts[2] = sex_specific_genocounts[2];
2074               if (x_nosex_geno_cts) {
2075                 GenoarrCountSubsetFreqs(pgv.genovec, nosex_interleaved_vec, raw_sample_ct, nosex_ct, sex_specific_genocounts);
2076                 STD_ARRAY_REF(uint32_t, 3) cur_nosex_geno_cts = x_nosex_geno_cts[variant_uidx - x_start];
2077                 cur_nosex_geno_cts[0] = sex_specific_genocounts[0];
2078                 cur_nosex_geno_cts[1] = sex_specific_genocounts[1];
2079                 cur_nosex_geno_cts[2] = sex_specific_genocounts[2];
2080               }
2081             }
2082           }
2083           if (variant_missing_dosage_cts) {
2084             uint32_t missing_dosage_ct;
2085             if (!is_x_or_y) {
2086               missing_dosage_ct = sample_ct - ((cur_dosages[0] + cur_dosages[1]) / kDosageMax);
2087             } else if (is_y) {
2088               missing_dosage_ct = male_ct - ((cur_dosages[0] + cur_dosages[1]) / kDosageMax);
2089             } else {
2090               if (pgv.dosage_ct) {
2091                 ZeroTrailingNyps(raw_sample_ct, pgv.genovec);
2092                 missing_dosage_ct = GenoarrCountMissingInvsubsetUnsafe(pgv.genovec, pgv.dosage_present, raw_sample_ct);
2093               } else {
2094                 missing_dosage_ct = genocounts[3];
2095               }
2096             }
2097             variant_missing_dosage_cts[variant_uidx] = missing_dosage_ct;
2098           }
2099         } else {
2100           // multiallelic cases
2101           if (!is_x_or_y) {
2102             reterr = PgrGetMDCounts(sample_include, sample_include_interleaved_vec, pssi, sample_ct, variant_uidx, is_minimac3_r2, pgrp, imp_r2_vals? (&(imp_r2_vals[variant_uidx])) : nullptr, &hethap_ct, genocounts, all_dosages);
2103             if (unlikely(reterr)) {
2104               ctx->reterr = reterr;
2105               break;
2106             }
2107             if (allele_presents_bytearr) {
2108               for (uintptr_t aidx = 0; aidx != allele_ct; ++aidx) {
2109                 if (all_dosages[aidx]) {
2110                   allele_presents_bytearr[cur_allele_idx_offset + aidx] = 128;
2111                 }
2112               }
2113             }
2114             if (!is_nonxy_haploid) {
2115               hethap_ct = 0;
2116               if (allele_ddosages) {
2117                 for (uintptr_t aidx = 0; aidx != allele_ct; ++aidx) {
2118                   allele_ddosages[cur_allele_idx_offset + aidx] = all_dosages[aidx] * 2;
2119                 }
2120               }
2121             } else {
2122               if (imp_r2_vals && (!is_minimac3_r2)) {
2123                 imp_r2_vals[variant_uidx] *= 0.5;
2124               }
2125               if (allele_ddosages) {
2126                 memcpy(&(allele_ddosages[cur_allele_idx_offset]), all_dosages, allele_ct * sizeof(int64_t));
2127               }
2128             }
2129           } else if (is_y) {
2130             if ((!allele_presents_bytearr) || (sample_ct == male_ct)) {
2131               reterr = PgrGetMDCounts(sex_male, sex_male_interleaved_vec, pssi, male_ct, variant_uidx, 0, pgrp, imp_r2_vals? (&(imp_r2_vals[variant_uidx])) : nullptr, &hethap_ct, genocounts, all_dosages);
2132               if (unlikely(reterr)) {
2133                 ctx->reterr = reterr;
2134                 break;
2135               }
2136               if (imp_r2_vals && (!is_minimac3_r2)) {
2137                 imp_r2_vals[variant_uidx] *= 0.5;
2138               }
2139               if (allele_presents_bytearr) {
2140                 for (uintptr_t aidx = 0; aidx != allele_ct; ++aidx) {
2141                   if (all_dosages[aidx]) {
2142                     allele_presents_bytearr[cur_allele_idx_offset + aidx] = 128;
2143                   }
2144                 }
2145               }
2146               if (allele_ddosages) {
2147                 memcpy(&(allele_ddosages[cur_allele_idx_offset]), all_dosages, allele_ct * sizeof(int64_t));
2148               }
2149             } else {
2150               // need to count female/unknown-sex for allele_presents and
2151               // ignore elsewhere
2152               reterr = PgrGetM(nullptr, pssi, raw_sample_ct, variant_uidx, pgrp, &pgv);
2153               if (unlikely(reterr)) {
2154                 ctx->reterr = reterr;
2155                 break;
2156               }
2157               // possible todo: use a specialized function which just checks
2158               // which alleles exist
2159               ZeroTrailingNyps(raw_sample_ct, pgv.genovec);
2160               GetMFlatCounts64(sample_include, sample_include_interleaved_vec, &pgv, raw_sample_ct, sample_ct, allele_ct, genocounts, all_dosages);
2161               for (uintptr_t aidx = 0; aidx != allele_ct; ++aidx) {
2162                 if (all_dosages[aidx]) {
2163                   allele_presents_bytearr[cur_allele_idx_offset + aidx] = 128;
2164                 }
2165               }
2166 
2167               uint64_t* two_cts = &(all_dosages[allele_ct]);
2168               GetMCounts64(sex_male, sex_male_interleaved_vec, &pgv, raw_sample_ct, male_ct, allele_ct, genocounts, all_dosages, two_cts);
2169               uintptr_t hethap_x2 = 0;
2170               for (uint32_t aidx = 0; aidx != allele_ct; ++aidx) {
2171                 hethap_x2 += all_dosages[aidx];
2172               }
2173               hethap_ct = hethap_x2 / 2;
2174               if (allele_ddosages) {
2175                 for (uintptr_t aidx = 0; aidx != allele_ct; ++aidx) {
2176                   allele_ddosages[cur_allele_idx_offset + aidx] = all_dosages[aidx] * kDosageMid + two_cts[aidx] * kDosageMax;
2177                 }
2178               }
2179               if (imp_r2_vals) {
2180                 for (uint32_t aidx = 0; aidx != allele_ct; ++aidx) {
2181                   const uint64_t one_ct = allele_ddosages[aidx];
2182                   const uint64_t two_ct = two_cts[aidx];
2183                   // now sums
2184                   allele_ddosages[aidx] = one_ct * kDosageMid + two_ct * kDosageMax;
2185                   // now ssqs
2186                   two_cts[aidx] = one_ct * kDosageMid * kDosageMid + two_ct * kDosageMax * kDosageMax;
2187                 }
2188                 imp_r2_vals[variant_uidx] = 0.5 * MultiallelicDiploidMachR2(all_dosages, two_cts, male_ct - genocounts[3], allele_ct);
2189               }
2190             }
2191           } else {
2192             // chrX
2193             // multiallelic dosages not supported yet
2194             reterr = PgrGetM(nullptr, pssi, raw_sample_ct, variant_uidx, pgrp, &pgv);
2195             if (unlikely(reterr)) {
2196               ctx->reterr = reterr;
2197               break;
2198             }
2199             ZeroTrailingNyps(raw_sample_ct, pgv.genovec);
2200             // We don't attempt to compute imp_r2 on chrX, so flat counts are
2201             // fine.
2202             GetMFlatCounts64(sample_include, sample_include_interleaved_vec, &pgv, raw_sample_ct, sample_ct, allele_ct, genocounts, all_dosages);
2203 
2204             // Double all counts, then subtract male counts.
2205             for (uint32_t aidx = 0; aidx != allele_ct; ++aidx) {
2206               all_dosages[aidx] *= 2;
2207             }
2208             GenoarrCountSubsetFreqs(pgv.genovec, sex_male_interleaved_vec, raw_sample_ct, male_ct, sex_specific_genocounts);
2209             hethap_ct = sex_specific_genocounts[1];
2210             if (male_ct) {
2211               all_dosages[0] -= 2 * sex_specific_genocounts[0] + hethap_ct;
2212 
2213               // may underflow
2214               all_dosages[1] -= 2 * sex_specific_genocounts[2] + hethap_ct;
2215 
2216               if (pgv.patch_01_ct) {
2217                 uintptr_t sample_widx = 0;
2218                 uintptr_t patch_01_bits = pgv.patch_01_set[0];
2219                 uint32_t male_patch_01_ct = 0;
2220                 for (uint32_t uii = 0; uii != pgv.patch_01_ct; ++uii) {
2221                   const uintptr_t lowbit = BitIter1y(pgv.patch_01_set, &sample_widx, &patch_01_bits);
2222                   if (sex_male[sample_widx] & lowbit) {
2223                     ++male_patch_01_ct;
2224                     all_dosages[pgv.patch_01_vals[uii]] -= 1;
2225                   }
2226                 }
2227                 all_dosages[1] += male_patch_01_ct;
2228               }
2229               if (pgv.patch_10_ct) {
2230                 uintptr_t sample_widx = 0;
2231                 uintptr_t patch_10_bits = pgv.patch_10_set[0];
2232                 uint32_t male_patch_10_ct = 0;
2233                 for (uint32_t uii = 0; uii != pgv.patch_10_ct; ++uii) {
2234                   const uintptr_t lowbit = BitIter1y(pgv.patch_10_set, &sample_widx, &patch_10_bits);
2235                   if (sex_male[sample_widx] & lowbit) {
2236                     ++male_patch_10_ct;
2237                     const AlleleCode code_lo = pgv.patch_10_vals[2 * uii];
2238                     const AlleleCode code_hi = pgv.patch_10_vals[2 * uii + 1];
2239                     all_dosages[code_lo] -= 1;
2240                     all_dosages[code_hi] -= 1;
2241                     hethap_ct += (code_lo != code_hi);
2242                   }
2243                 }
2244                 all_dosages[1] += male_patch_10_ct * 2;
2245               }
2246             }
2247             if (allele_presents_bytearr) {
2248               for (uintptr_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
2249                 if (all_dosages[allele_idx]) {
2250                   allele_presents_bytearr[cur_allele_idx_offset + allele_idx] = 128;
2251                 }
2252               }
2253             }
2254             if (allele_ddosages) {
2255               for (uintptr_t aidx = 0; aidx != allele_ct; ++aidx) {
2256                 allele_ddosages[cur_allele_idx_offset + aidx] = all_dosages[aidx] * kDosageMid;
2257               }
2258             }
2259             if (x_male_geno_cts) {
2260               STD_ARRAY_REF(uint32_t, 3) cur_x_male_geno_cts = x_male_geno_cts[variant_uidx - x_start];
2261               cur_x_male_geno_cts[0] = sex_specific_genocounts[0];
2262               cur_x_male_geno_cts[1] = sex_specific_genocounts[1];
2263               cur_x_male_geno_cts[2] = sex_specific_genocounts[2];
2264               if (x_nosex_geno_cts) {
2265                 GenoarrCountSubsetFreqs(pgv.genovec, nosex_interleaved_vec, raw_sample_ct, nosex_ct, sex_specific_genocounts);
2266                 STD_ARRAY_REF(uint32_t, 3) cur_nosex_geno_cts = x_nosex_geno_cts[variant_uidx - x_start];
2267                 cur_nosex_geno_cts[0] = sex_specific_genocounts[0];
2268                 cur_nosex_geno_cts[1] = sex_specific_genocounts[1];
2269                 cur_nosex_geno_cts[2] = sex_specific_genocounts[2];
2270               }
2271             }
2272           }
2273           if (variant_missing_dosage_cts) {
2274             // multiallelic dosage not supported yet
2275             variant_missing_dosage_cts[variant_uidx] = genocounts[3];
2276           }
2277         }
2278         if (raw_geno_cts) {
2279           STD_ARRAY_REF(uint32_t, 3) cur_raw_geno_cts = raw_geno_cts[variant_uidx];
2280           cur_raw_geno_cts[0] = genocounts[0];
2281           cur_raw_geno_cts[1] = genocounts[1];
2282           cur_raw_geno_cts[2] = genocounts[2];
2283         }
2284         if (variant_missing_hc_cts) {
2285           variant_missing_hc_cts[variant_uidx] = genocounts[3];
2286           if (variant_hethap_cts && (variant_uidx >= first_hap_uidx)) {
2287             variant_hethap_cts[variant_uidx - first_hap_uidx] = hethap_ct;
2288           }
2289         }
2290       }
2291       if ((++subset_idx == subset_ct) || reterr) {
2292         break;
2293       }
2294       sample_include = ctx->founder_info;
2295       sample_include_interleaved_vec = ctx->founder_info_interleaved_vec;
2296       sample_include_cumulative_popcounts = ctx->founder_info_cumulative_popcounts;
2297       sex_male = ctx->founder_male;
2298       sex_male_interleaved_vec = ctx->founder_male_interleaved_vec;
2299       sex_male_cumulative_popcounts = ctx->founder_male_cumulative_popcounts;
2300 
2301       nosex_interleaved_vec = ctx->founder_nosex_interleaved_vec;
2302 
2303       sample_ct = ctx->founder_ct;
2304       male_ct = ctx->founder_male_ct;
2305       nosex_ct = ctx->founder_nosex_ct;
2306       allele_presents_bytearr = nullptr;
2307       allele_ddosages = ctx->founder_allele_ddosages;
2308       variant_missing_hc_cts = nullptr;
2309       variant_missing_dosage_cts = nullptr;
2310       raw_geno_cts = ctx->founder_raw_geno_cts;
2311       x_male_geno_cts = ctx->founder_x_male_geno_cts;
2312       x_nosex_geno_cts = ctx->founder_x_nosex_geno_cts;
2313       imp_r2_vals = nullptr;
2314     }
2315   } while (!THREAD_BLOCK_FINISH(arg));
2316   THREAD_RETURN;
2317 }
2318 
2319 PglErr LoadAlleleAndGenoCounts(const uintptr_t* sample_include, const uintptr_t* founder_info, const uintptr_t* sex_nm, const uintptr_t* sex_male, const uintptr_t* variant_include, const ChrInfo* cip, const uintptr_t* allele_idx_offsets, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t founder_ct, uint32_t male_ct, uint32_t nosex_ct, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t first_hap_uidx, uint32_t is_minimac3_r2, uint32_t max_thread_ct, uintptr_t pgr_alloc_cacheline_ct, PgenFileInfo* pgfip, uintptr_t* allele_presents, uint64_t* allele_ddosages, uint64_t* founder_allele_ddosages, uint32_t* variant_missing_hc_cts, uint32_t* variant_missing_dosage_cts, uint32_t* variant_hethap_cts, STD_ARRAY_PTR_DECL(uint32_t, 3, raw_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, founder_raw_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, x_male_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_male_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, x_nosex_geno_cts), STD_ARRAY_PTR_DECL(uint32_t, 3, founder_x_nosex_geno_cts), double* imp_r2_vals) {
2320   unsigned char* bigstack_mark = g_bigstack_base;
2321   unsigned char* bigstack_end_mark = g_bigstack_end;
2322   PglErr reterr = kPglRetSuccess;
2323   ThreadGroup tg;
2324   PreinitThreads(&tg);
2325   LoadAlleleAndGenoCountsCtx ctx;
2326   {
2327     if (!variant_ct) {
2328       goto LoadAlleleAndGenoCounts_ret_1;
2329     }
2330 
2331     // four cases:
2332     // 1. allele_ddosages, raw_geno_cts, and/or variant_missing_{hc,dosage}_cts
2333     //    required, and that's it
2334     // 2. founder_allele_ddosages and/or founder_raw_geno_cts required, and
2335     //    that's it
2336     // 3. both required, and founder_ct != sample_ct.
2337     // 4. both required, and founder_ct == sample_ct.  caller is expected to
2338     //    make founder_allele_ddosages and allele_ddosages point to the same
2339     //    memory, ditto for founder_raw_geno_cts/raw_geno_cts.
2340     const uint32_t only_founder_cts_required = (!allele_presents) && (!allele_ddosages) && (!raw_geno_cts) && (!variant_missing_hc_cts) && (!variant_missing_dosage_cts);
2341     const uint32_t two_subsets_required = (founder_ct != sample_ct) && (!only_founder_cts_required) && (founder_allele_ddosages || founder_raw_geno_cts);
2342     ctx.cip = cip;
2343     ctx.sample_include = only_founder_cts_required? founder_info : sample_include;
2344     ctx.raw_sample_ct = raw_sample_ct;
2345     ctx.sample_ct = only_founder_cts_required? founder_ct : sample_ct;
2346     ctx.male_ct = male_ct;
2347     ctx.allele_ddosages = only_founder_cts_required? founder_allele_ddosages : allele_ddosages;
2348     ctx.raw_geno_cts = only_founder_cts_required? founder_raw_geno_cts : raw_geno_cts;
2349     ctx.x_male_geno_cts = only_founder_cts_required? founder_x_male_geno_cts : x_male_geno_cts;
2350     ctx.x_nosex_geno_cts = only_founder_cts_required? founder_x_nosex_geno_cts : x_nosex_geno_cts;
2351     ctx.imp_r2_vals = imp_r2_vals;
2352     const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
2353     const uint32_t raw_sample_ctv = BitCtToVecCt(raw_sample_ct);
2354     if (unlikely(
2355             bigstack_alloc_w(raw_sample_ctv * kWordsPerVec, &ctx.sample_include_interleaved_vec) ||
2356             bigstack_alloc_u32(raw_sample_ctl, &ctx.sample_include_cumulative_popcounts) ||
2357             bigstack_alloc_w(raw_sample_ctv * kWordsPerVec, &ctx.sex_male_interleaved_vec) ||
2358             bigstack_alloc_u32(raw_sample_ctl, &ctx.sex_male_cumulative_popcounts))) {
2359       goto LoadAlleleAndGenoCounts_ret_NOMEM;
2360     }
2361     FillInterleavedMaskVec(ctx.sample_include, raw_sample_ctv, ctx.sample_include_interleaved_vec);
2362     FillCumulativePopcounts(ctx.sample_include, raw_sample_ctl, ctx.sample_include_cumulative_popcounts);
2363     if ((founder_ct == sample_ct) || (!only_founder_cts_required)) {
2364       ctx.sex_male = sex_male;
2365     } else {
2366       // no nonfounder counts required
2367       uintptr_t* new_sex_male;
2368       if (unlikely(bigstack_alloc_w(raw_sample_ctl, &new_sex_male))) {
2369         goto LoadAlleleAndGenoCounts_ret_NOMEM;
2370       }
2371       BitvecAndCopy(sex_male, founder_info, raw_sample_ctl, new_sex_male);
2372       ZeroTrailingWords(raw_sample_ctl, new_sex_male);
2373       ctx.sex_male = new_sex_male;
2374     }
2375     FillInterleavedMaskVec(ctx.sex_male, raw_sample_ctv, ctx.sex_male_interleaved_vec);
2376     FillCumulativePopcounts(ctx.sex_male, raw_sample_ctl, ctx.sex_male_cumulative_popcounts);
2377     if (!(x_nosex_geno_cts || founder_x_nosex_geno_cts)) {
2378       nosex_ct = 0;
2379     }
2380     ctx.nosex_ct = nosex_ct;
2381     ctx.nosex_interleaved_vec = nullptr;
2382     uintptr_t* nosex_buf = nullptr;
2383     if (nosex_ct) {
2384       if (unlikely(
2385               bigstack_end_alloc_w(raw_sample_ctl, &nosex_buf) ||
2386               bigstack_alloc_w(raw_sample_ctv * kWordsPerVec, &ctx.nosex_interleaved_vec))) {
2387         goto LoadAlleleAndGenoCounts_ret_NOMEM;
2388       }
2389       BitvecInvmaskCopy(ctx.sample_include, sex_nm, raw_sample_ctl, nosex_buf);
2390       ZeroTrailingWords(raw_sample_ctl, nosex_buf);
2391       FillInterleavedMaskVec(nosex_buf, raw_sample_ctv, ctx.nosex_interleaved_vec);
2392     }
2393 
2394     ctx.variant_missing_hc_cts = variant_missing_hc_cts;
2395     ctx.variant_missing_dosage_cts = variant_missing_dosage_cts;
2396     ctx.variant_hethap_cts = variant_hethap_cts;
2397     ctx.first_hap_uidx = first_hap_uidx;
2398     ctx.is_minimac3_r2 = is_minimac3_r2;
2399 
2400     ctx.founder_info = nullptr;
2401     ctx.founder_info_interleaved_vec = nullptr;
2402     ctx.founder_info_cumulative_popcounts = nullptr;
2403     ctx.founder_male = nullptr;
2404     ctx.founder_male_interleaved_vec = nullptr;
2405     ctx.founder_male_cumulative_popcounts = nullptr;
2406     ctx.founder_nosex_interleaved_vec = nullptr;
2407     ctx.founder_ct = 0;
2408     ctx.founder_male_ct = 0;
2409     ctx.founder_nosex_ct = 0;
2410     ctx.founder_allele_ddosages = nullptr;
2411     ctx.founder_raw_geno_cts = nullptr;
2412     ctx.founder_x_male_geno_cts = nullptr;
2413     ctx.founder_x_nosex_geno_cts = nullptr;
2414     if (two_subsets_required) {
2415       if (founder_ct) {
2416         ctx.founder_info = founder_info;
2417         if (unlikely(
2418                 bigstack_alloc_w(raw_sample_ctv * kWordsPerVec, &ctx.founder_info_interleaved_vec) ||
2419                 bigstack_alloc_u32(raw_sample_ctl, &ctx.founder_info_cumulative_popcounts) ||
2420                 bigstack_alloc_w(raw_sample_ctl, &ctx.founder_male) ||
2421                 bigstack_alloc_w(raw_sample_ctv * kWordsPerVec, &ctx.founder_male_interleaved_vec) ||
2422                 bigstack_alloc_u32(raw_sample_ctl, &ctx.founder_male_cumulative_popcounts))) {
2423           goto LoadAlleleAndGenoCounts_ret_NOMEM;
2424         }
2425         FillInterleavedMaskVec(founder_info, raw_sample_ctv, ctx.founder_info_interleaved_vec);
2426         FillCumulativePopcounts(founder_info, raw_sample_ctl, ctx.founder_info_cumulative_popcounts);
2427         BitvecAndCopy(sex_male, founder_info, raw_sample_ctl, ctx.founder_male);
2428         ZeroTrailingWords(raw_sample_ctl, ctx.founder_male);
2429         FillInterleavedMaskVec(ctx.founder_male, raw_sample_ctv, ctx.founder_male_interleaved_vec);
2430         FillCumulativePopcounts(ctx.founder_male, raw_sample_ctl, ctx.founder_male_cumulative_popcounts);
2431         ctx.founder_ct = founder_ct;
2432         ctx.founder_male_ct = ctx.founder_male_cumulative_popcounts[raw_sample_ctl - 1] + PopcountWord(ctx.founder_male[raw_sample_ctl - 1]);
2433         ctx.founder_allele_ddosages = founder_allele_ddosages;
2434         ctx.founder_raw_geno_cts = founder_raw_geno_cts;
2435         ctx.founder_x_male_geno_cts = founder_x_male_geno_cts;
2436         if (nosex_ct) {
2437           // caller currently responsible for ensuring that when
2438           // founder_nosex_ct is zero, founder_x_nosex_geno_cts ==
2439           // nullptr
2440           if (unlikely(bigstack_alloc_w(raw_sample_ctv * kWordsPerVec, &ctx.founder_nosex_interleaved_vec))) {
2441             goto LoadAlleleAndGenoCounts_ret_NOMEM;
2442           }
2443           BitvecAnd(founder_info, raw_sample_ctl, nosex_buf);
2444           ctx.founder_nosex_ct = PopcountWords(nosex_buf, raw_sample_ctl);
2445           assert(ctx.founder_nosex_ct);
2446           ZeroTrailingWords(raw_sample_ctl, nosex_buf);
2447           FillInterleavedMaskVec(nosex_buf, raw_sample_ctv, ctx.founder_nosex_interleaved_vec);
2448           ctx.founder_x_nosex_geno_cts = founder_x_nosex_geno_cts;
2449         }
2450       } else {
2451         if (founder_allele_ddosages) {
2452           ZeroU64Arr(allele_idx_offsets? allele_idx_offsets[raw_variant_ct] : (2 * raw_variant_ct), founder_allele_ddosages);
2453         }
2454         if (founder_raw_geno_cts) {
2455           memset(founder_raw_geno_cts, 0, raw_variant_ct * (3 * sizeof(int32_t)));
2456         }
2457       }
2458     } else if (founder_ct == sample_ct) {
2459       // bugfix: some founder and some nonfounder counts required
2460       if ((!ctx.allele_ddosages) && founder_allele_ddosages) {
2461         ctx.allele_ddosages = founder_allele_ddosages;
2462       }
2463       if ((!ctx.raw_geno_cts) && founder_raw_geno_cts) {
2464         ctx.raw_geno_cts = founder_raw_geno_cts;
2465       }
2466       if ((!ctx.x_male_geno_cts) && founder_x_male_geno_cts) {
2467         ctx.x_male_geno_cts = founder_x_male_geno_cts;
2468       }
2469       if ((!ctx.x_nosex_geno_cts) && founder_x_nosex_geno_cts) {
2470         ctx.x_nosex_geno_cts = founder_x_nosex_geno_cts;
2471       }
2472     } else if (only_founder_cts_required) {
2473       ctx.male_ct = ctx.sex_male_cumulative_popcounts[raw_sample_ctl - 1] + PopcountWord(ctx.sex_male[raw_sample_ctl - 1]);
2474       if (nosex_ct) {
2475         ctx.nosex_ct = PopcountWords(nosex_buf, raw_sample_ctl);
2476       }
2477     }
2478     const uintptr_t raw_allele_ct = allele_idx_offsets? allele_idx_offsets[raw_variant_ct] : (2 * raw_variant_ct);
2479     if (!ctx.sample_ct) {
2480       if (allele_presents) {
2481         ZeroWArr(BitCtToWordCt(raw_allele_ct), allele_presents);
2482       }
2483       if (ctx.allele_ddosages) {
2484         ZeroU64Arr(raw_allele_ct, ctx.allele_ddosages);
2485       }
2486       if (ctx.raw_geno_cts) {
2487         memset(ctx.raw_geno_cts, 0, raw_variant_ct * (3 * sizeof(int32_t)));
2488       }
2489       // early exit
2490       goto LoadAlleleAndGenoCounts_ret_1;
2491     }
2492     BigstackEndReset(bigstack_end_mark);  // free nosex_buf
2493     if (allele_presents) {
2494       const uintptr_t raw_allele_ct_a64 = RoundUpPow2(raw_allele_ct, kCacheline);
2495       if (unlikely(bigstack_left() < raw_allele_ct_a64)) {
2496         goto LoadAlleleAndGenoCounts_ret_NOMEM;
2497       }
2498       // fill byte-array instead of bitarray so multithreading works
2499       ctx.allele_presents_bytearr = S_CAST(unsigned char*, bigstack_alloc_raw(raw_allele_ct_a64));
2500       memset(ctx.allele_presents_bytearr, 0, raw_allele_ct_a64);
2501     } else {
2502       ctx.allele_presents_bytearr = nullptr;
2503     }
2504 
2505     uint32_t unused_chr_code;
2506     uint32_t unused_chr_code2;
2507     uint32_t xy_complications_present = ((allele_presents || allele_ddosages || founder_allele_ddosages || variant_missing_dosage_cts) && XymtExists(cip, kChrOffsetX, &unused_chr_code)) || (allele_presents && (sample_ct != male_ct) && XymtExists(cip, kChrOffsetY, &unused_chr_code2));
2508     const uint32_t xy_dosages_needed = (pgfip->gflags & kfPgenGlobalDosagePresent) && xy_complications_present;
2509 
2510     // todo: check when this saturates
2511     uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
2512     const uint32_t max_allele_ct = pgfip->max_allele_ct;
2513     uint32_t mhc_needed = 0;
2514     ctx.thread_read_mhc = nullptr;
2515     if ((max_allele_ct > 2) && (variant_hethap_cts || allele_presents || allele_ddosages || founder_allele_ddosages || imp_r2_vals)) {
2516       if (unlikely(
2517               bigstack_alloc_u64p(calc_thread_ct, &ctx.all_dosages))) {
2518         goto LoadAlleleAndGenoCounts_ret_NOMEM;
2519       }
2520       mhc_needed = (xy_complications_present || ((variant_hethap_cts || imp_r2_vals) && XymtExists(cip, kChrOffsetX, &unused_chr_code)));
2521       for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
2522         // double allocation size, to leave room for chrY ssqs
2523         if (unlikely(
2524                 bigstack_alloc_u64(max_allele_ct * 2, &(ctx.all_dosages[tidx])))) {
2525           goto LoadAlleleAndGenoCounts_ret_NOMEM;
2526         }
2527       }
2528     } else {
2529       ctx.all_dosages = nullptr;
2530     }
2531     STD_ARRAY_DECL(unsigned char*, 2, main_loadbufs);
2532     // defensive
2533     ctx.dosage_presents = nullptr;
2534     ctx.dosage_mains = nullptr;
2535     uint32_t read_block_size;
2536     // todo: check if raw_sample_ct should be replaced with sample_ct here
2537     if (unlikely(PgenMtLoadInit(variant_include, raw_sample_ct, variant_ct, bigstack_left(), pgr_alloc_cacheline_ct, 0, 0, 0, pgfip, &calc_thread_ct, &ctx.genovecs, mhc_needed? (&ctx.thread_read_mhc) : nullptr, nullptr, nullptr, xy_dosages_needed? (&ctx.dosage_presents) : nullptr, xy_dosages_needed? (&ctx.dosage_mains) : nullptr, nullptr, nullptr, &read_block_size, nullptr, main_loadbufs, &ctx.pgr_ptrs, &ctx.read_variant_uidx_starts))) {
2538       goto LoadAlleleAndGenoCounts_ret_NOMEM;
2539     }
2540     if (unlikely(SetThreadCt(calc_thread_ct, &tg))) {
2541       goto LoadAlleleAndGenoCounts_ret_NOMEM;
2542     }
2543     ctx.variant_include = variant_include;
2544     ctx.allele_idx_offsets = allele_idx_offsets;
2545     ctx.reterr = kPglRetSuccess;
2546     SetThreadFuncAndData(LoadAlleleAndGenoCountsThread, &ctx, &tg);
2547 
2548     logputs("Calculating allele frequencies... ");
2549     fputs("0%", stdout);
2550     fflush(stdout);
2551     uint32_t pct = 0;
2552 
2553     uint32_t parity = 0;
2554     uint32_t read_block_idx = 0;
2555     uint32_t next_print_variant_idx = variant_ct / 100;
2556     for (uint32_t variant_idx = 0; ; ) {
2557       const uint32_t cur_block_size = MultireadNonempty(variant_include, &tg, raw_variant_ct, read_block_size, pgfip, &read_block_idx, &reterr);
2558       if (unlikely(reterr)) {
2559         goto LoadAlleleAndGenoCounts_ret_PGR_FAIL;
2560       }
2561       if (variant_idx) {
2562         JoinThreads(&tg);
2563         reterr = ctx.reterr;
2564         if (unlikely(reterr)) {
2565           goto LoadAlleleAndGenoCounts_ret_PGR_FAIL;
2566         }
2567       }
2568       if (!IsLastBlock(&tg)) {
2569         ctx.cur_block_size = cur_block_size;
2570         ComputeUidxStartPartition(variant_include, cur_block_size, calc_thread_ct, read_block_idx * read_block_size, ctx.read_variant_uidx_starts);
2571         PgrCopyBaseAndOffset(pgfip, calc_thread_ct, ctx.pgr_ptrs);
2572         if (variant_idx + cur_block_size == variant_ct) {
2573           DeclareLastThreadBlock(&tg);
2574         }
2575         if (unlikely(SpawnThreads(&tg))) {
2576           goto LoadAlleleAndGenoCounts_ret_THREAD_CREATE_FAIL;
2577         }
2578       }
2579 
2580       parity = 1 - parity;
2581       if (variant_idx == variant_ct) {
2582         break;
2583       }
2584       if (variant_idx >= next_print_variant_idx) {
2585         if (pct > 10) {
2586           putc_unlocked('\b', stdout);
2587         }
2588         pct = (variant_idx * 100LLU) / variant_ct;
2589         printf("\b\b%u%%", pct++);
2590         fflush(stdout);
2591         next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
2592       }
2593 
2594       ++read_block_idx;
2595       variant_idx += cur_block_size;
2596       // crucially, this is independent of the PgenReader block_base
2597       // pointers
2598       pgfip->block_base = main_loadbufs[parity];
2599     }
2600     if (allele_presents) {
2601       const uintptr_t raw_allele_ctl = BitCtToWordCt(raw_allele_ct);
2602       allele_presents[raw_allele_ctl - 1] = 0;
2603 #ifdef __LP64__
2604       const uintptr_t vec_ct = DivUp(raw_allele_ct, kBytesPerVec);
2605       VecUc* bytearr_alias = R_CAST(VecUc*, ctx.allele_presents_bytearr);
2606       Vec8thUint* allele_presents_alias = R_CAST(Vec8thUint*, allele_presents);
2607       for (uintptr_t vec_idx = 0; vec_idx != vec_ct; ++vec_idx) {
2608         allele_presents_alias[vec_idx] = vecuc_movemask(bytearr_alias[vec_idx]);
2609       }
2610 #else
2611       const uintptr_t twovec_ct = DivUp(raw_allele_ct, 8);
2612       uintptr_t* bytearr_iter = R_CAST(uintptr_t*, ctx.allele_presents_bytearr);
2613       unsigned char* allele_presents_iter = R_CAST(unsigned char*, allele_presents);
2614       unsigned char* allele_presents_stop = &(allele_presents_iter[twovec_ct]);
2615       for (; allele_presents_iter != allele_presents_stop; ++allele_presents_iter) {
2616         // 31,23,15,7 -> 3,2,1,0: multiply by number with bits 0,7,14,21 set,
2617         // then right-shift
2618         uintptr_t cur_word = ((*bytearr_iter++) * 0x204081) >> 28;
2619         cur_word |= ((*bytearr_iter++) * 0x204081) >> 24;
2620         *allele_presents_iter = cur_word;
2621       }
2622 #endif
2623     }
2624     if (pct > 10) {
2625       putc_unlocked('\b', stdout);
2626     }
2627     fputs("\b\b", stdout);
2628     logputs("done.\n");
2629   }
2630   while (0) {
2631   LoadAlleleAndGenoCounts_ret_NOMEM:
2632     reterr = kPglRetNomem;
2633     break;
2634   LoadAlleleAndGenoCounts_ret_PGR_FAIL:
2635     PgenErrPrintN(reterr);
2636     break;
2637   LoadAlleleAndGenoCounts_ret_THREAD_CREATE_FAIL:
2638     reterr = kPglRetThreadCreateFail;
2639     break;
2640   }
2641  LoadAlleleAndGenoCounts_ret_1:
2642   CleanupThreads(&tg);
2643   BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
2644   pgfip->block_base = nullptr;
2645   return reterr;
2646 }
2647 
ApplyHardCallThresh(const uintptr_t * dosage_present,const Dosage * dosage_main,uint32_t dosage_ct,uint32_t hard_call_halfdist,uintptr_t * genovec)2648 void ApplyHardCallThresh(const uintptr_t* dosage_present, const Dosage* dosage_main, uint32_t dosage_ct, uint32_t hard_call_halfdist, uintptr_t* genovec) {
2649   uintptr_t sample_uidx_base = 0;
2650   uintptr_t cur_bits = dosage_present[0];
2651   for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
2652     const uintptr_t sample_uidx = BitIter1(dosage_present, &sample_uidx_base, &cur_bits);
2653     const uint32_t dosage_int = dosage_main[dosage_idx];
2654     const uint32_t halfdist = BiallelicDosageHalfdist(dosage_int);
2655     const uintptr_t widx = sample_uidx / kBitsPerWordD2;
2656     uintptr_t prev_geno_word = genovec[widx];
2657     const uint32_t shift = (sample_uidx % kBitsPerWordD2) * 2;
2658     uintptr_t new_geno;
2659     if (halfdist < hard_call_halfdist) {
2660       new_geno = 3;
2661     } else {
2662       new_geno = (dosage_int + kDosage4th) / kDosageMid;
2663     }
2664     const uintptr_t prev_geno = (prev_geno_word >> shift) & 3;
2665     const uintptr_t geno_xor = new_geno ^ prev_geno;
2666     if (geno_xor) {
2667       genovec[widx] = prev_geno_word ^ (geno_xor << shift);
2668     }
2669   }
2670 }
2671 
ApplyHardCallThreshPhased(const uintptr_t * dosage_present,const Dosage * dosage_main,uint32_t dosage_ct,uint32_t hard_call_halfdist,uintptr_t * genovec,uintptr_t * phasepresent,uintptr_t * phaseinfo,uintptr_t * dphase_present,SDosage * dphase_delta,SDosage * tmp_dphase_delta)2672 uint32_t ApplyHardCallThreshPhased(const uintptr_t* dosage_present, const Dosage* dosage_main, uint32_t dosage_ct, uint32_t hard_call_halfdist, uintptr_t* genovec, uintptr_t* phasepresent, uintptr_t* phaseinfo, uintptr_t* dphase_present, SDosage* dphase_delta, SDosage* tmp_dphase_delta) {
2673   // Generate new hphase values when we're converting a hardcall from
2674   // missing/hom to het, and abs(dphase_delta) > 0.5.  Erase explicit dphase in
2675   // that case if dphase_delta is maximal.
2676   //
2677   // Erase hphase value when we're converting a hardcall from het to
2678   // missing/hom.  If hardcall was previously phased and no explicit dphase
2679   // value existed, add it.
2680   //
2681   // Since both insertions and deletions are possible, we write the updated
2682   // dphase_delta to a buffer and copy it back, instead of editing in place.
2683   //
2684   // Returns final dphase_ct.
2685   //
2686   // Some extraneous phaseinfo bits may be set on return.
2687   const SDosage* dphase_read_iter = dphase_delta;
2688   SDosage* dphase_write_iter = tmp_dphase_delta;
2689   uintptr_t sample_uidx_base = 0;
2690   uintptr_t cur_bits = dosage_present[0];
2691   for (uint32_t dosage_idx = 0; dosage_idx != dosage_ct; ++dosage_idx) {
2692     const uintptr_t sample_uidx = BitIter1(dosage_present, &sample_uidx_base, &cur_bits);
2693     const uint32_t dosage_int = dosage_main[dosage_idx];
2694     const uint32_t halfdist = BiallelicDosageHalfdist(dosage_int);
2695     const uintptr_t widx = sample_uidx / kBitsPerWordD2;
2696     uintptr_t prev_geno_word = genovec[widx];
2697     const uint32_t shift = (sample_uidx % kBitsPerWordD2) * 2;
2698     uintptr_t new_geno;
2699     if (halfdist < hard_call_halfdist) {
2700       new_geno = 3;
2701     } else {
2702       new_geno = (dosage_int + kDosage4th) / kDosageMid;
2703     }
2704     const uintptr_t prev_geno = (prev_geno_word >> shift) & 3;
2705     const uintptr_t geno_xor = new_geno ^ prev_geno;
2706     const uint32_t cur_hphase_present = IsSet(phasepresent, sample_uidx);
2707     if (IsSet(dphase_present, sample_uidx)) {
2708       int32_t dphase_delta_val = *dphase_read_iter++;
2709       *dphase_write_iter++ = dphase_delta_val;
2710       if (geno_xor) {
2711         if (new_geno == 1) {
2712           const uint32_t neg_sign_bit = -(S_CAST(uint32_t, dphase_delta_val) >> 31);
2713           const uint32_t abs_dphase_delta_val = (S_CAST(uint32_t, dphase_delta_val) ^ neg_sign_bit) - neg_sign_bit;
2714           if (abs_dphase_delta_val > kDosage4th) {
2715             SetBit(sample_uidx, phasepresent);
2716             AssignBit(sample_uidx, neg_sign_bit + 1, phaseinfo);
2717             // is dphase_delta maximal?
2718             if ((abs_dphase_delta_val == dosage_int) || (abs_dphase_delta_val + dosage_int == kDosageMax)) {
2719               ClearBit(sample_uidx, dphase_present);
2720               --dphase_write_iter;
2721             }
2722           }
2723         } else {
2724           ClearBit(sample_uidx, phasepresent);
2725         }
2726         genovec[widx] = prev_geno_word ^ (geno_xor << shift);
2727       }
2728     } else {
2729       if (geno_xor) {
2730         if (cur_hphase_present) {
2731           assert(new_geno != 1);
2732           ClearBit(sample_uidx, phasepresent);
2733           SetBit(sample_uidx, dphase_present);
2734           int32_t new_dphase_delta_val = DosageHomdist(dosage_int);
2735           if (!IsSet(phaseinfo, sample_uidx)) {
2736             new_dphase_delta_val = -new_dphase_delta_val;
2737           }
2738           *dphase_write_iter++ = new_dphase_delta_val;
2739         }
2740         genovec[widx] = prev_geno_word ^ (geno_xor << shift);
2741       }
2742     }
2743   }
2744   const uint32_t dphase_ct = dphase_write_iter - tmp_dphase_delta;
2745   memcpy(dphase_delta, tmp_dphase_delta, dphase_ct * sizeof(Dosage));
2746   return dphase_ct;
2747 }
2748 
2749 uintptr_t InitWriteAlleleIdxOffsets(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uint32_t* new_variant_idx_to_old, uint32_t variant_ct, uintptr_t* new_allele_idx_offsets) {
2750   uintptr_t cur_offset = 0;
2751   if (allele_presents) {
2752     uint32_t ref_allele_idx = 0;
2753     uintptr_t variant_uidx_base = 0;
2754     uintptr_t cur_bits = 0;
2755     if (!new_variant_idx_to_old) {
2756       cur_bits = variant_include[0];
2757     }
2758     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
2759       uint32_t variant_uidx;
2760       if (new_variant_idx_to_old) {
2761         variant_uidx = new_variant_idx_to_old[variant_idx];
2762       } else {
2763         variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
2764       }
2765       new_allele_idx_offsets[variant_idx] = cur_offset;
2766       const uintptr_t old_offset_start = allele_idx_offsets[variant_uidx];
2767       const uintptr_t old_offset_end = allele_idx_offsets[variant_uidx + 1];
2768       uint32_t cur_allele_ct = old_offset_end - old_offset_start;
2769       if (cur_allele_ct > 2) {
2770         cur_allele_ct = PopcountBitRange(allele_presents, old_offset_start, old_offset_end);
2771         if (refalt1_select) {
2772           ref_allele_idx = refalt1_select[variant_uidx][0];
2773         }
2774         if (!IsSet(allele_presents, old_offset_start + ref_allele_idx)) {
2775           ++cur_allele_ct;
2776         }
2777         if (cur_allele_ct < 2) {
2778           cur_allele_ct = 2;
2779         }
2780       }
2781       cur_offset += cur_allele_ct;
2782     }
2783   } else if (!new_variant_idx_to_old) {
2784     uintptr_t variant_uidx_base = 0;
2785     uintptr_t cur_bits = variant_include[0];
2786     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
2787       const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
2788       new_allele_idx_offsets[variant_idx] = cur_offset;
2789       cur_offset += allele_idx_offsets[variant_uidx + 1] - allele_idx_offsets[variant_uidx];
2790     }
2791   } else {
2792     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
2793       const uint32_t variant_uidx = new_variant_idx_to_old[variant_idx];
2794       new_allele_idx_offsets[variant_idx] = cur_offset;
2795       cur_offset += allele_idx_offsets[variant_uidx + 1] - allele_idx_offsets[variant_uidx];
2796     }
2797   }
2798   return cur_offset;
2799 }
2800 
2801 // Join behavior:
2802 // - Require sorted input .pvar for now, though it won't be difficult to lift
2803 //   this restriction later.  Ok for input to contain multiallelic variants.
2804 // - Don't need to do anything different when chr:pos only appears once in a
2805 //   biallelic variant, or in a multiallelic variant in "+any" mode.  For
2806 //   multiallelic variants in +both/+snps mode, error out if the variant is
2807 //   mixed SNP/non-SNP.  In +both case, also error out if the variant is mixed
2808 //   symbolic/non-symbolic, or it's symbolic and does not satisfy the REF or
2809 //   INFO:END constraints.
2810 // - When multiple variants have the same chr:pos:
2811 //   - In +snps mode, also don't need to do anything different for non-SNPs.
2812 //   - Otherwise, create up to three linked lists of input variant records
2813 //     (need to distinguish SNPs from non-SNPs in +both mode, and within the
2814 //     non-SNP category, symbolic alleles are separate from non-symbolic).
2815 //     The variant with symbolic alleles has additional constraints: a warning
2816 //     is printed if INFO:END isn't defined, and an error occurs if either REF
2817 //     is multi-character, or there's an INFO:END mismatch.
2818 //   - Error out if REF alleles aren't all consistent, or any ALT allele is
2819 //     duplicated (note that --pmerge must support the latter).
2820 //   - For joined not-entirely-SNP non-symbolic variants, the final REF is the
2821 //     longest of the original REFs; ALT alleles have bases added to the end if
2822 //     necessary.  (Yes, this causes SNPs to stop being visible to a strlen ==
2823 //     1 check in +any mode, which is why + is interpreted as +both instead.)
2824 //   - Final ALT allele order is based on allele frequency (highest first),
2825 //     with ties broken by natural-sort.
2826 //   - ID: 1. If --set-all-var-ids specified, apply template.
2827 //         2. Otherwise, keep original variant ID if all sources identical and
2828 //            nonmissing.
2829 //         3. Otherwise, if vid-join specified, check if all original IDs are
2830 //            nonmissing and contain exactly one ';' per extra ALT allele.  If
2831 //            so, join on ';' in final ALT allele order.
2832 //         4. Otherwise, set to --set-missing-var-ids template or missing code.
2833 //   - QUAL is minimum of inputs ('.' treated as positive infinity).  FILTER is
2834 //     natural-sorted union of non-PASS values; if all inputs were '.'/PASS,
2835 //     output is PASS unless all inputs were missing.
2836 //   - INFO join is based on the Number field in the key's header line.  (If
2837 //     there's no header line corresponding to a key, we error out.)  For
2838 //     Number=A, we join in the obvious manner.  For Number=R (or Number=G in
2839 //     the haploid case), we replace the reference allele entry with . iff
2840 //     there's any string mismatch (e.g. '13' and '13.0' will be treated as
2841 //     unequal), and print a warning (with more than 3 warnings, later warnings
2842 //     are only written to log file).  For diploid Number=G, we do the same for
2843 //     the hom-ref entry.  For Number=0, we error out if the header line Type
2844 //     isn't Flag, and the final flag is set iff any of the original variants
2845 //     have the flag set.  For the other cases (Number=<fixed constant> or
2846 //     '.'), we replace the value with '.' if there's any string mismatch.
2847 //     The key won't appear at all iff it doesn't appear in any of the original
2848 //     variants (not even a '.').
2849 //   - We error out if the joined variant has total ALT dosage > ~2.02, or
2850 //     either side of the total ALT phased dosage > ~1.01.  (We scale the
2851 //     components down if there's a <1% overflow.)
2852 // - Missing ALTs... ugh.  Have to permit this in biallelic case, but don't
2853 //   allow it elsewhere.  This forces a SNP (if REF is single-char) and/or
2854 //   non-SNP (if REF is multichar) entry to be written when no corresponding
2855 //   regular ALT is present, genotype writing/merging will error out if a
2856 //   missing allele has any dosage, and QUAL/FILTER/INFO is merged as usual
2857 //   when there are multiple missing-ALT same-type variants at the same
2858 //   position.  However, when a regular ALT is present, the missing-ALT
2859 //   variants are completely ignored.
2860 
2861 // The simplest design involves precomputing the entire (new variant idx, new
2862 // allele idx) -> (old variant uidx, old allele idx) mapping here, and
2863 // referring to that in both the .pvar and .pgen writers.  Unfortunately, that
2864 // has a rather high memory requirement of 5 bytes per allele (assuming
2865 // sizeof(AlleleCode) == 1).  While that's smaller than the 8 bytes/allele we
2866 // pay for allele_storage[], and also practically always smaller than the
2867 // 21+[variant ID len] per variant we pay for variant_bps + variant_ids +
2868 // allele_idx_offsets, it's still worth some effort to avoid; in particular, we
2869 // want an 8 GiB workspace to be sufficient for most operations on the full
2870 // 1000 Genomes phase 3 variant set (~84.8 million), and that's barely true
2871 // right now, so a bit of additional complexity to avoid losing ~850 MB is
2872 // justified.
2873 //
2874 // Thus, we only save the number of alleles in each new variant here, and force
2875 // the which-allele-comes-from-where computation (as well as SNP vs. non-SNP
2876 // vs. symbolic handling) to be repeated in the .pvar and .pgen writers.  This
2877 // sucks, but being forced to perform an ordinary analysis on a remote machine
2878 // rather than locally sucks a bit more.
2879 //
2880 // Incidentally, another place to look, if it's important to further reduce
2881 // memory requirements, is internal representation of the FILTER field.  The
2882 // current design gains a bit of extra speed by simply storing the non-./PASS
2883 // strings without parsing them further; but we could put them into a temporary
2884 // storage location and then convert to bitarray + string table at the end of
2885 // LoadPvar().  (Note that we already use only bitarrays when all FILTER values
2886 // are ./PASS, though.)
2887 
// Per-variant classification returned by JoinCount().
ENUM_U31_DEF_START()
  // Unsupported allele combination: a symbolic ('<'-prefixed) ALT together
  // with a multi-character REF or with non-symbolic ALTs, or a missing ('.')
  // ALT in a variant with more than two alleles.
  kJoinVtypeError,
  // Single-character REF, and every ALT is single-character (this includes
  // the biallelic missing-ALT case).
  kJoinVtypeSnp,
  // Multi-character REF, or single-character REF where every ALT is
  // multi-character and non-symbolic.
  kJoinVtypeNonsnp,
  // Single-character REF with both single-character and multi-character ALTs.
  kJoinVtypeMixedSnpNonsnp,
  // Single-character REF, and every ALT is symbolic.
  kJoinVtypeSymbolic,
  // Never returned by JoinCount(); presumably an end-of-enum sentinel.
  kJoinVtypeEnd
ENUM_U31_DEF_END(JoinVtype);
2896 
// ALT-allele tallies, as filled in by JoinCount() for a single variant and
// consumed by PlanJoinFlushPos().
typedef struct JoinCountsStruct {
  // Number of single-character, non-missing, non-symbolic ALT alleles (only
  // counted when REF is single-character).
  uintptr_t snp_ct;
  // Number of multi-character non-symbolic ALT alleles; when REF is
  // multi-character, every ALT allele is counted here.
  uintptr_t nonsnp_ct;
  // Number of ALT alleles starting with '<'.
  uintptr_t symbolic_ct;
  // JoinCount() sets this to 1 for a biallelic variant with single-character
  // REF and missing ('.') ALT.
  uint32_t missalt_snp_ct;
  // JoinCount() sets this to 1 for a biallelic variant with multi-character
  // REF and missing ('.') ALT.
  uint32_t missalt_nonsnp_ct;
} JoinCounts;
2904 
JoinCount(const char * const * cur_alleles,uintptr_t allele_ct,JoinCounts * jcp)2905 JoinVtype JoinCount(const char* const* cur_alleles, uintptr_t allele_ct, JoinCounts* jcp) {
2906   jcp->snp_ct = 0;
2907   jcp->symbolic_ct = 0;
2908   jcp->missalt_snp_ct = 0;
2909   jcp->missalt_nonsnp_ct = 0;
2910   if (cur_alleles[0][1] == '\0') {
2911     jcp->nonsnp_ct = 0;
2912     for (uintptr_t allele_idx = 1; allele_idx != allele_ct; ++allele_idx) {
2913       const char* cur_allele = cur_alleles[allele_idx];
2914       if (cur_allele[0] == '<') {
2915         jcp->symbolic_ct += 1;
2916       } else if (cur_allele[1] == '\0') {
2917         if (cur_allele[0] == '.') {
2918           if (allele_ct == 2) {
2919             jcp->missalt_snp_ct = 1;
2920             return kJoinVtypeSnp;
2921           }
2922           return kJoinVtypeError;
2923         }
2924         jcp->snp_ct += 1;
2925       } else {
2926         jcp->nonsnp_ct += 1;
2927       }
2928     }
2929     if (jcp->symbolic_ct) {
2930       return (jcp->symbolic_ct == allele_ct - 1)? kJoinVtypeSymbolic : kJoinVtypeError;
2931     }
2932     if (jcp->nonsnp_ct) {
2933       return jcp->snp_ct? kJoinVtypeMixedSnpNonsnp : kJoinVtypeNonsnp;
2934     }
2935     return kJoinVtypeSnp;
2936   }
2937   for (uint32_t allele_idx = 1; allele_idx != allele_ct; ++allele_idx) {
2938     const char* cur_allele = cur_alleles[allele_idx];
2939     if (cur_allele[0] == '<') {
2940       return kJoinVtypeError;
2941     }
2942     if (memequal_k(cur_allele, ".", 2)) {
2943       if (allele_ct == 2) {
2944         jcp->nonsnp_ct = 0;
2945         jcp->missalt_nonsnp_ct = 1;
2946         return kJoinVtypeNonsnp;
2947       }
2948       return kJoinVtypeError;
2949     }
2950   }
2951   jcp->nonsnp_ct = allele_ct - 1;
2952   return kJoinVtypeNonsnp;
2953 }
2954 
// Registers one output variant with cur_alt_allele_ct ALT alleles.
//
// The variant is written with REF plus max(1, cur_alt_allele_ct) ALT slots.
// *cur_offsetp is advanced by that allele count, the new running total is
// appended at **write_allele_idx_offsets_iterp (and the iterator is advanced
// by one), and *max_write_allele_ctp is raised if this variant has more
// alleles than any seen before.
void PlanJoinOne(uint32_t cur_alt_allele_ct, uintptr_t** write_allele_idx_offsets_iterp, uintptr_t* cur_offsetp, uint32_t* max_write_allele_ctp) {
  // REF + at least one ALT slot, even when no regular ALT is present.
  uint32_t write_allele_ct = 2;
  if (cur_alt_allele_ct > 1) {
    write_allele_ct = cur_alt_allele_ct + 1;
  }
  if (*max_write_allele_ctp < write_allele_ct) {
    *max_write_allele_ctp = write_allele_ct;
  }
  const uintptr_t end_offset = (*cur_offsetp) + write_allele_ct;
  *cur_offsetp = end_offset;
  **write_allele_idx_offsets_iterp = end_offset;
  ++(*write_allele_idx_offsets_iterp);
}
2965 
PlanJoinFlushPos(const JoinCounts * jcp,MakePlink2Flags join_mode,uintptr_t ** write_allele_idx_offsets_iterp,uintptr_t * cur_offsetp,uint32_t * max_write_allele_ctp,uint32_t * max_missalt_ctp)2966 void PlanJoinFlushPos(const JoinCounts* jcp, MakePlink2Flags join_mode, uintptr_t** write_allele_idx_offsets_iterp, uintptr_t* cur_offsetp, uint32_t* max_write_allele_ctp, uint32_t* max_missalt_ctp) {
2967   if (join_mode == kfMakePlink2MJoinSnps) {
2968     if (!(jcp->snp_ct || jcp->missalt_snp_ct)) {
2969       // all non-SNPs at this position, which were already accounted for
2970       return;
2971     }
2972     PlanJoinOne(jcp->snp_ct, write_allele_idx_offsets_iterp, cur_offsetp, max_write_allele_ctp);
2973     if ((!jcp->snp_ct) && (jcp->missalt_snp_ct > (*max_missalt_ctp))) {
2974       *max_missalt_ctp = jcp->missalt_snp_ct;
2975     }
2976     return;
2977   }
2978   if (join_mode == kfMakePlink2MJoinBoth) {
2979     if (jcp->snp_ct || jcp->missalt_snp_ct) {
2980       PlanJoinOne(jcp->snp_ct, write_allele_idx_offsets_iterp, cur_offsetp, max_write_allele_ctp);
2981       if ((!jcp->snp_ct) && (jcp->missalt_snp_ct > (*max_missalt_ctp))) {
2982         *max_missalt_ctp = jcp->missalt_snp_ct;
2983       }
2984     }
2985     if (jcp->nonsnp_ct || jcp->missalt_nonsnp_ct) {
2986       PlanJoinOne(jcp->nonsnp_ct, write_allele_idx_offsets_iterp, cur_offsetp, max_write_allele_ctp);
2987       if ((!jcp->nonsnp_ct) && (jcp->missalt_nonsnp_ct > (*max_missalt_ctp))) {
2988         *max_missalt_ctp = jcp->missalt_nonsnp_ct;
2989       }
2990     }
2991   } else {
2992     if (jcp->snp_ct || jcp->nonsnp_ct || jcp->missalt_snp_ct || jcp->missalt_nonsnp_ct) {
2993       const uint32_t alt_allele_ct = jcp->snp_ct + jcp->nonsnp_ct;
2994       PlanJoinOne(alt_allele_ct, write_allele_idx_offsets_iterp, cur_offsetp, max_write_allele_ctp);
2995       const uint32_t missalt_ct = jcp->missalt_snp_ct + jcp->missalt_nonsnp_ct;
2996       if ((missalt_ct > (*max_missalt_ctp)) && (!alt_allele_ct)) {
2997         *max_missalt_ctp = missalt_ct;
2998       }
2999     }
3000   }
3001   if (jcp->symbolic_ct) {
3002     PlanJoinOne(jcp->symbolic_ct, write_allele_idx_offsets_iterp, cur_offsetp, max_write_allele_ctp);
3003   }
3004 }
3005 
3006 
3007 // *write_allele_idx_offsetsp is assumed to be initialized to nullptr.
3008 // *max_missalt_ctp is assumed to be initialized to 0.
PlanMultiallelicJoin(const uintptr_t * variant_include,const ChrInfo * cip,const uint32_t * variant_bps,const char * const * variant_ids,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,MakePlink2Flags flags,uint32_t * write_variant_ctp,const uintptr_t ** write_allele_idx_offsetsp,uint32_t * max_write_allele_ctp,uint32_t * max_missalt_ctp)3009 PglErr PlanMultiallelicJoin(const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, MakePlink2Flags flags, uint32_t* write_variant_ctp, const uintptr_t** write_allele_idx_offsetsp, uint32_t* max_write_allele_ctp, uint32_t* max_missalt_ctp) {
3010   uint32_t variant_uidx = 0;
3011   PglErr reterr = kPglRetSuccess;
3012   {
3013     const uint32_t variant_ct = *write_variant_ctp;
3014     uintptr_t* write_allele_idx_offsets = R_CAST(uintptr_t*, g_bigstack_base);
3015     uintptr_t* write_allele_idx_offsets_stop = R_CAST(uintptr_t*, RoundDownPow2(R_CAST(uintptr_t, g_bigstack_end), kCacheline));
3016     if (write_allele_idx_offsets == write_allele_idx_offsets_stop) {
3017       goto PlanMultiallelicJoin_ret_NOMEM;
3018     }
3019     write_allele_idx_offsets_stop = &(write_allele_idx_offsets_stop[-4]);
3020     const MakePlink2Flags join_mode = flags & kfMakePlink2MMask;
3021     uintptr_t* write_allele_idx_offsets_iter = write_allele_idx_offsets;
3022     *write_allele_idx_offsets_iter++ = 0;
3023     uintptr_t cur_offset = 0;
3024     uintptr_t variant_uidx_base = 0;
3025     uintptr_t cur_bits = variant_include[0];
3026     uint32_t chr_fo_idx = UINT32_MAX;
3027     uint32_t chr_end = 0;
3028     uint32_t prev_bp = 0;
3029     uint32_t allele_ct = 2;
3030     uint32_t max_write_allele_ct = 2;
3031     JoinCounts jc;
3032     // possible todo: track max_write_allele_ct for each subcase, instead of
3033     // having a single value
3034     jc.snp_ct = 0;
3035     jc.nonsnp_ct = 0;
3036     jc.symbolic_ct = 0;
3037     jc.missalt_snp_ct = 0;
3038     jc.missalt_nonsnp_ct = 0;
3039     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
3040       variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
3041       if (variant_uidx >= chr_end) {
3042         do {
3043           ++chr_fo_idx;
3044           chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
3045         } while (variant_uidx >= chr_end);
3046         prev_bp = UINT32_MAX;
3047       }
3048       uintptr_t allele_idx_offset_base = variant_uidx * 2;
3049       if (allele_idx_offsets) {
3050         allele_idx_offset_base = allele_idx_offsets[variant_uidx];
3051         allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
3052       }
3053       const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
3054       JoinCounts cur_jc;
3055       JoinVtype jvt = JoinCount(cur_alleles, allele_ct, &cur_jc);
3056       if (unlikely(jvt == kJoinVtypeError)) {
3057         goto PlanMultiallelicJoin_ret_MIXED_SYMBOLIC;
3058       }
3059       if (unlikely((join_mode != kfMakePlink2MJoinAny) && (jvt == kJoinVtypeMixedSnpNonsnp))) {
3060         logerrprintfww("Error: Variant '%s' is mixed SNP/non-SNP; multiallelics=+both and +snps don't permit this.\n", variant_ids[variant_uidx]);
3061         goto PlanMultiallelicJoin_ret_INCONSISTENT_INPUT;
3062       }
3063       const uint32_t cur_bp = variant_bps[variant_uidx];
3064       if (cur_bp != prev_bp) {
3065         PlanJoinFlushPos(&jc, join_mode, &write_allele_idx_offsets_iter, &cur_offset, &max_write_allele_ct, max_missalt_ctp);
3066         if (join_mode == kfMakePlink2MJoinSnps) {
3067           if (cur_jc.nonsnp_ct || cur_jc.symbolic_ct || cur_jc.missalt_nonsnp_ct) {
3068             // Flush non-SNP immediately.
3069             const uint32_t cur_write_allele_ct = 1 + cur_jc.nonsnp_ct + cur_jc.symbolic_ct;
3070             cur_offset += cur_write_allele_ct;
3071             if (cur_write_allele_ct > max_write_allele_ct) {
3072               max_write_allele_ct = cur_write_allele_ct;
3073             }
3074             *write_allele_idx_offsets_iter++ = cur_offset;
3075             // Also need to reinitialize.
3076             jc.snp_ct = 0;
3077             jc.missalt_snp_ct = 0;
3078           } else {
3079             jc.snp_ct = cur_jc.snp_ct;
3080             jc.missalt_snp_ct = cur_jc.missalt_snp_ct;
3081           }
3082         } else {
3083           jc = cur_jc;
3084         }
3085         prev_bp = cur_bp;
3086       } else if ((join_mode == kfMakePlink2MJoinSnps) && (cur_jc.nonsnp_ct || cur_jc.symbolic_ct)) {
3087         // Flush non-SNP immediately.
3088         const uint32_t cur_write_allele_ct = 1 + cur_jc.nonsnp_ct + cur_jc.symbolic_ct;
3089         cur_offset += cur_write_allele_ct;
3090         if (cur_write_allele_ct > max_write_allele_ct) {
3091           max_write_allele_ct = cur_write_allele_ct;
3092         }
3093         *write_allele_idx_offsets_iter++ = cur_offset;
3094       } else {
3095         jc.snp_ct += cur_jc.snp_ct;
3096         jc.nonsnp_ct += cur_jc.nonsnp_ct;
3097         jc.symbolic_ct += cur_jc.symbolic_ct;
3098         jc.missalt_snp_ct += cur_jc.missalt_snp_ct;
3099         jc.missalt_nonsnp_ct += cur_jc.missalt_nonsnp_ct;
3100         continue;
3101       }
3102       if (write_allele_idx_offsets_iter > write_allele_idx_offsets_stop) {
3103         goto PlanMultiallelicJoin_ret_NOMEM;
3104       }
3105     }
3106     // Flush last position.
3107     PlanJoinFlushPos(&jc, join_mode, &write_allele_idx_offsets_iter, &cur_offset, &max_write_allele_ct, max_missalt_ctp);
3108     if (max_write_allele_ct > kPglMaxAltAlleleCt + 1) {
3109       goto PlanMultiallelicJoin_ret_TOO_MANY_ALTS;
3110     }
3111     *write_variant_ctp = S_CAST(uintptr_t, write_allele_idx_offsets_iter - write_allele_idx_offsets) - 1;
3112     *max_write_allele_ctp = max_write_allele_ct;
3113     if (max_write_allele_ct > 2) {
3114       BigstackBaseSet(write_allele_idx_offsets_iter);
3115       *write_allele_idx_offsetsp = write_allele_idx_offsets;
3116     }
3117   }
3118   while (0) {
3119   PlanMultiallelicJoin_ret_NOMEM:
3120     reterr = kPglRetNomem;
3121     break;
3122   PlanMultiallelicJoin_ret_MIXED_SYMBOLIC:
3123     logerrprintfww("Error: Variant '%s' mixes symbolic and non-symbolic alleles in an unsupported manner.\n", variant_ids[variant_uidx]);
3124   PlanMultiallelicJoin_ret_INCONSISTENT_INPUT:
3125     reterr = kPglRetInconsistentInput;
3126     break;
3127   PlanMultiallelicJoin_ret_TOO_MANY_ALTS:
3128     logerrprintf("Error: Variant-join would create a variant with too many ALT alleles for this\nplink2 build.\n");
3129     reterr = kPglRetNotYetSupported;
3130     break;
3131   }
3132   return reterr;
3133 }
3134 
PlanMultiallelicSplit(const uintptr_t * variant_include,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,uint32_t max_allele_ct,MakePlink2Flags flags,uint32_t * write_variant_ctp,const uintptr_t ** write_allele_idx_offsetsp)3135 PglErr PlanMultiallelicSplit(const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, uint32_t max_allele_ct, MakePlink2Flags flags, uint32_t* write_variant_ctp, const uintptr_t** write_allele_idx_offsetsp) {
3136   uint32_t variant_uidx = 0;
3137   PglErr reterr = kPglRetSuccess;
3138   {
3139     const uint32_t variant_ct = *write_variant_ctp;
3140     const uint32_t only_split_snps = ((flags & kfMakePlink2MMask) == kfMakePlink2MSplitSnps);
3141     uintptr_t* write_allele_idx_offsets = nullptr;
3142     uintptr_t* write_allele_idx_offsets_stop = nullptr;
3143     uintptr_t* write_allele_idx_offsets_iter = nullptr;
3144     if (only_split_snps) {
3145       write_allele_idx_offsets = R_CAST(uintptr_t*, g_bigstack_base);
3146       write_allele_idx_offsets_stop = R_CAST(uintptr_t*, RoundDownPow2(R_CAST(uintptr_t, g_bigstack_end), kCacheline));
3147       if (S_CAST(uintptr_t, write_allele_idx_offsets_stop - write_allele_idx_offsets) <= max_allele_ct) {
3148         goto PlanMultiallelicSplit_ret_NOMEM;
3149       }
3150       write_allele_idx_offsets_stop -= max_allele_ct;
3151       write_allele_idx_offsets_iter = write_allele_idx_offsets;
3152       *write_allele_idx_offsets_iter++ = 0;
3153     }
3154     uintptr_t cur_offset = 0;
3155     uintptr_t write_variant_ct = 0;
3156     uintptr_t variant_uidx_base = 0;
3157     uintptr_t cur_bits = variant_include[0];
3158     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
3159       variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
3160       const uintptr_t allele_idx_offset_base = allele_idx_offsets[variant_uidx];
3161       const uint32_t allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
3162       if (allele_ct == 2) {
3163         if (only_split_snps) {
3164           cur_offset += 2;
3165           *write_allele_idx_offsets_iter++ = cur_offset;
3166         }
3167         ++write_variant_ct;
3168       } else {
3169         if (only_split_snps) {
3170           const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
3171           uint32_t do_split = 1;
3172           for (uint32_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
3173             if (cur_alleles[allele_idx][1] != '\0') {
3174               do_split = 0;
3175               break;
3176             }
3177           }
3178           if (do_split) {
3179             for (uint32_t allele_idx = 1; allele_idx != allele_ct; ++allele_idx) {
3180               cur_offset += 2;
3181               *write_allele_idx_offsets_iter++ = cur_offset;
3182             }
3183             write_variant_ct += allele_ct - 1;
3184           } else {
3185             cur_offset += allele_ct;
3186             *write_allele_idx_offsets_iter++ = cur_offset;
3187             ++write_variant_ct;
3188           }
3189           if (write_allele_idx_offsets_iter > write_allele_idx_offsets_stop) {
3190             goto PlanMultiallelicSplit_ret_NOMEM;
3191           }
3192         } else {
3193           write_variant_ct += allele_ct - 1;
3194         }
3195       }
3196     }
3197     if (write_variant_ct > 0x7ffffffd) {
3198       logerrputs("Error: " PROG_NAME_STR " does not support more than 2^31 - 3 variants.  We recommend using\nother software for very deep studies of small numbers of genomes.\n");
3199       goto PlanMultiallelicSplit_ret_INCONSISTENT_INPUT;
3200     }
3201     *write_variant_ctp = write_variant_ct;
3202     if (only_split_snps && (cur_offset != 2 * write_variant_ct)) {
3203       assert(cur_offset > 2 * write_variant_ct);
3204       BigstackBaseSet(write_allele_idx_offsets_iter);
3205       *write_allele_idx_offsetsp = write_allele_idx_offsets;
3206     }
3207   }
3208   while (0) {
3209   PlanMultiallelicSplit_ret_NOMEM:
3210     reterr = kPglRetNomem;
3211     break;
3212   PlanMultiallelicSplit_ret_INCONSISTENT_INPUT:
3213     reterr = kPglRetInconsistentInput;
3214     break;
3215   }
3216   return reterr;
3217 }
3218 
// Checks whether orig_variant_id can be cut into one ID per ALT allele.
// Returns 1 iff the ID consists of exactly (allele_ct - 1) nonempty tokens
// separated by single semicolons, i.e. it contains exactly (allele_ct - 2)
// semicolons, none of them leading, trailing, or adjacent to another.
// Returns 0 otherwise.
uint32_t VaridSplitOk(const char* orig_variant_id, uint32_t allele_ct) {
  const char* token_start = orig_variant_id;
  // Consume one "token;" prefix for every ALT allele except the last.
  uint32_t aidx = 2;
  while (aidx != allele_ct) {
    const char* semicolon = strchr(token_start, ';');
    if (!semicolon) {
      // Too few tokens.
      return 0;
    }
    if (semicolon == token_start) {
      // Empty token.
      return 0;
    }
    token_start = semicolon + 1;
    ++aidx;
  }
  // What remains must be a single nonempty token.
  if (*token_start == '\0') {
    return 0;
  }
  return strchr(token_start, ';') == nullptr;
}
3232 
3233 // Similar to WriteMapOrBim(), but there are enough small differences to
3234 // justify making this a separate function instead of clogging the original
3235 // with more conditionals.
WriteBimSplit(const char * outname,const uintptr_t * variant_include,const ChrInfo * cip,const uint32_t * variant_bps,const char * const * variant_ids,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const double * variant_cms,const char * varid_template_str,const char * missing_varid_match,uint32_t variant_ct,uint32_t max_allele_slen,uint32_t new_variant_id_max_allele_slen,uint32_t varid_split,uint32_t varid_dup,MiscFlags misc_flags,uint32_t output_zst,uint32_t thread_ct)3236 PglErr WriteBimSplit(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const double* variant_cms, const char* varid_template_str, const char* missing_varid_match, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t new_variant_id_max_allele_slen, uint32_t varid_split, uint32_t varid_dup, MiscFlags misc_flags, uint32_t output_zst, uint32_t thread_ct) {
3237   unsigned char* bigstack_mark = g_bigstack_base;
3238   char* cswritep = nullptr;
3239   CompressStreamState css;
3240   PglErr reterr = kPglRetSuccess;
3241   PreinitCstream(&css);
3242   {
3243     const uint32_t max_chr_blen = GetMaxChrSlen(cip) + 1;
3244     // includes trailing tab
3245     char* chr_buf;
3246     if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
3247       goto WriteBimSplit_ret_NOMEM;
3248     }
3249     const uint32_t new_variant_id_overflow_missing = (misc_flags / kfMiscNewVarIdOverflowMissing) & 1;
3250     const uint32_t varid_dup_nosplit = varid_dup && (!varid_split);
3251     VaridTemplate* varid_templatep = nullptr;
3252     uint32_t missing_varid_slen = 0;
3253     uint32_t missing_varid_match_blen = 0; // nonzero iff --set-missing-var-ids
3254     if (varid_template_str) {
3255       if (!missing_varid_match) {
3256         missing_varid_match = &(g_one_char_strs[92]); // '.'
3257       }
3258       missing_varid_slen = strlen(missing_varid_match);
3259       if (misc_flags & kfMiscSetMissingVarIds) {
3260         missing_varid_match_blen = missing_varid_slen + 1;
3261       }
3262       if (unlikely(BIGSTACK_ALLOC_X(VaridTemplate, 1, &varid_templatep))) {
3263         goto WriteBimSplit_ret_NOMEM;
3264       }
3265       const uint32_t overflow_substitute_blen = new_variant_id_overflow_missing? (missing_varid_slen + 1) : 0;
3266       VaridTemplateInit(varid_template_str, missing_varid_match, chr_buf, new_variant_id_max_allele_slen, overflow_substitute_blen, varid_templatep);
3267       if (varid_dup) {
3268         for (uint32_t uii = 0; uii != varid_templatep->insert_ct; ++uii) {
3269           const uint32_t insert_type = varid_templatep->insert_types[uii];
3270           if ((insert_type == 3) || ((insert_type == 2) && (varid_templatep->alleles_needed & 4))) {
3271             // Could define what takes precedence here, but simpler to prohibit
3272             // this combination.
3273             logerrputs("Error: 'vid-[split-]dup' cannot be used with a --set-all-var-ids or\n--set-missing-var-ids template string containing a non-REF allele.\n");
3274             goto WriteBimSplit_ret_INVALID_CMDLINE;
3275           }
3276         }
3277       }
3278     }
3279     const uintptr_t overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen;
3280     reterr = InitCstreamAlloc(outname, 0, output_zst, thread_ct, overflow_buf_size, &css, &cswritep);
3281     if (unlikely(reterr)) {
3282       goto WriteBimSplit_ret_1;
3283     }
3284 
3285     const VaridTemplate* cur_varid_templatep = nullptr;
3286     const char* varid_token_start = nullptr; // for vid-split
3287     uint32_t chr_fo_idx = UINT32_MAX;
3288     uint32_t chr_end = 0;
3289     uint32_t chr_buf_blen = 0;
3290     uintptr_t variant_uidx_base = 0;
3291     uintptr_t cur_bits = variant_include[0];
3292     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
3293       const uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
3294       if (variant_uidx >= chr_end) {
3295         do {
3296           ++chr_fo_idx;
3297           chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
3298         } while (variant_uidx >= chr_end);
3299         char* chr_name_end = chrtoa(cip, cip->chr_file_order[chr_fo_idx], chr_buf);
3300         *chr_name_end = '\t';
3301         const uint32_t chr_slen = chr_name_end - chr_buf;
3302         chr_buf_blen = 1 + chr_slen;
3303         if (varid_templatep) {
3304           const int32_t chr_slen_delta = chr_slen - varid_templatep->chr_slen;
3305           varid_templatep->chr_slen = chr_slen;
3306           varid_templatep->base_len += chr_slen_delta;
3307         }
3308       }
3309       const uintptr_t allele_idx_offset_base = allele_idx_offsets[variant_uidx];
3310       const uint32_t orig_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
3311       const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
3312       const char* orig_variant_id = variant_ids[variant_uidx];
3313       const char* ref_allele = cur_alleles[0];
3314       const uint32_t ref_allele_slen = strlen(ref_allele);
3315       uint32_t keep_orig_id = 1;
3316       if ((orig_allele_ct > 2) && (!varid_dup_nosplit)) {
3317         keep_orig_id = 0;
3318         if (varid_templatep && (!missing_varid_match_blen)) {
3319           cur_varid_templatep = varid_templatep;
3320         } else {
3321           cur_varid_templatep = nullptr;
3322           if (varid_split) {
3323             if (VaridSplitOk(orig_variant_id, orig_allele_ct)) {
3324               varid_token_start = orig_variant_id;
3325             } else if (varid_dup) {
3326               keep_orig_id = 1;
3327             } else {
3328               varid_token_start = nullptr;
3329             }
3330           }
3331           if ((!varid_token_start) && varid_templatep) {
3332             // --set-missing-var-ids usually applies here when it's specified;
3333             // the exceptions are when vid-split was also specified and the
3334             // split succeeded, or vid-split-dup was specified.
3335             // (In the latter case, this value is ignored anyway.)
3336             cur_varid_templatep = varid_templatep;
3337           }
3338         }
3339       }
3340       const uint32_t cur_bp = variant_bps[variant_uidx];
3341       // We already verified that no variants to be written have >2 alleles, so
3342       // we don't need to distinguish between '-' and '-snps'.
3343       for (uint32_t alt_allele_idx = 1; alt_allele_idx != orig_allele_ct; ++alt_allele_idx) {
3344         cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
3345         const char* cur_alt_allele = cur_alleles[alt_allele_idx];
3346         const uint32_t cur_alt_allele_slen = strlen(cur_alt_allele);
3347         if (keep_orig_id) {
3348           cswritep = strcpyax(cswritep, orig_variant_id, '\t');
3349         } else {
3350           if (cur_varid_templatep) {
3351             // Always true in --set-all-var-ids case.  True in
3352             // --set-missing-var-ids case when vid-split unspecified, or split
3353             // failed.
3354             cswritep = VaridTemplateWrite(cur_varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, cswritep);
3355             *cswritep++ = '\t';
3356           } else if (varid_token_start) {
3357             const char* varid_token_end = strchrnul(varid_token_start, ';');
3358             // If substring matches missing code and --set-missing-var-ids is
3359             // specified, we replace it.
3360             if (varid_templatep && (S_CAST(uintptr_t, varid_token_end - varid_token_start) == missing_varid_slen) && memequal(varid_token_start, missing_varid_match, missing_varid_slen)) {
3361               cswritep = VaridTemplateWrite(varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, cswritep);
3362             } else {
3363               cswritep = memcpya(cswritep, varid_token_start, varid_token_end - varid_token_start);
3364             }
3365             *cswritep++ = '\t';
3366             varid_token_start = &(varid_token_end[1]);
3367           } else {
3368             cswritep = memcpyax(cswritep, missing_varid_match, missing_varid_slen, '\t');
3369           }
3370         }
3371         if (!variant_cms) {
3372           *cswritep++ = '0';
3373         } else {
3374           cswritep = dtoa_g_p8(variant_cms[variant_uidx], cswritep);
3375         }
3376         *cswritep++ = '\t';
3377         cswritep = u32toa(cur_bp, cswritep);
3378         *cswritep++ = '\t';
3379         // note that VCF ref allele corresponds to A2, not A1
3380         cswritep = memcpya(cswritep, cur_alt_allele, cur_alt_allele_slen);
3381         *cswritep++ = '\t';
3382         cswritep = memcpya(cswritep, ref_allele, ref_allele_slen);
3383         AppendBinaryEoln(&cswritep);
3384         if (unlikely(Cswrite(&css, &cswritep))) {
3385           goto WriteBimSplit_ret_WRITE_FAIL;
3386         }
3387       }
3388     }
3389     if (unlikely(CswriteCloseNull(&css, cswritep))) {
3390       goto WriteBimSplit_ret_WRITE_FAIL;
3391     }
3392   }
3393   while (0) {
3394   WriteBimSplit_ret_NOMEM:
3395     reterr = kPglRetNomem;
3396     break;
3397   WriteBimSplit_ret_WRITE_FAIL:
3398     reterr = kPglRetWriteFail;
3399     break;
3400   WriteBimSplit_ret_INVALID_CMDLINE:
3401     reterr = kPglRetInvalidCmdline;
3402     break;
3403   }
3404  WriteBimSplit_ret_1:
3405   CswriteCloseCond(&css, cswritep);
3406   BigstackReset(bigstack_mark);
3407   return reterr;
3408 }
3409 
// We only need to distinguish between the following INFO-value-type cases:
// Number=0 (flag), Number=<positive integer>, Number=., Number=A, Number=R,
// and Number=G.  We use negative numbers to represent the last 4 cases in
// InfoVtype.
// Nonnegative InfoVtype.num values are the literal Number= count, so these
// sentinels must stay negative.
CONSTI32(kInfoVtypeUnknown, -1);  // Number=.
CONSTI32(kInfoVtypeA, -2);  // Number=A
CONSTI32(kInfoVtypeR, -3);  // Number=R
CONSTI32(kInfoVtypeG, -4);  // Number=G
3418 
3419 // Main fixed data structure when splitting/joining INFO is a hashmap of keys.
3420 // Behavior when splitting:
3421 // - Field order in the original variant is retained.
3422 // - Number >= 0 and Number=. don't require any special handling, just copy the
3423 //   entire key=value pair (or lone key, in the Flag case).
// - Number=A and Number=R require splitting the value on ',' and verifying the
//   comma count is correct, but are otherwise straightforward since alleles
//   can't be permuted.
3427 // - Number=G requires a bit more work but isn't fundamentally different from
3428 //   A/R.
3429 // When joining:
3430 // - Field order is determined by header line order.
3431 // - Number=. and Number>0 just require a buffer of size ~info_reload_slen, and
3432 //   a boolean indicating whether no mismatch has been found.
3433 // - Number=0 (Flag) requires a single boolean, we perform an or operation.
3434 // - Number=A/R/G are the messy ones: we need to have enough space for
3435 //   max_write_allele_ct (or that minus 1) comma-separated values in the =A and
3436 //   =R cases, and max_write_allele_ct * (max_write_allele_ct + 1) / 2 in the
3437 //   diploid =G case.
3438 //   Since we permit already-multiallelic variants to be part of a join, the =G
3439 //   case may require a lot of working memory to handle.  We reserve up to 1/16
3440 //   of remaining workspace memory for this when we cannot prove that we can
3441 //   get by with less.
3442 
// One INFO key from the header, together with its Number= classification.
// Variable-size: the key string is stored inline after the fixed fields.
typedef struct InfoVtypeStruct {
  NONCOPYABLE(InfoVtypeStruct);
  // Nonnegative: the literal Number= count (0 for flags).
  // Negative: one of the kInfoVtype* codes.
  int32_t num;
  // Null-terminated INFO key (flexible array member).
  char key[];
} InfoVtype;
3448 
3449 // info_keys[] entries point to the (variable-size) key[] member of InfoVtype
3450 // structs.  We use [const_]container_of(x)->num to look up the associated
3451 // Number= value.
ParseInfoHeader(const char * xheader,uintptr_t xheader_blen,const char * const ** info_keys_ptr,uint32_t * info_key_ctp,uint32_t ** info_keys_htablep,uint32_t * info_keys_htable_sizep)3452 PglErr ParseInfoHeader(const char* xheader, uintptr_t xheader_blen, const char* const** info_keys_ptr, uint32_t* info_key_ctp, uint32_t** info_keys_htablep, uint32_t* info_keys_htable_sizep) {
3453   unsigned char* bigstack_mark = g_bigstack_base;
3454   unsigned char* bigstack_end_mark = g_bigstack_end;
3455   PglErr reterr = kPglRetSuccess;
3456   {
3457     // Parsing loop is similar to that in ExportVcf().
3458     const char* xheader_iter = xheader;
3459     const char* xheader_end = &(xheader[xheader_blen]);
3460     const char* line_end = xheader;
3461     unsigned char* tmp_alloc_end = bigstack_end_mark;
3462     const char** info_keys = R_CAST(const char**, bigstack_mark);
3463     const char** info_keys_iter = info_keys;
3464     while (line_end != xheader_end) {
3465       xheader_iter = line_end;
3466       line_end = AdvPastDelim(xheader_iter, '\n');
3467       const uint32_t slen = line_end - xheader_iter;
3468       if ((slen <= 12) || (!StrStartsWithUnsafe(xheader_iter, "##INFO=<ID="))) {
3469         continue;
3470       }
3471       const char* key_start = &(xheader_iter[11]);
3472       const char* key_end = S_CAST(const char*, memchr(key_start, ',', slen - 12));
3473       if (unlikely((!key_end) || (!StrStartsWithUnsafe(key_end, ",Number=")))) {
3474         goto ParseInfoHeader_ret_MALFORMED_INFO_HEADER_LINE;
3475       }
3476       const uint32_t key_slen = key_end - key_start;
3477       if (key_slen > kMaxInfoKeySlen) {
3478         logerrputs("Error: " PROG_NAME_STR " does not support INFO keys longer than " MAX_INFO_KEY_SLEN_STR " characters.\n");
3479         // VCF spec doesn't specify a limit, so this isn't "malformed input".
3480         // We enforce a limit so we can safely print INFO keys in error
3481         // messages, etc.; it's trivial to increase the limit if it's ever
3482         // necessary.
3483         reterr = kPglRetNotYetSupported;
3484         goto ParseInfoHeader_ret_1;
3485       }
3486       const uintptr_t entry_byte_ct = RoundUpPow2(offsetof(InfoVtype, key) + 1 + key_slen, sizeof(intptr_t));
3487       if (S_CAST(uintptr_t, tmp_alloc_end - R_CAST(unsigned char*, info_keys_iter)) < entry_byte_ct + 8) {
3488         goto ParseInfoHeader_ret_NOMEM;
3489       }
3490       tmp_alloc_end -= entry_byte_ct;
3491       InfoVtype* new_entry = R_CAST(InfoVtype*, tmp_alloc_end);
3492       memcpyx(new_entry->key, key_start, key_slen, '\0');
3493       *info_keys_iter++ = new_entry->key;
3494 
3495       const char* num_iter = &(key_end[8]);
3496       const unsigned char first_num_char = num_iter[0];
3497       if (first_num_char < '1') {
3498         if (first_num_char == '0') {
3499           // don't see a reason to tolerate Number=01, etc.
3500           if (unlikely(!StrStartsWithUnsafe(num_iter, "0,Type=Flag,"))) {
3501             goto ParseInfoHeader_ret_MALFORMED_INFO_HEADER_LINE;
3502           }
3503           new_entry->num = 0;
3504         } else if (likely(first_num_char == '.')) {
3505           new_entry->num = kInfoVtypeUnknown;
3506         } else {
3507           goto ParseInfoHeader_ret_MALFORMED_INFO_HEADER_LINE;
3508         }
3509       } else if (first_num_char > '9') {
3510         if (first_num_char == 'A') {
3511           new_entry->num = kInfoVtypeA;
3512         } else if (first_num_char == 'R') {
3513           new_entry->num = kInfoVtypeR;
3514         } else if (likely(first_num_char == 'G')) {
3515           new_entry->num = kInfoVtypeG;
3516         } else {
3517           goto ParseInfoHeader_ret_MALFORMED_INFO_HEADER_LINE;
3518         }
3519       } else {
3520         uint32_t val;
3521         if (unlikely(ScanmovPosintCapped(UINT32_MAX, &num_iter, &val) || (num_iter[0] != ','))) {
3522           goto ParseInfoHeader_ret_MALFORMED_INFO_HEADER_LINE;
3523         }
3524         new_entry->num = val;
3525       }
3526     }
3527     const uintptr_t info_key_ct = info_keys_iter - info_keys;
3528 #ifdef __LP64__
3529     if (unlikely(info_key_ct > 0x7ffffffdU)) {
3530       logerrputs("Error: " PROG_NAME_STR " does not support more than 2^31 - 3 INFO keys.\n");
3531       reterr = kPglRetMalformedInput;
3532       goto ParseInfoHeader_ret_1;
3533     }
3534 #endif
3535     assert(info_key_ct);
3536     *info_key_ctp = info_key_ct;
3537     BigstackBaseSet(info_keys_iter);
3538     BigstackEndSet(tmp_alloc_end);
3539     bigstack_end_mark = g_bigstack_end;
3540     const uintptr_t info_key_ctl = BitCtToWordCt(info_key_ct);
3541     uintptr_t* dummy_include;
3542     if (unlikely(
3543             (g_bigstack_base > g_bigstack_end) ||
3544             bigstack_end_alloc_w(info_key_ctl, &dummy_include))) {
3545       goto ParseInfoHeader_ret_NOMEM;
3546     }
3547     SetAllBits(info_key_ct, dummy_include);
3548     reterr = AllocAndPopulateIdHtableMt(dummy_include, info_keys, info_key_ct, bigstack_left() / 32, 1, info_keys_htablep, nullptr, info_keys_htable_sizep, nullptr);
3549     if (unlikely(reterr)) {
3550       goto ParseInfoHeader_ret_1;
3551     }
3552     *info_keys_ptr = info_keys;
3553     bigstack_mark = g_bigstack_base;
3554   }
3555   while (0) {
3556   ParseInfoHeader_ret_NOMEM:
3557     reterr = kPglRetNomem;
3558     break;
3559   ParseInfoHeader_ret_MALFORMED_INFO_HEADER_LINE:
3560     logputs("\n");
3561     logerrputs("Error: Malformed or unrecognized INFO header line.\n");
3562     reterr = kPglRetMalformedInput;
3563     break;
3564   }
3565  ParseInfoHeader_ret_1:
3566   BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
3567   return reterr;
3568 }
3569 
WritePvarSplit(const char * outname,const uintptr_t * variant_include,const ChrInfo * cip,const uint32_t * variant_bps,const char * const * variant_ids,const uintptr_t * allele_idx_offsets,const char * const * allele_storage,const uintptr_t * qual_present,const float * quals,const uintptr_t * filter_present,const uintptr_t * filter_npass,const char * const * filter_storage,const uintptr_t * nonref_flags,const char * pvar_info_reload,const double * variant_cms,const char * varid_template_str,const char * missing_varid_match,const char * const * info_keys,const uint32_t * info_keys_htable,uint32_t raw_variant_ct,uint32_t variant_ct,uint32_t max_allele_slen,uint32_t new_variant_id_max_allele_slen,uintptr_t xheader_blen,InfoFlags info_flags,uint32_t nonref_flags_storage,uint32_t max_filter_slen,uint32_t info_reload_slen,UnsortedVar vpos_sortstatus,uint32_t info_key_ct,uint32_t info_keys_htable_size,MiscFlags misc_flags,MakePlink2Flags make_plink2_flags,PvarPsamFlags pvar_psam_flags,uint32_t thread_ct,char * xheader)3570 PglErr WritePvarSplit(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, const char* missing_varid_match, const char* const* info_keys, const uint32_t* info_keys_htable, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t new_variant_id_max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, uint32_t info_key_ct, uint32_t info_keys_htable_size, MiscFlags misc_flags, MakePlink2Flags 
make_plink2_flags, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct, char* xheader) {
3571   unsigned char* bigstack_mark = g_bigstack_base;
3572   char* cswritep = nullptr;
3573   PglErr reterr = kPglRetSuccess;
3574   CompressStreamState css;
3575   TextStream pvar_reload_txs;
3576   PreinitCstream(&css);
3577   PreinitTextStream(&pvar_reload_txs);
3578   {
3579     const uint32_t max_chr_blen = GetMaxChrSlen(cip) + 1;
3580     // includes trailing tab
3581     char* chr_buf;
3582 
3583     if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
3584       goto WritePvarSplit_ret_NOMEM;
3585     }
3586     const uint32_t new_variant_id_overflow_missing = (misc_flags / kfMiscNewVarIdOverflowMissing) & 1;
3587     const uint32_t varid_dup = (make_plink2_flags / kfMakePlink2VaridDup) & 1;
3588     VaridTemplate* varid_templatep = nullptr;
3589     if (!missing_varid_match) {
3590       missing_varid_match = &(g_one_char_strs[92]); // '.'
3591     }
3592     uint32_t missing_varid_slen = strlen(missing_varid_match);
3593     uint32_t missing_varid_match_blen = 0; // nonzero iff --set-missing-var-ids
3594     if (varid_template_str) {
3595       if (misc_flags & kfMiscSetMissingVarIds) {
3596         missing_varid_match_blen = missing_varid_slen + 1;
3597       }
3598       if (unlikely(BIGSTACK_ALLOC_X(VaridTemplate, 1, &varid_templatep))) {
3599         goto WritePvarSplit_ret_NOMEM;
3600       }
3601       const uint32_t overflow_substitute_blen = new_variant_id_overflow_missing? (missing_varid_slen + 1) : 0;
3602       VaridTemplateInit(varid_template_str, missing_varid_match, chr_buf, new_variant_id_max_allele_slen, overflow_substitute_blen, varid_templatep);
3603       if (varid_dup) {
3604         for (uint32_t uii = 0; uii != varid_templatep->insert_ct; ++uii) {
3605           const uint32_t insert_type = varid_templatep->insert_types[uii];
3606           if ((insert_type == 3) || ((insert_type == 2) && (varid_templatep->alleles_needed & 4))) {
3607             // Could define what takes precedence here, but simpler to prohibit
3608             // this combination.
3609             logerrputs("Error: 'vid-[split-]dup' cannot be used with a --set-all-var-ids or\n--set-missing-var-ids template string containing a non-REF allele.\n");
3610             goto WritePvarSplit_ret_INVALID_CMDLINE;
3611           }
3612         }
3613       }
3614     }
3615 
3616     uintptr_t overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen + max_filter_slen + info_reload_slen;
3617     if (overflow_buf_size < 2 * kCompressStreamBlock) {
3618       overflow_buf_size = 2 * kCompressStreamBlock;
3619     }
3620     const uint32_t output_zst = (pvar_psam_flags / kfPvarZs) & 1;
3621     reterr = InitCstreamAlloc(outname, 0, output_zst, thread_ct, overflow_buf_size, &css, &cswritep);
3622     if (unlikely(reterr)) {
3623       goto WritePvarSplit_ret_1;
3624     }
3625 
3626     const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
3627     const uint32_t all_nonref = (nonref_flags_storage == 2);
3628     uint32_t write_info_pr = all_nonref;
3629     uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
3630     if (write_info && nonref_flags) {
3631       write_info_pr = !IntersectionIsEmpty(variant_include, nonref_flags, raw_variant_ctl);
3632     }
3633     write_info_pr = write_info_pr && write_info;
3634     if (unlikely(write_info_pr && (info_flags & kfInfoPrNonflagPresent))) {
3635       logputs("\n");
3636       logerrputs("Error: Conflicting INFO:PR definitions.  Either fix all REF alleles so that the\n'provisional reference' flag is no longer needed, or remove/rename the other\nuse of the INFO:PR key.\n");
3637       goto WritePvarSplit_ret_INCONSISTENT_INPUT;
3638     }
3639 
3640     char* pvar_info_line_iter = nullptr;
3641     uint32_t write_filter = 0;
3642     if (pvar_psam_flags & kfPvarColFilter) {
3643       write_filter = 1;
3644     } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
3645       write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
3646     }
3647     uint32_t info_col_idx = 0;  // could save this during first load instead
3648     const uint32_t info_pr_flag_present = (info_flags / kfInfoPrFlagPresent) & 1;
3649     if (pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) {
3650       reterr = PvarXheaderWrite(variant_include, cip, variant_bps, allele_idx_offsets, allele_storage, nullptr, xheader_blen, (pvar_psam_flags / kfPvarColVcfheader) & 1, write_filter, write_info, write_info_pr && (!info_pr_flag_present), max_allele_slen, vpos_sortstatus, xheader, &css, &cswritep);
3651       if (unlikely(reterr)) {
3652         goto WritePvarSplit_ret_1;
3653       }
3654     }
3655     // could also make this an array-of-structs
3656     uint32_t* info_key_order = nullptr;
3657     const char** info_starts = nullptr;
3658     const char** info_ends = nullptr;
3659     const char** info_curs = nullptr;
3660     uint32_t* info_ref_blens = nullptr;
3661     if (pvar_info_reload) {
3662       if (unlikely(
3663               bigstack_alloc_u32(info_key_ct, &info_key_order) ||
3664               bigstack_alloc_kcp(info_key_ct, &info_starts) ||
3665               bigstack_alloc_kcp(info_key_ct, &info_ends) ||
3666               bigstack_alloc_kcp(info_key_ct, &info_curs) ||
3667               bigstack_alloc_u32(info_key_ct, &info_ref_blens))) {
3668         goto WritePvarSplit_ret_NOMEM;
3669       }
3670       reterr = PvarInfoOpenAndReloadHeader(pvar_info_reload, 1 + (thread_ct > 1), &pvar_reload_txs, &pvar_info_line_iter, &info_col_idx);
3671       if (unlikely(reterr)) {
3672         goto WritePvarSplit_ret_TSTREAM_FAIL;
3673       }
3674     }
3675     if (cip->chrset_source) {
3676       AppendChrsetLine(cip, &cswritep);
3677     }
3678     cswritep = strcpya_k(cswritep, "#CHROM\tPOS\tID\tREF\tALT");
3679 
3680     uint32_t write_qual = 0;
3681     if (pvar_psam_flags & kfPvarColQual) {
3682       write_qual = 1;
3683     } else if ((pvar_psam_flags & kfPvarColMaybequal) && qual_present) {
3684       write_qual = !IntersectionIsEmpty(variant_include, qual_present, raw_variant_ctl);
3685     }
3686     if (write_qual) {
3687       cswritep = strcpya_k(cswritep, "\tQUAL");
3688     }
3689     if (write_filter) {
3690       cswritep = strcpya_k(cswritep, "\tFILTER");
3691     }
3692     if (write_info) {
3693       cswritep = strcpya_k(cswritep, "\tINFO");
3694     }
3695 
3696     uint32_t write_cm = 0;
3697     if (pvar_psam_flags & kfPvarColCm) {
3698       write_cm = 1;
3699     } else if ((pvar_psam_flags & kfPvarColMaybecm) && variant_cms) {
3700       if (raw_variant_ct == variant_ct) {
3701         // nonzero_cm_present check was performed
3702         write_cm = 1;
3703       } else {
3704         uintptr_t variant_uidx_base = 0;
3705         uintptr_t cur_bits = variant_include[0];
3706         for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
3707           const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
3708           if (variant_cms[variant_uidx] != 0.0) {
3709             write_cm = 1;
3710             break;
3711           }
3712         }
3713       }
3714     }
3715     if (write_cm) {
3716       cswritep = strcpya_k(cswritep, "\tCM");
3717     }
3718     AppendBinaryEoln(&cswritep);
3719 
3720     const VaridTemplate* cur_varid_templatep = nullptr;
3721     const char* varid_token_start = nullptr; // for vid-split
3722     const uint32_t varid_split = (make_plink2_flags / kfMakePlink2VaridSemicolon) & 1;
3723     const uint32_t varid_dup_nosplit = varid_dup && (!varid_split);
3724     const uint32_t split_just_snps = ((make_plink2_flags & (kfMakePlink2MSplitBase * 3)) == kfMakePlink2MSplitSnps);
3725     uint32_t trs_variant_uidx = 0;
3726     uintptr_t variant_uidx_base = 0;
3727     uintptr_t cur_bits = variant_include[0];
3728     uint32_t chr_fo_idx = UINT32_MAX;
3729     uint32_t chr_end = 0;
3730     uint32_t chr_buf_blen = 0;
3731     uint32_t orig_allele_ct = 2;
3732     uint32_t cur_info_key_ct = 0;
3733     uint32_t pct = 0;
3734     uint32_t next_print_variant_idx = variant_ct / 100;
3735     fputs("0%", stdout);
3736     fflush(stdout);
3737     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
3738       const uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
3739       if (variant_uidx >= chr_end) {
3740         do {
3741           ++chr_fo_idx;
3742           chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
3743         } while (variant_uidx >= chr_end);
3744         char* chr_name_end = chrtoa(cip, cip->chr_file_order[chr_fo_idx], chr_buf);
3745         *chr_name_end = '\t';
3746         const uint32_t chr_slen = chr_name_end - chr_buf;
3747         chr_buf_blen = 1 + chr_slen;
3748         if (varid_templatep) {
3749           const int32_t chr_slen_delta = chr_slen - varid_templatep->chr_slen;
3750           varid_templatep->chr_slen = chr_slen;
3751           varid_templatep->base_len += chr_slen_delta;
3752         }
3753       }
3754       uintptr_t allele_idx_offset_base;
3755       if (!allele_idx_offsets) {
3756         allele_idx_offset_base = variant_uidx * 2;
3757       } else {
3758         allele_idx_offset_base = allele_idx_offsets[variant_uidx];
3759         orig_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
3760       }
3761       const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
3762       const char* orig_variant_id = variant_ids[variant_uidx];
3763       const char* ref_allele = cur_alleles[0];
3764       const uint32_t ref_allele_slen = strlen(ref_allele);
3765       const uint32_t cur_bp = variant_bps[variant_uidx];
3766       uint32_t split_ct_p1 = orig_allele_ct;
3767       uint32_t keep_orig_id = 1;
3768       if (orig_allele_ct > 2) {
3769         if (!varid_dup_nosplit) {
3770           keep_orig_id = 0;
3771           if (varid_templatep && (!missing_varid_match_blen)) {
3772             cur_varid_templatep = varid_templatep;
3773           } else {
3774             cur_varid_templatep = nullptr;
3775             if (varid_split) {
3776               if (VaridSplitOk(orig_variant_id, orig_allele_ct)) {
3777                 varid_token_start = orig_variant_id;
3778               } else if (varid_dup) {
3779                 keep_orig_id = 1;
3780               } else {
3781                 varid_token_start = nullptr;
3782               }
3783             }
3784             if ((!varid_token_start) && varid_templatep) {
3785               // Note that --set-missing-var-ids almost always applies here
3786               // when it's specified; only exception is when vid-split was also
3787               // specified and the split succeeded.
3788               cur_varid_templatep = varid_templatep;
3789             }
3790           }
3791         }
3792         // Necessary to distinguish between '-' and '-snps' here.
3793         if (split_just_snps) {
3794           for (uint32_t uii = 0; uii != orig_allele_ct; ++uii) {
3795             if (cur_alleles[uii][1]) {
3796               split_ct_p1 = 2;
3797               break;
3798             }
3799           }
3800         }
3801         if ((split_ct_p1 != 2) && pvar_info_line_iter) {
3802           reterr = PvarInfoReload(info_col_idx, variant_uidx, &pvar_reload_txs, &pvar_info_line_iter, &trs_variant_uidx);
3803           if (unlikely(reterr)) {
3804             goto WritePvarSplit_ret_TSTREAM_FAIL;
3805           }
3806           char* info_subtoken_iter = pvar_info_line_iter;
3807           pvar_info_line_iter = CurTokenEnd(pvar_info_line_iter);
3808           cur_info_key_ct = 0;
3809           // special case: if entire info field is '.', treat as zero keys
3810           if ((info_subtoken_iter[0] != '.') || (pvar_info_line_iter != &(info_subtoken_iter[1]))) {
3811             while (1) {
3812               if (unlikely(cur_info_key_ct == info_key_ct)) {
3813                 snprintf(g_logbuf, kLogbufSize, "Error: Too many INFO keys for variant ID '%s'.\n", orig_variant_id);
3814                 goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
3815               }
3816               char* info_subtoken_end = AdvToDelimOrEnd(info_subtoken_iter, pvar_info_line_iter, ';');
3817               char* key_end = AdvToDelimOrEnd(info_subtoken_iter, info_subtoken_end, '=');
3818               const uint32_t key_slen = key_end - info_subtoken_iter;
3819               const uint32_t kidx = IdHtableFindNnt(info_subtoken_iter, info_keys, info_keys_htable, key_slen, info_keys_htable_size);
3820               if (unlikely(kidx == UINT32_MAX)) {
3821                 snprintf(g_logbuf, kLogbufSize, "Error: INFO key for variant ID '%s' missing from header.\n", orig_variant_id);
3822                 goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
3823               }
3824               info_key_order[cur_info_key_ct] = kidx;
3825               const int32_t knum = const_container_of(info_keys[kidx], InfoVtype, key)->num;
3826               if (key_end == info_subtoken_end) {
3827                 if (unlikely(knum)) {
3828                   snprintf(g_logbuf, kLogbufSize, "Error: INFO key '%s' for variant ID '%s' does not have an accompanying value.\n", info_keys[kidx], orig_variant_id);
3829                   goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
3830                 }
3831               } else {
3832                 if (unlikely(!knum)) {
3833                   snprintf(g_logbuf, kLogbufSize, "Error: INFO key '%s' for variant ID '%s' has an accompanying value, despite being of type Flag.\n", info_keys[kidx], orig_variant_id);
3834                   goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
3835                 }
3836                 info_subtoken_iter = &(key_end[1]);
3837 
3838                 // don't actually need this for Number=A case
3839                 info_starts[cur_info_key_ct] = info_subtoken_iter;
3840 
3841                 info_ends[cur_info_key_ct] = info_subtoken_end;
3842                 if (knum <= kInfoVtypeA) {
3843                   // (Don't need to do anything else for kInfoVtypeUnknown or
3844                   // positive; we unconditionally copy all the text in those
3845                   // cases.)
3846                   if (knum == kInfoVtypeA) {
3847                     info_curs[cur_info_key_ct] = info_subtoken_iter;
3848                   } else {
3849                     char* ref_value_end = S_CAST(char*, memchr(info_subtoken_iter, ',', info_subtoken_end - info_subtoken_iter));
3850                     if (unlikely(!ref_value_end)) {
3851                       snprintf(g_logbuf, kLogbufSize, "Error: Too few values for INFO key '%s', variant ID '%s'.\n", info_keys[kidx], orig_variant_id);
3852                       goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
3853                     }
3854                     ++ref_value_end;
3855                     info_ref_blens[cur_info_key_ct] = ref_value_end - info_subtoken_iter;
3856                     info_curs[cur_info_key_ct] = ref_value_end;
3857                   }
3858                 }
3859               }
3860               ++cur_info_key_ct;
3861               if (info_subtoken_end == pvar_info_line_iter) {
3862                 break;
3863               }
3864               info_subtoken_iter = &(info_subtoken_end[1]);
3865             }
3866           }
3867         }
3868       }
3869       for (uint32_t alt_allele_idx = 1; alt_allele_idx != split_ct_p1; ++alt_allele_idx) {
3870         cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
3871         cswritep = u32toa_x(cur_bp, '\t', cswritep);
3872         const char* cur_alt_allele = cur_alleles[alt_allele_idx];
3873         const uint32_t cur_alt_allele_slen = strlen(cur_alt_allele);
3874         if ((split_ct_p1 == 2) || keep_orig_id) {
3875           cswritep = strcpyax(cswritep, orig_variant_id, '\t');
3876           cswritep = memcpyax(cswritep, ref_allele, ref_allele_slen, '\t');
3877           cswritep = memcpya(cswritep, cur_alt_allele, cur_alt_allele_slen);
3878           if (unlikely(Cswrite(&css, &cswritep))) {
3879             goto WritePvarSplit_ret_WRITE_FAIL;
3880           }
3881           if ((orig_allele_ct > 2) && (split_ct_p1 == 2)) {
3882             // -snps non-split case
3883             for (uint32_t allele_idx = 2; allele_idx != orig_allele_ct; ++allele_idx) {
3884               *cswritep++ = ',';
3885               cswritep = strcpya(cswritep, cur_alleles[allele_idx]);
3886               if (unlikely(Cswrite(&css, &cswritep))) {
3887                 goto WritePvarSplit_ret_WRITE_FAIL;
3888               }
3889             }
3890           }
3891         } else {
3892           if (cur_varid_templatep) {
3893             // Always true in --set-all-var-ids case.  True in
3894             // --set-missing-var-ids case when vid-split unspecified, or split
3895             // failed.
3896             cswritep = VaridTemplateWrite(cur_varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, cswritep);
3897             *cswritep++ = '\t';
3898           } else if (varid_token_start) {
3899             const char* varid_token_end = strchrnul(varid_token_start, ';');
3900             // If substring matches missing code and --set-missing-var-ids is
3901             // specified, we replace it.
3902             if (varid_templatep && (S_CAST(uintptr_t, varid_token_end - varid_token_start) == missing_varid_slen) && memequal(varid_token_start, missing_varid_match, missing_varid_slen)) {
3903               cswritep = VaridTemplateWrite(varid_templatep, ref_allele, cur_alt_allele, cur_bp, ref_allele_slen, 0, cur_alt_allele_slen, cswritep);
3904             } else {
3905               cswritep = memcpya(cswritep, varid_token_start, varid_token_end - varid_token_start);
3906             }
3907             *cswritep++ = '\t';
3908             varid_token_start = &(varid_token_end[1]);
3909           } else {
3910             cswritep = memcpyax(cswritep, missing_varid_match, missing_varid_slen, '\t');
3911           }
3912           cswritep = memcpyax(cswritep, ref_allele, ref_allele_slen, '\t');
3913           cswritep = memcpya(cswritep, cur_alt_allele, cur_alt_allele_slen);
3914           if (unlikely(Cswrite(&css, &cswritep))) {
3915             goto WritePvarSplit_ret_WRITE_FAIL;
3916           }
3917         }
3918         if (write_qual) {
3919           *cswritep++ = '\t';
3920           if ((!qual_present) || (!IsSet(qual_present, variant_uidx))) {
3921             *cswritep++ = '.';
3922           } else {
3923             cswritep = ftoa_g(quals[variant_uidx], cswritep);
3924           }
3925         }
3926 
3927         if (write_filter) {
3928           *cswritep++ = '\t';
3929           if ((!filter_present) || (!IsSet(filter_present, variant_uidx))) {
3930             *cswritep++ = '.';
3931           } else if (!IsSet(filter_npass, variant_uidx)) {
3932             cswritep = strcpya_k(cswritep, "PASS");
3933           } else {
3934             cswritep = strcpya(cswritep, filter_storage[variant_uidx]);
3935           }
3936         }
3937 
3938         if (write_info) {
3939           *cswritep++ = '\t';
3940           const uint32_t is_pr = all_nonref || (nonref_flags && IsSet(nonref_flags, variant_uidx));
3941           if (pvar_info_line_iter) {
3942             if (split_ct_p1 == 2) {
3943               reterr = PvarInfoReloadAndWrite(info_pr_flag_present, info_col_idx, variant_uidx, is_pr, &pvar_reload_txs, &pvar_info_line_iter, &cswritep, &trs_variant_uidx);
3944               if (unlikely(reterr)) {
3945                 goto WritePvarSplit_ret_TSTREAM_FAIL;
3946               }
3947             } else {
3948               if (!cur_info_key_ct) {
3949                 *cswritep++ = '.';
3950               } else {
3951                 const uint32_t is_last_allele = (alt_allele_idx + 1 == split_ct_p1);
3952                 for (uint32_t kpos = 0; kpos != cur_info_key_ct; ++kpos) {
3953                   const uint32_t kidx = info_key_order[kpos];
3954                   const char* cur_key_str = info_keys[kidx];
3955                   cswritep = strcpya(cswritep, cur_key_str);
3956                   const int32_t knum = const_container_of(info_keys[kidx], InfoVtype, key)->num;
3957                   if (knum) {
3958                     *cswritep++ = '=';
3959                     const char* cur_info_start = info_starts[kpos];
3960                     const char* cur_info_end = info_ends[kpos];
3961                     if (knum >= kInfoVtypeUnknown) {
3962                       cswritep = memcpya(cswritep, cur_info_start, cur_info_end - cur_info_start);
3963                     } else {
3964                       if (knum != kInfoVtypeA) {
3965                         cswritep = memcpya(cswritep, cur_info_start, info_ref_blens[kpos]);
3966                       }
3967                       // okay, this needs a better name
3968                       const char* cur_info_cur = info_curs[kpos];
3969 
3970                       const char* subtoken_end = AdvToDelimOrEnd(cur_info_cur, cur_info_end, ',');
3971                       if (knum == kInfoVtypeG) {
3972                         if (unlikely(subtoken_end == cur_info_end)) {
3973                           snprintf(g_logbuf, kLogbufSize, "Error: Too few values for INFO key '%s', variant ID '%s'.\n", cur_key_str, orig_variant_id);
3974                           goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
3975                         }
3976                         cswritep = memcpya(cswritep, cur_info_cur, 1 + S_CAST(uintptr_t, subtoken_end - cur_info_cur));
3977                         cur_info_cur = subtoken_end;
3978                         const uint32_t skip_ct = alt_allele_idx - 1;
3979                         if (skip_ct) {
3980                           cur_info_cur = AdvToNthDelimChecked(&(cur_info_cur[1]), cur_info_end, skip_ct, ',');
3981                           if (unlikely(!subtoken_end)) {
3982                             snprintf(g_logbuf, kLogbufSize, "Error: Too few values for INFO key '%s', variant ID '%s'.\n", cur_key_str, orig_variant_id);
3983                             goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
3984                           }
3985                         }
3986                         ++cur_info_cur;
3987                         subtoken_end = AdvToDelimOrEnd(cur_info_cur, cur_info_end, ',');
3988                       }
3989                       if (unlikely((subtoken_end == cur_info_end) != is_last_allele)) {
3990                         snprintf(g_logbuf, kLogbufSize, "Error: Wrong number of values for INFO key '%s', variant ID '%s'.\n", cur_key_str, orig_variant_id);
3991                         goto WritePvarSplit_ret_MALFORMED_INPUT_WW;
3992                       }
3993                       cswritep = memcpya(cswritep, cur_info_cur, subtoken_end - cur_info_cur);
3994                       info_curs[kpos] = &(subtoken_end[1]);
3995                     }
3996                   }
3997                   *cswritep++ = ';';
3998                 }
3999                 --cswritep;
4000               }
4001             }
4002           } else {
4003             if (is_pr) {
4004               cswritep = strcpya_k(cswritep, "PR");
4005             } else {
4006               *cswritep++ = '.';
4007             }
4008           }
4009         }
4010 
4011         if (write_cm) {
4012           *cswritep++ = '\t';
4013           if (!variant_cms) {
4014             *cswritep++ = '0';
4015           } else {
4016             cswritep = dtoa_g_p8(variant_cms[variant_uidx], cswritep);
4017           }
4018         }
4019         AppendBinaryEoln(&cswritep);
4020       }
4021       if (variant_idx >= next_print_variant_idx) {
4022         if (pct > 10) {
4023           putc_unlocked('\b', stdout);
4024         }
4025         pct = (variant_idx * 100LLU) / variant_ct;
4026         printf("\b\b%u%%", pct++);
4027         fflush(stdout);
4028         next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
4029       }
4030     }
4031     if (unlikely(CswriteCloseNull(&css, cswritep))) {
4032       goto WritePvarSplit_ret_WRITE_FAIL;
4033     }
4034     if (pct > 10) {
4035       putc_unlocked('\b', stdout);
4036     }
4037     fputs("\b\b", stdout);
4038   }
4039   while (0) {
4040   WritePvarSplit_ret_NOMEM:
4041     reterr = kPglRetNomem;
4042     break;
4043   WritePvarSplit_ret_TSTREAM_FAIL:
4044     TextStreamErrPrint(pvar_info_reload, &pvar_reload_txs);
4045     break;
4046   WritePvarSplit_ret_WRITE_FAIL:
4047     reterr = kPglRetWriteFail;
4048     break;
4049   WritePvarSplit_ret_INVALID_CMDLINE:
4050     reterr = kPglRetInvalidCmdline;
4051     break;
4052   WritePvarSplit_ret_MALFORMED_INPUT_WW:
4053     logputs("\n");
4054     WordWrapB(0);
4055     logerrputsb();
4056     reterr = kPglRetMalformedInput;
4057     break;
4058   WritePvarSplit_ret_INCONSISTENT_INPUT:
4059     reterr = kPglRetInconsistentInput;
4060     break;
4061   }
4062  WritePvarSplit_ret_1:
4063   CswriteCloseCond(&css, cswritep);
4064   CleanupTextStream2(pvar_info_reload, &pvar_reload_txs, &reterr);
4065   BigstackReset(bigstack_mark);
4066   return reterr;
4067 }
4068 
4069 // Final filter_keys is natural-sorted.
4070 // Return values are allocated on bottom of bigstack.
4071 // Caller must initialize all return values to correspond to the null table.
MakeFilterHtable(const uintptr_t * variant_include,const uintptr_t * filter_npass,const char * const * filter_storage,uint32_t variant_ct,const char *** filter_keys_ptr,uint32_t ** filter_keys_htable_ptr,uint32_t * filter_key_ct_ptr,uint32_t * filter_keys_htable_size_ptr)4072 PglErr MakeFilterHtable(const uintptr_t* variant_include, const uintptr_t* filter_npass, const char* const* filter_storage, uint32_t variant_ct, const char*** filter_keys_ptr, uint32_t** filter_keys_htable_ptr, uint32_t* filter_key_ct_ptr, uint32_t* filter_keys_htable_size_ptr) {
4073   unsigned char* bigstack_end_mark = g_bigstack_end;
4074   PglErr reterr = kPglRetSuccess;
4075   {
4076     // Start with empty size-128 table, which will practically always be enough
4077     // while still being small relative to L1 cache.  Double table size
4078     // whenever load factor reaches 0.25; there shouldn't be *that* many
4079     // distinct filters.
4080     // possible todo: multithread this scan, merge results at the end; can also
4081     // separate this stage from the rest of the function.
4082     uint32_t table_size = 128;
4083     uint32_t hash_shift = 25; // 32 - log2(table_size)
4084     uint32_t filter_key_ct = 0;
4085     char** filter_tokens;
4086     if (unlikely(
4087             bigstack_end_calloc_cp(table_size, &filter_tokens))) {
4088       goto MakeFilterHtable_ret_NOMEM;
4089     }
4090 
4091     unsigned char* tmp_alloc_base = g_bigstack_base;
4092     unsigned char* tmp_alloc_end = g_bigstack_end;
4093     uintptr_t variant_widx = 0;
4094     uintptr_t cur_bits = variant_include[0];
4095     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
4096       const uintptr_t lowbit = BitIter1y(variant_include, &variant_widx, &cur_bits);
4097       if (lowbit & filter_npass[variant_widx]) {
4098         const char* filter_iter = filter_storage[variant_widx * kBitsPerWord + ctzw(lowbit)];
4099         while (1) {
4100           const char* token_end = strchrnul(filter_iter, ';');
4101           const uint32_t cur_id_slen = token_end - filter_iter;
4102           for (uint32_t hashval = Hash32(filter_iter, cur_id_slen) >> hash_shift; ; ) {
4103             char* cur_token_ptr = filter_tokens[hashval];
4104             if (!cur_token_ptr) {
4105               char* storage_loc;
4106               if (StoreStringAtBase(tmp_alloc_end, filter_iter, cur_id_slen, &tmp_alloc_base, &storage_loc)) {
4107                 goto MakeFilterHtable_ret_NOMEM;
4108               }
4109               ++filter_key_ct;
4110               if (filter_key_ct * 4 < table_size) {
4111                 filter_tokens[hashval] = storage_loc;
4112                 break;
4113               }
4114 #ifdef __LP64__
4115               if (unlikely(hash_shift == 1)) {
4116                 // this is technically "not yet supported", but I fail to see a
4117                 // valid use case for >536 million distinct FILTER keys...
4118                 logerrprintf("Error: Too many distinct FILTER keys (max 2^29 - 1).\n");
4119                 goto MakeFilterHtable_ret_MALFORMED_INPUT;
4120               }
4121 #endif
4122               // It's fine for the new table to overlap the old table, since we
4123               // can iterate through all the strings by walking forward from
4124               // g_bigstack_base.
4125               const uintptr_t extra_byte_ct = table_size * sizeof(intptr_t);
4126               if (unlikely(S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) < extra_byte_ct)) {
4127                 goto MakeFilterHtable_ret_NOMEM;
4128               }
4129               tmp_alloc_end -= extra_byte_ct;
4130               filter_tokens = R_CAST(char**, tmp_alloc_end);
4131               memset(filter_tokens, 0, 2 * extra_byte_ct);
4132               table_size *= 2;
4133               --hash_shift;
4134               char* rehash_iter = R_CAST(char*, g_bigstack_base);
4135               for (uint32_t uii = 0; uii != filter_key_ct; ++uii) {
4136                 char* rehash_token_end = strnul(rehash_iter);
4137                 const uint32_t rehash_id_slen = rehash_token_end - rehash_iter;
4138                 for (uint32_t rehashval = Hash32(rehash_iter, rehash_id_slen) >> hash_shift; ; ) {
4139                   if (!filter_tokens[rehashval]) {
4140                     filter_tokens[rehashval] = rehash_iter;
4141                     break;
4142                   }
4143                   if (++rehashval == table_size) {
4144                     rehashval = 0;
4145                   }
4146                 }
4147                 rehash_iter = &(rehash_token_end[1]);
4148               }
4149               break;
4150             }
4151             if ((!memcmp(filter_iter, cur_token_ptr, cur_id_slen)) && (!cur_token_ptr[cur_id_slen])) {
4152               break;
4153             }
4154             if (++hashval == table_size) {
4155               hashval = 0;
4156             }
4157           }
4158           if (!(*token_end)) {
4159             break;
4160           }
4161           filter_iter = &(token_end[1]);
4162         }
4163       }
4164     }
4165     if (!filter_key_ct) {
4166       // All nonpassing variants were already filtered out.
4167       // Caller already initialized null table.
4168       goto MakeFilterHtable_ret_1;
4169     }
4170     char* token_iter = R_CAST(char*, g_bigstack_base);
4171     BigstackBaseSet(tmp_alloc_base);
4172     const uint32_t filter_keys_htable_size = GetHtableFastSize(filter_key_ct);
4173     if (unlikely(
4174             bigstack_alloc_kcp(filter_key_ct, filter_keys_ptr) ||
4175             bigstack_alloc_u32(filter_keys_htable_size, filter_keys_htable_ptr))) {
4176       goto MakeFilterHtable_ret_NOMEM;
4177     }
4178     const char** filter_keys = *filter_keys_ptr;
4179     for (uint32_t uii = 0; uii != filter_key_ct; ++uii) {
4180       filter_keys[uii] = token_iter;
4181       char* token_end = strnul(token_iter);
4182       token_iter = &(token_end[1]);
4183     }
4184     StrptrArrNsort(filter_key_ct, filter_keys);
4185     *filter_key_ct_ptr = filter_key_ct;
4186     *filter_keys_htable_size_ptr = filter_keys_htable_size;
4187     uint32_t* filter_keys_htable = *filter_keys_htable_ptr;
4188     SetAllU32Arr(filter_keys_htable_size, filter_keys_htable);
4189     for (uint32_t uii = 0; uii != filter_key_ct; ++uii) {
4190       for (uint32_t hashval = Hashceil(filter_keys[uii], strlen(filter_keys[uii]), filter_keys_htable_size); ; ) {
4191         if (filter_keys_htable[hashval] == UINT32_MAX) {
4192           filter_keys_htable[hashval] = uii;
4193         }
4194         if (++hashval == filter_keys_htable_size) {
4195           hashval = 0;
4196         }
4197       }
4198     }
4199   }
4200   while (0) {
4201   MakeFilterHtable_ret_NOMEM:
4202     reterr = kPglRetNomem;
4203     break;
4204 #ifdef __LP64__
4205   MakeFilterHtable_ret_MALFORMED_INPUT:
4206     reterr = kPglRetMalformedInput;
4207     break;
4208 #endif
4209   }
4210  MakeFilterHtable_ret_1:
4211   BigstackEndReset(bigstack_end_mark);
4212   return reterr;
4213 }
4214 
4215 /*
4216 PglErr WritePvarJoin(const char* outname, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, const char* missing_varid_match, const char* const* info_keys, const uint32_t* info_keys_htable, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t new_variant_id_max_allele_slen, uint32_t max_write_allele_ct, uint32_t max_missalt_ct, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, uint32_t info_key_ct, uint32_t info_keys_htable_size, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct, char* xheader) {
4217   unsigned char* bigstack_mark = g_bigstack_base;
4218   char* cswritep = nullptr;
4219   PglErr reterr = kPglRetSuccess;
4220   CompressStreamState css;
4221   TextStream pvar_reload_txs;
4222   PreinitCstream(&css);
4223   PreinitTextStream(&pvar_reload_txs);
4224   {
4225     const uint32_t max_chr_blen = GetMaxChrSlen(cip) + 1;
4226     // includes trailing tab
4227     char* chr_buf;
4228 
4229     if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
4230       goto WritePvarJoin_ret_NOMEM;
4231     }
4232     const uint32_t new_variant_id_overflow_missing = (misc_flags / kfMiscNewVarIdOverflowMissing) & 1;
4233     const uint32_t varid_dup = (make_plink2_flags / kfMakePlink2VaridDup) & 1;
4234     VaridTemplate* varid_templatep = nullptr;
4235     if (!missing_varid_match) {
4236       missing_varid_match = &(g_one_char_strs[92]); // '.'
4237     }
4238     uint32_t missing_varid_slen = strlen(missing_varid_match);
4239     uint32_t missing_varid_match_blen = 0; // nonzero iff --set-missing-var-ids
4240     if (varid_template_str) {
4241       if (misc_flags & kfMiscSetMissingVarIds) {
4242         missing_varid_match_blen = missing_varid_slen + 1;
4243       }
4244       if (unlikely(BIGSTACK_ALLOC_X(VaridTemplate, 1, &varid_templatep))) {
4245         goto WritePvarJoin_ret_NOMEM;
4246       }
4247       const uint32_t overflow_substitute_blen = new_variant_id_overflow_missing? (missing_varid_slen + 1) : 0;
4248       VaridTemplateInit(varid_template_str, missing_varid_match, chr_buf, new_variant_id_max_allele_slen, overflow_substitute_blen, varid_templatep);
4249       if (varid_dup) {
4250         for (uint32_t uii = 0; uii != varid_templatep->insert_ct; ++uii) {
4251           const uint32_t insert_type = varid_templatep->insert_types[uii];
4252           if ((insert_type == 3) || ((insert_type == 2) && (varid_templatep->alleles_needed & 4))) {
4253             // Could define what takes precedence here, but simpler to prohibit
4254             // this combination.
4255             logerrputs("Error: 'vid-[split-]dup' cannot be used with a --set-all-var-ids or\n--set-missing-var-ids template string containing a non-REF allele.\n");
4256             goto WritePvarJoin_ret_INVALID_CMDLINE;
4257           }
4258         }
4259       }
4260     }
4261 
4262     uintptr_t overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen + max_filter_slen + S_CAST(uintptr_t, info_reload_slen) * (max_write_allele_ct - 1);
4263     if (overflow_buf_size < 2 * kCompressStreamBlock) {
4264       overflow_buf_size = 2 * kCompressStreamBlock;
4265     }
4266     const uint32_t output_zst = (pvar_psam_flags / kfPvarZs) & 1;
4267     reterr = InitCstreamAlloc(outname, 0, output_zst, thread_ct, overflow_buf_size, &css, &cswritep);
4268     if (unlikely(reterr)) {
4269       goto WritePvarJoin_ret_1;
4270     }
4271 
4272     const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
4273     const uint32_t all_nonref = (nonref_flags_storage == 2);
4274     uint32_t write_info_pr = all_nonref;
4275     uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
4276     if (write_info && nonref_flags) {
4277       write_info_pr = !IntersectionIsEmpty(variant_include, nonref_flags, raw_variant_ctl);
4278     }
4279     write_info_pr = write_info_pr && write_info;
4280     if (unlikely(write_info_pr && (info_flags & kfInfoPrNonflagPresent))) {
4281       logputs("\n");
4282       logerrputs("Error: Conflicting INFO:PR definitions.  Either fix all REF alleles so that the\n'provisional reference' flag is no longer needed, or remove/rename the other\nuse of the INFO:PR key.\n");
4283       goto WritePvarJoin_ret_INCONSISTENT_INPUT;
4284     }
4285 
4286     char* pvar_info_line_iter = nullptr;
4287     uint32_t write_filter = 0;
4288     if (pvar_psam_flags & kfPvarColFilter) {
4289       write_filter = 1;
4290     } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
4291       write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
4292     }
4293     uint32_t info_col_idx = 0;  // could save this during first load instead
4294     const uint32_t info_pr_flag_present = (info_flags / kfInfoPrFlagPresent) & 1;
4295     if (pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) {
4296       reterr = PvarXheaderWrite(variant_include, cip, variant_bps, allele_idx_offsets, allele_storage, nullptr, xheader_blen, (pvar_psam_flags / kfPvarColVcfheader) & 1, write_filter, write_info, write_info_pr && (!info_pr_flag_present), max_allele_slen, vpos_sortstatus, xheader, &css, &cswritep);
4297       if (unlikely(reterr)) {
4298         goto WritePvarJoin_ret_1;
4299       }
4300     }
4301     const uint32_t join_mode = (make_plink2_flags & (kfMakePlink2MSplitBase * 7));
4302     uintptr_t info_cache_size = max_missalt_ct + max_write_allele_ct - 1;
4303     if (join_mode != kfMakePlink2MJoinSnps) {
4304       info_cache_size *= 3;
4305     }
4306 #ifndef __LP64__
4307     if (S_CAST(uint64_t, info_cache_size) * info_key_ct * sizeof(intptr_t) > 0x7fffffff) {
4308       goto WritePvarJoin_ret_NOMEM;
4309     }
4310 #endif
4311 
4312     if (cip->chrset_source) {
4313       AppendChrsetLine(cip, &cswritep);
4314     }
4315     cswritep = strcpya_k(cswritep, "#CHROM\tPOS\tID\tREF\tALT");
4316 
4317     uint32_t write_qual = 0;
4318     if (pvar_psam_flags & kfPvarColQual) {
4319       write_qual = 1;
4320     } else if ((pvar_psam_flags & kfPvarColMaybequal) && qual_present) {
4321       write_qual = !IntersectionIsEmpty(variant_include, qual_present, raw_variant_ctl);
4322     }
4323     if (write_qual) {
4324       cswritep = strcpya_k(cswritep, "\tQUAL");
4325     }
4326     const char** filter_keys = nullptr;
4327     uint32_t* filter_keys_htable = nullptr;
4328     uintptr_t* cur_filter_keys = nullptr;
4329     uint32_t filter_keys_htable_size = 0;
4330     uint32_t filter_key_ct = 0;
4331     uint32_t filter_key_ctl = 0;
4332     if (write_filter) {
4333       // The VCF spec doesn't require ##FILTER= header lines, and unlike the
4334       // case with INFO Number=A/R/G, we can join correctly without header
4335       // information.  It's slightly computationally more expensive, but INFO
4336       // and genotype joining costs are more significant.
4337       if (filter_npass) {
4338         reterr = MakeFilterHtable(variant_include, filter_npass, filter_storage, variant_ct, &filter_keys, &filter_keys_htable, &filter_key_ct, &filter_keys_htable_size);
4339         if (unlikely(reterr)) {
4340           goto WritePvarJoin_ret_1;
4341         }
4342         if (filter_key_ct) {
4343           filter_key_ctl = BitCtToWordCt(filter_key_ct);
4344           if (unlikely(bigstack_alloc_w(filter_key_ctl, &cur_filter_keys))) {
4345             goto WritePvarJoin_ret_1;
4346           }
4347         }
4348       }
4349       cswritep = strcpya_k(cswritep, "\tFILTER");
4350     }
4351 
4352     char** info_bufs = nullptr;
4353     const char** info_starts = nullptr;
4354     const char** info_ends = nullptr;  // ugh, this is not related to INFO:END
4355     const char** info_curs = nullptr;
4356     uint32_t info_end_key_idx = UINT32_MAX;
4357     if (pvar_info_reload) {
4358       if (unlikely(
4359               bigstack_alloc_cp(info_cache_size, &info_bufs) ||
4360               bigstack_alloc_kcp(info_key_ct * info_cache_size, &info_starts) ||
4361               bigstack_alloc_kcp(info_key_ct * info_cache_size, &info_ends) ||
4362               bigstack_alloc_kcp(info_key_ct * info_cache_size, &info_curs))) {
4363         goto WritePvarJoin_ret_NOMEM;
4364       }
4365       reterr = PvarInfoOpenAndReloadHeader(pvar_info_reload, 1 + (thread_ct > 1), &pvar_reload_txs, &pvar_info_line_iter, &info_col_idx);
4366       if (unlikely(reterr)) {
4367         goto WritePvarJoin_ret_TSTREAM_FAIL;
4368       }
4369       info_end_key_idx = IdHtableFind("END", info_keys, info_keys_htable, strlen("END"), info_keys_htable_size);
4370       if (info_end_key_idx != UINT32_MAX) {
4371         const int32_t knum = const_container_of(info_keys[info_end_key_idx], InfoVtype, key)->num;
4372         if ((knum != 1) && (knum != kInfoVtypeUnknown)) {
4373           // TODO: verify type instead.
4374           // but if number is not . or 1, this is not the INFO:END we're
4375           // looking for.
4376           info_end_key_idx = UINT32_MAX;
4377         }
4378       }
4379     }
4380     if (write_info) {
4381       cswritep = strcpya_k(cswritep, "\tINFO");
4382     }
4383 
4384     uint32_t write_cm = 0;
4385     if (pvar_psam_flags & kfPvarColCm) {
4386       write_cm = 1;
4387     } else if ((pvar_psam_flags & kfPvarColMaybecm) && variant_cms) {
4388       if (raw_variant_ct == variant_ct) {
4389         // nonzero_cm_present check was performed
4390         write_cm = 1;
4391       } else {
4392         uintptr_t variant_uidx_base = 0;
4393         uintptr_t cur_bits = variant_include[0];
4394         for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
4395           const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
4396           if (variant_cms[variant_uidx] != 0.0) {
4397             write_cm = 1;
4398             break;
4399           }
4400         }
4401       }
4402     }
4403     if (write_cm) {
4404       cswritep = strcpya_k(cswritep, "\tCM");
4405     }
4406     AppendBinaryEoln(&cswritep);
4407 
4408     const VaridTemplate* cur_varid_templatep = nullptr;
4409     const char* varid_token_start = nullptr; // for vid-split
4410     const uint32_t varid_split = (make_plink2_flags / kfMakePlink2VaridSemicolon) & 1;
4411     const uint32_t varid_dup_nosplit = varid_dup && (!varid_split);
4412     uint32_t next_variant_idx = 0;
4413     uint32_t trs_variant_uidx = 0;
4414     uint32_t next_variant_uidx = 0;
4415     uintptr_t next_variant_uidx_base = 0;
4416     uintptr_t next_bits = variant_include[0];
4417     uint32_t chr_fo_idx = UINT32_MAX;
4418     uint32_t chr_end = 0;
4419     uint32_t chr_buf_blen = 0;
4420     uint32_t prev_bp = 0;
4421     uint32_t cur_bp = 0;
4422     uint32_t bp_start_variant_idx = 0;
4423     uint32_t bp_start_variant_uidx = 0;
4424     uintptr_t bp_start_variant_uidx_base = 0;
4425     uintptr_t bp_start_bits = variant_include[0];
4426     uint32_t allele_ct = 2;
4427     uint32_t pct = 0;
4428     uint32_t next_print_variant_idx = variant_ct / 100;
4429     JoinCounts jc;
4430     jc.snp_ct = 0;
4431     jc.nonsnp_ct = 0;
4432     jc.symbolic_ct = 0;
4433     jc.missalt_snp_ct = 0;
4434     jc.missalt_nonsnp_ct = 0;
4435     JoinCounts next_jc = jc;
4436     fputs("0%", stdout);
4437     fflush(stdout);
4438     while (1) {
4439       for (; next_variant_idx != variant_ct; ++next_variant_idx) {
4440         next_variant_uidx = BitIter1(variant_include, &next_variant_uidx_base, &next_bits);
4441         if (next_variant_uidx >= chr_end) {
4442           do {
4443             ++chr_fo_idx;
4444             chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
4445           } while (next_variant_uidx >= chr_end);
4446           char* chr_name_end = chrtoa(cip, cip->chr_file_order[chr_fo_idx], chr_buf);
4447           *chr_name_end = '\t';
4448           const uint32_t chr_slen = chr_name_end - chr_buf;
4449           chr_buf_blen = 1 + chr_slen;
4450           if (varid_templatep) {
4451             const int32_t chr_slen_delta = chr_slen - varid_templatep->chr_slen;
4452             varid_templatep->chr_slen = chr_slen;
4453             varid_templatep->base_len += chr_slen_delta;
4454           }
4455           prev_bp = UINT32_MAX;
4456         }
4457         cur_bp = variant_bps[next_variant_uidx];
4458         if (cur_bp != prev_bp) {
4459           break;
4460         }
4461         uintptr_t allele_idx_offset_base;
4462         if (!allele_idx_offsets) {
4463           allele_idx_offset_base = next_variant_uidx * 2;
4464         } else {
4465           allele_idx_offset_base = allele_idx_offsets[next_variant_uidx];
4466           allele_ct = allele_idx_offsets[next_variant_uidx + 1] - allele_idx_offset_base;
4467         }
4468         const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
4469         JoinVtype jvt = JoinCount(cur_alleles, allele_ct, &next_jc);
4470         // previously validated
4471         // if ((join_mode == kfMakePlink2MJoinSnps) && ()) {
4472         // }
4473 
4474         // TODO
4475         jc.snp_ct += next_jc.snp_ct;
4476         jc.nonsnp_ct += next_jc.nonsnp_ct;
4477         jc.symbolic_ct += next_jc.symbolic_ct;
4478         jc.missalt_snp_ct += next_jc.missalt_snp_ct;
4479         jc.missalt_nonsnp_ct += next_jc.missalt_nonsnp_ct;
4480       }
4481       if (next_variant_idx == bp_start_variant_idx + 1) {
4482         // No join needed.  This is usually the common case, so we duplicate a
4483         // bunch of code for the sake of avoiding slowdown here.
4484         cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
4485         cswritep = u32toa_x(variant_bps[bp_start_variant_uidx], '\t', cswritep);
4486         cswritep = strcpyax(cswritep, variant_ids[bp_start_variant_uidx], '\t');
4487         uintptr_t allele_idx_offset_base;
4488         if (!allele_idx_offsets) {
4489           allele_idx_offset_base = bp_start_variant_uidx * 2;
4490         } else {
4491           allele_idx_offset_base = allele_idx_offsets[bp_start_variant_uidx];
4492           allele_ct = allele_idx_offsets[bp_start_variant_uidx + 1] - allele_idx_offset_base;
4493         }
4494         const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
4495         cswritep = strcpyax(cswritep, cur_alleles[0], '\t');
4496         cswritep = strcpya(cswritep, cur_alleles[1]);
4497         if (unlikely(Cswrite(&css, &cswritep))) {
4498           goto WritePvarJoin_ret_WRITE_FAIL;
4499         }
4500         for (uint32_t allele_idx = 2; allele_idx != allele_ct; ++allele_idx) {
4501           *cswritep++ = ',';
4502           cswritep = strcpya(cswritep, cur_alleles[allele_idx]);
4503           if (unlikely(Cswrite(&css, &cswritep))) {
4504             goto WritePvarJoin_ret_WRITE_FAIL;
4505           }
4506         }
4507 
4508         if (write_qual) {
4509           *cswritep++ = '\t';
4510           if ((!qual_present) || (!IsSet(qual_present, bp_start_variant_uidx))) {
4511             *cswritep++ = '.';
4512           } else {
4513             cswritep = ftoa_g(quals[bp_start_variant_uidx], cswritep);
4514           }
4515         }
4516 
4517         if (write_filter) {
4518           *cswritep++ = '\t';
4519           if ((!filter_present) || (!IsSet(filter_present, bp_start_variant_uidx))) {
4520             *cswritep++ = '.';
4521           } else if (!IsSet(filter_npass, bp_start_variant_uidx)) {
4522             cswritep = strcpya_k(cswritep, "PASS");
4523           } else {
4524             cswritep = strcpya(cswritep, filter_storage[bp_start_variant_uidx]);
4525           }
4526         }
4527 
4528         if (write_info) {
4529           *cswritep++ = '\t';
4530           const uint32_t is_pr = all_nonref || (nonref_flags && IsSet(nonref_flags, bp_start_variant_uidx));
4531           if (pvar_info_line_iter) {
4532             reterr = PvarInfoReloadAndWrite(info_pr_flag_present, info_col_idx, bp_start_variant_uidx, is_pr, &pvar_reload_txs, &pvar_info_line_iter, &cswritep, &trs_variant_uidx);
4533             if (unlikely(reterr)) {
4534               goto WritePvarJoin_ret_TSTREAM_FAIL;
4535             }
4536           } else {
4537             if (is_pr) {
4538               cswritep = strcpya_k(cswritep, "PR");
4539             } else {
4540               *cswritep++ = '.';
4541             }
4542           }
4543         }
4544 
4545         if (write_cm) {
4546           *cswritep++ = '\t';
4547           if (!variant_cms) {
4548             *cswritep++ = '0';
4549           } else {
4550             cswritep = dtoa_g_p8(variant_cms[bp_start_variant_uidx], cswritep);
4551           }
4552         }
4553         AppendBinaryEoln(&cswritep);
4554         // next_jc guaranteed to be zero-initialized
4555       } else if (next_variant_idx) {
4556         // TODO
4557         ;;;;
4558       const char* orig_variant_id = variant_ids[variant_uidx];
4559       const char* ref_allele = cur_alleles[0];
4560       const uint32_t ref_allele_slen = strlen(ref_allele);
4561       uint32_t split_ct_p1 = allele_ct;
4562       if (allele_ct > 2) {
4563         if (!varid_dup) {
4564           if (varid_templatep && (!missing_varid_match_blen)) {
4565             cur_varid_templatep = varid_templatep;
4566           } else {
4567             cur_varid_templatep = nullptr;
4568             if (varid_split) {
4569               if (VaridSplitOk(orig_variant_id, allele_ct)) {
4570                 varid_token_start = orig_variant_id;
4571               } else {
4572                 varid_token_start = nullptr;
4573               }
4574             }
4575             if ((!varid_token_start) && varid_templatep) {
4576               // Note that --set-missing-var-ids almost always applies here
4577               // when it's specified; only exception is when vid-split was also
4578               // specified and the split succeeded.
4579               cur_varid_templatep = varid_templatep;
4580             }
4581           }
4582         }
4583       }
4584        ;;;;
4585         next_jc.snp_ct = 0;
4586         next_jc.nonsnp_ct = 0;
4587         next_jc.symbolic_ct = 0;
4588         next_jc.missalt_snp_ct = 0;
4589         next_jc.missalt_nonsnp_ct = 0;
4590       }
4591       if (next_variant_idx == variant_ct) {
4592         break;
4593       }
4594       // this_pos_write_variant_ct = 0;
4595       jc = next_jc;
4596       prev_bp = cur_bp;
4597       bp_start_variant_idx = next_variant_idx;
4598       bp_start_variant_uidx = next_variant_uidx;
4599       if (next_variant_idx >= next_print_variant_idx) {
4600         if (pct > 10) {
4601           putc_unlocked('\b', stdout);
4602         }
4603         pct = (next_variant_idx * 100LLU) / variant_ct;
4604         printf("\b\b%u%%", pct++);
4605         fflush(stdout);
4606         next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
4607       }
4608     }
4609     if (unlikely(CswriteCloseNull(&css, cswritep))) {
4610       goto WritePvarJoin_ret_WRITE_FAIL;
4611     }
4612     if (pct > 10) {
4613       putc_unlocked('\b', stdout);
4614     }
4615     fputs("\b\b", stdout);
4616   }
4617   while (0) {
4618   WritePvarJoin_ret_NOMEM:
4619     reterr = kPglRetNomem;
4620     break;
4621   WritePvarJoin_ret_TSTREAM_FAIL:
4622     TextStreamErrPrint(pvar_info_reload, &pvar_reload_txs);
4623     break;
4624   WritePvarJoin_ret_WRITE_FAIL:
4625     reterr = kPglRetWriteFail;
4626     break;
4627   WritePvarJoin_ret_INVALID_CMDLINE:
4628     reterr = kPglRetInvalidCmdline;
4629     break;
4630   WritePvarJoin_ret_INCONSISTENT_INPUT:
4631     reterr = kPglRetInconsistentInput;
4632     break;
4633   }
4634  WritePvarJoin_ret_1:
4635   CswriteCloseCond(&css, cswritep);
4636   CleanupTextStream2(pvar_info_reload, &pvar_reload_txs, &reterr);
4637   BigstackReset(bigstack_mark);
4638   return reterr;
4639 }
4640 */
4641 
// Bitflags stored in MakeCommon.plink2_write_flags.  Each nonzero value is a
// distinct single bit, so flags can be OR'd together and tested with '&'.
// Only SetHhMissing, SetMixedMtMissing, and Plink1 are consulted by
// MakeBedlikeThread(); the notes on the other flags are hedged accordingly.
FLAGSET_DEF_START()
  kfPlink2Write0,
  // MakeBedlikeThread(): on haploid non-MT chromosomes, chrX goes through
  // SetMaleHetMissing(); other chromosomes go through SetHetMissing(), and
  // chrY additionally through InterleavedSetMissing() with the female mask.
  kfPlink2WriteSetHhMissing = (1 << 0),
  // NOTE(review): presumably the dosage-preserving variant of the flag above;
  // not referenced by MakeBedlikeThread() -- confirm against the .pgen writer.
  kfPlink2WriteSetHhMissingKeepDosage = (1 << 1),
  // MakeBedlikeThread(): on chrMT, SetHetMissing() is applied (only reached
  // when the SetHhMissing branch above was not taken).
  kfPlink2WriteSetMixedMtMissing = (1 << 2),
  // NOTE(review): presumably the dosage-preserving variant of the flag above;
  // not referenced by MakeBedlikeThread() -- confirm against the .pgen writer.
  kfPlink2WriteSetMixedMtMissingKeepDosage = (1 << 3),
  // NOTE(review): the next three look like --set-me-missing, --zero-cluster,
  // and --fill-missing-with-ref, which MakeBedlikeThread() lists as todo items
  // -- confirm.
  kfPlink2WriteMeMissing = (1 << 4),
  kfPlink2WriteZeroCluster = (1 << 5),
  kfPlink2WriteFillRef = (1 << 6),
  // NOTE(review): not referenced by MakeBedlikeThread(); meaning must be
  // confirmed against the .pgen writer.
  kfPlink2WriteLateDosageErase = (1 << 7),
  // no need for sample_sort, determined by collapsed_sort_map != nullptr?
  // MakeBedlikeMain() sets this flag when kfMakeBed is requested;
  // MakeBedlikeThread() then calls PgrPlink2ToPlink1InplaceUnsafe() on each
  // genotype vector before it is stored.
  kfPlink2WritePlink1 = (1 << 8)
FLAGSET_DEF_END(Plink2WriteFlags);
4655 // todo: add .pgen-specific stuff
4656 
// Settings read by the genotype-writing worker threads through a const
// pointer (see MakeBedlikeCtx.mcp).
typedef struct MakeCommonStruct {
  // Chromosome info; MakeBedlikeThread() reads xymt_codes, chr_file_order,
  // chr_fo_vidx_start, and haploid_mask from it.
  const ChrInfo* cip;
  // Sample-inclusion bitarray passed to PgrGet()/PgrGetD().
  const uintptr_t* sample_include;
  // Passed to SetMaleHetMissing() for chrX variants when
  // kfPlink2WriteSetHhMissing is set.
  uintptr_t* sex_male_collapsed_interleaved;
  // Passed to InterleavedSetMissing() for chrY variants when
  // kfPlink2WriteSetHhMissing is set.
  uintptr_t* sex_female_collapsed_interleaved;
  // May be nullptr.  Otherwise indexed by variant_uidx; MakeBedlikeThread()
  // inverts the genotype vector when entry [variant_uidx][0] == 1.
  const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select);
  // MakeBedlikeMain() uses raw_sample_ct to size the cumulative-popcount
  // array, and both counts to choose the thread count and detect subsetting.
  uint32_t raw_sample_ct;
  // Number of samples per written genotype vector.
  uint32_t sample_ct;
  Plink2WriteFlags plink2_write_flags;
  // Set by MakeBedlikeMain(): zero unless a hard-call threshold was given and
  // the input has dosage data, in which case it is
  // (kDosage4th - hard_call_thresh).  When nonzero, MakeBedlikeThread() loads
  // dosages and calls ApplyHardCallThresh().
  uint32_t hard_call_halfdist;
} MakeCommon;
4668 
// Shared context for MakeBedlikeThread() workers.  Double-pointer members
// (pgr_ptrs, genovecs, dosage_presents, dosage_mains) and
// read_variant_uidx_starts are indexed by thread index.
typedef struct MakeBedlikeCtxStruct {
  const MakeCommon* mcp;

  // Variant-inclusion bitarray iterated with BitIter1().
  const uintptr_t* variant_include;
  // Filled by FillCumulativePopcounts() in MakeBedlikeMain(); handed to
  // PgrSetSampleSubsetIndex() by each worker.
  uint32_t* sample_include_cumulative_popcounts;
  // nullptr => genotype vectors are copied to the write buffer as-is.
  // Otherwise they are stored through GenovecResort() using this map.
  const uint32_t* collapsed_sort_map;

  PgenReader** pgr_ptrs;

  // Per-thread variant_uidx at which BitIter1Start() begins for the current
  // block.
  uint32_t* read_variant_uidx_starts;
  // Number of variants in the current block; each worker handles the
  // [tidx * ct / thread_ct, (tidx + 1) * ct / thread_ct) slice.
  uint32_t cur_block_write_ct;

  uintptr_t** genovecs;
  // dosage_presents and dosage_mains are nullptr unless
  // mcp->hard_call_halfdist is nonzero (see the PgenMtLoadInit() call in
  // MakeBedlikeMain()).
  uintptr_t** dosage_presents;
  Dosage** dosage_mains;

  // Two output buffers; workers alternate between them on successive blocks.
  unsigned char* writebufs[2];
  PglErr reterr;  // can only be kPglRetMalformedInput for now
} MakeBedlikeCtx;
4688 
4689 
MakeBedlikeThread(void * raw_arg)4690 THREAD_FUNC_DECL MakeBedlikeThread(void* raw_arg) {
4691   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
4692   const uintptr_t tidx = arg->tidx;
4693   MakeBedlikeCtx* ctx = S_CAST(MakeBedlikeCtx*, arg->sharedp->context);
4694 
4695   PgenReader* pgrp = ctx->pgr_ptrs[tidx];
4696   uintptr_t* genovec = ctx->genovecs[tidx];
4697   const MakeCommon* mcp = ctx->mcp;
4698   uintptr_t* dosage_present = nullptr;
4699   Dosage* dosage_main = nullptr;
4700   uint32_t hard_call_halfdist = 0;
4701   if (ctx->dosage_presents) {
4702     dosage_present = ctx->dosage_presents[tidx];
4703     dosage_main = ctx->dosage_mains[tidx];
4704     hard_call_halfdist = mcp->hard_call_halfdist;
4705   }
4706   const uintptr_t* variant_include = ctx->variant_include;
4707   const ChrInfo* cip = mcp->cip;
4708   const uintptr_t* sample_include = mcp->sample_include;
4709   PgrSampleSubsetIndex pssi;
4710   PgrSetSampleSubsetIndex(ctx->sample_include_cumulative_popcounts, pgrp, &pssi);
4711   const uintptr_t* sex_male_collapsed_interleaved = mcp->sex_male_collapsed_interleaved;
4712   const uintptr_t* sex_female_collapsed_interleaved = mcp->sex_female_collapsed_interleaved;
4713   const uint32_t* collapsed_sort_map = ctx->collapsed_sort_map;
4714   const Plink2WriteFlags plink2_write_flags = mcp->plink2_write_flags;
4715   const uint32_t set_hh_missing = plink2_write_flags & kfPlink2WriteSetHhMissing;
4716   const uint32_t set_mixed_mt_missing = plink2_write_flags & kfPlink2WriteSetMixedMtMissing;
4717   const uint32_t write_plink1 = plink2_write_flags & kfPlink2WritePlink1;
4718   const uint32_t sample_ct = mcp->sample_ct;
4719   const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
4720   const uint32_t sample_ctv2 = NypCtToVecCt(sample_ct);
4721   const uint32_t sample_ct4 = NypCtToByteCt(sample_ct);
4722   const uint32_t calc_thread_ct = GetThreadCt(arg->sharedp);
4723   const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select) = mcp->refalt1_select;
4724   const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
4725   const uint32_t y_code = cip->xymt_codes[kChrOffsetY];
4726   const uint32_t mt_code = cip->xymt_codes[kChrOffsetMT];
4727   uint32_t parity = 0;
4728   do {
4729     const uintptr_t cur_block_write_ct = ctx->cur_block_write_ct;
4730     uint32_t write_idx = (tidx * cur_block_write_ct) / calc_thread_ct;
4731     const uint32_t write_idx_end = ((tidx + 1) * cur_block_write_ct) / calc_thread_ct;
4732     unsigned char* writebuf_iter = &(ctx->writebufs[parity][write_idx * sample_ct4]);
4733     uintptr_t variant_uidx_base;
4734     uintptr_t cur_bits;
4735     BitIter1Start(variant_include, ctx->read_variant_uidx_starts[tidx], &variant_uidx_base, &cur_bits);
4736     uint32_t chr_end = 0;
4737     uint32_t is_x = 0;
4738     uint32_t is_y = 0;
4739     uint32_t is_haploid_nonmt = 0;
4740     uint32_t is_mt = 0;
4741     for (; write_idx != write_idx_end; ++write_idx) {
4742       const uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
4743       if (variant_uidx >= chr_end) {
4744         const uint32_t chr_fo_idx = GetVariantChrFoIdx(cip, variant_uidx);
4745         const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
4746         chr_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
4747         is_x = (chr_idx == x_code);
4748         is_y = (chr_idx == y_code);
4749         is_mt = (chr_idx == mt_code);
4750         is_haploid_nonmt = IsSet(cip->haploid_mask, chr_idx) && (!is_mt);
4751       }
4752       // todo: Multiallelic -> two-specific-alleles downcode.
4753       // This is pretty straightforward if we're just saving hardcalls:
4754       // with 1 copy of one allele and zero copies of the other allele, we
4755       // default to saving a missing call (in the diploid case).
4756       // If dosages are involved, things are a bit less obvious: what if the
4757       // unincluded alleles have a total dosage of 0.1?  0.5?  It'll be
4758       // necessary to define a new flag allowing this threshold to be
4759       // configured.
4760       // I'm currently inclined to set unincluded dosage >= 0.5 to missing, and
4761       // otherwise the dosages are scaled up to sum to 2.
4762       // (Note that the multiallelic split operation won't work this way; it
4763       // has to use the convention that REF = anything other than the current
4764       // ALT allele.  Probably also want to support that here.)
4765       if (!hard_call_halfdist) {
4766         // if multiallelic:
4767         //   if split: call PgrGet1()
4768         //   otherwise, if erase-alt2+: call PgrGet2()
4769         //   otherwise, error out
4770         PglErr reterr = PgrGet(sample_include, pssi, sample_ct, variant_uidx, pgrp, genovec);
4771         if (unlikely(reterr)) {
4772           ctx->reterr = reterr;
4773           break;
4774         }
4775       } else {
4776         // this isn't fully implemented yet.
4777 
4778         // quasi-bugfix (4 Dec 2017): it's user-hostile to make
4779         // --hard-call-threshold not apply here.
4780         uint32_t dosage_ct;
4781         // if multiallelic:
4782         //    if split: call PgrGet1D()
4783         //    otherwise, if refalt1_select + erase-alt2+: call PgrGetMD(),
4784         //      rescale
4785         //    otherwise, error out
4786         PglErr reterr = PgrGetD(sample_include, pssi, sample_ct, variant_uidx, pgrp, genovec, dosage_present, dosage_main, &dosage_ct);
4787         if (unlikely(reterr)) {
4788           ctx->reterr = reterr;
4789           break;
4790         }
4791         ApplyHardCallThresh(dosage_present, dosage_main, dosage_ct, hard_call_halfdist, genovec);
4792       }
4793       // remove this when proper multiallelic logic implemented
4794       if (refalt1_select && (refalt1_select[variant_uidx][0] == 1)) {
4795         GenovecInvertUnsafe(sample_ct, genovec);
4796       }
4797       if (set_hh_missing && is_haploid_nonmt) {
4798         if (is_x) {
4799           SetMaleHetMissing(sex_male_collapsed_interleaved, sample_ctv2, genovec);
4800         } else {
4801           // all hets to missing
4802           SetHetMissing(sample_ctl2, genovec);
4803           if (is_y) {
4804             InterleavedSetMissing(sex_female_collapsed_interleaved, sample_ctv2, genovec);
4805           }
4806         }
4807       } else if (set_mixed_mt_missing && is_mt) {
4808         // all hets to missing
4809         SetHetMissing(sample_ctl2, genovec);
4810       }
4811       // todo: --set-me-missing, --zero-cluster, --fill-missing-with-ref
4812       // (--set-me-missing should happen after --set-hh-missing)
4813       if (write_plink1) {
4814         PgrPlink2ToPlink1InplaceUnsafe(sample_ct, genovec);
4815       }
4816       // trailing bytes don't matter, but trailing bits of last byte may
4817       ZeroTrailingNyps(sample_ct, genovec);
4818       if (!collapsed_sort_map) {
4819         writebuf_iter = memcpyua(writebuf_iter, genovec, sample_ct4);
4820       } else {
4821         GenovecResort(genovec, collapsed_sort_map, sample_ct, writebuf_iter);
4822         writebuf_iter = &(writebuf_iter[sample_ct4]);
4823       }
4824     }
4825     parity = 1 - parity;
4826   } while (!THREAD_BLOCK_FINISH(arg));
4827   THREAD_RETURN;
4828 }
4829 
4830 // initialized mcp fields: cip, sex_male_collapsed_interleaved,
4831 // sex_female_collapsed_interleaved, raw_sample_ct, sample_ct,
4832 // plink2_write_flags
4833 PglErr MakeBedlikeMain(const uintptr_t* sample_include, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_thread_ct, uint32_t hard_call_thresh, MakePlink2Flags make_plink2_flags, uintptr_t pgr_alloc_cacheline_ct, PgenFileInfo* pgfip, MakeCommon* mcp, char* outname, char* outname_end) {
4834   FILE* outfile = nullptr;
4835   PglErr reterr = kPglRetSuccess;
4836   ThreadGroup tg;
4837   PreinitThreads(&tg);
4838   MakeBedlikeCtx ctx;
4839   {
4840     assert(variant_ct);
4841     const uint32_t sample_ct = mcp->sample_ct;
4842     assert(sample_ct);
4843     if (make_plink2_flags & kfMakePlink2MMask) {
4844       logerrputs("Error: Multiallelic-split fixed-width output is not implemented yet.\n");
4845       reterr = kPglRetNotYetSupported;
4846       goto MakeBedlikeMain_ret_1;
4847     }
4848     // fixed-width
4849     const uint32_t make_pgen = make_plink2_flags & kfMakePgen;
4850     if (make_pgen) {
4851       snprintf(outname_end, kMaxOutfnameExtBlen, ".pgen");
4852     } else {
4853       snprintf(outname_end, kMaxOutfnameExtBlen, ".bed");
4854     }
4855     if (unlikely(fopen_checked(outname, FOPEN_WB, &outfile))) {
4856       goto MakeBedlikeMain_ret_OPEN_FAIL;
4857     }
4858     if (make_pgen) {
4859       fwrite_unlocked("l\x1b\x02", 3, 1, outfile);
4860       fwrite_unlocked(&variant_ct, 4, 1, outfile);
4861       fwrite_unlocked(&sample_ct, 4, 1, outfile);
4862       if (!pgfip->nonref_flags) {
4863         const PgenGlobalFlags gflags = pgfip->gflags;
4864         uint32_t uii = 64;
4865         if (gflags & kfPgenGlobalAllNonref) {
4866           uii = 128;
4867         }
4868         putc_unlocked(uii, outfile);
4869       } else {
4870         putc_unlocked(192, outfile);
4871         fwrite_unlocked(pgfip->nonref_flags, DivUp(variant_ct, CHAR_BIT), 1, outfile);
4872       }
4873       if (unlikely(ferror_unlocked(outfile))) {
4874         goto MakeBedlikeMain_ret_WRITE_FAIL;
4875       }
4876     } else {
4877       if (unlikely(fwrite_checked("l\x1b\x01", 3, outfile))) {
4878         goto MakeBedlikeMain_ret_WRITE_FAIL;
4879       }
4880     }
4881     logprintfww5("Writing %s ... ", outname);
4882     fputs("0%", stdout);
4883     fflush(stdout);
4884     uint32_t pct = 0;
4885     const uint32_t raw_sample_ct = mcp->raw_sample_ct;
4886     const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
4887     const uintptr_t sample_ct4 = NypCtToByteCt(sample_ct);
4888     if (unlikely(bigstack_alloc_u32(raw_sample_ctl, &ctx.sample_include_cumulative_popcounts))) {
4889       goto MakeBedlikeMain_ret_NOMEM;
4890     }
4891     FillCumulativePopcounts(sample_include, raw_sample_ctl, ctx.sample_include_cumulative_popcounts);
4892     // tried more threads, pointless since this is too I/O-bound
4893     // (exception: reordering samples)
4894     uint32_t calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
4895     ctx.collapsed_sort_map = new_sample_idx_to_old;
4896     if (!new_sample_idx_to_old) {
4897       // Without BMI2 instructions, subsetting is most expensive with
4898       // sample_ct near 2/3 of raw_sample_ct; up to ~7 compute threads are
4899       // useful in that case.  (See CopyNyparrNonemptySubset().)
4900       // With them, 1-2 compute threads appear to suffice.
4901 #ifdef USE_AVX2
4902       const uint32_t calc_thread_max = 2;
4903 #else
4904       uint64_t numer;
4905       if (sample_ct * (3 * k1LU) <= raw_sample_ct * (2 * k1LU)) {
4906         numer = sample_ct * (9 * k1LU);
4907       } else {
4908         numer = (raw_sample_ct - sample_ct) * (18 * k1LU);
4909       }
4910       const uint32_t calc_thread_max = 1 + (numer / raw_sample_ct);
4911 #endif
4912       if (calc_thread_max < calc_thread_ct) {
4913         calc_thread_ct = calc_thread_max;
4914       }
4915     } else if (sample_ct < raw_sample_ct) {
4916       uint32_t* new_collapsed_sort_map;
4917       if (unlikely(bigstack_alloc_u32(sample_ct, &new_collapsed_sort_map))) {
4918         goto MakeBedlikeMain_ret_NOMEM;
4919       }
4920       // bugfix (26 Mar 2018): forgot to initialize this
4921       memcpy(new_collapsed_sort_map, new_sample_idx_to_old, sample_ct * sizeof(int32_t));
4922       UidxsToIdxs(sample_include, ctx.sample_include_cumulative_popcounts, sample_ct, new_collapsed_sort_map);
4923       ctx.collapsed_sort_map = new_collapsed_sort_map;
4924     }
4925 
4926     if (make_plink2_flags & kfMakeBed) {
4927       mcp->plink2_write_flags |= kfPlink2WritePlink1;
4928     }
4929 
4930     mcp->hard_call_halfdist = 0;
4931     if ((hard_call_thresh != UINT32_MAX) && (pgfip->gflags & (kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent))) {
4932       mcp->hard_call_halfdist = kDosage4th - hard_call_thresh;
4933     }
4934     STD_ARRAY_DECL(unsigned char*, 2, main_loadbufs);
4935     ctx.dosage_presents = nullptr;
4936     ctx.dosage_mains = nullptr;
4937     uint32_t read_block_size;
4938     if (unlikely(PgenMtLoadInit(variant_include, sample_ct, variant_ct, bigstack_left(), pgr_alloc_cacheline_ct, 0, 2 * (sample_ct4 + 1), 0, pgfip, &calc_thread_ct, &ctx.genovecs, nullptr, nullptr, nullptr, mcp->hard_call_halfdist? (&ctx.dosage_presents) : nullptr, mcp->hard_call_halfdist? (&ctx.dosage_mains) : nullptr, nullptr, nullptr, &read_block_size, nullptr, main_loadbufs, &ctx.pgr_ptrs, &ctx.read_variant_uidx_starts))) {
4939       goto MakeBedlikeMain_ret_NOMEM;
4940     }
4941     if (unlikely(
4942             bigstack_alloc_uc(sample_ct4 * read_block_size, &(ctx.writebufs[0])) ||
4943             bigstack_alloc_uc(sample_ct4 * read_block_size, &(ctx.writebufs[1])))) {
4944       // shouldn't be possible for this to fail
4945       goto MakeBedlikeMain_ret_NOMEM;
4946     }
4947     if (unlikely(SetThreadCt(calc_thread_ct, &tg))) {
4948       goto MakeBedlikeMain_ret_NOMEM;
4949     }
4950 
4951     ctx.variant_include = variant_include;
4952     mcp->refalt1_select = refalt1_select;
4953     mcp->sample_include = sample_include;
4954     mcp->sample_ct = sample_ct;
4955     ctx.mcp = mcp;
4956     ctx.reterr = kPglRetSuccess;
4957     SetThreadFuncAndData(MakeBedlikeThread, &ctx, &tg);
4958 
4959     // Main workflow:
4960     // 1. Set n=0, load/skip block 0
4961     //
4962     // 2. Spawn threads processing block n
4963     // 3. If n>0, write results for block (n-1)
4964     // 4. Increment n by 1
4965     // 5. Load/skip block n unless eof
4966     // 6. Join threads
4967     // 7. Goto step 2 unless eof
4968     //
4969     // 8. Write results for last block
4970     uint32_t parity = 0;
4971     uint32_t read_block_idx = 0;
4972     uint32_t prev_variant_idx = 0;
4973     uint32_t next_print_variant_idx = variant_ct / 100;
4974     for (uint32_t variant_idx = 0; ; ) {
4975       const uint32_t cur_block_write_ct = MultireadNonempty(variant_include, &tg, raw_variant_ct, read_block_size, pgfip, &read_block_idx, &reterr);
4976       if (unlikely(reterr)) {
4977         goto MakeBedlikeMain_ret_PGR_FAIL;
4978       }
4979       if (variant_idx) {
4980         JoinThreads(&tg);
4981         reterr = ctx.reterr;
4982         if (unlikely(reterr)) {
4983           // this should only be possible in MakePgenRobust()
4984           assert(reterr != kPglRetWriteFail);
4985           goto MakeBedlikeMain_ret_PGR_FAIL;
4986         }
4987       }
4988       if (!IsLastBlock(&tg)) {
4989         ctx.cur_block_write_ct = cur_block_write_ct;
4990         ComputeUidxStartPartition(variant_include, cur_block_write_ct, calc_thread_ct, read_block_idx * read_block_size, ctx.read_variant_uidx_starts);
4991         PgrCopyBaseAndOffset(pgfip, calc_thread_ct, ctx.pgr_ptrs);
4992         if (variant_idx + cur_block_write_ct == variant_ct) {
4993           DeclareLastThreadBlock(&tg);
4994         }
4995         if (unlikely(SpawnThreads(&tg))) {
4996           goto MakeBedlikeMain_ret_THREAD_CREATE_FAIL;
4997         }
4998       }
4999       parity = 1 - parity;
5000       if (variant_idx) {
5001         // write *previous* block results
5002         if (unlikely(fwrite_checked(ctx.writebufs[parity], (variant_idx - prev_variant_idx) * sample_ct4, outfile))) {
5003           goto MakeBedlikeMain_ret_WRITE_FAIL;
5004         }
5005         if (variant_idx == variant_ct) {
5006           break;
5007         }
5008         if (variant_idx >= next_print_variant_idx) {
5009           if (pct > 10) {
5010             putc_unlocked('\b', stdout);
5011           }
5012           pct = (variant_idx * 100LLU) / variant_ct;
5013           printf("\b\b%u%%", pct++);
5014           fflush(stdout);
5015           next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
5016         }
5017         prev_variant_idx = variant_idx;
5018       }
5019       ++read_block_idx;
5020       variant_idx += cur_block_write_ct;
5021       // crucially, this is independent of the PgenReader block_base pointers
5022       pgfip->block_base = main_loadbufs[parity];
5023     }
5024     if (unlikely(fclose_null(&outfile))) {
5025       goto MakeBedlikeMain_ret_WRITE_FAIL;
5026     }
5027     if (pct > 10) {
5028       putc_unlocked('\b', stdout);
5029     }
5030     fputs("\b\b", stdout);
5031     logputs("done.\n");
5032     // BigstackReset(bigstack_mark);
5033   }
5034   while (0) {
5035   MakeBedlikeMain_ret_NOMEM:
5036     reterr = kPglRetNomem;
5037     break;
5038   MakeBedlikeMain_ret_OPEN_FAIL:
5039     reterr = kPglRetOpenFail;
5040     break;
5041   MakeBedlikeMain_ret_PGR_FAIL:
5042     PgenErrPrintN(reterr);
5043     break;
5044   MakeBedlikeMain_ret_WRITE_FAIL:
5045     reterr = kPglRetWriteFail;
5046     break;
5047   MakeBedlikeMain_ret_THREAD_CREATE_FAIL:
5048     reterr = kPglRetThreadCreateFail;
5049     break;
5050   }
5051  MakeBedlikeMain_ret_1:
5052   CleanupThreads(&tg);
5053   fclose_cond(outfile);
5054   // parent will free memory
5055   return reterr;
5056 }
5057 
// Shared state handed to every MakePgenThread() worker; the worker obtains it
// by casting the thread group's shared context pointer.  Members described
// as "may be null" are null-checked by the worker before use.
5058 typedef struct MakePgenCtxStruct {
       // Settings shared with the other make-* code paths.  The worker reads
       // cip, refalt1_select, sample_include, the interleaved sex masks,
       // raw_sample_ct/sample_ct, plink2_write_flags and hard_call_halfdist
       // through this pointer.
5059   MakeCommon* mcp;
5060 
       // May be null.  When non-null, the worker takes the sample-resorting
       // path (GenovecResort(), CopyAndResort8bit()/CopyAndResort16bit()).
5061   const uint32_t* new_sample_idx_to_old;
       // Companion map; in the resorting path it is passed to
       // UnpackAndResortHphase().
5062   uint32_t* old_sample_idx_to_new;
5063   // combine existing chr_mask/xymt_codes/haploid_mask/chr_idx_to_foidx with
5064   // new collapsed chromosome boundary table
       // (The worker searches entries [1..chr_ct] with CountSortedSmallerU32()
       // to find the chromosome of a write-side variant index, and uses entry
       // [chr_fo_idx + 1] as that chromosome's end.)
5065   uint32_t* write_chr_fo_vidx_start;
       // May be null, in which case the worker keeps allele_ct at 2 for every
       // variant.  Otherwise the difference between consecutive entries, at
       // the write-side variant index, is the variant's allele count.
5066   const uintptr_t* write_allele_idx_offsets;
       // Passed to EraseMaleDphases() and SetMaleHetMissingCleardosage() when
       // het-haploid calls on chrX are being set to missing.
5067   const uintptr_t* sex_male_collapsed;
       // NOTE(review): loaded by the worker next to sex_male_collapsed; its
       // consumer was not checked here -- presumably the female counterpart.
5068   uintptr_t* sex_female_collapsed;
       // Dosage-erasure threshold.  The worker only runs its erase pass when
       // this value is less than kDosage4th.
5069   uint32_t dosage_erase_halfdist;
5070 
       // Indexed [parity][tidx]: where each worker starts reading the loaded
       // variant data for the current block.
5071   uintptr_t** loadbuf_thread_starts[2];
5072   // phase, dosage
       // Indexed [parity]; each entry may be null (the worker then treats
       // every variant's record type as 0).  Otherwise it holds one byte per
       // variant in the block; the worker tests bit 8 (multiallelic
       // hardcalls), 0x10 (hardcall phase), 0x60 (dosage) and 0x80 (dosage
       // phase).
5073   unsigned char* loaded_vrtypes[2];
5074 
       // Number of variants in the block being processed; re-read by the
       // worker at the top of each block.  Worker tidx handles in-block
       // indices from tidx * kPglVblockSize up to the smaller of that plus
       // kPglVblockSize and this count.
5075   uint32_t cur_block_write_ct;
5076 
       // May be null.  When non-null, the worker uses this writer's embedded
       // PgenWriterCommon; otherwise it uses pwcs[tidx].
5077   STPgenWriter* spgwp;
       // Per-thread writer state, indexed by tidx; only read when spgwp is
       // null.
5078   PgenWriterCommon** pwcs;
       // The remaining thread_* arrays are indexed by tidx and hold
       // per-worker output scratch buffers.
       // Only read when samples are being resorted or subsetted.
5079   uintptr_t** thread_write_genovecs;
       // May be null.  The per-thread buffer is split by ExpandMhc() into the
       // patch_01/patch_10 set and value arrays.
5080   uintptr_t** thread_write_mhc;
5081   // AlleleCode** thread_ac_rotate;
       // May be null; thread_write_phaseinfos and thread_all_hets are only
       // read when this is non-null.
5082   uintptr_t** thread_write_phasepresents;
5083   uintptr_t** thread_write_phaseinfos;
       // May be null even when thread_write_phasepresents is set.
5084   uintptr_t** thread_all_hets;
       // May be null; the dosage-value and dosage-phase arrays below are only
       // read when this is non-null.
5085   uintptr_t** thread_write_dosagepresents;
5086   Dosage** thread_write_dosagevals;
       // May be null even when thread_write_dosagepresents is set.
5087   uintptr_t** thread_write_dphasepresents;
       // The worker also carves a temporary dphase-delta buffer out of this
       // allocation, starting at element RoundUpPow2(sample_ct, kCacheline / 2).
5088   SDosage** thread_write_dphasedeltas;
       // Only read when resorting samples and either thread_write_mhc or
       // thread_write_dosagepresents is non-null.
5089   uint32_t** thread_cumulative_popcount_bufs;
       // NOTE(review): not referenced in the worker code examined here;
       // presumably these report a worker-side write failure and its errno
       // back to the main thread -- confirm against the rest of the file.
5090   PglErr write_reterr;
5091   int32_t write_errno;
5092 } MakePgenCtx;
5093 
5094 // One-thread-per-vblock is sensible for possibly-phased biallelic data, where
5095 // subsetting and LD-compression are a substantial fraction of processing time,
5096 // and memory requirements tend to be low enough that it's actually reasonable
5097 // for each thread job to comprise 64k variants.
5098 // Beyond that... the VCF/.pgen division of labor looks nice, but far too much
5099 // of the work is usually being done in the initial PgrGetRaw() call, so just
5100 // fall back on single-threaded invocation of the same function; only
5101 // difference is that the thread owns the writer object.
MakePgenThread(void * raw_arg)5102 THREAD_FUNC_DECL MakePgenThread(void* raw_arg) {
5103   ThreadGroupFuncArg* arg = S_CAST(ThreadGroupFuncArg*, raw_arg);
5104   const uintptr_t tidx = arg->tidx;
5105   MakePgenCtx* ctx = S_CAST(MakePgenCtx*, arg->sharedp->context);
5106 
5107   const uint32_t* new_sample_idx_to_old = ctx->new_sample_idx_to_old;
5108   const uint32_t* old_sample_idx_to_new = ctx->old_sample_idx_to_new;
5109   const MakeCommon* mcp = ctx->mcp;
5110   const ChrInfo* cip = mcp->cip;
5111   const uint32_t* write_chr_fo_vidx_start = ctx->write_chr_fo_vidx_start;
5112   const uintptr_t* write_allele_idx_offsets = ctx->write_allele_idx_offsets;
5113   const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select_iter) = mcp->refalt1_select;
5114   const uintptr_t* sample_include = mcp->sample_include;
5115 
5116   const uintptr_t* sex_male_collapsed = ctx->sex_male_collapsed;
5117 
5118   const uintptr_t* sex_male_collapsed_interleaved = mcp->sex_male_collapsed_interleaved;
5119   const uintptr_t* sex_female_collapsed = ctx->sex_female_collapsed;
5120   const uintptr_t* sex_female_collapsed_interleaved = mcp->sex_female_collapsed_interleaved;
5121   const uint32_t raw_sample_ct = mcp->raw_sample_ct;
5122   const uint32_t sample_ct = mcp->sample_ct;
5123   const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
5124   const uint32_t sample_ctv2 = NypCtToVecCt(sample_ct);
5125   const uint32_t raw_sample_ctaw2 = NypCtToAlignedWordCt(raw_sample_ct);
5126   const uint32_t raw_sample_ctaw = BitCtToAlignedWordCt(raw_sample_ct);
5127   const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
5128   const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
5129   const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
5130   const uint32_t y_code = cip->xymt_codes[kChrOffsetY];
5131   const uint32_t mt_code = cip->xymt_codes[kChrOffsetMT];
5132 
5133   const Plink2WriteFlags plink2_write_flags = mcp->plink2_write_flags;
5134   const uint32_t set_hh_missing = plink2_write_flags & kfPlink2WriteSetHhMissing;
5135   const uint32_t set_hh_missing_keep_dosage = plink2_write_flags & kfPlink2WriteSetHhMissingKeepDosage;
5136   const uint32_t set_mixed_mt_missing = plink2_write_flags & kfPlink2WriteSetMixedMtMissing;
5137   const uint32_t set_mixed_mt_missing_keep_dosage = plink2_write_flags & kfPlink2WriteSetMixedMtMissingKeepDosage;
5138   const uint32_t late_dosage_erase = plink2_write_flags & kfPlink2WriteLateDosageErase;
5139 
5140   const uint32_t hard_call_halfdist = mcp->hard_call_halfdist;
5141   const uint32_t dosage_erase_halfdist = ctx->dosage_erase_halfdist;
5142   const uintptr_t dosageraw_word_ct = kWordsPerVec * (BitCtToVecCt(raw_sample_ct) + DivUp(raw_sample_ct, (kBytesPerVec / sizeof(Dosage))));
5143 
5144   STPgenWriter* spgwp = ctx->spgwp;
5145   PgenWriterCommon* pwcp;
5146   if (spgwp) {
5147     // make this function stand out as an intrusive one
5148     pwcp = &GET_PRIVATE(*spgwp, pwc);
5149   } else {
5150     pwcp = ctx->pwcs[tidx];
5151   }
5152   uintptr_t* write_genovec = nullptr;
5153   // assumes sample_include == nullptr if sample_ct == raw_sample_ct
5154   if (new_sample_idx_to_old || sample_include) {
5155     write_genovec = ctx->thread_write_genovecs[tidx];
5156     write_genovec[sample_ctl2 - 1] = 0;
5157   }
5158   uintptr_t* write_patch_01_set = nullptr;
5159   AlleleCode* write_patch_01_vals = nullptr;
5160   uintptr_t* write_patch_10_set = nullptr;
5161   AlleleCode* write_patch_10_vals = nullptr;
5162   if (ctx->thread_write_mhc) {
5163     ExpandMhc(sample_ct, ctx->thread_write_mhc[tidx], &write_patch_01_set, &write_patch_01_vals, &write_patch_10_set, &write_patch_10_vals);
5164   }
5165   uintptr_t* write_phasepresent = nullptr;
5166   uintptr_t* write_phaseinfo = nullptr;
5167   uintptr_t* all_hets = nullptr;
5168   if (ctx->thread_write_phasepresents) {
5169     write_phasepresent = ctx->thread_write_phasepresents[tidx];
5170     write_phaseinfo = ctx->thread_write_phaseinfos[tidx];
5171     if (ctx->thread_all_hets) {
5172       all_hets = ctx->thread_all_hets[tidx];
5173     }
5174   }
5175   uintptr_t* write_dosagepresent = nullptr;
5176   Dosage* write_dosagevals = nullptr;
5177   uintptr_t* write_dphasepresent = nullptr;
5178   SDosage* write_dphasedeltas = nullptr;
5179   SDosage* tmp_dphasedeltas = nullptr;
5180   uint32_t* cumulative_popcount_buf = nullptr;
5181   if (ctx->thread_write_dosagepresents) {
5182     write_dosagepresent = ctx->thread_write_dosagepresents[tidx];
5183     write_dosagevals = ctx->thread_write_dosagevals[tidx];
5184     if (ctx->thread_write_dphasepresents) {
5185       write_dphasepresent = ctx->thread_write_dphasepresents[tidx];
5186       write_dphasedeltas = ctx->thread_write_dphasedeltas[tidx];
5187       tmp_dphasedeltas = &(write_dphasedeltas[RoundUpPow2(sample_ct, kCacheline / 2)]);
5188     }
5189   }
5190   if ((ctx->thread_write_mhc || ctx->thread_write_dosagepresents) && new_sample_idx_to_old) {
5191     cumulative_popcount_buf = ctx->thread_cumulative_popcount_bufs[tidx];
5192   }
5193   uint32_t variant_idx_offset = 0;
5194   uint32_t allele_ct = 2;
5195   uint32_t parity = 0;
5196   do {
5197     const uintptr_t cur_block_write_ct = ctx->cur_block_write_ct;
5198     uint32_t write_idx = tidx * kPglVblockSize;
5199     const uint32_t write_idx_end = MINV(write_idx + kPglVblockSize, cur_block_write_ct);
5200     uintptr_t* loadbuf_iter = ctx->loadbuf_thread_starts[parity][tidx];
5201     unsigned char* loaded_vrtypes = ctx->loaded_vrtypes[parity];
5202     uint32_t loaded_vrtype = 0;
5203     uint32_t chr_end_bidx = 0;
5204     uint32_t is_x = 0;
5205     uint32_t is_y = 0;
5206     uint32_t is_haploid_nonmt = 0;
5207     uint32_t is_mt = 0;
5208     // write_idx may start larger than write_idx_end
5209     for (; write_idx < write_idx_end; ++write_idx) {
5210       if (loaded_vrtypes) {
5211         loaded_vrtype = loaded_vrtypes[write_idx];
5212       }
5213       if (write_idx >= chr_end_bidx) {
5214         const uint32_t chr_fo_idx = CountSortedSmallerU32(&(write_chr_fo_vidx_start[1]), cip->chr_ct, write_idx + variant_idx_offset + 1);
5215         const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
5216         chr_end_bidx = write_chr_fo_vidx_start[chr_fo_idx + 1] - variant_idx_offset;
5217         is_x = (chr_idx == x_code);
5218         is_y = (chr_idx == y_code);
5219         is_mt = (chr_idx == mt_code);
5220         is_haploid_nonmt = IsSet(cip->haploid_mask, chr_idx) && (!is_mt);
5221       }
5222       uintptr_t* cur_genovec_end = &(loadbuf_iter[raw_sample_ctaw2]);
5223       if (write_allele_idx_offsets) {
5224         allele_ct = write_allele_idx_offsets[write_idx + variant_idx_offset + 1] - write_allele_idx_offsets[write_idx + variant_idx_offset];
5225       }
5226       const uint32_t is_mhc = loaded_vrtype & 8;
5227       uint32_t read_rare01_ct = 0;
5228       uint32_t read_rare10_ct = 0;
5229       uintptr_t* read_patch_01_set = nullptr;
5230       AlleleCode* read_patch_01_vals = nullptr;
5231       uintptr_t* read_patch_10_set = nullptr;
5232       AlleleCode* read_patch_10_vals = nullptr;
5233       if (is_mhc) {
5234         assert(allele_ct > 2);
5235         read_rare01_ct = cur_genovec_end[0];
5236         read_rare10_ct = cur_genovec_end[1];
5237         cur_genovec_end = &(cur_genovec_end[RoundUpPow2(2, kWordsPerVec)]);
5238         if (read_rare01_ct) {
5239           read_patch_01_set = cur_genovec_end;
5240           cur_genovec_end = &(cur_genovec_end[raw_sample_ctl]);
5241           read_patch_01_vals = R_CAST(AlleleCode*, cur_genovec_end);
5242           cur_genovec_end = &(cur_genovec_end[DivUp(read_rare01_ct, kBytesPerWord / sizeof(AlleleCode))]);
5243           VecAlignUp64(&cur_genovec_end);
5244         }
5245         if (read_rare10_ct) {
5246           read_patch_10_set = cur_genovec_end;
5247           cur_genovec_end = &(cur_genovec_end[raw_sample_ctl]);
5248           read_patch_10_vals = R_CAST(AlleleCode*, cur_genovec_end);
5249           cur_genovec_end = &(cur_genovec_end[DivUp(read_rare10_ct, kBytesPerWord / (2 * sizeof(AlleleCode)))]);
5250           VecAlignUp64(&cur_genovec_end);
5251         }
5252       }
5253       uint32_t is_hphase = loaded_vrtype & 0x10;
5254       uintptr_t* cur_phaseraw = nullptr;
5255       if (is_hphase) {
5256         // tried skipping this and using ExpandThenSubsetBytearr in simplest
5257         // case, not worthwhile
5258         if (!read_rare10_ct) {
5259           PgrDetectGenoarrHets(loadbuf_iter, raw_sample_ct, all_hets);
5260         } else {
5261           PgrDetectGenoarrHetsMultiallelic(loadbuf_iter, read_patch_10_set, read_patch_10_vals, raw_sample_ct, all_hets);
5262         }
5263         cur_phaseraw = cur_genovec_end;
5264         const uint32_t het_ct = S_CAST(uint32_t, cur_phaseraw[0]);
5265 #ifdef __LP64__
5266         const uint32_t explicit_phasepresent_ct = cur_phaseraw[0] >> 32;
5267 #else
5268         const uint32_t explicit_phasepresent_ct = cur_phaseraw[1];
5269 #endif
5270         const uint32_t phaseraw_word_ct = (8 / kBytesPerWord) + 1 + (het_ct / kBitsPerWord) + DivUp(explicit_phasepresent_ct, kBitsPerWord);
5271         cur_genovec_end = &(cur_genovec_end[RoundUpPow2(phaseraw_word_ct, kWordsPerVec)]);
5272       }
5273       const uint32_t is_dosage = loaded_vrtype & 0x60;
5274       const uint32_t is_dphase = loaded_vrtype & 0x80;
5275       uintptr_t* cur_write_phasepresent = write_phasepresent;
5276       uintptr_t* cur_dosagepresent = nullptr;
5277       Dosage* cur_dosagevals = nullptr;
5278       uintptr_t* cur_dphasepresent = nullptr;
5279       SDosage* cur_dphasedelta = nullptr;
5280       uint32_t read_dosage_ct = 0;
5281       uint32_t read_dphase_ct = 0;
5282       if (is_dosage) {
5283         // multiallelic dosage not implemented yet
5284         assert(allele_ct == 2);
5285 
5286         // this should have length dependent on dosage_ct
5287         cur_dosagepresent = cur_genovec_end;
5288         cur_dosagevals = R_CAST(Dosage*, &(cur_dosagepresent[raw_sample_ctaw]));
5289         read_dosage_ct = PopcountWords(cur_dosagepresent, raw_sample_ctl);
5290 
5291         // temporary
5292         cur_genovec_end = &(cur_genovec_end[dosageraw_word_ct]);
5293 
5294         if (is_dphase) {
5295           cur_dphasepresent = cur_genovec_end;
5296           cur_dphasedelta = R_CAST(SDosage*, &(cur_dphasepresent[raw_sample_ctaw]));
5297           read_dphase_ct = PopcountWords(cur_dphasepresent, raw_sample_ctl);
5298 
5299           // temporary
5300           cur_genovec_end = &(cur_genovec_end[dosageraw_word_ct]);
5301         }
5302       }
5303       uint32_t write_rare01_ct = 0;
5304       uint32_t write_rare10_ct = 0;
5305       uint32_t write_dosage_ct = 0;
5306       uint32_t write_dphase_ct = 0;
5307       if (new_sample_idx_to_old) {
5308         GenovecResort(loadbuf_iter, new_sample_idx_to_old, sample_ct, write_genovec);
5309         if (read_rare01_ct) {
5310           write_rare01_ct = CopyAndResort8bit(read_patch_01_set, read_patch_01_vals, new_sample_idx_to_old, raw_sample_ct, sample_ct, write_patch_01_set, write_patch_01_vals, cumulative_popcount_buf);
5311         }
5312         if (read_rare10_ct) {
5313           write_rare10_ct = CopyAndResort16bit(read_patch_10_set, read_patch_10_vals, new_sample_idx_to_old, raw_sample_ct, sample_ct, write_patch_10_set, write_patch_10_vals, cumulative_popcount_buf);
5314         }
5315         if (is_hphase) {
5316           UnpackAndResortHphase(all_hets, cur_phaseraw, sample_include, old_sample_idx_to_new, raw_sample_ct, sample_ct, &cur_write_phasepresent, write_phaseinfo);
5317         }
5318         if (is_dosage) {
5319           write_dosage_ct = CopyAndResort16bit(cur_dosagepresent, cur_dosagevals, new_sample_idx_to_old, raw_sample_ct, sample_ct, write_dosagepresent, write_dosagevals, cumulative_popcount_buf);
5320           if (is_dphase) {
5321             write_dphase_ct = CopyAndResort16bit(cur_dphasepresent, cur_dphasedelta, new_sample_idx_to_old, raw_sample_ct, sample_ct, write_dphasepresent, write_dphasedeltas, cumulative_popcount_buf);
5322           }
5323         }
5324       } else if (sample_include) {
5325         CopyNyparrNonemptySubset(loadbuf_iter, sample_include, raw_sample_ct, sample_ct, write_genovec);
5326         if (is_mhc) {
5327           write_rare01_ct = Copy1bit8Subset(read_patch_01_set, read_patch_01_vals, sample_include, read_rare01_ct, sample_ct, write_patch_01_set, write_patch_01_vals);
5328           write_rare10_ct = Copy1bit16Subset(read_patch_10_set, read_patch_10_vals, sample_include, read_rare10_ct, sample_ct, write_patch_10_set, write_patch_10_vals);
5329         }
5330         if (is_hphase) {
5331           UnpackHphaseSubset(all_hets, cur_phaseraw, sample_include, sample_ct, &cur_write_phasepresent, write_phaseinfo);
5332         }
5333         if (is_dosage) {
5334           write_dosage_ct = Copy1bit16Subset(cur_dosagepresent, cur_dosagevals, sample_include, read_dosage_ct, sample_ct, write_dosagepresent, write_dosagevals);
5335           if (is_dphase) {
5336             write_dphase_ct = Copy1bit16Subset(cur_dphasepresent, cur_dphasedelta, sample_include, read_dphase_ct, sample_ct, write_dphasepresent, write_dphasedeltas);
5337           }
5338         }
5339       } else {
5340         write_genovec = loadbuf_iter;
5341         if (is_mhc) {
5342           // this doesn't work in refalt1_select case
5343           write_patch_01_set = read_patch_01_set;
5344           write_patch_01_vals = read_patch_01_vals;
5345           write_patch_10_set = read_patch_10_set;
5346           write_patch_10_vals = read_patch_10_vals;
5347           write_rare01_ct = read_rare01_ct;
5348           write_rare10_ct = read_rare10_ct;
5349         }
5350         if (is_hphase) {
5351           UnpackHphase(all_hets, cur_phaseraw, sample_ct, &cur_write_phasepresent, write_phaseinfo);
5352         }
5353         if (is_dosage) {
5354           CopyDosage(cur_dosagepresent, cur_dosagevals, sample_ct, read_dosage_ct, write_dosagepresent, write_dosagevals, &write_dosage_ct);
5355           if (is_dphase) {
5356             CopyDosage(cur_dphasepresent, R_CAST(Dosage*, cur_dphasedelta), sample_ct, read_dphase_ct, write_dphasepresent, R_CAST(Dosage*, write_dphasedeltas), &write_dphase_ct);
5357           }
5358         }
5359       }
5360       // multiallelic -> biallelic split:
5361       //   main thread will probably compute split mapping in advance (bitarray
5362       //   with filtered-and-split variant indices, set bit = unsplit variant
5363       //   or last variant in a split group)?  same pre-split variant can be
5364       //   loaded multiple times.
5365       // biallelic -> multiallelic merge:
5366       //   could require no multiallelic variants in remainder of dataset?
5367       //   if handled with pgenlib, PgfiMultiread,
5368       //   PgfiMultireadGetCachelineReq, and PgrGetRaw would need to be
5369       //   extended to take a merge-info parameter.  +both will be tricky...
5370       //   probably not worth it.
5371       //   compute merge pattern in MakePgenRobust() before main loop instead.
5372       //   main thread also performs the actual merge.
5373       // both should require sorted .pvar.
5374       // neither should require any handling in this function.
5375       if (refalt1_select_iter && (refalt1_select_iter[write_idx][0] || (refalt1_select_iter[write_idx][1] != 1))) {
5376         if (allele_ct == 2) {
5377           GenovecInvertUnsafe(sample_ct, write_genovec);
5378           if (is_hphase) {
5379             // trailing bits don't matter
5380             BitvecInvert(sample_ctl, write_phaseinfo);
5381           }
5382           if (write_dosage_ct) {
5383             BiallelicDosage16Invert(write_dosage_ct, write_dosagevals);
5384             if (write_dphase_ct) {
5385               BiallelicDphase16Invert(write_dphase_ct, write_dphasedeltas);
5386             }
5387           }
5388         } else {
5389           exit(S_CAST(int32_t, kPglRetNotYetSupported));
5390           // this is the fun case
5391           // 1. fill length-(2 * sample_ct) AlleleCode[] buffer with codes
5392           // 2. fill lookup table describing remapping
5393           // 3. replace elements of table
5394           // 4. normalize order of each code pair, inverting a phaseinfo bit on
5395           //    each swap
5396           // 5. call PglMultiallelicDenseToSparse to write back
5397         }
5398       }
5399       if (write_dosage_ct) {
5400         assert((!write_rare01_ct) && (!write_rare10_ct));
5401         if (hard_call_halfdist || (dosage_erase_halfdist < kDosage4th)) {
5402           if (is_hphase && (!cur_write_phasepresent)) {
5403             // explicit phasepresent required for these
5404             cur_write_phasepresent = write_phasepresent;
5405             // unsafe to just copy all_hets, because we may have resorted
5406             // todo: multiallelic dosage
5407             PgrDetectGenoarrHets(write_genovec, sample_ct, write_phasepresent);
5408           }
5409           if (write_dphasepresent && is_hphase && (!write_dphase_ct)) {
5410             // bugfix (29 Apr 2019): write_dphasepresent not guaranteed to be
5411             // non-null.
5412             ZeroWArr(sample_ctl, write_dphasepresent);
5413           }
5414           if (hard_call_halfdist) {
5415             if ((!is_hphase) && (!write_dphase_ct)) {
5416               ApplyHardCallThresh(write_dosagepresent, write_dosagevals, write_dosage_ct, hard_call_halfdist, write_genovec);
5417             } else {
5418               if (!is_hphase) {
5419                 ZeroWArr(sample_ctl, write_phasepresent);
5420               }
5421               write_dphase_ct = ApplyHardCallThreshPhased(write_dosagepresent, write_dosagevals, write_dosage_ct, hard_call_halfdist, write_genovec, write_phasepresent, write_phaseinfo, write_dphasepresent, write_dphasedeltas, tmp_dphasedeltas);
5422               is_hphase = !AllWordsAreZero(write_phasepresent, sample_ctl);
5423             }
5424           }
5425           if (dosage_erase_halfdist < kDosage4th) {
5426             if (!is_hphase) {
5427               ZeroWArr(sample_ctl, write_phasepresent);
5428             }
5429             uint32_t dosage_read_idx = 0;
5430             uintptr_t sample_widx = 0;
5431             uintptr_t cur_bits = write_dosagepresent[0];
5432             uint32_t dosage_write_idx;
5433             if (!write_dphase_ct) {
5434               // If hardcall-phase and dosage present, threshold/2 applies
5435               // thanks to implicit dosage-phase value
5436               // const uint32_t dosage_erase_halfdist2 = (dosage_erase_halfdist + kDosage4th + 1) / 2;
5437               const uint32_t halfdist_extra = (kDosage4th + 1 - dosage_erase_halfdist) / 2;
5438               for (; dosage_read_idx != write_dosage_ct; ++dosage_read_idx) {
5439                 const uint32_t sample_uidx_lowbits = BitIter1x(write_dosagepresent, &sample_widx, &cur_bits);
5440                 const uint32_t dosage_int = write_dosagevals[dosage_read_idx];
5441                 const uint32_t halfdist = BiallelicDosageHalfdist(dosage_int);
5442                 if (halfdist >= dosage_erase_halfdist + ((write_phasepresent[sample_widx] >> sample_uidx_lowbits) & 1) * halfdist_extra) {
5443                   write_dosagepresent[sample_widx] ^= k1LU << sample_uidx_lowbits;
5444                   break;
5445                 }
5446               }
5447               dosage_write_idx = dosage_read_idx;
5448               while (++dosage_read_idx < write_dosage_ct) {
5449                 const uint32_t sample_uidx_lowbits = BitIter1x(write_dosagepresent, &sample_widx, &cur_bits);
5450                 const uint32_t dosage_int = write_dosagevals[dosage_read_idx];
5451                 const uint32_t halfdist = BiallelicDosageHalfdist(dosage_int);
5452                 if (halfdist < dosage_erase_halfdist + ((write_phasepresent[sample_widx] >> sample_uidx_lowbits) & 1) * halfdist_extra) {
5453                   write_dosagevals[dosage_write_idx++] = dosage_int;
5454                 } else {
5455                   write_dosagepresent[sample_widx] ^= k1LU << sample_uidx_lowbits;
5456                 }
5457               }
5458             } else {
5459               // Only erase dosage if both sides are less than threshold/2
5460               // away from an integer.
5461               const uint32_t halfdist_extra = (kDosage4th + 1 - dosage_erase_halfdist) / 2;
5462               const uint32_t dosage_erase_halfdist2 = dosage_erase_halfdist + halfdist_extra;
5463               uint32_t dphase_read_idx = 0;
5464               uintptr_t lowbit = 0;
5465               for (; dosage_read_idx != write_dosage_ct; ++dosage_read_idx) {
5466                 lowbit = BitIter1y(write_dosagepresent, &sample_widx, &cur_bits);
5467                 const uint32_t dosage_int = write_dosagevals[dosage_read_idx];
5468                 if (!(write_dphasepresent[sample_widx] & lowbit)) {
5469                   // necessary for this to be separate to handle odd
5470                   // dosage_int, missing phase case correctly
5471                   const uint32_t halfdist = BiallelicDosageHalfdist(dosage_int);
5472                   if (halfdist >= dosage_erase_halfdist + ((write_phasepresent[sample_widx] & lowbit) != 0) * halfdist_extra) {
5473                     break;
5474                   }
5475                 } else {
5476                   const int32_t dphase_delta = write_dphasedeltas[dphase_read_idx++];
5477                   const uint32_t halfdist1 = HaploidDosageHalfdist((dosage_int + dphase_delta) >> 1);
5478                   const uint32_t halfdist2 = HaploidDosageHalfdist((dosage_int - dphase_delta) >> 1);
5479                   if ((halfdist1 >= dosage_erase_halfdist2) && (halfdist2 >= dosage_erase_halfdist2)) {
5480                     break;
5481                   }
5482                 }
5483               }
5484               dosage_write_idx = dosage_read_idx;
5485               if (dosage_read_idx < write_dosage_ct) {
5486                 uint32_t dphase_write_idx = dphase_read_idx;
5487                 if (write_dphasepresent[sample_widx] & lowbit) {
5488                   --dphase_write_idx;
5489                   write_dphasepresent[sample_widx] ^= lowbit;
5490                 }
5491                 write_dosagepresent[sample_widx] ^= lowbit;
5492                 while (++dosage_read_idx < write_dosage_ct) {
5493                   lowbit = BitIter1y(write_dosagepresent, &sample_widx, &cur_bits);
5494                   const uint32_t dosage_int = write_dosagevals[dosage_read_idx];
5495                   if (!(write_dphasepresent[sample_widx] & lowbit)) {
5496                     const uint32_t halfdist = BiallelicDosageHalfdist(dosage_int);
5497                     if (halfdist < dosage_erase_halfdist + ((write_phasepresent[sample_widx] & lowbit) != 0) * halfdist_extra) {
5498                       write_dosagevals[dosage_write_idx++] = dosage_int;
5499                     } else {
5500                       write_dosagepresent[sample_widx] ^= lowbit;
5501                     }
5502                   } else {
5503                     const int32_t dphase_delta = write_dphasedeltas[dphase_read_idx++];
5504                     const uint32_t halfdist1 = HaploidDosageHalfdist((dosage_int + dphase_delta) >> 1);
5505                     const uint32_t halfdist2 = HaploidDosageHalfdist((dosage_int - dphase_delta) >> 1);
5506                     if ((halfdist1 < dosage_erase_halfdist2) || (halfdist2 < dosage_erase_halfdist2)) {
5507                       write_dosagevals[dosage_write_idx++] = dosage_int;
5508                       write_dphasedeltas[dphase_write_idx++] = dphase_delta;
5509                     } else {
5510                       write_dosagepresent[sample_widx] ^= lowbit;
5511                       write_dphasepresent[sample_widx] ^= lowbit;
5512                     }
5513                   }
5514                 }
5515                 write_dphase_ct = dphase_write_idx;
5516               }
5517             }
5518             write_dosage_ct = dosage_write_idx;
5519           }
5520         }
5521         if (late_dosage_erase) {
5522           write_dosage_ct = 0;
5523           write_dphase_ct = 0;
5524         }
5525       }
5526       // moved after --hard-call-threshold, since it makes sense to
5527       // immediately erase fresh het haploid calls
5528       if (set_hh_missing && is_haploid_nonmt) {
5529         if (is_x) {
5530           EraseMaleDphases(sex_male_collapsed, &write_dphase_ct, write_dphasepresent, write_dphasedeltas);
5531           if (!set_hh_missing_keep_dosage) {
5532             // need to erase dosages associated with the hardcalls we're
5533             // about to clear
5534 
5535             // male 0/x hets to missing
5536             SetMaleHetMissingCleardosage(sex_male_collapsed, sex_male_collapsed_interleaved, sample_ctv2, write_genovec, &write_dosage_ct, write_dosagepresent, write_dosagevals);
5537             // male x/y hets to missing
5538             if (write_rare10_ct) {
5539               uintptr_t sample_widx = 0;
5540               uintptr_t patch_10_bits = write_patch_10_set[0];
5541               uint32_t read_patch_10_idx = 0;
5542               for (; read_patch_10_idx != write_rare10_ct; ++read_patch_10_idx) {
5543                 uintptr_t lowbit = BitIter1y(write_patch_10_set, &sample_widx, &patch_10_bits);
5544                 AlleleCode lo_code = write_patch_10_vals[read_patch_10_idx * 2];
5545                 AlleleCode hi_code = write_patch_10_vals[read_patch_10_idx * 2 + 1];
5546                 if ((sex_male_collapsed[sample_widx] & lowbit) && (lo_code != hi_code)) {
5547                   write_patch_10_set[sample_widx] ^= lowbit;
5548                   uint32_t write_patch_10_idx = read_patch_10_idx;
5549                   ++read_patch_10_idx;
5550                   for (; read_patch_10_idx != write_rare10_ct; ++read_patch_10_idx) {
5551                     lowbit = BitIter1y(write_patch_10_set, &sample_widx, &patch_10_bits);
5552                     lo_code = write_patch_10_vals[read_patch_10_idx * 2];
5553                     hi_code = write_patch_10_vals[read_patch_10_idx * 2 + 1];
5554                     if ((sex_male_collapsed[sample_widx] & lowbit) && (lo_code != hi_code)) {
5555                       write_patch_10_set[sample_widx] ^= lowbit;
5556                     } else {
5557                       write_patch_10_vals[write_patch_10_idx * 2] = lo_code;
5558                       write_patch_10_vals[write_patch_10_idx * 2 + 1] = hi_code;
5559                       ++write_patch_10_idx;
5560                     }
5561                   }
5562                   write_rare10_ct = write_patch_10_idx;
5563                   break;
5564                 }
5565               }
5566             }
5567           } else {
5568             assert(!write_rare01_ct);
5569             assert(!write_rare10_ct);
5570             // need to generate a new unphased dosage for each cleared
5571             // hardcall lacking a dosage entry
5572             SetMaleHetMissingKeepdosage(sex_male_collapsed, sex_male_collapsed_interleaved, sample_ctl2, write_genovec, &write_dosage_ct, write_dosagepresent, write_dosagevals);
5573           }
5574           if (is_hphase && cur_write_phasepresent) {
5575             // bugfix (28 Jul 2018): I was on crack when I moved this code
5576             // before SetMaleHetMissing{Clear,Keep}dosage() on 31 Mar
5577             if (!write_rare10_ct) {
5578               MaskGenoarrHetsUnsafe(write_genovec, sample_ctl2, cur_write_phasepresent);
5579             } else {
5580               MaskGenoarrHetsMultiallelicUnsafe(write_genovec, write_patch_10_set, write_patch_10_vals, sample_ctl2, cur_write_phasepresent);
5581             }
5582             is_hphase = !AllWordsAreZero(write_phasepresent, sample_ctl);
5583           }
5584           if (write_rare01_ct) {
5585             ClearGenoarrMissing1bit8Unsafe(write_genovec, &write_rare01_ct, write_patch_01_set, write_patch_01_vals);
5586           }
5587           if (write_rare10_ct) {
5588             ClearGenoarrMissing1bit16Unsafe(write_genovec, &write_rare10_ct, write_patch_10_set, write_patch_10_vals);
5589           }
5590         } else {
5591           // all hets to missing
5592           // may want to move is_hphase zeroing in front
5593           if (!set_hh_missing_keep_dosage) {
5594             SetHetMissingCleardosage(sample_ctl2, write_genovec, &write_dosage_ct, write_dosagepresent, write_dosagevals);
5595           } else {
5596             SetHetMissingKeepdosage(sample_ctl2, write_genovec, &write_dosage_ct, write_dosagepresent, write_dosagevals);
5597           }
5598           if (is_y) {
5599             InterleavedSetMissingCleardosage(sex_female_collapsed, sex_female_collapsed_interleaved, sample_ctv2, write_genovec, &write_dosage_ct, write_dosagepresent, write_dosagevals);
5600           }
5601           is_hphase = 0;
5602           write_rare01_ct = 0;
5603           if (write_rare10_ct) {
5604             ClearGenoarrMissing1bit16Unsafe(write_genovec, &write_rare10_ct, write_patch_10_set, write_patch_10_vals);
5605           }
5606           write_dphase_ct = 0;
5607         }
5608       } else if (set_mixed_mt_missing && is_mt) {
5609         if (!set_mixed_mt_missing_keep_dosage) {
5610           // all hets to missing
5611           SetHetMissingCleardosage(sample_ctl2, write_genovec, &write_dosage_ct, write_dosagepresent, write_dosagevals);
5612         } else {
5613           SetHetMissingKeepdosage(sample_ctl2, write_genovec, &write_dosage_ct, write_dosagepresent, write_dosagevals);
5614         }
5615         is_hphase = 0;
5616         write_rare01_ct = 0;
5617         if (write_rare10_ct) {
5618           ClearGenoarrMissing1bit16Unsafe(write_genovec, &write_rare10_ct, write_patch_10_set, write_patch_10_vals);
5619         }
5620         write_dphase_ct = 0;
5621       }
5622       ZeroTrailingNyps(sample_ct, write_genovec);
5623       // todo: --set-me-missing, --zero-cluster, --fill-missing-with-ref
5624       if (spgwp) {
5625         if (pwcp->fwrite_bufp >= &(pwcp->fwrite_buf[kPglFwriteBlockSize])) {
5626           const uintptr_t cur_byte_ct = pwcp->fwrite_bufp - pwcp->fwrite_buf;
5627           if (unlikely(fwrite_checked(pwcp->fwrite_buf, cur_byte_ct, GET_PRIVATE(*spgwp, pgen_outfile)))) {
5628             ctx->write_reterr = kPglRetWriteFail;
5629             ctx->write_errno = errno;
5630             break;
5631           }
5632           // printf("vblock_fpos_offset: %llu\n", pwcp->vblock_fpos_offset);
5633           pwcp->vblock_fpos_offset += cur_byte_ct;
5634           // printf("%u %llu\n", write_idx + variant_idx_offset, pwcp->vblock_fpos_offset);
5635           pwcp->fwrite_bufp = pwcp->fwrite_buf;
5636         }
5637       }
5638       if ((!write_rare01_ct) && (!write_rare10_ct)) {
5639         if ((!is_hphase) && (!write_dphase_ct)) {
5640           if (unlikely(PwcAppendBiallelicGenovecDosage16(write_genovec, write_dosagepresent, write_dosagevals, write_dosage_ct, pwcp))) {
5641             ctx->write_reterr = kPglRetVarRecordTooLarge;
5642             break;
5643           }
5644         } else {
5645           if (!is_hphase) {
5646             ZeroWArr(sample_ctl, write_phasepresent);
5647           }
5648           // extraneous phaseinfo bits may be set
5649           if (unlikely(PwcAppendBiallelicGenovecDphase16(write_genovec, cur_write_phasepresent, write_phaseinfo, write_dosagepresent, write_dphasepresent, write_dosagevals, write_dphasedeltas, write_dosage_ct, write_dphase_ct, pwcp))) {
5650             ctx->write_reterr = kPglRetVarRecordTooLarge;
5651             break;
5652           }
5653         }
5654       } else {
5655         // multiallelic dosage not supported
5656         if (!is_hphase) {
5657           if (unlikely(PwcAppendMultiallelicSparse(write_genovec, write_patch_01_set, write_patch_01_vals, write_patch_10_set, write_patch_10_vals, write_rare01_ct, write_rare10_ct, pwcp))) {
5658             ctx->write_reterr = kPglRetVarRecordTooLarge;
5659             break;
5660           }
5661         } else {
5662           if (unlikely(PwcAppendMultiallelicGenovecHphase(write_genovec, write_patch_01_set, write_patch_01_vals, write_patch_10_set, write_patch_10_vals, cur_write_phasepresent, write_phaseinfo, write_rare01_ct, write_rare10_ct, pwcp))) {
5663             ctx->write_reterr = kPglRetVarRecordTooLarge;
5664             break;
5665           }
5666         }
5667       }
5668       loadbuf_iter = cur_genovec_end;
5669     }
5670     parity = 1 - parity;
5671     variant_idx_offset += cur_block_write_ct;
5672     if (refalt1_select_iter) {
5673       refalt1_select_iter = &(refalt1_select_iter[cur_block_write_ct]);
5674     }
5675   } while (!THREAD_BLOCK_FINISH(arg));
5676   THREAD_RETURN;
5677 }
5678 
GflagsVfilter(const uintptr_t * variant_include,const unsigned char * vrtypes,uint32_t raw_variant_ct,PgenGlobalFlags input_gflags)5679 PgenGlobalFlags GflagsVfilter(const uintptr_t* variant_include, const unsigned char* vrtypes, uint32_t raw_variant_ct, PgenGlobalFlags input_gflags) {
5680   PgenGlobalFlags read_gflags = kfPgenGlobal0;
5681   const uintptr_t* vrtypes_alias = R_CAST(const uintptr_t*, vrtypes);
5682   const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
5683   uint32_t mask_multiply = ((input_gflags & kfPgenGlobalHardcallPhasePresent)? 0x10 : 0) + ((input_gflags & kfPgenGlobalDosagePresent)? 0x60 : 0) + ((input_gflags & kfPgenGlobalDosagePhasePresent)? 0x80 : 0);
5684   uintptr_t vrtypes_or = 0;
5685   // todo: try changing loop to be vec-based, use movemask to extract
5686   // information from vrtypes in 64-bit cases
5687   for (uint32_t widx = 0; widx != raw_variant_ctl; ++widx) {
5688     uintptr_t cur_variant_include_word = variant_include[widx];
5689     if (cur_variant_include_word) {
5690       // bugfix (20 Aug 2018): this needs to advance on every variant_include
5691       // word, not just the nonzero ones
5692       const uintptr_t* cur_vrtypes = &(vrtypes_alias[8 * widx]);
5693 #ifdef __LP64__
5694       for (uint32_t vi_byte_idx = 0; vi_byte_idx != 8; ++vi_byte_idx) {
5695 #  ifdef USE_AVX2
5696         // this doesn't seem to be much faster than non-AVX2 code on my Mac...
5697         // inverse-movemask shouldn't be better than regular movemask here
5698         const uintptr_t cur_mask = _pdep_u64(cur_variant_include_word, kMask0101);
5699 #  else
5700         // this operation maps binary hgfedcba to h0000000g0000000f...
5701         //                                        ^       ^       ^
5702         //                                        |       |       |
5703         //                                       56      48      40
5704         // 1. (cur_variant_include_word & 0xfe) gives us hgfedcb0;
5705         //    necessary to avoid carryover.
5706         // 2. multiply by the number with bits 7, 14, 21, ..., 49 set, to
5707         //    get hgfedcbhgfedcbhgf...
5708         //        ^       ^       ^
5709         //        |       |       |
5710         //       56      48      40
5711         // 3. mask out all but bits 8, 16, 24, ..., 56
5712         // todo: test if this actually beats the per-character loop...
5713         const uintptr_t cur_mask = (((cur_variant_include_word & 0xfe) * 0x2040810204080LLU) & kMask0101) | (cur_variant_include_word & 1);
5714 #  endif
5715         vrtypes_or |= cur_vrtypes[vi_byte_idx] & (cur_mask * mask_multiply);
5716         cur_variant_include_word >>= 8;
5717       }
5718 #else
5719       for (uint32_t vi_hexa_idx = 0; vi_hexa_idx != 8; ++vi_hexa_idx) {
5720         // dcba -> d0000000c0000000b0000000a
5721         const uintptr_t cur_mask = ((cur_variant_include_word & 0xf) * 0x204081) & kMask0101;
5722         vrtypes_or |= cur_vrtypes[vi_hexa_idx] & (cur_mask * mask_multiply);
5723         cur_variant_include_word >>= 4;
5724       }
5725 #endif
5726       if (vrtypes_or) {
5727         // bugfix (8 Oct 2017): forgot to multiply by kMask0101
5728         if (vrtypes_or & (0x10 * kMask0101)) {
5729           read_gflags |= kfPgenGlobalHardcallPhasePresent;
5730           mask_multiply -= 0x10;
5731         }
5732         if (vrtypes_or & (0x60 * kMask0101)) {
5733           read_gflags |= kfPgenGlobalDosagePresent;
5734           mask_multiply -= 0x60;
5735         }
5736         if (vrtypes_or & (0x80 * kMask0101)) {
5737           read_gflags |= kfPgenGlobalDosagePhasePresent;
5738           mask_multiply -= 0x80;
5739         }
5740         if (!mask_multiply) {
5741           return read_gflags;
5742         }
5743       }
5744     }
5745   }
5746   return read_gflags;
5747 }
5748 
SplitNonrefFlags()5749 void SplitNonrefFlags() {
5750   logerrputs("Provisional-reference flag split is not implemented yet.\n");
5751   exit(S_CAST(int32_t, kPglRetNotYetSupported));
5752 }
5753 
JoinNonrefFlags()5754 void JoinNonrefFlags() {
5755   logerrputs("Provisional-reference flag join is not implemented yet.\n");
5756   exit(S_CAST(int32_t, kPglRetNotYetSupported));
5757 }
5758 
5759 // Single-output-thread implementation.  Allows variants to be unsorted.
5760 // (Note that MakePlink2NoVsort() currently requires enough memory for 64k * 2
5761 // variants per output thread, due to LD compression.  This is faster in the
5762 // common case, but once you have 150k+ samples with dosage data...)
5763 //
5764 // initialized mcp fields: cip, sex_male_collapsed_interleaved,
5765 // sex_female_collapsed_interleaved, raw_sample_ct, sample_ct,
5766 // plink2_write_flags
5767 PglErr MakePgenRobust(const uintptr_t* sample_include, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const uintptr_t* allele_idx_offsets, __maybe_unused const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* write_allele_idx_offsets, const uint32_t* new_variant_idx_to_old, const uintptr_t* sex_male_collapsed, uintptr_t* sex_female_collapsed, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t write_variant_ct, uint32_t max_read_allele_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, MakePlink2Flags make_plink2_flags, MakeCommon* mcp, PgenReader* simple_pgrp, char* outname, char* outname_end) {
  // new_variant_idx_to_old[] can be nullptr
5769 
5770   unsigned char* bigstack_mark = g_bigstack_base;
5771   PglErr reterr = kPglRetSuccess;
5772   ThreadGroup tg;
5773   PreinitThreads(&tg);
5774   STPgenWriter spgw;
5775   PreinitSpgw(&spgw);
5776   MakePgenCtx ctx;
5777   {
5778     // plink2_write_flags assumed to include --set-hh-missing and
5779     //   --set-mixed-mt-missing
5780     // sex_{fe}male_collapsed_interleaved assumed to be initialized if
5781     //   necessary
5782 
5783     if (unlikely(SetThreadCt(1, &tg))) {
5784       goto MakePgenRobust_ret_NOMEM;
5785     }
5786     ctx.spgwp = &spgw;
5787     const uint32_t raw_sample_ct = mcp->raw_sample_ct;
5788     const uint32_t sample_ct = mcp->sample_ct;
5789     const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
5790     const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
5791     mcp->sample_include = subsetting_required? sample_include : nullptr;
5792     ctx.new_sample_idx_to_old = new_sample_idx_to_old;
5793     ctx.sex_male_collapsed = sex_male_collapsed;
5794     ctx.sex_female_collapsed = sex_female_collapsed;
5795     ctx.write_reterr = kPglRetSuccess;
5796     if ((make_plink2_flags & kfMakeBed) || ((make_plink2_flags & (kfMakePgen | (kfMakePgenFormatBase * 3))) == (kfMakePgen | kfMakePgenFormatBase))) {
5797       logerrputs("Error: Fixed-width .bed/.pgen output doesn't support sorting yet.  Generate a\nregular sorted .pgen first, and then reformat it.\n");
5798       reterr = kPglRetNotYetSupported;
5799       goto MakePgenRobust_ret_1;
5800     } else {
5801       const uint32_t input_biallelic = (!allele_idx_offsets);
5802       // output_biallelic: test write_allele_idx_offsets equality to null
5803       ctx.write_allele_idx_offsets = write_allele_idx_offsets;
5804       if ((variant_ct == raw_variant_ct) || new_variant_idx_to_old) {
5805         ctx.write_chr_fo_vidx_start = mcp->cip->chr_fo_vidx_start;
5806       } else {
5807         if (unlikely(AllocAndFillSubsetChrFoVidxStart(variant_include, mcp->cip, &ctx.write_chr_fo_vidx_start))) {
5808           goto MakePgenRobust_ret_NOMEM;
5809         }
5810       }
5811       PgenGlobalFlags read_gflags = PgrGetGflags(simple_pgrp) & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
5812       if (make_plink2_flags & (kfMakePlink2MJoin | kfMakePlink2EraseAlt2Plus)) {
5813         logerrputs("Error: multiallelic-join and 'erase-alt2+' modifiers are under development.\n");
5814         reterr = kPglRetNotYetSupported;
5815         goto MakePgenRobust_ret_1;
5816       }
5817       if (make_plink2_flags & kfMakePgenErasePhase) {
5818         read_gflags &= ~(kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent);
5819       }
5820       if (make_plink2_flags & kfMakePgenEraseDosage) {
5821         if (hard_call_thresh == UINT32_MAX) {
5822           read_gflags &= ~(kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
5823         } else {
5824           // bugfix (11 Apr 2018): this was in the wrong branch
5825           mcp->plink2_write_flags |= kfPlink2WriteLateDosageErase;
5826         }
5827       }
5828       if (read_gflags && (variant_ct < raw_variant_ct)) {
5829         read_gflags &= GflagsVfilter(variant_include, PgrGetVrtypes(simple_pgrp), raw_variant_ct, PgrGetGflags(simple_pgrp));
5830       }
5831       if (!input_biallelic) {
5832         // todo: conditional erase-alt2+ exception
5833         read_gflags |= kfPgenGlobalMultiallelicHardcallFound;
5834       }
5835       const uint32_t read_dosage_present = (read_gflags / kfPgenGlobalDosagePresent) & 1;
5836       // bugfix (25 Jul 2018): left expression needs ||, not &&
5837       mcp->hard_call_halfdist = ((hard_call_thresh == UINT32_MAX) || (!read_dosage_present))? 0 : (kDosage4th - hard_call_thresh);
5838       ctx.dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
5839       // bugfix/simplification (10 Mar 2020): it is possible for dosage-phase
5840       // to be present in the input without hardcall-phase.  Don't try to treat
5841       // that differently than the usual scenario where hardcall-phase is
5842       // present.
5843       const uint32_t read_phase_present = !!(read_gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent));
5844       const uint32_t read_dphase_present = (read_gflags / kfPgenGlobalDosagePhasePresent) & 1;
5845       PgenGlobalFlags write_gflags = read_gflags;
5846       // When --hard-call-threshold is specified, if either hphase or dphase
5847       // values exist, the other can be generated.
5848       uint32_t read_or_write_phase_present = read_phase_present;
5849       uint32_t read_or_write_dphase_present = read_dphase_present;
5850       if (mcp->hard_call_halfdist && (read_phase_present || read_or_write_dphase_present)) {
5851         read_or_write_phase_present = 1;
5852         read_or_write_dphase_present = 1;
5853         write_gflags |= kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent;
5854       } else if (dosage_erase_thresh && read_dosage_present) {
5855         // need write_phasepresent, pretty harmless to allocate write_phaseinfo
5856         read_or_write_phase_present = 1;
5857       }
5858       uint32_t read_or_write_dosage_present = read_dosage_present;
5859       if (mcp->plink2_write_flags & kfPlink2WriteLateDosageErase) {
5860         write_gflags &= ~(kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
5861       } else if (mcp->plink2_write_flags & (kfPlink2WriteSetHhMissingKeepDosage | kfPlink2WriteSetMixedMtMissingKeepDosage)) {
5862         // bugfix (25 Jul 2018): this needs to check plink2_write_flags, not
5863         // make_plink2_flags
5864 
5865         // command-line parser guarantees erase-dosage and
5866         // --set-hh-missing/--set-mixed-mt-missing keep-dosage aren't used
5867         // together
5868         read_or_write_dosage_present = 1;
5869 
5870         // could verify at least one het haploid is present before setting this
5871         // flag...
5872         write_gflags |= kfPgenGlobalDosagePresent;
5873       }
5874       if ((write_gflags & (kfPgenGlobalMultiallelicHardcallFound | kfPgenGlobalDosagePresent)) == (kfPgenGlobalMultiallelicHardcallFound | kfPgenGlobalDosagePresent)) {
5875         logerrputs("Error: Multiallelic dosages aren't supported yet.\n");
5876         reterr = kPglRetNotYetSupported;
5877         goto MakePgenRobust_ret_1;
5878       }
5879 
5880       uint32_t nonref_flags_storage = 3;
5881       uintptr_t* nonref_flags_write = PgrGetNonrefFlags(simple_pgrp);
5882       if (!nonref_flags_write) {
5883         nonref_flags_storage = (PgrGetGflags(simple_pgrp) & kfPgenGlobalAllNonref)? 2 : 1;
5884       } else if (variant_ct < raw_variant_ct) {
5885         const uint32_t write_variant_ctl = BitCtToWordCt(write_variant_ct);
5886         uintptr_t* old_nonref_flags = nonref_flags_write;
5887         if (bigstack_alloc_w(write_variant_ctl, &nonref_flags_write)) {
5888           goto MakePgenRobust_ret_NOMEM;
5889         }
5890         if ((variant_ct == write_variant_ct) && (!new_variant_idx_to_old)) {
5891           CopyBitarrSubset(old_nonref_flags, variant_include, variant_ct, nonref_flags_write);
5892         } else {
5893           ZeroWArr(write_variant_ctl, nonref_flags_write);
5894           if (variant_ct == write_variant_ct) {
5895             for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
5896               const uintptr_t variant_uidx = new_variant_idx_to_old[variant_idx];
5897               if (IsSet(old_nonref_flags, variant_uidx)) {
5898                 SetBit(variant_idx, nonref_flags_write);
5899               }
5900             }
5901           } else if (!write_allele_idx_offsets) {
5902             SplitNonrefFlags();
5903           } else {
5904             JoinNonrefFlags();
5905           }
5906         }
5907         if (nonref_flags_write[0] & 1) {
5908           if (AllBitsAreOne(nonref_flags_write, write_variant_ct)) {
5909             BigstackReset(nonref_flags_write);
5910             nonref_flags_write = nullptr;
5911             nonref_flags_storage = 2;
5912           }
5913         } else if (AllWordsAreZero(nonref_flags_write, write_variant_ctl)) {
5914           BigstackReset(nonref_flags_write);
5915           nonref_flags_write = nullptr;
5916           nonref_flags_storage = 1;
5917         }
5918       }
5919       snprintf(outname_end, kMaxOutfnameExtBlen, ".pgen");
5920       uintptr_t spgw_alloc_cacheline_ct;
5921       uint32_t max_vrec_len;
5922       reterr = SpgwInitPhase1(outname, write_allele_idx_offsets, nonref_flags_write, write_variant_ct, sample_ct, write_gflags, nonref_flags_storage, ctx.spgwp, &spgw_alloc_cacheline_ct, &max_vrec_len);
5923       if (unlikely(reterr)) {
5924         if (reterr == kPglRetOpenFail) {
5925           logerrprintfww(kErrprintfFopen, outname, strerror(errno));
5926         }
5927         goto MakePgenRobust_ret_1;
5928       }
5929       unsigned char* spgw_alloc;
5930       if (unlikely(
5931               bigstack_alloc_wp(1, &(ctx.loadbuf_thread_starts[0])) ||
5932               bigstack_alloc_wp(1, &(ctx.loadbuf_thread_starts[1])) ||
5933               bigstack_alloc_uc(spgw_alloc_cacheline_ct * kCacheline, &spgw_alloc))) {
5934         goto MakePgenRobust_ret_NOMEM;
5935       }
5936       SpgwInitPhase2(max_vrec_len, ctx.spgwp, spgw_alloc);
5937 
5938       const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
5939       const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
5940       ctx.thread_write_genovecs = nullptr;
5941       uint32_t write_mhc_needed = 0;
5942       if (new_sample_idx_to_old || subsetting_required) {
5943         if (unlikely(bigstack_alloc_wp(1, &ctx.thread_write_genovecs))) {
5944           goto MakePgenRobust_ret_NOMEM;
5945         }
5946         if (read_phase_present && new_sample_idx_to_old) {
5947           if (unlikely(bigstack_alloc_u32(raw_sample_ct, &ctx.old_sample_idx_to_new))) {
5948             goto MakePgenRobust_ret_NOMEM;
5949           }
5950           for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
5951             ctx.old_sample_idx_to_new[new_sample_idx_to_old[new_sample_idx]] = new_sample_idx;
5952           }
5953         }
5954         if (unlikely(bigstack_alloc_w(sample_ctl2, &(ctx.thread_write_genovecs[0])))) {
5955           goto MakePgenRobust_ret_NOMEM;
5956         }
5957         write_mhc_needed = 1;
5958       }
5959       ctx.thread_write_mhc = nullptr;
5960       if (write_mhc_needed) {
5961         if (unlikely(bigstack_alloc_wp(1, &ctx.thread_write_mhc))) {
5962           goto MakePgenRobust_ret_NOMEM;
5963         }
5964         // todo: refalt1_select
5965         const uintptr_t mhcwrite_word_ct = GetMhcWordCt(sample_ct);
5966         if (unlikely(bigstack_alloc_w(mhcwrite_word_ct, &(ctx.thread_write_mhc[0])))) {
5967           goto MakePgenRobust_ret_NOMEM;
5968         }
5969       }
5970       ctx.thread_write_phasepresents = nullptr;
5971       ctx.thread_all_hets = nullptr;
5972       if (read_or_write_phase_present) {
5973         if (unlikely(
5974                 bigstack_alloc_wp(1, &ctx.thread_write_phasepresents) ||
5975                 bigstack_alloc_wp(1, &ctx.thread_write_phaseinfos) ||
5976                 bigstack_alloc_w(sample_ctl, &(ctx.thread_write_phasepresents[0])) ||
5977                 bigstack_alloc_w(sample_ctl, &(ctx.thread_write_phaseinfos[0])))) {
5978           goto MakePgenRobust_ret_NOMEM;
5979         }
5980         if (read_phase_present) {
5981           if (unlikely(
5982                   bigstack_alloc_wp(1, &ctx.thread_all_hets) ||
5983                   bigstack_alloc_w(raw_sample_ctl, &(ctx.thread_all_hets[0])))) {
5984             goto MakePgenRobust_ret_NOMEM;
5985           }
5986         }
5987       }
5988       ctx.thread_write_dosagepresents = nullptr;
5989       ctx.thread_write_dphasepresents = nullptr;
5990       if (read_or_write_dosage_present) {
5991         if (unlikely(
5992                 bigstack_alloc_wp(1, &ctx.thread_write_dosagepresents) ||
5993                 bigstack_alloc_dosagep(1, &ctx.thread_write_dosagevals) ||
5994                 bigstack_alloc_w(sample_ctl, &(ctx.thread_write_dosagepresents[0])) ||
5995                 bigstack_alloc_dosage(sample_ct, &(ctx.thread_write_dosagevals[0])))) {
5996           goto MakePgenRobust_ret_NOMEM;
5997         }
5998         if (read_or_write_dphase_present) {
5999           if (unlikely(
6000                   bigstack_alloc_wp(1, &ctx.thread_write_dphasepresents) ||
6001                   bigstack_alloc_dphasep(1, &ctx.thread_write_dphasedeltas) ||
6002                   bigstack_alloc_w(sample_ctl, &(ctx.thread_write_dphasepresents[0])) ||
6003                   bigstack_alloc_dphase(sample_ct + RoundUpPow2(sample_ct, kCacheline / 2), &(ctx.thread_write_dphasedeltas[0])))) {
6004             goto MakePgenRobust_ret_NOMEM;
6005           }
6006         }
6007       }
6008       if ((write_mhc_needed || read_dosage_present) && new_sample_idx_to_old) {
6009         if (unlikely(
6010                 bigstack_alloc_u32p(1, &ctx.thread_cumulative_popcount_bufs) ||
6011                 bigstack_alloc_u32(raw_sample_ctl, &(ctx.thread_cumulative_popcount_bufs[0])))) {
6012           goto MakePgenRobust_ret_NOMEM;
6013         }
6014       }
6015       mcp->refalt1_select = refalt1_select;
6016       if (refalt1_select) {
6017         if (write_allele_idx_offsets) {
6018           // this will require write_mhc and an additional AlleleCode buffer
6019           logerrputs("Error: Multiallelic allele rotation is under development.\n");
6020           reterr = kPglRetNotYetSupported;
6021           goto MakePgenRobust_ret_1;
6022         }
6023         if (new_variant_idx_to_old || (variant_ct < raw_variant_ct)) {
6024           // might want inner loop to map variant uidx -> idx instead
6025           STD_ARRAY_PTR_DECL(AlleleCode, 2, tmp_refalt1_select);
6026           if (unlikely(BIGSTACK_ALLOC_STD_ARRAY(AlleleCode, 2, variant_ct, &tmp_refalt1_select))) {
6027             goto MakePgenRobust_ret_NOMEM;
6028           }
6029           if (new_variant_idx_to_old) {
6030             for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
6031               const uintptr_t variant_uidx = new_variant_idx_to_old[variant_idx];
6032               STD_ARRAY_COPY(refalt1_select[variant_uidx], 2, tmp_refalt1_select[variant_idx]);
6033             }
6034           } else {
6035             uintptr_t variant_uidx_base = 0;
6036             uintptr_t cur_bits = variant_include[0];
6037             for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
6038               const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
6039               STD_ARRAY_COPY(refalt1_select[variant_uidx], 2, tmp_refalt1_select[variant_idx]);
6040             }
6041           }
6042           mcp->refalt1_select = tmp_refalt1_select;
6043         }
6044       }
6045       ctx.mcp = mcp;
6046       const uint32_t raw_sample_ctl2 = NypCtToWordCt(raw_sample_ct);
6047       PgenVariant pgv;
6048       PreinitPgv(&pgv);
6049       uint32_t* alt_regular_one_cts = nullptr;
6050       uint32_t* alt_invphase_one_cts = nullptr;
6051       uint32_t* alt_two_cts = nullptr;
6052       uint32_t* alt_sample_idx_buf = nullptr;
6053       uint32_t** alt_regular_one_sample_idx_starts = nullptr;
6054       uint32_t** alt_invphase_one_sample_idx_starts = nullptr;
6055       uint32_t** alt_two_sample_idx_starts = nullptr;
6056       if (make_plink2_flags & (kfMakePlink2MSplitBase * 7)) {
6057         // split or join
6058         // this is currently for split with no dosages
6059         if (unlikely(
6060                 bigstack_alloc_w(raw_sample_ctl2, &pgv.genovec) ||
6061                 bigstack_alloc_w(raw_sample_ctl, &pgv.patch_01_set) ||
6062                 bigstack_alloc_ac(raw_sample_ct, &pgv.patch_01_vals) ||
6063                 bigstack_alloc_w(raw_sample_ctl, &pgv.patch_10_set) ||
6064                 bigstack_alloc_ac(2 * raw_sample_ct, &pgv.patch_10_vals) ||
6065                 bigstack_alloc_u32(max_read_allele_ct, &alt_regular_one_cts) ||
6066                 bigstack_alloc_u32(max_read_allele_ct, &alt_two_cts) ||
6067                 bigstack_alloc_u32(2 * raw_sample_ct + 1, &alt_sample_idx_buf) ||
6068                 bigstack_alloc_u32p(max_read_allele_ct + 1, &alt_regular_one_sample_idx_starts) ||
6069                 bigstack_alloc_u32p(max_read_allele_ct + 1, &alt_two_sample_idx_starts))) {
6070           goto MakePgenRobust_ret_NOMEM;
6071         }
6072         if (read_phase_present) {
6073           if (unlikely(
6074                   bigstack_alloc_w(raw_sample_ctl, &pgv.phasepresent) ||
6075                   bigstack_alloc_w(raw_sample_ctl, &pgv.phaseinfo) ||
6076                   bigstack_alloc_u32(max_read_allele_ct, &alt_invphase_one_cts) ||
6077                   bigstack_alloc_u32p(max_read_allele_ct + 1, &alt_invphase_one_sample_idx_starts))) {
6078             goto MakePgenRobust_ret_NOMEM;
6079           }
6080         }
6081         if (read_dosage_present) {
6082           logerrputs("Error: Multiallelic dosages aren't supported yet.\n");
6083           reterr = kPglRetNotYetSupported;
6084           goto MakePgenRobust_ret_1;
6085         }
6086       }
6087 
6088       const uint32_t raw_sample_ctv2 = NypCtToVecCt(raw_sample_ct);
6089       uintptr_t load_variant_vec_ct = raw_sample_ctv2;
6090       uint32_t loaded_vrtypes_needed = (read_gflags & kfPgenGlobalMultiallelicHardcallFound)? 1 : 0;
6091       if (read_phase_present || read_dosage_present) {
6092         loaded_vrtypes_needed = 1;
6093         if (read_phase_present) {
6094           // phaseraw has three parts:
6095           // 1. het_ct as uint32_t, and explicit_phasepresent_ct as uint32_t.
6096           // 2. vec-aligned bitarray of up to (raw_sample_ct + 1) bits.  first
6097           //    bit is set iff phasepresent is explicitly stored at all (if
6098           //    not, all hets are assumed to be phased), if yes the remaining
6099           //    bits store packed phasepresent values for all hets, if no the
6100           //    remaining bits store packed phaseinfo values for all hets.
6101           // 3. word-aligned bitarray of up to raw_sample_ct bits, storing
6102           //    phaseinfo values.  (end of this array is vec-aligned.)
6103           const uintptr_t phaseraw_word_ct = (8 / kBytesPerWord) + kWordsPerVec + RoundDownPow2(raw_sample_ct / kBitsPerWordD2, kWordsPerVec);
6104           load_variant_vec_ct += WordCtToVecCt(phaseraw_word_ct);
6105         }
6106         if (read_dosage_present) {
6107           // biallelic dosageraw has two parts:
6108           // 1. vec-aligned bitarray of up to raw_sample_ct bits, storing which
6109           //    samples have dosages.
6110           // 2. word-aligned array of uint16s with 0..32768 fixed-point
6111           //    dosages.
6112           // dphaseraw has the same structure, with the uint16s replaced with
6113           // an int16 array of (left - right) values.
6114           const uintptr_t dosageraw_word_ct = kWordsPerVec * (BitCtToVecCt(raw_sample_ct) + DivUp(raw_sample_ct, (kBytesPerVec / sizeof(Dosage))));
6115           load_variant_vec_ct += WordCtToVecCt(dosageraw_word_ct) * (1 + read_dphase_present);
6116         }
6117       }
6118 
6119       uintptr_t bytes_left = bigstack_left();
6120       if (unlikely(bytes_left < 7 * kCacheline)) {
6121         goto MakePgenRobust_ret_NOMEM;
6122       }
6123       bytes_left -= 7 * kCacheline;  // defend against adverse rounding
6124       uintptr_t ulii = bytes_left / (2 * (kBytesPerVec * load_variant_vec_ct + loaded_vrtypes_needed));
6125       if (unlikely(!ulii)) {
6126         goto MakePgenRobust_ret_NOMEM;
6127       }
6128       if (ulii > MINV(kPglVblockSize, write_variant_ct)) {
6129         ulii = MINV(kPglVblockSize, write_variant_ct);
6130       }
6131       const uint32_t write_block_size = ulii;
6132       uintptr_t* main_loadbufs[2];
6133       main_loadbufs[0] = S_CAST(uintptr_t*, bigstack_alloc_raw_rd(load_variant_vec_ct * kBytesPerVec * write_block_size));
6134       main_loadbufs[1] = S_CAST(uintptr_t*, bigstack_alloc_raw_rd(load_variant_vec_ct * kBytesPerVec * write_block_size));
6135 
6136       // todo: multiallelic trim-alts support
6137 
6138       if (loaded_vrtypes_needed) {
6139         ctx.loaded_vrtypes[0] = S_CAST(unsigned char*, bigstack_alloc_raw_rd(write_block_size));
6140         ctx.loaded_vrtypes[1] = S_CAST(unsigned char*, bigstack_alloc_raw_rd(write_block_size));
6141       } else {
6142         ctx.loaded_vrtypes[0] = nullptr;
6143         ctx.loaded_vrtypes[1] = nullptr;
6144       }
6145       SetThreadFuncAndData(MakePgenThread, &ctx, &tg);
6146 
6147       logprintfww5("Writing %s ... ", outname);
6148       fputs("0%", stdout);
6149       fflush(stdout);
6150 
6151       // Main workflow:
6152       // 1. Set n=0, load first write_block_size post-filtering variants
6153       //
6154       // 2. Spawn single thread processing batch n
6155       // 3. Load batch (n+1) unless eof
6156       // 4. Join thread
6157       // 5. Increment n by 1
6158       // 6. Goto step 2 unless eof
6159       const uint32_t* new_variant_idx_to_old_iter = new_variant_idx_to_old;
6160       const uintptr_t* cur_write_allele_idx_offsets = nullptr;
6161       const uint32_t batch_ct_m1 = (write_variant_ct - 1) / write_block_size;
6162       uint32_t pct = 0;
6163       uint32_t parity = 0;
6164       uint32_t cur_batch_size = write_block_size;
6165       uint32_t next_print_write_variant_idx = write_variant_ct / 100;
6166       uint32_t cur_read_allele_ct = 2;
6167       uint32_t cur_write_allele_ct = 2;
6168       uint32_t cur_het_ct = 0;
6169       uintptr_t read_variant_uidx_base = 0;
6170 
6171       // now need to retain these across loop iterations in case a split is
6172       // interrupted by batch-end
6173       uint32_t read_variant_uidx = 0;
6174       uint32_t write_aidx = 1;
6175 
6176       uintptr_t cur_bits = variant_include[0];
6177       PgrSampleSubsetIndex null_pssi;
6178       PgrClearSampleSubsetIndex(simple_pgrp, &null_pssi);
6179       for (uint32_t read_batch_idx = 0; ; ++read_batch_idx) {
6180         if (!IsLastBlock(&tg)) {
6181           if (read_batch_idx == batch_ct_m1) {
6182             cur_batch_size = write_variant_ct - (read_batch_idx * write_block_size);
6183           }
6184           uintptr_t* cur_loadbuf = main_loadbufs[parity];
6185           uintptr_t* loadbuf_iter = cur_loadbuf;
6186           unsigned char* cur_loaded_vrtypes = ctx.loaded_vrtypes[parity];
6187           ctx.loadbuf_thread_starts[parity][0] = loadbuf_iter;
6188           if (write_allele_idx_offsets) {
6189             cur_write_allele_idx_offsets = &(write_allele_idx_offsets[read_batch_idx * write_block_size]);
6190           }
6191           for (uint32_t block_widx = 0; block_widx != cur_batch_size; ) {
6192             if (write_aidx == 1) {
6193               if (!new_variant_idx_to_old_iter) {
6194                 read_variant_uidx = BitIter1(variant_include, &read_variant_uidx_base, &cur_bits);
6195               } else {
6196                 read_variant_uidx = *new_variant_idx_to_old_iter++;
6197               }
6198               // todo: multiallelic trim-alts
6199               // todo: multiallelic merge
6200               // split: load to buffer instead of loadbuf_iter, have function
6201               //        for writing to loadbuf_iter given buffer contents, this
6202               //        should work if split is 'interrupted' by batch boundary
6203               //        in middle
6204               // merge: track loadbuf_iter location at beginning of
6205               //        same-position block... (finish writing this later)
6206               if (allele_idx_offsets) {
6207                 cur_read_allele_ct = allele_idx_offsets[read_variant_uidx + 1] - allele_idx_offsets[read_variant_uidx];
6208               }
6209             }
6210             if (cur_write_allele_idx_offsets) {
6211               cur_write_allele_ct = cur_write_allele_idx_offsets[block_widx + 1] - cur_write_allele_idx_offsets[block_widx];
6212             }
6213             if (cur_read_allele_ct == cur_write_allele_ct) {
6214               reterr = PgrGetRaw(read_variant_uidx, read_gflags, simple_pgrp, &loadbuf_iter, cur_loaded_vrtypes? (&(cur_loaded_vrtypes[block_widx])) : nullptr);
6215               if (unlikely(reterr)) {
6216                 goto MakePgenRobust_ret_PGR_FAIL;
6217               }
6218               ++block_widx;
6219               continue;
6220             } else if (cur_write_allele_ct == 2) {
6221               if (write_aidx == 1) {
6222                 // 1. read into normal, not raw representation
6223                 if (read_phase_present) {
6224                   reterr = PgrGetMDp(nullptr, null_pssi, raw_sample_ct, read_variant_uidx, simple_pgrp, &pgv);
6225                 } else {
6226                   reterr = PgrGetMD(nullptr, null_pssi, raw_sample_ct, read_variant_uidx, simple_pgrp, &pgv);
6227                 }
6228                 if (unlikely(reterr)) {
6229                   goto MakePgenRobust_ret_PGR_FAIL;
6230                 }
6231 
6232                 // 2a. count # of each alt
6233                 // 2b. create het and hom lists for each alt
6234                 uintptr_t* genovec = pgv.genovec;
6235                 ZeroTrailingNyps(raw_sample_ct, genovec);
6236                 uint32_t raw_01_ct;
6237                 uint32_t raw_10_ct;
6238                 GenovecCount12Unsafe(genovec, raw_sample_ct, &raw_01_ct, &raw_10_ct);
6239                 ZeroU32Arr(cur_read_allele_ct, alt_regular_one_cts);
6240                 alt_regular_one_cts[1] = raw_01_ct - pgv.patch_01_ct;
6241                 for (uint32_t rarealt_idx = 0; rarealt_idx != pgv.patch_01_ct; ++rarealt_idx) {
6242                   alt_regular_one_cts[pgv.patch_01_vals[rarealt_idx]] += 1;
6243                 }
6244                 ZeroU32Arr(cur_read_allele_ct, alt_two_cts);
6245                 if (!pgv.phasepresent_ct) {
6246                   for (uint32_t uii = 0; uii != pgv.patch_10_ct; ++uii) {
6247                     const AlleleCode ac0 = pgv.patch_10_vals[2 * uii];
6248                     const AlleleCode ac1 = pgv.patch_10_vals[2 * uii + 1];
6249                     if (ac0 == ac1) {
6250                       alt_two_cts[ac0] += 1;
6251                     } else {
6252                       alt_regular_one_cts[ac0] += 1;
6253                       alt_regular_one_cts[ac1] += 1;
6254                     }
6255                   }
6256                 } else {
6257                   ZeroU32Arr(cur_read_allele_ct, alt_invphase_one_cts);
6258                   for (uint32_t uii = 0; uii != pgv.patch_10_ct; ++uii) {
6259                     const AlleleCode ac0 = pgv.patch_10_vals[2 * uii];
6260                     const AlleleCode ac1 = pgv.patch_10_vals[2 * uii + 1];
6261                     if (ac0 == ac1) {
6262                       alt_two_cts[ac0] += 1;
6263                     } else {
6264                       alt_invphase_one_cts[ac0] += 1;
6265                       alt_regular_one_cts[ac1] += 1;
6266                     }
6267                   }
6268                 }
6269 
6270                 alt_two_cts[1] = raw_10_ct - pgv.patch_10_ct;
6271                 cur_het_ct = raw_01_ct + pgv.patch_10_ct;
6272                 for (uint32_t aidx = 2; aidx != cur_read_allele_ct; ++aidx) {
6273                   cur_het_ct -= alt_two_cts[aidx];
6274                 }
6275 
6276                 uint32_t* sample_idx_buf_iter = alt_sample_idx_buf;
6277                 alt_regular_one_sample_idx_starts[0] = alt_sample_idx_buf;
6278                 for (uint32_t aidx = 1; aidx != cur_read_allele_ct; ++aidx) {
6279                   alt_regular_one_sample_idx_starts[aidx] = sample_idx_buf_iter;
6280                   sample_idx_buf_iter = &(sample_idx_buf_iter[alt_regular_one_cts[aidx]]);
6281                 }
6282                 alt_regular_one_sample_idx_starts[cur_read_allele_ct] = sample_idx_buf_iter;
6283                 if (pgv.phasepresent_ct) {
6284                   alt_invphase_one_sample_idx_starts[0] = sample_idx_buf_iter;
6285                   for (uint32_t aidx = 1; aidx != cur_read_allele_ct - 1; ++aidx) {
6286                     alt_invphase_one_sample_idx_starts[aidx] = sample_idx_buf_iter;
6287                     sample_idx_buf_iter = &(sample_idx_buf_iter[alt_invphase_one_cts[aidx]]);
6288                   }
6289                   alt_invphase_one_sample_idx_starts[cur_read_allele_ct - 1] = sample_idx_buf_iter;
6290                   alt_invphase_one_sample_idx_starts[cur_read_allele_ct] = sample_idx_buf_iter;
6291                 }
6292                 alt_two_sample_idx_starts[0] = sample_idx_buf_iter;
6293                 for (uint32_t aidx = 1; aidx != cur_read_allele_ct; ++aidx) {
6294                   alt_two_sample_idx_starts[aidx] = sample_idx_buf_iter;
6295                   sample_idx_buf_iter = &(sample_idx_buf_iter[alt_two_cts[aidx]]);
6296                 }
6297                 alt_two_sample_idx_starts[cur_read_allele_ct] = sample_idx_buf_iter;
6298 
6299                 Halfword* patch_01_set_alias = R_CAST(Halfword*, pgv.patch_01_set);
6300                 Halfword* patch_10_set_alias = R_CAST(Halfword*, pgv.patch_10_set);
6301                 uint32_t idx_01 = 0;
6302                 uint32_t idx_10 = 0;
6303                 for (uint32_t widx = 0; widx != raw_sample_ctl2; ++widx) {
6304                   const uintptr_t geno_word = genovec[widx];
6305                   const uint32_t sample_idx_offset = widx * kBitsPerWordD2;
6306                   uintptr_t geno_01 = Word01(geno_word);
6307                   if (geno_01) {
6308                     if (!pgv.patch_01_ct) {
6309                       // patch_01_set not initialized in this case
6310                       do {
6311                         const uint32_t sample_idx = sample_idx_offset + ctzw(geno_01) / 2;
6312                         alt_regular_one_sample_idx_starts[1][0] = sample_idx;
6313                         alt_regular_one_sample_idx_starts[1] += 1;
6314                         geno_01 &= geno_01 - 1;
6315                       } while (geno_01);
6316                     } else {
6317                       uint32_t geno_01_hw = PackWordToHalfword(geno_01);
6318                       const uint32_t patch_01_hw = patch_01_set_alias[widx];
6319                       do {
6320                         const uint32_t lowbit = geno_01_hw & (-geno_01_hw);
6321                         const uint32_t sample_idx = sample_idx_offset + ctzu32(lowbit);
6322                         if (lowbit & patch_01_hw) {
6323                           AlleleCode ac = pgv.patch_01_vals[idx_01];
6324                           alt_regular_one_sample_idx_starts[ac][0] = sample_idx;
6325                           alt_regular_one_sample_idx_starts[ac] += 1;
6326                           ++idx_01;
6327                         } else {
6328                           alt_regular_one_sample_idx_starts[1][0] = sample_idx;
6329                           alt_regular_one_sample_idx_starts[1] += 1;
6330                         }
6331                         geno_01_hw ^= lowbit;
6332                       } while (geno_01_hw);
6333                     }
6334                   }
6335                   uintptr_t geno_10 = Word10(geno_word);
6336                   if (geno_10) {
6337                     if (!pgv.patch_10_ct) {
6338                       // patch_10_set not initialized in this case
6339                       do {
6340                         const uint32_t sample_idx = sample_idx_offset + ctzw(geno_10) / 2;
6341                         alt_two_sample_idx_starts[1][0] = sample_idx;
6342                         alt_two_sample_idx_starts[1] += 1;
6343                         geno_10 &= geno_10 - 1;
6344                       } while (geno_10);
6345                     } else {
6346                       uint32_t geno_10_hw = PackWordToHalfword(geno_10);
6347                       const uint32_t patch_10_hw = patch_10_set_alias[widx];
6348                       if (!pgv.phasepresent_ct) {
6349                         do {
6350                           const uint32_t lowbit = geno_10_hw & (-geno_10_hw);
6351                           const uint32_t sample_idx = sample_idx_offset + ctzu32(lowbit);
6352                           if (lowbit & patch_10_hw) {
6353                             AlleleCode ac0 = pgv.patch_10_vals[2 * idx_10];
6354                             AlleleCode ac1 = pgv.patch_10_vals[2 * idx_10 + 1];
6355                             if (ac0 == ac1) {
6356                               alt_two_sample_idx_starts[ac0][0] = sample_idx;
6357                               alt_two_sample_idx_starts[ac0] += 1;
6358                             } else {
6359                               alt_regular_one_sample_idx_starts[ac0][0] = sample_idx;
6360                               alt_regular_one_sample_idx_starts[ac0] += 1;
6361                               alt_regular_one_sample_idx_starts[ac1][0] = sample_idx;
6362                               alt_regular_one_sample_idx_starts[ac1] += 1;
6363                             }
6364                             ++idx_10;
6365                           } else {
6366                             alt_two_sample_idx_starts[1][0] = sample_idx;
6367                             alt_two_sample_idx_starts[1] += 1;
6368                           }
6369                           geno_10_hw ^= lowbit;
6370                         } while (geno_10_hw);
6371                       } else {
6372                         do {
6373                           const uint32_t lowbit = geno_10_hw & (-geno_10_hw);
6374                           const uint32_t sample_idx = sample_idx_offset + ctzu32(lowbit);
6375                           if (lowbit & patch_10_hw) {
6376                             AlleleCode ac0 = pgv.patch_10_vals[2 * idx_10];
6377                             AlleleCode ac1 = pgv.patch_10_vals[2 * idx_10 + 1];
6378                             if (ac0 == ac1) {
6379                               alt_two_sample_idx_starts[ac0][0] = sample_idx;
6380                               alt_two_sample_idx_starts[ac0] += 1;
6381                             } else {
6382                               alt_invphase_one_sample_idx_starts[ac0][0] = sample_idx;
6383                               alt_invphase_one_sample_idx_starts[ac0] += 1;
6384                               alt_regular_one_sample_idx_starts[ac1][0] = sample_idx;
6385                               alt_regular_one_sample_idx_starts[ac1] += 1;
6386                             }
6387                             ++idx_10;
6388                           } else {
6389                             alt_two_sample_idx_starts[1][0] = sample_idx;
6390                             alt_two_sample_idx_starts[1] += 1;
6391                           }
6392                           geno_10_hw ^= lowbit;
6393                         } while (geno_10_hw);
6394                       }
6395                     }
6396                   }
6397                 }
6398                 for (uint32_t aidx = cur_read_allele_ct - 1; aidx; --aidx) {
6399                   alt_regular_one_sample_idx_starts[aidx] = alt_regular_one_sample_idx_starts[aidx - 1];
6400                   alt_two_sample_idx_starts[aidx] = alt_two_sample_idx_starts[aidx - 1];
6401                 }
6402                 if (pgv.phasepresent_ct) {
6403                   for (uint32_t aidx = cur_read_allele_ct - 1; aidx; --aidx) {
6404                     alt_invphase_one_sample_idx_starts[aidx] = alt_invphase_one_sample_idx_starts[aidx - 1];
6405                   }
6406                 }
6407                 // todo: multiallelic dosage
6408 
6409                 for (uint32_t widx = 0; widx != raw_sample_ctl2; ++widx) {
6410                   // keep 3s, set 1s and 2s to 0
6411                   genovec[widx] = Word11(genovec[widx]) * 3;
6412                 }
6413               }
6414               const uint32_t split_stop = MINV(cur_batch_size + 1 - block_widx, cur_read_allele_ct);
6415               for (; write_aidx != split_stop; ++write_aidx, ++block_widx) {
6416                 // 3. synthesize raw
6417                 //   (save to loaded_vrtypes if necessary)
6418                 //   genovec, vector-aligned
6419                 //   if hphase present and relevant:
6420                 //     (compute het_ct; het_ctdl := het_ct / kBitsPerWord)
6421                 //     (first_half_byte_ct := 1 + (het_ct / CHAR_BIT))
6422                 //     <uint32 het_ct>
6423                 //     <uint32 raw_phasepresent_ct if explicit>
6424                 //     <first_half_byte_ct phasepresent or phaseinfo bytes>
6425                 //     <0-pad up to word boundary, to make popcount safe>
6426                 //     [if explicit phasepresent, i.e. lowest bit set:
6427                 //       (second_half_byte_ct := DivUp(raw_phasepresent_ct, 8))
6428                 //       <second_half_byte_ct phaseinfo contents>
6429                 //     ]
6430                 //     align up to vector boundary
6431                 uintptr_t* new_genovec = loadbuf_iter;
6432                 memcpy(new_genovec, pgv.genovec, raw_sample_ctl2 * sizeof(intptr_t));
6433                 loadbuf_iter = &(loadbuf_iter[raw_sample_ctv2 * kWordsPerVec]);
6434                 uint32_t new_phasepresent_ct = 0;
6435                 uint32_t new_het_ct = 0;
6436                 uint32_t* regular_stop = alt_regular_one_sample_idx_starts[write_aidx + 1];
6437                 if (pgv.phasepresent_ct) {
6438                   uint32_t* regular_iter = alt_regular_one_sample_idx_starts[write_aidx];
6439                   uint32_t* invphase_iter = alt_invphase_one_sample_idx_starts[write_aidx];
6440                   uint32_t* invphase_stop = alt_invphase_one_sample_idx_starts[write_aidx + 1];
6441                   new_het_ct = (regular_stop - regular_iter) + (invphase_stop - invphase_iter);
6442                   if (pgv.phasepresent_ct == cur_het_ct) {
6443                     new_phasepresent_ct = new_het_ct;
6444                   } else {
6445                     uintptr_t* phasepresent = pgv.phasepresent;
6446                     for (; regular_iter != regular_stop; ++regular_iter) {
6447                       new_phasepresent_ct += IsSet(phasepresent, *regular_iter);
6448                     }
6449                     for (; invphase_iter != invphase_stop; ++invphase_iter) {
6450                       new_phasepresent_ct += IsSet(phasepresent, *invphase_iter);
6451                     }
6452                   }
6453                 }
6454                 uint32_t* two_stop = alt_two_sample_idx_starts[write_aidx + 1];
6455                 for (uint32_t* two_iter = alt_two_sample_idx_starts[write_aidx]; two_iter != two_stop; ++two_iter) {
6456                   const uint32_t sample_uidx = *two_iter;
6457                   SetBit(sample_uidx * 2 + 1, new_genovec);
6458                 }
6459                 uint32_t* regular_iter = alt_regular_one_sample_idx_starts[write_aidx];
6460                 if (!new_phasepresent_ct) {
6461                   for (; regular_iter != regular_stop; ++regular_iter) {
6462                     const uint32_t sample_uidx = *regular_iter;
6463                     SetBit(sample_uidx * 2, new_genovec);
6464                   }
6465                   if (pgv.phasepresent_ct) {
6466                     uint32_t* invphase_stop = alt_invphase_one_sample_idx_starts[write_aidx + 1];
6467                     for (uint32_t* invphase_iter = alt_invphase_one_sample_idx_starts[write_aidx]; invphase_iter != invphase_stop; ++invphase_iter) {
6468                       const uint32_t sample_uidx = *invphase_iter;
6469                       SetBit(sample_uidx * 2, new_genovec);
6470                     }
6471                   }
6472                   if (cur_loaded_vrtypes) {
6473                     cur_loaded_vrtypes[block_widx] = 0;
6474                   }
6475                 } else {
6476                   // need to write raw hphase
6477                   const uint32_t het_ctdl = new_het_ct / kBitsPerWord;
6478                   uintptr_t* shifted_part1 = &(loadbuf_iter[8 / kBytesPerWord]);
6479                   uintptr_t* part1_end = &(shifted_part1[1 + het_ctdl]);
6480                   uint32_t* invphase_iter = alt_invphase_one_sample_idx_starts[write_aidx];
6481                   uint32_t* invphase_stop = alt_invphase_one_sample_idx_starts[write_aidx + 1];
6482                   const uint32_t orig_regular_end = *regular_stop;
6483                   const uint32_t orig_invphase_end = *invphase_stop;
6484                   // sentinel value to simplify the next loop.
6485                   *invphase_stop = UINT32_MAX;
6486                   // must grab this before setting *regular_stop, in case
6487                   // they overlap; and after setting *invphase_stop, in case
6488                   // this list is empty
6489                   uint32_t invphase_idx = *invphase_iter++;
6490 
6491                   *regular_stop = UINT32_MAX;
6492                   uint32_t regular_idx = *regular_iter++;
6493                   uint32_t shifted_het_idx = 1;
6494                   if (new_phasepresent_ct == new_het_ct) {
6495                     loadbuf_iter[0] = new_het_ct;
6496 #ifndef __LP64__
6497                     loadbuf_iter[1] = 0;
6498 #endif
6499                     // shifted_part1 is phaseinfo
6500                     shifted_part1[0] = 0;
6501                     shifted_part1[het_ctdl] = 0;
6502                     while (regular_idx != invphase_idx) {
6503                       uintptr_t is_inverted = (invphase_idx < regular_idx);
6504                       uint32_t sample_uidx;
6505                       if (is_inverted) {
6506                         sample_uidx = invphase_idx;
6507                         invphase_idx = *invphase_iter++;
6508                       } else {
6509                         sample_uidx = regular_idx;
6510                         regular_idx = *regular_iter++;
6511                       }
6512                       SetBit(sample_uidx * 2, new_genovec);
6513                       AssignBit(shifted_het_idx, is_inverted ^ IsSet(pgv.phaseinfo, sample_uidx), shifted_part1);
6514                       ++shifted_het_idx;
6515                     }
6516                     assert(shifted_het_idx == new_het_ct + 1);
6517                     loadbuf_iter = part1_end;
6518                   } else {
6519 #ifdef __LP64__
6520                     loadbuf_iter[0] = new_het_ct | (S_CAST(uint64_t, new_phasepresent_ct) << 32);
6521 #else
6522                     loadbuf_iter[0] = new_het_ct;
6523                     loadbuf_iter[1] = new_phasepresent_ct;
6524 #endif
6525                     shifted_part1[0] = 1;
6526                     memset(shifted_part1, 0, (1 + het_ctdl) * sizeof(intptr_t));
6527                     // shifted_part1 is phasepresent
6528                     // part1_end is start of phaseinfo
6529                     const uint32_t new_phasepresent_ctl = BitCtToWordCt(new_phasepresent_ct);
6530                     part1_end[new_phasepresent_ctl - 1] = 0;
6531                     uint32_t phasepresent_idx = 0;
6532                     while (regular_idx != invphase_idx) {
6533                       uintptr_t is_inverted = (invphase_idx < regular_idx);
6534                       uint32_t sample_uidx;
6535                       if (is_inverted) {
6536                         sample_uidx = invphase_idx;
6537                         invphase_idx = *invphase_iter++;
6538                       } else {
6539                         sample_uidx = regular_idx;
6540                         regular_idx = *regular_iter++;
6541                       }
6542                       SetBit(sample_uidx * 2, new_genovec);
6543                       if (IsSet(pgv.phasepresent, sample_uidx)) {
6544                         SetBit(shifted_het_idx, shifted_part1);
6545                         AssignBit(phasepresent_idx, is_inverted ^ IsSet(pgv.phaseinfo, sample_uidx), part1_end);
6546                         ++phasepresent_idx;
6547                       }
6548                       ++shifted_het_idx;
6549                     }
6550                     assert(phasepresent_idx == new_phasepresent_ct);
6551                   }
6552                   assert(regular_idx == UINT32_MAX);
6553                   *regular_stop = orig_regular_end;
6554                   *invphase_stop = orig_invphase_end;
6555                   VecAlignUp(&loadbuf_iter);
6556                   if (cur_loaded_vrtypes) {
6557                     cur_loaded_vrtypes[block_widx] = 0x10;
6558                   }
6559                 }
6560               }
6561               if (split_stop != cur_read_allele_ct) {
6562                 break;
6563               }
6564               write_aidx = 1;
6565             } else {
6566               // merge; todo
6567             }
6568           }
6569         }
6570         if (read_batch_idx) {
6571           JoinThreads(&tg);
6572           reterr = ctx.write_reterr;
6573           if (unlikely(reterr)) {
6574             if (reterr == kPglRetWriteFail) {
6575               errno = ctx.write_errno;
6576             }
6577             goto MakePgenRobust_ret_1;
6578           }
6579         }
6580         if (!IsLastBlock(&tg)) {
6581           ctx.cur_block_write_ct = cur_batch_size;
6582           if (read_batch_idx == batch_ct_m1) {
6583             DeclareLastThreadBlock(&tg);
6584           }
6585           if (unlikely(SpawnThreads(&tg))) {
6586             goto MakePgenRobust_ret_THREAD_CREATE_FAIL;
6587           }
6588         }
6589         parity = 1 - parity;
6590         if (read_batch_idx) {
6591           if (read_batch_idx > batch_ct_m1) {
6592             break;
6593           }
6594           const uint32_t write_idx_end = read_batch_idx * write_block_size;
6595           if (write_idx_end >= next_print_write_variant_idx) {
6596             if (pct > 10) {
6597               putc_unlocked('\b', stdout);
6598             }
6599             pct = (write_idx_end * 100LLU) / write_variant_ct;
6600             printf("\b\b%u%%", pct++);
6601             fflush(stdout);
6602             next_print_write_variant_idx = (pct * S_CAST(uint64_t, write_variant_ct)) / 100;
6603           }
6604         }
6605       }
6606       SpgwFinish(ctx.spgwp);
6607       if (pct > 10) {
6608         putc_unlocked('\b', stdout);
6609       }
6610       fputs("\b\b", stdout);
6611       logputs("done.\n");
6612     }
6613   }
6614   while (0) {
6615   MakePgenRobust_ret_NOMEM:
6616     reterr = kPglRetNomem;
6617     break;
6618   MakePgenRobust_ret_PGR_FAIL:
6619     PgenErrPrintN(reterr);
6620     break;
6621   MakePgenRobust_ret_THREAD_CREATE_FAIL:
6622     reterr = kPglRetThreadCreateFail;
6623     break;
6624   }
6625  MakePgenRobust_ret_1:
6626   CleanupThreads(&tg);
6627   CleanupSpgw(&spgw, &reterr);
6628   BigstackReset(bigstack_mark);
6629   return reterr;
6630 }
6631 
// allele_presents should be nullptr iff trim_alts is not true.
6633 PglErr MakePlink2NoVsort(const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const char* varid_template_str, __maybe_unused const char* varid_multi_template_str, __maybe_unused const char* varid_multi_nonsnp_template_str, const char* missing_varid_match, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, UnsortedVar vpos_sortstatus, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, uint32_t new_variant_id_max_allele_slen, MiscFlags misc_flags, MakePlink2Flags make_plink2_flags, PvarPsamFlags pvar_psam_flags, uintptr_t pgr_alloc_cacheline_ct, char* xheader, PgenFileInfo* pgfip, PgenReader* simple_pgrp, char* outname, char* outname_end) {
6634   unsigned char* bigstack_mark = g_bigstack_base;
6635   FILE* outfile = nullptr;
6636   PglErr reterr = kPglRetSuccess;
6637   ThreadGroup tg;
6638   PreinitThreads(&tg);
6639   MTPgenWriter* mpgwp = nullptr;
6640   MakePgenCtx ctx;
6641   {
6642     if (make_plink2_flags & kfMakeFam) {
6643       snprintf(outname_end, kMaxOutfnameExtBlen, ".fam");
6644       logprintfww5("Writing %s ... ", outname);
6645       fflush(stdout);
6646       reterr = WriteFam(outname, sample_include, piip, sex_nm, sex_male, pheno_cols, new_sample_idx_to_old, sample_ct, pheno_ct, '\t');
6647       if (unlikely(reterr)) {
6648         goto MakePlink2NoVsort_ret_1;
6649       }
6650       logputs("done.\n");
6651     }
6652     if (make_plink2_flags & kfMakePsam) {
6653       snprintf(outname_end, kMaxOutfnameExtBlen, ".psam");
6654       logprintfww5("Writing %s ... ", outname);
6655       fflush(stdout);
6656       reterr = WritePsam(outname, sample_include, piip, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
6657       if (unlikely(reterr)) {
6658         goto MakePlink2NoVsort_ret_1;
6659       }
6660       logputs("done.\n");
6661     }
6662     const uint32_t input_biallelic = (!allele_idx_offsets);
6663     // output_biallelic: test write_allele_idx_offsets equality to null
6664     PgenGlobalFlags read_gflags = pgfip->gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
6665     if (!input_biallelic) {
6666       // Can only skip this when there are actually zero copies of alt2+.
6667       // Otherwise, even with erase-alt2+, we still need to distinguish alt1
6668       // from alt2 so we can set calls involving the latter to missing.
6669       read_gflags |= kfPgenGlobalMultiallelicHardcallFound;
6670     }
6671     const uintptr_t* write_allele_idx_offsets = nullptr;
6672     uint32_t write_variant_ct = variant_ct;
6673     uint32_t max_write_allele_ct = max_allele_ct;
6674     uint32_t max_missalt_ct = 0;
6675     if (make_plink2_flags & kfMakePlink2MMask) {
6676       // TODO: enforce on command line
6677       assert((!refalt1_select) && (!allele_presents));
6678       if (make_plink2_flags & kfMakePlink2MJoin) {
6679         reterr = PlanMultiallelicJoin(variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, make_plink2_flags, &write_variant_ct, &write_allele_idx_offsets, &max_write_allele_ct, &max_missalt_ct);
6680       } else if (!allele_idx_offsets) {
6681         // no splitting to do
6682         logputs("Note: All variants are biallelic; nothing to split.\n");
6683       } else {
6684         reterr = PlanMultiallelicSplit(variant_include, allele_idx_offsets, allele_storage, max_allele_ct, make_plink2_flags, &write_variant_ct, &write_allele_idx_offsets);
6685       }
6686       if (unlikely(reterr)) {
6687         goto MakePlink2NoVsort_ret_1;
6688       }
6689     } else if (allele_idx_offsets) {
6690       if (allele_presents) {
6691         fputs("multiallelic variants + trim-alts not yet supported\n", stderr);
6692         exit(S_CAST(int32_t, kPglRetNotYetSupported));
6693       }
6694       if (variant_ct < raw_variant_ct) {
6695         uintptr_t* new_allele_idx_offsets;
6696         if (bigstack_alloc_w(variant_ct + 1, &new_allele_idx_offsets)) {
6697           goto MakePlink2NoVsort_ret_NOMEM;
6698         }
6699         const uintptr_t final_offset = InitWriteAlleleIdxOffsets(variant_include, allele_idx_offsets, nullptr, refalt1_select, nullptr, variant_ct, new_allele_idx_offsets);
6700         if (final_offset != 2 * variant_ct) {
6701           new_allele_idx_offsets[variant_ct] = final_offset;
6702           write_allele_idx_offsets = new_allele_idx_offsets;
6703         } else {
6704           BigstackReset(new_allele_idx_offsets);
6705         }
6706       } else {
6707         write_allele_idx_offsets = allele_idx_offsets;
6708       }
6709     }
6710     if (make_plink2_flags & kfMakeBim) {
6711       const uint32_t bim_zst = (make_plink2_flags / kfMakeBimZs) & 1;
6712       OutnameZstSet(".bim", bim_zst, outname_end);
6713       logprintfww5("Writing %s ... ", outname);
6714       fflush(stdout);
6715       if (unlikely(write_allele_idx_offsets)) {
6716         logputs("\n");
6717         logerrprintf("Error: %s cannot contain multiallelic variants.\n", outname);
6718         goto MakePlink2NoVsort_ret_INCONSISTENT_INPUT;
6719       }
6720       if (write_variant_ct == variant_ct) {
6721         reterr = WriteMapOrBim(outname, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, variant_cms, variant_ct, max_allele_slen, '\t', bim_zst, max_thread_ct);
6722       } else {
6723         assert(write_variant_ct > variant_ct);
6724         reterr = WriteBimSplit(outname, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, variant_cms, varid_template_str, missing_varid_match, variant_ct, max_allele_slen, new_variant_id_max_allele_slen, (make_plink2_flags / kfMakePlink2VaridSemicolon) & 1, (make_plink2_flags / kfMakePlink2VaridDup) & 1, misc_flags, bim_zst, max_thread_ct);
6725       }
6726       if (unlikely(reterr)) {
6727         goto MakePlink2NoVsort_ret_1;
6728       }
6729       logputs("done.\n");
6730     }
6731     if (make_plink2_flags & kfMakePvar) {
6732       OutnameZstSet(".pvar", pvar_psam_flags & kfPvarZs, outname_end);
6733       logprintfww5("Writing %s ... ", outname);
6734       fflush(stdout);
6735       uint32_t nonref_flags_storage = 3;
6736       if (!pgfip->nonref_flags) {
6737         nonref_flags_storage = (pgfip->gflags & kfPgenGlobalAllNonref)? 2 : 1;
6738       }
6739       if (write_variant_ct == variant_ct) {
6740         reterr = WritePvar(outname, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, vpos_sortstatus, pvar_psam_flags, max_thread_ct, xheader);
6741       } else {
6742         const char* const* info_keys = nullptr;
6743         uint32_t info_key_ct = 0;
6744         uint32_t* info_keys_htable = nullptr;
6745         uint32_t info_keys_htable_size = 0;
6746         if (pvar_info_reload) {
6747           reterr = ParseInfoHeader(xheader, xheader_blen, &info_keys, &info_key_ct, &info_keys_htable, &info_keys_htable_size);
6748           if (reterr) {
6749             goto MakePlink2NoVsort_ret_1;
6750           }
6751         }
6752         if (write_variant_ct > variant_ct) {
6753           reterr = WritePvarSplit(outname, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, varid_template_str, missing_varid_match, info_keys, info_keys_htable, raw_variant_ct, variant_ct, max_allele_slen, new_variant_id_max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, vpos_sortstatus, info_key_ct, info_keys_htable_size, misc_flags, make_plink2_flags, pvar_psam_flags, max_thread_ct, xheader);
6754         } else {
6755           logerrputs("Error: Multiallelic join is under development.\n");
6756           reterr = kPglRetNotYetSupported;
6757           goto MakePlink2NoVsort_ret_1;
6758           // reterr = WritePvarJoin(outname, variant_include, cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, pgfip->nonref_flags, pvar_info_reload, variant_cms, varid_template_str, missing_varid_match, info_keys, info_keys_htable, raw_variant_ct, variant_ct, max_allele_slen, new_variant_id_max_allele_slen, max_write_allele_ct, max_missalt_ct, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, vpos_sortstatus, info_key_ct, info_keys_htable_size, misc_flags, make_plink2_flags, pvar_psam_flags, max_thread_ct, xheader);
6759         }
6760       }
6761       if (unlikely(reterr)) {
6762         goto MakePlink2NoVsort_ret_1;
6763       }
6764       logputs("done.\n");
6765     }
6766     MakeCommon mc;
6767     mc.plink2_write_flags = kfPlink2Write0;
6768     const uint32_t raw_sample_ctl = BitCtToWordCt(raw_sample_ct);
6769     const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
6770     ctx.sex_male_collapsed = nullptr;  // defensive
6771     if (make_plink2_flags & kfMakePlink2SetHhMissing) {
6772       const uint32_t sample_ctv = BitCtToVecCt(sample_ct);
6773       uintptr_t* new_sex_male;
6774       if (unlikely(
6775               bigstack_alloc_w(sample_ctv * kWordsPerVec, &new_sex_male) ||
6776               bigstack_alloc_w(sample_ctv * kWordsPerVec, &mc.sex_male_collapsed_interleaved) ||
6777               bigstack_alloc_w(sample_ctv * kWordsPerVec, &ctx.sex_female_collapsed) ||
6778               bigstack_alloc_w(sample_ctv * kWordsPerVec, &mc.sex_female_collapsed_interleaved))) {
6779         goto MakePlink2NoVsort_ret_NOMEM;
6780       }
6781       CopyBitarrSubset(sex_male, sample_include, sample_ct, new_sex_male);
6782       ZeroTrailingWords(sample_ctl, new_sex_male);
6783       ctx.sex_male_collapsed = new_sex_male;
6784       FillInterleavedMaskVec(ctx.sex_male_collapsed, sample_ctv, mc.sex_male_collapsed_interleaved);
6785 
6786       CopyBitarrSubset(sex_nm, sample_include, sample_ct, ctx.sex_female_collapsed);
6787       BitvecInvmask(new_sex_male, sample_ctl, ctx.sex_female_collapsed);
6788       ZeroTrailingWords(sample_ctl, ctx.sex_female_collapsed);
6789       FillInterleavedMaskVec(ctx.sex_female_collapsed, sample_ctv, mc.sex_female_collapsed_interleaved);
6790 
6791       mc.plink2_write_flags |= kfPlink2WriteSetHhMissing;
6792       if (make_plink2_flags & kfMakePlink2SetHhMissingKeepDosage) {
6793         mc.plink2_write_flags |= kfPlink2WriteSetHhMissingKeepDosage;
6794       }
6795     } else {
6796       // defensive
6797       mc.sex_male_collapsed_interleaved = nullptr;
6798       mc.sex_female_collapsed_interleaved = nullptr;
6799     }
6800     if (make_plink2_flags & kfMakePlink2SetMixedMtMissing) {
6801       mc.plink2_write_flags |= kfPlink2WriteSetMixedMtMissing;
6802       if (make_plink2_flags & kfMakePlink2SetMixedMtMissingKeepDosage) {
6803         mc.plink2_write_flags |= kfPlink2WriteSetMixedMtMissingKeepDosage;
6804       }
6805     }
6806     mc.cip = cip;
6807     mc.raw_sample_ct = raw_sample_ct;
6808     mc.sample_ct = sample_ct;
6809     unsigned char* bigstack_mark2 = g_bigstack_base;
6810     const uint32_t make_pgen = make_plink2_flags & kfMakePgen;
6811     // todo: prohibit .pgen + .bim write when data is multiallelic without
6812     //   either multiallelic split or erase-alt2+ specified
6813     //   (--make-bed = automatic erase-alt2+?)
6814     if ((make_plink2_flags & kfMakeBed) || ((make_plink2_flags & (kfMakePgen | (kfMakePgenFormatBase * 3))) == (kfMakePgen | kfMakePgenFormatBase))) {
6815       reterr = MakeBedlikeMain(sample_include, new_sample_idx_to_old, variant_include, refalt1_select, raw_variant_ct, variant_ct, max_thread_ct, hard_call_thresh, make_plink2_flags, pgr_alloc_cacheline_ct, pgfip, &mc, outname, outname_end);
6816     } else if (make_pgen) {
6817       assert(variant_ct);
6818       assert(sample_ct);
6819       if (make_plink2_flags & (kfMakePlink2MSplitBase * 7)) {
6820         // don't duplicate complicated multiallelic split/merge/trim-alts logic
6821         // here for now.
6822         // (also auto-punt multiallelic dosage?)
6823         goto MakePlink2NoVsort_fallback;
6824       }
6825       ctx.write_allele_idx_offsets = write_allele_idx_offsets;
6826       if (variant_ct == raw_variant_ct) {
6827         ctx.write_chr_fo_vidx_start = cip->chr_fo_vidx_start;
6828       } else {
6829         if (AllocAndFillSubsetChrFoVidxStart(variant_include, cip, &ctx.write_chr_fo_vidx_start)) {
6830           goto MakePlink2NoVsort_fallback;
6831         }
6832       }
6833       if (make_plink2_flags & kfMakePgenErasePhase) {
6834         read_gflags &= ~(kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent);
6835       }
6836       if (make_plink2_flags & kfMakePgenEraseDosage) {
6837         if (hard_call_thresh == UINT32_MAX) {
6838           read_gflags &= ~(kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
6839         } else {
6840           // erase-dosage + --hard-call-threshold currently requires dosages to
6841           // be read, and only thrown away at the last minute
6842           // (alternatively, we could build --hard-call-threshold directly into
6843           // pgr_read_raw?)
6844           mc.plink2_write_flags |= kfPlink2WriteLateDosageErase;
6845         }
6846       }
6847       if (read_gflags && (variant_ct < raw_variant_ct)) {
6848         // did we e.g. filter out all the phased variants?
6849         // do not check for multiallelic-hc here for now
6850         // (write_allele_idx_offsets check above serves the same purpose)
6851         read_gflags &= kfPgenGlobalMultiallelicHardcallFound | GflagsVfilter(variant_include, pgfip->vrtypes, raw_variant_ct, pgfip->gflags);
6852       }
6853       // could check if all the phased samples were also filtered out, but
6854       // that's already caught by running --make-pgen twice, so not a big deal
6855 
6856       const uint32_t read_dosage_present = (read_gflags / kfPgenGlobalDosagePresent) & 1;
6857       mc.hard_call_halfdist = ((hard_call_thresh == UINT32_MAX) || (!read_dosage_present))? 0 : (kDosage4th - hard_call_thresh);
6858       ctx.dosage_erase_halfdist = kDosage4th - dosage_erase_thresh;
6859       const uint32_t read_phase_present = !!(read_gflags & (kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent));
6860       const uint32_t read_dphase_present = (read_gflags / kfPgenGlobalDosagePhasePresent) & 1;
6861       PgenGlobalFlags write_gflags = read_gflags;
6862       uint32_t read_or_write_phase_present = read_phase_present;
6863       uint32_t read_or_write_dphase_present = read_dphase_present;
6864       if (mc.hard_call_halfdist && (read_phase_present || read_or_write_dphase_present)) {
6865         read_or_write_phase_present = 1;
6866         read_or_write_dphase_present = 1;
6867         write_gflags |= kfPgenGlobalHardcallPhasePresent | kfPgenGlobalDosagePhasePresent;
6868       } else if (dosage_erase_thresh && read_dosage_present) {
6869         read_or_write_phase_present = 1;
6870       }
6871       uint32_t read_or_write_dosage_present = read_dosage_present;
6872       if (mc.plink2_write_flags & kfPlink2WriteLateDosageErase) {
6873         write_gflags &= ~(kfPgenGlobalDosagePresent | kfPgenGlobalDosagePhasePresent);
6874       } else if (mc.plink2_write_flags & (kfPlink2WriteSetHhMissingKeepDosage | kfPlink2WriteSetMixedMtMissingKeepDosage)) {
6875         read_or_write_dosage_present = 1;
6876         write_gflags |= kfPgenGlobalDosagePresent;
6877       }
6878       if ((write_gflags & (kfPgenGlobalMultiallelicHardcallFound | kfPgenGlobalDosagePresent)) == (kfPgenGlobalMultiallelicHardcallFound | kfPgenGlobalDosagePresent)) {
6879         logerrputs("Error: Multiallelic dosages aren't supported yet.\n");
6880         reterr = kPglRetNotYetSupported;
6881         goto MakePlink2NoVsort_ret_1;
6882       }
6883       write_gflags &= ~kfPgenGlobalMultiallelicHardcallFound;
6884       uintptr_t alloc_base_cacheline_ct;
6885       uint64_t mpgw_per_thread_cacheline_ct;
6886       uint32_t vrec_len_byte_ct;
6887       uint64_t vblock_cacheline_ct;
6888       // may want to have a load_sample_ct which is raw_sample_ct when e.g.
6889       // sample_ct > 0.1 * raw_sample_ct, and sample_ct otherwise.
6890       MpgwInitPhase1(write_allele_idx_offsets, variant_ct, sample_ct, write_gflags, &alloc_base_cacheline_ct, &mpgw_per_thread_cacheline_ct, &vrec_len_byte_ct, &vblock_cacheline_ct);
6891 
6892       // bugfix: each variant currently needs to be vector-aligned
6893       // bugfix?: need to use raw_sample_ct here, not sample_ct
6894       const uint32_t raw_sample_ctv2 = NypCtToVecCt(raw_sample_ct);
6895       const uint32_t max_vblock_size = MINV(kPglVblockSize, variant_ct);
6896       uint64_t load_vblock_cacheline_ct = VecCtToCachelineCtU64(S_CAST(uint64_t, raw_sample_ctv2) * max_vblock_size);
6897 
6898       if (make_plink2_flags & ((kfMakePlink2MSplitBase * 7) | kfMakePlink2EraseAlt2Plus)) {
6899         logerrputs("Error: 'multiallelics=' and 'erase-alt2+' modifiers are under development.\n");
6900         reterr = kPglRetNotYetSupported;
6901         goto MakePlink2NoVsort_ret_NOMEM;
6902       }
6903       if (read_gflags & kfPgenGlobalMultiallelicHardcallFound) {
6904         // raw multiallelic hardcall track has three parts:
6905         // 1. two words with rare01_ct and rare10_ct.
6906         // 2. (vector-aligned) patch_01_set and patch_01_vals.
6907         // 3. (vector-aligned) patch_10_set and patch_10_vals.
6908         const uintptr_t mhcraw_word_ct = RoundUpPow2(2, kWordsPerVec) + GetMhcWordCt(raw_sample_ct);
6909         load_vblock_cacheline_ct += WordCtToCachelineCtU64(S_CAST(uint64_t, mhcraw_word_ct) * max_vblock_size);
6910       }
6911       if (read_phase_present) {
6912         // could make this bound tighter when lots of unphased variants are
6913         // mixed in among the phased variants, but this isn't nearly as
6914         // important as the analogous multiallelic optimization
6915 
6916         // phaseraw has three parts:
6917         // 1. het_ct as uint32_t, and explicit_phasepresent_ct as uint32_t.
6918         // 2. vec-aligned bitarray of up to (raw_sample_ct + 1) bits.  first
6919         //    bit is set iff phasepresent is explicitly stored at all (if not,
6920         //    all hets are assumed to be phased), if yes the remaining bits
6921         //    store packed phasepresent values for all hets, if no the
6922         //    remaining bits store packed phaseinfo values for all hets.
6923         // 3. word-aligned bitarray of up to raw_sample_ct bits, storing
6924         //    phaseinfo values.  (end of this array is vec-aligned.)
6925         const uintptr_t phaseraw_word_ct = (8 / kBytesPerWord) + kWordsPerVec + RoundDownPow2(raw_sample_ct / kBitsPerWordD2, kWordsPerVec);
6926         load_vblock_cacheline_ct += WordCtToCachelineCtU64(S_CAST(uint64_t, phaseraw_word_ct) * max_vblock_size);
6927       }
6928       if (read_dosage_present) {
6929         // biallelic dosageraw has two parts:
6930         // 1. vec-aligned bitarray of up to raw_sample_ct bits, storing which
6931         //    samples have dosages.
6932         // 2. word-aligned array of uint16s with 0..32768 fixed-point dosages.
6933         // dphaseraw has the same structure, with the uint16s replaced with an
6934         // int16 array of (left - right) values.
6935         const uintptr_t dosageraw_word_ct = kWordsPerVec * (BitCtToVecCt(raw_sample_ct) + DivUp(raw_sample_ct, kBytesPerVec / sizeof(Dosage)));
6936         load_vblock_cacheline_ct += WordCtToCachelineCtU64(dosageraw_word_ct * S_CAST(uint64_t, max_vblock_size)) * (1 + read_dphase_present);
6937       }
6938 
6939 #ifndef __LP64__
6940       if ((mpgw_per_thread_cacheline_ct > (0x7fffffff / kCacheline)) || (load_vblock_cacheline_ct > (0x7fffffff / kCacheline))) {
6941         goto MakePlink2NoVsort_fallback;
6942       }
6943 #endif
6944       uint32_t calc_thread_ct = DivUp(variant_ct, kPglVblockSize);
6945       if (calc_thread_ct >= max_thread_ct) {
6946         calc_thread_ct = (max_thread_ct > 2)? (max_thread_ct - 1) : max_thread_ct;
6947       }
6948       const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
6949       if (!new_sample_idx_to_old) {
6950         // hphase doesn't seem to affect read:write ratio much
6951 #ifdef USE_AVX2
6952         const uint32_t max_calc_thread_ct = 2;
6953 #else
6954         const uint32_t max_calc_thread_ct = 2 + subsetting_required;
6955 #endif
6956         if (calc_thread_ct > max_calc_thread_ct) {
6957           calc_thread_ct = max_calc_thread_ct;
6958         }
6959       }
6960       // this is frequently I/O-bound even when resorting, but I'll postpone
6961       // tuning thread count there
6962       mc.refalt1_select = refalt1_select;
6963       if (refalt1_select) {
6964         if (write_allele_idx_offsets) {
6965           // this will require write_mhc and an additional AlleleCode buffer
6966           logerrputs("Error: Multiallelic allele rotation is under development.\n");
6967           reterr = kPglRetNotYetSupported;
6968           goto MakePlink2NoVsort_ret_1;
6969         }
6970         if (variant_ct < raw_variant_ct) {
6971           // might want inner loop to map variant uidx -> idx instead
6972           STD_ARRAY_PTR_DECL(AlleleCode, 2, tmp_refalt1_select);
6973           if (BIGSTACK_ALLOC_STD_ARRAY(AlleleCode, 2, variant_ct, &tmp_refalt1_select)) {
6974             goto MakePlink2NoVsort_fallback;
6975           }
6976           uintptr_t variant_uidx_base = 0;
6977           uintptr_t cur_bits = variant_include[0];
6978           for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
6979             const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
6980             STD_ARRAY_COPY(refalt1_select[variant_uidx], 2, tmp_refalt1_select[variant_idx]);
6981           }
6982           mc.refalt1_select = tmp_refalt1_select;
6983         }
6984       }
6985       mpgwp = S_CAST(MTPgenWriter*, bigstack_alloc((calc_thread_ct + DivUp(sizeof(MTPgenWriter), kBytesPerWord)) * sizeof(intptr_t)));
6986       if (!mpgwp) {
6987         goto MakePlink2NoVsort_fallback;
6988       }
6989       mpgwp->pgen_outfile = nullptr;
6990       if (bigstack_alloc_wp(calc_thread_ct, &(ctx.loadbuf_thread_starts[0])) ||
6991           bigstack_alloc_wp(calc_thread_ct, &(ctx.loadbuf_thread_starts[1]))) {
6992         goto MakePlink2NoVsort_fallback;
6993       }
6994       uint32_t nonref_flags_storage = 3;
6995       uintptr_t* nonref_flags_write = pgfip->nonref_flags;
6996       if (!nonref_flags_write) {
6997         nonref_flags_storage = (pgfip->gflags & kfPgenGlobalAllNonref)? 2 : 1;
6998       } else if (variant_ct < raw_variant_ct) {
6999         const uint32_t write_variant_ctl = BitCtToWordCt(write_variant_ct);
7000         uintptr_t* old_nonref_flags = nonref_flags_write;
7001         if (bigstack_alloc_w(write_variant_ctl, &nonref_flags_write)) {
7002           goto MakePlink2NoVsort_fallback;
7003         }
7004         if (variant_ct == write_variant_ct) {
7005           CopyBitarrSubset(old_nonref_flags, variant_include, variant_ct, nonref_flags_write);
7006         } else {
7007           ZeroWArr(write_variant_ctl, nonref_flags_write);
7008           if (!write_allele_idx_offsets) {
7009             SplitNonrefFlags();
7010           } else {
7011             JoinNonrefFlags();
7012           }
7013         }
7014         if (nonref_flags_write[0] & 1) {
7015           if (AllBitsAreOne(nonref_flags_write, write_variant_ct)) {
7016             BigstackReset(nonref_flags_write);
7017             nonref_flags_write = nullptr;
7018             nonref_flags_storage = 2;
7019           }
7020         } else if (AllWordsAreZero(nonref_flags_write, write_variant_ctl)) {
7021           BigstackReset(nonref_flags_write);
7022           nonref_flags_write = nullptr;
7023           nonref_flags_storage = 1;
7024         }
7025       }
7026       ctx.pwcs = &(mpgwp->pwcs[0]);
7027       ctx.new_sample_idx_to_old = new_sample_idx_to_old;
7028       ctx.thread_write_genovecs = nullptr;
7029       ctx.thread_write_mhc = nullptr;
7030 
7031       // Each worker thread handles with 64k loaded variants at a time, while
7032       // the I/O thread loads the next (64k * thread_ct).
7033       uintptr_t other_per_thread_cacheline_ct = 2 * load_vblock_cacheline_ct;
7034 
7035       uint32_t write_mhc_needed = 0;
7036       if (new_sample_idx_to_old || subsetting_required) {
7037         if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_genovecs)) {
7038           goto MakePlink2NoVsort_fallback;
7039         }
7040         if (read_phase_present && new_sample_idx_to_old) {
7041           if (bigstack_alloc_u32(raw_sample_ct, &ctx.old_sample_idx_to_new)) {
7042             goto MakePlink2NoVsort_fallback;
7043           }
7044           for (uint32_t new_sample_idx = 0; new_sample_idx != sample_ct; ++new_sample_idx) {
7045             ctx.old_sample_idx_to_new[new_sample_idx_to_old[new_sample_idx]] = new_sample_idx;
7046           }
7047         }
7048         // ctx.thread_write_genovecs
7049         other_per_thread_cacheline_ct += NypCtToCachelineCt(sample_ct);
7050         write_mhc_needed = 1;
7051       }
7052       uintptr_t write_mhcraw_cacheline_ct = 0;
7053       if (write_mhc_needed) {
7054         if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_mhc)) {
7055           goto MakePlink2NoVsort_fallback;
7056         }
7057         // todo: refalt1_select
7058         const uintptr_t mhcwrite_word_ct = GetMhcWordCt(sample_ct);
7059         write_mhcraw_cacheline_ct = DivUp(mhcwrite_word_ct, kWordsPerCacheline);
7060         other_per_thread_cacheline_ct += write_mhcraw_cacheline_ct;
7061       }
7062       if ((write_mhc_needed || read_dosage_present) && new_sample_idx_to_old) {
7063         // ctx.thread_cumulative_popcount_bufs
7064         other_per_thread_cacheline_ct += Int32CtToCachelineCt(raw_sample_ctl);
7065         if (bigstack_alloc_u32p(calc_thread_ct, &ctx.thread_cumulative_popcount_bufs)) {
7066           goto MakePlink2NoVsort_fallback;
7067         }
7068       }
7069       ctx.thread_write_phasepresents = nullptr;
7070       ctx.thread_all_hets = nullptr;
7071       ctx.thread_write_dosagepresents = nullptr;
7072       ctx.thread_write_dphasepresents = nullptr;
7073       if (read_or_write_phase_present || read_or_write_dosage_present) {
7074         if (read_or_write_phase_present) {
7075           if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_phasepresents) ||
7076               bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_phaseinfos)) {
7077             goto MakePlink2NoVsort_fallback;
7078           }
7079           if (read_phase_present) {
7080             if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_all_hets)) {
7081               goto MakePlink2NoVsort_fallback;
7082             }
7083             other_per_thread_cacheline_ct += BitCtToCachelineCt(raw_sample_ct);
7084           }
7085           // phasepresent, phaseinfo
7086           other_per_thread_cacheline_ct += 2 * BitCtToCachelineCt(sample_ct);
7087         }
7088         if (read_or_write_dosage_present) {
7089           if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_dosagepresents) ||
7090               bigstack_alloc_dosagep(calc_thread_ct, &ctx.thread_write_dosagevals)) {
7091             goto MakePlink2NoVsort_fallback;
7092           }
7093           if (read_or_write_dphase_present) {
7094             if (bigstack_alloc_wp(calc_thread_ct, &ctx.thread_write_dphasepresents) ||
7095                 bigstack_alloc_dphasep(calc_thread_ct, &ctx.thread_write_dphasedeltas)) {
7096               goto MakePlink2NoVsort_fallback;
7097             }
7098           }
7099           // dosage_present, dphase_present
7100           other_per_thread_cacheline_ct += BitCtToCachelineCt(sample_ct) * (1 + read_or_write_dphase_present);
7101 
7102           // dosage_main, dphase_delta
7103           other_per_thread_cacheline_ct += DivUp(sample_ct, (kCacheline / sizeof(Dosage))) * (1 + 2 * read_or_write_dphase_present);
7104 
7105           // todo: multiallelic dosage
7106         }
7107       }
7108       if (read_or_write_phase_present || read_dosage_present || (read_gflags & kfPgenGlobalMultiallelicHardcallFound)) {
7109         // ctx.loaded_vrtypes
7110         other_per_thread_cacheline_ct += 2 * (kPglVblockSize / kCacheline);
7111       }
7112       const uintptr_t cachelines_avail = bigstack_left() / kCacheline;
7113       if (cachelines_avail < alloc_base_cacheline_ct + (mpgw_per_thread_cacheline_ct + other_per_thread_cacheline_ct) * calc_thread_ct) {
7114         if (cachelines_avail < alloc_base_cacheline_ct + mpgw_per_thread_cacheline_ct + other_per_thread_cacheline_ct) {
7115           goto MakePlink2NoVsort_fallback;
7116         }
7117         calc_thread_ct = (cachelines_avail - alloc_base_cacheline_ct) / (mpgw_per_thread_cacheline_ct + other_per_thread_cacheline_ct);
7118       }
7119       uintptr_t* main_loadbufs[2];
7120       main_loadbufs[0] = S_CAST(uintptr_t*, bigstack_alloc_raw(load_vblock_cacheline_ct * calc_thread_ct * kCacheline));
7121       main_loadbufs[1] = S_CAST(uintptr_t*, bigstack_alloc_raw(load_vblock_cacheline_ct * calc_thread_ct * kCacheline));
7122       ctx.loaded_vrtypes[0] = nullptr;
7123       ctx.loaded_vrtypes[1] = nullptr;
7124       if (read_or_write_phase_present || read_dosage_present || (read_gflags & kfPgenGlobalMultiallelicHardcallFound)) {
7125         ctx.loaded_vrtypes[0] = S_CAST(unsigned char*, bigstack_alloc_raw(kPglVblockSize * calc_thread_ct));
7126         ctx.loaded_vrtypes[1] = S_CAST(unsigned char*, bigstack_alloc_raw(kPglVblockSize * calc_thread_ct));
7127       }
7128       if (read_or_write_phase_present || read_or_write_dosage_present) {
7129         const uint32_t bitvec_writebuf_byte_ct = BitCtToCachelineCt(sample_ct) * kCacheline;
7130         const uintptr_t dosagevals_writebuf_byte_ct = DivUp(sample_ct, (kCacheline / 2)) * kCacheline;
7131         for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
7132           if (read_or_write_phase_present) {
7133             ctx.thread_write_phasepresents[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitvec_writebuf_byte_ct));
7134             ctx.thread_write_phaseinfos[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitvec_writebuf_byte_ct));
7135 
7136             if (read_phase_present) {
7137               ctx.thread_all_hets[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(BitCtToCachelineCt(raw_sample_ct) * kCacheline));
7138             }
7139           }
7140           if (read_or_write_dosage_present) {
7141             ctx.thread_write_dosagepresents[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitvec_writebuf_byte_ct));
7142             ctx.thread_write_dosagevals[tidx] = S_CAST(Dosage*, bigstack_alloc_raw(dosagevals_writebuf_byte_ct));
7143             if (read_or_write_dphase_present) {
7144               ctx.thread_write_dphasepresents[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(bitvec_writebuf_byte_ct));
7145               ctx.thread_write_dphasedeltas[tidx] = S_CAST(SDosage*, bigstack_alloc_raw(2 * dosagevals_writebuf_byte_ct));
7146             }
7147           }
7148         }
7149       }
7150       if (new_sample_idx_to_old || subsetting_required) {
7151         uintptr_t writebuf_byte_ct = input_biallelic? NypCtToByteCt(sample_ct) : (2 * sample_ct * sizeof(AlleleCode));
7152         writebuf_byte_ct = RoundUpPow2(writebuf_byte_ct, kCacheline);
7153         for (uint32_t tidx = 0; tidx != calc_thread_ct; ++tidx) {
7154           ctx.thread_write_genovecs[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(writebuf_byte_ct));
7155           if (write_mhc_needed) {
7156             ctx.thread_write_mhc[tidx] = S_CAST(uintptr_t*, bigstack_alloc_raw(write_mhcraw_cacheline_ct * kCacheline));
7157           }
7158           if ((write_mhc_needed || read_dosage_present) && new_sample_idx_to_old) {
7159             ctx.thread_cumulative_popcount_bufs[tidx] = S_CAST(uint32_t*, bigstack_alloc_raw(Int32CtToCachelineCt(raw_sample_ctl) * kCacheline));
7160           }
7161         }
7162       }
7163       snprintf(outname_end, kMaxOutfnameExtBlen, ".pgen");
7164       logprintfww5("Writing %s ... ", outname);
7165       fputs("0%", stdout);
7166       fflush(stdout);
7167       unsigned char* mpgw_alloc = S_CAST(unsigned char*, bigstack_alloc_raw((alloc_base_cacheline_ct + mpgw_per_thread_cacheline_ct * calc_thread_ct) * kCacheline));
7168       assert(g_bigstack_base <= g_bigstack_end);
7169       reterr = MpgwInitPhase2(outname, write_allele_idx_offsets, nonref_flags_write, variant_ct, sample_ct, write_gflags, nonref_flags_storage, vrec_len_byte_ct, vblock_cacheline_ct, calc_thread_ct, mpgw_alloc, mpgwp);
7170       if (unlikely(reterr)) {
7171         if (reterr == kPglRetOpenFail) {
7172           logputs("\n");
7173           logerrprintfww(kErrprintfFopen, outname, strerror(errno));
7174         }
7175         goto MakePlink2NoVsort_ret_1;
7176       }
7177       if (unlikely(SetThreadCt(calc_thread_ct, &tg))) {
7178         goto MakePlink2NoVsort_ret_NOMEM;
7179       }
7180       mc.sample_include = subsetting_required? sample_include : nullptr;
7181       ctx.mcp = &mc;
7182       ctx.spgwp = nullptr;
7183       ctx.write_reterr = kPglRetSuccess;
7184       SetThreadFuncAndData(MakePgenThread, &ctx, &tg);
7185 
7186       // Main workflow:
7187       // 1. Set n=0, load first calc_thread_ct * kPglVblockSize
7188       //    *post-filtering* variants.
7189       //    This doesn't play well with blockload when any variants are
7190       //    filtered out, so we don't use it.  (todo: look into special-casing
7191       //    variant_ct == raw_variant_ct.)
7192       //
7193       // 2. Spawn threads processing batch n
7194       // 3. Load batch (n+1) unless eof
7195       // 4. Join threads
7196       // 5. Flush results for batch n (must happen here since we aren't using
7197       //    two output buffers.  this may be a mistake, revisit this choice...)
7198       // 6. Increment n by 1
7199       // 7. Goto step 2 unless eof
7200       const uint32_t batch_ct_m1 = (variant_ct - 1) / (kPglVblockSize * calc_thread_ct);
7201       uint32_t pct = 0;
7202       uint32_t parity = 0;
7203       uint32_t read_batch_idx = 0;
7204       uint32_t cur_batch_size = kPglVblockSize * calc_thread_ct;
7205       uint32_t next_print_variant_idx = variant_ct / 100;
7206       uintptr_t read_variant_uidx_base = 0;
7207       uintptr_t cur_bits = variant_include[0];
7208       PgrClearLdCache(simple_pgrp);
7209       for (uint32_t write_idx_end = 0; ; ++read_batch_idx, write_idx_end += cur_batch_size) {
7210         if (read_batch_idx) {
7211           ctx.cur_block_write_ct = cur_batch_size;
7212           if (write_idx_end == variant_ct) {
7213             DeclareLastThreadBlock(&tg);
7214           }
7215           if (unlikely(SpawnThreads(&tg))) {
7216             goto MakePlink2NoVsort_ret_THREAD_CREATE_FAIL;
7217           }
7218         }
7219         if (!IsLastBlock(&tg)) {
7220           if (read_batch_idx == batch_ct_m1) {
7221             cur_batch_size = variant_ct - (read_batch_idx * kPglVblockSize * calc_thread_ct);
7222           }
7223           uintptr_t* cur_loadbuf = main_loadbufs[parity];
7224           uintptr_t* loadbuf_iter = cur_loadbuf;
7225           unsigned char* cur_loaded_vrtypes = ctx.loaded_vrtypes[parity];
7226           for (uint32_t uii = 0; uii != cur_batch_size; ++uii) {
7227             if (!(uii % kPglVblockSize)) {
7228               ctx.loadbuf_thread_starts[parity][uii / kPglVblockSize] = loadbuf_iter;
7229             }
7230             const uintptr_t read_variant_uidx = BitIter1(variant_include, &read_variant_uidx_base, &cur_bits);
7231             reterr = PgrGetRaw(read_variant_uidx, read_gflags, simple_pgrp, &loadbuf_iter, cur_loaded_vrtypes? (&(cur_loaded_vrtypes[uii])) : nullptr);
7232             if (unlikely(reterr)) {
7233               goto MakePlink2NoVsort_ret_PGR_FAIL;
7234             }
7235           }
7236         }
7237         if (read_batch_idx) {
7238           JoinThreads(&tg);
7239           reterr = ctx.write_reterr;
7240           if (unlikely(reterr)) {
7241             // only possible error is kPglRetVarRecordTooLarge?
7242             goto MakePlink2NoVsort_ret_1;
7243           }
7244         }
7245         parity = 1 - parity;
7246         if (write_idx_end) {
7247           reterr = MpgwFlush(mpgwp);
7248           if (unlikely(reterr)) {
7249             goto MakePlink2NoVsort_ret_WRITE_FAIL;
7250           }
7251           if (write_idx_end == variant_ct) {
7252             mpgwp = nullptr;
7253             break;
7254           }
7255           if (write_idx_end >= next_print_variant_idx) {
7256             if (pct > 10) {
7257               putc_unlocked('\b', stdout);
7258             }
7259             pct = (write_idx_end * 100LLU) / variant_ct;
7260             printf("\b\b%u%%", pct++);
7261             fflush(stdout);
7262             next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
7263           }
7264         }
7265       }
7266       if (pct > 10) {
7267         putc_unlocked('\b', stdout);
7268       }
7269       fputs("\b\b", stdout);
7270       logputs("done.\n");
7271       // BigstackReset(bigstack_mark);
7272     } else if (0) {
7273     MakePlink2NoVsort_fallback:
7274       g_failed_alloc_attempt_size = 0;
7275       mpgwp = nullptr;
7276       BigstackReset(bigstack_mark2);
7277       reterr = MakePgenRobust(sample_include, new_sample_idx_to_old, variant_include, allele_idx_offsets, allele_presents, refalt1_select, write_allele_idx_offsets, nullptr, ctx.sex_male_collapsed, ctx.sex_female_collapsed, raw_variant_ct, variant_ct, write_variant_ct, max_allele_ct, hard_call_thresh, dosage_erase_thresh, make_plink2_flags, &mc, simple_pgrp, outname, outname_end);
7278       if (unlikely(reterr)) {
7279         goto MakePlink2NoVsort_ret_1;
7280       }
7281       if (variant_ct != write_variant_ct) {
7282         logprintfww("Multiallelic %s: %u variant%s written.\n", (variant_ct < write_variant_ct)? "split" : "join", write_variant_ct, (write_variant_ct == 1)? "" : "s");
7283       }
7284     }
7285   }
7286   while (0) {
7287   MakePlink2NoVsort_ret_NOMEM:
7288     reterr = kPglRetNomem;
7289     break;
7290   MakePlink2NoVsort_ret_PGR_FAIL:
7291     PgenErrPrintN(reterr);
7292     break;
7293   MakePlink2NoVsort_ret_WRITE_FAIL:
7294     reterr = kPglRetWriteFail;
7295     break;
7296   MakePlink2NoVsort_ret_INCONSISTENT_INPUT:
7297     reterr = kPglRetInconsistentInput;
7298     break;
7299   MakePlink2NoVsort_ret_THREAD_CREATE_FAIL:
7300     reterr = kPglRetThreadCreateFail;
7301     break;
7302   }
7303  MakePlink2NoVsort_ret_1:
7304   CleanupMpgw(mpgwp, &reterr);
7305   CleanupThreads(&tg);
7306   fclose_cond(outfile);
7307   pgfip->block_base = nullptr;
7308   BigstackReset(bigstack_mark);
7309   return reterr;
7310 }
7311 
7312 
SortChr(const ChrInfo * cip,const uint32_t * chr_idx_to_size,uint32_t use_nsort,ChrInfo * write_cip)7313 BoolErr SortChr(const ChrInfo* cip, const uint32_t* chr_idx_to_size, uint32_t use_nsort, ChrInfo* write_cip) {
7314   // Finishes initialization of write_cip.  Assumes chr_fo_vidx_start is
7315   // allocated and initialized to all-bits-one, chr_file_order/chr_idx_to_foidx
7316   // are unallocated, and chr_ct is uninitialized.
7317   const uint32_t max_code = cip->max_code;
7318   const uint32_t chr_code_end = max_code + 1 + cip->name_ct;
7319   uint32_t new_chr_ct = 0;
7320   for (uint32_t chr_idx = 0; chr_idx != chr_code_end; ++chr_idx) {
7321     const uint32_t cur_chr_size = chr_idx_to_size[chr_idx];
7322     if (cur_chr_size) {
7323       ++new_chr_ct;
7324     }
7325   }
7326   // bugfix (25 Nov 2019): must add 1 for chr_fo_vidx_start
7327   if (bigstack_alloc_u32(new_chr_ct, &(write_cip->chr_file_order)) ||
7328       bigstack_alloc_u32(new_chr_ct + 1, &(write_cip->chr_fo_vidx_start))) {
7329     return 1;
7330   }
7331   write_cip->chr_ct = new_chr_ct;
7332   // now for the actual sorting.
7333   // autosomes and PAR1/X/PAR2/Y/XY/MT come first, then contig names.
7334   const uint32_t autosome_ct = cip->autosome_ct;
7335   const uint32_t xymt_ct = max_code - autosome_ct;
7336   const uint32_t autosome_ct_p1 = autosome_ct + 1;
7337 
7338   STD_ARRAY_KREF(uint32_t, kChrOffsetCt) xymt_codes = cip->xymt_codes;
7339   const uintptr_t xymt_idx_to_chr_sort_offset[kChrOffsetCt] = {1, 3, 4, 5, 0, 2};
7340 
7341   // chr_sort_idx in high bits, original chr_idx in low
7342   uint64_t* std_sortbuf;
7343   uint64_t* std_sortbuf_iter;
7344   if (bigstack_alloc_u64(max_code + 1, &std_sortbuf)) {
7345     return 1;
7346   }
7347   std_sortbuf_iter = std_sortbuf;
7348   for (uintptr_t chr_idx = 0; chr_idx <= autosome_ct; ++chr_idx) {
7349     if (chr_idx_to_size[chr_idx]) {
7350       *std_sortbuf_iter++ = chr_idx * 0x100000001LLU;
7351     }
7352   }
7353   for (uint32_t xymt_idx = 0; xymt_idx != xymt_ct; ++xymt_idx) {
7354     const uint32_t xymt_code = xymt_codes[xymt_idx];
7355     if (!IsI32Neg(xymt_code)) {
7356       if (chr_idx_to_size[xymt_idx + autosome_ct_p1]) {
7357         *std_sortbuf_iter++ = (S_CAST(uint64_t, xymt_idx_to_chr_sort_offset[xymt_idx] + autosome_ct_p1) << 32) | (xymt_idx + autosome_ct_p1);
7358       }
7359     }
7360   }
7361   const uint32_t std_sortbuf_len = std_sortbuf_iter - std_sortbuf;
7362   STD_SORT(std_sortbuf_len, u64cmp, std_sortbuf);
7363   uint32_t write_vidx = 0;
7364   write_cip->chr_fo_vidx_start[0] = 0;
7365   for (uint32_t new_chr_fo_idx = 0; new_chr_fo_idx != std_sortbuf_len; ++new_chr_fo_idx) {
7366     const uint64_t cur_entry = std_sortbuf[new_chr_fo_idx];
7367     const uintptr_t chr_idx = S_CAST(uint32_t, cur_entry);
7368     const uint32_t chr_size = chr_idx_to_size[chr_idx];
7369     write_cip->chr_file_order[new_chr_fo_idx] = chr_idx;
7370     write_vidx += chr_size;
7371     write_cip->chr_fo_vidx_start[new_chr_fo_idx + 1] = write_vidx;
7372     write_cip->chr_idx_to_foidx[chr_idx] = new_chr_fo_idx;
7373   }
7374 
7375   const uint32_t new_nonstd_ct = new_chr_ct - std_sortbuf_len;
7376   if (new_nonstd_ct) {
7377     StrSortIndexedDeref* nonstd_sort_buf = S_CAST(StrSortIndexedDeref*, bigstack_alloc_raw_rd(new_nonstd_ct * sizeof(StrSortIndexedDeref)));
7378     if (!nonstd_sort_buf) {
7379       return 1;
7380     }
7381     const char** nonstd_names = cip->nonstd_names;
7382     uint32_t str_idx = 0;
7383     for (uint32_t chr_idx = max_code + 1; chr_idx != chr_code_end; ++chr_idx) {
7384       if (chr_idx_to_size[chr_idx]) {
7385         nonstd_sort_buf[str_idx].strptr = nonstd_names[chr_idx];
7386         nonstd_sort_buf[str_idx].orig_idx = chr_idx;
7387         ++str_idx;
7388       }
7389     }
7390     assert(str_idx == new_nonstd_ct);
7391     // nonstd_names are not allocated in main workspace, so can't overread.
7392     StrptrArrSortMain(new_nonstd_ct, 0, use_nsort, nonstd_sort_buf);
7393     uint32_t new_chr_fo_idx = std_sortbuf_len;
7394     for (str_idx = 0; str_idx != new_nonstd_ct; ++str_idx, ++new_chr_fo_idx) {
7395       const uint32_t chr_idx = nonstd_sort_buf[str_idx].orig_idx;
7396       const uint32_t chr_size = chr_idx_to_size[chr_idx];
7397       write_cip->chr_file_order[new_chr_fo_idx] = chr_idx;
7398       write_vidx += chr_size;
7399       write_cip->chr_fo_vidx_start[new_chr_fo_idx + 1] = write_vidx;
7400       write_cip->chr_idx_to_foidx[chr_idx] = new_chr_fo_idx;
7401     }
7402   }
7403   BigstackReset(std_sortbuf);
7404   return 0;
7405 }
7406 
7407 // hybrid of WriteMapOrBim() and write_pvar_resorted()
7408 PglErr WriteBimResorted(const char* outname, const ChrInfo* write_cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const double* variant_cms, const uint32_t* new_variant_idx_to_old, uint32_t variant_ct, uint32_t max_allele_slen, uint32_t output_zst, uint32_t thread_ct) {
7409   // allele_presents must be nullptr unless we're trimming alt alleles
7410   unsigned char* bigstack_mark = g_bigstack_base;
7411   char* cswritep = nullptr;
7412   CompressStreamState css;
7413   PglErr reterr = kPglRetSuccess;
7414   PreinitCstream(&css);
7415   {
7416     const uint32_t max_chr_blen = GetMaxChrSlen(write_cip) + 1;
7417     // includes trailing tab
7418     char* chr_buf;
7419 
7420     if (unlikely(bigstack_alloc_c(max_chr_blen, &chr_buf))) {
7421       goto WriteBimResorted_ret_NOMEM;
7422     }
7423     const uintptr_t overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen;
7424     reterr = InitCstreamAlloc(outname, 0, output_zst, thread_ct, overflow_buf_size, &css, &cswritep);
7425     if (unlikely(reterr)) {
7426       goto WriteBimResorted_ret_1;
7427     }
7428 
7429     const char output_missing_geno_char = *g_output_missing_geno_ptr;
7430     uint32_t chr_fo_idx = UINT32_MAX;
7431     uint32_t chr_end = 0;
7432     uint32_t chr_buf_blen = 0;
7433     for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
7434       const uint32_t variant_uidx = new_variant_idx_to_old[variant_idx];
7435       if (variant_idx >= chr_end) {
7436         do {
7437           ++chr_fo_idx;
7438           chr_end = write_cip->chr_fo_vidx_start[chr_fo_idx + 1];
7439         } while (variant_idx >= chr_end);
7440         char* chr_name_end = chrtoa(write_cip, write_cip->chr_file_order[chr_fo_idx], chr_buf);
7441         *chr_name_end = '\t';
7442         chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
7443       }
7444       cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
7445       cswritep = strcpyax(cswritep, variant_ids[variant_uidx], '\t');
7446       if (!variant_cms) {
7447         *cswritep++ = '0';
7448       } else {
7449         cswritep = dtoa_g_p8(variant_cms[variant_uidx], cswritep);
7450       }
7451       *cswritep++ = '\t';
7452       cswritep = u32toa(variant_bps[variant_uidx], cswritep);
7453       *cswritep++ = '\t';
7454       const uintptr_t allele_idx_offset_base = allele_idx_offsets? allele_idx_offsets[variant_uidx] : (variant_uidx * 2);
7455       const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
7456       // note that VCF ref allele corresponds to A2, not A1
7457       if (!refalt1_select) {
7458         if ((!allele_presents) || IsSet(allele_presents, 1 + allele_idx_offset_base)) {
7459           cswritep = strcpya(cswritep, cur_alleles[1]);
7460         } else {
7461           *cswritep++ = output_missing_geno_char;
7462         }
7463         *cswritep++ = '\t';
7464         cswritep = strcpya(cswritep, cur_alleles[0]);
7465       } else {
7466         STD_ARRAY_KREF(AlleleCode, 2) cur_refalt1_select = refalt1_select[variant_uidx];
7467         if ((!allele_presents) || IsSet(allele_presents, cur_refalt1_select[1] + allele_idx_offset_base)) {
7468           cswritep = strcpya(cswritep, cur_alleles[cur_refalt1_select[1]]);
7469         } else {
7470           *cswritep++ = output_missing_geno_char;
7471         }
7472         *cswritep++ = '\t';
7473         cswritep = strcpya(cswritep, cur_alleles[cur_refalt1_select[0]]);
7474       }
7475       AppendBinaryEoln(&cswritep);
7476       if (unlikely(Cswrite(&css, &cswritep))) {
7477         goto WriteBimResorted_ret_WRITE_FAIL;
7478       }
7479     }
7480     if (unlikely(CswriteCloseNull(&css, cswritep))) {
7481       goto WriteBimResorted_ret_WRITE_FAIL;
7482     }
7483   }
7484   while (0) {
7485   WriteBimResorted_ret_NOMEM:
7486     reterr = kPglRetNomem;
7487     break;
7488   WriteBimResorted_ret_WRITE_FAIL:
7489     reterr = kPglRetWriteFail;
7490     break;
7491   }
7492  WriteBimResorted_ret_1:
7493   CswriteCloseCond(&css, cswritep);
7494   BigstackReset(bigstack_mark);
7495   return reterr;
7496 }
7497 
PvarInfoReloadInterval(const uint32_t * old_variant_uidx_to_new,uint32_t variant_idx_start,uint32_t variant_idx_end,TextStream * pvar_reload_txsp,char ** pvar_info_strs)7498 PglErr PvarInfoReloadInterval(const uint32_t* old_variant_uidx_to_new, uint32_t variant_idx_start, uint32_t variant_idx_end, TextStream* pvar_reload_txsp, char** pvar_info_strs) {
7499   // We assume the batch size was chosen such that there's no risk of
7500   // scribbling past g_bigstack_end (barring pathological cases like another
7501   // process modifying the .pvar file after initial load).
7502   // We also assume no more dynamic allocations are needed after this;
7503   // otherwise, str_store_iter should be returned.
7504   char* line_iter;
7505   // probable todo: avoid rewind when one batch is entirely after the previous
7506   // batch (this is likely when input was already almost-sorted, and just a few
7507   // coordinates changed due to e.g. --normalize)
7508   PglErr reterr = TextRewind(pvar_reload_txsp);
7509   if (unlikely(reterr)) {
7510     return reterr;
7511   }
7512   const uint32_t cur_batch_size = variant_idx_end - variant_idx_start;
7513   char* str_store_iter = R_CAST(char*, g_bigstack_base);
7514   uint32_t info_col_idx;
7515   reterr = PvarInfoReloadHeader(pvar_reload_txsp, &line_iter, &info_col_idx);
7516   if (unlikely(reterr)) {
7517     return reterr;
7518   }
7519   uint32_t variant_idx = 0;
7520   for (uint32_t variant_uidx = 0; ; ++variant_uidx) {
7521     reterr = TextNextLineLstrip(pvar_reload_txsp, &line_iter);
7522     if (unlikely(reterr)) {
7523       return reterr;
7524     }
7525     const uint32_t new_variant_idx_offset = old_variant_uidx_to_new[variant_uidx] - variant_idx_start;
7526     // exploit wraparound, UINT32_MAX null value
7527     if (new_variant_idx_offset >= cur_batch_size) {
7528       continue;
7529     }
7530     line_iter = NextTokenMultFar(line_iter, info_col_idx);
7531     if (!line_iter) {
7532       return kPglRetRewindFail;
7533     }
7534     char* info_end = CurTokenEnd(line_iter);
7535     const uint32_t info_slen = info_end - line_iter;
7536     pvar_info_strs[new_variant_idx_offset] = str_store_iter;
7537     str_store_iter = memcpyax(str_store_iter, line_iter, info_slen, '\0');
7538     line_iter = info_end;
7539     if (++variant_idx == cur_batch_size) {
7540       break;
7541     }
7542   }
7543   assert(str_store_iter <= R_CAST(char*, g_bigstack_end));
7544   return kPglRetSuccess;
7545 }
7546 
7547 // could be BoolErr
7548 PglErr WritePvarResortedInterval(const ChrInfo* write_cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const double* variant_cms, const uint32_t* new_variant_idx_to_old, uint32_t variant_idx_start, uint32_t variant_idx_end, uint32_t info_pr_flag_present, uint32_t write_qual, uint32_t write_filter, uint32_t write_info, uint32_t all_nonref, uint32_t write_cm, char** pvar_info_strs, CompressStreamState* cssp, char** cswritepp, uint32_t* chr_fo_idxp, uint32_t* chr_endp, uint32_t* chr_buf_blenp, char* chr_buf) {
7549   char* cswritep = *cswritepp;
7550   uint32_t chr_fo_idx = *chr_fo_idxp;
7551   uint32_t chr_end = *chr_endp;
7552   uint32_t chr_buf_blen = *chr_buf_blenp;
7553   PglErr reterr = kPglRetSuccess;
7554   {
7555     const char output_missing_geno_char = *g_output_missing_geno_ptr;
7556     uint32_t ref_allele_idx = 0;
7557     uint32_t alt1_allele_idx = 1;
7558     uint32_t cur_allele_ct = 2;
7559     for (uint32_t variant_idx = variant_idx_start; variant_idx != variant_idx_end; ++variant_idx) {
7560       const uint32_t variant_uidx = new_variant_idx_to_old[variant_idx];
7561       if (variant_idx == chr_end) {
7562         ++chr_fo_idx;
7563         chr_end = write_cip->chr_fo_vidx_start[chr_fo_idx + 1];
7564         assert(variant_idx < chr_end);
7565         char* chr_name_end = chrtoa(write_cip, write_cip->chr_file_order[chr_fo_idx], chr_buf);
7566         *chr_name_end = '\t';
7567         chr_buf_blen = 1 + S_CAST(uintptr_t, chr_name_end - chr_buf);
7568       }
7569       cswritep = memcpya(cswritep, chr_buf, chr_buf_blen);
7570       cswritep = u32toa_x(variant_bps[variant_uidx], '\t', cswritep);
7571       cswritep = strcpyax(cswritep, variant_ids[variant_uidx], '\t');
7572       uintptr_t allele_idx_offset_base;
7573       if (!allele_idx_offsets) {
7574         allele_idx_offset_base = variant_uidx * 2;
7575       } else {
7576         allele_idx_offset_base = allele_idx_offsets[variant_uidx];
7577         cur_allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
7578       }
7579       const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
7580       if (refalt1_select) {
7581         ref_allele_idx = refalt1_select[variant_uidx][0];
7582         alt1_allele_idx = refalt1_select[variant_uidx][1];
7583       }
7584       cswritep = strcpyax(cswritep, cur_alleles[ref_allele_idx], '\t');
7585       uint32_t alt_allele_written = 0;
7586       if ((!allele_presents) || IsSet(allele_presents, allele_idx_offset_base + alt1_allele_idx)) {
7587         cswritep = strcpya(cswritep, cur_alleles[alt1_allele_idx]);
7588         alt_allele_written = 1;
7589       }
7590       if (unlikely(Cswrite(cssp, &cswritep))) {
7591         goto WritePvarResortedInterval_ret_WRITE_FAIL;
7592       }
7593       if (cur_allele_ct > 2) {
7594         for (uint32_t allele_idx = 0; allele_idx != cur_allele_ct; ++allele_idx) {
7595           if ((allele_idx == ref_allele_idx) || (allele_idx == alt1_allele_idx) || (allele_presents && (!IsSet(allele_presents, allele_idx_offset_base + allele_idx)))) {
7596             continue;
7597           }
7598           if (alt_allele_written) {
7599             *cswritep++ = ',';
7600           }
7601           alt_allele_written = 1;
7602           cswritep = strcpya(cswritep, cur_alleles[allele_idx]);
7603           if (unlikely(Cswrite(cssp, &cswritep))) {
7604             goto WritePvarResortedInterval_ret_WRITE_FAIL;
7605           }
7606         }
7607       }
7608       if (!alt_allele_written) {
7609         *cswritep++ = output_missing_geno_char;
7610       }
7611 
7612       if (write_qual) {
7613         *cswritep++ = '\t';
7614         if (!IsSet(qual_present, variant_uidx)) {
7615           *cswritep++ = '.';
7616         } else {
7617           cswritep = ftoa_g(quals[variant_uidx], cswritep);
7618         }
7619       }
7620 
7621       if (write_filter) {
7622         *cswritep++ = '\t';
7623         if (!IsSet(filter_present, variant_uidx)) {
7624           *cswritep++ = '.';
7625         } else if (!IsSet(filter_npass, variant_uidx)) {
7626           cswritep = strcpya_k(cswritep, "PASS");
7627         } else {
7628           cswritep = strcpya(cswritep, filter_storage[variant_uidx]);
7629         }
7630       }
7631 
7632       if (write_info) {
7633         *cswritep++ = '\t';
7634         const uint32_t is_pr = all_nonref || (nonref_flags && IsSet(nonref_flags, variant_uidx));
7635         if (pvar_info_strs) {
7636           PvarInfoWrite(info_pr_flag_present, is_pr, pvar_info_strs[variant_idx - variant_idx_start], &cswritep);
7637         } else {
7638           if (is_pr) {
7639             cswritep = strcpya_k(cswritep, "PR");
7640           } else {
7641             *cswritep++ = '.';
7642           }
7643         }
7644       }
7645 
7646       if (write_cm) {
7647         *cswritep++ = '\t';
7648         if (!variant_cms) {
7649           *cswritep++ = '0';
7650         } else {
7651           cswritep = dtoa_g_p8(variant_cms[variant_uidx], cswritep);
7652         }
7653       }
7654       AppendBinaryEoln(&cswritep);
7655     }
7656 
7657   }
7658   while (0) {
7659   WritePvarResortedInterval_ret_WRITE_FAIL:
7660     reterr = kPglRetWriteFail;
7661     break;
7662   }
7663   *cswritepp = cswritep;
7664   *chr_fo_idxp = chr_fo_idx;
7665   *chr_endp = chr_end;
7666   *chr_buf_blenp = chr_buf_blen;
7667   return reterr;
7668 }
7669 
7670 // allele_presents must be nullptr unless we're trimming alt alleles.
7671 //
7672 // The annoying part of this is handling a sequence of INFO strings that don't
7673 // fit in memory; we use a multipass approach for that.  File creation,
7674 // allocation of buffers, and generating the header line occurs directly in
7675 // this function, while loading the next pvar_info_strs batch and writing the
7676 // next .pvar line batch are one level down.
7677 PglErr WritePvarResorted(const char* outname, const uintptr_t* variant_include, const ChrInfo* write_cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* qual_present, const float* quals, const uintptr_t* filter_present, const uintptr_t* filter_npass, const char* const* filter_storage, const uintptr_t* nonref_flags, const char* pvar_info_reload, const double* variant_cms, const uint32_t* new_variant_idx_to_old, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_slen, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t nonref_flags_storage, uint32_t max_filter_slen, uint32_t info_reload_slen, PvarPsamFlags pvar_psam_flags, uint32_t thread_ct, char* xheader) {
7678   unsigned char* bigstack_mark = g_bigstack_base;
7679   char* cswritep = nullptr;
7680   PglErr reterr = kPglRetSuccess;
7681   CompressStreamState css;
7682   TextStream pvar_reload_txs;
7683   PreinitCstream(&css);
7684   PreinitTextStream(&pvar_reload_txs);
7685   {
7686     const uint32_t max_chr_blen = GetMaxChrSlen(write_cip) + 1;
7687     // includes trailing tab
7688     char* chr_buf;
7689 
7690     if (unlikely(
7691                  bigstack_alloc_c(max_chr_blen, &chr_buf))) {
7692       goto WritePvarResorted_ret_NOMEM;
7693     }
7694     uintptr_t overflow_buf_size = kCompressStreamBlock + kMaxIdSlen + 512 + 2 * max_allele_slen + max_filter_slen + info_reload_slen;
7695     if (overflow_buf_size < 2 * kCompressStreamBlock) {
7696       overflow_buf_size = 2 * kCompressStreamBlock;
7697     }
7698     const uint32_t output_zst = (pvar_psam_flags / kfPvarZs) & 1;
7699     reterr = InitCstreamAlloc(outname, 0, output_zst, thread_ct, overflow_buf_size, &css, &cswritep);
7700     if (unlikely(reterr)) {
7701       goto WritePvarResorted_ret_1;
7702     }
7703     const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
7704     const uint32_t all_nonref = (nonref_flags_storage == 2);
7705     uint32_t write_info_pr = all_nonref;
7706     uint32_t write_info = (pvar_psam_flags & kfPvarColInfo) || pvar_info_reload;
7707     if (write_info && nonref_flags) {
7708       write_info_pr = !IntersectionIsEmpty(variant_include, nonref_flags, raw_variant_ctl);
7709     }
7710     write_info_pr = write_info_pr && write_info;
7711     if (unlikely(write_info_pr && (info_flags & kfInfoPrNonflagPresent))) {
7712       logputs("\n");
7713       logerrputs("Error: Conflicting INFO:PR definitions.  Either fix all REF alleles so that the\n'provisional reference' flag is no longer needed, or remove/rename the other\nuse of the INFO:PR key.\n");
7714       goto WritePvarResorted_ret_INCONSISTENT_INPUT;
7715     }
7716 
7717     uint32_t write_filter = 0;
7718     if (pvar_psam_flags & kfPvarColFilter) {
7719       write_filter = 1;
7720     } else if ((pvar_psam_flags & kfPvarColMaybefilter) && filter_present) {
7721       write_filter = !IntersectionIsEmpty(variant_include, filter_present, raw_variant_ctl);
7722     }
7723     const uint32_t info_pr_flag_present = (info_flags / kfInfoPrFlagPresent) & 1;
7724     if (pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) {
7725       reterr = PvarXheaderWrite(nullptr, write_cip, variant_bps, allele_idx_offsets, allele_storage, new_variant_idx_to_old, xheader_blen, (pvar_psam_flags / kfPvarColVcfheader) & 1, write_filter, write_info, write_info_pr && (!info_pr_flag_present), max_allele_slen, kfUnsortedVar0, xheader, &css, &cswritep);
7726       if (unlikely(reterr)) {
7727         goto WritePvarResorted_ret_1;
7728       }
7729     }
7730     if (write_cip->chrset_source) {
7731       AppendChrsetLine(write_cip, &cswritep);
7732     }
7733     cswritep = strcpya_k(cswritep, "#CHROM\tPOS\tID\tREF\tALT");
7734 
7735     uint32_t write_qual = 0;
7736     if (pvar_psam_flags & kfPvarColQual) {
7737       write_qual = 1;
7738     } else if ((pvar_psam_flags & kfPvarColMaybequal) && qual_present) {
7739       write_qual = !IntersectionIsEmpty(variant_include, qual_present, raw_variant_ctl);
7740     }
7741     if (write_qual) {
7742       cswritep = strcpya_k(cswritep, "\tQUAL");
7743     }
7744 
7745     if (write_filter) {
7746       cswritep = strcpya_k(cswritep, "\tFILTER");
7747     }
7748 
7749     if (write_info) {
7750       cswritep = strcpya_k(cswritep, "\tINFO");
7751     }
7752 
7753     uint32_t write_cm = 0;
7754     if (pvar_psam_flags & kfPvarColCm) {
7755       write_cm = 1;
7756     } else if ((pvar_psam_flags & kfPvarColMaybecm) && variant_cms) {
7757       if (raw_variant_ct == variant_ct) {
7758         // nonzero_cm_present check was performed
7759         write_cm = 1;
7760       } else {
7761         uintptr_t variant_uidx_base = 0;
7762         uintptr_t cur_bits = variant_include[0];
7763         for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
7764           const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
7765           if (variant_cms[variant_uidx] != 0.0) {
7766             write_cm = 1;
7767             break;
7768           }
7769         }
7770       }
7771     }
7772     if (write_cm) {
7773       cswritep = strcpya_k(cswritep, "\tCM");
7774     }
7775     AppendBinaryEoln(&cswritep);
7776 
7777     uint32_t* old_variant_uidx_to_new = nullptr;
7778     char** pvar_info_strs = nullptr;
7779     uint32_t batch_size = variant_ct;
7780     uint32_t batch_ct = 1;
7781     if (pvar_info_reload) {
7782       if (unlikely(bigstack_alloc_u32(raw_variant_ct, &old_variant_uidx_to_new))) {
7783         goto WritePvarResorted_ret_NOMEM;
7784       }
7785       SetAllU32Arr(raw_variant_ct, old_variant_uidx_to_new);
7786       for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
7787         const uint32_t old_variant_uidx = new_variant_idx_to_old[variant_idx];
7788         old_variant_uidx_to_new[old_variant_uidx] = variant_idx;
7789       }
7790 
7791       uint32_t decompress_thread_ct = 1;
7792       if (!output_zst) {
7793         decompress_thread_ct = thread_ct - 1;
7794         if (!decompress_thread_ct) {
7795           decompress_thread_ct = 1;
7796         }
7797       }
7798       reterr = SizeAndInitTextStream(pvar_info_reload, bigstack_left() / 4, decompress_thread_ct, &pvar_reload_txs);
7799       if (unlikely(reterr)) {
7800         goto WritePvarResorted_ret_TSTREAM_FAIL;
7801       }
7802 
7803       // subtract kCacheline to allow for rounding
7804       uintptr_t bytes_left = bigstack_left() - kCacheline;
7805       uint32_t single_variant_byte_ct = info_reload_slen + 1 + sizeof(intptr_t);
7806       if (variant_ct * single_variant_byte_ct > bytes_left) {
7807         batch_size = bytes_left / single_variant_byte_ct;
7808         batch_ct = 1 + (variant_ct - 1) / batch_size;
7809       }
7810       pvar_info_strs = S_CAST(char**, bigstack_alloc_raw_rd(batch_size * sizeof(intptr_t)));
7811     }
7812 
7813     uint32_t variant_idx_start = 0;
7814     uint32_t chr_fo_idx = UINT32_MAX;
7815     uint32_t chr_end = 0;
7816     uint32_t chr_buf_blen = 0;
7817     uint32_t pct = 0;
7818     uint32_t next_print_variant_idx = variant_ct / 100;
7819     fputs("0%", stdout);
7820     fflush(stdout);
7821     for (uint32_t batch_idx = 0; batch_idx != batch_ct; ++batch_idx) {
7822       if (variant_idx_start >= next_print_variant_idx) {
7823         if (pct > 10) {
7824           putc_unlocked('\b', stdout);
7825         }
7826         pct = (variant_idx_start * 100LLU) / variant_ct;
7827         printf("\b\b%u%%", pct++);
7828         fflush(stdout);
7829         next_print_variant_idx = (pct * S_CAST(uint64_t, variant_ct)) / 100;
7830       }
7831       uint32_t variant_idx_end = MINV(variant_idx_start + batch_size, variant_ct);
7832       if (pvar_info_reload) {
7833         reterr = PvarInfoReloadInterval(old_variant_uidx_to_new, variant_idx_start, variant_idx_end, &pvar_reload_txs, pvar_info_strs);
7834         if (unlikely(reterr)) {
7835           goto WritePvarResorted_ret_TSTREAM_FAIL;
7836         }
7837       }
7838       reterr = WritePvarResortedInterval(write_cip, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, qual_present, quals, filter_present, filter_npass, filter_storage, nonref_flags, variant_cms, new_variant_idx_to_old, variant_idx_start, variant_idx_end, info_pr_flag_present, write_qual, write_filter, write_info, all_nonref, write_cm, pvar_info_strs, &css, &cswritep, &chr_fo_idx, &chr_end, &chr_buf_blen, chr_buf);
7839       if (unlikely(reterr)) {
7840         goto WritePvarResorted_ret_1;
7841       }
7842       variant_idx_start = variant_idx_end;
7843     }
7844 
7845     if (unlikely(CswriteCloseNull(&css, cswritep))) {
7846       goto WritePvarResorted_ret_WRITE_FAIL;
7847     }
7848     if (pct > 10) {
7849       putc_unlocked('\b', stdout);
7850     }
7851     fputs("\b\b", stdout);
7852   }
7853   while (0) {
7854   WritePvarResorted_ret_NOMEM:
7855     reterr = kPglRetNomem;
7856     break;
7857   WritePvarResorted_ret_TSTREAM_FAIL:
7858     TextStreamErrPrint(pvar_info_reload, &pvar_reload_txs);
7859     break;
7860   WritePvarResorted_ret_WRITE_FAIL:
7861     reterr = kPglRetWriteFail;
7862     break;
7863   WritePvarResorted_ret_INCONSISTENT_INPUT:
7864     reterr = kPglRetInconsistentInput;
7865     break;
7866   }
7867  WritePvarResorted_ret_1:
7868   CswriteCloseCond(&css, cswritep);
7869   CleanupTextStream2(pvar_info_reload, &pvar_reload_txs, &reterr);
7870   BigstackReset(bigstack_mark);
7871   return reterr;
7872 }
7873 
// Writes a variant-sorted copy of the loaded dataset.
//
// Steps, in order:
// 1. Builds write_chr_info: a shallow copy of *cip whose chr_mask and
//    chr_idx_to_foidx are freshly allocated here, and hands it, together with
//    the per-chromosome counts of included variants, to SortChr().
// 2. Places one (bp << 32) | variant_uidx entry per included variant into its
//    chromosome's slice of pos_vidx_sort_buf, sorts each slice, and re-sorts
//    every run of equal-bp entries by variant ID (use_nsort is forwarded to
//    StrptrArrSortMain()).  The resulting order is stored in
//    new_variant_idx_to_old[].
// 3. Depending on make_plink2_flags, writes .bim, .pvar, .fam, and .psam
//    files, then calls MakePgenRobust() if kfMakeBed or kfMakePgen is set.
//
// chr_idxs: when non-null, chr_idxs[variant_uidx] is used as each variant's
//   chromosome code; otherwise chromosome membership is derived from
//   cip->chr_file_order and cip->chr_fo_vidx_start.
// outname, outname_end: output filename buffer and the position in it where
//   each file extension is written.
//
// Returns kPglRetNomem on a failed allocation (or when a run of equal-bp
// variants does not fit in the remaining bigstack space),
// kPglRetNotYetSupported if kfMakePlink2MMask is set when the .bed/.pgen stage
// is reached, or the first error returned by one of the writers.  The bigstack
// marks saved on entry are passed to BigstackDoubleReset() before returning.
PglErr MakePlink2Vsort(const uintptr_t* sample_include, const PedigreeIdInfo* piip, const uintptr_t* sex_nm, const uintptr_t* sex_male, const PhenoCol* pheno_cols, const char* pheno_names, const uint32_t* new_sample_idx_to_old, const uintptr_t* variant_include, const ChrInfo* cip, const uint32_t* variant_bps, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const uintptr_t* allele_presents, const STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), const uintptr_t* pvar_qual_present, const float* pvar_quals, const uintptr_t* pvar_filter_present, const uintptr_t* pvar_filter_npass, const char* const* pvar_filter_storage, const char* pvar_info_reload, const double* variant_cms, const ChrIdx* chr_idxs, uintptr_t xheader_blen, InfoFlags info_flags, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t pheno_ct, uintptr_t max_pheno_name_blen, uint32_t raw_variant_ct, uint32_t variant_ct, uint32_t max_allele_ct, uint32_t max_allele_slen, uint32_t max_filter_slen, uint32_t info_reload_slen, uint32_t max_thread_ct, uint32_t hard_call_thresh, uint32_t dosage_erase_thresh, MakePlink2Flags make_plink2_flags, uint32_t use_nsort, PvarPsamFlags pvar_psam_flags, char* xheader, PgenReader* simple_pgrp, char* outname, char* outname_end) {
  // Both ends of bigstack are used below (chr_idx_to_size comes from a
  // bigstack_end_* allocator), so both marks are saved.
  unsigned char* bigstack_mark = g_bigstack_base;
  unsigned char* bigstack_end_mark = g_bigstack_end;
  PglErr reterr = kPglRetSuccess;
  {
    // Resort the variants.
    // 1. (todo) Apply --update-chr if necessary.
    // 2. Count number of remaining variants in each chromosome, then sort the
    //    chromosomes.
    // 3. Within each chromosome, sort by position.  Could add 0.5 for
    //    non-SNPs (not currently implemented)?  Could multithread this by
    //    chromosome, and/or use C++17 multithreaded sort, but INFO-reload is a
    //    much bigger bottleneck in practice.
    // 4. Scan for position ties, sort on ID (according to --sort-vars setting,
    //    defaults to natural-sort but can be ASCII).
    // 5. Fill new_variant_idx_to_old, free sort buffers.

    // possible todo: put this in a "copy constructor" function
    ChrInfo write_chr_info;

    // Shallow copy: the pointer members assigned here keep pointing at cip's
    // data.  chr_mask and chr_idx_to_foidx are allocated separately below.
    // NOTE(review): chr_file_order, chr_fo_vidx_start and chr_ct are read
    // later but never assigned in this function; presumably SortChr() fills
    // them in -- confirm.
    write_chr_info.haploid_mask = K_CAST(uintptr_t*, cip->haploid_mask);
    write_chr_info.nonstd_names = K_CAST(const char**, cip->nonstd_names);
    write_chr_info.nonstd_id_htable = K_CAST(uint32_t*, cip->nonstd_id_htable);
    write_chr_info.chrset_source = cip->chrset_source;
    memcpy(write_chr_info.chr_exclude, cip->chr_exclude, kChrExcludeWords * sizeof(intptr_t));
    STD_ARRAY_COPY(cip->xymt_codes, kChrOffsetCt, write_chr_info.xymt_codes);
    write_chr_info.max_numeric_code = cip->max_numeric_code;
    write_chr_info.max_code = cip->max_code;
    write_chr_info.autosome_ct = cip->autosome_ct;
    write_chr_info.zero_extra_chrs = cip->zero_extra_chrs;
    write_chr_info.name_ct = cip->name_ct;
    write_chr_info.incl_excl_name_stack = K_CAST(LlStr*, cip->incl_excl_name_stack);
    write_chr_info.is_include_stack = cip->is_include_stack;
    write_chr_info.output_encoding = cip->output_encoding;

    // One past the largest chromosome code; sizes every per-chromosome-code
    // array below.
    const uint32_t chr_code_end = cip->max_code + 1 + cip->name_ct;
    // chr_idx_to_size[chr_idx] = number of included variants on that
    // chromosome.  Zero-initialized by the calloc variant.
    uint32_t* chr_idx_to_size;
    if (unlikely(
            bigstack_calloc_w(kChrMaskWords, &write_chr_info.chr_mask) ||
            bigstack_alloc_u32(chr_code_end, &write_chr_info.chr_idx_to_foidx) ||
            bigstack_end_calloc_u32(chr_code_end, &chr_idx_to_size))) {
      goto MakePlink2Vsort_ret_NOMEM;
    }
    SetAllU32Arr(chr_code_end, write_chr_info.chr_idx_to_foidx);
    if (chr_idxs) {
      // Per-variant chromosome codes are available: count by iterating over
      // the set bits of variant_include.
      uintptr_t variant_uidx_base = 0;
      uintptr_t cur_base = variant_include[0];
      for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
        const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_base);
        chr_idx_to_size[chr_idxs[variant_uidx]] += 1;
      }
      // Only chromosomes with at least one remaining variant go in chr_mask.
      for (uint32_t chr_idx = 0; chr_idx != chr_code_end; ++chr_idx) {
        if (chr_idx_to_size[chr_idx]) {
          SetBit(chr_idx, write_chr_info.chr_mask);
        }
      }
      // bugfix: chr_file_order is invalid
    } else {
      // Count included variants in each chromosome's
      // [chr_fo_vidx_start[i], chr_fo_vidx_start[i + 1]) range.
      const uint32_t* chr_fo_vidx_start = cip->chr_fo_vidx_start;
      const uint32_t orig_chr_ct = cip->chr_ct;
      uint32_t vidx_start = 0;
      for (uint32_t chr_fo_idx = 0; chr_fo_idx != orig_chr_ct; ++chr_fo_idx) {
        const uint32_t vidx_end = chr_fo_vidx_start[chr_fo_idx + 1];
        const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
        chr_idx_to_size[chr_idx] = PopcountBitRange(variant_include, vidx_start, vidx_end);
        if (chr_idx_to_size[chr_idx]) {
          SetBit(chr_idx, write_chr_info.chr_mask);
        }
        vidx_start = vidx_end;
      }
    }
    // Any SortChr() failure is reported as out-of-memory.
    if (unlikely(SortChr(cip, chr_idx_to_size, use_nsort, &write_chr_info))) {
      goto MakePlink2Vsort_ret_NOMEM;
    }

    uint32_t* new_variant_idx_to_old;

    // pos_vidx_sort_buf has variant_bp in high bits, variant_uidx in low
    uint64_t* pos_vidx_sort_buf;
    // new_variant_idx_to_old must be allocated before pos_vidx_sort_buf: the
    // sort buffer is released with BigstackReset(pos_vidx_sort_buf) further
    // down while new_variant_idx_to_old stays live.
    if (unlikely(
            bigstack_alloc_u32(variant_ct, &new_variant_idx_to_old) ||
            bigstack_alloc_u64(variant_ct + 1, &pos_vidx_sort_buf))) {
      goto MakePlink2Vsort_ret_NOMEM;
    }
    // Extra trailing element; see the per-chromosome sentinel handling in the
    // sort loop below.
    pos_vidx_sort_buf[variant_ct] = ~0LLU;
    const uint32_t new_chr_ct = write_chr_info.chr_ct;
    if (chr_idxs) {
      // next_write_vidxs[chr_idx] = next unfilled slot in that chromosome's
      // slice of pos_vidx_sort_buf.  Only entries for chromosomes present in
      // write_chr_info.chr_file_order are initialized.
      uint32_t* next_write_vidxs;
      if (unlikely(bigstack_alloc_u32(chr_code_end, &next_write_vidxs))) {
        goto MakePlink2Vsort_ret_NOMEM;
      }
      for (uint32_t new_chr_fo_idx = 0; new_chr_fo_idx != new_chr_ct; ++new_chr_fo_idx) {
        const uint32_t chr_idx = write_chr_info.chr_file_order[new_chr_fo_idx];
        next_write_vidxs[chr_idx] = write_chr_info.chr_fo_vidx_start[new_chr_fo_idx];
      }
      uintptr_t variant_uidx_base = 0;
      uintptr_t cur_bits = variant_include[0];
      for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx) {
        const uintptr_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
        const uint32_t chr_idx = chr_idxs[variant_uidx];
        const uint32_t write_vidx = next_write_vidxs[chr_idx];
        pos_vidx_sort_buf[write_vidx] = (S_CAST(uint64_t, variant_bps[variant_uidx]) << 32) | variant_uidx;
        next_write_vidxs[chr_idx] += 1;
      }
      BigstackReset(next_write_vidxs);
    } else {
      // Walk the included variants in original order.  Whenever a variant
      // lies past the current original chromosome's end, advance to the
      // chromosome containing it and jump write_vidx to the start of that
      // chromosome's slice in the new order.
      uint32_t old_chr_fo_idx = UINT32_MAX;
      uint32_t chr_end = 0;
      uintptr_t variant_uidx_base = 0;
      uintptr_t cur_bits = variant_include[0];
      uint32_t chr_idx = 0;
      uint32_t write_vidx = 0;
      for (uint32_t variant_idx = 0; variant_idx != variant_ct; ++variant_idx, ++write_vidx) {
        const uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
        if (variant_uidx >= chr_end) {
          do {
            // First iteration wraps old_chr_fo_idx from UINT32_MAX to 0.
            ++old_chr_fo_idx;
            chr_end = cip->chr_fo_vidx_start[old_chr_fo_idx + 1];
          } while (variant_uidx >= chr_end);
          chr_idx = cip->chr_file_order[old_chr_fo_idx];
          // bugfix (8 Sep 2018): write_vidx was set to the wrong value here
          const uint32_t new_chr_fo_idx = write_chr_info.chr_idx_to_foidx[chr_idx];
          write_vidx = write_chr_info.chr_fo_vidx_start[new_chr_fo_idx];
        }
        pos_vidx_sort_buf[write_vidx] = (S_CAST(uint64_t, variant_bps[variant_uidx]) << 32) | variant_uidx;
      }
    }

    // Workspace for sorting equal-bp runs by ID.  This aliases the currently
    // unallocated part of bigstack rather than allocating it; the capacity
    // check inside the loop guards against running off the end.
    // NOTE(review): a bigstack allocation made while this buffer is in use
    // would presumably overlap it; none is made in the loop below.
    StrSortIndexedDeref* same_pos_sort_buf = R_CAST(StrSortIndexedDeref*, g_bigstack_base);
    const uintptr_t same_pos_sort_buf_size = bigstack_left() / sizeof(StrSortIndexedDeref);

    uint32_t vidx_start = 0;
    uint32_t* new_variant_idx_to_old_iter = new_variant_idx_to_old;
    for (uint32_t new_chr_fo_idx = 0; new_chr_fo_idx != new_chr_ct; ++new_chr_fo_idx) {
      const uint32_t vidx_end = write_chr_info.chr_fo_vidx_start[new_chr_fo_idx + 1];
      const uint32_t chr_size = vidx_end - vidx_start;
      // Temporarily overwrite the first entry of the next chromosome (or the
      // extra trailing element) with an all-ones sentinel, so the equal-bp
      // scan below stops at the chromosome boundary.  Restored at the bottom
      // of this iteration.
      const uint64_t post_entry = pos_vidx_sort_buf[vidx_end];
      pos_vidx_sort_buf[vidx_end] = ~0LLU;  // simplify end-of-chromosome logic
      uint64_t* pos_vidx_sort_chr = &(pos_vidx_sort_buf[vidx_start]);
      STD_SORT_PAR_UNSEQ(chr_size, u64cmp, pos_vidx_sort_chr);
      // prev_* describe the most recent entry that has not yet been written
      // to new_variant_idx_to_old[].
      uint32_t prev_pos = pos_vidx_sort_chr[0] >> 32;
      uint32_t prev_variant_uidx = S_CAST(uint32_t, pos_vidx_sort_chr[0]);
      uint32_t prev_cidx = 0;
      uint32_t cidx = 1;
      // is chr_size == 0 possible here?  document if this code is revisited.
      for (; cidx < chr_size; ++cidx) {
        uint64_t cur_entry = pos_vidx_sort_chr[cidx];
        uint32_t cur_pos = cur_entry >> 32;
        if (cur_pos == prev_pos) {
          // Start of a run of equal-bp entries beginning at prev_cidx.
          // Gather the whole run into same_pos_sort_buf, sort it by ID, and
          // emit it.
          same_pos_sort_buf[0].strptr = variant_ids[prev_variant_uidx];
          same_pos_sort_buf[0].orig_idx = prev_variant_uidx;
          uint32_t equal_pos_ct = 1;
          // Indexed relative to the first entry of the run.
          const uint64_t* pos_vidx_sort_chr2 = &(pos_vidx_sort_chr[prev_cidx]);
          do {
            if (unlikely(equal_pos_ct >= same_pos_sort_buf_size)) {
              goto MakePlink2Vsort_ret_NOMEM;
            }
            const uint32_t variant_uidx = S_CAST(uint32_t, cur_entry);
            same_pos_sort_buf[equal_pos_ct].strptr = variant_ids[variant_uidx];
            same_pos_sort_buf[equal_pos_ct].orig_idx = variant_uidx;
            cur_entry = pos_vidx_sort_chr2[++equal_pos_ct];
            cur_pos = cur_entry >> 32;
          } while (cur_pos == prev_pos);
          // On exit, cur_entry is the first entry after the run (possibly the
          // sentinel).
          StrptrArrSortMain(equal_pos_ct, 1, use_nsort, same_pos_sort_buf);
          for (uint32_t equal_pos_idx = 0; equal_pos_idx != equal_pos_ct; ++equal_pos_idx) {
            *new_variant_idx_to_old_iter++ = same_pos_sort_buf[equal_pos_idx].orig_idx;
          }
          // cidx now indexes the first entry after the run; that entry becomes
          // the new pending "prev" below.
          cidx += equal_pos_ct - 1;
        } else {
          *new_variant_idx_to_old_iter++ = prev_variant_uidx;
        }
        prev_pos = cur_pos;
        prev_cidx = cidx;
        prev_variant_uidx = S_CAST(uint32_t, cur_entry);
      }
      if (cidx == chr_size) {
        // if [cidx - 1] is part of an identical-bp batch, cidx will actually
        // be chr_size + 1 after loop exit.  It's equal to chr_size iff we
        // haven't written the last entry to new_variant_idx_to_old[].
        *new_variant_idx_to_old_iter++ = prev_variant_uidx;
      }
      vidx_start = vidx_end;
      pos_vidx_sort_buf[vidx_end] = post_entry;
    }
    // Release the sort buffer; new_variant_idx_to_old, allocated just before
    // it, stays valid.
    BigstackReset(pos_vidx_sort_buf);

    if (make_plink2_flags & kfMakeBim) {
      const uint32_t bim_zst = (make_plink2_flags / kfMakeBimZs) & 1;
      OutnameZstSet(".bim", bim_zst, outname_end);
      logprintfww5("Writing %s ... ", outname);
      fflush(stdout);

      reterr = WriteBimResorted(outname, &write_chr_info, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, variant_cms, new_variant_idx_to_old, variant_ct, max_allele_slen, bim_zst, max_thread_ct);
      if (unlikely(reterr)) {
        goto MakePlink2Vsort_ret_1;
      }
      logputs("done.\n");
    }
    if (make_plink2_flags & kfMakePvar) {
      OutnameZstSet(".pvar", pvar_psam_flags & kfPvarZs, outname_end);
      logprintfww5("Writing %s ... ", outname);
      fflush(stdout);
      // 3 when PgrGetNonrefFlags() returns a non-null array; otherwise 2 if
      // the kfPgenGlobalAllNonref global flag is set, and 1 if it is not.
      uint32_t nonref_flags_storage = 3;
      if (!PgrGetNonrefFlags(simple_pgrp)) {
        nonref_flags_storage = (PgrGetGflags(simple_pgrp) & kfPgenGlobalAllNonref)? 2 : 1;
      }
      reterr = WritePvarResorted(outname, variant_include, &write_chr_info, variant_bps, variant_ids, allele_idx_offsets, allele_storage, allele_presents, refalt1_select, pvar_qual_present, pvar_quals, pvar_filter_present, pvar_filter_npass, pvar_filter_storage, PgrGetNonrefFlags(simple_pgrp), pvar_info_reload, variant_cms, new_variant_idx_to_old, raw_variant_ct, variant_ct, max_allele_slen, xheader_blen, info_flags, nonref_flags_storage, max_filter_slen, info_reload_slen, pvar_psam_flags, max_thread_ct, xheader);
      if (unlikely(reterr)) {
        goto MakePlink2Vsort_ret_1;
      }
      logputs("done.\n");
    }
    if (make_plink2_flags & kfMakeFam) {
      snprintf(outname_end, kMaxOutfnameExtBlen, ".fam");
      logprintfww5("Writing %s ... ", outname);
      fflush(stdout);
      reterr = WriteFam(outname, sample_include, piip, sex_nm, sex_male, pheno_cols, new_sample_idx_to_old, sample_ct, pheno_ct, '\t');
      if (unlikely(reterr)) {
        goto MakePlink2Vsort_ret_1;
      }
      logputs("done.\n");
    }
    if (make_plink2_flags & kfMakePsam) {
      snprintf(outname_end, kMaxOutfnameExtBlen, ".psam");
      logprintfww5("Writing %s ... ", outname);
      fflush(stdout);
      reterr = WritePsam(outname, sample_include, piip, sex_nm, sex_male, pheno_cols, pheno_names, new_sample_idx_to_old, sample_ct, pheno_ct, max_pheno_name_blen, pvar_psam_flags);
      if (unlikely(reterr)) {
        goto MakePlink2Vsort_ret_1;
      }
      logputs("done.\n");
    }
    if (make_plink2_flags & (kfMakeBed | kfMakePgen)) {
      // boilerplate from start of MakePlink2NoVsort()
      // Note that this check runs after any requested .bim/.pvar/.fam/.psam
      // files above have already been written.
      if (make_plink2_flags & kfMakePlink2MMask) {
        logerrputs("Error: --make-bed/--make-[b]pgen multiallelics= is currently under development.\n");
        reterr = kPglRetNotYetSupported;
        goto MakePlink2Vsort_ret_1;
      }
      MakeCommon mc;
      mc.plink2_write_flags = kfPlink2Write0;
      mc.raw_sample_ct = raw_sample_ct;
      mc.sample_ct = sample_ct;
      uintptr_t* sex_male_collapsed = nullptr;
      uintptr_t* sex_female_collapsed = nullptr;
      if (make_plink2_flags & kfMakePlink2SetHhMissing) {
        // Build sample_ct-bit male and female bitvectors restricted to
        // sample_include, plus an interleaved copy of each.
        const uint32_t sample_ctv = BitCtToVecCt(sample_ct);
        const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
        uintptr_t* new_sex_male;
        if (unlikely(
                bigstack_alloc_w(sample_ctv * kWordsPerVec, &new_sex_male) ||
                bigstack_alloc_w(sample_ctv * kWordsPerVec, &mc.sex_male_collapsed_interleaved) ||
                bigstack_alloc_w(sample_ctv * kWordsPerVec, &sex_female_collapsed) ||
                bigstack_alloc_w(sample_ctv * kWordsPerVec, &mc.sex_female_collapsed_interleaved))) {
          goto MakePlink2Vsort_ret_NOMEM;
        }
        CopyBitarrSubset(sex_male, sample_include, sample_ct, new_sex_male);
        ZeroTrailingWords(sample_ctl, new_sex_male);
        sex_male_collapsed = new_sex_male;
        FillInterleavedMaskVec(sex_male_collapsed, sample_ctv, mc.sex_male_collapsed_interleaved);

        // Start from the collapsed sex_nm bits, then apply the male bitvector
        // with BitvecInvmask() (presumably clearing the male bits -- confirm
        // against its definition).
        CopyBitarrSubset(sex_nm, sample_include, sample_ct, sex_female_collapsed);
        BitvecInvmask(new_sex_male, sample_ctl, sex_female_collapsed);
        ZeroTrailingWords(sample_ctl, sex_female_collapsed);
        FillInterleavedMaskVec(sex_female_collapsed, sample_ctv, mc.sex_female_collapsed_interleaved);

        mc.plink2_write_flags |= kfPlink2WriteSetHhMissing;
      } else {
        // defensive
        mc.sex_male_collapsed_interleaved = nullptr;
        mc.sex_female_collapsed_interleaved = nullptr;
      }
      if (make_plink2_flags & kfMakePlink2SetMixedMtMissing) {
        mc.plink2_write_flags |= kfPlink2WriteSetMixedMtMissing;
      }
      mc.cip = &write_chr_info;
      // Stays nullptr unless the input has allele_idx_offsets, alt2+ alleles
      // are not being erased, and the rebuilt offsets are still needed (see
      // the final_offset check).
      const uintptr_t* write_allele_idx_offsets = nullptr;
      if (allele_idx_offsets && (!(make_plink2_flags & kfMakePlink2EraseAlt2Plus))) {
        // NOTE(review): new_variant_idx_to_old was allocated above, so this
        // condition looks always-true in this function and the else branch
        // looks unreachable -- confirm.
        if ((variant_ct < raw_variant_ct) || new_variant_idx_to_old) {
          uintptr_t* new_allele_idx_offsets;
          if (unlikely(bigstack_alloc_w(variant_ct + 1, &new_allele_idx_offsets))) {
            goto MakePlink2Vsort_ret_NOMEM;
          }
          const uintptr_t final_offset = InitWriteAlleleIdxOffsets(variant_include, allele_idx_offsets, allele_presents, refalt1_select, new_variant_idx_to_old, variant_ct, new_allele_idx_offsets);
          // A final offset of exactly 2 * variant_ct presumably means every
          // written variant has two alleles; in that case the array is
          // discarded and write_allele_idx_offsets stays nullptr.
          if (final_offset != 2 * variant_ct) {
            new_allele_idx_offsets[variant_ct] = final_offset;
            write_allele_idx_offsets = new_allele_idx_offsets;
          } else {
            BigstackReset(new_allele_idx_offsets);
          }
        } else {
          write_allele_idx_offsets = allele_idx_offsets;
        }
      }
      reterr = MakePgenRobust(sample_include, new_sample_idx_to_old, variant_include, allele_idx_offsets, allele_presents, refalt1_select, write_allele_idx_offsets, new_variant_idx_to_old, sex_male_collapsed, sex_female_collapsed, raw_variant_ct, variant_ct, variant_ct, max_allele_ct, hard_call_thresh, dosage_erase_thresh, make_plink2_flags, &mc, simple_pgrp, outname, outname_end);
      if (unlikely(reterr)) {
        goto MakePlink2Vsort_ret_1;
      }
    }
  }
  // Error-label block: never entered by normal control flow, only via goto.
  while (0) {
  MakePlink2Vsort_ret_NOMEM:
    reterr = kPglRetNomem;
    break;
  }
 MakePlink2Vsort_ret_1:
  BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
  return reterr;
}
8183 
SampleSortFileMap(const uintptr_t * sample_include,const SampleIdInfo * siip,const char * sample_sort_fname,uint32_t raw_sample_ct,uint32_t sample_ct,uint32_t ** new_sample_idx_to_old_ptr)8184 PglErr SampleSortFileMap(const uintptr_t* sample_include, const SampleIdInfo* siip, const char* sample_sort_fname, uint32_t raw_sample_ct, uint32_t sample_ct, uint32_t** new_sample_idx_to_old_ptr) {
8185   // assumes sample_ct >= 2 (enforced by caller)
8186   // return strbox is not collapsed
8187   unsigned char* bigstack_mark = g_bigstack_base;
8188   uintptr_t line_idx = 0;
8189   PglErr reterr = kPglRetSuccess;
8190   TextStream txs;
8191   PreinitTextStream(&txs);
8192   {
8193     char* idbuf;
8194     uintptr_t* already_seen;
8195     if (unlikely(
8196             bigstack_alloc_u32(raw_sample_ct, new_sample_idx_to_old_ptr) ||
8197             bigstack_alloc_c(siip->max_sample_id_blen, &idbuf) ||
8198             bigstack_calloc_w(BitCtToWordCt(raw_sample_ct), &already_seen))) {
8199       goto SampleSortFileMap_ret_NOMEM;
8200     }
8201 
8202     uint32_t max_line_blen;
8203     if (unlikely(StandardizeMaxLineBlen(bigstack_left() - (bigstack_left() / 4), &max_line_blen))) {
8204       goto SampleSortFileMap_ret_NOMEM;
8205     }
8206     char* line_start;
8207     XidMode xid_mode;
8208     reterr = OpenAndLoadXidHeader(sample_sort_fname, "indiv-sort", (siip->sids || (siip->flags & kfSampleIdStrictSid0))? kfXidHeader0 : kfXidHeaderIgnoreSid, max_line_blen, &txs, &xid_mode, &line_idx, &line_start, nullptr);
8209     if (unlikely(reterr)) {
8210       if (reterr == kPglRetEof) {
8211         logerrputs("Error: --indiv-sort file is empty.\n");
8212         goto SampleSortFileMap_ret_MALFORMED_INPUT;
8213       }
8214       goto SampleSortFileMap_ret_TSTREAM_XID_FAIL;
8215     }
8216     uint32_t* xid_map;
8217     char* sorted_xidbox;
8218     uintptr_t max_xid_blen;
8219     reterr = SortedXidboxInitAlloc(sample_include, siip, sample_ct, 0, xid_mode, 0, &sorted_xidbox, &xid_map, &max_xid_blen);
8220     if (unlikely(reterr)) {
8221       goto SampleSortFileMap_ret_1;
8222     }
8223     uint32_t* new_sample_idx_to_old_iter = *new_sample_idx_to_old_ptr;
8224     if (*line_start == '#') {
8225       ++line_idx;
8226       line_start = TextGet(&txs);
8227     }
8228     for (; line_start; ++line_idx, line_start = TextGet(&txs)) {
8229       if (unlikely(line_start[0] == '#')) {
8230         snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --indiv-sort file starts with a '#'. (This is only permitted before the first nonheader line, and if a #FID/IID header line is present it must denote the end of the header block.)\n", line_idx);
8231         goto SampleSortFileMap_ret_MALFORMED_INPUT_WW;
8232       }
8233       const char* linebuf_iter = line_start;
8234       uint32_t sample_uidx;
8235       if (!SortedXidboxReadFind(sorted_xidbox, xid_map, max_xid_blen, sample_ct, 0, xid_mode, &linebuf_iter, &sample_uidx, idbuf)) {
8236         if (unlikely(IsSet(already_seen, sample_uidx))) {
8237           char* tab_iter = AdvToDelim(idbuf, '\t');
8238           *tab_iter = ' ';
8239           if (xid_mode & kfXidModeFlagSid) {
8240             *AdvToDelim(&(tab_iter[1]), '\t') = ' ';
8241           }
8242           snprintf(g_logbuf, kLogbufSize, "Error: Duplicate sample ID '%s' in --indiv-sort file.\n", idbuf);
8243           goto SampleSortFileMap_ret_MALFORMED_INPUT_WW;
8244         }
8245         SetBit(sample_uidx, already_seen);
8246         *new_sample_idx_to_old_iter++ = sample_uidx;
8247       } else if (unlikely(!linebuf_iter)) {
8248         goto SampleSortFileMap_ret_MISSING_TOKENS;
8249       }
8250     }
8251     if (unlikely(TextStreamErrcode2(&txs, &reterr))) {
8252       goto SampleSortFileMap_ret_TSTREAM_FAIL;
8253     }
8254     if (unlikely(S_CAST(uintptr_t, new_sample_idx_to_old_iter - (*new_sample_idx_to_old_ptr)) != sample_ct)) {
8255       logerrputs("Error: --indiv-sort file does not contain all loaded sample IDs.\n");
8256       goto SampleSortFileMap_ret_INCONSISTENT_INPUT;
8257     }
8258     bigstack_mark = R_CAST(unsigned char*, idbuf);
8259   }
8260   while (0) {
8261   SampleSortFileMap_ret_NOMEM:
8262     reterr = kPglRetNomem;
8263     break;
8264   SampleSortFileMap_ret_MALFORMED_INPUT_WW:
8265     WordWrapB(0);
8266     logerrputsb();
8267   SampleSortFileMap_ret_MALFORMED_INPUT:
8268     reterr = kPglRetMalformedInput;
8269     break;
8270   SampleSortFileMap_ret_TSTREAM_XID_FAIL:
8271     if (!TextStreamErrcode(&txs)) {
8272       break;
8273     }
8274   SampleSortFileMap_ret_TSTREAM_FAIL:
8275     TextStreamErrPrint("--indiv-sort file", &txs);
8276     break;
8277   SampleSortFileMap_ret_MISSING_TOKENS:
8278     logerrprintf("Error: Line %" PRIuPTR " of --indiv-sort file has fewer tokens than expected.\n", line_idx);
8279   SampleSortFileMap_ret_INCONSISTENT_INPUT:
8280     reterr = kPglRetInconsistentInput;
8281     break;
8282   }
8283  SampleSortFileMap_ret_1:
8284   CleanupTextStream2("--indiv-sort file", &txs, &reterr);
8285   BigstackReset(bigstack_mark);
8286   return reterr;
8287 }
8288 
8289 #ifdef __cplusplus
8290 }  // namespace plink2
8291 #endif
8292