1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library.  If not, see <http://www.gnu.org/licenses/>.
16 
17 #include "plink2_pvar.h"
18 
19 #ifdef __cplusplus
20 namespace plink2 {
21 #endif
22 
23 // this used to employ a backward-growing linked list, with variable-size
24 // elements, but demultiplexing was relatively expensive.  now we allocate
25 // size-64k pos[], allele_idxs[], ids[], cms[], etc. blocks, and just memcpy
26 // those chunks at the end.  (cms[] is lazy-initialized.)
27 //
28 // probable todo: try making this more parallel.  could e.g. repeatedly peek at
29 // end of ReadLineStream buffer, count # of '\n's, and divvy them up between
30 // worker threads.
31 CONSTI32(kLoadPvarBlockSize, 65536);
32 static_assert(!(kLoadPvarBlockSize & (kLoadPvarBlockSize - 1)), "kLoadPvarBlockSize must be a power of 2.");
33 static_assert(kLoadPvarBlockSize >= (kMaxMediumLine / 8), "kLoadPvarBlockSize cannot be smaller than kMaxMediumLine / 8.");
34 
35 
36 static_assert(!kChrOffsetX, "ReadChrsetHeaderLine() assumes kChrOffsetX == 0.");
37 static_assert(kChrOffsetY == 1, "ReadChrsetHeaderLine() assumes kChrOffsetY == 1.");
38 static_assert(kChrOffsetPAR1 == 4, "ReadChrsetHeaderLine() assumes kChrOffsetPAR1 == 4.");
ReadChrsetHeaderLine(const char * chrset_iter,const char * file_descrip,MiscFlags misc_flags,uintptr_t line_idx,ChrInfo * cip)39 PglErr ReadChrsetHeaderLine(const char* chrset_iter, const char* file_descrip, MiscFlags misc_flags, uintptr_t line_idx, ChrInfo* cip) {
40   // chrset_iter is expected to point to first character after
41   // "##chrSet=<".
42   PglErr reterr = kPglRetSuccess;
43   {
44     uint32_t cmdline_autosome_ct = 0;
45     uint32_t cmdline_haploid = 0;
46     STD_ARRAY_DECL(uint32_t, kChrOffsetCt, cmdline_xymt_codes);
47     if (cip->chrset_source == kChrsetSourceCmdline) {
48       if (misc_flags & kfMiscChrOverrideCmdline) {
49         goto ReadChrsetHeaderLine_ret_1;
50       }
51       if (!(misc_flags & kfMiscChrOverrideFile)) {
52         // save off info we need for consistency check
53         cmdline_autosome_ct = cip->autosome_ct;
54         cmdline_haploid = cip->haploid_mask[0] & 1;
55         STD_ARRAY_COPY(cip->xymt_codes, kChrOffsetCt, cmdline_xymt_codes);
56       }
57       ZeroWArr(kChrMaskWords, cip->haploid_mask);
58     }
59     for (uint32_t uii = 0; uii != kChrOffsetCt; ++uii) {
60       cip->xymt_codes[uii] = UINT32_MAXM1;
61     }
62     if (StrStartsWithUnsafe(chrset_iter, "haploidAutosomeCt=")) {
63       uint32_t explicit_haploid_ct;
64       if (unlikely(ScanPosintCapped(&(chrset_iter[strlen("haploidAutosomeCt=")]), kMaxChrTextnum, &explicit_haploid_ct))) {
65         snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s has an invalid ##chrSet haploid count (max %u).\n", line_idx, file_descrip, kMaxChrTextnum);
66         goto ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW;
67       }
68       // could verify that X, Y, etc. are not present?
69       if (cmdline_autosome_ct) {
70         if (unlikely(!cmdline_haploid)) {
71           snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies a haploid genome, while a diploid genome was specified on the command line.\n", line_idx, file_descrip);
72           goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
73         }
74         if (unlikely(explicit_haploid_ct != cmdline_autosome_ct)) {
75           snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies %u autosome%s, while the command line specified %u.\n", line_idx, file_descrip, explicit_haploid_ct, (explicit_haploid_ct == 1)? "" : "s", cmdline_autosome_ct);
76           goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
77         }
78       }
79       cip->autosome_ct = explicit_haploid_ct;
80       SetAllBits(explicit_haploid_ct + 1, cip->haploid_mask);
81     } else {
82       if (unlikely(!StrStartsWithUnsafe(chrset_iter, "autosomePairCt="))) {
83         snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s does not have expected ##chrSet format.\n", line_idx, file_descrip);
84         goto ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW;
85       }
86       chrset_iter = &(chrset_iter[strlen("autosomePairCt=")]);
87       uint32_t explicit_autosome_ct;
88       if (unlikely(ScanmovPosintCapped(kMaxChrTextnum, &chrset_iter, &explicit_autosome_ct))) {
89         snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s has an invalid ##chrSet autosome count (max %u).\n", line_idx, file_descrip, kMaxChrTextnum);
90         goto ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW;
91       }
92       cip->autosome_ct = explicit_autosome_ct;
93       if (*chrset_iter != '>') {
94         if (unlikely(*chrset_iter != ',')) {
95           snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s does not have expected ##chrSet format.\n", line_idx, file_descrip);
96           goto ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW;
97         }
98         // this can theoretically be confused by e.g. a Description="..." field
99         // containing commas not followed by spaces
100         do {
101           ++chrset_iter;
102           // uppercase
103           uint32_t first_char_ui = ctou32(*chrset_iter) & 0xdf;
104 
105           uint32_t second_char_ui = ctou32(chrset_iter[1]);
106           // 44 is ',', 62 is '>'
107           if ((second_char_ui == 44) || (second_char_ui == 62)) {
108             if (first_char_ui == 77) {
109               // M
110               cip->xymt_codes[kChrOffsetMT] = explicit_autosome_ct + 1 + kChrOffsetMT;
111             } else {
112               first_char_ui -= 88;  // X = 0, Y = 1, everything else larger
113               if (first_char_ui < 2) {
114                 cip->xymt_codes[first_char_ui] = explicit_autosome_ct + 1 + first_char_ui;
115               }
116             }
117           } else {
118             second_char_ui &= 0xdf;
119             const uint32_t third_char_ui = ctou32(chrset_iter[2]);
120             if ((third_char_ui == 44) || (third_char_ui == 62)) {
121               if ((first_char_ui == 88) && (second_char_ui == 89)) {
122                 // XY
123                 cip->xymt_codes[kChrOffsetXY] = explicit_autosome_ct + 1 + kChrOffsetXY;
124               } else if ((first_char_ui == 77) && (second_char_ui == 84)) {
125                 // MT
126                 cip->xymt_codes[kChrOffsetMT] = explicit_autosome_ct + 1 + kChrOffsetMT;
127               }
128             } else if ((first_char_ui == 80) && (second_char_ui == 65) && ((third_char_ui & 0xdf) == 82)) {
129               // PAR1, PAR2
130               const uint32_t par_idx_m1 = ctou32(chrset_iter[3]) - '1';
131               if ((par_idx_m1 < 2) && ((chrset_iter[4] == ',') || (chrset_iter[4] == '>'))) {
132                 cip->xymt_codes[kChrOffsetPAR1] = explicit_autosome_ct + 1 + kChrOffsetPAR1 + par_idx_m1;
133               }
134             }
135           }
136         } while (!strchrnul_n_mov(',', &chrset_iter));
137       }
138       if (cmdline_autosome_ct) {
139         if (unlikely(cmdline_haploid)) {
140           snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies a diploid genome, while a haploid genome was specified on the command line.\n", line_idx, file_descrip);
141           goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
142         }
143         if (unlikely(explicit_autosome_ct != cmdline_autosome_ct)) {
144           snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies %u autosome%s, while the command line specified %u.\n", line_idx, file_descrip, explicit_autosome_ct, (explicit_autosome_ct == 1)? "" : "s", cmdline_autosome_ct);
145           goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
146         }
147         for (uint32_t xymt_idx = 0; xymt_idx != kChrOffsetPAR1; ++xymt_idx) {
148           // it's okay if the command line doesn't explicitly exclude e.g. chrX
149           // while for whatever reason it is excluded from ##chrSet; but the
150           // reverse can create problems
151           if (unlikely(IsI32Neg(cmdline_xymt_codes[xymt_idx]) && (!IsI32Neg(cip->xymt_codes[xymt_idx])))) {
152             snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies a chromosome set including %s, while the command line excludes it.\n", line_idx, file_descrip, g_xymt_log_names[xymt_idx]);
153             goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
154           }
155         }
156       }
157     }
158     cip->chrset_source = kChrsetSourceFile;
159   }
160   while (0) {
161   ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW:
162     WordWrapB(0);
163     logerrputsb();
164     reterr = kPglRetMalformedInput;
165     break;
166   ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW:
167     WordWrapB(0);
168     logerrputsb();
169     reterr = kPglRetInconsistentInput;
170     break;
171   }
172  ReadChrsetHeaderLine_ret_1:
173   return reterr;
174 }
175 
VaridTemplateInit(const char * varid_template_str,const char * missing_id_match,char * chr_output_name_buf,uint32_t new_id_max_allele_slen,uint32_t overflow_substitute_blen,VaridTemplate * vtp)176 void VaridTemplateInit(const char* varid_template_str, const char* missing_id_match, char* chr_output_name_buf, uint32_t new_id_max_allele_slen, uint32_t overflow_substitute_blen, VaridTemplate* vtp) {
177   // template string was previously validated
178   // varid_template is only input, everything else is output values
179   const char* varid_template_str_iter = varid_template_str;
180   uint32_t template_insert_ct = 0;
181   uint32_t template_base_len = 0;
182   uint32_t alleles_needed = 0;  // bit 0 = ref, bit 1 = alt, bit 2 = ascii sort
183   vtp->chr_output_name_buf = chr_output_name_buf;
184   vtp->segs[0] = varid_template_str_iter;
185   vtp->chr_slen = 0;
186   for (unsigned char ucc = *varid_template_str_iter; ucc; ucc = *(++varid_template_str_iter)) {
187     if (ucc > '@') {
188       continue;
189     }
190     uint32_t seg_len;
191     uint32_t insert_type;
192     if (ucc == '@') {
193       seg_len = varid_template_str_iter - vtp->segs[template_insert_ct];
194       insert_type = 0;
195     } else if (ucc == '#') {
196       seg_len = varid_template_str_iter - vtp->segs[template_insert_ct];
197       insert_type = 1;
198     } else if (ucc != '$') {
199       continue;
200     } else {
201       seg_len = varid_template_str_iter - vtp->segs[template_insert_ct];
202       const uint32_t uii = ctou32(*(++varid_template_str_iter));
203       if (uii <= '2') {
204         alleles_needed += 2;  // this happens twice
205         insert_type = uii - 48;  // '1' -> type 2, '2' -> type 3
206       } else {
207         // 'r' -> type 2, 'a' -> type 3
208         insert_type = 1 + ((uii & 0xdf) == 'A');
209       }
210       alleles_needed += insert_type;
211       ++insert_type;
212     }
213     vtp->seg_lens[template_insert_ct] = seg_len;
214     template_base_len += seg_len;
215     vtp->insert_types[template_insert_ct++] = insert_type;
216     vtp->segs[template_insert_ct] = &(varid_template_str_iter[1]);
217   }
218   const uint32_t seg_len = varid_template_str_iter - vtp->segs[template_insert_ct];
219   vtp->seg_lens[template_insert_ct] = seg_len;
220   vtp->insert_ct = template_insert_ct;
221   vtp->base_len = template_base_len + seg_len;
222   vtp->alleles_needed = alleles_needed;
223   vtp->new_id_max_allele_slen = new_id_max_allele_slen;
224   vtp->overflow_substitute_blen = overflow_substitute_blen;
225   vtp->missing_id_match = missing_id_match;
226 }
227 
228 // alt1_end currently allowed to be nullptr in biallelic case
VaridTemplateApply(unsigned char * tmp_alloc_base,const VaridTemplate * vtp,const char * ref_start,const char * alt1_start,uint32_t cur_bp,uint32_t ref_token_slen,uint32_t extra_alt_ct,uint32_t alt_token_slen,unsigned char ** tmp_alloc_endp,uintptr_t * new_id_allele_len_overflowp,uint32_t * id_slen_ptr)229 BoolErr VaridTemplateApply(unsigned char* tmp_alloc_base, const VaridTemplate* vtp, const char* ref_start, const char* alt1_start, uint32_t cur_bp, uint32_t ref_token_slen, uint32_t extra_alt_ct, uint32_t alt_token_slen, unsigned char** tmp_alloc_endp, uintptr_t* new_id_allele_len_overflowp, uint32_t* id_slen_ptr) {
230   uint32_t insert_slens[4];
231   const uint32_t alleles_needed = vtp->alleles_needed;
232   const uint32_t new_id_max_allele_slen = vtp->new_id_max_allele_slen;
233   uint32_t id_slen = UintSlen(cur_bp);
234   insert_slens[0] = vtp->chr_slen;
235   insert_slens[1] = id_slen;
236   id_slen += vtp->base_len;
237   uint32_t ref_slen = 0;
238   uint32_t cur_overflow = 0;
239   const char* tmp_allele_ptrs[2];
240   if (alleles_needed & 1) {
241     ref_slen = ref_token_slen;
242     if (ref_slen > new_id_max_allele_slen) {
243       ref_slen = new_id_max_allele_slen;
244       cur_overflow = 1;
245     }
246     insert_slens[2] = ref_slen;
247     id_slen += ref_slen;
248     tmp_allele_ptrs[0] = ref_start;
249   }
250   if (alleles_needed > 1) {
251     uint32_t alt1_slen;
252     if (!extra_alt_ct) {
253       alt1_slen = alt_token_slen;
254     } else {
255       alt1_slen = AdvToDelim(alt1_start, ',') - alt1_start;
256     }
257     if (alt1_slen > new_id_max_allele_slen) {
258       alt1_slen = new_id_max_allele_slen;
259       ++cur_overflow;
260     }
261     id_slen += alt1_slen;
262     if (alleles_needed <= 3) {
263     VaridTemplateApply_keep_allele_ascii_order:
264       insert_slens[3] = alt1_slen;
265       tmp_allele_ptrs[1] = alt1_start;
266     } else {
267       uint32_t smaller_slen = alt1_slen;
268       const int32_t ref_slen_geq = (ref_slen >= alt1_slen);
269       if (!ref_slen_geq) {
270         smaller_slen = ref_slen;
271       }
272       int32_t memcmp_result = Memcmp(ref_start, alt1_start, smaller_slen);
273       if (!memcmp_result) {
274         memcmp_result = ref_slen_geq;
275       }
276       if (memcmp_result <= 0) {
277         goto VaridTemplateApply_keep_allele_ascii_order;
278       }
279       insert_slens[3] = ref_slen;
280       tmp_allele_ptrs[1] = tmp_allele_ptrs[0];
281       insert_slens[2] = alt1_slen;
282       tmp_allele_ptrs[0] = alt1_start;
283     }
284   }
285   const uint32_t overflow_substitute_blen = vtp->overflow_substitute_blen;
286   if (overflow_substitute_blen && cur_overflow) {
287     // er, do we really want to make a separate allocation here?  see if we can
288     // remove this.  (that would impose more constraints on downstream
289     // variant-ID-changing functions, though.)
290     if (PtrWSubCk(tmp_alloc_base, overflow_substitute_blen, tmp_alloc_endp)) {
291       return 1;
292     }
293     memcpy(*tmp_alloc_endp, vtp->missing_id_match, overflow_substitute_blen);
294     id_slen = 0;
295     cur_overflow = 1;
296   } else {
297     if (PtrWSubCk(tmp_alloc_base, id_slen + 1, tmp_alloc_endp)) {
298       return 1;
299     }
300     char* id_iter = R_CAST(char*, *tmp_alloc_endp);
301     const uint32_t insert_ct = vtp->insert_ct;
302     char* insert_ptrs[4];
303 
304     // maybe-uninitialized warnings
305     insert_ptrs[0] = nullptr;
306     insert_ptrs[1] = nullptr;
307     insert_ptrs[2] = nullptr;
308 
309     for (uint32_t insert_idx = 0; insert_idx != insert_ct; ++insert_idx) {
310       id_iter = memcpya(id_iter, vtp->segs[insert_idx], vtp->seg_lens[insert_idx]);
311       const uint32_t cur_insert_type = vtp->insert_types[insert_idx];
312       insert_ptrs[cur_insert_type] = id_iter;
313       id_iter = &(id_iter[insert_slens[cur_insert_type]]);
314     }
315     memcpyx(id_iter, vtp->segs[insert_ct], vtp->seg_lens[insert_ct], '\0');
316 
317     memcpy(insert_ptrs[0], vtp->chr_output_name_buf, insert_slens[0]);
318     u32toa(cur_bp, insert_ptrs[1]);
319     // insert_ct currently guaranteed to be >= 2 since @ and # required
320     for (uint32_t insert_type_idx = 2; insert_type_idx != insert_ct; ++insert_type_idx) {
321       memcpy(insert_ptrs[insert_type_idx], tmp_allele_ptrs[insert_type_idx - 2], insert_slens[insert_type_idx]);
322     }
323   }
324   *id_slen_ptr = id_slen;
325   *new_id_allele_len_overflowp += cur_overflow;
326   return 0;
327 }
328 
329 // Exported variant of VaridTemplateApply() which appends to a buffer.
330 // Probable todo: pull out the common parts of the functions.
VaridTemplateWrite(const VaridTemplate * vtp,const char * ref_start,const char * alt1_start,uint32_t cur_bp,uint32_t ref_token_slen,uint32_t extra_alt_ct,uint32_t alt_token_slen,char * dst)331 char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const char* alt1_start, uint32_t cur_bp, uint32_t ref_token_slen, uint32_t extra_alt_ct, uint32_t alt_token_slen, char* dst) {
332   uint32_t insert_slens[4];
333   const uint32_t alleles_needed = vtp->alleles_needed;
334   const uint32_t new_id_max_allele_slen = vtp->new_id_max_allele_slen;
335   uint32_t id_slen = UintSlen(cur_bp);
336   insert_slens[0] = vtp->chr_slen;
337   insert_slens[1] = id_slen;
338   id_slen += vtp->base_len;
339   uint32_t ref_slen = 0;
340   uint32_t cur_overflow = 0;
341   const char* tmp_allele_ptrs[2];
342   if (alleles_needed & 1) {
343     ref_slen = ref_token_slen;
344     if (ref_slen > new_id_max_allele_slen) {
345       ref_slen = new_id_max_allele_slen;
346       cur_overflow = 1;
347     }
348     insert_slens[2] = ref_slen;
349     id_slen += ref_slen;
350     tmp_allele_ptrs[0] = ref_start;
351   }
352   if (alleles_needed > 1) {
353     uint32_t alt1_slen;
354     if (!extra_alt_ct) {
355       alt1_slen = alt_token_slen;
356     } else {
357       alt1_slen = AdvToDelim(alt1_start, ',') - alt1_start;
358     }
359     if (alt1_slen > new_id_max_allele_slen) {
360       alt1_slen = new_id_max_allele_slen;
361       cur_overflow = 1;
362     }
363     id_slen += alt1_slen;
364     if (alleles_needed <= 3) {
365     VaridTemplateWrite_keep_allele_ascii_order:
366       insert_slens[3] = alt1_slen;
367       tmp_allele_ptrs[1] = alt1_start;
368     } else {
369       uint32_t smaller_slen = alt1_slen;
370       const int32_t ref_slen_geq = (ref_slen >= alt1_slen);
371       if (!ref_slen_geq) {
372         smaller_slen = ref_slen;
373       }
374       int32_t memcmp_result = Memcmp(ref_start, alt1_start, smaller_slen);
375       if (!memcmp_result) {
376         memcmp_result = ref_slen_geq;
377       }
378       if (memcmp_result <= 0) {
379         goto VaridTemplateWrite_keep_allele_ascii_order;
380       }
381       insert_slens[3] = ref_slen;
382       tmp_allele_ptrs[1] = tmp_allele_ptrs[0];
383       insert_slens[2] = alt1_slen;
384       tmp_allele_ptrs[0] = alt1_start;
385     }
386   }
387   const uint32_t overflow_substitute_blen = vtp->overflow_substitute_blen;
388   if (overflow_substitute_blen && cur_overflow) {
389     return memcpya(dst, vtp->missing_id_match, overflow_substitute_blen);
390   }
391   char* id_iter = dst;
392   const uint32_t insert_ct = vtp->insert_ct;
393   char* insert_ptrs[4];
394 
395   // maybe-uninitialized warnings
396   insert_ptrs[0] = nullptr;
397   insert_ptrs[1] = nullptr;
398   insert_ptrs[2] = nullptr;
399 
400   for (uint32_t insert_idx = 0; insert_idx != insert_ct; ++insert_idx) {
401     id_iter = memcpya(id_iter, vtp->segs[insert_idx], vtp->seg_lens[insert_idx]);
402     const uint32_t cur_insert_type = vtp->insert_types[insert_idx];
403     insert_ptrs[cur_insert_type] = id_iter;
404     id_iter = &(id_iter[insert_slens[cur_insert_type]]);
405   }
406   char* id_end = memcpya(id_iter, vtp->segs[insert_ct], vtp->seg_lens[insert_ct]);
407 
408   memcpy(insert_ptrs[0], vtp->chr_output_name_buf, insert_slens[0]);
409   u32toa(cur_bp, insert_ptrs[1]);
410   // insert_ct currently guaranteed to be >= 2 since @ and # required
411   for (uint32_t insert_type_idx = 2; insert_type_idx != insert_ct; ++insert_type_idx) {
412     memcpy(insert_ptrs[insert_type_idx], tmp_allele_ptrs[insert_type_idx - 2], insert_slens[insert_type_idx]);
413   }
414   return id_end;
415 }
416 
VaridWorstCaseSlen(const VaridTemplate * vtp,uint32_t max_chr_slen,uint32_t max_allele_slen)417 uint32_t VaridWorstCaseSlen(const VaridTemplate* vtp, uint32_t max_chr_slen, uint32_t max_allele_slen) {
418   // +10 for base-pair coordinate
419   return (max_allele_slen * vtp->alleles_needed + vtp->base_len + max_chr_slen + 10);
420 }
421 
BackfillChrIdxs(const ChrInfo * cip,uint32_t chrs_encountered_m1,uint32_t offset,uint32_t end_vidx,ChrIdx * chr_idxs)422 void BackfillChrIdxs(const ChrInfo* cip, uint32_t chrs_encountered_m1, uint32_t offset, uint32_t end_vidx, ChrIdx* chr_idxs) {
423   for (uint32_t chr_fo_idx = chrs_encountered_m1; ; --chr_fo_idx) {
424     uint32_t start_vidx = cip->chr_fo_vidx_start[chr_fo_idx];
425     if (start_vidx < offset) {
426       start_vidx = offset;
427     }
428     ChrIdx* chr_idxs_write_base = &(chr_idxs[start_vidx - offset]);
429     const uint32_t vidx_ct = end_vidx - start_vidx;
430     const ChrIdx cur_chr_idx = cip->chr_file_order[chr_fo_idx];
431     for (uint32_t uii = 0; uii != vidx_ct; ++uii) {
432       chr_idxs_write_base[uii] = cur_chr_idx;
433     }
434     if (start_vidx == offset) {
435       return;
436     }
437     end_vidx = start_vidx;
438   }
439 }
440 
PrInInfoToken(uint32_t info_slen,char * info_token)441 char* PrInInfoToken(uint32_t info_slen, char* info_token) {
442   if (memequal_k(info_token, "PR", 2) && ((info_slen == 2) || (info_token[2] == ';'))) {
443     return info_token;
444   }
445   if (memequal_k(&(info_token[S_CAST(int32_t, info_slen) - 3]), ";PR", 3)) {
446     return &(info_token[info_slen - 2]);
447   }
448   info_token[info_slen] = '\0';
449   char* first_info_end = strchr(info_token, ';');
450   if (!first_info_end) {
451     return nullptr;
452   }
453   char* pr_prestart = strstr(first_info_end, ";PR;");
454   // bugfix (27 Sep 2019): had this backward
455   return pr_prestart? (&(pr_prestart[1])) : nullptr;
456 }
457 
458 typedef struct InfoExistStruct {
459   NONCOPYABLE(InfoExistStruct);
460   char* prekeys;
461   uint32_t key_ct;
462   uint32_t key_slens[];
463 } InfoExist;
464 
InfoExistInit(const unsigned char * arena_end,const char * require_info_flattened,const char * flagname_p,unsigned char ** arena_base_ptr,InfoExist ** existpp)465 PglErr InfoExistInit(const unsigned char* arena_end, const char* require_info_flattened, const char* flagname_p, unsigned char** arena_base_ptr, InfoExist** existpp) {
466   const char* read_iter = require_info_flattened;
467   uintptr_t prekeys_blen = 0;
468   do {
469     const char* key_end_or_invalid = strchrnul2(read_iter, ';', '=');
470     if (unlikely(*key_end_or_invalid)) {
471       if (*key_end_or_invalid == ';') {
472         logerrprintfww("Error: Invalid --%s key '%s' (semicolon prohibited).\n", flagname_p, read_iter);
473       } else {
474         logerrprintfww("Error: Invalid --%s key '%s' ('=' prohibited).\n", flagname_p, read_iter);
475       }
476       return kPglRetInvalidCmdline;
477     }
478     ++prekeys_blen;
479     read_iter = &(key_end_or_invalid[1]);
480   } while (*read_iter);
481   prekeys_blen += read_iter - require_info_flattened;
482   const uint32_t key_ct = prekeys_blen - S_CAST(uintptr_t, read_iter - require_info_flattened);
483   const uintptr_t bytes_used = offsetof(InfoExist, key_slens) + key_ct * sizeof(int32_t) + prekeys_blen;
484   const uintptr_t cur_alloc = RoundUpPow2(bytes_used, kCacheline);
485   if (unlikely(S_CAST(uintptr_t, arena_end - (*arena_base_ptr)) < cur_alloc)) {
486     return kPglRetNomem;
487   }
488   *existpp = R_CAST(InfoExist*, *arena_base_ptr);
489   (*existpp)->prekeys = R_CAST(char*, &((*arena_base_ptr)[bytes_used - prekeys_blen]));
490   (*arena_base_ptr) += cur_alloc;
491   (*existpp)->key_ct = key_ct;
492   read_iter = require_info_flattened;
493   char* write_iter = (*existpp)->prekeys;
494   for (uint32_t kidx = 0; kidx != key_ct; ++kidx) {
495     *write_iter++ = ';';
496     const uintptr_t slen = strlen(read_iter);
497     (*existpp)->key_slens[kidx] = slen;
498     const uintptr_t blen = slen + 1;
499     write_iter = memcpya(write_iter, read_iter, blen);
500     read_iter = &(read_iter[blen]);
501   }
502   return kPglRetSuccess;
503 }
504 
InfoExistCheck(const char * info_token,const InfoExist * existp)505 uint32_t InfoExistCheck(const char* info_token, const InfoExist* existp) {
506   const uint32_t key_ct = existp->key_ct;
507   const char* prekeys_iter = existp->prekeys;
508   const uint32_t* key_slens = existp->key_slens;
509   for (uint32_t kidx = 0; kidx != key_ct; ++kidx) {
510     const uint32_t key_slen = key_slens[kidx];
511     const char* possible_hit;
512     // similar logic to hardcoded PR, except key can also be followed by '=',
513     // and if it is there can't be a lone '.' after the equals
514     if (memequal(info_token, &(prekeys_iter[1]), key_slen)) {
515       possible_hit = &(info_token[key_slen]);
516     } else {
517       possible_hit = strstr(info_token, prekeys_iter);
518       if (!possible_hit) {
519         return 0;
520       }
521       possible_hit = &(possible_hit[key_slen + 1]);
522     }
523     while (1) {
524       const char cc = *possible_hit;
525       if ((!cc) || (cc == ';')) {
526         break;
527       }
528       if (cc == '=') {
529         if ((possible_hit[1] != '.') || (possible_hit[2] && (possible_hit[2] != ';'))) {
530           break;
531         }
532         return 0;
533       }
534       possible_hit = strstr(possible_hit, prekeys_iter);
535       if (!possible_hit) {
536         return 0;
537       }
538       possible_hit = &(possible_hit[key_slen + 1]);
539     }
540     prekeys_iter = &(prekeys_iter[key_slen + 2]);
541   }
542   return 1;
543 }
544 
InfoNonexistCheck(const char * info_token,const InfoExist * nonexistp)545 uint32_t InfoNonexistCheck(const char* info_token, const InfoExist* nonexistp) {
546   const uint32_t key_ct = nonexistp->key_ct;
547   const char* prekeys_iter = nonexistp->prekeys;
548   const uint32_t* key_slens = nonexistp->key_slens;
549   uint32_t key_slen = 0;
550   for (uint32_t kidx = 0; kidx != key_ct; ++kidx, prekeys_iter = &(prekeys_iter[key_slen + 2])) {
551     key_slen = key_slens[kidx];
552     const char* possible_hit;
553     if (memequal(info_token, &(prekeys_iter[1]), key_slen)) {
554       possible_hit = &(info_token[key_slen]);
555     } else {
556       possible_hit = strstr(info_token, prekeys_iter);
557       if (!possible_hit) {
558         continue;
559       }
560       possible_hit = &(possible_hit[key_slen + 1]);
561     }
562     while (1) {
563       const char cc = *possible_hit;
564       if ((!cc) || (cc == ';')) {
565         return 0;
566       }
567       if (cc == '=') {
568         if ((possible_hit[1] != '.') || (possible_hit[2] && (possible_hit[2] != ';'))) {
569           return 0;
570         }
571         break;
572       }
573       possible_hit = strstr(possible_hit, prekeys_iter);
574       if (!possible_hit) {
575         break;
576       }
577       possible_hit = &(possible_hit[key_slen + 1]);
578     }
579   }
580   return 1;
581 }
582 
583 typedef struct InfoFilterStruct {
584   NONCOPYABLE(InfoFilterStruct);
585   char* prekey;
586   const char* val_str;
587   uint32_t key_slen;
588   uint32_t val_slen;
589   CmpBinaryOp binary_op;
590   double val;
591 } InfoFilter;
592 
InfoFilterInit(const unsigned char * arena_end,const CmpExpr * filter_exprp,const char * flagname_p,unsigned char ** arena_base_ptr,InfoFilter * filterp)593 PglErr InfoFilterInit(const unsigned char* arena_end, const CmpExpr* filter_exprp, const char* flagname_p, unsigned char** arena_base_ptr, InfoFilter* filterp) {
594   const char* pheno_name = filter_exprp->pheno_name;
595   const char* pheno_name_end_or_invalid = strchrnul3(pheno_name, ';', '=', ',');
596   if (unlikely(*pheno_name_end_or_invalid)) {
597     if (*pheno_name_end_or_invalid == ';') {
598       logerrprintfww("Error: Invalid --%s key '%s' (semicolon prohibited).\n", flagname_p, pheno_name);
599     } else if (*pheno_name_end_or_invalid == '=') {
600       logerrprintfww("Error: Invalid --%s key '%s' ('=' prohibited).\n", flagname_p, pheno_name);
601     } else {
602       logerrprintfww("Error: Invalid --%s key '%s' (comma prohibited).\n", flagname_p, pheno_name);
603     }
604     return kPglRetInvalidCmdline;
605   }
606   uint32_t key_slen = pheno_name_end_or_invalid - pheno_name;
607   const uintptr_t cur_alloc = RoundUpPow2(3 + key_slen, kCacheline);
608   if (unlikely(S_CAST(uintptr_t, arena_end - (*arena_base_ptr)) < cur_alloc)) {
609     return kPglRetNomem;
610   }
611   filterp->prekey = R_CAST(char*, *arena_base_ptr);
612   (*arena_base_ptr) += cur_alloc;
613   filterp->prekey[0] = ';';
614   char* key_iter = memcpya(&(filterp->prekey[1]), pheno_name, key_slen);
615   strcpy_k(key_iter, "=");
616   ++key_slen;
617   filterp->key_slen = key_slen;
618   filterp->val_str = nullptr;
619   filterp->val_slen = 0;
620   const CmpBinaryOp binary_op = filter_exprp->binary_op;
621   filterp->binary_op = binary_op;
622   // shouldn't need to initialize val
623 
624   const char* val_str = &(pheno_name[key_slen]);
625   // bugfix (14 Dec 2017): INFO string constants are not guaranteed to start in
626   // a letter.  Only interpret the value as a number if ScanadvDouble()
627   // consumes the entire value string.
628   const char* val_num_end = ScanadvDouble(val_str, &filterp->val);
629   if (val_num_end && (!val_num_end[0])) {
630     return kPglRetSuccess;
631   }
632   if (unlikely((binary_op != kCmpOperatorNoteq) && (binary_op != kCmpOperatorEq))) {
633     logerrprintfww("Error: Invalid --%s value '%s' (finite number expected).\n", flagname_p, val_str);
634     return kPglRetInvalidCmdline;
635   }
636   filterp->val_str = val_str;
637   const char* val_str_end_or_invalid = strchrnul2(val_str, ';', '=');
638   if (unlikely(*val_str_end_or_invalid)) {
639     if (*val_str_end_or_invalid == ';') {
640       logerrprintfww("Error: Invalid --%s value '%s' (semicolon prohibited).\n", flagname_p, val_str);
641     } else {
642       logerrprintfww("Error: Invalid --%s value '%s' ('=' prohibited).\n", flagname_p, val_str);
643     }
644     return kPglRetInvalidCmdline;
645   }
646   filterp->val_slen = val_str_end_or_invalid - val_str;
647   return kPglRetSuccess;
648 }
649 
InfoConditionSatisfied(const char * info_token,const InfoFilter * filterp)650 uint32_t InfoConditionSatisfied(const char* info_token, const InfoFilter* filterp) {
651   const char* prekey = filterp->prekey;
652   const uint32_t key_slen = filterp->key_slen;
653   const CmpBinaryOp binary_op = filterp->binary_op;
654   const char* possible_hit;
655   // search for "[key]=" at start or ";[key]=" later; key_slen includes the
656   //   trailing =
657   if (memequal(info_token, &(prekey[1]), key_slen)) {
658     possible_hit = &(info_token[key_slen]);
659   } else {
660     possible_hit = strstr(info_token, prekey);
661     if (!possible_hit) {
662       return (binary_op == kCmpOperatorNoteq);
663     }
664     possible_hit = &(possible_hit[key_slen + 1]);
665   }
666   if (filterp->val_str) {
667     const uint32_t val_slen = filterp->val_slen;
668     const uint32_t mismatch = ((!memequal(possible_hit, filterp->val_str, val_slen)) || (possible_hit[val_slen] && (possible_hit[val_slen] != ';')));
669     return mismatch ^ (binary_op != kCmpOperatorNoteq);
670   }
671   // bugfix (3 Jan 2020): semicolon (or \0) terminator expected, can't use
672   // ScantokDouble
673   double dxx;
674   const char* scan_end = ScanadvDouble(possible_hit, &dxx);
675   if ((!scan_end) || ((*scan_end != ';') && (*scan_end))) {
676     return (binary_op == kCmpOperatorNoteq);
677   }
678   const double val = filterp->val;
679   switch (binary_op) {
680   case kCmpOperatorNoteq:
681     return (dxx != val);
682   case kCmpOperatorLe:
683     return (dxx < val);
684   case kCmpOperatorLeq:
685     return (dxx <= val);
686   case kCmpOperatorGe:
687     return (dxx > val);
688   case kCmpOperatorGeq:
689     return (dxx >= val);
690   case kCmpOperatorEq:
691     return (dxx == val);
692   }
693   // should be unreachable, but some gcc versions complain
694   return 0;
695 }
696 
SplitPar(const uint32_t * variant_bps,UnsortedVar vpos_sortstatus,uint32_t splitpar_bound1,uint32_t splitpar_bound2,uintptr_t * variant_include,uintptr_t * loaded_chr_mask,ChrInfo * cip,uint32_t * chrs_encountered_m1_ptr,uint32_t * exclude_ct_ptr)697 PglErr SplitPar(const uint32_t* variant_bps, UnsortedVar vpos_sortstatus, uint32_t splitpar_bound1, uint32_t splitpar_bound2, uintptr_t* variant_include, uintptr_t* loaded_chr_mask, ChrInfo* cip, uint32_t* chrs_encountered_m1_ptr, uint32_t* exclude_ct_ptr) {
698   const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
699   if (IsI32Neg(x_code) || (!IsSet(loaded_chr_mask, x_code))) {
700     logerrputs("Warning: --split-par had no effect (no X chromosome in dataset).\n");
701     return kPglRetSuccess;
702   }
703   const uint32_t par1_code = cip->xymt_codes[kChrOffsetPAR1];
704   const uint32_t par2_code = cip->xymt_codes[kChrOffsetPAR2];
705   if (unlikely(IsI32Neg(par2_code))) {
706     // may want to remove this restriction later
707     logerrputs("Error: --split-par cannot currently be used with a custom chromosome set.\n");
708     return kPglRetInvalidCmdline;
709   }
710   if (unlikely(IsSet(loaded_chr_mask, par1_code) || IsSet(loaded_chr_mask, par2_code))) {
711     logerrputs("Error: --split-par cannot be used on a dataset which already contains a PAR1 or\nPAR2 region.\n");
712     return kPglRetInvalidCmdline;
713   }
714   if (unlikely(vpos_sortstatus & kfUnsortedVarBp)) {
715     logerrputs("Error: --split-par cannot be used with an unsorted .bim/.pvar file.\n");
716     return kPglRetInvalidCmdline;
717   }
718   const uint32_t orig_xchr_fo_idx = cip->chr_idx_to_foidx[x_code];
719   const uint32_t orig_x_start = cip->chr_fo_vidx_start[orig_xchr_fo_idx];
720   const uint32_t orig_x_end = cip->chr_fo_vidx_start[orig_xchr_fo_idx + 1];
721   const uint32_t par1_end = orig_x_start + CountSortedSmallerU32(&(variant_bps[orig_x_start]), orig_x_end - orig_x_start, splitpar_bound1 + 1);
722   const uint32_t par2_start = par1_end + CountSortedSmallerU32(&(variant_bps[par1_end]), orig_x_end - par1_end, splitpar_bound2);
723   uint32_t tot_codes_changed = (par1_end - orig_x_start) + (orig_x_end - par2_start);
724   if (!tot_codes_changed) {
725     logerrputs("Warning: --split-par had no effect (no X variants were in the PARs).\n");
726     return kPglRetSuccess;
727   }
728   // one of the PARs, and/or the main chrX body, may be empty; that's not a big
729   // deal
730   *chrs_encountered_m1_ptr += 2;
731   const uint32_t chrs_encountered_m1 = *chrs_encountered_m1_ptr;
732   cip->chr_fo_vidx_start[chrs_encountered_m1 + 1] = cip->chr_fo_vidx_start[chrs_encountered_m1 - 1];
733   for (uint32_t chr_fo_idx = chrs_encountered_m1 - 2; chr_fo_idx != orig_xchr_fo_idx; --chr_fo_idx) {
734     cip->chr_fo_vidx_start[chr_fo_idx + 2] = cip->chr_fo_vidx_start[chr_fo_idx];
735     const uint32_t cur_chr_idx = cip->chr_file_order[chr_fo_idx];
736     cip->chr_file_order[chr_fo_idx + 2] = cur_chr_idx;
737     cip->chr_idx_to_foidx[cur_chr_idx] = chr_fo_idx + 2;
738   }
739   cip->chr_fo_vidx_start[orig_xchr_fo_idx + 1] = par1_end;
740   cip->chr_fo_vidx_start[orig_xchr_fo_idx + 2] = par2_start;
741   cip->chr_file_order[orig_xchr_fo_idx] = par1_code;
742   cip->chr_file_order[orig_xchr_fo_idx + 1] = x_code;
743   cip->chr_file_order[orig_xchr_fo_idx + 2] = par2_code;
744   cip->chr_idx_to_foidx[par1_code] = orig_xchr_fo_idx;
745   cip->chr_idx_to_foidx[x_code] = orig_xchr_fo_idx + 1;
746   cip->chr_idx_to_foidx[par2_code] = orig_xchr_fo_idx + 2;
747   uintptr_t* chr_mask = cip->chr_mask;
748   if (par1_end > orig_x_start) {
749     if (!IsSet(chr_mask, par1_code)) {
750       *exclude_ct_ptr += PopcountBitRange(variant_include, orig_x_start, par1_end);
751       ClearBitsNz(orig_x_start, par1_end, variant_include);
752     } else {
753       SetBit(par1_code, loaded_chr_mask);
754     }
755   }
756   if (par1_end == par2_start) {
757     ClearBit(x_code, chr_mask);
758   } else if (!IsSet(chr_mask, x_code)) {
759     ClearBit(x_code, chr_mask);
760     *exclude_ct_ptr += PopcountBitRange(variant_include, par1_end, par2_start);
761     ClearBitsNz(par1_end, par2_start, variant_include);
762   }
763   if (par2_start < orig_x_end) {
764     if (!IsSet(chr_mask, par2_code)) {
765       *exclude_ct_ptr += PopcountBitRange(variant_include, par2_start, orig_x_end);
766       ClearBitsNz(par2_start, orig_x_end, variant_include);
767     } else {
768       SetBit(par2_code, loaded_chr_mask);
769     }
770   }
771   logprintf("--split-par: %u chromosome code%s changed.\n", tot_codes_changed, (tot_codes_changed == 1)? "" : "s");
772   return kPglRetSuccess;
773 }
774 
775 static_assert((!(kMaxIdSlen % kCacheline)), "LoadPvar() must be updated.");
LoadPvar(const char * pvarname,const char * var_filter_exceptions_flattened,const char * varid_template_str,const char * varid_multi_template_str,const char * varid_multi_nonsnp_template_str,const char * missing_varid_match,const char * require_info_flattened,const char * require_no_info_flattened,const CmpExpr * extract_if_info_exprp,const CmpExpr * exclude_if_info_exprp,MiscFlags misc_flags,PvarPsamFlags pvar_psam_flags,uint32_t xheader_needed,uint32_t qualfilter_needed,float var_min_qual,uint32_t splitpar_bound1,uint32_t splitpar_bound2,uint32_t new_variant_id_max_allele_slen,uint32_t snps_only,uint32_t split_chr_ok,uint32_t filter_min_allele_ct,uint32_t filter_max_allele_ct,uint32_t max_thread_ct,ChrInfo * cip,uint32_t * max_variant_id_slen_ptr,uint32_t * info_reload_slen_ptr,UnsortedVar * vpos_sortstatus_ptr,char ** xheader_ptr,uintptr_t ** variant_include_ptr,uint32_t ** variant_bps_ptr,char *** variant_ids_ptr,uintptr_t ** allele_idx_offsets_ptr,const char *** allele_storage_ptr,uintptr_t ** qual_present_ptr,float ** quals_ptr,uintptr_t ** filter_present_ptr,uintptr_t ** filter_npass_ptr,char *** filter_storage_ptr,uintptr_t ** nonref_flags_ptr,double ** variant_cms_ptr,ChrIdx ** chr_idxs_ptr,uint32_t * raw_variant_ct_ptr,uint32_t * variant_ct_ptr,uint32_t * max_allele_ct_ptr,uint32_t * max_allele_slen_ptr,uintptr_t * xheader_blen_ptr,InfoFlags * info_flags_ptr,uint32_t * max_filter_slen_ptr)776 PglErr LoadPvar(const char* pvarname, const char* var_filter_exceptions_flattened, const char* varid_template_str, const char* varid_multi_template_str, const char* varid_multi_nonsnp_template_str, const char* missing_varid_match, const char* require_info_flattened, const char* require_no_info_flattened, const CmpExpr* extract_if_info_exprp, const CmpExpr* exclude_if_info_exprp, MiscFlags misc_flags, PvarPsamFlags pvar_psam_flags, uint32_t xheader_needed, uint32_t qualfilter_needed, float var_min_qual, uint32_t splitpar_bound1, uint32_t splitpar_bound2, uint32_t new_variant_id_max_allele_slen, uint32_t snps_only, uint32_t split_chr_ok, uint32_t filter_min_allele_ct, uint32_t filter_max_allele_ct, uint32_t max_thread_ct, ChrInfo* cip, uint32_t* max_variant_id_slen_ptr, uint32_t* info_reload_slen_ptr, UnsortedVar* vpos_sortstatus_ptr, char** xheader_ptr, uintptr_t** variant_include_ptr, uint32_t** variant_bps_ptr, char*** variant_ids_ptr, uintptr_t** allele_idx_offsets_ptr, const char*** allele_storage_ptr, uintptr_t** qual_present_ptr, float** quals_ptr, uintptr_t** filter_present_ptr, uintptr_t** filter_npass_ptr, char*** filter_storage_ptr, uintptr_t** nonref_flags_ptr, double** variant_cms_ptr, ChrIdx** chr_idxs_ptr, uint32_t* raw_variant_ct_ptr, uint32_t* variant_ct_ptr, uint32_t* max_allele_ct_ptr, uint32_t* max_allele_slen_ptr, uintptr_t* xheader_blen_ptr, InfoFlags* info_flags_ptr, uint32_t* max_filter_slen_ptr) {
777   // chr_info, max_variant_id_slen, and info_reload_slen are in/out; just
778   // outparameters after them.  (Due to its large size in some VCFs, INFO is
779   // not kept in memory for now.  This has a speed penalty, of course; maybe
780   // it's worthwhile to conditionally load it later.)
781 
782   // allele_idx_offsets currently assumed to be initialized to nullptr
783 
784   // should handle raw_variant_ct == 0 properly
785 
786   // possible todo: optionally skip allele code loading
787   // probable todo: load INFO:END.  (does this allow the CNV module to be
788   //   unified with the rest of the program?)  but this will probably wait
789   //   until I need to analyze some sort of CNV data, and that day keeps
790   //   getting postponed... for now, the BCF exporter performs its own parsing
791   //   of INFO:END so that it can fill each variant's rlen field correctly.
792   // possible todo: require FILTER to only contain values declared in header,
793   //   and modify its storage accordingly?  (pointless for now, but worthwhile
794   //   to keep an eye on what typical VCF files look like.)
795 
796   // Workspace is used as follows:
797   // |--header, allele_storage->----|--other return arrays---|--linebuf--|-
798   //                                                        1/4
799   //
800   // -temp-->----|----<- filter failures, variant IDs, long alleles--|
801   //                                                                end
802   // I.e. on successful return, both bigstack_base and bigstack_end will move.
803   // This is designed to be called near the start of a program, at a time when
804   // no large temporary buffer is needed.
805   unsigned char* bigstack_mark = g_bigstack_base;
806   unsigned char* bigstack_end_mark = g_bigstack_end;
807   uintptr_t line_idx = 0;
808   uint32_t max_extra_alt_ct = 0;
809   uint32_t max_allele_slen = 1;
810   PglErr reterr = kPglRetSuccess;
811   TextStream pvar_txs;
812   PreinitTextStream(&pvar_txs);
813   {
814     const uintptr_t quarter_left = RoundDownPow2(bigstack_left() / 4, kCacheline);
815     uint32_t max_line_blen;
816     if (unlikely(StandardizeMaxLineBlenEx(quarter_left, kLoadPvarBlockSize * 2 * sizeof(intptr_t), &max_line_blen))) {
817       goto LoadPvar_ret_NOMEM;
818     }
819     // bugfix (29 Jun 2018): don't cap allele_storage space at 2 GB, otherwise
820     // we're limited to 134M variants
821     unsigned char* rlstream_start = &(bigstack_mark[quarter_left]);
822     g_bigstack_base = rlstream_start;
823     uint32_t decompress_thread_ct = max_thread_ct - 1;
824     // 3 still seems best on a heavily multicore Linux test machine
825     if (decompress_thread_ct > 3) {
826       decompress_thread_ct = 3;
827     } else if (!decompress_thread_ct) {
828       decompress_thread_ct = 1;
829     }
830     reterr = InitTextStream(pvarname, max_line_blen, decompress_thread_ct, &pvar_txs);
831     if (unlikely(reterr)) {
832       goto LoadPvar_ret_TSTREAM_FAIL;
833     }
834     unsigned char* tmp_alloc_base = g_bigstack_base;
835     g_bigstack_base = bigstack_mark;
836 
837     char* xheader_end = ((pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) || xheader_needed)? R_CAST(char*, bigstack_mark) : nullptr;
838     uint32_t chrset_present = 0;
839     uint32_t info_pr_present = 0;
840     uint32_t info_pr_nonflag_present = 0;
841     uint32_t info_nonpr_present = 0;
842     char* line_iter = TextLineEnd(&pvar_txs);
843     char* line_start;
844     while (1) {
845       ++line_idx;
846       if (!TextGetUnsafe2(&pvar_txs, &line_iter)) {
847         if (unlikely(TextStreamErrcode2(&pvar_txs, &reterr))) {
848           goto LoadPvar_ret_TSTREAM_FAIL;
849         }
850         line_start = K_CAST(char*, &(g_one_char_strs[0]));
851         --line_iter;
852         break;
853       }
854       line_start = line_iter;
855       if ((*line_start != '#') || tokequal_k(line_start, "#CHROM")) {
856         break;
857       }
858       if (StrStartsWithUnsafe(line_start, "##INFO=<ID=PR,Number=")) {
859         if (unlikely(info_pr_present || info_pr_nonflag_present)) {
860           snprintf(g_logbuf, kLogbufSize, "Error: Duplicate INFO:PR header line in %s.\n", pvarname);
861           goto LoadPvar_ret_MALFORMED_INPUT_WW;
862         }
863         info_pr_nonflag_present = !StrStartsWithUnsafe(&(line_start[strlen("##INFO=<ID=PR,Number=")]), "0,Type=Flag,Description=");
864         info_pr_present = 1 - info_pr_nonflag_present;
865         if (info_pr_nonflag_present) {
866           logerrprintfww("Warning: Header line %" PRIuPTR " of %s has an unexpected definition of INFO:PR. This interferes with a few merge and liftover operations.\n", line_idx, pvarname);
867         }
868       } else if ((!info_nonpr_present) && StrStartsWithUnsafe(line_start, "##INFO=<ID=")) {
869         info_nonpr_present = 1;
870       }
871       if (StrStartsWithUnsafe(line_start, "##chrSet=<")) {
872         if (unlikely(chrset_present)) {
873           snprintf(g_logbuf, kLogbufSize, "Error: Multiple ##chrSet header lines in %s.\n", pvarname);
874           goto LoadPvar_ret_MALFORMED_INPUT_WW;
875         }
876         chrset_present = 1;
877         const uint32_t cmdline_chrset = (cip->chrset_source == kChrsetSourceCmdline) && (!(misc_flags & kfMiscChrOverrideFile));
878         reterr = ReadChrsetHeaderLine(&(line_start[strlen("##chrSet=<")]), pvarname, misc_flags, line_idx, cip);
879         if (unlikely(reterr)) {
880           goto LoadPvar_ret_1;
881         }
882         if (!cmdline_chrset) {
883           const uint32_t autosome_ct = cip->autosome_ct;
884           if (cip->haploid_mask[0] & 1) {
885             logprintf("chrSet header line: %u autosome%s (haploid).\n", autosome_ct, (autosome_ct == 1)? "" : "s");
886           } else {
887             logprintf("chrSet header line: %u autosome pair%s.\n", autosome_ct, (autosome_ct == 1)? "" : "s");
888           }
889         }
890       } else if (xheader_end) {
891         // if the "pvar file" was actually a VCF, suppress the same lines we'd
892         // suppress when importing with --vcf.
893         if ((!StrStartsWithUnsafe(line_start, "##fileformat=")) &&
894             (!StrStartsWithUnsafe(line_start, "##fileDate=")) &&
895             (!StrStartsWithUnsafe(line_start, "##source=")) &&
896             (!StrStartsWithUnsafe(line_start, "##FORMAT="))) {
897           char* line_end = AdvToDelim(line_start, '\n');
898           uint32_t line_slen = line_end - line_start;
899           if (line_start[line_slen - 1] == '\r') {
900             --line_slen;
901           }
902           if (unlikely(S_CAST(uintptr_t, R_CAST(char*, rlstream_start) - xheader_end) < line_slen + 2)) {
903             goto LoadPvar_ret_NOMEM;
904           }
905           xheader_end = memcpya(xheader_end, line_start, line_slen);
906           AppendBinaryEoln(&xheader_end);
907           line_iter = line_end;
908         }
909       }
910       line_iter = AdvPastDelim(line_iter, '\n');
911     }
912     if (xheader_end) {
913       *xheader_ptr = R_CAST(char*, bigstack_mark);
914       *xheader_blen_ptr = xheader_end - (*xheader_ptr);
915       BigstackBaseSet(xheader_end);
916     }
917     FinalizeChrset(misc_flags, cip);
918     const char** allele_storage = R_CAST(const char**, g_bigstack_base);
919     const char** allele_storage_iter = allele_storage;
920 
921     uint32_t col_skips[8];
922     uint32_t col_types[8];
923     uint32_t relevant_postchr_col_ct = 5;
924     uint32_t alt_col_idx = 4;
925     uint32_t load_qual_col = 0;
926     uint32_t load_filter_col = 0;
927     uint32_t info_col_present = 0;
928     uint32_t cm_col_present = 0;
929     if (line_start[0] == '#') {
930       *info_flags_ptr = S_CAST(InfoFlags, (info_pr_present * kfInfoPrFlagPresent) | (info_pr_nonflag_present * kfInfoPrNonflagPresent) | (info_nonpr_present * kfInfoNonprPresent));
931       // parse header
932       // [-1] = #CHROM (must be first column)
933       // [0] = POS
934       // [1] = ID
935       // [2] = REF
936       // [3] = ALT
937       // [4] = QUAL
938       // [5] = FILTER
939       // [6] = INFO
940       // [7] = CM (usually absent)
941 
942       // code is similar to plink 1.9 annotate() and gene_report(), but they
943       // don't have a forced first column
944       // might want to write plink2_common library functions for this...
945       const char* token_end = &(line_start[6]);
946       uint32_t found_header_bitset = 0;
947       relevant_postchr_col_ct = 0;
948       const char* linebuf_iter;
949       for (uint32_t col_idx = 1; ; ++col_idx) {
950         linebuf_iter = FirstNonTspace(token_end);
951         if (IsEolnKns(*linebuf_iter)) {
952           break;
953         }
954         token_end = CurTokenEnd(linebuf_iter);
955         const uint32_t token_slen = token_end - linebuf_iter;
956         uint32_t cur_col_type;
957         if (token_slen <= 3) {
958           if (token_slen == 3) {
959             if (memequal_k(linebuf_iter, "POS", 3)) {
960               cur_col_type = 0;
961             } else if (memequal_k(linebuf_iter, "REF", 3)) {
962               cur_col_type = 2;
963             } else if (memequal_k(linebuf_iter, "ALT", 3)) {
964               cur_col_type = 3;
965               alt_col_idx = col_idx;
966             } else {
967               continue;
968             }
969           } else if (token_slen == 2) {
970             if (memequal_k(linebuf_iter, "ID", 2)) {
971               cur_col_type = 1;
972             } else if (memequal_k(linebuf_iter, "CM", 2)) {
973               cur_col_type = 7;
974               cm_col_present = 1;
975             } else {
976               continue;
977             }
978           } else {
979             continue;
980           }
981         } else if (strequal_k(linebuf_iter, "QUAL", token_slen)) {
982           load_qual_col = 2 * ((pvar_psam_flags & (kfPvarColMaybequal | kfPvarColQual)) || qualfilter_needed) + (var_min_qual != -1);
983           if (!load_qual_col) {
984             continue;
985           }
986           cur_col_type = 4;
987         } else if (strequal_k(linebuf_iter, "INFO", token_slen)) {
988           cur_col_type = 6;
989           info_col_present = 1;
990         } else if (token_slen == 6) {
991           if (memequal_k(linebuf_iter, "FILTER", 6)) {
992             load_filter_col = 2 * ((pvar_psam_flags & (kfPvarColMaybefilter | kfPvarColFilter)) || qualfilter_needed) + ((misc_flags / kfMiscExcludePvarFilterFail) & 1);
993             if (!load_filter_col) {
994               continue;
995             }
996             cur_col_type = 5;
997           } else if (memequal_k(linebuf_iter, "FORMAT", 6)) {
998             break;
999           } else {
1000             continue;
1001           }
1002         } else {
1003           continue;
1004         }
1005         const uint32_t cur_col_type_shifted = 1 << cur_col_type;
1006         if (unlikely(found_header_bitset & cur_col_type_shifted)) {
1007           // known token, so no overflow danger
1008           char* write_iter = strcpya_k(g_logbuf, "Error: Duplicate column header '");
1009           write_iter = memcpya(write_iter, linebuf_iter, token_slen);
1010           write_iter = strcpya_k(write_iter, "' on line ");
1011           write_iter = wtoa(line_idx, write_iter);
1012           write_iter = strcpya_k(write_iter, " of ");
1013           write_iter = strcpya(write_iter, pvarname);
1014           memcpy_k(write_iter, ".\n\0", 4);
1015           goto LoadPvar_ret_MALFORMED_INPUT_WW;
1016         }
1017         found_header_bitset |= cur_col_type_shifted;
1018         col_skips[relevant_postchr_col_ct] = col_idx;
1019         col_types[relevant_postchr_col_ct++] = cur_col_type;
1020       }
1021       if (unlikely((found_header_bitset & 0x0f) != 0x0f)) {
1022         snprintf(g_logbuf, kLogbufSize, "Error: Missing column header(s) on line %" PRIuPTR " of %s. (POS, ID, REF, and ALT are required.)\n", line_idx, pvarname);
1023         goto LoadPvar_ret_MALFORMED_INPUT_WW;
1024       }
1025       if ((var_min_qual != -1) && (!(found_header_bitset & 0x10))) {
1026         logerrputs("Error: --var-min-qual used on a variant file with no QUAL column.\n");
1027         goto LoadPvar_ret_INCONSISTENT_INPUT;
1028       }
1029       for (uint32_t rpc_col_idx = relevant_postchr_col_ct - 1; rpc_col_idx; --rpc_col_idx) {
1030         col_skips[rpc_col_idx] -= col_skips[rpc_col_idx - 1];
1031       }
1032       // skip this line in main loop
1033       line_iter = K_CAST(char*, AdvToDelim(linebuf_iter, '\n'));
1034       line_start = line_iter;
1035     } else if (line_start[0] != '\n') {
1036       if (var_min_qual != -1) {
1037         logerrputs("Error: --var-min-qual used on a variant file with no QUAL column.\n");
1038         goto LoadPvar_ret_INCONSISTENT_INPUT;
1039       }
1040       *info_flags_ptr = kfInfoPrNonrefDefault;
1041       col_skips[0] = 1;
1042       col_skips[1] = 1;
1043       col_skips[2] = 1;
1044       col_skips[3] = 1;
1045       col_types[0] = 1;
1046       // CM column is formally optional in headerless .pvar files (and it was
1047       // "secretly" optional for the standard plink 1.9 standard .bim loader).
1048       // If the line has exactly 5 columns, assume CM is omitted.
1049 
1050       // Note that this doesn't break in the line_start = &(g_one_char_strs[0])
1051       // case.
1052       const char* linebuf_iter = NextTokenMult(line_start, 4);
1053       if (unlikely(!linebuf_iter)) {
1054         goto LoadPvar_ret_MISSING_TOKENS;
1055       }
1056       linebuf_iter = NextToken(linebuf_iter);
1057       if (!linebuf_iter) {
1058         // #CHROM ID POS ALT REF
1059         relevant_postchr_col_ct = 4;
1060         col_types[1] = 0;
1061         col_types[2] = 3;
1062         col_types[3] = 2;
1063         alt_col_idx = 3;
1064       } else {
1065         // #CHROM ID CM POS ALT REF
1066         col_skips[4] = 1;
1067         col_types[1] = 7;
1068         col_types[2] = 0;
1069         col_types[3] = 3;
1070         col_types[4] = 2;
1071         // alt_col_idx = 4;
1072         cm_col_present = 1;
1073       }
1074     }
1075     uint32_t info_reload_slen = *info_reload_slen_ptr;
1076 
1077     // done with header.  line_start now points to either the beginning of the
1078     // first real line, or an eoln character.
1079     uint32_t max_variant_id_slen = *max_variant_id_slen_ptr;
1080     uint32_t chrs_encountered_m1 = UINT32_MAX;  // intentional overflow
1081     uint32_t prev_chr_code = UINT32_MAX;  // force initial mismatch
1082     uint32_t raw_variant_ct = 0;
1083     uintptr_t* chr_mask = cip->chr_mask;
1084     const char* missing_allele_str = &(g_one_char_strs[92]);
1085     double last_cm = -DBL_MAX;
1086     int32_t last_bp = 0;
1087 
1088     // this way, we only need to check allele_storage_iter against this (i)
1089     // when processing a multiallelic variant or (ii) at the end of a block
1090     const char** allele_storage_limit = R_CAST(const char**, &(rlstream_start[kLoadPvarBlockSize * (-2) * sizeof(intptr_t)]));
1091 
1092     uintptr_t* loaded_chr_mask = R_CAST(uintptr_t*, tmp_alloc_base);
1093     unsigned char* tmp_alloc_end = bigstack_end_mark;
1094     // guaranteed to succeed since max_line_blen > 128k, etc.
1095     /*
1096     if ((uintptr_t)(tmp_alloc_end - tmp_alloc_base) < RoundUpPow2(kChrMaskWords * sizeof(intptr_t), kCacheline)) {
1097       goto LoadPvar_ret_NOMEM;
1098     }
1099     */
1100     tmp_alloc_base = &(tmp_alloc_base[RoundUpPow2(kChrMaskWords * sizeof(intptr_t), kCacheline)]);
1101     // bugfix (2 Jun 2017): forgot to zero-initialize loaded_chr_mask
1102     ZeroWArr(kChrMaskWords, loaded_chr_mask);
1103 
1104     InfoExist* info_existp = nullptr;
1105     if (require_info_flattened) {
1106       reterr = InfoExistInit(tmp_alloc_end, require_info_flattened, "require-info", &tmp_alloc_base, &info_existp);
1107       if (unlikely(reterr)) {
1108         goto LoadPvar_ret_1;
1109       }
1110     }
1111     InfoExist* info_nonexistp = nullptr;
1112     if (require_no_info_flattened) {
1113       reterr = InfoExistInit(tmp_alloc_end, require_no_info_flattened, "require-no-info", &tmp_alloc_base, &info_nonexistp);
1114       if (unlikely(reterr)) {
1115         goto LoadPvar_ret_1;
1116       }
1117     }
1118     InfoFilter info_keep;
1119     info_keep.prekey = nullptr;
1120     if (extract_if_info_exprp->pheno_name) {
1121       // todo: also print warning (or optionally error out?) if header line is
1122       // missing or doesn't match type expectation
1123       // (same for --require-info)
1124       reterr = InfoFilterInit(tmp_alloc_end, extract_if_info_exprp, "extract-if-info", &tmp_alloc_base, &info_keep);
1125       if (unlikely(reterr)) {
1126         goto LoadPvar_ret_1;
1127       }
1128     }
1129     InfoFilter info_remove;
1130     info_remove.prekey = nullptr;
1131     if (exclude_if_info_exprp->pheno_name) {
1132       reterr = InfoFilterInit(tmp_alloc_end, exclude_if_info_exprp, "exclude-if-info", &tmp_alloc_base, &info_remove);
1133       if (unlikely(reterr)) {
1134         goto LoadPvar_ret_1;
1135       }
1136     }
1137     if (!info_col_present) {
1138       if (unlikely(require_info_flattened)) {
1139         logerrputs("Error: --require-info used on a variant file with no INFO column.\n");
1140         goto LoadPvar_ret_INCONSISTENT_INPUT;
1141       }
1142       if (unlikely(info_keep.prekey)) {
1143         logerrputs("Error: --extract-if-info used on a variant file with no INFO column.\n");
1144         goto LoadPvar_ret_INCONSISTENT_INPUT;
1145       }
1146       info_pr_present = 0;
1147       info_reload_slen = 0;
1148     } else if ((!info_pr_present) && (!info_reload_slen) && (!info_existp) && (!info_nonexistp) && (!info_keep.prekey) && (!info_remove.prekey)) {
1149       info_col_present = 0;
1150     }
1151 
1152     uint32_t fexcept_ct = 0;
1153     uintptr_t max_fexcept_blen = 2;
1154     char* sorted_fexcepts = nullptr;
1155     if (var_filter_exceptions_flattened) {
1156       if (unlikely(MultistrToStrboxDedupArenaAlloc(tmp_alloc_end, var_filter_exceptions_flattened, &tmp_alloc_base, &sorted_fexcepts, &fexcept_ct, &max_fexcept_blen))) {
1157         goto LoadPvar_ret_NOMEM;
1158       }
1159     }
1160     const uint32_t new_variant_id_overflow_missing = (misc_flags / kfMiscNewVarIdOverflowMissing) & 1;
1161     char* chr_output_name_buf = nullptr;
1162     VaridTemplate* varid_templatep = nullptr;
1163     VaridTemplate* varid_multi_templatep = nullptr;
1164     VaridTemplate* varid_multi_nonsnp_templatep = nullptr;
1165     uint32_t missing_varid_blen = 0;
1166     uint32_t missing_varid_match_slen = 0;
1167     if (varid_template_str) {
1168       if (unlikely(S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) < kMaxIdSlen + 3 * RoundUpPow2(sizeof(VaridTemplate), kCacheline))) {
1169         goto LoadPvar_ret_NOMEM;
1170       }
1171       chr_output_name_buf = R_CAST(char*, tmp_alloc_base);
1172       tmp_alloc_base = &(tmp_alloc_base[kMaxIdSlen]);
1173       if (!missing_varid_match) {
1174         missing_varid_match = &(g_one_char_strs[92]);  // '.'
1175       }
1176       missing_varid_blen = strlen(missing_varid_match);
1177       if (misc_flags & kfMiscSetMissingVarIds) {
1178         missing_varid_match_slen = missing_varid_blen;
1179       }
1180       ++missing_varid_blen;
1181       varid_templatep = R_CAST(VaridTemplate*, tmp_alloc_base);
1182       tmp_alloc_base = &(tmp_alloc_base[RoundUpPow2(sizeof(VaridTemplate), kCacheline)]);
1183       const uint32_t overflow_substitute_blen = new_variant_id_overflow_missing? missing_varid_blen : 0;
1184       VaridTemplateInit(varid_template_str, missing_varid_match, chr_output_name_buf, new_variant_id_max_allele_slen, overflow_substitute_blen, varid_templatep);
1185       if (varid_multi_template_str) {
1186         varid_multi_templatep = R_CAST(VaridTemplate*, tmp_alloc_base);
1187         tmp_alloc_base = &(tmp_alloc_base[RoundUpPow2(sizeof(VaridTemplate), kCacheline)]);
1188         VaridTemplateInit(varid_multi_template_str, missing_varid_match, chr_output_name_buf, new_variant_id_max_allele_slen, overflow_substitute_blen, varid_multi_templatep);
1189       }
1190       if (varid_multi_nonsnp_template_str) {
1191         varid_multi_nonsnp_templatep = R_CAST(VaridTemplate*, tmp_alloc_base);
1192         tmp_alloc_base = &(tmp_alloc_base[RoundUpPow2(sizeof(VaridTemplate), kCacheline)]);
1193         VaridTemplateInit(varid_multi_nonsnp_template_str, missing_varid_match, chr_output_name_buf, new_variant_id_max_allele_slen, overflow_substitute_blen, varid_multi_nonsnp_templatep);
1194       }
1195     }
1196 
1197     // prevent later return-array allocations from overlapping with temporary
1198     // storage
1199     g_bigstack_end = tmp_alloc_base;
1200 
1201     // prevent variant_id_htable_find from breaking
1202     if (R_CAST(const char*, tmp_alloc_end) > (&(g_one_char_strs[512 - kMaxIdSlen]))) {
1203       tmp_alloc_end = R_CAST(unsigned char*, K_CAST(char*, &(g_one_char_strs[512 - kMaxIdSlen])));
1204     }
1205     const uint32_t allow_extra_chrs = (misc_flags / kfMiscAllowExtraChrs) & 1;
1206     const uint32_t merge_par = ((misc_flags & (kfMiscMergePar | kfMiscMergeX)) != 0);
1207     const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
1208     const char input_missing_geno_char = *g_input_missing_geno_ptr;
1209     uint32_t parx_code = cip->xymt_codes[kChrOffsetPAR1];
1210     uint32_t par2_code = cip->xymt_codes[kChrOffsetPAR2];
1211     if (misc_flags & kfMiscMergeX) {
1212       parx_code = cip->xymt_codes[kChrOffsetXY];
1213       par2_code = UINT32_MAXM1;
1214     }
1215     uint32_t merge_par_ct = 0;
1216 
1217     // Corner case: with --split-par + --not-chr x, we should keep the
1218     // pseudoautosomal regions.  To facilitate this, we temporarily don't mask
1219     // out chrX; SplitPar() handles this properly later.
1220     const uint32_t splitpar_and_exclude_x = splitpar_bound2 && (!IsI32Neg(x_code)) && (!IsSet(cip->chr_mask, x_code));
1221     if (splitpar_and_exclude_x) {
1222       SetBit(x_code, cip->chr_mask);
1223     }
1224 
1225     uint8_t acgtm_bool_table[256] = {
1226       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1227       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1228       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1229       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1230       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1231       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1232       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1233       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1234       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1235       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1236       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1237       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1238       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1239       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1240       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1241       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1242     if (snps_only > 1) {
1243       acgtm_bool_table[ctou32(input_missing_geno_char)] = 1;
1244     }
1245 
1246     uint32_t* cur_bps = nullptr;
1247     uintptr_t* cur_allele_idxs = nullptr;
1248     char** cur_ids = nullptr;
1249     uintptr_t* cur_include = nullptr;
1250     uintptr_t* cur_qual_present = nullptr;
1251     float* cur_quals = nullptr;
1252     uintptr_t* cur_filter_present = nullptr;
1253     uintptr_t* cur_filter_npass = nullptr;
1254     char** cur_filter_storage = nullptr;
1255     uintptr_t* cur_nonref_flags = nullptr;
1256 
1257     // only need this for --set-{missing,all}-var-ids error message
1258     uint32_t max_chr_slen = 0;
1259 
1260     uint32_t max_filter_slen = 0;
1261     uint32_t exclude_ct = 0;
1262 
1263     // only allocated when necessary
1264     // if we want to scale this approach to more fields, we'll need to add a
1265     // few pointers to the start of each block.  right now, we force cur_cms[]
1266     // to be allocated before cur_chr_idxs[] when both are present, but this
1267     // is error-prone.
1268     uint32_t at_least_one_npass_filter = 0;
1269     uint32_t at_least_one_nzero_cm = 0;
1270     uintptr_t new_variant_id_allele_len_overflow = 0;
1271     double* cur_cms = nullptr;
1272     uint32_t cms_start_block = UINT32_MAX;
1273 
1274     ChrIdx* cur_chr_idxs = nullptr;
1275     uint32_t chr_idxs_start_block = UINT32_MAX;
1276     uint32_t is_split_chr = 0;
1277     UnsortedVar vpos_sortstatus = kfUnsortedVar0;
1278 
1279     if (IsEolnKns(*line_start)) {
1280       ++line_iter;
1281       ++line_idx;
1282     } else {
1283       line_iter = line_start;
1284     }
1285     for (; TextGetUnsafe2(&pvar_txs, &line_iter); ++line_iter, ++line_idx) {
1286       if (unlikely(line_iter[0] == '#')) {
1287         snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #CHROM header line is present it must denote the end of the header block.)\n", line_idx, pvarname);
1288         goto LoadPvar_ret_MALFORMED_INPUT_WW;
1289       }
1290 #ifdef __LP64__
1291       // maximum prime < 2^32 is 4294967291; quadratic hashing guarantee
1292       // breaks down past that divided by 2.
1293       if (unlikely(raw_variant_ct == 0x7ffffffd)) {
1294         logerrputs("Error: " PROG_NAME_STR " does not support more than 2^31 - 3 variants.  We recommend using\nother software for very deep studies of small numbers of genomes.\n");
1295         goto LoadPvar_ret_MALFORMED_INPUT;
1296       }
1297 #endif
1298       const uint32_t variant_idx_lowbits = raw_variant_ct % kLoadPvarBlockSize;
1299       if (!variant_idx_lowbits) {
1300         if (unlikely(
1301                 (S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) <=
1302                   kLoadPvarBlockSize *
1303                     (sizeof(int32_t) +
1304                      2 * sizeof(intptr_t) +
1305                      at_least_one_nzero_cm * sizeof(double)) +
1306                   is_split_chr * sizeof(ChrIdx) +
1307                   (1 + info_pr_present) * (kLoadPvarBlockSize / CHAR_BIT) +
1308                   (load_qual_col? ((kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(float)) : 0) +
1309                   (load_filter_col? (2 * (kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(intptr_t)) : 0)) ||
1310                 (allele_storage_iter >= allele_storage_limit))) {
1311           goto LoadPvar_ret_NOMEM;
1312         }
1313         cur_bps = R_CAST(uint32_t*, tmp_alloc_base);
1314         cur_allele_idxs = R_CAST(uintptr_t*, &(tmp_alloc_base[kLoadPvarBlockSize * sizeof(int32_t)]));
1315         cur_ids = R_CAST(char**, &(tmp_alloc_base[kLoadPvarBlockSize * (sizeof(int32_t) + sizeof(intptr_t))]));
1316         cur_include = R_CAST(uintptr_t*, &(tmp_alloc_base[kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t))]));
1317         SetAllWArr(kLoadPvarBlockSize / kBitsPerWord, cur_include);
1318         tmp_alloc_base = &(tmp_alloc_base[kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t)) + (kLoadPvarBlockSize / CHAR_BIT)]);
1319         if (load_qual_col > 1) {
1320           cur_qual_present = R_CAST(uintptr_t*, tmp_alloc_base);
1321           ZeroWArr(kLoadPvarBlockSize / kBitsPerWord, cur_qual_present);
1322           cur_quals = R_CAST(float*, &(tmp_alloc_base[kLoadPvarBlockSize / CHAR_BIT]));
1323           tmp_alloc_base = &(tmp_alloc_base[kLoadPvarBlockSize * sizeof(float) + (kLoadPvarBlockSize / CHAR_BIT)]);
1324         }
1325         if (load_filter_col > 1) {
1326           cur_filter_present = R_CAST(uintptr_t*, tmp_alloc_base);
1327           cur_filter_npass = R_CAST(uintptr_t*, &(tmp_alloc_base[kLoadPvarBlockSize / CHAR_BIT]));
1328           cur_filter_storage = R_CAST(char**, &(tmp_alloc_base[2 * (kLoadPvarBlockSize / CHAR_BIT)]));
1329           ZeroWArr(kLoadPvarBlockSize / kBitsPerWord, cur_filter_present);
1330           ZeroWArr(kLoadPvarBlockSize / kBitsPerWord, cur_filter_npass);
1331           tmp_alloc_base = &(tmp_alloc_base[2 * (kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(intptr_t)]);
1332         }
1333         if (info_pr_present) {
1334           cur_nonref_flags = R_CAST(uintptr_t*, tmp_alloc_base);
1335           ZeroWArr(kLoadPvarBlockSize / kBitsPerWord, cur_nonref_flags);
1336           tmp_alloc_base = &(tmp_alloc_base[kLoadPvarBlockSize / CHAR_BIT]);
1337         }
1338         if (at_least_one_nzero_cm) {
1339           cur_cms = R_CAST(double*, tmp_alloc_base);
1340           ZeroDArr(kLoadPvarBlockSize, cur_cms);
1341           tmp_alloc_base = R_CAST(unsigned char*, &(cur_cms[kLoadPvarBlockSize]));
1342         }
1343         if (is_split_chr) {
1344           cur_chr_idxs = R_CAST(ChrIdx*, tmp_alloc_base);
1345           tmp_alloc_base = R_CAST(unsigned char*, &(cur_chr_idxs[kLoadPvarBlockSize]));
1346         }
1347       }
1348       char* linebuf_iter = CurTokenEnd(line_iter);
1349       // #CHROM
1350       if (unlikely(*linebuf_iter == '\n')) {
1351         goto LoadPvar_ret_MISSING_TOKENS;
1352       }
1353       uint32_t cur_chr_code;
1354       reterr = GetOrAddChrCodeDestructive(".pvar file", line_idx, allow_extra_chrs, line_iter, linebuf_iter, cip, &cur_chr_code);
1355       if (unlikely(reterr)) {
1356         goto LoadPvar_ret_1;
1357       }
1358       if (merge_par) {
1359         if (cur_chr_code == par2_code) {
1360           // don't permit PAR1 variants after PAR2
1361           parx_code = par2_code;
1362         }
1363         if (cur_chr_code == parx_code) {
1364           ++merge_par_ct;
1365           cur_chr_code = x_code;
1366         }
1367       }
1368       if (cur_chr_code != prev_chr_code) {
1369         prev_chr_code = cur_chr_code;
1370         if (!is_split_chr) {
1371           if (IsSet(loaded_chr_mask, cur_chr_code)) {
1372             if (unlikely(!split_chr_ok)) {
1373               snprintf(g_logbuf, kLogbufSize, "Error: %s has a split chromosome. Use --make-pgen + --sort-vars to remedy this.\n", pvarname);
1374               goto LoadPvar_ret_MALFORMED_INPUT_WW;
1375             }
1376             if (unlikely(S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) < kLoadPvarBlockSize * sizeof(ChrIdx))) {
1377               goto LoadPvar_ret_NOMEM;
1378             }
1379             cur_chr_idxs = R_CAST(ChrIdx*, tmp_alloc_base);
1380             tmp_alloc_base = R_CAST(unsigned char*, &(cur_chr_idxs[kLoadPvarBlockSize]));
1381             // may want to track the first problem variant index
1382             // cip->chr_fo_vidx_start[chrs_encountered_m1] = raw_variant_ct;
1383             BackfillChrIdxs(cip, chrs_encountered_m1, RoundDownPow2(raw_variant_ct, kLoadPvarBlockSize), raw_variant_ct, cur_chr_idxs);
1384             chr_idxs_start_block = raw_variant_ct / kLoadPvarBlockSize;
1385             is_split_chr = 1;
1386             vpos_sortstatus |= kfUnsortedVarBp | kfUnsortedVarCm | kfUnsortedVarSplitChr;
1387           } else {
1388             // how much of this do we need in split-chrom case?
1389             cip->chr_file_order[++chrs_encountered_m1] = cur_chr_code;
1390             cip->chr_fo_vidx_start[chrs_encountered_m1] = raw_variant_ct;
1391             cip->chr_idx_to_foidx[cur_chr_code] = chrs_encountered_m1;
1392             last_cm = -DBL_MAX;
1393           }
1394         }
1395         // always need to set this, if we want to avoid chromosome-length
1396         // overestimation in --sort-vars + early-variant-filter +
1397         // pvar-cols=+vcfheader case.
1398         last_bp = 0;
1399 
1400         SetBit(cur_chr_code, loaded_chr_mask);
1401         if (chr_output_name_buf) {
1402           char* chr_name_end = chrtoa(cip, cur_chr_code, chr_output_name_buf);
1403           const uint32_t chr_slen = chr_name_end - chr_output_name_buf;
1404           if (chr_slen > max_chr_slen) {
1405             max_chr_slen = chr_slen;
1406           }
1407           const int32_t chr_slen_delta = chr_slen - varid_templatep->chr_slen;
1408           varid_templatep->chr_slen = chr_slen;
1409           varid_templatep->base_len += chr_slen_delta;
1410           if (varid_multi_templatep) {
1411             varid_multi_templatep->chr_slen = chr_slen;
1412             varid_multi_templatep->base_len += chr_slen_delta;
1413           }
1414           if (varid_multi_nonsnp_templatep) {
1415             varid_multi_nonsnp_templatep->chr_slen = chr_slen;
1416             varid_multi_nonsnp_templatep->base_len += chr_slen_delta;
1417           }
1418         }
1419       }
1420       *linebuf_iter = '\t';
1421 
1422       // could make this store (and cur_allele_idxs[] allocation) conditional
1423       // on a multiallelic variant being sighted, but unlike the CM column
1424       // this should become common
1425       cur_allele_idxs[variant_idx_lowbits] = allele_storage_iter - allele_storage;
1426 
1427       char* token_ptrs[8];
1428       uint32_t token_slens[8];
1429       uint32_t extra_alt_ct;
1430       if (IsSet(chr_mask, cur_chr_code) || info_pr_present) {
1431         linebuf_iter = TokenLex(linebuf_iter, col_types, col_skips, relevant_postchr_col_ct, token_ptrs, token_slens);
1432         if (unlikely(!linebuf_iter)) {
1433           goto LoadPvar_ret_MISSING_TOKENS;
1434         }
1435 
1436         extra_alt_ct = CountByte(token_ptrs[3], ',', token_slens[3]);
1437         if (extra_alt_ct > max_extra_alt_ct) {
1438           max_extra_alt_ct = extra_alt_ct;
1439         }
1440 
1441         // It is possible for the info_token[info_slen] assignment below to
1442         // clobber the line terminator, so we advance line_iter to eoln here
1443         // and never reference it again before the next line.
1444         line_iter = AdvToDelim(linebuf_iter, '\n');
1445         if (info_col_present) {
1446           const uint32_t info_slen = token_slens[6];
1447           if (info_slen > info_reload_slen) {
1448             info_reload_slen = info_slen;
1449           }
1450           char* info_token = token_ptrs[6];
1451           info_token[info_slen] = '\0';
1452           if (info_pr_present) {
1453             // always load all nonref_flags entries so (i) --ref-from-fa +
1454             // --make-just-pvar works and (ii) they can be compared against
1455             // the .pgen.
1456             if ((memequal_k(info_token, "PR", 2) && ((info_slen == 2) || (info_token[2] == ';'))) || memequal_k(&(info_token[S_CAST(int32_t, info_slen) - 3]), ";PR", 3)) {
1457               SetBit(variant_idx_lowbits, cur_nonref_flags);
1458             } else {
1459               const char* first_info_end = strchr(info_token, ';');
1460               if (first_info_end && strstr(first_info_end, ";PR;")) {
1461                 SetBit(variant_idx_lowbits, cur_nonref_flags);
1462               }
1463             }
1464             if (!IsSet(chr_mask, cur_chr_code)) {
1465               goto LoadPvar_skip_variant;
1466             }
1467           }
1468           if (info_existp) {
1469             if (!InfoExistCheck(info_token, info_existp)) {
1470               goto LoadPvar_skip_variant;
1471             }
1472           }
1473           if (info_nonexistp) {
1474             if (!InfoNonexistCheck(info_token, info_nonexistp)) {
1475               goto LoadPvar_skip_variant;
1476             }
1477           }
1478           if (info_keep.prekey) {
1479             if (!InfoConditionSatisfied(info_token, &info_keep)) {
1480               goto LoadPvar_skip_variant;
1481             }
1482           }
1483           if (info_remove.prekey) {
1484             if (InfoConditionSatisfied(info_token, &info_remove)) {
1485               goto LoadPvar_skip_variant;
1486             }
1487           }
1488         }
1489         // POS
1490         int32_t cur_bp;
1491         if (unlikely(ScanIntAbsDefcap(token_ptrs[0], &cur_bp))) {
1492           snprintf(g_logbuf, kLogbufSize, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
1493           goto LoadPvar_ret_MALFORMED_INPUT_WW;
1494         }
1495 
1496         if (cur_bp < 0) {
1497           goto LoadPvar_skip_variant;
1498         }
1499 
1500         // QUAL
1501         if (load_qual_col) {
1502           const char* qual_token = token_ptrs[4];
1503           if ((qual_token[0] != '.') || (qual_token[1] > ' ')) {
1504             float cur_qual;
1505             if (unlikely(ScanFloat(qual_token, &cur_qual))) {
1506               snprintf(g_logbuf, kLogbufSize, "Error: Invalid QUAL value on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
1507               goto LoadPvar_ret_MALFORMED_INPUT_WW;
1508             }
1509             if ((load_qual_col & 1) && (cur_qual < var_min_qual)) {
1510               goto LoadPvar_skip_variant;
1511             }
1512             if (load_qual_col > 1) {
1513               SetBit(variant_idx_lowbits, cur_qual_present);
1514               // possible todo: optimize all-quals-same case
1515               // possible todo: conditionally allocate, like cur_cms
1516               cur_quals[variant_idx_lowbits] = cur_qual;
1517             }
1518           } else if (load_qual_col & 1) {
1519             goto LoadPvar_skip_variant;
1520           }
1521         }
1522 
1523         // avoid repeating the ALT string split in --set-...-var-ids case
1524         linebuf_iter = token_ptrs[3];
1525         uint32_t remaining_alt_char_ct = token_slens[3];
1526         // handle --snps-only here instead of later, since it reduces the
1527         // amount of data we need to load
1528         if (snps_only) {
1529           if ((token_slens[2] != 1) || (remaining_alt_char_ct != 2 * extra_alt_ct + 1)) {
1530             goto LoadPvar_skip_variant;
1531           }
1532           if (snps_only > 1) {
1533             // just-acgt
1534             if (!acgtm_bool_table[ctou32(token_ptrs[2][0])]) {
1535               goto LoadPvar_skip_variant;
1536             }
1537             for (uint32_t uii = 0; uii <= extra_alt_ct; ++uii) {
1538               if (!acgtm_bool_table[ctou32(linebuf_iter[2 * uii])]) {
1539                 goto LoadPvar_skip_variant;
1540               }
1541             }
1542           }
1543         }
1544 
1545         // also handle --min-alleles/--max-alleles here
1546         if (filter_min_allele_ct || (filter_max_allele_ct <= kPglMaxAltAlleleCt)) {
1547           uint32_t allele_ct = extra_alt_ct + 2;
1548           if (!extra_alt_ct) {
1549             // allele_ct == 1 or 2 for filtering purposes, depending on
1550             // whether ALT allele matches a missing code.
1551             if ((remaining_alt_char_ct == 1) && ((linebuf_iter[0] == '.') || (linebuf_iter[0] == input_missing_geno_char))) {
1552               allele_ct = 1;
1553             }
1554           }
1555           if ((allele_ct < filter_min_allele_ct) || (allele_ct > filter_max_allele_ct)) {
1556             goto LoadPvar_skip_variant;
1557           }
1558         }
1559 
1560         // FILTER
1561         if (load_filter_col) {
1562           const char* filter_token = token_ptrs[5];
1563           const uint32_t filter_slen = token_slens[5];
1564           if ((filter_slen > 1) || (filter_token[0] != '.')) {
1565             if (!strequal_k(filter_token, "PASS", filter_slen)) {
1566               if (load_filter_col & 1) {
1567                 if (!fexcept_ct) {
1568                   goto LoadPvar_skip_variant;
1569                 }
1570                 const char* filter_token_end = &(filter_token[filter_slen]);
1571                 for (const char* filter_token_iter = filter_token; ; ) {
1572                   const char* cur_filter_name_end = AdvToDelimOrEnd(filter_token_iter, filter_token_end, ';');
1573                   uint32_t cur_slen = cur_filter_name_end - filter_token_iter;
1574                   // possible todo: error out on "PASS", since that
1575                   // shouldn't coexist with other filters
1576                   // possible todo: maintain a dictionary of FILTER
1577                   // strings, analogous to what BCF2 does on disk
1578                   if (bsearch_str(filter_token_iter, sorted_fexcepts, cur_slen, max_fexcept_blen, fexcept_ct) == -1) {
1579                     goto LoadPvar_skip_variant;
1580                   }
1581                   if (cur_filter_name_end == filter_token_end) {
1582                     break;
1583                   }
1584                   filter_token_iter = &(cur_filter_name_end[1]);
1585                 }
1586               }
1587               if (load_filter_col > 1) {
1588                 SetBit(variant_idx_lowbits, cur_filter_npass);
1589                 at_least_one_npass_filter = 1;
1590                 // possible todo: detect repeated filter values, store more
1591                 // compactly
1592                 if (filter_slen > max_filter_slen) {
1593                   max_filter_slen = filter_slen;
1594                 }
1595                 if (StoreStringAtEnd(tmp_alloc_base, filter_token, filter_slen, &tmp_alloc_end, &(cur_filter_storage[variant_idx_lowbits]))) {
1596                   goto LoadPvar_ret_NOMEM;
1597                 }
1598               }
1599             }
1600             if (load_filter_col > 1) {
1601               SetBit(variant_idx_lowbits, cur_filter_present);
1602             }
1603           }
1604         }
1605 
1606         if (cur_chr_idxs) {
1607           cur_chr_idxs[variant_idx_lowbits] = cur_chr_code;
1608         }
1609         if (cur_bp < last_bp) {
1610           vpos_sortstatus |= kfUnsortedVarBp;
1611         }
1612         cur_bps[variant_idx_lowbits] = cur_bp;
1613         last_bp = cur_bp;
1614         const uint32_t ref_slen = token_slens[2];
1615         uint32_t id_slen;
1616         if ((!varid_templatep) || (missing_varid_match_slen && ((token_slens[1] != missing_varid_match_slen) || (!memequal(token_ptrs[1], missing_varid_match, missing_varid_match_slen))))) {
1617           id_slen = token_slens[1];
1618           if (PtrWSubCk(tmp_alloc_base, id_slen + 1, &tmp_alloc_end)) {
1619             goto LoadPvar_ret_NOMEM;
1620           }
1621           memcpyx(tmp_alloc_end, token_ptrs[1], id_slen, '\0');
1622         } else {
1623           VaridTemplate* cur_varid_templatep = varid_templatep;
1624           if (extra_alt_ct && (varid_multi_templatep || varid_multi_nonsnp_templatep)) {
1625             if (varid_multi_templatep) {
1626               cur_varid_templatep = varid_multi_templatep;
1627             }
1628             if (varid_multi_nonsnp_templatep) {
1629               if ((ref_slen > 1) || (remaining_alt_char_ct != 2 * extra_alt_ct + 1)) {
1630                 cur_varid_templatep = varid_multi_nonsnp_templatep;
1631               }
1632             }
1633           }
1634           if (unlikely(VaridTemplateApply(tmp_alloc_base, cur_varid_templatep, token_ptrs[2], linebuf_iter, cur_bp, token_slens[2], extra_alt_ct, remaining_alt_char_ct, &tmp_alloc_end, &new_variant_id_allele_len_overflow, &id_slen))) {
1635             goto LoadPvar_ret_NOMEM;
1636           }
1637         }
1638         if (id_slen > max_variant_id_slen) {
1639           max_variant_id_slen = id_slen;
1640         }
1641         cur_ids[variant_idx_lowbits] = R_CAST(char*, tmp_alloc_end);
1642 
1643         // REF
1644         const char* ref_allele = token_ptrs[2];
1645         if (ref_slen == 1) {
1646           char geno_char = ref_allele[0];
1647           if (geno_char == input_missing_geno_char) {
1648             geno_char = '.';
1649           }
1650           *allele_storage_iter = &(g_one_char_strs[2 * ctou32(geno_char)]);
1651         } else {
1652           if (StoreStringAtEndK(tmp_alloc_base, ref_allele, ref_slen, &tmp_alloc_end, allele_storage_iter)) {
1653             goto LoadPvar_ret_NOMEM;
1654           }
1655           if (ref_slen > max_allele_slen) {
1656             max_allele_slen = ref_slen;
1657           }
1658         }
1659         ++allele_storage_iter;
1660 
1661         // ALT
1662         if (extra_alt_ct) {
1663           if (PtrCheck(allele_storage_limit, allele_storage_iter, extra_alt_ct * sizeof(intptr_t))) {
1664             goto LoadPvar_ret_NOMEM;
1665           }
1666           char* alt_token_end = &(linebuf_iter[remaining_alt_char_ct]);
1667           for (uint32_t alt_idx = 0; alt_idx != extra_alt_ct; ++alt_idx) {
1668             char* cur_alt_end = AdvToDelim(linebuf_iter, ',');
1669             const uint32_t cur_allele_slen = cur_alt_end - linebuf_iter;
1670             if (cur_allele_slen == 1) {
1671               char geno_char = linebuf_iter[0];
1672               if (geno_char == input_missing_geno_char) {
1673                 geno_char = '.';
1674               }
1675               *allele_storage_iter = &(g_one_char_strs[2 * ctou32(geno_char)]);
1676             } else {
1677               if (unlikely(!cur_allele_slen)) {
1678                 goto LoadPvar_ret_EMPTY_ALLELE_CODE;
1679               }
1680               if (StoreStringAtEndK(tmp_alloc_base, linebuf_iter, cur_allele_slen, &tmp_alloc_end, allele_storage_iter)) {
1681                 goto LoadPvar_ret_NOMEM;
1682               }
1683               if (cur_allele_slen > max_allele_slen) {
1684                 max_allele_slen = cur_allele_slen;
1685               }
1686             }
1687             ++allele_storage_iter;
1688             linebuf_iter = &(cur_alt_end[1]);
1689           }
1690           remaining_alt_char_ct = alt_token_end - linebuf_iter;
1691           if (unlikely(!remaining_alt_char_ct)) {
1692             goto LoadPvar_ret_EMPTY_ALLELE_CODE;
1693           }
1694         }
1695         if (remaining_alt_char_ct == 1) {
1696           char geno_char = linebuf_iter[0];
1697           if (geno_char == input_missing_geno_char) {
1698             geno_char = '.';
1699           }
1700           *allele_storage_iter = &(g_one_char_strs[2 * ctou32(geno_char)]);
1701         } else {
1702           if (StoreStringAtEndK(tmp_alloc_base, linebuf_iter, remaining_alt_char_ct, &tmp_alloc_end, allele_storage_iter)) {
1703             goto LoadPvar_ret_NOMEM;
1704           }
1705           if (remaining_alt_char_ct > max_allele_slen) {
1706             max_allele_slen = remaining_alt_char_ct;
1707           }
1708         }
1709         ++allele_storage_iter;
1710 
1711         // CM
1712         if (cm_col_present) {
1713           const char* cm_token = token_ptrs[7];
1714           if ((cm_token[0] != '0') || (cm_token[1] > ' ')) {
1715             double cur_cm;
1716             if (unlikely(!ScantokDouble(cm_token, &cur_cm))) {
1717               snprintf(g_logbuf, kLogbufSize, "Error: Invalid centimorgan position on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
1718               goto LoadPvar_ret_MALFORMED_INPUT_WW;
1719             }
1720             if (cur_cm < last_cm) {
1721               vpos_sortstatus |= kfUnsortedVarCm;
1722             } else {
1723               last_cm = cur_cm;
1724             }
1725             if (cur_cm != 0.0) {
1726               if (!at_least_one_nzero_cm) {
1727                 if (unlikely(S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) < kLoadPvarBlockSize * sizeof(double))) {
1728                   goto LoadPvar_ret_NOMEM;
1729                 }
1730                 if (cur_chr_idxs) {
1731                   // reposition cur_chr_idxs[] after cur_cms[]
1732                   cur_cms = R_CAST(double*, cur_chr_idxs);
1733                   cur_chr_idxs = R_CAST(ChrIdx*, &(cur_cms[kLoadPvarBlockSize]));
1734                   memcpy(cur_chr_idxs, cur_cms, kLoadPvarBlockSize * sizeof(ChrIdx));
1735                   tmp_alloc_base = R_CAST(unsigned char*, &(cur_chr_idxs[kLoadPvarBlockSize]));
1736                 } else {
1737                   cur_cms = R_CAST(double*, tmp_alloc_base);
1738                   tmp_alloc_base = R_CAST(unsigned char*, &(cur_cms[kLoadPvarBlockSize]));
1739                 }
1740                 ZeroDArr(kLoadPvarBlockSize, cur_cms);
1741                 cms_start_block = raw_variant_ct / kLoadPvarBlockSize;
1742                 at_least_one_nzero_cm = 1;
1743               }
1744               cur_cms[variant_idx_lowbits] = cur_cm;
1745             }
1746           }
1747         }
1748       } else {
1749         {
1750           // linebuf_iter guaranteed to be at '\t' after chromosome code
1751           char* alt_col_start = NextTokenMult(linebuf_iter, alt_col_idx);
1752           if (unlikely(!alt_col_start)) {
1753             goto LoadPvar_ret_MISSING_TOKENS;
1754           }
1755           char* alt_col_end = CurTokenEnd(alt_col_start);
1756           extra_alt_ct = CountByte(alt_col_start, ',', alt_col_end - alt_col_start);
1757           line_iter = AdvToDelim(alt_col_end, '\n');
1758         }
1759       LoadPvar_skip_variant:
1760         ++exclude_ct;
1761         ClearBit(variant_idx_lowbits, cur_include);
1762         cur_bps[variant_idx_lowbits] = last_bp;
1763         // need to advance allele_storage_iter for later allele_idx_offsets
1764         // lookups to work properly
1765         *allele_storage_iter++ = missing_allele_str;
1766         *allele_storage_iter++ = missing_allele_str;
1767         if (extra_alt_ct) {
1768           if (PtrCheck(allele_storage_limit, allele_storage_iter, extra_alt_ct * sizeof(intptr_t))) {
1769             goto LoadPvar_ret_NOMEM;
1770           }
1771           for (uint32_t uii = 0; uii != extra_alt_ct; ++uii) {
1772             *allele_storage_iter++ = missing_allele_str;
1773           }
1774         }
1775       }
1776       ++raw_variant_ct;
1777     }
1778     if (unlikely(TextStreamErrcode2(&pvar_txs, &reterr))) {
1779       goto LoadPvar_ret_TSTREAM_FAIL;
1780     }
1781     reterr = kPglRetSuccess;
1782     if (unlikely(max_variant_id_slen > kMaxIdSlen)) {
1783       logerrputs("Error: Variant names are limited to " MAX_ID_SLEN_STR " characters.\n");
1784       goto LoadPvar_ret_MALFORMED_INPUT;
1785     }
1786     if (new_variant_id_allele_len_overflow) {
1787       if (new_variant_id_overflow_missing) {
1788         logerrprintfww("Warning: %" PRIuPTR " variant ID%s %s due to allele code length.\n", new_variant_id_allele_len_overflow, (new_variant_id_allele_len_overflow == 1)? "" : "s", missing_varid_match_slen? "unchanged by --set-missing-var-ids" : "erased by --set-all-var-ids");
1789         if (max_variant_id_slen < missing_varid_blen - 1) {
1790           max_variant_id_slen = missing_varid_blen - 1;
1791         }
1792       } else if (likely(misc_flags & kfMiscNewVarIdOverflowTruncate)) {
1793         // this is less likely in practice than the error branch, but let's
1794         // keep our usage of likely()/unlikely() as easy to interpret as
1795         // possible.
1796         logerrprintf("Warning: %" PRIuPTR " allele code%s truncated by --set-%s-var-ids.\n", new_variant_id_allele_len_overflow, (new_variant_id_allele_len_overflow == 1)? "" : "s", missing_varid_match_slen? "missing" : "all");
1797       } else {
1798         uint32_t worst_case_id_slen = VaridWorstCaseSlen(varid_templatep, max_chr_slen, max_allele_slen);
1799         if ((worst_case_id_slen <= kMaxIdSlen) && varid_multi_templatep) {
1800           const uint32_t tmp_slen = VaridWorstCaseSlen(varid_multi_templatep, max_chr_slen, max_allele_slen);
1801           if (tmp_slen > worst_case_id_slen) {
1802             worst_case_id_slen = tmp_slen;
1803           }
1804         }
1805         if ((worst_case_id_slen <= kMaxIdSlen) && varid_multi_nonsnp_templatep) {
1806           const uint32_t tmp_slen = VaridWorstCaseSlen(varid_multi_nonsnp_templatep, max_chr_slen, max_allele_slen);
1807           if (tmp_slen > worst_case_id_slen) {
1808             worst_case_id_slen = tmp_slen;
1809           }
1810         }
1811         logerrprintf("Error: %" PRIuPTR " allele code%s too long for --set-%s-var-ids.\n", new_variant_id_allele_len_overflow, (new_variant_id_allele_len_overflow == 1)? "" : "s", missing_varid_match_slen? "missing" : "all");
1812         if (worst_case_id_slen <= kMaxIdSlen) {
1813           logerrprintfww("The longest observed allele code in this dataset has length %u. If you're fine with the corresponding ID length, rerun with \"--new-id-max-allele-len %u\" added to your command line.\n", max_allele_slen, max_allele_slen);
1814           logerrputs("Otherwise, use \"--new-id-max-allele-len <limit> missing\" to set the IDs of all\nvariants with an allele code longer than the given length-limit to '.' (and\nthen process those variants with another script, if necessary).\n");
1815         } else {
1816           logerrprintfww("The longest observed allele code in this dataset has length %u. We recommend deciding on a length-limit, and then adding \"--new-id-max-allele-len <limit> missing\" to your command line to cause variants with longer allele codes to be assigned '.' IDs. (You can then process just those variants with another script, if necessary.)\n", max_allele_slen);
1817         }
1818         goto LoadPvar_ret_INCONSISTENT_INPUT;
1819       }
1820     }
1821     *max_variant_id_slen_ptr = max_variant_id_slen;
1822     *max_allele_ct_ptr = max_extra_alt_ct + 2;
1823     *max_allele_slen_ptr = max_allele_slen;
1824     *max_filter_slen_ptr = max_filter_slen;
1825     *raw_variant_ct_ptr = raw_variant_ct;
1826     uintptr_t allele_idx_end = allele_storage_iter - allele_storage;
1827     BigstackFinalizeCp(allele_storage, allele_idx_end);
1828     // We may clobber this object soon, so close it now (verifying
1829     // rewindability first, if necessary).
1830     if (unlikely(CleanupTextStream2(pvarname, &pvar_txs, &reterr))) {
1831       goto LoadPvar_ret_1;
1832     }
1833     uintptr_t* allele_idx_offsets = nullptr;
1834     const uint32_t full_block_ct = raw_variant_ct / kLoadPvarBlockSize;
1835     const uintptr_t raw_variant_ct_lowbits = raw_variant_ct % kLoadPvarBlockSize;
1836     // todo: determine whether we want variant_include to be guaranteed to be
1837     // terminated by a zero bit
1838     const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
1839     if (unlikely(
1840             bigstack_alloc_w(raw_variant_ctl, variant_include_ptr) ||
1841             bigstack_alloc_u32(raw_variant_ct, variant_bps_ptr) ||
1842             bigstack_alloc_cp(raw_variant_ct, variant_ids_ptr))) {
1843       goto LoadPvar_ret_NOMEM;
1844     }
1845     uintptr_t* qual_present = nullptr;
1846     float* quals = nullptr;
1847     if (load_qual_col > 1) {
1848       if (unlikely(
1849               bigstack_alloc_w(raw_variant_ctl, qual_present_ptr) ||
1850               bigstack_alloc_f(raw_variant_ct, quals_ptr))) {
1851         goto LoadPvar_ret_NOMEM;
1852       }
1853       qual_present = *qual_present_ptr;
1854       quals = *quals_ptr;
1855     }
1856     uintptr_t* filter_present = nullptr;
1857     uintptr_t* filter_npass = nullptr;
1858     char** filter_storage = nullptr;
1859     if (load_filter_col > 1) {
1860       if (unlikely(
1861               bigstack_alloc_w(raw_variant_ctl, filter_present_ptr) ||
1862               bigstack_alloc_w(raw_variant_ctl, filter_npass_ptr))) {
1863         goto LoadPvar_ret_NOMEM;
1864       }
1865       filter_present = *filter_present_ptr;
1866       filter_npass = *filter_npass_ptr;
1867       if (at_least_one_npass_filter) {
1868         // possible todo: store this in a sparse manner
1869         if (unlikely(bigstack_alloc_cp(raw_variant_ct, filter_storage_ptr))) {
1870           goto LoadPvar_ret_NOMEM;
1871         }
1872         filter_storage = *filter_storage_ptr;
1873       }
1874     }
1875     uintptr_t* nonref_flags = nullptr;
1876     if (info_pr_present) {
1877       if (unlikely(bigstack_alloc_w(raw_variant_ctl, nonref_flags_ptr))) {
1878         goto LoadPvar_ret_NOMEM;
1879       }
1880       nonref_flags = *nonref_flags_ptr;
1881     }
1882     // load_qual_col > 1:
1883     //   kLoadPvarBlockSize / CHAR_BIT for qual_present
1884     //   kLoadPvarBlockSize * sizeof(float) for quals
1885     // load_filter_col > 1:
1886     //   2 * (kLoadPvarBlockSize / CHAR_BIT) for filter_present, filter_npass
1887     //   kLoadPvarBlockSize * sizeof(intptr_t) for filter_storage
1888     // at_least_one_nzero_cm:
1889     //   kLoadPvarBlockSize * sizeof(double)
1890     // is_split_chr:
1891     //   kLoadPvarBlockSize * sizeof(ChrIdx)
1892     unsigned char* read_iter = g_bigstack_end;
1893     uint32_t* variant_bps = *variant_bps_ptr;
1894     char** variant_ids = *variant_ids_ptr;
1895     uintptr_t* variant_include = *variant_include_ptr;
1896     for (uint32_t block_idx = 0; block_idx != full_block_ct; ++block_idx) {
1897       memcpy(&(variant_bps[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(int32_t));
1898       // skip over allele_idx_offsets
1899       read_iter = &(read_iter[kLoadPvarBlockSize * (sizeof(int32_t) + sizeof(intptr_t))]);
1900       memcpy(&(variant_ids[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(intptr_t));
1901       read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
1902       memcpy(&(variant_include[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1903       read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1904       if (qual_present) {
1905         memcpy(&(qual_present[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1906         read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1907         memcpy(&(quals[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(float));
1908         read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(float)]);
1909       }
1910       if (filter_present) {
1911         memcpy(&(filter_present[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1912         read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1913         memcpy(&(filter_npass[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1914         read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1915         if (filter_storage) {
1916           memcpy(&(filter_storage[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(intptr_t));
1917         }
1918         read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
1919       }
1920       if (info_pr_present) {
1921         memcpy(&(nonref_flags[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1922         read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1923       }
1924       // skip over cms
1925       if (block_idx >= cms_start_block) {
1926         read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(double)]);
1927       }
1928       // skip over chr_idxs
1929       if (block_idx >= chr_idxs_start_block) {
1930         read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(ChrIdx)]);
1931       }
1932     }
1933     memcpy(&(variant_bps[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(int32_t));
1934     read_iter = &(read_iter[kLoadPvarBlockSize * (sizeof(int32_t) + sizeof(intptr_t))]);
1935     memcpy(&(variant_ids[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(intptr_t));
1936     read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
1937     const uint32_t last_bitblock_size = DivUp(raw_variant_ct_lowbits, CHAR_BIT);
1938     memcpy(&(variant_include[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1939     ZeroTrailingBits(raw_variant_ct, variant_include);
1940     read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1941     if (qual_present) {
1942       memcpy(&(qual_present[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1943       ZeroTrailingBits(raw_variant_ct, qual_present);
1944       read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1945       memcpy(&(quals[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(float));
1946       read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(float)]);
1947     }
1948     if (filter_present) {
1949       memcpy(&(filter_present[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1950       ZeroTrailingBits(raw_variant_ct, filter_present);
1951       read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1952       memcpy(&(filter_npass[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1953       ZeroTrailingBits(raw_variant_ct, filter_npass);
1954       read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1955       if (filter_storage) {
1956         memcpy(&(filter_storage[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(intptr_t));
1957         read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
1958       }
1959     }
1960     if (info_pr_present) {
1961       memcpy(&(nonref_flags[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1962       ZeroTrailingBits(raw_variant_ct, nonref_flags);
1963       // read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1964     }
1965     const uintptr_t read_iter_stride_base = kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t)) + (kLoadPvarBlockSize / CHAR_BIT) + (load_qual_col > 1) * ((kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(float)) + (load_filter_col > 1) * (2 * (kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(intptr_t)) + info_pr_present * (kLoadPvarBlockSize / CHAR_BIT);
1966     if (allele_idx_end > 2 * S_CAST(uintptr_t, raw_variant_ct)) {
1967       if (unlikely(bigstack_alloc_w(raw_variant_ct + 1, allele_idx_offsets_ptr))) {
1968         goto LoadPvar_ret_NOMEM;
1969       }
1970       allele_idx_offsets = *allele_idx_offsets_ptr;
1971       uintptr_t* allele_idx_read_iter = R_CAST(uintptr_t*, &(g_bigstack_end[kLoadPvarBlockSize * sizeof(int32_t)]));
1972       for (uint32_t block_idx = 0; block_idx != full_block_ct; ++block_idx) {
1973         memcpy(&(allele_idx_offsets[block_idx * kLoadPvarBlockSize]), allele_idx_read_iter, kLoadPvarBlockSize * sizeof(intptr_t));
1974         allele_idx_read_iter = R_CAST(uintptr_t*, R_CAST(uintptr_t, allele_idx_read_iter) + read_iter_stride_base + (block_idx >= cms_start_block) * kLoadPvarBlockSize * sizeof(double) + (block_idx >= chr_idxs_start_block) * kLoadPvarBlockSize * sizeof(ChrIdx));
1975       }
1976       memcpy(&(allele_idx_offsets[full_block_ct * kLoadPvarBlockSize]), allele_idx_read_iter, raw_variant_ct_lowbits * sizeof(intptr_t));
1977       allele_idx_offsets[raw_variant_ct] = allele_idx_end;
1978     }
1979     if (at_least_one_nzero_cm) {
1980       if (unlikely(bigstack_alloc_d(raw_variant_ct, variant_cms_ptr))) {
1981         goto LoadPvar_ret_NOMEM;
1982       }
1983       double* variant_cms = *variant_cms_ptr;
1984       ZeroDArr(cms_start_block * kLoadPvarBlockSize, variant_cms);
1985       double* cms_read_iter = R_CAST(double*, &(g_bigstack_end[read_iter_stride_base * (cms_start_block + 1)]));
1986       if (cms_start_block > chr_idxs_start_block) {
1987         cms_read_iter = R_CAST(double*, R_CAST(uintptr_t, cms_read_iter) + kLoadPvarBlockSize * sizeof(ChrIdx) * (cms_start_block - chr_idxs_start_block));
1988       }
1989       for (uint32_t block_idx = cms_start_block; block_idx != full_block_ct; ++block_idx) {
1990         memcpy(&(variant_cms[block_idx * kLoadPvarBlockSize]), cms_read_iter, kLoadPvarBlockSize * sizeof(double));
1991         cms_read_iter = R_CAST(double*, R_CAST(uintptr_t, cms_read_iter) + read_iter_stride_base + kLoadPvarBlockSize * sizeof(double) + (block_idx >= chr_idxs_start_block) * kLoadPvarBlockSize * sizeof(ChrIdx));
1992       }
1993       memcpy(&(variant_cms[full_block_ct * kLoadPvarBlockSize]), cms_read_iter, raw_variant_ct_lowbits * sizeof(double));
1994     } else {
1995       *variant_cms_ptr = nullptr;
1996     }
1997     if (!is_split_chr) {
1998       cip->chr_fo_vidx_start[chrs_encountered_m1 + 1] = raw_variant_ct;
1999       if (splitpar_bound2) {
2000         if (splitpar_and_exclude_x) {
2001           ClearBit(x_code, chr_mask);
2002         }
2003         reterr = SplitPar(variant_bps, *vpos_sortstatus_ptr, splitpar_bound1, splitpar_bound2, variant_include, loaded_chr_mask, cip, &chrs_encountered_m1, &exclude_ct);
2004         if (unlikely(reterr)) {
2005           goto LoadPvar_ret_1;
2006         }
2007       }
2008       cip->chr_ct = chrs_encountered_m1 + 1;
2009     } else {
2010       ChrIdx* chr_idxs;
2011       if (unlikely(BIGSTACK_ALLOC_X(ChrIdx, raw_variant_ct, &chr_idxs))) {
2012         goto LoadPvar_ret_NOMEM;
2013       }
2014       *chr_idxs_ptr = chr_idxs;
2015       if (chr_idxs_start_block) {
2016         const uint32_t end_vidx = chr_idxs_start_block * kLoadPvarBlockSize;
2017         uint32_t chr_fo_idx = chrs_encountered_m1;
2018         while (cip->chr_fo_vidx_start[chr_fo_idx] >= end_vidx) {
2019           --chr_fo_idx;
2020         }
2021         BackfillChrIdxs(cip, chr_fo_idx, 0, end_vidx, chr_idxs);
2022       }
2023       ChrIdx* chr_idxs_read_iter = R_CAST(ChrIdx*, &(g_bigstack_end[read_iter_stride_base * (chr_idxs_start_block + 1)]));
2024       if (chr_idxs_start_block >= cms_start_block) {
2025         chr_idxs_read_iter = R_CAST(ChrIdx*, R_CAST(uintptr_t, chr_idxs_read_iter) + kLoadPvarBlockSize * sizeof(double) * (chr_idxs_start_block + 1 - cms_start_block));
2026       }
2027       for (uint32_t block_idx = chr_idxs_start_block; block_idx != full_block_ct; ) {
2028         memcpy(&(chr_idxs[block_idx * kLoadPvarBlockSize]), chr_idxs_read_iter, kLoadPvarBlockSize * sizeof(ChrIdx));
2029         ++block_idx;
2030         chr_idxs_read_iter = R_CAST(ChrIdx*, R_CAST(uintptr_t, chr_idxs_read_iter) + read_iter_stride_base + kLoadPvarBlockSize * sizeof(ChrIdx) + (block_idx >= cms_start_block) * kLoadPvarBlockSize * sizeof(double));
2031       }
2032       memcpy(&(chr_idxs[full_block_ct * kLoadPvarBlockSize]), chr_idxs_read_iter, raw_variant_ct_lowbits * sizeof(ChrIdx));
2033       cip->chr_ct = PopcountWords(loaded_chr_mask, DivUp(cip->max_code + cip->name_ct + 1, kBitsPerWord));
2034     }
2035     if (merge_par) {
2036       const uint32_t is_plink2_par = (misc_flags / kfMiscMergePar) & 1;
2037       if (merge_par_ct) {
2038         logprintf("--merge-%s: %u chromosome code%s changed.\n", is_plink2_par? "par" : "x", merge_par_ct, (merge_par_ct == 1)? "" : "s");
2039       } else if (is_plink2_par) {
2040         logerrputs("Warning: --merge-par had no effect (no PAR1/PAR2 chromosome codes present).\n");
2041       } else {
2042         logerrputs("Warning: --merge-x had no effect (no XY chromosome codes present).\n");
2043       }
2044     }
2045     const uint32_t last_chr_code = cip->max_code + cip->name_ct;
2046     const uint32_t chr_word_ct = BitCtToWordCt(last_chr_code + 1);
2047     BitvecAnd(loaded_chr_mask, chr_word_ct, chr_mask);
2048     BigstackEndSet(tmp_alloc_end);
2049     *variant_ct_ptr = raw_variant_ct - exclude_ct;
2050     *vpos_sortstatus_ptr = vpos_sortstatus;
2051     *allele_storage_ptr = allele_storage;
2052     // if only INFO:PR flag present, no need to reload
2053     if (!(info_nonpr_present || info_pr_nonflag_present)) {
2054       info_reload_slen = 0;
2055     }
2056     if (info_reload_slen) {
2057       // treat open-fail as rewind-fail here
2058       if (unlikely(ForceNonFifo(pvarname))) {
2059         logerrprintfww(kErrprintfRewind, pvarname);
2060         reterr = kPglRetRewindFail;
2061         goto LoadPvar_ret_1;
2062       }
2063     }
2064     *info_reload_slen_ptr = info_reload_slen;
2065   }
2066 
2067   while (0) {
2068   LoadPvar_ret_NOMEM:
2069     reterr = kPglRetNomem;
2070     break;
2071   LoadPvar_ret_TSTREAM_FAIL:
2072     TextStreamErrPrint(pvarname, &pvar_txs);
2073     break;
2074   LoadPvar_ret_EMPTY_ALLELE_CODE:
2075     snprintf(g_logbuf, kLogbufSize, "Error: Empty allele code on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
2076   LoadPvar_ret_MALFORMED_INPUT_WW:
2077     WordWrapB(0);
2078     logerrputsb();
2079   LoadPvar_ret_MALFORMED_INPUT:
2080     reterr = kPglRetMalformedInput;
2081     break;
2082   LoadPvar_ret_INCONSISTENT_INPUT:
2083     reterr = kPglRetInconsistentInput;
2084     break;
2085   LoadPvar_ret_MISSING_TOKENS:
2086     logerrprintfww("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, pvarname);
2087     reterr = kPglRetMalformedInput;
2088     break;
2089   }
2090  LoadPvar_ret_1:
2091   CleanupTextStream2(pvarname, &pvar_txs, &reterr);
2092   if (reterr) {
2093     BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
2094   }
2095   return reterr;
2096 }
2097 
2098 #ifdef __cplusplus
2099 }  // namespace plink2
2100 #endif
2101