1 // This library is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This library is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU Lesser General Public License as published by the
6 // Free Software Foundation; either version 3 of the License, or (at your
7 // option) any later version.
8 //
9 // This library is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
12 // for more details.
13 //
14 // You should have received a copy of the GNU Lesser General Public License
15 // along with this library. If not, see <http://www.gnu.org/licenses/>.
16
17 #include "plink2_pvar.h"
18
19 #ifdef __cplusplus
20 namespace plink2 {
21 #endif
22
23 // this used to employ a backward-growing linked list, with variable-size
24 // elements, but demultiplexing was relatively expensive. now we allocate
25 // size-64k pos[], allele_idxs[], ids[], cms[], etc. blocks, and just memcpy
26 // those chunks at the end. (cms[] is lazy-initialized.)
27 //
28 // probable todo: try making this more parallel. could e.g. repeatedly peek at
29 // end of ReadLineStream buffer, count # of '\n's, and divvy them up between
30 // worker threads.
31 CONSTI32(kLoadPvarBlockSize, 65536);
32 static_assert(!(kLoadPvarBlockSize & (kLoadPvarBlockSize - 1)), "kLoadPvarBlockSize must be a power of 2.");
33 static_assert(kLoadPvarBlockSize >= (kMaxMediumLine / 8), "kLoadPvarBlockSize cannot be smaller than kMaxMediumLine / 8.");
34
35
36 static_assert(!kChrOffsetX, "ReadChrsetHeaderLine() assumes kChrOffsetX == 0.");
37 static_assert(kChrOffsetY == 1, "ReadChrsetHeaderLine() assumes kChrOffsetY == 1.");
38 static_assert(kChrOffsetPAR1 == 4, "ReadChrsetHeaderLine() assumes kChrOffsetPAR1 == 4.");
ReadChrsetHeaderLine(const char * chrset_iter,const char * file_descrip,MiscFlags misc_flags,uintptr_t line_idx,ChrInfo * cip)39 PglErr ReadChrsetHeaderLine(const char* chrset_iter, const char* file_descrip, MiscFlags misc_flags, uintptr_t line_idx, ChrInfo* cip) {
40 // chrset_iter is expected to point to first character after
41 // "##chrSet=<".
42 PglErr reterr = kPglRetSuccess;
43 {
44 uint32_t cmdline_autosome_ct = 0;
45 uint32_t cmdline_haploid = 0;
46 STD_ARRAY_DECL(uint32_t, kChrOffsetCt, cmdline_xymt_codes);
47 if (cip->chrset_source == kChrsetSourceCmdline) {
48 if (misc_flags & kfMiscChrOverrideCmdline) {
49 goto ReadChrsetHeaderLine_ret_1;
50 }
51 if (!(misc_flags & kfMiscChrOverrideFile)) {
52 // save off info we need for consistency check
53 cmdline_autosome_ct = cip->autosome_ct;
54 cmdline_haploid = cip->haploid_mask[0] & 1;
55 STD_ARRAY_COPY(cip->xymt_codes, kChrOffsetCt, cmdline_xymt_codes);
56 }
57 ZeroWArr(kChrMaskWords, cip->haploid_mask);
58 }
59 for (uint32_t uii = 0; uii != kChrOffsetCt; ++uii) {
60 cip->xymt_codes[uii] = UINT32_MAXM1;
61 }
62 if (StrStartsWithUnsafe(chrset_iter, "haploidAutosomeCt=")) {
63 uint32_t explicit_haploid_ct;
64 if (unlikely(ScanPosintCapped(&(chrset_iter[strlen("haploidAutosomeCt=")]), kMaxChrTextnum, &explicit_haploid_ct))) {
65 snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s has an invalid ##chrSet haploid count (max %u).\n", line_idx, file_descrip, kMaxChrTextnum);
66 goto ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW;
67 }
68 // could verify that X, Y, etc. are not present?
69 if (cmdline_autosome_ct) {
70 if (unlikely(!cmdline_haploid)) {
71 snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies a haploid genome, while a diploid genome was specified on the command line.\n", line_idx, file_descrip);
72 goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
73 }
74 if (unlikely(explicit_haploid_ct != cmdline_autosome_ct)) {
75 snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies %u autosome%s, while the command line specified %u.\n", line_idx, file_descrip, explicit_haploid_ct, (explicit_haploid_ct == 1)? "" : "s", cmdline_autosome_ct);
76 goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
77 }
78 }
79 cip->autosome_ct = explicit_haploid_ct;
80 SetAllBits(explicit_haploid_ct + 1, cip->haploid_mask);
81 } else {
82 if (unlikely(!StrStartsWithUnsafe(chrset_iter, "autosomePairCt="))) {
83 snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s does not have expected ##chrSet format.\n", line_idx, file_descrip);
84 goto ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW;
85 }
86 chrset_iter = &(chrset_iter[strlen("autosomePairCt=")]);
87 uint32_t explicit_autosome_ct;
88 if (unlikely(ScanmovPosintCapped(kMaxChrTextnum, &chrset_iter, &explicit_autosome_ct))) {
89 snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s has an invalid ##chrSet autosome count (max %u).\n", line_idx, file_descrip, kMaxChrTextnum);
90 goto ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW;
91 }
92 cip->autosome_ct = explicit_autosome_ct;
93 if (*chrset_iter != '>') {
94 if (unlikely(*chrset_iter != ',')) {
95 snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s does not have expected ##chrSet format.\n", line_idx, file_descrip);
96 goto ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW;
97 }
98 // this can theoretically be confused by e.g. a Description="..." field
99 // containing commas not followed by spaces
100 do {
101 ++chrset_iter;
102 // uppercase
103 uint32_t first_char_ui = ctou32(*chrset_iter) & 0xdf;
104
105 uint32_t second_char_ui = ctou32(chrset_iter[1]);
106 // 44 is ',', 62 is '>'
107 if ((second_char_ui == 44) || (second_char_ui == 62)) {
108 if (first_char_ui == 77) {
109 // M
110 cip->xymt_codes[kChrOffsetMT] = explicit_autosome_ct + 1 + kChrOffsetMT;
111 } else {
112 first_char_ui -= 88; // X = 0, Y = 1, everything else larger
113 if (first_char_ui < 2) {
114 cip->xymt_codes[first_char_ui] = explicit_autosome_ct + 1 + first_char_ui;
115 }
116 }
117 } else {
118 second_char_ui &= 0xdf;
119 const uint32_t third_char_ui = ctou32(chrset_iter[2]);
120 if ((third_char_ui == 44) || (third_char_ui == 62)) {
121 if ((first_char_ui == 88) && (second_char_ui == 89)) {
122 // XY
123 cip->xymt_codes[kChrOffsetXY] = explicit_autosome_ct + 1 + kChrOffsetXY;
124 } else if ((first_char_ui == 77) && (second_char_ui == 84)) {
125 // MT
126 cip->xymt_codes[kChrOffsetMT] = explicit_autosome_ct + 1 + kChrOffsetMT;
127 }
128 } else if ((first_char_ui == 80) && (second_char_ui == 65) && ((third_char_ui & 0xdf) == 82)) {
129 // PAR1, PAR2
130 const uint32_t par_idx_m1 = ctou32(chrset_iter[3]) - '1';
131 if ((par_idx_m1 < 2) && ((chrset_iter[4] == ',') || (chrset_iter[4] == '>'))) {
132 cip->xymt_codes[kChrOffsetPAR1] = explicit_autosome_ct + 1 + kChrOffsetPAR1 + par_idx_m1;
133 }
134 }
135 }
136 } while (!strchrnul_n_mov(',', &chrset_iter));
137 }
138 if (cmdline_autosome_ct) {
139 if (unlikely(cmdline_haploid)) {
140 snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies a diploid genome, while a haploid genome was specified on the command line.\n", line_idx, file_descrip);
141 goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
142 }
143 if (unlikely(explicit_autosome_ct != cmdline_autosome_ct)) {
144 snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies %u autosome%s, while the command line specified %u.\n", line_idx, file_descrip, explicit_autosome_ct, (explicit_autosome_ct == 1)? "" : "s", cmdline_autosome_ct);
145 goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
146 }
147 for (uint32_t xymt_idx = 0; xymt_idx != kChrOffsetPAR1; ++xymt_idx) {
148 // it's okay if the command line doesn't explicitly exclude e.g. chrX
149 // while for whatever reason it is excluded from ##chrSet; but the
150 // reverse can create problems
151 if (unlikely(IsI32Neg(cmdline_xymt_codes[xymt_idx]) && (!IsI32Neg(cip->xymt_codes[xymt_idx])))) {
152 snprintf(g_logbuf, kLogbufSize, "Error: Header line %" PRIuPTR " of %s specifies a chromosome set including %s, while the command line excludes it.\n", line_idx, file_descrip, g_xymt_log_names[xymt_idx]);
153 goto ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW;
154 }
155 }
156 }
157 }
158 cip->chrset_source = kChrsetSourceFile;
159 }
160 while (0) {
161 ReadChrsetHeaderLine_ret_MALFORMED_INPUT_WW:
162 WordWrapB(0);
163 logerrputsb();
164 reterr = kPglRetMalformedInput;
165 break;
166 ReadChrsetHeaderLine_ret_INCONSISTENT_INPUT_WW:
167 WordWrapB(0);
168 logerrputsb();
169 reterr = kPglRetInconsistentInput;
170 break;
171 }
172 ReadChrsetHeaderLine_ret_1:
173 return reterr;
174 }
175
VaridTemplateInit(const char * varid_template_str,const char * missing_id_match,char * chr_output_name_buf,uint32_t new_id_max_allele_slen,uint32_t overflow_substitute_blen,VaridTemplate * vtp)176 void VaridTemplateInit(const char* varid_template_str, const char* missing_id_match, char* chr_output_name_buf, uint32_t new_id_max_allele_slen, uint32_t overflow_substitute_blen, VaridTemplate* vtp) {
177 // template string was previously validated
178 // varid_template is only input, everything else is output values
179 const char* varid_template_str_iter = varid_template_str;
180 uint32_t template_insert_ct = 0;
181 uint32_t template_base_len = 0;
182 uint32_t alleles_needed = 0; // bit 0 = ref, bit 1 = alt, bit 2 = ascii sort
183 vtp->chr_output_name_buf = chr_output_name_buf;
184 vtp->segs[0] = varid_template_str_iter;
185 vtp->chr_slen = 0;
186 for (unsigned char ucc = *varid_template_str_iter; ucc; ucc = *(++varid_template_str_iter)) {
187 if (ucc > '@') {
188 continue;
189 }
190 uint32_t seg_len;
191 uint32_t insert_type;
192 if (ucc == '@') {
193 seg_len = varid_template_str_iter - vtp->segs[template_insert_ct];
194 insert_type = 0;
195 } else if (ucc == '#') {
196 seg_len = varid_template_str_iter - vtp->segs[template_insert_ct];
197 insert_type = 1;
198 } else if (ucc != '$') {
199 continue;
200 } else {
201 seg_len = varid_template_str_iter - vtp->segs[template_insert_ct];
202 const uint32_t uii = ctou32(*(++varid_template_str_iter));
203 if (uii <= '2') {
204 alleles_needed += 2; // this happens twice
205 insert_type = uii - 48; // '1' -> type 2, '2' -> type 3
206 } else {
207 // 'r' -> type 2, 'a' -> type 3
208 insert_type = 1 + ((uii & 0xdf) == 'A');
209 }
210 alleles_needed += insert_type;
211 ++insert_type;
212 }
213 vtp->seg_lens[template_insert_ct] = seg_len;
214 template_base_len += seg_len;
215 vtp->insert_types[template_insert_ct++] = insert_type;
216 vtp->segs[template_insert_ct] = &(varid_template_str_iter[1]);
217 }
218 const uint32_t seg_len = varid_template_str_iter - vtp->segs[template_insert_ct];
219 vtp->seg_lens[template_insert_ct] = seg_len;
220 vtp->insert_ct = template_insert_ct;
221 vtp->base_len = template_base_len + seg_len;
222 vtp->alleles_needed = alleles_needed;
223 vtp->new_id_max_allele_slen = new_id_max_allele_slen;
224 vtp->overflow_substitute_blen = overflow_substitute_blen;
225 vtp->missing_id_match = missing_id_match;
226 }
227
228 // alt1_end currently allowed to be nullptr in biallelic case
VaridTemplateApply(unsigned char * tmp_alloc_base,const VaridTemplate * vtp,const char * ref_start,const char * alt1_start,uint32_t cur_bp,uint32_t ref_token_slen,uint32_t extra_alt_ct,uint32_t alt_token_slen,unsigned char ** tmp_alloc_endp,uintptr_t * new_id_allele_len_overflowp,uint32_t * id_slen_ptr)229 BoolErr VaridTemplateApply(unsigned char* tmp_alloc_base, const VaridTemplate* vtp, const char* ref_start, const char* alt1_start, uint32_t cur_bp, uint32_t ref_token_slen, uint32_t extra_alt_ct, uint32_t alt_token_slen, unsigned char** tmp_alloc_endp, uintptr_t* new_id_allele_len_overflowp, uint32_t* id_slen_ptr) {
230 uint32_t insert_slens[4];
231 const uint32_t alleles_needed = vtp->alleles_needed;
232 const uint32_t new_id_max_allele_slen = vtp->new_id_max_allele_slen;
233 uint32_t id_slen = UintSlen(cur_bp);
234 insert_slens[0] = vtp->chr_slen;
235 insert_slens[1] = id_slen;
236 id_slen += vtp->base_len;
237 uint32_t ref_slen = 0;
238 uint32_t cur_overflow = 0;
239 const char* tmp_allele_ptrs[2];
240 if (alleles_needed & 1) {
241 ref_slen = ref_token_slen;
242 if (ref_slen > new_id_max_allele_slen) {
243 ref_slen = new_id_max_allele_slen;
244 cur_overflow = 1;
245 }
246 insert_slens[2] = ref_slen;
247 id_slen += ref_slen;
248 tmp_allele_ptrs[0] = ref_start;
249 }
250 if (alleles_needed > 1) {
251 uint32_t alt1_slen;
252 if (!extra_alt_ct) {
253 alt1_slen = alt_token_slen;
254 } else {
255 alt1_slen = AdvToDelim(alt1_start, ',') - alt1_start;
256 }
257 if (alt1_slen > new_id_max_allele_slen) {
258 alt1_slen = new_id_max_allele_slen;
259 ++cur_overflow;
260 }
261 id_slen += alt1_slen;
262 if (alleles_needed <= 3) {
263 VaridTemplateApply_keep_allele_ascii_order:
264 insert_slens[3] = alt1_slen;
265 tmp_allele_ptrs[1] = alt1_start;
266 } else {
267 uint32_t smaller_slen = alt1_slen;
268 const int32_t ref_slen_geq = (ref_slen >= alt1_slen);
269 if (!ref_slen_geq) {
270 smaller_slen = ref_slen;
271 }
272 int32_t memcmp_result = Memcmp(ref_start, alt1_start, smaller_slen);
273 if (!memcmp_result) {
274 memcmp_result = ref_slen_geq;
275 }
276 if (memcmp_result <= 0) {
277 goto VaridTemplateApply_keep_allele_ascii_order;
278 }
279 insert_slens[3] = ref_slen;
280 tmp_allele_ptrs[1] = tmp_allele_ptrs[0];
281 insert_slens[2] = alt1_slen;
282 tmp_allele_ptrs[0] = alt1_start;
283 }
284 }
285 const uint32_t overflow_substitute_blen = vtp->overflow_substitute_blen;
286 if (overflow_substitute_blen && cur_overflow) {
287 // er, do we really want to make a separate allocation here? see if we can
288 // remove this. (that would impose more constraints on downstream
289 // variant-ID-changing functions, though.)
290 if (PtrWSubCk(tmp_alloc_base, overflow_substitute_blen, tmp_alloc_endp)) {
291 return 1;
292 }
293 memcpy(*tmp_alloc_endp, vtp->missing_id_match, overflow_substitute_blen);
294 id_slen = 0;
295 cur_overflow = 1;
296 } else {
297 if (PtrWSubCk(tmp_alloc_base, id_slen + 1, tmp_alloc_endp)) {
298 return 1;
299 }
300 char* id_iter = R_CAST(char*, *tmp_alloc_endp);
301 const uint32_t insert_ct = vtp->insert_ct;
302 char* insert_ptrs[4];
303
304 // maybe-uninitialized warnings
305 insert_ptrs[0] = nullptr;
306 insert_ptrs[1] = nullptr;
307 insert_ptrs[2] = nullptr;
308
309 for (uint32_t insert_idx = 0; insert_idx != insert_ct; ++insert_idx) {
310 id_iter = memcpya(id_iter, vtp->segs[insert_idx], vtp->seg_lens[insert_idx]);
311 const uint32_t cur_insert_type = vtp->insert_types[insert_idx];
312 insert_ptrs[cur_insert_type] = id_iter;
313 id_iter = &(id_iter[insert_slens[cur_insert_type]]);
314 }
315 memcpyx(id_iter, vtp->segs[insert_ct], vtp->seg_lens[insert_ct], '\0');
316
317 memcpy(insert_ptrs[0], vtp->chr_output_name_buf, insert_slens[0]);
318 u32toa(cur_bp, insert_ptrs[1]);
319 // insert_ct currently guaranteed to be >= 2 since @ and # required
320 for (uint32_t insert_type_idx = 2; insert_type_idx != insert_ct; ++insert_type_idx) {
321 memcpy(insert_ptrs[insert_type_idx], tmp_allele_ptrs[insert_type_idx - 2], insert_slens[insert_type_idx]);
322 }
323 }
324 *id_slen_ptr = id_slen;
325 *new_id_allele_len_overflowp += cur_overflow;
326 return 0;
327 }
328
329 // Exported variant of VaridTemplateApply() which appends to a buffer.
330 // Probable todo: pull out the common parts of the functions.
VaridTemplateWrite(const VaridTemplate * vtp,const char * ref_start,const char * alt1_start,uint32_t cur_bp,uint32_t ref_token_slen,uint32_t extra_alt_ct,uint32_t alt_token_slen,char * dst)331 char* VaridTemplateWrite(const VaridTemplate* vtp, const char* ref_start, const char* alt1_start, uint32_t cur_bp, uint32_t ref_token_slen, uint32_t extra_alt_ct, uint32_t alt_token_slen, char* dst) {
332 uint32_t insert_slens[4];
333 const uint32_t alleles_needed = vtp->alleles_needed;
334 const uint32_t new_id_max_allele_slen = vtp->new_id_max_allele_slen;
335 uint32_t id_slen = UintSlen(cur_bp);
336 insert_slens[0] = vtp->chr_slen;
337 insert_slens[1] = id_slen;
338 id_slen += vtp->base_len;
339 uint32_t ref_slen = 0;
340 uint32_t cur_overflow = 0;
341 const char* tmp_allele_ptrs[2];
342 if (alleles_needed & 1) {
343 ref_slen = ref_token_slen;
344 if (ref_slen > new_id_max_allele_slen) {
345 ref_slen = new_id_max_allele_slen;
346 cur_overflow = 1;
347 }
348 insert_slens[2] = ref_slen;
349 id_slen += ref_slen;
350 tmp_allele_ptrs[0] = ref_start;
351 }
352 if (alleles_needed > 1) {
353 uint32_t alt1_slen;
354 if (!extra_alt_ct) {
355 alt1_slen = alt_token_slen;
356 } else {
357 alt1_slen = AdvToDelim(alt1_start, ',') - alt1_start;
358 }
359 if (alt1_slen > new_id_max_allele_slen) {
360 alt1_slen = new_id_max_allele_slen;
361 cur_overflow = 1;
362 }
363 id_slen += alt1_slen;
364 if (alleles_needed <= 3) {
365 VaridTemplateWrite_keep_allele_ascii_order:
366 insert_slens[3] = alt1_slen;
367 tmp_allele_ptrs[1] = alt1_start;
368 } else {
369 uint32_t smaller_slen = alt1_slen;
370 const int32_t ref_slen_geq = (ref_slen >= alt1_slen);
371 if (!ref_slen_geq) {
372 smaller_slen = ref_slen;
373 }
374 int32_t memcmp_result = Memcmp(ref_start, alt1_start, smaller_slen);
375 if (!memcmp_result) {
376 memcmp_result = ref_slen_geq;
377 }
378 if (memcmp_result <= 0) {
379 goto VaridTemplateWrite_keep_allele_ascii_order;
380 }
381 insert_slens[3] = ref_slen;
382 tmp_allele_ptrs[1] = tmp_allele_ptrs[0];
383 insert_slens[2] = alt1_slen;
384 tmp_allele_ptrs[0] = alt1_start;
385 }
386 }
387 const uint32_t overflow_substitute_blen = vtp->overflow_substitute_blen;
388 if (overflow_substitute_blen && cur_overflow) {
389 return memcpya(dst, vtp->missing_id_match, overflow_substitute_blen);
390 }
391 char* id_iter = dst;
392 const uint32_t insert_ct = vtp->insert_ct;
393 char* insert_ptrs[4];
394
395 // maybe-uninitialized warnings
396 insert_ptrs[0] = nullptr;
397 insert_ptrs[1] = nullptr;
398 insert_ptrs[2] = nullptr;
399
400 for (uint32_t insert_idx = 0; insert_idx != insert_ct; ++insert_idx) {
401 id_iter = memcpya(id_iter, vtp->segs[insert_idx], vtp->seg_lens[insert_idx]);
402 const uint32_t cur_insert_type = vtp->insert_types[insert_idx];
403 insert_ptrs[cur_insert_type] = id_iter;
404 id_iter = &(id_iter[insert_slens[cur_insert_type]]);
405 }
406 char* id_end = memcpya(id_iter, vtp->segs[insert_ct], vtp->seg_lens[insert_ct]);
407
408 memcpy(insert_ptrs[0], vtp->chr_output_name_buf, insert_slens[0]);
409 u32toa(cur_bp, insert_ptrs[1]);
410 // insert_ct currently guaranteed to be >= 2 since @ and # required
411 for (uint32_t insert_type_idx = 2; insert_type_idx != insert_ct; ++insert_type_idx) {
412 memcpy(insert_ptrs[insert_type_idx], tmp_allele_ptrs[insert_type_idx - 2], insert_slens[insert_type_idx]);
413 }
414 return id_end;
415 }
416
VaridWorstCaseSlen(const VaridTemplate * vtp,uint32_t max_chr_slen,uint32_t max_allele_slen)417 uint32_t VaridWorstCaseSlen(const VaridTemplate* vtp, uint32_t max_chr_slen, uint32_t max_allele_slen) {
418 // +10 for base-pair coordinate
419 return (max_allele_slen * vtp->alleles_needed + vtp->base_len + max_chr_slen + 10);
420 }
421
BackfillChrIdxs(const ChrInfo * cip,uint32_t chrs_encountered_m1,uint32_t offset,uint32_t end_vidx,ChrIdx * chr_idxs)422 void BackfillChrIdxs(const ChrInfo* cip, uint32_t chrs_encountered_m1, uint32_t offset, uint32_t end_vidx, ChrIdx* chr_idxs) {
423 for (uint32_t chr_fo_idx = chrs_encountered_m1; ; --chr_fo_idx) {
424 uint32_t start_vidx = cip->chr_fo_vidx_start[chr_fo_idx];
425 if (start_vidx < offset) {
426 start_vidx = offset;
427 }
428 ChrIdx* chr_idxs_write_base = &(chr_idxs[start_vidx - offset]);
429 const uint32_t vidx_ct = end_vidx - start_vidx;
430 const ChrIdx cur_chr_idx = cip->chr_file_order[chr_fo_idx];
431 for (uint32_t uii = 0; uii != vidx_ct; ++uii) {
432 chr_idxs_write_base[uii] = cur_chr_idx;
433 }
434 if (start_vidx == offset) {
435 return;
436 }
437 end_vidx = start_vidx;
438 }
439 }
440
PrInInfoToken(uint32_t info_slen,char * info_token)441 char* PrInInfoToken(uint32_t info_slen, char* info_token) {
442 if (memequal_k(info_token, "PR", 2) && ((info_slen == 2) || (info_token[2] == ';'))) {
443 return info_token;
444 }
445 if (memequal_k(&(info_token[S_CAST(int32_t, info_slen) - 3]), ";PR", 3)) {
446 return &(info_token[info_slen - 2]);
447 }
448 info_token[info_slen] = '\0';
449 char* first_info_end = strchr(info_token, ';');
450 if (!first_info_end) {
451 return nullptr;
452 }
453 char* pr_prestart = strstr(first_info_end, ";PR;");
454 // bugfix (27 Sep 2019): had this backward
455 return pr_prestart? (&(pr_prestart[1])) : nullptr;
456 }
457
458 typedef struct InfoExistStruct {
459 NONCOPYABLE(InfoExistStruct);
460 char* prekeys;
461 uint32_t key_ct;
462 uint32_t key_slens[];
463 } InfoExist;
464
InfoExistInit(const unsigned char * arena_end,const char * require_info_flattened,const char * flagname_p,unsigned char ** arena_base_ptr,InfoExist ** existpp)465 PglErr InfoExistInit(const unsigned char* arena_end, const char* require_info_flattened, const char* flagname_p, unsigned char** arena_base_ptr, InfoExist** existpp) {
466 const char* read_iter = require_info_flattened;
467 uintptr_t prekeys_blen = 0;
468 do {
469 const char* key_end_or_invalid = strchrnul2(read_iter, ';', '=');
470 if (unlikely(*key_end_or_invalid)) {
471 if (*key_end_or_invalid == ';') {
472 logerrprintfww("Error: Invalid --%s key '%s' (semicolon prohibited).\n", flagname_p, read_iter);
473 } else {
474 logerrprintfww("Error: Invalid --%s key '%s' ('=' prohibited).\n", flagname_p, read_iter);
475 }
476 return kPglRetInvalidCmdline;
477 }
478 ++prekeys_blen;
479 read_iter = &(key_end_or_invalid[1]);
480 } while (*read_iter);
481 prekeys_blen += read_iter - require_info_flattened;
482 const uint32_t key_ct = prekeys_blen - S_CAST(uintptr_t, read_iter - require_info_flattened);
483 const uintptr_t bytes_used = offsetof(InfoExist, key_slens) + key_ct * sizeof(int32_t) + prekeys_blen;
484 const uintptr_t cur_alloc = RoundUpPow2(bytes_used, kCacheline);
485 if (unlikely(S_CAST(uintptr_t, arena_end - (*arena_base_ptr)) < cur_alloc)) {
486 return kPglRetNomem;
487 }
488 *existpp = R_CAST(InfoExist*, *arena_base_ptr);
489 (*existpp)->prekeys = R_CAST(char*, &((*arena_base_ptr)[bytes_used - prekeys_blen]));
490 (*arena_base_ptr) += cur_alloc;
491 (*existpp)->key_ct = key_ct;
492 read_iter = require_info_flattened;
493 char* write_iter = (*existpp)->prekeys;
494 for (uint32_t kidx = 0; kidx != key_ct; ++kidx) {
495 *write_iter++ = ';';
496 const uintptr_t slen = strlen(read_iter);
497 (*existpp)->key_slens[kidx] = slen;
498 const uintptr_t blen = slen + 1;
499 write_iter = memcpya(write_iter, read_iter, blen);
500 read_iter = &(read_iter[blen]);
501 }
502 return kPglRetSuccess;
503 }
504
InfoExistCheck(const char * info_token,const InfoExist * existp)505 uint32_t InfoExistCheck(const char* info_token, const InfoExist* existp) {
506 const uint32_t key_ct = existp->key_ct;
507 const char* prekeys_iter = existp->prekeys;
508 const uint32_t* key_slens = existp->key_slens;
509 for (uint32_t kidx = 0; kidx != key_ct; ++kidx) {
510 const uint32_t key_slen = key_slens[kidx];
511 const char* possible_hit;
512 // similar logic to hardcoded PR, except key can also be followed by '=',
513 // and if it is there can't be a lone '.' after the equals
514 if (memequal(info_token, &(prekeys_iter[1]), key_slen)) {
515 possible_hit = &(info_token[key_slen]);
516 } else {
517 possible_hit = strstr(info_token, prekeys_iter);
518 if (!possible_hit) {
519 return 0;
520 }
521 possible_hit = &(possible_hit[key_slen + 1]);
522 }
523 while (1) {
524 const char cc = *possible_hit;
525 if ((!cc) || (cc == ';')) {
526 break;
527 }
528 if (cc == '=') {
529 if ((possible_hit[1] != '.') || (possible_hit[2] && (possible_hit[2] != ';'))) {
530 break;
531 }
532 return 0;
533 }
534 possible_hit = strstr(possible_hit, prekeys_iter);
535 if (!possible_hit) {
536 return 0;
537 }
538 possible_hit = &(possible_hit[key_slen + 1]);
539 }
540 prekeys_iter = &(prekeys_iter[key_slen + 2]);
541 }
542 return 1;
543 }
544
InfoNonexistCheck(const char * info_token,const InfoExist * nonexistp)545 uint32_t InfoNonexistCheck(const char* info_token, const InfoExist* nonexistp) {
546 const uint32_t key_ct = nonexistp->key_ct;
547 const char* prekeys_iter = nonexistp->prekeys;
548 const uint32_t* key_slens = nonexistp->key_slens;
549 uint32_t key_slen = 0;
550 for (uint32_t kidx = 0; kidx != key_ct; ++kidx, prekeys_iter = &(prekeys_iter[key_slen + 2])) {
551 key_slen = key_slens[kidx];
552 const char* possible_hit;
553 if (memequal(info_token, &(prekeys_iter[1]), key_slen)) {
554 possible_hit = &(info_token[key_slen]);
555 } else {
556 possible_hit = strstr(info_token, prekeys_iter);
557 if (!possible_hit) {
558 continue;
559 }
560 possible_hit = &(possible_hit[key_slen + 1]);
561 }
562 while (1) {
563 const char cc = *possible_hit;
564 if ((!cc) || (cc == ';')) {
565 return 0;
566 }
567 if (cc == '=') {
568 if ((possible_hit[1] != '.') || (possible_hit[2] && (possible_hit[2] != ';'))) {
569 return 0;
570 }
571 break;
572 }
573 possible_hit = strstr(possible_hit, prekeys_iter);
574 if (!possible_hit) {
575 break;
576 }
577 possible_hit = &(possible_hit[key_slen + 1]);
578 }
579 }
580 return 1;
581 }
582
583 typedef struct InfoFilterStruct {
584 NONCOPYABLE(InfoFilterStruct);
585 char* prekey;
586 const char* val_str;
587 uint32_t key_slen;
588 uint32_t val_slen;
589 CmpBinaryOp binary_op;
590 double val;
591 } InfoFilter;
592
InfoFilterInit(const unsigned char * arena_end,const CmpExpr * filter_exprp,const char * flagname_p,unsigned char ** arena_base_ptr,InfoFilter * filterp)593 PglErr InfoFilterInit(const unsigned char* arena_end, const CmpExpr* filter_exprp, const char* flagname_p, unsigned char** arena_base_ptr, InfoFilter* filterp) {
594 const char* pheno_name = filter_exprp->pheno_name;
595 const char* pheno_name_end_or_invalid = strchrnul3(pheno_name, ';', '=', ',');
596 if (unlikely(*pheno_name_end_or_invalid)) {
597 if (*pheno_name_end_or_invalid == ';') {
598 logerrprintfww("Error: Invalid --%s key '%s' (semicolon prohibited).\n", flagname_p, pheno_name);
599 } else if (*pheno_name_end_or_invalid == '=') {
600 logerrprintfww("Error: Invalid --%s key '%s' ('=' prohibited).\n", flagname_p, pheno_name);
601 } else {
602 logerrprintfww("Error: Invalid --%s key '%s' (comma prohibited).\n", flagname_p, pheno_name);
603 }
604 return kPglRetInvalidCmdline;
605 }
606 uint32_t key_slen = pheno_name_end_or_invalid - pheno_name;
607 const uintptr_t cur_alloc = RoundUpPow2(3 + key_slen, kCacheline);
608 if (unlikely(S_CAST(uintptr_t, arena_end - (*arena_base_ptr)) < cur_alloc)) {
609 return kPglRetNomem;
610 }
611 filterp->prekey = R_CAST(char*, *arena_base_ptr);
612 (*arena_base_ptr) += cur_alloc;
613 filterp->prekey[0] = ';';
614 char* key_iter = memcpya(&(filterp->prekey[1]), pheno_name, key_slen);
615 strcpy_k(key_iter, "=");
616 ++key_slen;
617 filterp->key_slen = key_slen;
618 filterp->val_str = nullptr;
619 filterp->val_slen = 0;
620 const CmpBinaryOp binary_op = filter_exprp->binary_op;
621 filterp->binary_op = binary_op;
622 // shouldn't need to initialize val
623
624 const char* val_str = &(pheno_name[key_slen]);
625 // bugfix (14 Dec 2017): INFO string constants are not guaranteed to start in
626 // a letter. Only interpret the value as a number if ScanadvDouble()
627 // consumes the entire value string.
628 const char* val_num_end = ScanadvDouble(val_str, &filterp->val);
629 if (val_num_end && (!val_num_end[0])) {
630 return kPglRetSuccess;
631 }
632 if (unlikely((binary_op != kCmpOperatorNoteq) && (binary_op != kCmpOperatorEq))) {
633 logerrprintfww("Error: Invalid --%s value '%s' (finite number expected).\n", flagname_p, val_str);
634 return kPglRetInvalidCmdline;
635 }
636 filterp->val_str = val_str;
637 const char* val_str_end_or_invalid = strchrnul2(val_str, ';', '=');
638 if (unlikely(*val_str_end_or_invalid)) {
639 if (*val_str_end_or_invalid == ';') {
640 logerrprintfww("Error: Invalid --%s value '%s' (semicolon prohibited).\n", flagname_p, val_str);
641 } else {
642 logerrprintfww("Error: Invalid --%s value '%s' ('=' prohibited).\n", flagname_p, val_str);
643 }
644 return kPglRetInvalidCmdline;
645 }
646 filterp->val_slen = val_str_end_or_invalid - val_str;
647 return kPglRetSuccess;
648 }
649
InfoConditionSatisfied(const char * info_token,const InfoFilter * filterp)650 uint32_t InfoConditionSatisfied(const char* info_token, const InfoFilter* filterp) {
651 const char* prekey = filterp->prekey;
652 const uint32_t key_slen = filterp->key_slen;
653 const CmpBinaryOp binary_op = filterp->binary_op;
654 const char* possible_hit;
655 // search for "[key]=" at start or ";[key]=" later; key_slen includes the
656 // trailing =
657 if (memequal(info_token, &(prekey[1]), key_slen)) {
658 possible_hit = &(info_token[key_slen]);
659 } else {
660 possible_hit = strstr(info_token, prekey);
661 if (!possible_hit) {
662 return (binary_op == kCmpOperatorNoteq);
663 }
664 possible_hit = &(possible_hit[key_slen + 1]);
665 }
666 if (filterp->val_str) {
667 const uint32_t val_slen = filterp->val_slen;
668 const uint32_t mismatch = ((!memequal(possible_hit, filterp->val_str, val_slen)) || (possible_hit[val_slen] && (possible_hit[val_slen] != ';')));
669 return mismatch ^ (binary_op != kCmpOperatorNoteq);
670 }
671 // bugfix (3 Jan 2020): semicolon (or \0) terminator expected, can't use
672 // ScantokDouble
673 double dxx;
674 const char* scan_end = ScanadvDouble(possible_hit, &dxx);
675 if ((!scan_end) || ((*scan_end != ';') && (*scan_end))) {
676 return (binary_op == kCmpOperatorNoteq);
677 }
678 const double val = filterp->val;
679 switch (binary_op) {
680 case kCmpOperatorNoteq:
681 return (dxx != val);
682 case kCmpOperatorLe:
683 return (dxx < val);
684 case kCmpOperatorLeq:
685 return (dxx <= val);
686 case kCmpOperatorGe:
687 return (dxx > val);
688 case kCmpOperatorGeq:
689 return (dxx >= val);
690 case kCmpOperatorEq:
691 return (dxx == val);
692 }
693 // should be unreachable, but some gcc versions complain
694 return 0;
695 }
696
SplitPar(const uint32_t * variant_bps,UnsortedVar vpos_sortstatus,uint32_t splitpar_bound1,uint32_t splitpar_bound2,uintptr_t * variant_include,uintptr_t * loaded_chr_mask,ChrInfo * cip,uint32_t * chrs_encountered_m1_ptr,uint32_t * exclude_ct_ptr)697 PglErr SplitPar(const uint32_t* variant_bps, UnsortedVar vpos_sortstatus, uint32_t splitpar_bound1, uint32_t splitpar_bound2, uintptr_t* variant_include, uintptr_t* loaded_chr_mask, ChrInfo* cip, uint32_t* chrs_encountered_m1_ptr, uint32_t* exclude_ct_ptr) {
698 const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
699 if (IsI32Neg(x_code) || (!IsSet(loaded_chr_mask, x_code))) {
700 logerrputs("Warning: --split-par had no effect (no X chromosome in dataset).\n");
701 return kPglRetSuccess;
702 }
703 const uint32_t par1_code = cip->xymt_codes[kChrOffsetPAR1];
704 const uint32_t par2_code = cip->xymt_codes[kChrOffsetPAR2];
705 if (unlikely(IsI32Neg(par2_code))) {
706 // may want to remove this restriction later
707 logerrputs("Error: --split-par cannot currently be used with a custom chromosome set.\n");
708 return kPglRetInvalidCmdline;
709 }
710 if (unlikely(IsSet(loaded_chr_mask, par1_code) || IsSet(loaded_chr_mask, par2_code))) {
711 logerrputs("Error: --split-par cannot be used on a dataset which already contains a PAR1 or\nPAR2 region.\n");
712 return kPglRetInvalidCmdline;
713 }
714 if (unlikely(vpos_sortstatus & kfUnsortedVarBp)) {
715 logerrputs("Error: --split-par cannot be used with an unsorted .bim/.pvar file.\n");
716 return kPglRetInvalidCmdline;
717 }
718 const uint32_t orig_xchr_fo_idx = cip->chr_idx_to_foidx[x_code];
719 const uint32_t orig_x_start = cip->chr_fo_vidx_start[orig_xchr_fo_idx];
720 const uint32_t orig_x_end = cip->chr_fo_vidx_start[orig_xchr_fo_idx + 1];
721 const uint32_t par1_end = orig_x_start + CountSortedSmallerU32(&(variant_bps[orig_x_start]), orig_x_end - orig_x_start, splitpar_bound1 + 1);
722 const uint32_t par2_start = par1_end + CountSortedSmallerU32(&(variant_bps[par1_end]), orig_x_end - par1_end, splitpar_bound2);
723 uint32_t tot_codes_changed = (par1_end - orig_x_start) + (orig_x_end - par2_start);
724 if (!tot_codes_changed) {
725 logerrputs("Warning: --split-par had no effect (no X variants were in the PARs).\n");
726 return kPglRetSuccess;
727 }
728 // one of the PARs, and/or the main chrX body, may be empty; that's not a big
729 // deal
730 *chrs_encountered_m1_ptr += 2;
731 const uint32_t chrs_encountered_m1 = *chrs_encountered_m1_ptr;
732 cip->chr_fo_vidx_start[chrs_encountered_m1 + 1] = cip->chr_fo_vidx_start[chrs_encountered_m1 - 1];
733 for (uint32_t chr_fo_idx = chrs_encountered_m1 - 2; chr_fo_idx != orig_xchr_fo_idx; --chr_fo_idx) {
734 cip->chr_fo_vidx_start[chr_fo_idx + 2] = cip->chr_fo_vidx_start[chr_fo_idx];
735 const uint32_t cur_chr_idx = cip->chr_file_order[chr_fo_idx];
736 cip->chr_file_order[chr_fo_idx + 2] = cur_chr_idx;
737 cip->chr_idx_to_foidx[cur_chr_idx] = chr_fo_idx + 2;
738 }
739 cip->chr_fo_vidx_start[orig_xchr_fo_idx + 1] = par1_end;
740 cip->chr_fo_vidx_start[orig_xchr_fo_idx + 2] = par2_start;
741 cip->chr_file_order[orig_xchr_fo_idx] = par1_code;
742 cip->chr_file_order[orig_xchr_fo_idx + 1] = x_code;
743 cip->chr_file_order[orig_xchr_fo_idx + 2] = par2_code;
744 cip->chr_idx_to_foidx[par1_code] = orig_xchr_fo_idx;
745 cip->chr_idx_to_foidx[x_code] = orig_xchr_fo_idx + 1;
746 cip->chr_idx_to_foidx[par2_code] = orig_xchr_fo_idx + 2;
747 uintptr_t* chr_mask = cip->chr_mask;
748 if (par1_end > orig_x_start) {
749 if (!IsSet(chr_mask, par1_code)) {
750 *exclude_ct_ptr += PopcountBitRange(variant_include, orig_x_start, par1_end);
751 ClearBitsNz(orig_x_start, par1_end, variant_include);
752 } else {
753 SetBit(par1_code, loaded_chr_mask);
754 }
755 }
756 if (par1_end == par2_start) {
757 ClearBit(x_code, chr_mask);
758 } else if (!IsSet(chr_mask, x_code)) {
759 ClearBit(x_code, chr_mask);
760 *exclude_ct_ptr += PopcountBitRange(variant_include, par1_end, par2_start);
761 ClearBitsNz(par1_end, par2_start, variant_include);
762 }
763 if (par2_start < orig_x_end) {
764 if (!IsSet(chr_mask, par2_code)) {
765 *exclude_ct_ptr += PopcountBitRange(variant_include, par2_start, orig_x_end);
766 ClearBitsNz(par2_start, orig_x_end, variant_include);
767 } else {
768 SetBit(par2_code, loaded_chr_mask);
769 }
770 }
771 logprintf("--split-par: %u chromosome code%s changed.\n", tot_codes_changed, (tot_codes_changed == 1)? "" : "s");
772 return kPglRetSuccess;
773 }
774
775 static_assert((!(kMaxIdSlen % kCacheline)), "LoadPvar() must be updated.");
LoadPvar(const char * pvarname,const char * var_filter_exceptions_flattened,const char * varid_template_str,const char * varid_multi_template_str,const char * varid_multi_nonsnp_template_str,const char * missing_varid_match,const char * require_info_flattened,const char * require_no_info_flattened,const CmpExpr * extract_if_info_exprp,const CmpExpr * exclude_if_info_exprp,MiscFlags misc_flags,PvarPsamFlags pvar_psam_flags,uint32_t xheader_needed,uint32_t qualfilter_needed,float var_min_qual,uint32_t splitpar_bound1,uint32_t splitpar_bound2,uint32_t new_variant_id_max_allele_slen,uint32_t snps_only,uint32_t split_chr_ok,uint32_t filter_min_allele_ct,uint32_t filter_max_allele_ct,uint32_t max_thread_ct,ChrInfo * cip,uint32_t * max_variant_id_slen_ptr,uint32_t * info_reload_slen_ptr,UnsortedVar * vpos_sortstatus_ptr,char ** xheader_ptr,uintptr_t ** variant_include_ptr,uint32_t ** variant_bps_ptr,char *** variant_ids_ptr,uintptr_t ** allele_idx_offsets_ptr,const char *** allele_storage_ptr,uintptr_t ** qual_present_ptr,float ** quals_ptr,uintptr_t ** filter_present_ptr,uintptr_t ** filter_npass_ptr,char *** filter_storage_ptr,uintptr_t ** nonref_flags_ptr,double ** variant_cms_ptr,ChrIdx ** chr_idxs_ptr,uint32_t * raw_variant_ct_ptr,uint32_t * variant_ct_ptr,uint32_t * max_allele_ct_ptr,uint32_t * max_allele_slen_ptr,uintptr_t * xheader_blen_ptr,InfoFlags * info_flags_ptr,uint32_t * max_filter_slen_ptr)776 PglErr LoadPvar(const char* pvarname, const char* var_filter_exceptions_flattened, const char* varid_template_str, const char* varid_multi_template_str, const char* varid_multi_nonsnp_template_str, const char* missing_varid_match, const char* require_info_flattened, const char* require_no_info_flattened, const CmpExpr* extract_if_info_exprp, const CmpExpr* exclude_if_info_exprp, MiscFlags misc_flags, PvarPsamFlags pvar_psam_flags, uint32_t xheader_needed, uint32_t qualfilter_needed, float var_min_qual, uint32_t splitpar_bound1, uint32_t splitpar_bound2, uint32_t new_variant_id_max_allele_slen, uint32_t snps_only, uint32_t split_chr_ok, uint32_t filter_min_allele_ct, uint32_t filter_max_allele_ct, uint32_t max_thread_ct, ChrInfo* cip, uint32_t* max_variant_id_slen_ptr, uint32_t* info_reload_slen_ptr, UnsortedVar* vpos_sortstatus_ptr, char** xheader_ptr, uintptr_t** variant_include_ptr, uint32_t** variant_bps_ptr, char*** variant_ids_ptr, uintptr_t** allele_idx_offsets_ptr, const char*** allele_storage_ptr, uintptr_t** qual_present_ptr, float** quals_ptr, uintptr_t** filter_present_ptr, uintptr_t** filter_npass_ptr, char*** filter_storage_ptr, uintptr_t** nonref_flags_ptr, double** variant_cms_ptr, ChrIdx** chr_idxs_ptr, uint32_t* raw_variant_ct_ptr, uint32_t* variant_ct_ptr, uint32_t* max_allele_ct_ptr, uint32_t* max_allele_slen_ptr, uintptr_t* xheader_blen_ptr, InfoFlags* info_flags_ptr, uint32_t* max_filter_slen_ptr) {
777 // chr_info, max_variant_id_slen, and info_reload_slen are in/out; just
778 // outparameters after them. (Due to its large size in some VCFs, INFO is
779 // not kept in memory for now. This has a speed penalty, of course; maybe
780 // it's worthwhile to conditionally load it later.)
781
782 // allele_idx_offsets currently assumed to be initialized to nullptr
783
784 // should handle raw_variant_ct == 0 properly
785
786 // possible todo: optionally skip allele code loading
787 // probable todo: load INFO:END. (does this allow the CNV module to be
788 // unified with the rest of the program?) but this will probably wait
789 // until I need to analyze some sort of CNV data, and that day keeps
790 // getting postponed... for now, the BCF exporter performs its own parsing
791 // of INFO:END so that it can fill each variant's rlen field correctly.
792 // possible todo: require FILTER to only contain values declared in header,
793 // and modify its storage accordingly? (pointless for now, but worthwhile
794 // to keep an eye on what typical VCF files look like.)
795
796 // Workspace is used as follows:
797 // |--header, allele_storage->----|--other return arrays---|--linebuf--|-
798 // 1/4
799 //
800 // -temp-->----|----<- filter failures, variant IDs, long alleles--|
801 // end
802 // I.e. on successful return, both bigstack_base and bigstack_end will move.
803 // This is designed to be called near the start of a program, at a time when
804 // no large temporary buffer is needed.
805 unsigned char* bigstack_mark = g_bigstack_base;
806 unsigned char* bigstack_end_mark = g_bigstack_end;
807 uintptr_t line_idx = 0;
808 uint32_t max_extra_alt_ct = 0;
809 uint32_t max_allele_slen = 1;
810 PglErr reterr = kPglRetSuccess;
811 TextStream pvar_txs;
812 PreinitTextStream(&pvar_txs);
813 {
814 const uintptr_t quarter_left = RoundDownPow2(bigstack_left() / 4, kCacheline);
815 uint32_t max_line_blen;
816 if (unlikely(StandardizeMaxLineBlenEx(quarter_left, kLoadPvarBlockSize * 2 * sizeof(intptr_t), &max_line_blen))) {
817 goto LoadPvar_ret_NOMEM;
818 }
819 // bugfix (29 Jun 2018): don't cap allele_storage space at 2 GB, otherwise
820 // we're limited to 134M variants
821 unsigned char* rlstream_start = &(bigstack_mark[quarter_left]);
822 g_bigstack_base = rlstream_start;
823 uint32_t decompress_thread_ct = max_thread_ct - 1;
824 // 3 still seems best on a heavily multicore Linux test machine
825 if (decompress_thread_ct > 3) {
826 decompress_thread_ct = 3;
827 } else if (!decompress_thread_ct) {
828 decompress_thread_ct = 1;
829 }
830 reterr = InitTextStream(pvarname, max_line_blen, decompress_thread_ct, &pvar_txs);
831 if (unlikely(reterr)) {
832 goto LoadPvar_ret_TSTREAM_FAIL;
833 }
834 unsigned char* tmp_alloc_base = g_bigstack_base;
835 g_bigstack_base = bigstack_mark;
836
837 char* xheader_end = ((pvar_psam_flags & (kfPvarColXheader | kfPvarColVcfheader)) || xheader_needed)? R_CAST(char*, bigstack_mark) : nullptr;
838 uint32_t chrset_present = 0;
839 uint32_t info_pr_present = 0;
840 uint32_t info_pr_nonflag_present = 0;
841 uint32_t info_nonpr_present = 0;
842 char* line_iter = TextLineEnd(&pvar_txs);
843 char* line_start;
844 while (1) {
845 ++line_idx;
846 if (!TextGetUnsafe2(&pvar_txs, &line_iter)) {
847 if (unlikely(TextStreamErrcode2(&pvar_txs, &reterr))) {
848 goto LoadPvar_ret_TSTREAM_FAIL;
849 }
850 line_start = K_CAST(char*, &(g_one_char_strs[0]));
851 --line_iter;
852 break;
853 }
854 line_start = line_iter;
855 if ((*line_start != '#') || tokequal_k(line_start, "#CHROM")) {
856 break;
857 }
858 if (StrStartsWithUnsafe(line_start, "##INFO=<ID=PR,Number=")) {
859 if (unlikely(info_pr_present || info_pr_nonflag_present)) {
860 snprintf(g_logbuf, kLogbufSize, "Error: Duplicate INFO:PR header line in %s.\n", pvarname);
861 goto LoadPvar_ret_MALFORMED_INPUT_WW;
862 }
863 info_pr_nonflag_present = !StrStartsWithUnsafe(&(line_start[strlen("##INFO=<ID=PR,Number=")]), "0,Type=Flag,Description=");
864 info_pr_present = 1 - info_pr_nonflag_present;
865 if (info_pr_nonflag_present) {
866 logerrprintfww("Warning: Header line %" PRIuPTR " of %s has an unexpected definition of INFO:PR. This interferes with a few merge and liftover operations.\n", line_idx, pvarname);
867 }
868 } else if ((!info_nonpr_present) && StrStartsWithUnsafe(line_start, "##INFO=<ID=")) {
869 info_nonpr_present = 1;
870 }
871 if (StrStartsWithUnsafe(line_start, "##chrSet=<")) {
872 if (unlikely(chrset_present)) {
873 snprintf(g_logbuf, kLogbufSize, "Error: Multiple ##chrSet header lines in %s.\n", pvarname);
874 goto LoadPvar_ret_MALFORMED_INPUT_WW;
875 }
876 chrset_present = 1;
877 const uint32_t cmdline_chrset = (cip->chrset_source == kChrsetSourceCmdline) && (!(misc_flags & kfMiscChrOverrideFile));
878 reterr = ReadChrsetHeaderLine(&(line_start[strlen("##chrSet=<")]), pvarname, misc_flags, line_idx, cip);
879 if (unlikely(reterr)) {
880 goto LoadPvar_ret_1;
881 }
882 if (!cmdline_chrset) {
883 const uint32_t autosome_ct = cip->autosome_ct;
884 if (cip->haploid_mask[0] & 1) {
885 logprintf("chrSet header line: %u autosome%s (haploid).\n", autosome_ct, (autosome_ct == 1)? "" : "s");
886 } else {
887 logprintf("chrSet header line: %u autosome pair%s.\n", autosome_ct, (autosome_ct == 1)? "" : "s");
888 }
889 }
890 } else if (xheader_end) {
891 // if the "pvar file" was actually a VCF, suppress the same lines we'd
892 // suppress when importing with --vcf.
893 if ((!StrStartsWithUnsafe(line_start, "##fileformat=")) &&
894 (!StrStartsWithUnsafe(line_start, "##fileDate=")) &&
895 (!StrStartsWithUnsafe(line_start, "##source=")) &&
896 (!StrStartsWithUnsafe(line_start, "##FORMAT="))) {
897 char* line_end = AdvToDelim(line_start, '\n');
898 uint32_t line_slen = line_end - line_start;
899 if (line_start[line_slen - 1] == '\r') {
900 --line_slen;
901 }
902 if (unlikely(S_CAST(uintptr_t, R_CAST(char*, rlstream_start) - xheader_end) < line_slen + 2)) {
903 goto LoadPvar_ret_NOMEM;
904 }
905 xheader_end = memcpya(xheader_end, line_start, line_slen);
906 AppendBinaryEoln(&xheader_end);
907 line_iter = line_end;
908 }
909 }
910 line_iter = AdvPastDelim(line_iter, '\n');
911 }
912 if (xheader_end) {
913 *xheader_ptr = R_CAST(char*, bigstack_mark);
914 *xheader_blen_ptr = xheader_end - (*xheader_ptr);
915 BigstackBaseSet(xheader_end);
916 }
917 FinalizeChrset(misc_flags, cip);
918 const char** allele_storage = R_CAST(const char**, g_bigstack_base);
919 const char** allele_storage_iter = allele_storage;
920
921 uint32_t col_skips[8];
922 uint32_t col_types[8];
923 uint32_t relevant_postchr_col_ct = 5;
924 uint32_t alt_col_idx = 4;
925 uint32_t load_qual_col = 0;
926 uint32_t load_filter_col = 0;
927 uint32_t info_col_present = 0;
928 uint32_t cm_col_present = 0;
929 if (line_start[0] == '#') {
930 *info_flags_ptr = S_CAST(InfoFlags, (info_pr_present * kfInfoPrFlagPresent) | (info_pr_nonflag_present * kfInfoPrNonflagPresent) | (info_nonpr_present * kfInfoNonprPresent));
931 // parse header
932 // [-1] = #CHROM (must be first column)
933 // [0] = POS
934 // [1] = ID
935 // [2] = REF
936 // [3] = ALT
937 // [4] = QUAL
938 // [5] = FILTER
939 // [6] = INFO
940 // [7] = CM (usually absent)
941
942 // code is similar to plink 1.9 annotate() and gene_report(), but they
943 // don't have a forced first column
944 // might want to write plink2_common library functions for this...
945 const char* token_end = &(line_start[6]);
946 uint32_t found_header_bitset = 0;
947 relevant_postchr_col_ct = 0;
948 const char* linebuf_iter;
949 for (uint32_t col_idx = 1; ; ++col_idx) {
950 linebuf_iter = FirstNonTspace(token_end);
951 if (IsEolnKns(*linebuf_iter)) {
952 break;
953 }
954 token_end = CurTokenEnd(linebuf_iter);
955 const uint32_t token_slen = token_end - linebuf_iter;
956 uint32_t cur_col_type;
957 if (token_slen <= 3) {
958 if (token_slen == 3) {
959 if (memequal_k(linebuf_iter, "POS", 3)) {
960 cur_col_type = 0;
961 } else if (memequal_k(linebuf_iter, "REF", 3)) {
962 cur_col_type = 2;
963 } else if (memequal_k(linebuf_iter, "ALT", 3)) {
964 cur_col_type = 3;
965 alt_col_idx = col_idx;
966 } else {
967 continue;
968 }
969 } else if (token_slen == 2) {
970 if (memequal_k(linebuf_iter, "ID", 2)) {
971 cur_col_type = 1;
972 } else if (memequal_k(linebuf_iter, "CM", 2)) {
973 cur_col_type = 7;
974 cm_col_present = 1;
975 } else {
976 continue;
977 }
978 } else {
979 continue;
980 }
981 } else if (strequal_k(linebuf_iter, "QUAL", token_slen)) {
982 load_qual_col = 2 * ((pvar_psam_flags & (kfPvarColMaybequal | kfPvarColQual)) || qualfilter_needed) + (var_min_qual != -1);
983 if (!load_qual_col) {
984 continue;
985 }
986 cur_col_type = 4;
987 } else if (strequal_k(linebuf_iter, "INFO", token_slen)) {
988 cur_col_type = 6;
989 info_col_present = 1;
990 } else if (token_slen == 6) {
991 if (memequal_k(linebuf_iter, "FILTER", 6)) {
992 load_filter_col = 2 * ((pvar_psam_flags & (kfPvarColMaybefilter | kfPvarColFilter)) || qualfilter_needed) + ((misc_flags / kfMiscExcludePvarFilterFail) & 1);
993 if (!load_filter_col) {
994 continue;
995 }
996 cur_col_type = 5;
997 } else if (memequal_k(linebuf_iter, "FORMAT", 6)) {
998 break;
999 } else {
1000 continue;
1001 }
1002 } else {
1003 continue;
1004 }
1005 const uint32_t cur_col_type_shifted = 1 << cur_col_type;
1006 if (unlikely(found_header_bitset & cur_col_type_shifted)) {
1007 // known token, so no overflow danger
1008 char* write_iter = strcpya_k(g_logbuf, "Error: Duplicate column header '");
1009 write_iter = memcpya(write_iter, linebuf_iter, token_slen);
1010 write_iter = strcpya_k(write_iter, "' on line ");
1011 write_iter = wtoa(line_idx, write_iter);
1012 write_iter = strcpya_k(write_iter, " of ");
1013 write_iter = strcpya(write_iter, pvarname);
1014 memcpy_k(write_iter, ".\n\0", 4);
1015 goto LoadPvar_ret_MALFORMED_INPUT_WW;
1016 }
1017 found_header_bitset |= cur_col_type_shifted;
1018 col_skips[relevant_postchr_col_ct] = col_idx;
1019 col_types[relevant_postchr_col_ct++] = cur_col_type;
1020 }
1021 if (unlikely((found_header_bitset & 0x0f) != 0x0f)) {
1022 snprintf(g_logbuf, kLogbufSize, "Error: Missing column header(s) on line %" PRIuPTR " of %s. (POS, ID, REF, and ALT are required.)\n", line_idx, pvarname);
1023 goto LoadPvar_ret_MALFORMED_INPUT_WW;
1024 }
1025 if ((var_min_qual != -1) && (!(found_header_bitset & 0x10))) {
1026 logerrputs("Error: --var-min-qual used on a variant file with no QUAL column.\n");
1027 goto LoadPvar_ret_INCONSISTENT_INPUT;
1028 }
1029 for (uint32_t rpc_col_idx = relevant_postchr_col_ct - 1; rpc_col_idx; --rpc_col_idx) {
1030 col_skips[rpc_col_idx] -= col_skips[rpc_col_idx - 1];
1031 }
1032 // skip this line in main loop
1033 line_iter = K_CAST(char*, AdvToDelim(linebuf_iter, '\n'));
1034 line_start = line_iter;
1035 } else if (line_start[0] != '\n') {
1036 if (var_min_qual != -1) {
1037 logerrputs("Error: --var-min-qual used on a variant file with no QUAL column.\n");
1038 goto LoadPvar_ret_INCONSISTENT_INPUT;
1039 }
1040 *info_flags_ptr = kfInfoPrNonrefDefault;
1041 col_skips[0] = 1;
1042 col_skips[1] = 1;
1043 col_skips[2] = 1;
1044 col_skips[3] = 1;
1045 col_types[0] = 1;
1046 // CM column is formally optional in headerless .pvar files (and it was
1047 // "secretly" optional for the standard plink 1.9 standard .bim loader).
1048 // If the line has exactly 5 columns, assume CM is omitted.
1049
1050 // Note that this doesn't break in the line_start = &(g_one_char_strs[0])
1051 // case.
1052 const char* linebuf_iter = NextTokenMult(line_start, 4);
1053 if (unlikely(!linebuf_iter)) {
1054 goto LoadPvar_ret_MISSING_TOKENS;
1055 }
1056 linebuf_iter = NextToken(linebuf_iter);
1057 if (!linebuf_iter) {
1058 // #CHROM ID POS ALT REF
1059 relevant_postchr_col_ct = 4;
1060 col_types[1] = 0;
1061 col_types[2] = 3;
1062 col_types[3] = 2;
1063 alt_col_idx = 3;
1064 } else {
1065 // #CHROM ID CM POS ALT REF
1066 col_skips[4] = 1;
1067 col_types[1] = 7;
1068 col_types[2] = 0;
1069 col_types[3] = 3;
1070 col_types[4] = 2;
1071 // alt_col_idx = 4;
1072 cm_col_present = 1;
1073 }
1074 }
1075 uint32_t info_reload_slen = *info_reload_slen_ptr;
1076
1077 // done with header. line_start now points to either the beginning of the
1078 // first real line, or an eoln character.
1079 uint32_t max_variant_id_slen = *max_variant_id_slen_ptr;
1080 uint32_t chrs_encountered_m1 = UINT32_MAX; // intentional overflow
1081 uint32_t prev_chr_code = UINT32_MAX; // force initial mismatch
1082 uint32_t raw_variant_ct = 0;
1083 uintptr_t* chr_mask = cip->chr_mask;
1084 const char* missing_allele_str = &(g_one_char_strs[92]);
1085 double last_cm = -DBL_MAX;
1086 int32_t last_bp = 0;
1087
1088 // this way, we only need to check allele_storage_iter against this (i)
1089 // when processing a multiallelic variant or (ii) at the end of a block
1090 const char** allele_storage_limit = R_CAST(const char**, &(rlstream_start[kLoadPvarBlockSize * (-2) * sizeof(intptr_t)]));
1091
1092 uintptr_t* loaded_chr_mask = R_CAST(uintptr_t*, tmp_alloc_base);
1093 unsigned char* tmp_alloc_end = bigstack_end_mark;
1094 // guaranteed to succeed since max_line_blen > 128k, etc.
1095 /*
1096 if ((uintptr_t)(tmp_alloc_end - tmp_alloc_base) < RoundUpPow2(kChrMaskWords * sizeof(intptr_t), kCacheline)) {
1097 goto LoadPvar_ret_NOMEM;
1098 }
1099 */
1100 tmp_alloc_base = &(tmp_alloc_base[RoundUpPow2(kChrMaskWords * sizeof(intptr_t), kCacheline)]);
1101 // bugfix (2 Jun 2017): forgot to zero-initialize loaded_chr_mask
1102 ZeroWArr(kChrMaskWords, loaded_chr_mask);
1103
1104 InfoExist* info_existp = nullptr;
1105 if (require_info_flattened) {
1106 reterr = InfoExistInit(tmp_alloc_end, require_info_flattened, "require-info", &tmp_alloc_base, &info_existp);
1107 if (unlikely(reterr)) {
1108 goto LoadPvar_ret_1;
1109 }
1110 }
1111 InfoExist* info_nonexistp = nullptr;
1112 if (require_no_info_flattened) {
1113 reterr = InfoExistInit(tmp_alloc_end, require_no_info_flattened, "require-no-info", &tmp_alloc_base, &info_nonexistp);
1114 if (unlikely(reterr)) {
1115 goto LoadPvar_ret_1;
1116 }
1117 }
1118 InfoFilter info_keep;
1119 info_keep.prekey = nullptr;
1120 if (extract_if_info_exprp->pheno_name) {
1121 // todo: also print warning (or optionally error out?) if header line is
1122 // missing or doesn't match type expectation
1123 // (same for --require-info)
1124 reterr = InfoFilterInit(tmp_alloc_end, extract_if_info_exprp, "extract-if-info", &tmp_alloc_base, &info_keep);
1125 if (unlikely(reterr)) {
1126 goto LoadPvar_ret_1;
1127 }
1128 }
1129 InfoFilter info_remove;
1130 info_remove.prekey = nullptr;
1131 if (exclude_if_info_exprp->pheno_name) {
1132 reterr = InfoFilterInit(tmp_alloc_end, exclude_if_info_exprp, "exclude-if-info", &tmp_alloc_base, &info_remove);
1133 if (unlikely(reterr)) {
1134 goto LoadPvar_ret_1;
1135 }
1136 }
1137 if (!info_col_present) {
1138 if (unlikely(require_info_flattened)) {
1139 logerrputs("Error: --require-info used on a variant file with no INFO column.\n");
1140 goto LoadPvar_ret_INCONSISTENT_INPUT;
1141 }
1142 if (unlikely(info_keep.prekey)) {
1143 logerrputs("Error: --extract-if-info used on a variant file with no INFO column.\n");
1144 goto LoadPvar_ret_INCONSISTENT_INPUT;
1145 }
1146 info_pr_present = 0;
1147 info_reload_slen = 0;
1148 } else if ((!info_pr_present) && (!info_reload_slen) && (!info_existp) && (!info_nonexistp) && (!info_keep.prekey) && (!info_remove.prekey)) {
1149 info_col_present = 0;
1150 }
1151
1152 uint32_t fexcept_ct = 0;
1153 uintptr_t max_fexcept_blen = 2;
1154 char* sorted_fexcepts = nullptr;
1155 if (var_filter_exceptions_flattened) {
1156 if (unlikely(MultistrToStrboxDedupArenaAlloc(tmp_alloc_end, var_filter_exceptions_flattened, &tmp_alloc_base, &sorted_fexcepts, &fexcept_ct, &max_fexcept_blen))) {
1157 goto LoadPvar_ret_NOMEM;
1158 }
1159 }
1160 const uint32_t new_variant_id_overflow_missing = (misc_flags / kfMiscNewVarIdOverflowMissing) & 1;
1161 char* chr_output_name_buf = nullptr;
1162 VaridTemplate* varid_templatep = nullptr;
1163 VaridTemplate* varid_multi_templatep = nullptr;
1164 VaridTemplate* varid_multi_nonsnp_templatep = nullptr;
1165 uint32_t missing_varid_blen = 0;
1166 uint32_t missing_varid_match_slen = 0;
1167 if (varid_template_str) {
1168 if (unlikely(S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) < kMaxIdSlen + 3 * RoundUpPow2(sizeof(VaridTemplate), kCacheline))) {
1169 goto LoadPvar_ret_NOMEM;
1170 }
1171 chr_output_name_buf = R_CAST(char*, tmp_alloc_base);
1172 tmp_alloc_base = &(tmp_alloc_base[kMaxIdSlen]);
1173 if (!missing_varid_match) {
1174 missing_varid_match = &(g_one_char_strs[92]); // '.'
1175 }
1176 missing_varid_blen = strlen(missing_varid_match);
1177 if (misc_flags & kfMiscSetMissingVarIds) {
1178 missing_varid_match_slen = missing_varid_blen;
1179 }
1180 ++missing_varid_blen;
1181 varid_templatep = R_CAST(VaridTemplate*, tmp_alloc_base);
1182 tmp_alloc_base = &(tmp_alloc_base[RoundUpPow2(sizeof(VaridTemplate), kCacheline)]);
1183 const uint32_t overflow_substitute_blen = new_variant_id_overflow_missing? missing_varid_blen : 0;
1184 VaridTemplateInit(varid_template_str, missing_varid_match, chr_output_name_buf, new_variant_id_max_allele_slen, overflow_substitute_blen, varid_templatep);
1185 if (varid_multi_template_str) {
1186 varid_multi_templatep = R_CAST(VaridTemplate*, tmp_alloc_base);
1187 tmp_alloc_base = &(tmp_alloc_base[RoundUpPow2(sizeof(VaridTemplate), kCacheline)]);
1188 VaridTemplateInit(varid_multi_template_str, missing_varid_match, chr_output_name_buf, new_variant_id_max_allele_slen, overflow_substitute_blen, varid_multi_templatep);
1189 }
1190 if (varid_multi_nonsnp_template_str) {
1191 varid_multi_nonsnp_templatep = R_CAST(VaridTemplate*, tmp_alloc_base);
1192 tmp_alloc_base = &(tmp_alloc_base[RoundUpPow2(sizeof(VaridTemplate), kCacheline)]);
1193 VaridTemplateInit(varid_multi_nonsnp_template_str, missing_varid_match, chr_output_name_buf, new_variant_id_max_allele_slen, overflow_substitute_blen, varid_multi_nonsnp_templatep);
1194 }
1195 }
1196
1197 // prevent later return-array allocations from overlapping with temporary
1198 // storage
1199 g_bigstack_end = tmp_alloc_base;
1200
1201 // prevent variant_id_htable_find from breaking
1202 if (R_CAST(const char*, tmp_alloc_end) > (&(g_one_char_strs[512 - kMaxIdSlen]))) {
1203 tmp_alloc_end = R_CAST(unsigned char*, K_CAST(char*, &(g_one_char_strs[512 - kMaxIdSlen])));
1204 }
1205 const uint32_t allow_extra_chrs = (misc_flags / kfMiscAllowExtraChrs) & 1;
1206 const uint32_t merge_par = ((misc_flags & (kfMiscMergePar | kfMiscMergeX)) != 0);
1207 const uint32_t x_code = cip->xymt_codes[kChrOffsetX];
1208 const char input_missing_geno_char = *g_input_missing_geno_ptr;
1209 uint32_t parx_code = cip->xymt_codes[kChrOffsetPAR1];
1210 uint32_t par2_code = cip->xymt_codes[kChrOffsetPAR2];
1211 if (misc_flags & kfMiscMergeX) {
1212 parx_code = cip->xymt_codes[kChrOffsetXY];
1213 par2_code = UINT32_MAXM1;
1214 }
1215 uint32_t merge_par_ct = 0;
1216
1217 // Corner case: with --split-par + --not-chr x, we should keep the
1218 // pseudoautosomal regions. To facilitate this, we temporarily don't mask
1219 // out chrX; SplitPar() handles this properly later.
1220 const uint32_t splitpar_and_exclude_x = splitpar_bound2 && (!IsI32Neg(x_code)) && (!IsSet(cip->chr_mask, x_code));
1221 if (splitpar_and_exclude_x) {
1222 SetBit(x_code, cip->chr_mask);
1223 }
1224
1225 uint8_t acgtm_bool_table[256] = {
1226 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1228 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
1229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1230 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1231 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1232 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1233 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1234 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1236 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1237 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1238 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1239 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1240 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1241 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1242 if (snps_only > 1) {
1243 acgtm_bool_table[ctou32(input_missing_geno_char)] = 1;
1244 }
1245
1246 uint32_t* cur_bps = nullptr;
1247 uintptr_t* cur_allele_idxs = nullptr;
1248 char** cur_ids = nullptr;
1249 uintptr_t* cur_include = nullptr;
1250 uintptr_t* cur_qual_present = nullptr;
1251 float* cur_quals = nullptr;
1252 uintptr_t* cur_filter_present = nullptr;
1253 uintptr_t* cur_filter_npass = nullptr;
1254 char** cur_filter_storage = nullptr;
1255 uintptr_t* cur_nonref_flags = nullptr;
1256
1257 // only need this for --set-{missing,all}-var-ids error message
1258 uint32_t max_chr_slen = 0;
1259
1260 uint32_t max_filter_slen = 0;
1261 uint32_t exclude_ct = 0;
1262
1263 // only allocated when necessary
1264 // if we want to scale this approach to more fields, we'll need to add a
1265 // few pointers to the start of each block. right now, we force cur_cms[]
1266 // to be allocated before cur_chr_idxs[] when both are present, but this
1267 // is error-prone.
1268 uint32_t at_least_one_npass_filter = 0;
1269 uint32_t at_least_one_nzero_cm = 0;
1270 uintptr_t new_variant_id_allele_len_overflow = 0;
1271 double* cur_cms = nullptr;
1272 uint32_t cms_start_block = UINT32_MAX;
1273
1274 ChrIdx* cur_chr_idxs = nullptr;
1275 uint32_t chr_idxs_start_block = UINT32_MAX;
1276 uint32_t is_split_chr = 0;
1277 UnsortedVar vpos_sortstatus = kfUnsortedVar0;
1278
1279 if (IsEolnKns(*line_start)) {
1280 ++line_iter;
1281 ++line_idx;
1282 } else {
1283 line_iter = line_start;
1284 }
1285 for (; TextGetUnsafe2(&pvar_txs, &line_iter); ++line_iter, ++line_idx) {
1286 if (unlikely(line_iter[0] == '#')) {
1287 snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of %s starts with a '#'. (This is only permitted before the first nonheader line, and if a #CHROM header line is present it must denote the end of the header block.)\n", line_idx, pvarname);
1288 goto LoadPvar_ret_MALFORMED_INPUT_WW;
1289 }
1290 #ifdef __LP64__
1291 // maximum prime < 2^32 is 4294967291; quadratic hashing guarantee
1292 // breaks down past that divided by 2.
1293 if (unlikely(raw_variant_ct == 0x7ffffffd)) {
1294 logerrputs("Error: " PROG_NAME_STR " does not support more than 2^31 - 3 variants. We recommend using\nother software for very deep studies of small numbers of genomes.\n");
1295 goto LoadPvar_ret_MALFORMED_INPUT;
1296 }
1297 #endif
1298 const uint32_t variant_idx_lowbits = raw_variant_ct % kLoadPvarBlockSize;
1299 if (!variant_idx_lowbits) {
1300 if (unlikely(
1301 (S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) <=
1302 kLoadPvarBlockSize *
1303 (sizeof(int32_t) +
1304 2 * sizeof(intptr_t) +
1305 at_least_one_nzero_cm * sizeof(double)) +
1306 is_split_chr * sizeof(ChrIdx) +
1307 (1 + info_pr_present) * (kLoadPvarBlockSize / CHAR_BIT) +
1308 (load_qual_col? ((kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(float)) : 0) +
1309 (load_filter_col? (2 * (kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(intptr_t)) : 0)) ||
1310 (allele_storage_iter >= allele_storage_limit))) {
1311 goto LoadPvar_ret_NOMEM;
1312 }
1313 cur_bps = R_CAST(uint32_t*, tmp_alloc_base);
1314 cur_allele_idxs = R_CAST(uintptr_t*, &(tmp_alloc_base[kLoadPvarBlockSize * sizeof(int32_t)]));
1315 cur_ids = R_CAST(char**, &(tmp_alloc_base[kLoadPvarBlockSize * (sizeof(int32_t) + sizeof(intptr_t))]));
1316 cur_include = R_CAST(uintptr_t*, &(tmp_alloc_base[kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t))]));
1317 SetAllWArr(kLoadPvarBlockSize / kBitsPerWord, cur_include);
1318 tmp_alloc_base = &(tmp_alloc_base[kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t)) + (kLoadPvarBlockSize / CHAR_BIT)]);
1319 if (load_qual_col > 1) {
1320 cur_qual_present = R_CAST(uintptr_t*, tmp_alloc_base);
1321 ZeroWArr(kLoadPvarBlockSize / kBitsPerWord, cur_qual_present);
1322 cur_quals = R_CAST(float*, &(tmp_alloc_base[kLoadPvarBlockSize / CHAR_BIT]));
1323 tmp_alloc_base = &(tmp_alloc_base[kLoadPvarBlockSize * sizeof(float) + (kLoadPvarBlockSize / CHAR_BIT)]);
1324 }
1325 if (load_filter_col > 1) {
1326 cur_filter_present = R_CAST(uintptr_t*, tmp_alloc_base);
1327 cur_filter_npass = R_CAST(uintptr_t*, &(tmp_alloc_base[kLoadPvarBlockSize / CHAR_BIT]));
1328 cur_filter_storage = R_CAST(char**, &(tmp_alloc_base[2 * (kLoadPvarBlockSize / CHAR_BIT)]));
1329 ZeroWArr(kLoadPvarBlockSize / kBitsPerWord, cur_filter_present);
1330 ZeroWArr(kLoadPvarBlockSize / kBitsPerWord, cur_filter_npass);
1331 tmp_alloc_base = &(tmp_alloc_base[2 * (kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(intptr_t)]);
1332 }
1333 if (info_pr_present) {
1334 cur_nonref_flags = R_CAST(uintptr_t*, tmp_alloc_base);
1335 ZeroWArr(kLoadPvarBlockSize / kBitsPerWord, cur_nonref_flags);
1336 tmp_alloc_base = &(tmp_alloc_base[kLoadPvarBlockSize / CHAR_BIT]);
1337 }
1338 if (at_least_one_nzero_cm) {
1339 cur_cms = R_CAST(double*, tmp_alloc_base);
1340 ZeroDArr(kLoadPvarBlockSize, cur_cms);
1341 tmp_alloc_base = R_CAST(unsigned char*, &(cur_cms[kLoadPvarBlockSize]));
1342 }
1343 if (is_split_chr) {
1344 cur_chr_idxs = R_CAST(ChrIdx*, tmp_alloc_base);
1345 tmp_alloc_base = R_CAST(unsigned char*, &(cur_chr_idxs[kLoadPvarBlockSize]));
1346 }
1347 }
1348 char* linebuf_iter = CurTokenEnd(line_iter);
1349 // #CHROM
1350 if (unlikely(*linebuf_iter == '\n')) {
1351 goto LoadPvar_ret_MISSING_TOKENS;
1352 }
1353 uint32_t cur_chr_code;
1354 reterr = GetOrAddChrCodeDestructive(".pvar file", line_idx, allow_extra_chrs, line_iter, linebuf_iter, cip, &cur_chr_code);
1355 if (unlikely(reterr)) {
1356 goto LoadPvar_ret_1;
1357 }
1358 if (merge_par) {
1359 if (cur_chr_code == par2_code) {
1360 // don't permit PAR1 variants after PAR2
1361 parx_code = par2_code;
1362 }
1363 if (cur_chr_code == parx_code) {
1364 ++merge_par_ct;
1365 cur_chr_code = x_code;
1366 }
1367 }
1368 if (cur_chr_code != prev_chr_code) {
1369 prev_chr_code = cur_chr_code;
1370 if (!is_split_chr) {
1371 if (IsSet(loaded_chr_mask, cur_chr_code)) {
1372 if (unlikely(!split_chr_ok)) {
1373 snprintf(g_logbuf, kLogbufSize, "Error: %s has a split chromosome. Use --make-pgen + --sort-vars to remedy this.\n", pvarname);
1374 goto LoadPvar_ret_MALFORMED_INPUT_WW;
1375 }
1376 if (unlikely(S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) < kLoadPvarBlockSize * sizeof(ChrIdx))) {
1377 goto LoadPvar_ret_NOMEM;
1378 }
1379 cur_chr_idxs = R_CAST(ChrIdx*, tmp_alloc_base);
1380 tmp_alloc_base = R_CAST(unsigned char*, &(cur_chr_idxs[kLoadPvarBlockSize]));
1381 // may want to track the first problem variant index
1382 // cip->chr_fo_vidx_start[chrs_encountered_m1] = raw_variant_ct;
1383 BackfillChrIdxs(cip, chrs_encountered_m1, RoundDownPow2(raw_variant_ct, kLoadPvarBlockSize), raw_variant_ct, cur_chr_idxs);
1384 chr_idxs_start_block = raw_variant_ct / kLoadPvarBlockSize;
1385 is_split_chr = 1;
1386 vpos_sortstatus |= kfUnsortedVarBp | kfUnsortedVarCm | kfUnsortedVarSplitChr;
1387 } else {
1388 // how much of this do we need in split-chrom case?
1389 cip->chr_file_order[++chrs_encountered_m1] = cur_chr_code;
1390 cip->chr_fo_vidx_start[chrs_encountered_m1] = raw_variant_ct;
1391 cip->chr_idx_to_foidx[cur_chr_code] = chrs_encountered_m1;
1392 last_cm = -DBL_MAX;
1393 }
1394 }
1395 // always need to set this, if we want to avoid chromosome-length
1396 // overestimation in --sort-vars + early-variant-filter +
1397 // pvar-cols=+vcfheader case.
1398 last_bp = 0;
1399
1400 SetBit(cur_chr_code, loaded_chr_mask);
1401 if (chr_output_name_buf) {
1402 char* chr_name_end = chrtoa(cip, cur_chr_code, chr_output_name_buf);
1403 const uint32_t chr_slen = chr_name_end - chr_output_name_buf;
1404 if (chr_slen > max_chr_slen) {
1405 max_chr_slen = chr_slen;
1406 }
1407 const int32_t chr_slen_delta = chr_slen - varid_templatep->chr_slen;
1408 varid_templatep->chr_slen = chr_slen;
1409 varid_templatep->base_len += chr_slen_delta;
1410 if (varid_multi_templatep) {
1411 varid_multi_templatep->chr_slen = chr_slen;
1412 varid_multi_templatep->base_len += chr_slen_delta;
1413 }
1414 if (varid_multi_nonsnp_templatep) {
1415 varid_multi_nonsnp_templatep->chr_slen = chr_slen;
1416 varid_multi_nonsnp_templatep->base_len += chr_slen_delta;
1417 }
1418 }
1419 }
1420 *linebuf_iter = '\t';
1421
1422 // could make this store (and cur_allele_idxs[] allocation) conditional
1423 // on a multiallelic variant being sighted, but unlike the CM column
1424 // this should become common
1425 cur_allele_idxs[variant_idx_lowbits] = allele_storage_iter - allele_storage;
1426
1427 char* token_ptrs[8];
1428 uint32_t token_slens[8];
1429 uint32_t extra_alt_ct;
1430 if (IsSet(chr_mask, cur_chr_code) || info_pr_present) {
1431 linebuf_iter = TokenLex(linebuf_iter, col_types, col_skips, relevant_postchr_col_ct, token_ptrs, token_slens);
1432 if (unlikely(!linebuf_iter)) {
1433 goto LoadPvar_ret_MISSING_TOKENS;
1434 }
1435
1436 extra_alt_ct = CountByte(token_ptrs[3], ',', token_slens[3]);
1437 if (extra_alt_ct > max_extra_alt_ct) {
1438 max_extra_alt_ct = extra_alt_ct;
1439 }
1440
1441 // It is possible for the info_token[info_slen] assignment below to
1442 // clobber the line terminator, so we advance line_iter to eoln here
1443 // and never reference it again before the next line.
1444 line_iter = AdvToDelim(linebuf_iter, '\n');
1445 if (info_col_present) {
1446 const uint32_t info_slen = token_slens[6];
1447 if (info_slen > info_reload_slen) {
1448 info_reload_slen = info_slen;
1449 }
1450 char* info_token = token_ptrs[6];
1451 info_token[info_slen] = '\0';
1452 if (info_pr_present) {
1453 // always load all nonref_flags entries so (i) --ref-from-fa +
1454 // --make-just-pvar works and (ii) they can be compared against
1455 // the .pgen.
1456 if ((memequal_k(info_token, "PR", 2) && ((info_slen == 2) || (info_token[2] == ';'))) || memequal_k(&(info_token[S_CAST(int32_t, info_slen) - 3]), ";PR", 3)) {
1457 SetBit(variant_idx_lowbits, cur_nonref_flags);
1458 } else {
1459 const char* first_info_end = strchr(info_token, ';');
1460 if (first_info_end && strstr(first_info_end, ";PR;")) {
1461 SetBit(variant_idx_lowbits, cur_nonref_flags);
1462 }
1463 }
1464 if (!IsSet(chr_mask, cur_chr_code)) {
1465 goto LoadPvar_skip_variant;
1466 }
1467 }
1468 if (info_existp) {
1469 if (!InfoExistCheck(info_token, info_existp)) {
1470 goto LoadPvar_skip_variant;
1471 }
1472 }
1473 if (info_nonexistp) {
1474 if (!InfoNonexistCheck(info_token, info_nonexistp)) {
1475 goto LoadPvar_skip_variant;
1476 }
1477 }
1478 if (info_keep.prekey) {
1479 if (!InfoConditionSatisfied(info_token, &info_keep)) {
1480 goto LoadPvar_skip_variant;
1481 }
1482 }
1483 if (info_remove.prekey) {
1484 if (InfoConditionSatisfied(info_token, &info_remove)) {
1485 goto LoadPvar_skip_variant;
1486 }
1487 }
1488 }
1489 // POS
1490 int32_t cur_bp;
1491 if (unlikely(ScanIntAbsDefcap(token_ptrs[0], &cur_bp))) {
1492 snprintf(g_logbuf, kLogbufSize, "Error: Invalid bp coordinate on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
1493 goto LoadPvar_ret_MALFORMED_INPUT_WW;
1494 }
1495
1496 if (cur_bp < 0) {
1497 goto LoadPvar_skip_variant;
1498 }
1499
1500 // QUAL
1501 if (load_qual_col) {
1502 const char* qual_token = token_ptrs[4];
1503 if ((qual_token[0] != '.') || (qual_token[1] > ' ')) {
1504 float cur_qual;
1505 if (unlikely(ScanFloat(qual_token, &cur_qual))) {
1506 snprintf(g_logbuf, kLogbufSize, "Error: Invalid QUAL value on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
1507 goto LoadPvar_ret_MALFORMED_INPUT_WW;
1508 }
1509 if ((load_qual_col & 1) && (cur_qual < var_min_qual)) {
1510 goto LoadPvar_skip_variant;
1511 }
1512 if (load_qual_col > 1) {
1513 SetBit(variant_idx_lowbits, cur_qual_present);
1514 // possible todo: optimize all-quals-same case
1515 // possible todo: conditionally allocate, like cur_cms
1516 cur_quals[variant_idx_lowbits] = cur_qual;
1517 }
1518 } else if (load_qual_col & 1) {
1519 goto LoadPvar_skip_variant;
1520 }
1521 }
1522
1523 // avoid repeating the ALT string split in --set-...-var-ids case
1524 linebuf_iter = token_ptrs[3];
1525 uint32_t remaining_alt_char_ct = token_slens[3];
1526 // handle --snps-only here instead of later, since it reduces the
1527 // amount of data we need to load
1528 if (snps_only) {
1529 if ((token_slens[2] != 1) || (remaining_alt_char_ct != 2 * extra_alt_ct + 1)) {
1530 goto LoadPvar_skip_variant;
1531 }
1532 if (snps_only > 1) {
1533 // just-acgt
1534 if (!acgtm_bool_table[ctou32(token_ptrs[2][0])]) {
1535 goto LoadPvar_skip_variant;
1536 }
1537 for (uint32_t uii = 0; uii <= extra_alt_ct; ++uii) {
1538 if (!acgtm_bool_table[ctou32(linebuf_iter[2 * uii])]) {
1539 goto LoadPvar_skip_variant;
1540 }
1541 }
1542 }
1543 }
1544
1545 // also handle --min-alleles/--max-alleles here
1546 if (filter_min_allele_ct || (filter_max_allele_ct <= kPglMaxAltAlleleCt)) {
1547 uint32_t allele_ct = extra_alt_ct + 2;
1548 if (!extra_alt_ct) {
1549 // allele_ct == 1 or 2 for filtering purposes, depending on
1550 // whether ALT allele matches a missing code.
1551 if ((remaining_alt_char_ct == 1) && ((linebuf_iter[0] == '.') || (linebuf_iter[0] == input_missing_geno_char))) {
1552 allele_ct = 1;
1553 }
1554 }
1555 if ((allele_ct < filter_min_allele_ct) || (allele_ct > filter_max_allele_ct)) {
1556 goto LoadPvar_skip_variant;
1557 }
1558 }
1559
1560 // FILTER
1561 if (load_filter_col) {
1562 const char* filter_token = token_ptrs[5];
1563 const uint32_t filter_slen = token_slens[5];
1564 if ((filter_slen > 1) || (filter_token[0] != '.')) {
1565 if (!strequal_k(filter_token, "PASS", filter_slen)) {
1566 if (load_filter_col & 1) {
1567 if (!fexcept_ct) {
1568 goto LoadPvar_skip_variant;
1569 }
1570 const char* filter_token_end = &(filter_token[filter_slen]);
1571 for (const char* filter_token_iter = filter_token; ; ) {
1572 const char* cur_filter_name_end = AdvToDelimOrEnd(filter_token_iter, filter_token_end, ';');
1573 uint32_t cur_slen = cur_filter_name_end - filter_token_iter;
1574 // possible todo: error out on "PASS", since that
1575 // shouldn't coexist with other filters
1576 // possible todo: maintain a dictionary of FILTER
1577 // strings, analogous to what BCF2 does on disk
1578 if (bsearch_str(filter_token_iter, sorted_fexcepts, cur_slen, max_fexcept_blen, fexcept_ct) == -1) {
1579 goto LoadPvar_skip_variant;
1580 }
1581 if (cur_filter_name_end == filter_token_end) {
1582 break;
1583 }
1584 filter_token_iter = &(cur_filter_name_end[1]);
1585 }
1586 }
1587 if (load_filter_col > 1) {
1588 SetBit(variant_idx_lowbits, cur_filter_npass);
1589 at_least_one_npass_filter = 1;
1590 // possible todo: detect repeated filter values, store more
1591 // compactly
1592 if (filter_slen > max_filter_slen) {
1593 max_filter_slen = filter_slen;
1594 }
1595 if (StoreStringAtEnd(tmp_alloc_base, filter_token, filter_slen, &tmp_alloc_end, &(cur_filter_storage[variant_idx_lowbits]))) {
1596 goto LoadPvar_ret_NOMEM;
1597 }
1598 }
1599 }
1600 if (load_filter_col > 1) {
1601 SetBit(variant_idx_lowbits, cur_filter_present);
1602 }
1603 }
1604 }
1605
1606 if (cur_chr_idxs) {
1607 cur_chr_idxs[variant_idx_lowbits] = cur_chr_code;
1608 }
1609 if (cur_bp < last_bp) {
1610 vpos_sortstatus |= kfUnsortedVarBp;
1611 }
1612 cur_bps[variant_idx_lowbits] = cur_bp;
1613 last_bp = cur_bp;
1614 const uint32_t ref_slen = token_slens[2];
1615 uint32_t id_slen;
1616 if ((!varid_templatep) || (missing_varid_match_slen && ((token_slens[1] != missing_varid_match_slen) || (!memequal(token_ptrs[1], missing_varid_match, missing_varid_match_slen))))) {
1617 id_slen = token_slens[1];
1618 if (PtrWSubCk(tmp_alloc_base, id_slen + 1, &tmp_alloc_end)) {
1619 goto LoadPvar_ret_NOMEM;
1620 }
1621 memcpyx(tmp_alloc_end, token_ptrs[1], id_slen, '\0');
1622 } else {
1623 VaridTemplate* cur_varid_templatep = varid_templatep;
1624 if (extra_alt_ct && (varid_multi_templatep || varid_multi_nonsnp_templatep)) {
1625 if (varid_multi_templatep) {
1626 cur_varid_templatep = varid_multi_templatep;
1627 }
1628 if (varid_multi_nonsnp_templatep) {
1629 if ((ref_slen > 1) || (remaining_alt_char_ct != 2 * extra_alt_ct + 1)) {
1630 cur_varid_templatep = varid_multi_nonsnp_templatep;
1631 }
1632 }
1633 }
1634 if (unlikely(VaridTemplateApply(tmp_alloc_base, cur_varid_templatep, token_ptrs[2], linebuf_iter, cur_bp, token_slens[2], extra_alt_ct, remaining_alt_char_ct, &tmp_alloc_end, &new_variant_id_allele_len_overflow, &id_slen))) {
1635 goto LoadPvar_ret_NOMEM;
1636 }
1637 }
1638 if (id_slen > max_variant_id_slen) {
1639 max_variant_id_slen = id_slen;
1640 }
1641 cur_ids[variant_idx_lowbits] = R_CAST(char*, tmp_alloc_end);
1642
1643 // REF
1644 const char* ref_allele = token_ptrs[2];
1645 if (ref_slen == 1) {
1646 char geno_char = ref_allele[0];
1647 if (geno_char == input_missing_geno_char) {
1648 geno_char = '.';
1649 }
1650 *allele_storage_iter = &(g_one_char_strs[2 * ctou32(geno_char)]);
1651 } else {
1652 if (StoreStringAtEndK(tmp_alloc_base, ref_allele, ref_slen, &tmp_alloc_end, allele_storage_iter)) {
1653 goto LoadPvar_ret_NOMEM;
1654 }
1655 if (ref_slen > max_allele_slen) {
1656 max_allele_slen = ref_slen;
1657 }
1658 }
1659 ++allele_storage_iter;
1660
1661 // ALT
1662 if (extra_alt_ct) {
1663 if (PtrCheck(allele_storage_limit, allele_storage_iter, extra_alt_ct * sizeof(intptr_t))) {
1664 goto LoadPvar_ret_NOMEM;
1665 }
1666 char* alt_token_end = &(linebuf_iter[remaining_alt_char_ct]);
1667 for (uint32_t alt_idx = 0; alt_idx != extra_alt_ct; ++alt_idx) {
1668 char* cur_alt_end = AdvToDelim(linebuf_iter, ',');
1669 const uint32_t cur_allele_slen = cur_alt_end - linebuf_iter;
1670 if (cur_allele_slen == 1) {
1671 char geno_char = linebuf_iter[0];
1672 if (geno_char == input_missing_geno_char) {
1673 geno_char = '.';
1674 }
1675 *allele_storage_iter = &(g_one_char_strs[2 * ctou32(geno_char)]);
1676 } else {
1677 if (unlikely(!cur_allele_slen)) {
1678 goto LoadPvar_ret_EMPTY_ALLELE_CODE;
1679 }
1680 if (StoreStringAtEndK(tmp_alloc_base, linebuf_iter, cur_allele_slen, &tmp_alloc_end, allele_storage_iter)) {
1681 goto LoadPvar_ret_NOMEM;
1682 }
1683 if (cur_allele_slen > max_allele_slen) {
1684 max_allele_slen = cur_allele_slen;
1685 }
1686 }
1687 ++allele_storage_iter;
1688 linebuf_iter = &(cur_alt_end[1]);
1689 }
1690 remaining_alt_char_ct = alt_token_end - linebuf_iter;
1691 if (unlikely(!remaining_alt_char_ct)) {
1692 goto LoadPvar_ret_EMPTY_ALLELE_CODE;
1693 }
1694 }
1695 if (remaining_alt_char_ct == 1) {
1696 char geno_char = linebuf_iter[0];
1697 if (geno_char == input_missing_geno_char) {
1698 geno_char = '.';
1699 }
1700 *allele_storage_iter = &(g_one_char_strs[2 * ctou32(geno_char)]);
1701 } else {
1702 if (StoreStringAtEndK(tmp_alloc_base, linebuf_iter, remaining_alt_char_ct, &tmp_alloc_end, allele_storage_iter)) {
1703 goto LoadPvar_ret_NOMEM;
1704 }
1705 if (remaining_alt_char_ct > max_allele_slen) {
1706 max_allele_slen = remaining_alt_char_ct;
1707 }
1708 }
1709 ++allele_storage_iter;
1710
1711 // CM
1712 if (cm_col_present) {
1713 const char* cm_token = token_ptrs[7];
1714 if ((cm_token[0] != '0') || (cm_token[1] > ' ')) {
1715 double cur_cm;
1716 if (unlikely(!ScantokDouble(cm_token, &cur_cm))) {
1717 snprintf(g_logbuf, kLogbufSize, "Error: Invalid centimorgan position on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
1718 goto LoadPvar_ret_MALFORMED_INPUT_WW;
1719 }
1720 if (cur_cm < last_cm) {
1721 vpos_sortstatus |= kfUnsortedVarCm;
1722 } else {
1723 last_cm = cur_cm;
1724 }
1725 if (cur_cm != 0.0) {
1726 if (!at_least_one_nzero_cm) {
1727 if (unlikely(S_CAST(uintptr_t, tmp_alloc_end - tmp_alloc_base) < kLoadPvarBlockSize * sizeof(double))) {
1728 goto LoadPvar_ret_NOMEM;
1729 }
1730 if (cur_chr_idxs) {
1731 // reposition cur_chr_idxs[] after cur_cms[]
1732 cur_cms = R_CAST(double*, cur_chr_idxs);
1733 cur_chr_idxs = R_CAST(ChrIdx*, &(cur_cms[kLoadPvarBlockSize]));
1734 memcpy(cur_chr_idxs, cur_cms, kLoadPvarBlockSize * sizeof(ChrIdx));
1735 tmp_alloc_base = R_CAST(unsigned char*, &(cur_chr_idxs[kLoadPvarBlockSize]));
1736 } else {
1737 cur_cms = R_CAST(double*, tmp_alloc_base);
1738 tmp_alloc_base = R_CAST(unsigned char*, &(cur_cms[kLoadPvarBlockSize]));
1739 }
1740 ZeroDArr(kLoadPvarBlockSize, cur_cms);
1741 cms_start_block = raw_variant_ct / kLoadPvarBlockSize;
1742 at_least_one_nzero_cm = 1;
1743 }
1744 cur_cms[variant_idx_lowbits] = cur_cm;
1745 }
1746 }
1747 }
1748 } else {
1749 {
1750 // linebuf_iter guaranteed to be at '\t' after chromosome code
1751 char* alt_col_start = NextTokenMult(linebuf_iter, alt_col_idx);
1752 if (unlikely(!alt_col_start)) {
1753 goto LoadPvar_ret_MISSING_TOKENS;
1754 }
1755 char* alt_col_end = CurTokenEnd(alt_col_start);
1756 extra_alt_ct = CountByte(alt_col_start, ',', alt_col_end - alt_col_start);
1757 line_iter = AdvToDelim(alt_col_end, '\n');
1758 }
1759 LoadPvar_skip_variant:
1760 ++exclude_ct;
1761 ClearBit(variant_idx_lowbits, cur_include);
1762 cur_bps[variant_idx_lowbits] = last_bp;
1763 // need to advance allele_storage_iter for later allele_idx_offsets
1764 // lookups to work properly
1765 *allele_storage_iter++ = missing_allele_str;
1766 *allele_storage_iter++ = missing_allele_str;
1767 if (extra_alt_ct) {
1768 if (PtrCheck(allele_storage_limit, allele_storage_iter, extra_alt_ct * sizeof(intptr_t))) {
1769 goto LoadPvar_ret_NOMEM;
1770 }
1771 for (uint32_t uii = 0; uii != extra_alt_ct; ++uii) {
1772 *allele_storage_iter++ = missing_allele_str;
1773 }
1774 }
1775 }
1776 ++raw_variant_ct;
1777 }
1778 if (unlikely(TextStreamErrcode2(&pvar_txs, &reterr))) {
1779 goto LoadPvar_ret_TSTREAM_FAIL;
1780 }
1781 reterr = kPglRetSuccess;
1782 if (unlikely(max_variant_id_slen > kMaxIdSlen)) {
1783 logerrputs("Error: Variant names are limited to " MAX_ID_SLEN_STR " characters.\n");
1784 goto LoadPvar_ret_MALFORMED_INPUT;
1785 }
1786 if (new_variant_id_allele_len_overflow) {
1787 if (new_variant_id_overflow_missing) {
1788 logerrprintfww("Warning: %" PRIuPTR " variant ID%s %s due to allele code length.\n", new_variant_id_allele_len_overflow, (new_variant_id_allele_len_overflow == 1)? "" : "s", missing_varid_match_slen? "unchanged by --set-missing-var-ids" : "erased by --set-all-var-ids");
1789 if (max_variant_id_slen < missing_varid_blen - 1) {
1790 max_variant_id_slen = missing_varid_blen - 1;
1791 }
1792 } else if (likely(misc_flags & kfMiscNewVarIdOverflowTruncate)) {
1793 // this is less likely in practice than the error branch, but let's
1794 // keep our usage of likely()/unlikely() as easy to interpret as
1795 // possible.
1796 logerrprintf("Warning: %" PRIuPTR " allele code%s truncated by --set-%s-var-ids.\n", new_variant_id_allele_len_overflow, (new_variant_id_allele_len_overflow == 1)? "" : "s", missing_varid_match_slen? "missing" : "all");
1797 } else {
1798 uint32_t worst_case_id_slen = VaridWorstCaseSlen(varid_templatep, max_chr_slen, max_allele_slen);
1799 if ((worst_case_id_slen <= kMaxIdSlen) && varid_multi_templatep) {
1800 const uint32_t tmp_slen = VaridWorstCaseSlen(varid_multi_templatep, max_chr_slen, max_allele_slen);
1801 if (tmp_slen > worst_case_id_slen) {
1802 worst_case_id_slen = tmp_slen;
1803 }
1804 }
1805 if ((worst_case_id_slen <= kMaxIdSlen) && varid_multi_nonsnp_templatep) {
1806 const uint32_t tmp_slen = VaridWorstCaseSlen(varid_multi_nonsnp_templatep, max_chr_slen, max_allele_slen);
1807 if (tmp_slen > worst_case_id_slen) {
1808 worst_case_id_slen = tmp_slen;
1809 }
1810 }
1811 logerrprintf("Error: %" PRIuPTR " allele code%s too long for --set-%s-var-ids.\n", new_variant_id_allele_len_overflow, (new_variant_id_allele_len_overflow == 1)? "" : "s", missing_varid_match_slen? "missing" : "all");
1812 if (worst_case_id_slen <= kMaxIdSlen) {
1813 logerrprintfww("The longest observed allele code in this dataset has length %u. If you're fine with the corresponding ID length, rerun with \"--new-id-max-allele-len %u\" added to your command line.\n", max_allele_slen, max_allele_slen);
1814 logerrputs("Otherwise, use \"--new-id-max-allele-len <limit> missing\" to set the IDs of all\nvariants with an allele code longer than the given length-limit to '.' (and\nthen process those variants with another script, if necessary).\n");
1815 } else {
1816 logerrprintfww("The longest observed allele code in this dataset has length %u. We recommend deciding on a length-limit, and then adding \"--new-id-max-allele-len <limit> missing\" to your command line to cause variants with longer allele codes to be assigned '.' IDs. (You can then process just those variants with another script, if necessary.)\n", max_allele_slen);
1817 }
1818 goto LoadPvar_ret_INCONSISTENT_INPUT;
1819 }
1820 }
1821 *max_variant_id_slen_ptr = max_variant_id_slen;
1822 *max_allele_ct_ptr = max_extra_alt_ct + 2;
1823 *max_allele_slen_ptr = max_allele_slen;
1824 *max_filter_slen_ptr = max_filter_slen;
1825 *raw_variant_ct_ptr = raw_variant_ct;
1826 uintptr_t allele_idx_end = allele_storage_iter - allele_storage;
1827 BigstackFinalizeCp(allele_storage, allele_idx_end);
1828 // We may clobber this object soon, so close it now (verifying
1829 // rewindability first, if necessary).
1830 if (unlikely(CleanupTextStream2(pvarname, &pvar_txs, &reterr))) {
1831 goto LoadPvar_ret_1;
1832 }
1833 uintptr_t* allele_idx_offsets = nullptr;
1834 const uint32_t full_block_ct = raw_variant_ct / kLoadPvarBlockSize;
1835 const uintptr_t raw_variant_ct_lowbits = raw_variant_ct % kLoadPvarBlockSize;
1836 // todo: determine whether we want variant_include to be guaranteed to be
1837 // terminated by a zero bit
1838 const uint32_t raw_variant_ctl = BitCtToWordCt(raw_variant_ct);
1839 if (unlikely(
1840 bigstack_alloc_w(raw_variant_ctl, variant_include_ptr) ||
1841 bigstack_alloc_u32(raw_variant_ct, variant_bps_ptr) ||
1842 bigstack_alloc_cp(raw_variant_ct, variant_ids_ptr))) {
1843 goto LoadPvar_ret_NOMEM;
1844 }
1845 uintptr_t* qual_present = nullptr;
1846 float* quals = nullptr;
1847 if (load_qual_col > 1) {
1848 if (unlikely(
1849 bigstack_alloc_w(raw_variant_ctl, qual_present_ptr) ||
1850 bigstack_alloc_f(raw_variant_ct, quals_ptr))) {
1851 goto LoadPvar_ret_NOMEM;
1852 }
1853 qual_present = *qual_present_ptr;
1854 quals = *quals_ptr;
1855 }
1856 uintptr_t* filter_present = nullptr;
1857 uintptr_t* filter_npass = nullptr;
1858 char** filter_storage = nullptr;
1859 if (load_filter_col > 1) {
1860 if (unlikely(
1861 bigstack_alloc_w(raw_variant_ctl, filter_present_ptr) ||
1862 bigstack_alloc_w(raw_variant_ctl, filter_npass_ptr))) {
1863 goto LoadPvar_ret_NOMEM;
1864 }
1865 filter_present = *filter_present_ptr;
1866 filter_npass = *filter_npass_ptr;
1867 if (at_least_one_npass_filter) {
1868 // possible todo: store this in a sparse manner
1869 if (unlikely(bigstack_alloc_cp(raw_variant_ct, filter_storage_ptr))) {
1870 goto LoadPvar_ret_NOMEM;
1871 }
1872 filter_storage = *filter_storage_ptr;
1873 }
1874 }
1875 uintptr_t* nonref_flags = nullptr;
1876 if (info_pr_present) {
1877 if (unlikely(bigstack_alloc_w(raw_variant_ctl, nonref_flags_ptr))) {
1878 goto LoadPvar_ret_NOMEM;
1879 }
1880 nonref_flags = *nonref_flags_ptr;
1881 }
1882 // load_qual_col > 1:
1883 // kLoadPvarBlockSize / CHAR_BIT for qual_present
1884 // kLoadPvarBlockSize * sizeof(float) for quals
1885 // load_filter_col > 1:
1886 // 2 * (kLoadPvarBlockSize / CHAR_BIT) for filter_present, filter_npass
1887 // kLoadPvarBlockSize * sizeof(intptr_t) for filter_storage
1888 // at_least_one_nzero_cm:
1889 // kLoadPvarBlockSize * sizeof(double)
1890 // is_split_chr:
1891 // kLoadPvarBlockSize * sizeof(ChrIdx)
1892 unsigned char* read_iter = g_bigstack_end;
1893 uint32_t* variant_bps = *variant_bps_ptr;
1894 char** variant_ids = *variant_ids_ptr;
1895 uintptr_t* variant_include = *variant_include_ptr;
1896 for (uint32_t block_idx = 0; block_idx != full_block_ct; ++block_idx) {
1897 memcpy(&(variant_bps[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(int32_t));
1898 // skip over allele_idx_offsets
1899 read_iter = &(read_iter[kLoadPvarBlockSize * (sizeof(int32_t) + sizeof(intptr_t))]);
1900 memcpy(&(variant_ids[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(intptr_t));
1901 read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
1902 memcpy(&(variant_include[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1903 read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1904 if (qual_present) {
1905 memcpy(&(qual_present[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1906 read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1907 memcpy(&(quals[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(float));
1908 read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(float)]);
1909 }
1910 if (filter_present) {
1911 memcpy(&(filter_present[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1912 read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1913 memcpy(&(filter_npass[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1914 read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1915 if (filter_storage) {
1916 memcpy(&(filter_storage[block_idx * kLoadPvarBlockSize]), read_iter, kLoadPvarBlockSize * sizeof(intptr_t));
1917 }
1918 read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
1919 }
1920 if (info_pr_present) {
1921 memcpy(&(nonref_flags[block_idx * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, kLoadPvarBlockSize / CHAR_BIT);
1922 read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1923 }
1924 // skip over cms
1925 if (block_idx >= cms_start_block) {
1926 read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(double)]);
1927 }
1928 // skip over chr_idxs
1929 if (block_idx >= chr_idxs_start_block) {
1930 read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(ChrIdx)]);
1931 }
1932 }
1933 memcpy(&(variant_bps[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(int32_t));
1934 read_iter = &(read_iter[kLoadPvarBlockSize * (sizeof(int32_t) + sizeof(intptr_t))]);
1935 memcpy(&(variant_ids[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(intptr_t));
1936 read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
1937 const uint32_t last_bitblock_size = DivUp(raw_variant_ct_lowbits, CHAR_BIT);
1938 memcpy(&(variant_include[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1939 ZeroTrailingBits(raw_variant_ct, variant_include);
1940 read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1941 if (qual_present) {
1942 memcpy(&(qual_present[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1943 ZeroTrailingBits(raw_variant_ct, qual_present);
1944 read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1945 memcpy(&(quals[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(float));
1946 read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(float)]);
1947 }
1948 if (filter_present) {
1949 memcpy(&(filter_present[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1950 ZeroTrailingBits(raw_variant_ct, filter_present);
1951 read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1952 memcpy(&(filter_npass[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1953 ZeroTrailingBits(raw_variant_ct, filter_npass);
1954 read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1955 if (filter_storage) {
1956 memcpy(&(filter_storage[full_block_ct * kLoadPvarBlockSize]), read_iter, raw_variant_ct_lowbits * sizeof(intptr_t));
1957 read_iter = &(read_iter[kLoadPvarBlockSize * sizeof(intptr_t)]);
1958 }
1959 }
1960 if (info_pr_present) {
1961 memcpy(&(nonref_flags[full_block_ct * (kLoadPvarBlockSize / kBitsPerWord)]), read_iter, last_bitblock_size);
1962 ZeroTrailingBits(raw_variant_ct, nonref_flags);
1963 // read_iter = &(read_iter[kLoadPvarBlockSize / CHAR_BIT]);
1964 }
1965 const uintptr_t read_iter_stride_base = kLoadPvarBlockSize * (sizeof(int32_t) + 2 * sizeof(intptr_t)) + (kLoadPvarBlockSize / CHAR_BIT) + (load_qual_col > 1) * ((kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(float)) + (load_filter_col > 1) * (2 * (kLoadPvarBlockSize / CHAR_BIT) + kLoadPvarBlockSize * sizeof(intptr_t)) + info_pr_present * (kLoadPvarBlockSize / CHAR_BIT);
1966 if (allele_idx_end > 2 * S_CAST(uintptr_t, raw_variant_ct)) {
1967 if (unlikely(bigstack_alloc_w(raw_variant_ct + 1, allele_idx_offsets_ptr))) {
1968 goto LoadPvar_ret_NOMEM;
1969 }
1970 allele_idx_offsets = *allele_idx_offsets_ptr;
1971 uintptr_t* allele_idx_read_iter = R_CAST(uintptr_t*, &(g_bigstack_end[kLoadPvarBlockSize * sizeof(int32_t)]));
1972 for (uint32_t block_idx = 0; block_idx != full_block_ct; ++block_idx) {
1973 memcpy(&(allele_idx_offsets[block_idx * kLoadPvarBlockSize]), allele_idx_read_iter, kLoadPvarBlockSize * sizeof(intptr_t));
1974 allele_idx_read_iter = R_CAST(uintptr_t*, R_CAST(uintptr_t, allele_idx_read_iter) + read_iter_stride_base + (block_idx >= cms_start_block) * kLoadPvarBlockSize * sizeof(double) + (block_idx >= chr_idxs_start_block) * kLoadPvarBlockSize * sizeof(ChrIdx));
1975 }
1976 memcpy(&(allele_idx_offsets[full_block_ct * kLoadPvarBlockSize]), allele_idx_read_iter, raw_variant_ct_lowbits * sizeof(intptr_t));
1977 allele_idx_offsets[raw_variant_ct] = allele_idx_end;
1978 }
1979 if (at_least_one_nzero_cm) {
1980 if (unlikely(bigstack_alloc_d(raw_variant_ct, variant_cms_ptr))) {
1981 goto LoadPvar_ret_NOMEM;
1982 }
1983 double* variant_cms = *variant_cms_ptr;
1984 ZeroDArr(cms_start_block * kLoadPvarBlockSize, variant_cms);
1985 double* cms_read_iter = R_CAST(double*, &(g_bigstack_end[read_iter_stride_base * (cms_start_block + 1)]));
1986 if (cms_start_block > chr_idxs_start_block) {
1987 cms_read_iter = R_CAST(double*, R_CAST(uintptr_t, cms_read_iter) + kLoadPvarBlockSize * sizeof(ChrIdx) * (cms_start_block - chr_idxs_start_block));
1988 }
1989 for (uint32_t block_idx = cms_start_block; block_idx != full_block_ct; ++block_idx) {
1990 memcpy(&(variant_cms[block_idx * kLoadPvarBlockSize]), cms_read_iter, kLoadPvarBlockSize * sizeof(double));
1991 cms_read_iter = R_CAST(double*, R_CAST(uintptr_t, cms_read_iter) + read_iter_stride_base + kLoadPvarBlockSize * sizeof(double) + (block_idx >= chr_idxs_start_block) * kLoadPvarBlockSize * sizeof(ChrIdx));
1992 }
1993 memcpy(&(variant_cms[full_block_ct * kLoadPvarBlockSize]), cms_read_iter, raw_variant_ct_lowbits * sizeof(double));
1994 } else {
1995 *variant_cms_ptr = nullptr;
1996 }
1997 if (!is_split_chr) {
1998 cip->chr_fo_vidx_start[chrs_encountered_m1 + 1] = raw_variant_ct;
1999 if (splitpar_bound2) {
2000 if (splitpar_and_exclude_x) {
2001 ClearBit(x_code, chr_mask);
2002 }
2003 reterr = SplitPar(variant_bps, *vpos_sortstatus_ptr, splitpar_bound1, splitpar_bound2, variant_include, loaded_chr_mask, cip, &chrs_encountered_m1, &exclude_ct);
2004 if (unlikely(reterr)) {
2005 goto LoadPvar_ret_1;
2006 }
2007 }
2008 cip->chr_ct = chrs_encountered_m1 + 1;
2009 } else {
2010 ChrIdx* chr_idxs;
2011 if (unlikely(BIGSTACK_ALLOC_X(ChrIdx, raw_variant_ct, &chr_idxs))) {
2012 goto LoadPvar_ret_NOMEM;
2013 }
2014 *chr_idxs_ptr = chr_idxs;
2015 if (chr_idxs_start_block) {
2016 const uint32_t end_vidx = chr_idxs_start_block * kLoadPvarBlockSize;
2017 uint32_t chr_fo_idx = chrs_encountered_m1;
2018 while (cip->chr_fo_vidx_start[chr_fo_idx] >= end_vidx) {
2019 --chr_fo_idx;
2020 }
2021 BackfillChrIdxs(cip, chr_fo_idx, 0, end_vidx, chr_idxs);
2022 }
2023 ChrIdx* chr_idxs_read_iter = R_CAST(ChrIdx*, &(g_bigstack_end[read_iter_stride_base * (chr_idxs_start_block + 1)]));
2024 if (chr_idxs_start_block >= cms_start_block) {
2025 chr_idxs_read_iter = R_CAST(ChrIdx*, R_CAST(uintptr_t, chr_idxs_read_iter) + kLoadPvarBlockSize * sizeof(double) * (chr_idxs_start_block + 1 - cms_start_block));
2026 }
2027 for (uint32_t block_idx = chr_idxs_start_block; block_idx != full_block_ct; ) {
2028 memcpy(&(chr_idxs[block_idx * kLoadPvarBlockSize]), chr_idxs_read_iter, kLoadPvarBlockSize * sizeof(ChrIdx));
2029 ++block_idx;
2030 chr_idxs_read_iter = R_CAST(ChrIdx*, R_CAST(uintptr_t, chr_idxs_read_iter) + read_iter_stride_base + kLoadPvarBlockSize * sizeof(ChrIdx) + (block_idx >= cms_start_block) * kLoadPvarBlockSize * sizeof(double));
2031 }
2032 memcpy(&(chr_idxs[full_block_ct * kLoadPvarBlockSize]), chr_idxs_read_iter, raw_variant_ct_lowbits * sizeof(ChrIdx));
2033 cip->chr_ct = PopcountWords(loaded_chr_mask, DivUp(cip->max_code + cip->name_ct + 1, kBitsPerWord));
2034 }
2035 if (merge_par) {
2036 const uint32_t is_plink2_par = (misc_flags / kfMiscMergePar) & 1;
2037 if (merge_par_ct) {
2038 logprintf("--merge-%s: %u chromosome code%s changed.\n", is_plink2_par? "par" : "x", merge_par_ct, (merge_par_ct == 1)? "" : "s");
2039 } else if (is_plink2_par) {
2040 logerrputs("Warning: --merge-par had no effect (no PAR1/PAR2 chromosome codes present).\n");
2041 } else {
2042 logerrputs("Warning: --merge-x had no effect (no XY chromosome codes present).\n");
2043 }
2044 }
2045 const uint32_t last_chr_code = cip->max_code + cip->name_ct;
2046 const uint32_t chr_word_ct = BitCtToWordCt(last_chr_code + 1);
2047 BitvecAnd(loaded_chr_mask, chr_word_ct, chr_mask);
2048 BigstackEndSet(tmp_alloc_end);
2049 *variant_ct_ptr = raw_variant_ct - exclude_ct;
2050 *vpos_sortstatus_ptr = vpos_sortstatus;
2051 *allele_storage_ptr = allele_storage;
2052 // if only INFO:PR flag present, no need to reload
2053 if (!(info_nonpr_present || info_pr_nonflag_present)) {
2054 info_reload_slen = 0;
2055 }
2056 if (info_reload_slen) {
2057 // treat open-fail as rewind-fail here
2058 if (unlikely(ForceNonFifo(pvarname))) {
2059 logerrprintfww(kErrprintfRewind, pvarname);
2060 reterr = kPglRetRewindFail;
2061 goto LoadPvar_ret_1;
2062 }
2063 }
2064 *info_reload_slen_ptr = info_reload_slen;
2065 }
2066
2067 while (0) {
2068 LoadPvar_ret_NOMEM:
2069 reterr = kPglRetNomem;
2070 break;
2071 LoadPvar_ret_TSTREAM_FAIL:
2072 TextStreamErrPrint(pvarname, &pvar_txs);
2073 break;
2074 LoadPvar_ret_EMPTY_ALLELE_CODE:
2075 snprintf(g_logbuf, kLogbufSize, "Error: Empty allele code on line %" PRIuPTR " of %s.\n", line_idx, pvarname);
2076 LoadPvar_ret_MALFORMED_INPUT_WW:
2077 WordWrapB(0);
2078 logerrputsb();
2079 LoadPvar_ret_MALFORMED_INPUT:
2080 reterr = kPglRetMalformedInput;
2081 break;
2082 LoadPvar_ret_INCONSISTENT_INPUT:
2083 reterr = kPglRetInconsistentInput;
2084 break;
2085 LoadPvar_ret_MISSING_TOKENS:
2086 logerrprintfww("Error: Line %" PRIuPTR " of %s has fewer tokens than expected.\n", line_idx, pvarname);
2087 reterr = kPglRetMalformedInput;
2088 break;
2089 }
2090 LoadPvar_ret_1:
2091 CleanupTextStream2(pvarname, &pvar_txs, &reterr);
2092 if (reterr) {
2093 BigstackDoubleReset(bigstack_mark, bigstack_end_mark);
2094 }
2095 return reterr;
2096 }
2097
2098 #ifdef __cplusplus
2099 } // namespace plink2
2100 #endif
2101