1 // This file is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation, either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 
17 
18 #include "plink2_fasta.h"
19 
20 #ifdef __cplusplus
21 namespace plink2 {
22 #endif
23 
24 PglErr RefFromFaContig(const uintptr_t* variant_include, const uint32_t* variant_bps, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const ChrInfo* cip, const char* seqbuf, uint32_t force, uint32_t chr_fo_idx, uint32_t variant_uidx_last, uint32_t bp_end, STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), uintptr_t* nonref_flags, uint32_t* changed_ct_ptr, uint32_t* validated_ct_ptr, uint32_t* downgraded_ct_ptr) {
25   uintptr_t variant_uidx_base;
26   uintptr_t cur_bits;
27   BitIter1Start(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], &variant_uidx_base, &cur_bits);
28   uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
29   if (variant_bps[variant_uidx_last] >= bp_end) {
30     const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
31     if (!force) {
32       char* write_iter = strcpya_k(g_logbuf, "Error: Contig '");
33       write_iter = chrtoa(cip, chr_idx, write_iter);
34       snprintf(write_iter, kLogbufSize - kMaxIdSlen - 32, "' in --fa file is too short; it is likely to be mismatched with your data. Add the 'force' modifier if this wasn't a mistake, and you just want to mark all reference alleles past the end as provisional.\n");
35       WordWrapB(0);
36       logerrputsb();
37       return kPglRetInconsistentInput;
38     } else {
39       char* write_iter = strcpya_k(g_logbuf, "Warning: Contig '");
40       write_iter = chrtoa(cip, chr_idx, write_iter);
41       snprintf(write_iter, kLogbufSize - kMaxIdSlen - 32, "' in --fa file is too short; it is likely to be mismatched with your data.\n");
42       WordWrapB(0);
43       logerrputsb();
44     }
45     uint32_t offset = CountSortedSmallerU32(&(variant_bps[variant_uidx]), variant_uidx_last - variant_uidx, bp_end);
46 
47     const uint32_t chr_vidx_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
48     // set all bits in [variant_uidx + offset, chr_vidx_end), and count how
49     // many relevant bits were set.
50     const uint32_t widx_full_end = chr_vidx_end / kBitsPerWord;
51     const uint32_t trailing_bit_ct = chr_vidx_end % kBitsPerWord;
52     const uint32_t chr_vidx_truncate = variant_uidx + offset;
53     uint32_t widx = chr_vidx_truncate / kBitsPerWord;
54     uint32_t downgraded_incr;
55     if (widx == widx_full_end) {
56       const uintptr_t cur_mask = (k1LU << trailing_bit_ct) - (k1LU << (chr_vidx_truncate % kBitsPerWord));
57       downgraded_incr = PopcountWord(variant_include[widx] & (~nonref_flags[widx]) & cur_mask);
58       nonref_flags[widx] |= cur_mask;
59     } else {
60       downgraded_incr = 0;
61       if (chr_vidx_truncate % kBitsPerWord) {
62         const uintptr_t cur_mask = (~k0LU) << (chr_vidx_truncate % kBitsPerWord);
63         downgraded_incr = PopcountWord(variant_include[widx] & (~nonref_flags[widx]) & cur_mask);
64         nonref_flags[widx] |= cur_mask;
65         ++widx;
66       }
67       assert(widx > widx_full_end);
68       for (; widx != widx_full_end; ++widx) {
69         downgraded_incr += PopcountWord(variant_include[widx] & (~nonref_flags[widx]));
70         nonref_flags[widx] = ~k0LU;
71       }
72       if (trailing_bit_ct) {
73         const uintptr_t cur_mask = (k1LU << trailing_bit_ct) - k1LU;
74         downgraded_incr += PopcountWord(variant_include[widx] & (~nonref_flags[widx]) & cur_mask);
75         nonref_flags[widx] |= cur_mask;
76       }
77     }
78     *downgraded_ct_ptr += downgraded_incr;
79     if (!offset) {
80       return kPglRetSuccess;
81     }
82     // We know that variant_bps[variant_uidx + offset - 1] < bp_end,
83     // variant_bps[variant_uidx + offset] >= bp_end, and that at least one
84     // variant_include[] bit is set in [variant_uidx, variant_uidx + offset)
85     // (since offset > 0).  Find the last such bit.
86     variant_uidx_last = FindLast1BitBefore(variant_include, chr_vidx_truncate);
87   }
88   uint32_t changed_ct = *changed_ct_ptr;
89   uint32_t validated_ct = *validated_ct_ptr;
90   uint32_t allele_ct = 2;
91   for (; ; variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits)) {
92     const uint32_t cur_bp = variant_bps[variant_uidx];
93     uintptr_t allele_idx_offset_base = variant_uidx * 2;
94     if (allele_idx_offsets) {
95       allele_idx_offset_base = allele_idx_offsets[variant_uidx];
96       allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
97     }
98     const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
99     const char* cur_ref = &(seqbuf[cur_bp]);
100     int32_t consistent_allele_idx = -1;
101     for (uint32_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
102       const char* cur_allele = cur_alleles[allele_idx];
103       const uint32_t cur_allele_slen = strlen(cur_allele);
104       if (strcaseequal(cur_allele, cur_ref, cur_allele_slen)) {
105         if (consistent_allele_idx != -1) {
106           // Multiple alleles could be ref (this always happens for deletions).
107           // Don't try to do anything.
108           consistent_allele_idx = -2;
109           break;
110         }
111         consistent_allele_idx = allele_idx;
112       }
113     }
114     if (consistent_allele_idx >= 0) {
115       if (consistent_allele_idx) {
116         if ((!IsSet(nonref_flags, variant_uidx)) && (!force)) {
117           const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
118           char* write_iter = strcpya_k(g_logbuf, "Error: --ref-from-fa wants to change reference allele assignment at ");
119           write_iter = chrtoa(cip, chr_idx, write_iter);
120           *write_iter++ = ':';
121           write_iter = u32toa(cur_bp, write_iter);
122           snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, ", but it's marked as 'known'. Add the 'force' modifier to force this change through.\n");
123           WordWrapB(0);
124           logerrputsb();
125           return kPglRetInconsistentInput;
126         }
127         R_CAST(DoubleAlleleCode*, refalt1_select)[variant_uidx] = consistent_allele_idx;
128         ++changed_ct;
129       } else {
130         ++validated_ct;
131       }
132       ClearBit(variant_uidx, nonref_flags);
133     } else if ((consistent_allele_idx == -1) && (!IsSet(nonref_flags, variant_uidx))) {
134       // okay to have multiple matches, but not zero matches
135       if (!force) {
136         const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
137         char* write_iter = strcpya_k(g_logbuf, "Error: Reference allele at ");
138         write_iter = chrtoa(cip, chr_idx, write_iter);
139         *write_iter++ = ':';
140         write_iter = u32toa(cur_bp, write_iter);
141         snprintf(write_iter, kLogbufSize - kMaxIdSlen - 64, " is marked as 'known', but is inconsistent with .fa file. Add the 'force' modifier to downgrade it to provisional.\n");
142         WordWrapB(0);
143         logerrputsb();
144         return kPglRetInconsistentInput;
145       }
146       SetBit(variant_uidx, nonref_flags);
147       *downgraded_ct_ptr += 1;
148     }
149     if (variant_uidx == variant_uidx_last) {
150       *changed_ct_ptr = changed_ct;
151       *validated_ct_ptr = validated_ct;
152       return kPglRetSuccess;
153     }
154   }
155 }
156 
VNormalizeContig(const uintptr_t * variant_include,const char * const * variant_ids,const uintptr_t * allele_idx_offsets,const ChrInfo * cip,const char * seqbuf,uint32_t chr_fo_idx,uint32_t variant_uidx_last,uint32_t bp_end,unsigned char ** alloc_endp,UnsortedVar * vpos_sortstatusp,uint32_t * __restrict variant_bps,const char ** allele_storage,uint32_t * __restrict nchanged_ct_ptr,char * nlist_flush,FILE * nlist_file,char ** nlist_write_iterp,uint32_t * __restrict alen_buf)157 PglErr VNormalizeContig(const uintptr_t* variant_include, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const ChrInfo* cip, const char* seqbuf, uint32_t chr_fo_idx, uint32_t variant_uidx_last, uint32_t bp_end, unsigned char** alloc_endp, UnsortedVar* vpos_sortstatusp, uint32_t* __restrict variant_bps, const char** allele_storage, uint32_t* __restrict nchanged_ct_ptr, char* nlist_flush, FILE* nlist_file, char** nlist_write_iterp, uint32_t* __restrict alen_buf) {
158   uintptr_t variant_uidx_base;
159   uintptr_t cur_bits;
160   BitIter1Start(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], &variant_uidx_base, &cur_bits);
161   uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
162   if (variant_bps[variant_uidx_last] >= bp_end) {
163     const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
164     char* write_iter = strcpya_k(g_logbuf, "Error: Contig '");
165     write_iter = chrtoa(cip, chr_idx, write_iter);
166     snprintf(write_iter, kLogbufSize - kMaxIdSlen - 32, "' in --fa file is too short; it is likely to be mismatched with your data.\n");
167     WordWrapB(0);
168     logerrputsb();
169     return kPglRetInconsistentInput;
170   }
171   const char* missing_allele_str = &(g_one_char_strs[92]);
172   const uint32_t input_missing_geno_code = ctou32(*g_input_missing_geno_ptr);
173   unsigned char* alloc_base = g_bigstack_base;
174   unsigned char* alloc_end = *alloc_endp;
175   char* nlist_write_iter = *nlist_write_iterp;
176   uint32_t nchanged_ct = *nchanged_ct_ptr;
177   uint32_t allele_ct = 2;
178   uint32_t prev_bp = 0;
179   uint32_t is_unsorted = ((*vpos_sortstatusp) / kfUnsortedVarBp) & 1;
180   uint32_t cur_bp;
181   goto VNormalizeContig_loop_start;
182   while (1) {
183     if (!is_unsorted) {
184       if (prev_bp > cur_bp) {
185         is_unsorted = 1;
186       }
187       prev_bp = cur_bp;
188     }
189     if (variant_uidx == variant_uidx_last) {
190       break;
191     }
192     variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
193   VNormalizeContig_loop_start:
194     cur_bp = variant_bps[variant_uidx];
195     uintptr_t allele_idx_offset_base = variant_uidx * 2;
196     if (allele_idx_offsets) {
197       allele_idx_offset_base = allele_idx_offsets[variant_uidx];
198       allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
199     }
200     const char** cur_alleles = &(allele_storage[allele_idx_offset_base]);
201     {
202       uint32_t aidx = 0;
203       // Common case: if all alleles have length 1, variant must already be
204       // normalized.
205       for (; aidx != allele_ct; ++aidx) {
206         if (cur_alleles[aidx][1]) {
207           break;
208         }
209       }
210       if (aidx == allele_ct) {
211         continue;
212       }
213     }
214     // Ignore missing allele if present, get lengths of each allele, check if
215     // all right nucleotides match or all left nucleotides match
216     uint32_t left_match = 0;
217     uint32_t right_match = 0;
218     uint32_t min_alen = 0;
219     uint32_t missing_aidx = UINT32_MAX;
220     for (uint32_t aidx = 0; aidx != allele_ct; ++aidx) {
221       if (cur_alleles[aidx] == missing_allele_str) {
222         if (missing_aidx != UINT32_MAX) {
223           const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
224           char* write_iter = strcpya_k(g_logbuf, "Error: Variant at ");
225           write_iter = chrtoa(cip, chr_idx, write_iter);
226           *write_iter++ = ':';
227           write_iter = u32toa(cur_bp, write_iter);
228           snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " has multiple missing alleles.\n");
229           WordWrapB(0);
230           logerrputsb();
231           return kPglRetInconsistentInput;
232         }
233         missing_aidx = aidx;
234         alen_buf[aidx] = 1;
235         continue;
236       }
237       const char* cur_allele = cur_alleles[aidx];
238       const uint32_t alen = strlen(cur_allele);
239       alen_buf[aidx] = alen;
240       const uint32_t first_code = ctou32(cur_allele[0]);
241       // Special case: if first character of any allele is '<', don't attempt
242       // to normalize.
243       if (first_code == '<') {
244         left_match = UINT32_MAX;
245         right_match = UINT32_MAX;
246         break;
247       }
248       const uint32_t last_code = ctou32(cur_allele[alen - 1]);
249       if (!min_alen) {
250         // must be first nonmissing allele
251         left_match = first_code;
252         right_match = last_code;
253         min_alen = alen;
254       } else {
255         if (first_code != left_match) {
256           left_match = UINT32_MAX;
257         }
258         if (last_code != right_match) {
259           right_match = UINT32_MAX;
260         }
261         if (alen < min_alen) {
262           min_alen = alen;
263         }
264       }
265     }
266     if (((left_match == UINT32_MAX) || (min_alen == 1)) && (right_match == UINT32_MAX)) {
267       continue;
268     }
269     // Sanity check: verify alleles aren't all identical.
270     const uint32_t first_aidx = (missing_aidx == 0)? 1 : 0;
271     const uint32_t first_alen = alen_buf[first_aidx];
272     for (uint32_t aidx = first_aidx + 1; ; ) {
273       if (aidx == missing_aidx) {
274         continue;
275       }
276       if ((alen_buf[aidx] != first_alen) || (!memequal(cur_alleles[first_aidx], cur_alleles[aidx], min_alen))) {
277         break;
278       }
279       if (++aidx == allele_ct) {
280         // probable todo: report ID instead?
281         const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
282         char* write_iter = strcpya_k(g_logbuf, "Error: Variant at ");
283         write_iter = chrtoa(cip, chr_idx, write_iter);
284         *write_iter++ = ':';
285         write_iter = u32toa(cur_bp, write_iter);
286         snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " has duplicate allele codes.\n");
287         WordWrapB(0);
288         logerrputsb();
289         return kPglRetInconsistentInput;
290       }
291     }
292     ++nchanged_ct;
293     if (nlist_write_iter) {
294       nlist_write_iter = strcpya(nlist_write_iter, variant_ids[variant_uidx]);
295       AppendBinaryEoln(&nlist_write_iter);
296       if (unlikely(fwrite_ck(nlist_flush, nlist_file, &nlist_write_iter))) {
297         return kPglRetWriteFail;
298       }
299     }
300     // Algorithm from Tan A, Abecasis GR, Kang HM (2015) Unified representation
301     // of genetic variants:
302     //   while (alleles end with the same nucleotide) {
303     //     truncate rightmost
304     //     if there's now an empty allele, extend alleles 1 base to the left
305     //   }
306     //   while (all alleles length > 1 and start with same nucleotide) {
307     //     truncate leftmost
308     //   }
309     // https://genome.sph.umich.edu/wiki/Variant_Normalization also covers
310     // this.
311     const uint32_t rtrim_stop = cur_bp + min_alen - 1;
312     if ((right_match == UINT32_MAX) || (!rtrim_stop)) {
313       // Easy case: only left-trimming.  We separate this out because bigstack
314       // allocation is never required.
315       const uint32_t min_alen_m1 = min_alen - 1;
316       uint32_t ltrim = 1;
317       uint32_t ltrim_p1;
318       for (; ltrim != min_alen_m1; ++ltrim) {
319         const char cc = cur_alleles[first_aidx][ltrim];
320         for (uint32_t aidx = first_aidx + 1; aidx != allele_ct; ++aidx) {
321           if (aidx == missing_aidx) {
322             continue;
323           }
324           if (cur_alleles[aidx][ltrim] != cc) {
325             goto VNormalizeContig_ltrim1_done;
326           }
327         }
328       }
329     VNormalizeContig_ltrim1_done:
330       ltrim_p1 = ltrim + 1;
331       for (uint32_t aidx = first_aidx; aidx != allele_ct; ++aidx) {
332         if (aidx == missing_aidx) {
333           continue;
334         }
335         if (alen_buf[aidx] == ltrim_p1) {
336           const uint32_t cur_code = ctou32(cur_alleles[aidx][ltrim]);
337           if ((cur_code == 46) || (cur_code == input_missing_geno_code)) {
338             const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
339             char* write_iter = strcpya_k(g_logbuf, "Error: Variant at ");
340             write_iter = chrtoa(cip, chr_idx, write_iter);
341             *write_iter++ = ':';
342             write_iter = u32toa(cur_bp, write_iter);
343             snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " has an invalid normalized allele (conflicts with missing code).\n");
344             WordWrapB(0);
345             logerrputsb();
346             return kPglRetInconsistentInput;
347           }
348           cur_alleles[aidx] = &(g_one_char_strs[2 * cur_code]);
349         } else {
350           cur_alleles[aidx] = &(cur_alleles[aidx][ltrim]);
351         }
352       }
353       cur_bp += ltrim;
354       variant_bps[variant_uidx] = cur_bp;
355       continue;
356     }
357     // cur_bp == 0 is possible for e.g. an indel at the beginning of chr17.
358     const char* prev_ref = &(seqbuf[S_CAST(int32_t, cur_bp) - 1]);
359     uint32_t rtrim = 0;
360     uint32_t lshift = 0;
361     uint32_t ltrim = 0;
362     const char* shifted_ref = nullptr;
363     do {
364       ++rtrim;
365       uint32_t cur_alen = alen_buf[first_aidx];
366       if (rtrim >= cur_alen) {
367         right_match = ctou32(prev_ref[S_CAST(int32_t, cur_alen - rtrim)]);
368       } else {
369         right_match = ctou32(cur_alleles[first_aidx][cur_alen - 1 - rtrim]);
370       }
371       for (uint32_t aidx = first_aidx + 1; aidx != allele_ct; ++aidx) {
372         if (aidx == missing_aidx) {
373           continue;
374         }
375         cur_alen = alen_buf[aidx];
376         uint32_t cur_code;
377         if (rtrim >= cur_alen) {
378           cur_code = ctou32(prev_ref[S_CAST(int32_t, cur_alen - rtrim)]);
379         } else {
380           cur_code = ctou32(cur_alleles[aidx][cur_alen - 1 - rtrim]);
381         }
382         if (right_match != cur_code) {
383           goto VNormalizeContig_rtrim_done;
384         }
385       }
386     } while (rtrim != rtrim_stop);
387   VNormalizeContig_rtrim_done:
388     if (rtrim >= min_alen) {
389       lshift = rtrim + 1 - min_alen;
390       cur_bp -= lshift;
391       shifted_ref = &(seqbuf[cur_bp]);
392       // min_alen = 1;
393       variant_bps[variant_uidx] = cur_bp;
394     } else {
395       min_alen -= rtrim;
396       if ((left_match != UINT32_MAX) && (min_alen > 1)) {
397         const uint32_t min_alen_m1 = min_alen - 1;
398         ltrim = 1;
399         for (; ltrim != min_alen_m1; ++ltrim) {
400           left_match = ctou32(cur_alleles[first_aidx][ltrim]);
401           for (uint32_t aidx = first_aidx + 1; aidx != allele_ct; ++aidx) {
402             if (aidx == missing_aidx) {
403               continue;
404             }
405             const uint32_t cur_code = ctou32(cur_alleles[aidx][ltrim]);
406             if (left_match != cur_code) {
407               goto VNormalizeContig_ltrim2_done;
408             }
409           }
410         }
411       }
412     VNormalizeContig_ltrim2_done:
413       cur_bp += ltrim;
414       variant_bps[variant_uidx] = cur_bp;
415     }
416     for (uint32_t aidx = first_aidx; aidx != allele_ct; ++aidx) {
417       if (aidx == missing_aidx) {
418         continue;
419       }
420       const uint32_t orig_alen = alen_buf[aidx];
421       if (orig_alen <= rtrim) {
422         // Must be single-character from .fa.
423         cur_alleles[aidx] = &(g_one_char_strs[2 * ctou32(prev_ref[S_CAST(int32_t, orig_alen - rtrim)])]);
424       } else {
425         uint32_t new_slen = orig_alen + lshift - rtrim - ltrim;
426         const char* cur_allele = &(cur_alleles[aidx][ltrim]);
427         if (new_slen == 1) {
428           cur_alleles[aidx] = &(g_one_char_strs[2 * ctou32(cur_allele[0])]);
429         } else {
430           if (S_CAST(uintptr_t, alloc_end - alloc_base) <= new_slen) {
431             return kPglRetNomem;
432           }
433           alloc_end -= new_slen + 1;
434           char* new_allele = R_CAST(char*, alloc_end);
435           char* new_allele_iter = new_allele;
436           if (lshift) {
437             if (lshift < new_slen) {
438               new_allele_iter = memcpya(new_allele_iter, shifted_ref, lshift);
439               new_slen -= lshift;
440             } else {
441               new_allele_iter = memcpya(new_allele_iter, shifted_ref, new_slen);
442               new_slen = 0;
443             }
444           }
445           new_allele_iter = memcpya(new_allele_iter, cur_allele, new_slen);
446           *new_allele_iter = '\0';
447           cur_alleles[aidx] = new_allele;
448         }
449       }
450     }
451   }
452   *alloc_endp = alloc_end;
453   if (is_unsorted) {
454     *vpos_sortstatusp |= kfUnsortedVarBp;
455   }
456   *nchanged_ct_ptr = nchanged_ct;
457   *nlist_write_iterp = nlist_write_iter;
458   return kPglRetSuccess;
459 }
460 
461 PglErr ProcessFa(const uintptr_t* variant_include, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const ChrInfo* cip, const char* fname, uint32_t max_allele_ct, uint32_t max_allele_slen, FaFlags flags, uint32_t max_thread_ct, UnsortedVar* vpos_sortstatusp, uint32_t* variant_bps, const char** allele_storage, STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), uintptr_t* nonref_flags, char* outname, char* outname_end) {
462   unsigned char* bigstack_mark = g_bigstack_base;
463   uintptr_t line_idx = 0;
464   FILE* nlist_file = nullptr;
465   PglErr reterr = kPglRetSuccess;
466   TextStream fa_txs;
467   PreinitTextStream(&fa_txs);
468   {
469     const uint32_t chr_ct = cip->chr_ct;
470     char* chr_name_buf;
471     uintptr_t* chr_already_seen;
472     if (unlikely(
473             bigstack_calloc_w(BitCtToWordCt(chr_ct), &chr_already_seen) ||
474             bigstack_alloc_c(kMaxIdBlen, &chr_name_buf))) {
475       goto ProcessFa_ret_NOMEM;
476     }
477     uint32_t* alen_buf = nullptr;
478     char* nlist_write_iter = nullptr;
479     char* nlist_flush = nullptr;
480     if (flags & kfFaNormalize) {
481       if (unlikely(bigstack_alloc_u32(max_allele_ct, &alen_buf))) {
482         goto ProcessFa_ret_NOMEM;
483       }
484       if (flags & kfFaNormalizeList) {
485         snprintf(outname_end, kMaxOutfnameExtBlen, ".normalized");
486         if (unlikely(fopen_checked(outname, FOPEN_WB, &nlist_file))) {
487           goto ProcessFa_ret_OPEN_FAIL;
488         }
489         nlist_write_iter = g_textbuf;
490         nlist_flush = &(nlist_write_iter[kMaxMediumLine]);
491       }
492     }
493 
494     // To simplify indel/complex-variant handling, we load an entire contig at
495     // a time.  Determine an upper bound for the size of this buffer.
496     const uintptr_t* chr_mask = cip->chr_mask;
497     uint32_t seqbuf_size = 0;
498     for (uint32_t chr_fo_idx = 0; chr_fo_idx != chr_ct; ++chr_fo_idx) {
499       if (!IsSet(chr_mask, cip->chr_file_order[chr_fo_idx])) {
500         continue;
501       }
502       const int32_t chr_vidx_start_m1 = cip->chr_fo_vidx_start[chr_fo_idx] - 1;
503       const int32_t chr_vidx_last = FindLast1BitBeforeBounded(variant_include, cip->chr_fo_vidx_start[chr_fo_idx + 1], chr_vidx_start_m1);
504       if (chr_vidx_last != chr_vidx_start_m1) {
505         const uint32_t cur_bp = variant_bps[S_CAST(uint32_t, chr_vidx_last)];
506         if (cur_bp > seqbuf_size) {
507           seqbuf_size = cur_bp;
508         }
509       }
510     }
511     seqbuf_size += max_allele_slen + 1;
512     char* seqbuf;
513     if (unlikely(bigstack_alloc_c(seqbuf_size, &seqbuf))) {
514       goto ProcessFa_ret_NOMEM;
515     }
516     // May as well handle insertion before first contig base, and deletion of
517     // first base, correctly.
518     seqbuf[0] = 'N';
519 
520     reterr = SizeAndInitTextStream(fname, bigstack_left(), MAXV(max_thread_ct - 1, 1), &fa_txs);
521     if (unlikely(reterr)) {
522       goto ProcessFa_ret_TSTREAM_FAIL;
523     }
524 
525     unsigned char* tmp_alloc_end = g_bigstack_end;
526     char* seqbuf_end = nullptr;
527     char* seq_iter = nullptr;
528     // possible but low-priority todo: exploit .fai when it's present
529     uint32_t chr_fo_idx = UINT32_MAX;
530     uint32_t cur_vidx_last = 0;
531     uint32_t skip_chr = 1;
532     uint32_t is_first_noncomment_line = 1;
533     uint32_t ref_changed_ct = 0;
534     uint32_t ref_validated_ct = 0;
535     uint32_t ref_downgraded_ct = 0;
536     uint32_t nchanged_ct = 0;
537     while (1) {
538       ++line_idx;
539       char* line_iter;
540       reterr = TextNextLine(&fa_txs, &line_iter);
541       if (reterr) {
542         if (likely(reterr == kPglRetEof)) {
543           reterr = kPglRetSuccess;
544           break;
545         }
546         goto ProcessFa_ret_TSTREAM_FAIL;
547       }
548       unsigned char ucc = line_iter[0];
549       if (ucc < 'A') {
550         // > = ascii 62
551         // ; = ascii 59
552         if (ucc == ';') {
553           continue;
554         }
555         is_first_noncomment_line = 0;
556         if (unlikely(ucc != '>')) {
557           snprintf(g_logbuf, kLogbufSize, "Error: Unexpected character at beginning of line %" PRIuPTR " of --fa file.\n", line_idx);
558           goto ProcessFa_ret_MALFORMED_INPUT_WW;
559         }
560         if (chr_fo_idx != UINT32_MAX) {
561           *seq_iter = '\0';
562           const uint32_t bp_end = seq_iter - seqbuf;
563           if (flags & kfFaRefFrom) {
564             reterr = RefFromFaContig(variant_include, variant_bps, allele_idx_offsets, allele_storage, cip, seqbuf, flags & kfFaRefFromForce, chr_fo_idx, cur_vidx_last, bp_end, refalt1_select, nonref_flags, &ref_changed_ct, &ref_validated_ct, &ref_downgraded_ct);
565             if (unlikely(reterr)) {
566               goto ProcessFa_ret_1;
567             }
568           }
569           if (flags & kfFaNormalize) {
570             reterr = VNormalizeContig(variant_include, variant_ids, allele_idx_offsets, cip, seqbuf, chr_fo_idx, cur_vidx_last, bp_end, &tmp_alloc_end, vpos_sortstatusp, variant_bps, allele_storage, &nchanged_ct, nlist_flush, nlist_file, &nlist_write_iter, alen_buf);
571             if (unlikely(reterr)) {
572               goto ProcessFa_ret_1;
573             }
574           }
575         }
576         char* chr_name_start = &(line_iter[1]);
577         if (unlikely(IsSpaceOrEoln(*chr_name_start))) {
578           snprintf(g_logbuf, kLogbufSize, "Error: Invalid contig description on line %" PRIuPTR " of --fa file.%s\n", line_idx, (*chr_name_start == ' ')? " (Spaces are not permitted between the leading '>' and the contig name.)" : "");
579           goto ProcessFa_ret_MALFORMED_INPUT_WW;
580         }
581         char* chr_name_end = CurTokenEnd(chr_name_start);
582         line_iter = AdvToDelim(chr_name_end, '\n');
583         *chr_name_end = '\0';
584         const uint32_t chr_name_slen = chr_name_end - chr_name_start;
585         chr_fo_idx = UINT32_MAX;
586         const uint32_t chr_idx = GetChrCode(chr_name_start, cip, chr_name_slen);
587         if ((!IsI32Neg(chr_idx)) && IsSet(cip->chr_mask, chr_idx)) {
588           chr_fo_idx = cip->chr_idx_to_foidx[chr_idx];
589           if (unlikely(IsSet(chr_already_seen, chr_fo_idx))) {
590             snprintf(g_logbuf, kLogbufSize, "Error: Duplicate contig name '%s' in --fa file.\n", chr_name_start);
591             goto ProcessFa_ret_MALFORMED_INPUT_WW;
592           }
593           SetBit(chr_fo_idx, chr_already_seen);
594           const int32_t chr_vidx_start_m1 = cip->chr_fo_vidx_start[chr_fo_idx] - 1;
595           const int32_t chr_vidx_last = FindLast1BitBeforeBounded(variant_include, cip->chr_fo_vidx_start[chr_fo_idx + 1], chr_vidx_start_m1);
596           if (chr_vidx_last == chr_vidx_start_m1) {
597             chr_fo_idx = UINT32_MAX;
598           } else {
599             cur_vidx_last = chr_vidx_last;
600             seqbuf_end = &(seqbuf[variant_bps[cur_vidx_last] + max_allele_slen]);
601             seq_iter = &(seqbuf[1]);
602           }
603         }
604         skip_chr = (chr_fo_idx == UINT32_MAX);
605         *chr_name_end = '\n';  // bugfix (16 Apr 2018)
606         continue;
607       }
608       const char* line_start = line_iter;
609       line_iter = CurTokenEnd(line_iter);
610       const char* seqline_end = line_iter;
611       if (skip_chr) {
612         if (is_first_noncomment_line) {
613           logerrputs("Error: --fa file is not a valid FASTA.\n");
614           goto ProcessFa_ret_MALFORMED_INPUT;
615         }
616         continue;
617       }
618       ucc = *seqline_end;
619       if (unlikely((ucc == ' ') || (ucc == '\t'))) {
620         snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --fa file is malformed.\n", line_idx);
621         goto ProcessFa_ret_MALFORMED_INPUT_2;
622       }
623       uint32_t cur_seq_slen = seqline_end - line_start;
624       const uint32_t seq_rem = seqbuf_end - seq_iter;
625       if (seq_rem <= cur_seq_slen) {
626         cur_seq_slen = seq_rem;
627         skip_chr = 1;
628       }
629       const char* gap_start = S_CAST(const char*, memchr(line_start, '-', cur_seq_slen));
630       if (gap_start) {
631         logerrprintfww("Warning: Indeterminate-length gap present on line %" PRIuPTR " of --fa file. Ignoring remainder of contig.\n", line_idx);
632         cur_seq_slen = gap_start - line_start;
633         skip_chr = 1;
634       }
635       seq_iter = memcpya(seq_iter, line_start, cur_seq_slen);
636       // seq_iter = memcpya_toupper(seq_iter, loadbuf, cur_seq_slen);
637     }
638     if (flags & kfFaRefFrom) {
639       const uint32_t ref_from_fa_force = flags & kfFaRefFromForce;
640       if (chr_fo_idx != UINT32_MAX) {
641         *seq_iter = '\0';
642         const uint32_t bp_end = seq_iter - seqbuf;
643         reterr = RefFromFaContig(variant_include, variant_bps, allele_idx_offsets, allele_storage, cip, seqbuf, ref_from_fa_force, chr_fo_idx, cur_vidx_last, bp_end, refalt1_select, nonref_flags, &ref_changed_ct, &ref_validated_ct, &ref_downgraded_ct);
644         if (unlikely(reterr)) {
645           goto ProcessFa_ret_1;
646         }
647       }
648       logprintf("--ref-from-fa%s: %u variant%s changed, %u validated.\n", ref_from_fa_force? " force" : "", ref_changed_ct, (ref_changed_ct == 1)? "" : "s", ref_validated_ct);
649       if (ref_downgraded_ct) {
650         logerrprintfww("Warning: %u reference allele%s downgraded from 'known' to 'provisional'.\n", ref_downgraded_ct, (ref_downgraded_ct == 1)? "" : "s");
651       }
652     }
653     if (flags & kfFaNormalize) {
654       if (chr_fo_idx != UINT32_MAX) {
655         // slightly redundant
656         *seq_iter = '\0';
657         const uint32_t bp_end = seq_iter - seqbuf;
658         reterr = VNormalizeContig(variant_include, variant_ids, allele_idx_offsets, cip, seqbuf, chr_fo_idx, cur_vidx_last, bp_end, &tmp_alloc_end, vpos_sortstatusp, variant_bps, allele_storage, &nchanged_ct, nlist_flush, nlist_file, &nlist_write_iter, alen_buf);
659         if (unlikely(reterr)) {
660           goto ProcessFa_ret_1;
661         }
662       }
663       BigstackEndSet(tmp_alloc_end);
664       logprintf("--normalize: %u variant%s changed.\n", nchanged_ct, (nchanged_ct == 1)? "" : "s");
665       if (nlist_file) {
666         if (unlikely(fclose_flush_null(nlist_flush, nlist_write_iter, &nlist_file))) {
667           goto ProcessFa_ret_WRITE_FAIL;
668         }
669         logprintfww("Affected variant ID%s written to %s .\n", (nchanged_ct == 1)? "" : "s", outname);
670       }
671       if ((*vpos_sortstatusp) & kfUnsortedVarBp) {
672         logerrprintf("Warning: Base-pair positions are now unsorted!\n");
673       }
674     }
675   }
676   while (0) {
677   ProcessFa_ret_NOMEM:
678     reterr = kPglRetNomem;
679     break;
680   ProcessFa_ret_OPEN_FAIL:
681     reterr = kPglRetOpenFail;
682     break;
683   ProcessFa_ret_TSTREAM_FAIL:
684     TextStreamErrPrint("--ref-from-fa file", &fa_txs);
685     break;
686   ProcessFa_ret_WRITE_FAIL:
687     reterr = kPglRetWriteFail;
688     break;
689   ProcessFa_ret_MALFORMED_INPUT_WW:
690     WordWrapB(0);
691   ProcessFa_ret_MALFORMED_INPUT_2:
692     logerrputsb();
693   ProcessFa_ret_MALFORMED_INPUT:
694     reterr = kPglRetMalformedInput;
695     break;
696   }
697  ProcessFa_ret_1:
698   fclose_cond(nlist_file);
699   CleanupTextStream2("--ref-from-fa file", &fa_txs, &reterr);
700   BigstackReset(bigstack_mark);
701   return reterr;
702 }
703 
704 // Eventually want a standalone function which uses a .fa to fill in ref
705 // alleles for .ped and similar all-SNP datasets where the reference allele may
706 // have disappeared, and some of the resulting variants will become
707 // multiallelic.  But wait till --pmerge is implemented, since this is mostly a
708 // subset of what --pmerge needs to do.
709 
710 #ifdef __cplusplus
711 }  // namespace plink2
712 #endif
713