1 // This file is part of PLINK 2.00, copyright (C) 2005-2020 Shaun Purcell,
2 // Christopher Chang.
3 //
4 // This program is free software: you can redistribute it and/or modify it
5 // under the terms of the GNU General Public License as published by the Free
6 // Software Foundation, either version 3 of the License, or (at your option)
7 // any later version.
8 //
9 // This program is distributed in the hope that it will be useful, but WITHOUT
10 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 // more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17
18 #include "plink2_fasta.h"
19
20 #ifdef __cplusplus
21 namespace plink2 {
22 #endif
23
24 PglErr RefFromFaContig(const uintptr_t* variant_include, const uint32_t* variant_bps, const uintptr_t* allele_idx_offsets, const char* const* allele_storage, const ChrInfo* cip, const char* seqbuf, uint32_t force, uint32_t chr_fo_idx, uint32_t variant_uidx_last, uint32_t bp_end, STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), uintptr_t* nonref_flags, uint32_t* changed_ct_ptr, uint32_t* validated_ct_ptr, uint32_t* downgraded_ct_ptr) {
25 uintptr_t variant_uidx_base;
26 uintptr_t cur_bits;
27 BitIter1Start(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], &variant_uidx_base, &cur_bits);
28 uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
29 if (variant_bps[variant_uidx_last] >= bp_end) {
30 const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
31 if (!force) {
32 char* write_iter = strcpya_k(g_logbuf, "Error: Contig '");
33 write_iter = chrtoa(cip, chr_idx, write_iter);
34 snprintf(write_iter, kLogbufSize - kMaxIdSlen - 32, "' in --fa file is too short; it is likely to be mismatched with your data. Add the 'force' modifier if this wasn't a mistake, and you just want to mark all reference alleles past the end as provisional.\n");
35 WordWrapB(0);
36 logerrputsb();
37 return kPglRetInconsistentInput;
38 } else {
39 char* write_iter = strcpya_k(g_logbuf, "Warning: Contig '");
40 write_iter = chrtoa(cip, chr_idx, write_iter);
41 snprintf(write_iter, kLogbufSize - kMaxIdSlen - 32, "' in --fa file is too short; it is likely to be mismatched with your data.\n");
42 WordWrapB(0);
43 logerrputsb();
44 }
45 uint32_t offset = CountSortedSmallerU32(&(variant_bps[variant_uidx]), variant_uidx_last - variant_uidx, bp_end);
46
47 const uint32_t chr_vidx_end = cip->chr_fo_vidx_start[chr_fo_idx + 1];
48 // set all bits in [variant_uidx + offset, chr_vidx_end), and count how
49 // many relevant bits were set.
50 const uint32_t widx_full_end = chr_vidx_end / kBitsPerWord;
51 const uint32_t trailing_bit_ct = chr_vidx_end % kBitsPerWord;
52 const uint32_t chr_vidx_truncate = variant_uidx + offset;
53 uint32_t widx = chr_vidx_truncate / kBitsPerWord;
54 uint32_t downgraded_incr;
55 if (widx == widx_full_end) {
56 const uintptr_t cur_mask = (k1LU << trailing_bit_ct) - (k1LU << (chr_vidx_truncate % kBitsPerWord));
57 downgraded_incr = PopcountWord(variant_include[widx] & (~nonref_flags[widx]) & cur_mask);
58 nonref_flags[widx] |= cur_mask;
59 } else {
60 downgraded_incr = 0;
61 if (chr_vidx_truncate % kBitsPerWord) {
62 const uintptr_t cur_mask = (~k0LU) << (chr_vidx_truncate % kBitsPerWord);
63 downgraded_incr = PopcountWord(variant_include[widx] & (~nonref_flags[widx]) & cur_mask);
64 nonref_flags[widx] |= cur_mask;
65 ++widx;
66 }
67 assert(widx > widx_full_end);
68 for (; widx != widx_full_end; ++widx) {
69 downgraded_incr += PopcountWord(variant_include[widx] & (~nonref_flags[widx]));
70 nonref_flags[widx] = ~k0LU;
71 }
72 if (trailing_bit_ct) {
73 const uintptr_t cur_mask = (k1LU << trailing_bit_ct) - k1LU;
74 downgraded_incr += PopcountWord(variant_include[widx] & (~nonref_flags[widx]) & cur_mask);
75 nonref_flags[widx] |= cur_mask;
76 }
77 }
78 *downgraded_ct_ptr += downgraded_incr;
79 if (!offset) {
80 return kPglRetSuccess;
81 }
82 // We know that variant_bps[variant_uidx + offset - 1] < bp_end,
83 // variant_bps[variant_uidx + offset] >= bp_end, and that at least one
84 // variant_include[] bit is set in [variant_uidx, variant_uidx + offset)
85 // (since offset > 0). Find the last such bit.
86 variant_uidx_last = FindLast1BitBefore(variant_include, chr_vidx_truncate);
87 }
88 uint32_t changed_ct = *changed_ct_ptr;
89 uint32_t validated_ct = *validated_ct_ptr;
90 uint32_t allele_ct = 2;
91 for (; ; variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits)) {
92 const uint32_t cur_bp = variant_bps[variant_uidx];
93 uintptr_t allele_idx_offset_base = variant_uidx * 2;
94 if (allele_idx_offsets) {
95 allele_idx_offset_base = allele_idx_offsets[variant_uidx];
96 allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
97 }
98 const char* const* cur_alleles = &(allele_storage[allele_idx_offset_base]);
99 const char* cur_ref = &(seqbuf[cur_bp]);
100 int32_t consistent_allele_idx = -1;
101 for (uint32_t allele_idx = 0; allele_idx != allele_ct; ++allele_idx) {
102 const char* cur_allele = cur_alleles[allele_idx];
103 const uint32_t cur_allele_slen = strlen(cur_allele);
104 if (strcaseequal(cur_allele, cur_ref, cur_allele_slen)) {
105 if (consistent_allele_idx != -1) {
106 // Multiple alleles could be ref (this always happens for deletions).
107 // Don't try to do anything.
108 consistent_allele_idx = -2;
109 break;
110 }
111 consistent_allele_idx = allele_idx;
112 }
113 }
114 if (consistent_allele_idx >= 0) {
115 if (consistent_allele_idx) {
116 if ((!IsSet(nonref_flags, variant_uidx)) && (!force)) {
117 const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
118 char* write_iter = strcpya_k(g_logbuf, "Error: --ref-from-fa wants to change reference allele assignment at ");
119 write_iter = chrtoa(cip, chr_idx, write_iter);
120 *write_iter++ = ':';
121 write_iter = u32toa(cur_bp, write_iter);
122 snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, ", but it's marked as 'known'. Add the 'force' modifier to force this change through.\n");
123 WordWrapB(0);
124 logerrputsb();
125 return kPglRetInconsistentInput;
126 }
127 R_CAST(DoubleAlleleCode*, refalt1_select)[variant_uidx] = consistent_allele_idx;
128 ++changed_ct;
129 } else {
130 ++validated_ct;
131 }
132 ClearBit(variant_uidx, nonref_flags);
133 } else if ((consistent_allele_idx == -1) && (!IsSet(nonref_flags, variant_uidx))) {
134 // okay to have multiple matches, but not zero matches
135 if (!force) {
136 const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
137 char* write_iter = strcpya_k(g_logbuf, "Error: Reference allele at ");
138 write_iter = chrtoa(cip, chr_idx, write_iter);
139 *write_iter++ = ':';
140 write_iter = u32toa(cur_bp, write_iter);
141 snprintf(write_iter, kLogbufSize - kMaxIdSlen - 64, " is marked as 'known', but is inconsistent with .fa file. Add the 'force' modifier to downgrade it to provisional.\n");
142 WordWrapB(0);
143 logerrputsb();
144 return kPglRetInconsistentInput;
145 }
146 SetBit(variant_uidx, nonref_flags);
147 *downgraded_ct_ptr += 1;
148 }
149 if (variant_uidx == variant_uidx_last) {
150 *changed_ct_ptr = changed_ct;
151 *validated_ct_ptr = validated_ct;
152 return kPglRetSuccess;
153 }
154 }
155 }
156
VNormalizeContig(const uintptr_t * variant_include,const char * const * variant_ids,const uintptr_t * allele_idx_offsets,const ChrInfo * cip,const char * seqbuf,uint32_t chr_fo_idx,uint32_t variant_uidx_last,uint32_t bp_end,unsigned char ** alloc_endp,UnsortedVar * vpos_sortstatusp,uint32_t * __restrict variant_bps,const char ** allele_storage,uint32_t * __restrict nchanged_ct_ptr,char * nlist_flush,FILE * nlist_file,char ** nlist_write_iterp,uint32_t * __restrict alen_buf)157 PglErr VNormalizeContig(const uintptr_t* variant_include, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const ChrInfo* cip, const char* seqbuf, uint32_t chr_fo_idx, uint32_t variant_uidx_last, uint32_t bp_end, unsigned char** alloc_endp, UnsortedVar* vpos_sortstatusp, uint32_t* __restrict variant_bps, const char** allele_storage, uint32_t* __restrict nchanged_ct_ptr, char* nlist_flush, FILE* nlist_file, char** nlist_write_iterp, uint32_t* __restrict alen_buf) {
158 uintptr_t variant_uidx_base;
159 uintptr_t cur_bits;
160 BitIter1Start(variant_include, cip->chr_fo_vidx_start[chr_fo_idx], &variant_uidx_base, &cur_bits);
161 uint32_t variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
162 if (variant_bps[variant_uidx_last] >= bp_end) {
163 const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
164 char* write_iter = strcpya_k(g_logbuf, "Error: Contig '");
165 write_iter = chrtoa(cip, chr_idx, write_iter);
166 snprintf(write_iter, kLogbufSize - kMaxIdSlen - 32, "' in --fa file is too short; it is likely to be mismatched with your data.\n");
167 WordWrapB(0);
168 logerrputsb();
169 return kPglRetInconsistentInput;
170 }
171 const char* missing_allele_str = &(g_one_char_strs[92]);
172 const uint32_t input_missing_geno_code = ctou32(*g_input_missing_geno_ptr);
173 unsigned char* alloc_base = g_bigstack_base;
174 unsigned char* alloc_end = *alloc_endp;
175 char* nlist_write_iter = *nlist_write_iterp;
176 uint32_t nchanged_ct = *nchanged_ct_ptr;
177 uint32_t allele_ct = 2;
178 uint32_t prev_bp = 0;
179 uint32_t is_unsorted = ((*vpos_sortstatusp) / kfUnsortedVarBp) & 1;
180 uint32_t cur_bp;
181 goto VNormalizeContig_loop_start;
182 while (1) {
183 if (!is_unsorted) {
184 if (prev_bp > cur_bp) {
185 is_unsorted = 1;
186 }
187 prev_bp = cur_bp;
188 }
189 if (variant_uidx == variant_uidx_last) {
190 break;
191 }
192 variant_uidx = BitIter1(variant_include, &variant_uidx_base, &cur_bits);
193 VNormalizeContig_loop_start:
194 cur_bp = variant_bps[variant_uidx];
195 uintptr_t allele_idx_offset_base = variant_uidx * 2;
196 if (allele_idx_offsets) {
197 allele_idx_offset_base = allele_idx_offsets[variant_uidx];
198 allele_ct = allele_idx_offsets[variant_uidx + 1] - allele_idx_offset_base;
199 }
200 const char** cur_alleles = &(allele_storage[allele_idx_offset_base]);
201 {
202 uint32_t aidx = 0;
203 // Common case: if all alleles have length 1, variant must already be
204 // normalized.
205 for (; aidx != allele_ct; ++aidx) {
206 if (cur_alleles[aidx][1]) {
207 break;
208 }
209 }
210 if (aidx == allele_ct) {
211 continue;
212 }
213 }
214 // Ignore missing allele if present, get lengths of each allele, check if
215 // all right nucleotides match or all left nucleotides match
216 uint32_t left_match = 0;
217 uint32_t right_match = 0;
218 uint32_t min_alen = 0;
219 uint32_t missing_aidx = UINT32_MAX;
220 for (uint32_t aidx = 0; aidx != allele_ct; ++aidx) {
221 if (cur_alleles[aidx] == missing_allele_str) {
222 if (missing_aidx != UINT32_MAX) {
223 const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
224 char* write_iter = strcpya_k(g_logbuf, "Error: Variant at ");
225 write_iter = chrtoa(cip, chr_idx, write_iter);
226 *write_iter++ = ':';
227 write_iter = u32toa(cur_bp, write_iter);
228 snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " has multiple missing alleles.\n");
229 WordWrapB(0);
230 logerrputsb();
231 return kPglRetInconsistentInput;
232 }
233 missing_aidx = aidx;
234 alen_buf[aidx] = 1;
235 continue;
236 }
237 const char* cur_allele = cur_alleles[aidx];
238 const uint32_t alen = strlen(cur_allele);
239 alen_buf[aidx] = alen;
240 const uint32_t first_code = ctou32(cur_allele[0]);
241 // Special case: if first character of any allele is '<', don't attempt
242 // to normalize.
243 if (first_code == '<') {
244 left_match = UINT32_MAX;
245 right_match = UINT32_MAX;
246 break;
247 }
248 const uint32_t last_code = ctou32(cur_allele[alen - 1]);
249 if (!min_alen) {
250 // must be first nonmissing allele
251 left_match = first_code;
252 right_match = last_code;
253 min_alen = alen;
254 } else {
255 if (first_code != left_match) {
256 left_match = UINT32_MAX;
257 }
258 if (last_code != right_match) {
259 right_match = UINT32_MAX;
260 }
261 if (alen < min_alen) {
262 min_alen = alen;
263 }
264 }
265 }
266 if (((left_match == UINT32_MAX) || (min_alen == 1)) && (right_match == UINT32_MAX)) {
267 continue;
268 }
269 // Sanity check: verify alleles aren't all identical.
270 const uint32_t first_aidx = (missing_aidx == 0)? 1 : 0;
271 const uint32_t first_alen = alen_buf[first_aidx];
272 for (uint32_t aidx = first_aidx + 1; ; ) {
273 if (aidx == missing_aidx) {
274 continue;
275 }
276 if ((alen_buf[aidx] != first_alen) || (!memequal(cur_alleles[first_aidx], cur_alleles[aidx], min_alen))) {
277 break;
278 }
279 if (++aidx == allele_ct) {
280 // probable todo: report ID instead?
281 const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
282 char* write_iter = strcpya_k(g_logbuf, "Error: Variant at ");
283 write_iter = chrtoa(cip, chr_idx, write_iter);
284 *write_iter++ = ':';
285 write_iter = u32toa(cur_bp, write_iter);
286 snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " has duplicate allele codes.\n");
287 WordWrapB(0);
288 logerrputsb();
289 return kPglRetInconsistentInput;
290 }
291 }
292 ++nchanged_ct;
293 if (nlist_write_iter) {
294 nlist_write_iter = strcpya(nlist_write_iter, variant_ids[variant_uidx]);
295 AppendBinaryEoln(&nlist_write_iter);
296 if (unlikely(fwrite_ck(nlist_flush, nlist_file, &nlist_write_iter))) {
297 return kPglRetWriteFail;
298 }
299 }
300 // Algorithm from Tan A, Abecasis GR, Kang HM (2015) Unified representation
301 // of genetic variants:
302 // while (alleles end with the same nucleotide) {
303 // truncate rightmost
304 // if there's now an empty allele, extend alleles 1 base to the left
305 // }
306 // while (all alleles length > 1 and start with same nucleotide) {
307 // truncate leftmost
308 // }
309 // https://genome.sph.umich.edu/wiki/Variant_Normalization also covers
310 // this.
311 const uint32_t rtrim_stop = cur_bp + min_alen - 1;
312 if ((right_match == UINT32_MAX) || (!rtrim_stop)) {
313 // Easy case: only left-trimming. We separate this out because bigstack
314 // allocation is never required.
315 const uint32_t min_alen_m1 = min_alen - 1;
316 uint32_t ltrim = 1;
317 uint32_t ltrim_p1;
318 for (; ltrim != min_alen_m1; ++ltrim) {
319 const char cc = cur_alleles[first_aidx][ltrim];
320 for (uint32_t aidx = first_aidx + 1; aidx != allele_ct; ++aidx) {
321 if (aidx == missing_aidx) {
322 continue;
323 }
324 if (cur_alleles[aidx][ltrim] != cc) {
325 goto VNormalizeContig_ltrim1_done;
326 }
327 }
328 }
329 VNormalizeContig_ltrim1_done:
330 ltrim_p1 = ltrim + 1;
331 for (uint32_t aidx = first_aidx; aidx != allele_ct; ++aidx) {
332 if (aidx == missing_aidx) {
333 continue;
334 }
335 if (alen_buf[aidx] == ltrim_p1) {
336 const uint32_t cur_code = ctou32(cur_alleles[aidx][ltrim]);
337 if ((cur_code == 46) || (cur_code == input_missing_geno_code)) {
338 const uint32_t chr_idx = cip->chr_file_order[chr_fo_idx];
339 char* write_iter = strcpya_k(g_logbuf, "Error: Variant at ");
340 write_iter = chrtoa(cip, chr_idx, write_iter);
341 *write_iter++ = ':';
342 write_iter = u32toa(cur_bp, write_iter);
343 snprintf(write_iter, kLogbufSize - kMaxIdSlen - 128, " has an invalid normalized allele (conflicts with missing code).\n");
344 WordWrapB(0);
345 logerrputsb();
346 return kPglRetInconsistentInput;
347 }
348 cur_alleles[aidx] = &(g_one_char_strs[2 * cur_code]);
349 } else {
350 cur_alleles[aidx] = &(cur_alleles[aidx][ltrim]);
351 }
352 }
353 cur_bp += ltrim;
354 variant_bps[variant_uidx] = cur_bp;
355 continue;
356 }
357 // cur_bp == 0 is possible for e.g. an indel at the beginning of chr17.
358 const char* prev_ref = &(seqbuf[S_CAST(int32_t, cur_bp) - 1]);
359 uint32_t rtrim = 0;
360 uint32_t lshift = 0;
361 uint32_t ltrim = 0;
362 const char* shifted_ref = nullptr;
363 do {
364 ++rtrim;
365 uint32_t cur_alen = alen_buf[first_aidx];
366 if (rtrim >= cur_alen) {
367 right_match = ctou32(prev_ref[S_CAST(int32_t, cur_alen - rtrim)]);
368 } else {
369 right_match = ctou32(cur_alleles[first_aidx][cur_alen - 1 - rtrim]);
370 }
371 for (uint32_t aidx = first_aidx + 1; aidx != allele_ct; ++aidx) {
372 if (aidx == missing_aidx) {
373 continue;
374 }
375 cur_alen = alen_buf[aidx];
376 uint32_t cur_code;
377 if (rtrim >= cur_alen) {
378 cur_code = ctou32(prev_ref[S_CAST(int32_t, cur_alen - rtrim)]);
379 } else {
380 cur_code = ctou32(cur_alleles[aidx][cur_alen - 1 - rtrim]);
381 }
382 if (right_match != cur_code) {
383 goto VNormalizeContig_rtrim_done;
384 }
385 }
386 } while (rtrim != rtrim_stop);
387 VNormalizeContig_rtrim_done:
388 if (rtrim >= min_alen) {
389 lshift = rtrim + 1 - min_alen;
390 cur_bp -= lshift;
391 shifted_ref = &(seqbuf[cur_bp]);
392 // min_alen = 1;
393 variant_bps[variant_uidx] = cur_bp;
394 } else {
395 min_alen -= rtrim;
396 if ((left_match != UINT32_MAX) && (min_alen > 1)) {
397 const uint32_t min_alen_m1 = min_alen - 1;
398 ltrim = 1;
399 for (; ltrim != min_alen_m1; ++ltrim) {
400 left_match = ctou32(cur_alleles[first_aidx][ltrim]);
401 for (uint32_t aidx = first_aidx + 1; aidx != allele_ct; ++aidx) {
402 if (aidx == missing_aidx) {
403 continue;
404 }
405 const uint32_t cur_code = ctou32(cur_alleles[aidx][ltrim]);
406 if (left_match != cur_code) {
407 goto VNormalizeContig_ltrim2_done;
408 }
409 }
410 }
411 }
412 VNormalizeContig_ltrim2_done:
413 cur_bp += ltrim;
414 variant_bps[variant_uidx] = cur_bp;
415 }
416 for (uint32_t aidx = first_aidx; aidx != allele_ct; ++aidx) {
417 if (aidx == missing_aidx) {
418 continue;
419 }
420 const uint32_t orig_alen = alen_buf[aidx];
421 if (orig_alen <= rtrim) {
422 // Must be single-character from .fa.
423 cur_alleles[aidx] = &(g_one_char_strs[2 * ctou32(prev_ref[S_CAST(int32_t, orig_alen - rtrim)])]);
424 } else {
425 uint32_t new_slen = orig_alen + lshift - rtrim - ltrim;
426 const char* cur_allele = &(cur_alleles[aidx][ltrim]);
427 if (new_slen == 1) {
428 cur_alleles[aidx] = &(g_one_char_strs[2 * ctou32(cur_allele[0])]);
429 } else {
430 if (S_CAST(uintptr_t, alloc_end - alloc_base) <= new_slen) {
431 return kPglRetNomem;
432 }
433 alloc_end -= new_slen + 1;
434 char* new_allele = R_CAST(char*, alloc_end);
435 char* new_allele_iter = new_allele;
436 if (lshift) {
437 if (lshift < new_slen) {
438 new_allele_iter = memcpya(new_allele_iter, shifted_ref, lshift);
439 new_slen -= lshift;
440 } else {
441 new_allele_iter = memcpya(new_allele_iter, shifted_ref, new_slen);
442 new_slen = 0;
443 }
444 }
445 new_allele_iter = memcpya(new_allele_iter, cur_allele, new_slen);
446 *new_allele_iter = '\0';
447 cur_alleles[aidx] = new_allele;
448 }
449 }
450 }
451 }
452 *alloc_endp = alloc_end;
453 if (is_unsorted) {
454 *vpos_sortstatusp |= kfUnsortedVarBp;
455 }
456 *nchanged_ct_ptr = nchanged_ct;
457 *nlist_write_iterp = nlist_write_iter;
458 return kPglRetSuccess;
459 }
460
461 PglErr ProcessFa(const uintptr_t* variant_include, const char* const* variant_ids, const uintptr_t* allele_idx_offsets, const ChrInfo* cip, const char* fname, uint32_t max_allele_ct, uint32_t max_allele_slen, FaFlags flags, uint32_t max_thread_ct, UnsortedVar* vpos_sortstatusp, uint32_t* variant_bps, const char** allele_storage, STD_ARRAY_PTR_DECL(AlleleCode, 2, refalt1_select), uintptr_t* nonref_flags, char* outname, char* outname_end) {
462 unsigned char* bigstack_mark = g_bigstack_base;
463 uintptr_t line_idx = 0;
464 FILE* nlist_file = nullptr;
465 PglErr reterr = kPglRetSuccess;
466 TextStream fa_txs;
467 PreinitTextStream(&fa_txs);
468 {
469 const uint32_t chr_ct = cip->chr_ct;
470 char* chr_name_buf;
471 uintptr_t* chr_already_seen;
472 if (unlikely(
473 bigstack_calloc_w(BitCtToWordCt(chr_ct), &chr_already_seen) ||
474 bigstack_alloc_c(kMaxIdBlen, &chr_name_buf))) {
475 goto ProcessFa_ret_NOMEM;
476 }
477 uint32_t* alen_buf = nullptr;
478 char* nlist_write_iter = nullptr;
479 char* nlist_flush = nullptr;
480 if (flags & kfFaNormalize) {
481 if (unlikely(bigstack_alloc_u32(max_allele_ct, &alen_buf))) {
482 goto ProcessFa_ret_NOMEM;
483 }
484 if (flags & kfFaNormalizeList) {
485 snprintf(outname_end, kMaxOutfnameExtBlen, ".normalized");
486 if (unlikely(fopen_checked(outname, FOPEN_WB, &nlist_file))) {
487 goto ProcessFa_ret_OPEN_FAIL;
488 }
489 nlist_write_iter = g_textbuf;
490 nlist_flush = &(nlist_write_iter[kMaxMediumLine]);
491 }
492 }
493
494 // To simplify indel/complex-variant handling, we load an entire contig at
495 // a time. Determine an upper bound for the size of this buffer.
496 const uintptr_t* chr_mask = cip->chr_mask;
497 uint32_t seqbuf_size = 0;
498 for (uint32_t chr_fo_idx = 0; chr_fo_idx != chr_ct; ++chr_fo_idx) {
499 if (!IsSet(chr_mask, cip->chr_file_order[chr_fo_idx])) {
500 continue;
501 }
502 const int32_t chr_vidx_start_m1 = cip->chr_fo_vidx_start[chr_fo_idx] - 1;
503 const int32_t chr_vidx_last = FindLast1BitBeforeBounded(variant_include, cip->chr_fo_vidx_start[chr_fo_idx + 1], chr_vidx_start_m1);
504 if (chr_vidx_last != chr_vidx_start_m1) {
505 const uint32_t cur_bp = variant_bps[S_CAST(uint32_t, chr_vidx_last)];
506 if (cur_bp > seqbuf_size) {
507 seqbuf_size = cur_bp;
508 }
509 }
510 }
511 seqbuf_size += max_allele_slen + 1;
512 char* seqbuf;
513 if (unlikely(bigstack_alloc_c(seqbuf_size, &seqbuf))) {
514 goto ProcessFa_ret_NOMEM;
515 }
516 // May as well handle insertion before first contig base, and deletion of
517 // first base, correctly.
518 seqbuf[0] = 'N';
519
520 reterr = SizeAndInitTextStream(fname, bigstack_left(), MAXV(max_thread_ct - 1, 1), &fa_txs);
521 if (unlikely(reterr)) {
522 goto ProcessFa_ret_TSTREAM_FAIL;
523 }
524
525 unsigned char* tmp_alloc_end = g_bigstack_end;
526 char* seqbuf_end = nullptr;
527 char* seq_iter = nullptr;
528 // possible but low-priority todo: exploit .fai when it's present
529 uint32_t chr_fo_idx = UINT32_MAX;
530 uint32_t cur_vidx_last = 0;
531 uint32_t skip_chr = 1;
532 uint32_t is_first_noncomment_line = 1;
533 uint32_t ref_changed_ct = 0;
534 uint32_t ref_validated_ct = 0;
535 uint32_t ref_downgraded_ct = 0;
536 uint32_t nchanged_ct = 0;
537 while (1) {
538 ++line_idx;
539 char* line_iter;
540 reterr = TextNextLine(&fa_txs, &line_iter);
541 if (reterr) {
542 if (likely(reterr == kPglRetEof)) {
543 reterr = kPglRetSuccess;
544 break;
545 }
546 goto ProcessFa_ret_TSTREAM_FAIL;
547 }
548 unsigned char ucc = line_iter[0];
549 if (ucc < 'A') {
550 // > = ascii 62
551 // ; = ascii 59
552 if (ucc == ';') {
553 continue;
554 }
555 is_first_noncomment_line = 0;
556 if (unlikely(ucc != '>')) {
557 snprintf(g_logbuf, kLogbufSize, "Error: Unexpected character at beginning of line %" PRIuPTR " of --fa file.\n", line_idx);
558 goto ProcessFa_ret_MALFORMED_INPUT_WW;
559 }
560 if (chr_fo_idx != UINT32_MAX) {
561 *seq_iter = '\0';
562 const uint32_t bp_end = seq_iter - seqbuf;
563 if (flags & kfFaRefFrom) {
564 reterr = RefFromFaContig(variant_include, variant_bps, allele_idx_offsets, allele_storage, cip, seqbuf, flags & kfFaRefFromForce, chr_fo_idx, cur_vidx_last, bp_end, refalt1_select, nonref_flags, &ref_changed_ct, &ref_validated_ct, &ref_downgraded_ct);
565 if (unlikely(reterr)) {
566 goto ProcessFa_ret_1;
567 }
568 }
569 if (flags & kfFaNormalize) {
570 reterr = VNormalizeContig(variant_include, variant_ids, allele_idx_offsets, cip, seqbuf, chr_fo_idx, cur_vidx_last, bp_end, &tmp_alloc_end, vpos_sortstatusp, variant_bps, allele_storage, &nchanged_ct, nlist_flush, nlist_file, &nlist_write_iter, alen_buf);
571 if (unlikely(reterr)) {
572 goto ProcessFa_ret_1;
573 }
574 }
575 }
576 char* chr_name_start = &(line_iter[1]);
577 if (unlikely(IsSpaceOrEoln(*chr_name_start))) {
578 snprintf(g_logbuf, kLogbufSize, "Error: Invalid contig description on line %" PRIuPTR " of --fa file.%s\n", line_idx, (*chr_name_start == ' ')? " (Spaces are not permitted between the leading '>' and the contig name.)" : "");
579 goto ProcessFa_ret_MALFORMED_INPUT_WW;
580 }
581 char* chr_name_end = CurTokenEnd(chr_name_start);
582 line_iter = AdvToDelim(chr_name_end, '\n');
583 *chr_name_end = '\0';
584 const uint32_t chr_name_slen = chr_name_end - chr_name_start;
585 chr_fo_idx = UINT32_MAX;
586 const uint32_t chr_idx = GetChrCode(chr_name_start, cip, chr_name_slen);
587 if ((!IsI32Neg(chr_idx)) && IsSet(cip->chr_mask, chr_idx)) {
588 chr_fo_idx = cip->chr_idx_to_foidx[chr_idx];
589 if (unlikely(IsSet(chr_already_seen, chr_fo_idx))) {
590 snprintf(g_logbuf, kLogbufSize, "Error: Duplicate contig name '%s' in --fa file.\n", chr_name_start);
591 goto ProcessFa_ret_MALFORMED_INPUT_WW;
592 }
593 SetBit(chr_fo_idx, chr_already_seen);
594 const int32_t chr_vidx_start_m1 = cip->chr_fo_vidx_start[chr_fo_idx] - 1;
595 const int32_t chr_vidx_last = FindLast1BitBeforeBounded(variant_include, cip->chr_fo_vidx_start[chr_fo_idx + 1], chr_vidx_start_m1);
596 if (chr_vidx_last == chr_vidx_start_m1) {
597 chr_fo_idx = UINT32_MAX;
598 } else {
599 cur_vidx_last = chr_vidx_last;
600 seqbuf_end = &(seqbuf[variant_bps[cur_vidx_last] + max_allele_slen]);
601 seq_iter = &(seqbuf[1]);
602 }
603 }
604 skip_chr = (chr_fo_idx == UINT32_MAX);
605 *chr_name_end = '\n'; // bugfix (16 Apr 2018)
606 continue;
607 }
608 const char* line_start = line_iter;
609 line_iter = CurTokenEnd(line_iter);
610 const char* seqline_end = line_iter;
611 if (skip_chr) {
612 if (is_first_noncomment_line) {
613 logerrputs("Error: --fa file is not a valid FASTA.\n");
614 goto ProcessFa_ret_MALFORMED_INPUT;
615 }
616 continue;
617 }
618 ucc = *seqline_end;
619 if (unlikely((ucc == ' ') || (ucc == '\t'))) {
620 snprintf(g_logbuf, kLogbufSize, "Error: Line %" PRIuPTR " of --fa file is malformed.\n", line_idx);
621 goto ProcessFa_ret_MALFORMED_INPUT_2;
622 }
623 uint32_t cur_seq_slen = seqline_end - line_start;
624 const uint32_t seq_rem = seqbuf_end - seq_iter;
625 if (seq_rem <= cur_seq_slen) {
626 cur_seq_slen = seq_rem;
627 skip_chr = 1;
628 }
629 const char* gap_start = S_CAST(const char*, memchr(line_start, '-', cur_seq_slen));
630 if (gap_start) {
631 logerrprintfww("Warning: Indeterminate-length gap present on line %" PRIuPTR " of --fa file. Ignoring remainder of contig.\n", line_idx);
632 cur_seq_slen = gap_start - line_start;
633 skip_chr = 1;
634 }
635 seq_iter = memcpya(seq_iter, line_start, cur_seq_slen);
636 // seq_iter = memcpya_toupper(seq_iter, loadbuf, cur_seq_slen);
637 }
638 if (flags & kfFaRefFrom) {
639 const uint32_t ref_from_fa_force = flags & kfFaRefFromForce;
640 if (chr_fo_idx != UINT32_MAX) {
641 *seq_iter = '\0';
642 const uint32_t bp_end = seq_iter - seqbuf;
643 reterr = RefFromFaContig(variant_include, variant_bps, allele_idx_offsets, allele_storage, cip, seqbuf, ref_from_fa_force, chr_fo_idx, cur_vidx_last, bp_end, refalt1_select, nonref_flags, &ref_changed_ct, &ref_validated_ct, &ref_downgraded_ct);
644 if (unlikely(reterr)) {
645 goto ProcessFa_ret_1;
646 }
647 }
648 logprintf("--ref-from-fa%s: %u variant%s changed, %u validated.\n", ref_from_fa_force? " force" : "", ref_changed_ct, (ref_changed_ct == 1)? "" : "s", ref_validated_ct);
649 if (ref_downgraded_ct) {
650 logerrprintfww("Warning: %u reference allele%s downgraded from 'known' to 'provisional'.\n", ref_downgraded_ct, (ref_downgraded_ct == 1)? "" : "s");
651 }
652 }
653 if (flags & kfFaNormalize) {
654 if (chr_fo_idx != UINT32_MAX) {
655 // slightly redundant
656 *seq_iter = '\0';
657 const uint32_t bp_end = seq_iter - seqbuf;
658 reterr = VNormalizeContig(variant_include, variant_ids, allele_idx_offsets, cip, seqbuf, chr_fo_idx, cur_vidx_last, bp_end, &tmp_alloc_end, vpos_sortstatusp, variant_bps, allele_storage, &nchanged_ct, nlist_flush, nlist_file, &nlist_write_iter, alen_buf);
659 if (unlikely(reterr)) {
660 goto ProcessFa_ret_1;
661 }
662 }
663 BigstackEndSet(tmp_alloc_end);
664 logprintf("--normalize: %u variant%s changed.\n", nchanged_ct, (nchanged_ct == 1)? "" : "s");
665 if (nlist_file) {
666 if (unlikely(fclose_flush_null(nlist_flush, nlist_write_iter, &nlist_file))) {
667 goto ProcessFa_ret_WRITE_FAIL;
668 }
669 logprintfww("Affected variant ID%s written to %s .\n", (nchanged_ct == 1)? "" : "s", outname);
670 }
671 if ((*vpos_sortstatusp) & kfUnsortedVarBp) {
672 logerrprintf("Warning: Base-pair positions are now unsorted!\n");
673 }
674 }
675 }
676 while (0) {
677 ProcessFa_ret_NOMEM:
678 reterr = kPglRetNomem;
679 break;
680 ProcessFa_ret_OPEN_FAIL:
681 reterr = kPglRetOpenFail;
682 break;
683 ProcessFa_ret_TSTREAM_FAIL:
684 TextStreamErrPrint("--ref-from-fa file", &fa_txs);
685 break;
686 ProcessFa_ret_WRITE_FAIL:
687 reterr = kPglRetWriteFail;
688 break;
689 ProcessFa_ret_MALFORMED_INPUT_WW:
690 WordWrapB(0);
691 ProcessFa_ret_MALFORMED_INPUT_2:
692 logerrputsb();
693 ProcessFa_ret_MALFORMED_INPUT:
694 reterr = kPglRetMalformedInput;
695 break;
696 }
697 ProcessFa_ret_1:
698 fclose_cond(nlist_file);
699 CleanupTextStream2("--ref-from-fa file", &fa_txs, &reterr);
700 BigstackReset(bigstack_mark);
701 return reterr;
702 }
703
704 // Eventually want a standalone function which uses a .fa to fill in ref
705 // alleles for .ped and similar all-SNP datasets where the reference allele may
706 // have disappeared, and some of the resulting variants will become
707 // multiallelic. But wait till --pmerge is implemented, since this is mostly a
708 // subset of what --pmerge needs to do.
709
710 #ifdef __cplusplus
711 } // namespace plink2
712 #endif
713