1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2016, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // aln_utils.cc -- common routines needed for dealing with gapped alignments.
23 //
24
25 #include "aln_utils.h"
26
27 // shell:
28 // echo "
29 // print(', '.join([ ('true' if chr(i) in set('0123456789=DHIMNPSX') else 'false') for i in range(256) ]))
30 // " | python3 | fold -s
31 const bool is_cigar_char[256] = {
32 false, false, false, false, false, false, false, false, false, false, false,
33 false, false, false, false, false, false, false, false, false, false, false,
34 false, false, false, false, false, false, false, false, false, false, false,
35 false, false, false, false, false, false, false, false, false, false, false,
36 false, false, false, false, true, true, true, true, true, true, true, true,
37 true, true, false, false, false, true, false, false, false, false, false,
38 false, true, false, false, false, true, true, false, false, false, true, true,
39 false, true, false, false, true, false, false, false, false, true, false,
40 false, false, false, false, false, false, false, false, false, false, false,
41 false, false, false, false, false, false, false, false, false, false, false,
42 false, false, false, false, false, false, false, false, false, false, false,
43 false, false, false, false, false, false, false, false, false, false, false,
44 false, false, false, false, false, false, false, false, false, false, false,
45 false, false, false, false, false, false, false, false, false, false, false,
46 false, false, false, false, false, false, false, false, false, false, false,
47 false, false, false, false, false, false, false, false, false, false, false,
48 false, false, false, false, false, false, false, false, false, false, false,
49 false, false, false, false, false, false, false, false, false, false, false,
50 false, false, false, false, false, false, false, false, false, false, false,
51 false, false, false, false, false, false, false, false, false, false, false,
52 false, false, false, false, false, false, false, false, false, false, false,
53 false, false, false, false, false, false, false, false, false, false, false,
54 false, false, false, false, false, false, false, false, false, false, false,
55 false
56 };
57
operator <<(ostream & os,const Cigar & cig)58 ostream& operator<< (ostream& os, const Cigar& cig) {
59 for (auto op : cig)
60 os << op.second << (op.first == '\0' ? '?' : op.first);
61 return os;
62 }
63
64 string
invert_cigar(string cigar)65 invert_cigar(string cigar)
66 {
67 for (uint i = 0; i < cigar.length(); i++) {
68 if (cigar[i] == 'I')
69 cigar[i] = 'D';
70 else if (cigar[i] == 'D')
71 cigar[i] = 'I';
72 }
73
74 return cigar;
75 }
76
77 int
invert_cigar(Cigar & cigar)78 invert_cigar(Cigar &cigar)
79 {
80 for (uint i = 0; i < cigar.size(); i++) {
81 if (cigar[i].first == 'I')
82 cigar[i].first = 'D';
83 else if (cigar[i].first == 'D')
84 cigar[i].first = 'I';
85 }
86
87 return 0;
88 }
89
90 int
convert_local_cigar_to_global(Cigar & cigar)91 convert_local_cigar_to_global(Cigar &cigar)
92 {
93 int diff;
94 uint i = 0;
95 uint len = cigar.size();
96 if (cigar.front().first == 'S') {
97 cigar.front().first = 'M';
98 i++;
99 }
100
101 int indel_cnt = 0;
102
103 for (; i < len; i++)
104 if (cigar[i].first == 'I')
105 indel_cnt += cigar[i].second;
106 else if (cigar[i].first == 'D')
107 indel_cnt -= cigar[i].second;
108
109 if (indel_cnt != 0) {
110 if (cigar.back().first == 'S') {
111 diff = abs(indel_cnt) - cigar.back().second;
112 cigar.back().first = indel_cnt < 0 ? 'I' : 'D';
113 cigar.back().second = cigar.back().second - diff;
114 if (diff > 0)
115 cigar.push_back({'S', diff});
116 } else {
117 diff = abs(indel_cnt);
118 cigar.push_back({indel_cnt < 0 ? 'I' : 'D', diff});
119 }
120 }
121 // for (; i < len - 1; i++) {
122 // switch(cigar[i].first) {
123 // case 'D':
124 // if (cigar.back().first == 'S') {
125 // diff = cigar.back().second - cigar[i].second;
126 // cigar.back().first = 'I';
127 // cigar.back().second = cigar.back().second - diff;
128 // if (diff > 0)
129 // cigar.push_back({'S', diff});
130 // }
131 // break;
132 // case 'I':
133 // if (cigar.back().first == 'S') {
134 // diff = cigar.back().second - cigar[i].second;
135 // cigar.back().first = 'D';
136 // cigar.back().second = cigar.back().second - diff;
137 // if (diff > 0)
138 // cigar.push_back({'S', diff});
139 // }
140 // break;
141 // }
142 // }
143
144 Cigar consolidated_cigar;
145 i = 0;
146 uint msum = 0;
147 while (i < cigar.size() && cigar[i].first == 'M') {
148 msum += cigar[i].second;
149 i++;
150 }
151
152 uint j = cigar.size() - 1;
153 uint dsum = 0;
154 uint isum = 0;
155 while (j > 0 && cigar[j].first == 'D') {
156 dsum += cigar[j].second;
157 j--;
158 }
159 if (dsum == 0) {
160 j = cigar.size() - 1;
161 while (j > 0 && cigar[j].first == 'D') {
162 isum += cigar[j].second;
163 j--;
164 }
165 }
166
167 if (msum > 0)
168 consolidated_cigar.push_back({'M', msum});
169 for (uint k = i; k < cigar.size() && k <= j; k++)
170 consolidated_cigar.push_back(cigar[k]);
171
172 if (dsum > 0)
173 consolidated_cigar.push_back({'D', dsum});
174 else if (isum > 0)
175 consolidated_cigar.push_back({'I', dsum});
176
177 cigar = consolidated_cigar;
178
179 return 0;
180 }
181
182 int
parse_cigar(const char * cigar_str,Cigar & cigar,bool check_correctness)183 parse_cigar(const char *cigar_str, Cigar &cigar, bool check_correctness)
184 {
185 assert(cigar_str && *cigar_str);
186 cigar.clear();
187 const char* p = cigar_str;
188 uint padded_len = 0;
189
190 while(*p != '\0') {
191 char* q;
192 uint len = strtol(p, &q, 10);
193 char c = *q;
194
195 if (check_correctness) {
196 if (q == p || c == '\0') {
197 // No number or no qualifier, respectively.
198 cerr << "Error: Malformed CIGAR string '" << cigar_str << "'.\n";
199 throw exception();
200 }
201 switch (c) {
202 case 'M':
203 case '=':
204 case 'X':
205 case 'I':
206 case 'D':
207 case 'S':
208 case 'N':
209 case 'H':
210 case 'P':
211 break;
212 default:
213 cerr << "Warning: Unknown CIGAR operation '" << c << "' in '" << cigar_str << "'.\n";
214 break;
215 }
216 }
217
218 cigar.push_back({c, len});
219 padded_len += len;
220 p = q + 1;
221 }
222
223 return padded_len;
224 }
225
226 string
apply_cigar_to_seq(const char * seq,const Cigar & cigar)227 apply_cigar_to_seq(const char *seq, const Cigar &cigar)
228 {
229 uint size = cigar.size();
230 char op;
231 uint dist, bp, edited_bp, stop;
232 string edited_seq;
233
234 //
235 // Calculate the overall sequence length.
236 //
237 uint seqlen = 0;
238 for (uint i = 0; i < size; i++)
239 seqlen += cigar[i].second;
240
241 bp = 0;
242
243 edited_seq.reserve(seqlen);
244
245 for (uint i = 0; i < size; i++) {
246 op = cigar[i].first;
247 dist = cigar[i].second;
248
249 switch(op) {
250 case 'S':
251 stop = bp + dist;
252 while (bp < stop) {
253 edited_seq.push_back('N');
254 bp++;
255 }
256 break;
257 case 'D':
258 edited_bp = 0;
259 while (edited_bp < dist) {
260 edited_seq.push_back('N');
261 edited_bp++;
262 }
263 break;
264 case 'I':
265 case 'M':
266 stop = bp + dist;
267 while (bp < stop) {
268 edited_seq.push_back(seq[bp]);
269 bp++;
270 }
271 break;
272 default:
273 break;
274 }
275 }
276
277 return edited_seq;
278 }
279
280 string
apply_cigar_to_model_seq(const char * seq,const Cigar & cigar)281 apply_cigar_to_model_seq(const char *seq, const Cigar &cigar)
282 {
283 uint size = cigar.size();
284 char op;
285 uint dist, bp, edited_bp, stop;
286 string edited_seq;
287
288 //
289 // Calculate the overall sequence length.
290 //
291 uint seqlen = 0;
292 for (uint i = 0; i < size; i++)
293 seqlen += cigar[i].second;
294
295 bp = 0;
296
297 edited_seq.reserve(seqlen);
298
299 for (uint i = 0; i < size; i++) {
300 op = cigar[i].first;
301 dist = cigar[i].second;
302
303 switch(op) {
304 case 'S':
305 stop = bp + dist;
306 while (bp < stop) {
307 edited_seq.push_back('U');
308 bp++;
309 }
310 break;
311 case 'D':
312 edited_bp = 0;
313 while (edited_bp < dist) {
314 edited_seq.push_back('U');
315 edited_bp++;
316 }
317 break;
318 case 'I':
319 case 'M':
320 stop = bp + dist;
321 while (bp < stop) {
322 edited_seq.push_back(seq[bp]);
323 bp++;
324 }
325 break;
326 default:
327 break;
328 }
329 }
330
331 return edited_seq;
332 }
333
334 int
apply_cigar_to_seq(char * seq,uint seq_len,const char * old_seq,Cigar & cigar)335 apply_cigar_to_seq(char *seq, uint seq_len, const char *old_seq, Cigar &cigar)
336 {
337 uint size = cigar.size();
338 char op;
339 uint dist, bp, seq_bp, stop;
340
341 bp = 0;
342 seq_bp = 0;
343
344 for (uint i = 0; i < size; i++) {
345 op = cigar[i].first;
346 dist = cigar[i].second;
347
348 switch(op) {
349 case 'S':
350 stop = seq_bp + dist;
351 stop = stop > seq_len ? seq_len : stop;
352 while (seq_bp < stop) {
353 seq[seq_bp] = 'N';
354 seq_bp++;
355 bp++;
356 }
357 break;
358 case 'D':
359 stop = seq_bp + dist;
360 stop = stop > seq_len ? seq_len : stop;
361 while (seq_bp < stop) {
362 seq[seq_bp] = 'N';
363 seq_bp++;
364 }
365 break;
366 case 'I':
367 case 'M':
368 stop = bp + dist;
369 stop = stop > seq_len ? seq_len : stop;
370 while (bp < stop) {
371 seq[seq_bp] = old_seq[bp];
372 seq_bp++;
373 bp++;
374 }
375 break;
376 default:
377 break;
378 }
379 }
380
381 seq[seq_len] = '\0';
382
383 return 0;
384 }
385
386 int
apply_cigar_to_model_seq(char * seq,uint seq_len,const char * model,Cigar & cigar)387 apply_cigar_to_model_seq(char *seq, uint seq_len, const char *model, Cigar &cigar)
388 {
389 uint size = cigar.size();
390 char op;
391 uint dist, model_bp, seq_bp, stop;
392
393 model_bp = 0;
394 seq_bp = 0;
395
396 for (uint i = 0; i < size; i++) {
397 op = cigar[i].first;
398 dist = cigar[i].second;
399
400 switch(op) {
401 case 'S':
402 stop = seq_bp + dist;
403 stop = stop > seq_len ? seq_len : stop;
404 while (seq_bp < stop) {
405 seq[seq_bp] = 'U';
406 seq_bp++;
407 model_bp++;
408 }
409 break;
410 case 'D':
411 stop = seq_bp + dist;
412 stop = stop > seq_len ? seq_len : stop;
413 while (seq_bp < stop) {
414 seq[seq_bp] = 'U';
415 seq_bp++;
416 }
417 break;
418 case 'I':
419 case 'M':
420 stop = model_bp + dist;
421 stop = stop > seq_len ? seq_len : stop;
422 while (model_bp < stop) {
423 seq[seq_bp] = model[model_bp];
424 seq_bp++;
425 model_bp++;
426 }
427 break;
428 default:
429 break;
430 }
431 }
432
433 seq[seq_len] = '\0';
434
435 return 0;
436 }
437
438 string
remove_cigar_from_seq(const char * seq,const Cigar & cigar)439 remove_cigar_from_seq(const char *seq, const Cigar &cigar)
440 {
441 uint size = cigar.size();
442 char op;
443 uint dist, bp, edited_bp, stop;
444 string edited_seq;
445
446 //
447 // Calculate the overall sequence length.
448 //
449 uint seqlen = 0;
450 for (uint i = 0; i < size; i++)
451 seqlen += cigar[i].first != 'D' ? cigar[i].second : 0;
452
453 bp = 0;
454
455 edited_seq.reserve(seqlen);
456
457 for (uint i = 0; i < size; i++) {
458 op = cigar[i].first;
459 dist = cigar[i].second;
460
461 switch(op) {
462 case 'D':
463 edited_bp = 0;
464 while (edited_bp < dist) {
465 edited_bp++;
466 bp++;
467 }
468 break;
469 case 'I':
470 case 'M':
471 stop = bp + dist;
472 while (bp < stop) {
473 edited_seq.push_back(seq[bp]);
474 bp++;
475 }
476 break;
477 default:
478 break;
479 }
480 }
481
482 return edited_seq;
483 }
484
cigar_lengths(const Cigar & cigar)485 std::tuple<uint,uint,uint> cigar_lengths(const Cigar& cigar) {
486 uint padded_len = 0;
487 uint ref_len = 0;
488 uint seq_len = 0;
489 for (vector<pair<char, uint>>::const_iterator op = cigar.begin(); op != cigar.end(); ++op) {
490 padded_len += op->second;
491 switch (op->first) {
492 case 'M':
493 case '=':
494 case 'X':
495 // Consume both ref & seq.
496 ref_len += op->second;
497 seq_len += op->second;
498 break;
499 case 'I':
500 case 'S':
501 // Consume seq.
502 seq_len += op->second;
503 break;
504 case 'D':
505 case 'N':
506 case 'H':
507 case 'P':
508 // Consume ref.
509 ref_len += op->second;
510 break;
511 default:
512 DOES_NOT_HAPPEN;
513 break;
514 }
515 }
516
517 return std::make_tuple(padded_len, ref_len, seq_len);
518 }
519
cigar_simplify_to_MDI(Cigar & cig)520 void cigar_simplify_to_MDI(Cigar& cig) {
521 if (cig.empty())
522 return;
523
524 // Replace operations with the relevant equivalent in "MDI".
525 for (auto& op : cig) {
526 switch (op.first) {
527 case 'M':
528 case 'D':
529 case 'I':
530 break;
531 case '=':
532 case 'X':
533 op.first = 'M'; break;
534 case 'S':
535 op.first = 'I'; break;
536 case 'N':
537 case 'H':
538 case 'P':
539 op.first = 'D'; break;
540 default:
541 DOES_NOT_HAPPEN;
542 break;
543 }
544 }
545
546 // Collapse identical successive operations.
547 auto prev = cig.rbegin();
548 auto op = ++cig.rbegin(); // n.b. `cig` isn't empty.
549 while(op != cig.rend()) {
550 if (op->first == prev->first) {
551 op->second += prev->second;
552 prev->first = '\0'; // Marked for deletion.
553 }
554 ++prev;
555 ++op;
556 }
557 cig.erase(std::remove_if(
558 cig.begin(), cig.end(),
559 [](const pair<char,uint>& op){return op.first == '\0';}
560 ), cig.end());
561 }
562
cigar_canonicalize_MDI_order(Cigar & cig)563 void cigar_canonicalize_MDI_order(Cigar& cig) {
564 assert(!cig.empty());
565 assert(cig.front().first == 'M' || cig.front().first == 'D' || cig.front().first == 'I');
566 for (auto op0=cig.begin(), op1=++cig.begin(); op1!=cig.end(); ++op0, ++op1) {
567 assert(op1->first == 'M' || op1->first == 'D' || op1->first == 'I');
568 assert(op0->first != op1->first); // Identical successive operations should have been collapsed.
569 if (op0->first == 'I' && op1->first == 'D')
570 // The M's never move, and it's always 'D' before 'I'...
571 std::swap(*op0, *op1);
572 }
573 if (cig.size() >= 2 && cig.front().first == 'D' && cig[1].first == 'I')
574 // ...except at the beginning (because I ~ S, and S when present must be first/last).
575 std::swap(cig.back(), *++cig.rbegin());
576 }
577