1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2016, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // aln_utils.cc -- common routines needed for dealing with gapped alignments.
23 //
24 
25 #include "aln_utils.h"
26 
27 // shell:
28 // echo "
29 // print(', '.join([ ('true' if chr(i) in set('0123456789=DHIMNPSX') else 'false') for i in range(256) ]))
30 // " | python3 | fold -s
31 const bool is_cigar_char[256] = {
32     false, false, false, false, false, false, false, false, false, false, false,
33     false, false, false, false, false, false, false, false, false, false, false,
34     false, false, false, false, false, false, false, false, false, false, false,
35     false, false, false, false, false, false, false, false, false, false, false,
36     false, false, false, false, true, true, true, true, true, true, true, true,
37     true, true, false, false, false, true, false, false, false, false, false,
38     false, true, false, false, false, true, true, false, false, false, true, true,
39     false, true, false, false, true, false, false, false, false, true, false,
40     false, false, false, false, false, false, false, false, false, false, false,
41     false, false, false, false, false, false, false, false, false, false, false,
42     false, false, false, false, false, false, false, false, false, false, false,
43     false, false, false, false, false, false, false, false, false, false, false,
44     false, false, false, false, false, false, false, false, false, false, false,
45     false, false, false, false, false, false, false, false, false, false, false,
46     false, false, false, false, false, false, false, false, false, false, false,
47     false, false, false, false, false, false, false, false, false, false, false,
48     false, false, false, false, false, false, false, false, false, false, false,
49     false, false, false, false, false, false, false, false, false, false, false,
50     false, false, false, false, false, false, false, false, false, false, false,
51     false, false, false, false, false, false, false, false, false, false, false,
52     false, false, false, false, false, false, false, false, false, false, false,
53     false, false, false, false, false, false, false, false, false, false, false,
54     false, false, false, false, false, false, false, false, false, false, false,
55     false
56 };
57 
operator <<(ostream & os,const Cigar & cig)58 ostream& operator<< (ostream& os, const Cigar& cig) {
59     for (auto op : cig)
60         os << op.second << (op.first == '\0' ? '?' : op.first);
61     return os;
62 }
63 
64 string
invert_cigar(string cigar)65 invert_cigar(string cigar)
66 {
67     for (uint i = 0; i < cigar.length(); i++) {
68         if (cigar[i] == 'I')
69             cigar[i] = 'D';
70         else if (cigar[i] == 'D')
71             cigar[i] = 'I';
72     }
73 
74     return cigar;
75 }
76 
77 int
invert_cigar(Cigar & cigar)78 invert_cigar(Cigar &cigar)
79 {
80     for (uint i = 0; i < cigar.size(); i++) {
81         if (cigar[i].first == 'I')
82             cigar[i].first = 'D';
83         else if (cigar[i].first == 'D')
84             cigar[i].first = 'I';
85     }
86 
87     return 0;
88 }
89 
90 int
convert_local_cigar_to_global(Cigar & cigar)91 convert_local_cigar_to_global(Cigar &cigar)
92 {
93     int  diff;
94     uint i   = 0;
95     uint len = cigar.size();
96     if (cigar.front().first == 'S') {
97         cigar.front().first = 'M';
98         i++;
99     }
100 
101     int indel_cnt = 0;
102 
103     for (; i < len; i++)
104         if (cigar[i].first == 'I')
105             indel_cnt += cigar[i].second;
106         else if (cigar[i].first == 'D')
107             indel_cnt -= cigar[i].second;
108 
109     if (indel_cnt != 0) {
110         if (cigar.back().first == 'S') {
111             diff = abs(indel_cnt) - cigar.back().second;
112             cigar.back().first = indel_cnt < 0 ? 'I' : 'D';
113             cigar.back().second = cigar.back().second - diff;
114             if (diff > 0)
115                 cigar.push_back({'S', diff});
116         } else {
117             diff = abs(indel_cnt);
118             cigar.push_back({indel_cnt < 0 ? 'I' : 'D', diff});
119         }
120     }
121     // for (; i < len - 1; i++) {
122     //     switch(cigar[i].first) {
123     //     case 'D':
124     //         if (cigar.back().first == 'S') {
125     //             diff = cigar.back().second - cigar[i].second;
126     //             cigar.back().first  = 'I';
127     //             cigar.back().second = cigar.back().second - diff;
128     //             if (diff > 0)
129     //                 cigar.push_back({'S', diff});
130     //         }
131     //         break;
132     //     case 'I':
133     //         if (cigar.back().first == 'S') {
134     //             diff = cigar.back().second - cigar[i].second;
135     //             cigar.back().first  = 'D';
136     //             cigar.back().second = cigar.back().second - diff;
137     //             if (diff > 0)
138     //                 cigar.push_back({'S', diff});
139     //         }
140     //         break;
141     //     }
142     // }
143 
144     Cigar consolidated_cigar;
145     i = 0;
146     uint msum = 0;
147     while (i < cigar.size() && cigar[i].first == 'M') {
148         msum += cigar[i].second;
149         i++;
150     }
151 
152     uint j    = cigar.size() - 1;
153     uint dsum = 0;
154     uint isum = 0;
155     while (j > 0 && cigar[j].first == 'D') {
156         dsum += cigar[j].second;
157         j--;
158     }
159     if (dsum == 0) {
160         j = cigar.size() - 1;
161         while (j > 0 && cigar[j].first == 'D') {
162             isum += cigar[j].second;
163             j--;
164         }
165     }
166 
167     if (msum > 0)
168         consolidated_cigar.push_back({'M', msum});
169     for (uint k = i; k < cigar.size() && k <= j; k++)
170         consolidated_cigar.push_back(cigar[k]);
171 
172     if (dsum > 0)
173         consolidated_cigar.push_back({'D', dsum});
174     else if (isum > 0)
175         consolidated_cigar.push_back({'I', dsum});
176 
177     cigar = consolidated_cigar;
178 
179     return 0;
180 }
181 
182 int
parse_cigar(const char * cigar_str,Cigar & cigar,bool check_correctness)183 parse_cigar(const char *cigar_str, Cigar &cigar, bool check_correctness)
184 {
185     assert(cigar_str && *cigar_str);
186     cigar.clear();
187     const char* p = cigar_str;
188     uint padded_len = 0;
189 
190     while(*p != '\0') {
191         char* q;
192         uint len = strtol(p, &q, 10);
193         char c = *q;
194 
195         if (check_correctness) {
196             if (q == p || c == '\0') {
197                 // No number or no qualifier, respectively.
198                 cerr << "Error: Malformed CIGAR string '" << cigar_str << "'.\n";
199                 throw exception();
200             }
201             switch (c) {
202             case 'M':
203             case '=':
204             case 'X':
205             case 'I':
206             case 'D':
207             case 'S':
208             case 'N':
209             case 'H':
210             case 'P':
211                 break;
212             default:
213                 cerr << "Warning: Unknown CIGAR operation '" << c << "' in '" << cigar_str << "'.\n";
214                 break;
215             }
216         }
217 
218         cigar.push_back({c, len});
219         padded_len += len;
220         p = q + 1;
221     }
222 
223     return padded_len;
224 }
225 
226 string
apply_cigar_to_seq(const char * seq,const Cigar & cigar)227 apply_cigar_to_seq(const char *seq, const Cigar &cigar)
228 {
229     uint   size = cigar.size();
230     char   op;
231     uint   dist, bp, edited_bp, stop;
232     string edited_seq;
233 
234     //
235     // Calculate the overall sequence length.
236     //
237     uint seqlen = 0;
238     for (uint i = 0; i < size; i++)
239         seqlen += cigar[i].second;
240 
241     bp  = 0;
242 
243     edited_seq.reserve(seqlen);
244 
245     for (uint i = 0; i < size; i++)  {
246         op   = cigar[i].first;
247         dist = cigar[i].second;
248 
249         switch(op) {
250         case 'S':
251             stop = bp + dist;
252             while (bp < stop) {
253                 edited_seq.push_back('N');
254                 bp++;
255             }
256             break;
257         case 'D':
258             edited_bp = 0;
259             while (edited_bp < dist) {
260                 edited_seq.push_back('N');
261                 edited_bp++;
262             }
263             break;
264         case 'I':
265         case 'M':
266             stop = bp + dist;
267             while (bp < stop) {
268                 edited_seq.push_back(seq[bp]);
269                 bp++;
270             }
271             break;
272         default:
273             break;
274         }
275     }
276 
277     return edited_seq;
278 }
279 
280 string
apply_cigar_to_model_seq(const char * seq,const Cigar & cigar)281 apply_cigar_to_model_seq(const char *seq, const Cigar &cigar)
282 {
283     uint   size = cigar.size();
284     char   op;
285     uint   dist, bp, edited_bp, stop;
286     string edited_seq;
287 
288     //
289     // Calculate the overall sequence length.
290     //
291     uint seqlen = 0;
292     for (uint i = 0; i < size; i++)
293         seqlen += cigar[i].second;
294 
295     bp  = 0;
296 
297     edited_seq.reserve(seqlen);
298 
299     for (uint i = 0; i < size; i++)  {
300         op   = cigar[i].first;
301         dist = cigar[i].second;
302 
303         switch(op) {
304         case 'S':
305             stop = bp + dist;
306             while (bp < stop) {
307                 edited_seq.push_back('U');
308                 bp++;
309             }
310             break;
311         case 'D':
312             edited_bp = 0;
313             while (edited_bp < dist) {
314                 edited_seq.push_back('U');
315                 edited_bp++;
316             }
317             break;
318         case 'I':
319         case 'M':
320             stop = bp + dist;
321             while (bp < stop) {
322                 edited_seq.push_back(seq[bp]);
323                 bp++;
324             }
325             break;
326         default:
327             break;
328         }
329     }
330 
331     return edited_seq;
332 }
333 
334 int
apply_cigar_to_seq(char * seq,uint seq_len,const char * old_seq,Cigar & cigar)335 apply_cigar_to_seq(char *seq, uint seq_len, const char *old_seq, Cigar &cigar)
336 {
337     uint   size = cigar.size();
338     char   op;
339     uint   dist, bp, seq_bp, stop;
340 
341     bp         = 0;
342     seq_bp     = 0;
343 
344     for (uint i = 0; i < size; i++)  {
345         op   = cigar[i].first;
346         dist = cigar[i].second;
347 
348         switch(op) {
349         case 'S':
350             stop = seq_bp + dist;
351             stop = stop > seq_len ? seq_len : stop;
352             while (seq_bp < stop) {
353                 seq[seq_bp] = 'N';
354                 seq_bp++;
355                 bp++;
356             }
357             break;
358         case 'D':
359             stop = seq_bp + dist;
360             stop = stop > seq_len ? seq_len : stop;
361             while (seq_bp < stop) {
362                 seq[seq_bp] = 'N';
363                 seq_bp++;
364             }
365             break;
366         case 'I':
367         case 'M':
368             stop = bp + dist;
369             stop = stop > seq_len ? seq_len : stop;
370             while (bp < stop) {
371                 seq[seq_bp] = old_seq[bp];
372                 seq_bp++;
373                 bp++;
374             }
375             break;
376         default:
377             break;
378         }
379     }
380 
381     seq[seq_len] = '\0';
382 
383     return 0;
384 }
385 
386 int
apply_cigar_to_model_seq(char * seq,uint seq_len,const char * model,Cigar & cigar)387 apply_cigar_to_model_seq(char *seq, uint seq_len, const char *model, Cigar &cigar)
388 {
389     uint   size = cigar.size();
390     char   op;
391     uint   dist, model_bp, seq_bp, stop;
392 
393     model_bp  = 0;
394     seq_bp    = 0;
395 
396     for (uint i = 0; i < size; i++)  {
397         op   = cigar[i].first;
398         dist = cigar[i].second;
399 
400         switch(op) {
401         case 'S':
402             stop = seq_bp + dist;
403             stop = stop > seq_len ? seq_len : stop;
404             while (seq_bp < stop) {
405                 seq[seq_bp] = 'U';
406                 seq_bp++;
407                 model_bp++;
408             }
409             break;
410         case 'D':
411             stop = seq_bp + dist;
412             stop = stop > seq_len ? seq_len : stop;
413             while (seq_bp < stop) {
414                 seq[seq_bp] = 'U';
415                 seq_bp++;
416             }
417             break;
418         case 'I':
419         case 'M':
420             stop = model_bp + dist;
421             stop = stop > seq_len ? seq_len : stop;
422             while (model_bp < stop) {
423                 seq[seq_bp] = model[model_bp];
424                 seq_bp++;
425                 model_bp++;
426             }
427             break;
428         default:
429             break;
430         }
431     }
432 
433     seq[seq_len] = '\0';
434 
435     return 0;
436 }
437 
438 string
remove_cigar_from_seq(const char * seq,const Cigar & cigar)439 remove_cigar_from_seq(const char *seq, const Cigar &cigar)
440 {
441     uint   size = cigar.size();
442     char   op;
443     uint   dist, bp, edited_bp, stop;
444     string edited_seq;
445 
446     //
447     // Calculate the overall sequence length.
448     //
449     uint seqlen = 0;
450     for (uint i = 0; i < size; i++)
451         seqlen += cigar[i].first != 'D' ? cigar[i].second : 0;
452 
453     bp  = 0;
454 
455     edited_seq.reserve(seqlen);
456 
457     for (uint i = 0; i < size; i++)  {
458         op   = cigar[i].first;
459         dist = cigar[i].second;
460 
461         switch(op) {
462         case 'D':
463             edited_bp = 0;
464             while (edited_bp < dist) {
465                 edited_bp++;
466                 bp++;
467             }
468             break;
469         case 'I':
470         case 'M':
471             stop = bp + dist;
472             while (bp < stop) {
473                 edited_seq.push_back(seq[bp]);
474                 bp++;
475             }
476             break;
477         default:
478             break;
479         }
480     }
481 
482     return edited_seq;
483 }
484 
cigar_lengths(const Cigar & cigar)485 std::tuple<uint,uint,uint> cigar_lengths(const Cigar& cigar) {
486     uint padded_len = 0;
487     uint ref_len = 0;
488     uint seq_len = 0;
489     for (vector<pair<char, uint>>::const_iterator op = cigar.begin(); op != cigar.end(); ++op) {
490         padded_len += op->second;
491         switch (op->first) {
492         case 'M':
493         case '=':
494         case 'X':
495             // Consume both ref & seq.
496             ref_len += op->second;
497             seq_len += op->second;
498             break;
499         case 'I':
500         case 'S':
501             // Consume seq.
502             seq_len += op->second;
503             break;
504         case 'D':
505         case 'N':
506         case 'H':
507         case 'P':
508             // Consume ref.
509             ref_len += op->second;
510             break;
511         default:
512             DOES_NOT_HAPPEN;
513             break;
514         }
515     }
516 
517     return std::make_tuple(padded_len, ref_len, seq_len);
518 }
519 
cigar_simplify_to_MDI(Cigar & cig)520 void cigar_simplify_to_MDI(Cigar& cig) {
521     if (cig.empty())
522         return;
523 
524     // Replace operations with the relevant equivalent in "MDI".
525     for (auto& op : cig) {
526         switch (op.first) {
527         case 'M':
528         case 'D':
529         case 'I':
530             break;
531         case '=':
532         case 'X':
533             op.first = 'M'; break;
534         case 'S':
535             op.first = 'I'; break;
536         case 'N':
537         case 'H':
538         case 'P':
539             op.first = 'D'; break;
540         default:
541             DOES_NOT_HAPPEN;
542             break;
543         }
544     }
545 
546     // Collapse identical successive operations.
547     auto prev = cig.rbegin();
548     auto op = ++cig.rbegin(); // n.b. `cig` isn't empty.
549     while(op != cig.rend()) {
550         if (op->first == prev->first) {
551             op->second += prev->second;
552             prev->first = '\0'; // Marked for deletion.
553         }
554         ++prev;
555         ++op;
556     }
557     cig.erase(std::remove_if(
558             cig.begin(), cig.end(),
559             [](const pair<char,uint>& op){return op.first == '\0';}
560             ), cig.end());
561 }
562 
cigar_canonicalize_MDI_order(Cigar & cig)563 void cigar_canonicalize_MDI_order(Cigar& cig) {
564     assert(!cig.empty());
565     assert(cig.front().first == 'M' || cig.front().first == 'D' || cig.front().first == 'I');
566     for (auto op0=cig.begin(), op1=++cig.begin(); op1!=cig.end(); ++op0, ++op1) {
567         assert(op1->first == 'M' || op1->first == 'D' || op1->first == 'I');
568         assert(op0->first != op1->first); // Identical successive operations should have been collapsed.
569         if (op0->first == 'I' && op1->first == 'D')
570             // The M's never move, and it's always 'D' before 'I'...
571             std::swap(*op0, *op1);
572     }
573     if (cig.size() >= 2 && cig.front().first == 'D' && cig[1].first == 'I')
574         // ...except at the beginning (because I ~ S, and S when present must be first/last).
575         std::swap(cig.back(), *++cig.rbegin());
576 }
577