1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2016 - 2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __GAPPEDALN_H__
22 #define __GAPPEDALN_H__
23 
24 #include "SuffixTree.h"
25 
26 enum dynprog {dynp_down, dynp_right, dynp_diag};
27 
28 class AlignRes {
29 public:
30     string cigar;
31     uint   gap_cnt;
32     uint   contiguity;
33     double pct_id;
34     uint   subj_pos;
AlignRes()35     AlignRes() {
36         this->gap_cnt    = 0;
37         this->contiguity = 0;
38         this->pct_id     = 0.0;
39         this->subj_pos   = 0;
40     }
AlignRes(string && cigar,uint gapcnt,uint contiguity,double pct_id)41     AlignRes(string&& cigar, uint gapcnt, uint contiguity, double pct_id) {
42         this->cigar      = move(cigar);
43         this->gap_cnt    = gapcnt;
44         this->contiguity = contiguity;
45         this->pct_id     = pct_id;
46         this->subj_pos   = 0;
47     }
AlignRes(string && cigar,uint gapcnt,uint contiguity,double pct_id,double pos)48     AlignRes(string&& cigar, uint gapcnt, uint contiguity, double pct_id, double pos) {
49         this->cigar      = move(cigar);
50         this->gap_cnt    = gapcnt;
51         this->contiguity = contiguity;
52         this->pct_id     = pct_id;
53         this->subj_pos   = pos;
54     }
clear()55     void clear() {
56         this->cigar.clear();
57         this->gap_cnt    = 0;
58         this->contiguity = 0;
59         this->pct_id     = 0.0;
60         this->subj_pos   = 0;
61     }
62 };
63 
64 class AlignPath {
65 public:
66     bool diag;
67     bool left;
68     bool up;
69 
AlignPath()70     AlignPath() {
71         diag = false;
72         left = false;
73         up   = false;
74     }
count()75     int count() {
76         int cnt;
77         cnt  = this->up   ? 1 : 0;
78         cnt += this->left ? 1 : 0;
79         cnt += this->diag ? 1 : 0;
80 
81         return cnt;
82     }
83 };
84 
85 //
86 // Needleman-Wunsch Alignment
87 //
88 static const double gapopen_score  = -10.0;
89 static const double gapext_score   =  -0.5;
90 static const double mismatch_score =  -4.0;
91 static const double match_score    =   5.0;
92 
93 class GappedAln {
94     uint        _m;
95     uint        _n;
96     uint        _m_size;
97     uint        _n_size;
98     uint        _max_score;   //
99     uint        _max_score_m; // For local alignment.
100     uint        _max_score_n; //
101     double    **matrix;
102     AlignPath **path;
103     AlignRes    _aln;
104 
105     inline int swap(double *, dynprog *, int, int);
106     int        trace_global_alignment(const string&, const string&);
107     int        trace_local_alignment(const string&, const string&);
108 
109     GappedAln& operator= (const GappedAln&) = delete;
110 
111  public:
112     GappedAln();
GappedAln(int i)113     GappedAln(int i) : GappedAln(i, i) {};
GappedAln(int m,int n)114     GappedAln(int m, int n) : GappedAln(m, n, false) {};
115     GappedAln(int, int, bool);
116     ~GappedAln();
117 
118     int init(int, int);
119     int init(int, int, bool);
120     int align(const string&, const string&);
121     int align_region(const string&, const string&, const size_t, const size_t, const size_t, const size_t);
122     int align_constrained(const string&, const string&, const vector<STAln> &);
123     const AlignRes& result();
124 
125     int parse_cigar(vector<pair<char, uint> > &);
126     int dump_alignment(const string&, const string&);
127 
128 private:
129     int bound_region(const int, const int, const int, const int);
130     int score(bool, const string&, const int, const int, const string&, const int, const int);
131 };
132 
GappedAln()133 GappedAln::GappedAln()
134 {
135     this->_m           = 0;
136     this->_n           = 0;
137     this->_m_size      = this->_m;
138     this->_n_size      = this->_n;
139     this->_max_score   = 0;
140     this->_max_score_m = 0;
141     this->_max_score_n = 0;
142     this->matrix       = NULL;
143     this->path         = NULL;
144 }
145 
GappedAln(int len_1,int len_2,bool initialize)146 GappedAln::GappedAln(int len_1, int len_2, bool initialize)
147 {
148     this->_m      = len_1 + 1;
149     this->_n      = len_2 + 1;
150     this->_m_size = this->_m;
151     this->_n_size = this->_n;
152     this->_max_score   = 0;
153     this->_max_score_m = 0;
154     this->_max_score_n = 0;
155 
156     this->matrix = new double * [this->_m];
157     for (uint i = 0; i < this->_m; i++)
158         this->matrix[i] = new double [this->_n];
159 
160     this->path = new AlignPath * [this->_m];
161     for (uint i = 0; i < this->_m; i++)
162         this->path[i] = new AlignPath [this->_n];
163 
164     if (initialize)
165         for (uint i = 0; i < this->_m; i++)
166             memset(this->matrix[i], 0, sizeof(double) * this->_n);
167 }
168 
~GappedAln()169 GappedAln::~GappedAln()
170 {
171     for (uint i = 0; i < this->_m_size; i++) {
172         delete [] this->matrix[i];
173         delete [] this->path[i];
174     }
175     delete [] this->matrix;
176     delete [] this->path;
177 }
178 
179 int
init(int size_1,int size_2)180 GappedAln::init(int size_1, int size_2)
181 {
182     return this->init(size_1, size_2, false);
183 }
184 
185 int
init(int size_1,int size_2,bool initialize)186 GappedAln::init(int size_1, int size_2, bool initialize)
187 {
188     //
189     // Resize the underlying matrix and path arrays, if necessary.
190     //
191     if ((size_1 + 1) > (int)this->_m_size || (size_2 + 1) > (int)this->_n_size) {
192         for (uint i = 0; i < this->_m_size; i++) {
193             delete [] this->matrix[i];
194             delete [] this->path[i];
195         }
196         if (this->_m_size > 0) {
197             delete [] this->matrix;
198             delete [] this->path;
199         }
200 
201         this->_m_size = size_1 + 1;
202         this->_n_size = size_2 + 1;
203         //
204         // Resize the arrays to be 25% larger than the requested size.
205         //
206         int new_size = this->_m_size > this->_n_size ? this->_m_size : this->_n_size;
207         new_size += int((double) new_size * 0.25);
208         this->_m_size = new_size;
209         this->_n_size = new_size;
210 
211         this->matrix = new double * [this->_m_size];
212         for (uint i = 0; i < this->_m_size; i++)
213             this->matrix[i] = new double [this->_n_size];
214 
215         this->path = new AlignPath * [this->_m_size];
216         for (uint i = 0; i < this->_m_size; i++)
217             this->path[i] = new AlignPath [this->_n_size];
218     }
219 
220     //
221     // Otherwise, set the dimensions of the matrix and path arrays to be the sequence lengths.
222     //
223     this->_m = size_1 + 1;
224     this->_n = size_2 + 1;
225 
226     if (initialize) {
227         for (uint i = 0; i < this->_m_size; i++)
228             memset(this->matrix[i], 0, sizeof(double) * this->_n_size);
229         this->_aln.clear();
230         this->_max_score   = 0;
231         this->_max_score_m = 0;
232         this->_max_score_n = 0;
233 
234     }
235 
236     return 0;
237 }
238 
239 int
align(const string & tag_1,const string & tag_2)240 GappedAln::align(const string& tag_1, const string& tag_2)
241 {
242     //         j---->        tag_2
243     //        [0][1][2][3]...[n-1]
244     //       +--------------------
245     // i [0] | [i][j]
246     // | [1] |
247     // | [2] |
248     // v [3] |
249     //   ... |
250     // [m-1] |
251     // tag_1
252     //
253 
254     //
255     // Initialize the first column and row of the dynamic programming
256     // matrix and the path array.
257     //
258     path[0][0].diag = false;
259     path[0][0].up   = false;
260     path[0][0].left = false;
261     matrix[0][0]    = 0.0;
262     for (uint i = 1; i < this->_m; i++) {
263         this->matrix[i][0]    = this->path[i - 1][0].up ? this->matrix[i - 1][0] + gapext_score : this->matrix[i - 1][0] + gapopen_score;
264         this->path[i][0].diag = false;
265         this->path[i][0].up   = true;
266         this->path[i][0].left = false;
267     }
268     for (uint j = 1; j < this->_n; j++) {
269         this->matrix[0][j]    = this->path[0][j - 1].left ? this->matrix[0][j - 1] + gapext_score : this->matrix[0][j - 1] + gapopen_score;
270         this->path[0][j].diag = false;
271         this->path[0][j].up   = false;
272         this->path[0][j].left = true;
273     }
274 
275     this->score(false, tag_1, 1, this->_m - 1, tag_2, 1, this->_n - 1);
276 
277     if (this->trace_global_alignment(tag_1, tag_2))
278         return 1;
279 
280     return 0;
281 }
282 
283 int
align_region(const string & query,const string & subj,const size_t q_start,const size_t q_end,const size_t s_start,const size_t s_end)284 GappedAln::align_region(const string& query,  const string& subj,
285                         const size_t q_start, const size_t q_end,
286                         const size_t s_start, const size_t s_end)
287 {
288     //         j---->      subject
289     //        [0][1][2][3]...[n-1]
290     //       +--------------------
291     // i [0] | [i][j]
292     // | [1] |
293     // | [2] |
294     // v [3] |
295     //   ... |
296     // [m-1] |
297     // query
298     this->bound_region(q_start, q_end + 1, s_start, s_end + 1);
299     this->score(true, query, q_start, q_end + 1, subj, s_start, s_end + 1);
300 
301     if (this->trace_local_alignment(query, subj))
302         return 1;
303 
304     return 0;
305 }
306 
307 int
align_constrained(const string & query,const string & subj,const vector<STAln> & alns)308 GappedAln::align_constrained(const string& query, const string& subj, const vector<STAln> &alns)
309 {
310     //         j---->      subject
311     //        [0][1][2][3]...[n-1]
312     //       +--------------------
313     // i [0] | [i][j]
314     // | [1] |
315     // | [2] |
316     // v [3] |
317     //   ... |
318     // [m-1] |
319     // query
320 
321     uint q_start, q_end, s_start, s_end;
322     uint q_len, q, s;
323 
324     //
325     // Does the first pre-aligned region start at the beginning of the query? If not, fill in the matrix.
326     //
327     if (alns.front().query_pos > 0) {
328         q_start = 1;
329         q_end   = 1 + alns.front().query_pos - 1;
330         q_len   = q_end;
331         s_start = (alns.front().subj_pos >= (q_len * 2)) ? 1 + alns.front().subj_pos - (q_len * 2) : 1;
332         s_end   = 1 + alns.front().subj_pos - 1;
333 
334         this->path[q_start][s_start].diag = false;
335         this->path[q_start][s_start].up   = false;
336         this->path[q_start][s_start].left = false;
337 
338         // cerr << "Filling start region; q_start: " << q_start << ", q_end: " << q_end << "; s_start: " << s_start << ", s_end: " << s_end << "\n";
339         this->bound_region(q_start, q_end, s_start, s_end);
340         this->score(true, query, q_start, q_end, subj, s_start, s_end);
341     }
342 
343     //
344     // Fill in each pre-aligned region of the sequences starting from the first fragment as ordered by
345     // the query sequence.
346     //
347     for (uint n = 0; n < alns.size(); n++) {
348 
349         for (uint i = alns[n].query_pos + 1; i <= alns[n].query_pos + alns[n].aln_len; i++) {
350             for (uint j = alns[n].subj_pos + 1; j <= alns[n].subj_pos + alns[n].aln_len; j++) {
351                 q = i - alns[n].query_pos;
352                 s = j - alns[n].subj_pos;
353                 if (q == s) {
354                     this->matrix[i][j] = this->matrix[i - 1][j - 1] + match_score;
355                     this->path[i][j].diag = true;
356                     this->path[i][j].up   = false;
357                     this->path[i][j].left = false;
358 
359                     if (this->matrix[i][j] > this->_max_score) {
360                         this->_max_score = this->matrix[i][j];
361                         this->_max_score_m = i;
362                         this->_max_score_n = j;
363                     }
364                 }
365             }
366         }
367 
368         //
369         // Next fill in the following connector sequence span the region between two pre-aligned regions.
370         //
371         uint m = n + 1;
372         if (m < alns.size()) {
373             q_start = 1 + alns[n].query_pos + alns[n].aln_len;
374             q_end   = 1 + alns[m].query_pos - 1;
375             s_start = 1 + alns[n].subj_pos + alns[n].aln_len;
376             s_end   = 1 + alns[m].subj_pos - 1;
377 
378             // cerr << "q_start: " << q_start << ", q_end: " << q_end << "; s_start: " << s_start << ", s_end: " << s_end << "\n";
379             this->bound_region(q_start, q_end, s_start, s_end);
380             this->score(true, query, q_start, q_end, subj, s_start, s_end);
381         }
382     }
383 
384     //
385     // Does the last pre-aligned region end at the end of the query? If not, fill in the matrix.
386     //
387     if (alns.back().query_pos + alns.back().aln_len != query.length()) {
388         q_start = 1 + alns.back().query_pos + alns.back().aln_len;
389         q_end   = query.length();
390 
391         assert(q_end >= q_start);
392 
393         q_len   = q_end - q_start + 1;
394         q       = 1 + alns.back().subj_pos + alns.back().aln_len;
395         s_start = (q < this->_n) ? q : this->_n - 1;
396         s_end   = (s_start + (q_len * 2) < this->_n) ? s_start + (q_len * 2) : this->_n - 1;
397 
398         // cerr << "Filling end region; q_start: " << q_start << ", q_end: " << q_end << "; s_start: " << s_start << ", s_end: " << s_end << "\n";
399         this->bound_region(q_start, q_end, s_start, s_end);
400         this->score(true, query, q_start, q_end, subj, s_start, s_end);
401     }
402 
403     if (this->trace_local_alignment(query, subj))
404         return 1;
405 
406     return 0;
407 }
408 
409 inline int
bound_region(const int q_start,const int q_end,const int s_start,const int s_end)410 GappedAln::bound_region(const int q_start, const int q_end,
411                         const int s_start, const int s_end)
412 {
413     //         j---->      subject
414     //        [0][1][2][3]...[n-1]
415     //       +--------------------
416     // i [0] | [i][j]
417     // | [1] |
418     // | [2] |
419     // v [3] |
420     //   ... |
421     // [m-1] |
422     // query
423 
424     int i_bnd, i_bnd_low, j_bnd, j_bnd_low;
425     //
426     // Bound the region we are about to score taking care not to cross out of the bounds of the matrix.
427     //
428     assert(q_start >= 0);
429     assert(s_start >= 0);
430     assert(q_end   >= 0);
431     assert(s_end   >= 0);
432 
433     //
434     // _n and _m are the length of the subject and query, respectively. however, as they are
435     // indexes into a zero-based array, their maximum value is _n - 1 and _m - 1.
436     //
437     j_bnd     = s_end + 1   > (int) this->_n - 1 ? this->_n - 1 : s_end + 1;
438     j_bnd_low = s_start - 1 < 0 ? 0 : s_start - 1;
439     i_bnd     = q_end + 1   > (int) this->_m - 1 ? this->_m - 1 : q_end + 1;
440     i_bnd_low = q_start - 1 < 0 ? 0 : q_start - 1;
441 
442     assert(j_bnd < (int) this->_n_size);
443     assert(i_bnd < (int) this->_m_size);
444 
445     double score_down, score_right;
446 
447     // First, bound the top row.
448     if (i_bnd_low >= 0 && s_start < (int) this->_n - 1)
449         for (int j = s_start; j <= j_bnd; j++) {
450             score_right  = this->matrix[i_bnd_low][j - 1];
451             score_right += this->path[i_bnd_low][j - 1].left ? gapext_score : gapopen_score;
452             this->matrix[i_bnd_low][j]    = score_right < 0 ? 0 : score_right;
453             this->path[i_bnd_low][j].diag = false;
454             this->path[i_bnd_low][j].up   = false;
455             this->path[i_bnd_low][j].left = true;
456         }
457 
458     // Second, fill the left column.
459     if (j_bnd_low >= 0)
460         for (int i = q_start; i <= i_bnd; i++) {
461             score_down  = this->matrix[i - 1][j_bnd_low];
462             score_down += this->path[i - 1][j_bnd_low].up ? gapext_score : gapopen_score;
463             this->matrix[i][j_bnd_low]    = score_down < 0 ? 0 : score_down;
464             this->path[i][j_bnd_low].diag = false;
465             this->path[i][j_bnd_low].up   = true;
466             this->path[i][j_bnd_low].left = false;
467         }
468 
469     // Third, fill the right column.
470     if (s_end + 1 < (int) this->_n)
471         for (int i = i_bnd_low; i <= i_bnd; i++) {
472             score_right  = this->matrix[i][j_bnd - 1];
473             score_right += this->path[i][j_bnd - 1].left ? gapext_score : gapopen_score;
474             this->matrix[i][j_bnd]    = score_right < 0 ? 0 : score_right;
475             this->path[i][j_bnd].diag = false;
476             this->path[i][j_bnd].up   = false;
477             this->path[i][j_bnd].left = true;
478         }
479 
480     // Fourth, bound the bottom row.
481     if (q_end + 1 < (int) this->_m)
482         for (int j = j_bnd_low; j <= j_bnd; j++) {
483             score_down  = this->matrix[i_bnd - 1][j];
484             score_down += this->path[i_bnd - 1][j].up ? gapext_score : gapopen_score;
485             this->matrix[i_bnd][j]    = score_down < 0 ? 0 : score_down;
486             this->path[i_bnd][j].diag = false;
487             this->path[i_bnd][j].up   = true;
488             this->path[i_bnd][j].left = false;
489         }
490 
491     return 0;
492 }
493 
494 inline int
score(bool local,const string & query,const int q_start,const int q_end,const string & subj,const int s_start,const int s_end)495 GappedAln::score(bool local,
496                  const string& query, const int q_start, const int q_end,
497                  const string& subj, const int s_start, const int s_end)
498 {
499     double  score_down, score_diag, score_right;
500     double  scores[3];
501     dynprog direction[3];
502 
503     //
504     // Score a region of the matrix.
505     //
506     for (int i = q_start; i <= q_end; i++) {
507         for (int j = s_start; j <= s_end; j++) {
508             // Calculate the score:
509             //   1) If we were to move down from the above cell.
510             score_down   = this->matrix[i - 1][j];
511             score_down  += this->path[i - 1][j].up ? gapext_score : gapopen_score;
512             //   2) If we were to move diagonally from the above and left cell.
513             score_diag   = this->matrix[i - 1][j - 1] + (query[i - 1] == subj[j - 1] ? match_score : mismatch_score);
514             //   3) If we were to move over from the cell left of us.
515             score_right  = this->matrix[i][j - 1];
516             score_right += this->path[i][j - 1].left ? gapext_score : gapopen_score;
517 
518             //
519             // Sort the scores, highest to lowest.
520             //
521             scores[0]    = score_down;
522             direction[0] = dynp_down;
523             scores[1]    = score_diag;
524             direction[1] = dynp_diag;
525             scores[2]    = score_right;
526             direction[2] = dynp_right;
527 
528             if (scores[0] < scores[1])
529                 this->swap(scores, direction, 0, 1);
530             if (scores[1] < scores[2])
531                 this->swap(scores, direction, 1, 2);
532             if (scores[0] < scores[1])
533                 this->swap(scores, direction, 0, 1);
534 
535             this->matrix[i][j] = scores[0];
536 
537             if (scores[0] > this->_max_score) {
538                 this->_max_score = scores[0];
539                 this->_max_score_m = i;
540                 this->_max_score_n = j;
541             }
542 
543             //
544             // If this is a local alignment and the score is zero or negative, mark this node
545             // as having no valid paths exiting it.
546             //
547             if (local && scores[0] <= 0) {
548                 this->path[i][j].diag = false;
549                 this->path[i][j].up   = false;
550                 this->path[i][j].left = false;
551                 continue;
552             }
553 
554             if (scores[0] > scores[1]) {
555                 //
556                 // One path is best.
557                 //
558                 switch (direction[0]) {
559                 case dynp_diag:
560                     this->path[i][j].diag = true;
561                     this->path[i][j].up   = false;
562                     this->path[i][j].left = false;
563                     break;
564                 case dynp_down:
565                     this->path[i][j].diag = false;
566                     this->path[i][j].up   = true;
567                     this->path[i][j].left = false;
568                     break;
569                 case dynp_right:
570                 default:
571                     this->path[i][j].diag = false;
572                     this->path[i][j].up   = false;
573                     this->path[i][j].left = true;
574                 }
575 
576             } else if (scores[0] == scores[1]) {
577                 //
578                 // Two of the paths are equivalent.
579                 //
580                 switch (direction[0]) {
581                 case dynp_diag:
582                     this->path[i][j].diag = true;
583 
584                     switch (direction[1]) {
585                     case dynp_down:
586                         this->path[i][j].up   = true;
587                         this->path[i][j].left = false;
588                         break;
589                     default:
590                     case dynp_right:
591                         this->path[i][j].up   = false;
592                         this->path[i][j].left = true;
593                         break;
594                     }
595                     break;
596                 case dynp_down:
597                     this->path[i][j].up = true;
598 
599                     switch (direction[1]) {
600                     case dynp_right:
601                         this->path[i][j].diag  = false;
602                         this->path[i][j].left = true;
603                         break;
604                     default:
605                     case dynp_diag:
606                         this->path[i][j].diag  = true;
607                         this->path[i][j].left = false;
608                         break;
609                     }
610                     break;
611                 default:
612                 case dynp_right:
613                     this->path[i][j].left = true;
614 
615                     switch (direction[1]) {
616                     case dynp_diag:
617                         this->path[i][j].diag = true;
618                         this->path[i][j].up   = false;
619                         break;
620                     default:
621                     case dynp_down:
622                         this->path[i][j].diag = false;
623                         this->path[i][j].up   = true;
624                         break;
625                     }
626                     break;
627                 }
628 
629             } else {
630                 //
631                 // All paths equivalent.
632                 //
633                 this->path[i][j].diag = true;
634                 this->path[i][j].up   = true;
635                 this->path[i][j].left = true;
636             }
637         }
638     }
639 
640     return 0;
641 }
642 
643 inline int
swap(double * scores,dynprog * direction,int index_1,int index_2)644 GappedAln::swap(double *scores, dynprog *direction, int index_1, int index_2)
645 {
646     double swap        = scores[index_1];
647     scores[index_1]    = scores[index_2];
648     scores[index_2]    = swap;
649     dynprog swapdir    = direction[index_1];
650     direction[index_1] = direction[index_2];
651     direction[index_2] = swapdir;
652 
653     return 0;
654 }
655 
656 bool
compare_alignres(const AlignRes & a,const AlignRes & b)657 compare_alignres(const AlignRes& a, const AlignRes& b)
658 {
659     if (a.gap_cnt == b.gap_cnt) {
660 
661         if (a.pct_id == b.pct_id)
662             return (a.contiguity > b.contiguity);
663         else
664             return (a.pct_id > b.pct_id);
665 
666     } else {
667         return (a.gap_cnt < b.gap_cnt);
668     }
669 }
670 
671 int
trace_global_alignment(const string & tag_1,const string & tag_2)672 GappedAln::trace_global_alignment(const string& tag_1, const string& tag_2)
673 {
674     //         j---->        tag_2
675     //        [0][1][2][3]...[n-1]
676     //       +--------------------
677     // i [0] | [i][j]
678     // | [1] |
679     // | [2] |
680     // v [3] |
681     //   ... |
682     // [m-1] |
683     // tag_1
684     //
685     int    i, j, cnt, len, gaps, contiguity;
686     double ident;
687     string cigar;
688     char   buf[id_len];
689 
690     vector<AlignRes> alns;
691     bool more_paths = true;
692     bool seq_break  = false;
693 
694     do {
695         more_paths = false;
696 
697         i = this->_m - 1;
698         j = this->_n - 1;
699 
700         string aln_1, aln_2;
701 
702         while (i > 0 || j > 0) {
703             cnt  = this->path[i][j].count();
704 
705             if (cnt > 1) more_paths = true;
706 
707             if (this->path[i][j].diag) {
708                 aln_1 += tag_1[i - 1];
709                 aln_2 += tag_2[j - 1];
710                 if (cnt > 1) this->path[i][j].diag = false;
711                 i--;
712                 j--;
713             } else if (this->path[i][j].up) {
714                 aln_1 += tag_1[i - 1];
715                 aln_2 += "-";
716                 if (cnt > 1) this->path[i][j].up = false;
717                 i--;
718             } else if (this->path[i][j].left) {
719                 aln_1 += "-";
720                 aln_2 += tag_2[j - 1];
721                 if (cnt > 1) this->path[i][j].left = false;
722                 j--;
723             }
724         }
725 
726         reverse(aln_1.begin(), aln_1.end());
727         reverse(aln_2.begin(), aln_2.end());
728 
729         //
730         // Convert to CIGAR strings.
731         //
732         cigar      = "";
733         len        = aln_1.length();
734         gaps       = 0;
735         contiguity = 0;
736         seq_break  = false;
737         ident      = 0.0;
738         i          = 0;
739         while (i < len) {
740             if (aln_1[i] != '-' && aln_2[i] != '-') {
741                 cnt = 0;
742                 do {
743                     if (aln_1[i] == aln_2[i]) ident++;
744                     cnt++;
745                     i++;
746                     if (seq_break == false) contiguity++;
747                 } while (i < len && aln_1[i] != '-' && aln_2[i] != '-');
748                 sprintf(buf, "%dM", cnt);
749 
750             } else if (aln_1[i] == '-') {
751                 cnt = 0;
752                 do {
753                     cnt++;
754                     i++;
755                 } while (i < len && aln_1[i] == '-');
756                 sprintf(buf, "%dD", cnt);
757                 gaps++;
758                 seq_break = true;
759 
760             } else {
761                 cnt = 0;
762                 do {
763                     cnt++;
764                     i++;
765                 } while (i < len && aln_2[i] == '-');
766                 sprintf(buf, "%dI", cnt);
767                 gaps++;
768                 seq_break = true;
769             }
770 
771             cigar += buf;
772         }
773 
774         alns.push_back(AlignRes(move(cigar), gaps, contiguity, (ident / (double) len)));
775 
776         // cerr << aln_1 << " [" << cigar << ", contiguity: " << contiguity << ", gaps: " << gaps << "]\n"
777         //      << aln_2 << "\n";
778 
779     } while (more_paths);
780 
781     sort(alns.begin(), alns.end(), compare_alignres);
782     this->_aln = alns[0];
783     // cerr << "Final alignment: " << this->_aln.cigar << "; contiguity: " << contiguity << "; gaps: " << this->_aln.gap_cnt << "\n";
784 
785     return 1;
786 }
787 
788 int
trace_local_alignment(const string & query,const string & subj)789 GappedAln::trace_local_alignment(const string& query, const string& subj)
790 {
791     //         j---->      subject
792     //        [0][1][2][3]...[n-1]
793     //       +--------------------
794     // i [0] | [i][j]
795     // | [1] |
796     // | [2] |
797     // v [3] |
798     //   ... |
799     // [m-1] |
800     // query
801     //
802     int    i, j, cnt, len, gaps, contiguity;
803     double ident;
804     string cigar;
805     char   buf[id_len];
806 
807     vector<AlignRes> alns;
808     bool more_paths  = true;
809     bool seq_break   = false;
810     int  query_start;
811 
812     do {
813         query_start = 0;
814         more_paths  = false;
815 
816         //
817         // For a local alignment, begin the trace at the matrix cell with the highest score.
818         //
819         i = this->_max_score_m;
820         j = this->_max_score_n;
821 
822         string aln_1, aln_2;
823 
824         while (i > 0 && j > 0) {
825             cnt  = this->path[i][j].count();
826 
827             if (cnt > 1) more_paths = true;
828 
829             if (this->path[i][j].diag) {
830                 aln_1 += query[i - 1];
831                 aln_2 += subj[j - 1];
832                 if (cnt > 1) this->path[i][j].diag = false;
833                 i--;
834                 j--;
835             } else if (this->path[i][j].up) {
836                 aln_1 += query[i - 1];
837                 aln_2 += "-";
838                 if (cnt > 1) this->path[i][j].up = false;
839                 i--;
840             } else if (this->path[i][j].left) {
841                 aln_1 += "-";
842                 aln_2 += subj[j - 1];
843                 if (cnt > 1) this->path[i][j].left = false;
844                 j--;
845             } else {
846                 //
847                 // Stop traversing the matrix when we reach a node with no paths from it.
848                 //
849                 query_start = i;
850                 break;
851             }
852         }
853 
854         if (i > 0 && j == 0)
855             query_start = i;
856 
857         reverse(aln_1.begin(), aln_1.end());
858         reverse(aln_2.begin(), aln_2.end());
859 
860         //
861         // Convert to CIGAR strings.
862         //
863         cigar = "";
864 
865         //
866         // If the local alignment didn't span to the beginning of the query, add
867         // the softmasking to the CIGAR string.
868         //
869         if (query_start > 0) {
870             sprintf(buf, "%dS", (int) query_start);
871             cigar += buf;
872         }
873 
874         len        = aln_1.length();
875         gaps       = 0;
876         contiguity = 0;
877         seq_break  = false;
878         ident      = 0.0;
879         i          = 0;
880         while (i < len) {
881             if (aln_1[i] != '-' && aln_2[i] != '-') {
882                 cnt = 0;
883                 do {
884                     if (aln_1[i] == aln_2[i]) ident++;
885                     cnt++;
886                     i++;
887                     if (seq_break == false) contiguity++;
888                 } while (i < len && aln_1[i] != '-' && aln_2[i] != '-');
889                 sprintf(buf, "%dM", cnt);
890 
891             } else if (aln_1[i] == '-') {
892                 cnt = 0;
893                 do {
894                     cnt++;
895                     i++;
896                 } while (i < len && aln_1[i] == '-');
897                 sprintf(buf, "%dD", cnt);
898                 gaps++;
899                 seq_break = true;
900 
901             } else {
902                 cnt = 0;
903                 do {
904                     cnt++;
905                     i++;
906                 } while (i < len && aln_2[i] == '-');
907                 sprintf(buf, "%dI", cnt);
908                 gaps++;
909                 seq_break = true;
910             }
911 
912             cigar += buf;
913         }
914 
915         //
916         // If the entire query was not aligned, add the softmasked bases to the cigar.
917         //
918         if (this->_max_score_m < query.length()) {
919             sprintf(buf, "%dS", (int) query.length() - this->_max_score_m);
920             cigar += buf;
921         }
922 
923         alns.push_back(AlignRes(move(cigar), gaps, contiguity, (ident / (double) len), (uint) j));
924 
925     } while (more_paths);
926 
927     sort(alns.begin(), alns.end(), compare_alignres);
928     this->_aln = alns[0];
929 
930     return 1;
931 }
932 
933 const AlignRes&
result()934 GappedAln::result()
935 {
936     return this->_aln;
937 }
938 
939 int
parse_cigar(vector<pair<char,uint>> & cigar)940 GappedAln::parse_cigar(vector<pair<char, uint> > &cigar)
941 {
942     char buf[id_len];
943     int  dist;
944     const char *p, *q;
945 
946     p = this->_aln.cigar.c_str();
947 
948     cigar.clear();
949 
950     while (*p != '\0') {
951         q = p + 1;
952 
953         while (*q != '\0' && isdigit(*q))
954             q++;
955         strncpy(buf, p, q - p);
956         buf[q-p] = '\0';
957         dist = atoi(buf);
958 
959         cigar.push_back(make_pair(*q, dist));
960 
961         p = q + 1;
962     }
963 
964     return 0;
965 }
966 
967 int
dump_alignment(const string & tag_1,const string & tag_2)968 GappedAln::dump_alignment(const string& tag_1, const string& tag_2)
969 {
970     //         j---->        tag_2
971     //        [0][1][2][3]...[n-1]
972     //       +--------------------
973     // i [0] | [i][j]
974     // | [1] |
975     // | [2] |
976     // v [3] |
977     //   ... |
978     // [m-1] |
979     // tag_1
980     //
981 
982     //
983     // Output the score matrix.
984     //
985     cout << "         ";
986     for (uint j = 0; j < this->_n - 1; j++)
987         cout << "   " << tag_2[j] << "  |";
988     cout << "\n";
989 
990     cout << "  ";
991     for (uint j = 0; j < this->_n; j++)
992         printf("% 6.1f|", this->matrix[0][j]);
993     cout << "\n";
994 
995     for (uint i = 1; i < this->_m; i++) {
996         cout << tag_1[i - 1] << " ";
997         for (uint j = 0; j < this->_n; j++)
998             printf("% 6.1f|", this->matrix[i][j]);
999         cout << "\n";
1000     }
1001 
1002     cout << "\n";
1003 
1004     //
1005     // Output the path matrix.
1006     //
1007     cout << "      ";
1008     for (uint j = 0; j < this->_n - 1; j++)
1009         cout << " " << tag_2[j] << " |";
1010     cout << "\n";
1011 
1012     cout << "  ";
1013     for (uint j = 0; j < this->_n; j++) {
1014         this->path[0][j].diag ? cout << "d" : cout << " ";
1015         this->path[0][j].up   ? cout << "u" : cout << " ";
1016         this->path[0][j].left ? cout << "l" : cout << " ";
1017         cout << "|";
1018     }
1019     cout << "\n";
1020 
1021     for (uint i = 1; i < this->_m; i++) {
1022         cout << tag_1[i - 1] << " ";
1023         for (uint j = 0; j < this->_n; j++) {
1024             this->path[i][j].diag ? cout << "d" : cout << " ";
1025             this->path[i][j].up   ? cout << "u" : cout << " ";
1026             this->path[i][j].left ? cout << "l" : cout << " ";
1027             cout << "|";
1028         }
1029         cout << "\n";
1030     }
1031 
1032     cout << "\n";
1033 
1034     return 0;
1035 }
1036 
1037 #endif // __GAPPEDALN_H__
1038