1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2016 - 2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 #ifndef __GAPPEDALN_H__
22 #define __GAPPEDALN_H__
23
24 #include "SuffixTree.h"
25
26 enum dynprog {dynp_down, dynp_right, dynp_diag};
27
28 class AlignRes {
29 public:
30 string cigar;
31 uint gap_cnt;
32 uint contiguity;
33 double pct_id;
34 uint subj_pos;
AlignRes()35 AlignRes() {
36 this->gap_cnt = 0;
37 this->contiguity = 0;
38 this->pct_id = 0.0;
39 this->subj_pos = 0;
40 }
AlignRes(string && cigar,uint gapcnt,uint contiguity,double pct_id)41 AlignRes(string&& cigar, uint gapcnt, uint contiguity, double pct_id) {
42 this->cigar = move(cigar);
43 this->gap_cnt = gapcnt;
44 this->contiguity = contiguity;
45 this->pct_id = pct_id;
46 this->subj_pos = 0;
47 }
AlignRes(string && cigar,uint gapcnt,uint contiguity,double pct_id,double pos)48 AlignRes(string&& cigar, uint gapcnt, uint contiguity, double pct_id, double pos) {
49 this->cigar = move(cigar);
50 this->gap_cnt = gapcnt;
51 this->contiguity = contiguity;
52 this->pct_id = pct_id;
53 this->subj_pos = pos;
54 }
clear()55 void clear() {
56 this->cigar.clear();
57 this->gap_cnt = 0;
58 this->contiguity = 0;
59 this->pct_id = 0.0;
60 this->subj_pos = 0;
61 }
62 };
63
64 class AlignPath {
65 public:
66 bool diag;
67 bool left;
68 bool up;
69
AlignPath()70 AlignPath() {
71 diag = false;
72 left = false;
73 up = false;
74 }
count()75 int count() {
76 int cnt;
77 cnt = this->up ? 1 : 0;
78 cnt += this->left ? 1 : 0;
79 cnt += this->diag ? 1 : 0;
80
81 return cnt;
82 }
83 };
84
85 //
86 // Needleman-Wunsch Alignment
87 //
88 static const double gapopen_score = -10.0;
89 static const double gapext_score = -0.5;
90 static const double mismatch_score = -4.0;
91 static const double match_score = 5.0;
92
93 class GappedAln {
94 uint _m;
95 uint _n;
96 uint _m_size;
97 uint _n_size;
98 uint _max_score; //
99 uint _max_score_m; // For local alignment.
100 uint _max_score_n; //
101 double **matrix;
102 AlignPath **path;
103 AlignRes _aln;
104
105 inline int swap(double *, dynprog *, int, int);
106 int trace_global_alignment(const string&, const string&);
107 int trace_local_alignment(const string&, const string&);
108
109 GappedAln& operator= (const GappedAln&) = delete;
110
111 public:
112 GappedAln();
GappedAln(int i)113 GappedAln(int i) : GappedAln(i, i) {};
GappedAln(int m,int n)114 GappedAln(int m, int n) : GappedAln(m, n, false) {};
115 GappedAln(int, int, bool);
116 ~GappedAln();
117
118 int init(int, int);
119 int init(int, int, bool);
120 int align(const string&, const string&);
121 int align_region(const string&, const string&, const size_t, const size_t, const size_t, const size_t);
122 int align_constrained(const string&, const string&, const vector<STAln> &);
123 const AlignRes& result();
124
125 int parse_cigar(vector<pair<char, uint> > &);
126 int dump_alignment(const string&, const string&);
127
128 private:
129 int bound_region(const int, const int, const int, const int);
130 int score(bool, const string&, const int, const int, const string&, const int, const int);
131 };
132
GappedAln()133 GappedAln::GappedAln()
134 {
135 this->_m = 0;
136 this->_n = 0;
137 this->_m_size = this->_m;
138 this->_n_size = this->_n;
139 this->_max_score = 0;
140 this->_max_score_m = 0;
141 this->_max_score_n = 0;
142 this->matrix = NULL;
143 this->path = NULL;
144 }
145
GappedAln(int len_1,int len_2,bool initialize)146 GappedAln::GappedAln(int len_1, int len_2, bool initialize)
147 {
148 this->_m = len_1 + 1;
149 this->_n = len_2 + 1;
150 this->_m_size = this->_m;
151 this->_n_size = this->_n;
152 this->_max_score = 0;
153 this->_max_score_m = 0;
154 this->_max_score_n = 0;
155
156 this->matrix = new double * [this->_m];
157 for (uint i = 0; i < this->_m; i++)
158 this->matrix[i] = new double [this->_n];
159
160 this->path = new AlignPath * [this->_m];
161 for (uint i = 0; i < this->_m; i++)
162 this->path[i] = new AlignPath [this->_n];
163
164 if (initialize)
165 for (uint i = 0; i < this->_m; i++)
166 memset(this->matrix[i], 0, sizeof(double) * this->_n);
167 }
168
~GappedAln()169 GappedAln::~GappedAln()
170 {
171 for (uint i = 0; i < this->_m_size; i++) {
172 delete [] this->matrix[i];
173 delete [] this->path[i];
174 }
175 delete [] this->matrix;
176 delete [] this->path;
177 }
178
179 int
init(int size_1,int size_2)180 GappedAln::init(int size_1, int size_2)
181 {
182 return this->init(size_1, size_2, false);
183 }
184
185 int
init(int size_1,int size_2,bool initialize)186 GappedAln::init(int size_1, int size_2, bool initialize)
187 {
188 //
189 // Resize the underlying matrix and path arrays, if necessary.
190 //
191 if ((size_1 + 1) > (int)this->_m_size || (size_2 + 1) > (int)this->_n_size) {
192 for (uint i = 0; i < this->_m_size; i++) {
193 delete [] this->matrix[i];
194 delete [] this->path[i];
195 }
196 if (this->_m_size > 0) {
197 delete [] this->matrix;
198 delete [] this->path;
199 }
200
201 this->_m_size = size_1 + 1;
202 this->_n_size = size_2 + 1;
203 //
204 // Resize the arrays to be 25% larger than the requested size.
205 //
206 int new_size = this->_m_size > this->_n_size ? this->_m_size : this->_n_size;
207 new_size += int((double) new_size * 0.25);
208 this->_m_size = new_size;
209 this->_n_size = new_size;
210
211 this->matrix = new double * [this->_m_size];
212 for (uint i = 0; i < this->_m_size; i++)
213 this->matrix[i] = new double [this->_n_size];
214
215 this->path = new AlignPath * [this->_m_size];
216 for (uint i = 0; i < this->_m_size; i++)
217 this->path[i] = new AlignPath [this->_n_size];
218 }
219
220 //
221 // Otherwise, set the dimensions of the matrix and path arrays to be the sequence lengths.
222 //
223 this->_m = size_1 + 1;
224 this->_n = size_2 + 1;
225
226 if (initialize) {
227 for (uint i = 0; i < this->_m_size; i++)
228 memset(this->matrix[i], 0, sizeof(double) * this->_n_size);
229 this->_aln.clear();
230 this->_max_score = 0;
231 this->_max_score_m = 0;
232 this->_max_score_n = 0;
233
234 }
235
236 return 0;
237 }
238
239 int
align(const string & tag_1,const string & tag_2)240 GappedAln::align(const string& tag_1, const string& tag_2)
241 {
242 // j----> tag_2
243 // [0][1][2][3]...[n-1]
244 // +--------------------
245 // i [0] | [i][j]
246 // | [1] |
247 // | [2] |
248 // v [3] |
249 // ... |
250 // [m-1] |
251 // tag_1
252 //
253
254 //
255 // Initialize the first column and row of the dynamic programming
256 // matrix and the path array.
257 //
258 path[0][0].diag = false;
259 path[0][0].up = false;
260 path[0][0].left = false;
261 matrix[0][0] = 0.0;
262 for (uint i = 1; i < this->_m; i++) {
263 this->matrix[i][0] = this->path[i - 1][0].up ? this->matrix[i - 1][0] + gapext_score : this->matrix[i - 1][0] + gapopen_score;
264 this->path[i][0].diag = false;
265 this->path[i][0].up = true;
266 this->path[i][0].left = false;
267 }
268 for (uint j = 1; j < this->_n; j++) {
269 this->matrix[0][j] = this->path[0][j - 1].left ? this->matrix[0][j - 1] + gapext_score : this->matrix[0][j - 1] + gapopen_score;
270 this->path[0][j].diag = false;
271 this->path[0][j].up = false;
272 this->path[0][j].left = true;
273 }
274
275 this->score(false, tag_1, 1, this->_m - 1, tag_2, 1, this->_n - 1);
276
277 if (this->trace_global_alignment(tag_1, tag_2))
278 return 1;
279
280 return 0;
281 }
282
283 int
align_region(const string & query,const string & subj,const size_t q_start,const size_t q_end,const size_t s_start,const size_t s_end)284 GappedAln::align_region(const string& query, const string& subj,
285 const size_t q_start, const size_t q_end,
286 const size_t s_start, const size_t s_end)
287 {
288 // j----> subject
289 // [0][1][2][3]...[n-1]
290 // +--------------------
291 // i [0] | [i][j]
292 // | [1] |
293 // | [2] |
294 // v [3] |
295 // ... |
296 // [m-1] |
297 // query
298 this->bound_region(q_start, q_end + 1, s_start, s_end + 1);
299 this->score(true, query, q_start, q_end + 1, subj, s_start, s_end + 1);
300
301 if (this->trace_local_alignment(query, subj))
302 return 1;
303
304 return 0;
305 }
306
307 int
align_constrained(const string & query,const string & subj,const vector<STAln> & alns)308 GappedAln::align_constrained(const string& query, const string& subj, const vector<STAln> &alns)
309 {
310 // j----> subject
311 // [0][1][2][3]...[n-1]
312 // +--------------------
313 // i [0] | [i][j]
314 // | [1] |
315 // | [2] |
316 // v [3] |
317 // ... |
318 // [m-1] |
319 // query
320
321 uint q_start, q_end, s_start, s_end;
322 uint q_len, q, s;
323
324 //
325 // Does the first pre-aligned region start at the beginning of the query? If not, fill in the matrix.
326 //
327 if (alns.front().query_pos > 0) {
328 q_start = 1;
329 q_end = 1 + alns.front().query_pos - 1;
330 q_len = q_end;
331 s_start = (alns.front().subj_pos >= (q_len * 2)) ? 1 + alns.front().subj_pos - (q_len * 2) : 1;
332 s_end = 1 + alns.front().subj_pos - 1;
333
334 this->path[q_start][s_start].diag = false;
335 this->path[q_start][s_start].up = false;
336 this->path[q_start][s_start].left = false;
337
338 // cerr << "Filling start region; q_start: " << q_start << ", q_end: " << q_end << "; s_start: " << s_start << ", s_end: " << s_end << "\n";
339 this->bound_region(q_start, q_end, s_start, s_end);
340 this->score(true, query, q_start, q_end, subj, s_start, s_end);
341 }
342
343 //
344 // Fill in each pre-aligned region of the sequences starting from the first fragment as ordered by
345 // the query sequence.
346 //
347 for (uint n = 0; n < alns.size(); n++) {
348
349 for (uint i = alns[n].query_pos + 1; i <= alns[n].query_pos + alns[n].aln_len; i++) {
350 for (uint j = alns[n].subj_pos + 1; j <= alns[n].subj_pos + alns[n].aln_len; j++) {
351 q = i - alns[n].query_pos;
352 s = j - alns[n].subj_pos;
353 if (q == s) {
354 this->matrix[i][j] = this->matrix[i - 1][j - 1] + match_score;
355 this->path[i][j].diag = true;
356 this->path[i][j].up = false;
357 this->path[i][j].left = false;
358
359 if (this->matrix[i][j] > this->_max_score) {
360 this->_max_score = this->matrix[i][j];
361 this->_max_score_m = i;
362 this->_max_score_n = j;
363 }
364 }
365 }
366 }
367
368 //
369 // Next fill in the following connector sequence span the region between two pre-aligned regions.
370 //
371 uint m = n + 1;
372 if (m < alns.size()) {
373 q_start = 1 + alns[n].query_pos + alns[n].aln_len;
374 q_end = 1 + alns[m].query_pos - 1;
375 s_start = 1 + alns[n].subj_pos + alns[n].aln_len;
376 s_end = 1 + alns[m].subj_pos - 1;
377
378 // cerr << "q_start: " << q_start << ", q_end: " << q_end << "; s_start: " << s_start << ", s_end: " << s_end << "\n";
379 this->bound_region(q_start, q_end, s_start, s_end);
380 this->score(true, query, q_start, q_end, subj, s_start, s_end);
381 }
382 }
383
384 //
385 // Does the last pre-aligned region end at the end of the query? If not, fill in the matrix.
386 //
387 if (alns.back().query_pos + alns.back().aln_len != query.length()) {
388 q_start = 1 + alns.back().query_pos + alns.back().aln_len;
389 q_end = query.length();
390
391 assert(q_end >= q_start);
392
393 q_len = q_end - q_start + 1;
394 q = 1 + alns.back().subj_pos + alns.back().aln_len;
395 s_start = (q < this->_n) ? q : this->_n - 1;
396 s_end = (s_start + (q_len * 2) < this->_n) ? s_start + (q_len * 2) : this->_n - 1;
397
398 // cerr << "Filling end region; q_start: " << q_start << ", q_end: " << q_end << "; s_start: " << s_start << ", s_end: " << s_end << "\n";
399 this->bound_region(q_start, q_end, s_start, s_end);
400 this->score(true, query, q_start, q_end, subj, s_start, s_end);
401 }
402
403 if (this->trace_local_alignment(query, subj))
404 return 1;
405
406 return 0;
407 }
408
409 inline int
bound_region(const int q_start,const int q_end,const int s_start,const int s_end)410 GappedAln::bound_region(const int q_start, const int q_end,
411 const int s_start, const int s_end)
412 {
413 // j----> subject
414 // [0][1][2][3]...[n-1]
415 // +--------------------
416 // i [0] | [i][j]
417 // | [1] |
418 // | [2] |
419 // v [3] |
420 // ... |
421 // [m-1] |
422 // query
423
424 int i_bnd, i_bnd_low, j_bnd, j_bnd_low;
425 //
426 // Bound the region we are about to score taking care not to cross out of the bounds of the matrix.
427 //
428 assert(q_start >= 0);
429 assert(s_start >= 0);
430 assert(q_end >= 0);
431 assert(s_end >= 0);
432
433 //
434 // _n and _m are the length of the subject and query, respectively. however, as they are
435 // indexes into a zero-based array, their maximum value is _n - 1 and _m - 1.
436 //
437 j_bnd = s_end + 1 > (int) this->_n - 1 ? this->_n - 1 : s_end + 1;
438 j_bnd_low = s_start - 1 < 0 ? 0 : s_start - 1;
439 i_bnd = q_end + 1 > (int) this->_m - 1 ? this->_m - 1 : q_end + 1;
440 i_bnd_low = q_start - 1 < 0 ? 0 : q_start - 1;
441
442 assert(j_bnd < (int) this->_n_size);
443 assert(i_bnd < (int) this->_m_size);
444
445 double score_down, score_right;
446
447 // First, bound the top row.
448 if (i_bnd_low >= 0 && s_start < (int) this->_n - 1)
449 for (int j = s_start; j <= j_bnd; j++) {
450 score_right = this->matrix[i_bnd_low][j - 1];
451 score_right += this->path[i_bnd_low][j - 1].left ? gapext_score : gapopen_score;
452 this->matrix[i_bnd_low][j] = score_right < 0 ? 0 : score_right;
453 this->path[i_bnd_low][j].diag = false;
454 this->path[i_bnd_low][j].up = false;
455 this->path[i_bnd_low][j].left = true;
456 }
457
458 // Second, fill the left column.
459 if (j_bnd_low >= 0)
460 for (int i = q_start; i <= i_bnd; i++) {
461 score_down = this->matrix[i - 1][j_bnd_low];
462 score_down += this->path[i - 1][j_bnd_low].up ? gapext_score : gapopen_score;
463 this->matrix[i][j_bnd_low] = score_down < 0 ? 0 : score_down;
464 this->path[i][j_bnd_low].diag = false;
465 this->path[i][j_bnd_low].up = true;
466 this->path[i][j_bnd_low].left = false;
467 }
468
469 // Third, fill the right column.
470 if (s_end + 1 < (int) this->_n)
471 for (int i = i_bnd_low; i <= i_bnd; i++) {
472 score_right = this->matrix[i][j_bnd - 1];
473 score_right += this->path[i][j_bnd - 1].left ? gapext_score : gapopen_score;
474 this->matrix[i][j_bnd] = score_right < 0 ? 0 : score_right;
475 this->path[i][j_bnd].diag = false;
476 this->path[i][j_bnd].up = false;
477 this->path[i][j_bnd].left = true;
478 }
479
480 // Fourth, bound the bottom row.
481 if (q_end + 1 < (int) this->_m)
482 for (int j = j_bnd_low; j <= j_bnd; j++) {
483 score_down = this->matrix[i_bnd - 1][j];
484 score_down += this->path[i_bnd - 1][j].up ? gapext_score : gapopen_score;
485 this->matrix[i_bnd][j] = score_down < 0 ? 0 : score_down;
486 this->path[i_bnd][j].diag = false;
487 this->path[i_bnd][j].up = true;
488 this->path[i_bnd][j].left = false;
489 }
490
491 return 0;
492 }
493
494 inline int
score(bool local,const string & query,const int q_start,const int q_end,const string & subj,const int s_start,const int s_end)495 GappedAln::score(bool local,
496 const string& query, const int q_start, const int q_end,
497 const string& subj, const int s_start, const int s_end)
498 {
499 double score_down, score_diag, score_right;
500 double scores[3];
501 dynprog direction[3];
502
503 //
504 // Score a region of the matrix.
505 //
506 for (int i = q_start; i <= q_end; i++) {
507 for (int j = s_start; j <= s_end; j++) {
508 // Calculate the score:
509 // 1) If we were to move down from the above cell.
510 score_down = this->matrix[i - 1][j];
511 score_down += this->path[i - 1][j].up ? gapext_score : gapopen_score;
512 // 2) If we were to move diagonally from the above and left cell.
513 score_diag = this->matrix[i - 1][j - 1] + (query[i - 1] == subj[j - 1] ? match_score : mismatch_score);
514 // 3) If we were to move over from the cell left of us.
515 score_right = this->matrix[i][j - 1];
516 score_right += this->path[i][j - 1].left ? gapext_score : gapopen_score;
517
518 //
519 // Sort the scores, highest to lowest.
520 //
521 scores[0] = score_down;
522 direction[0] = dynp_down;
523 scores[1] = score_diag;
524 direction[1] = dynp_diag;
525 scores[2] = score_right;
526 direction[2] = dynp_right;
527
528 if (scores[0] < scores[1])
529 this->swap(scores, direction, 0, 1);
530 if (scores[1] < scores[2])
531 this->swap(scores, direction, 1, 2);
532 if (scores[0] < scores[1])
533 this->swap(scores, direction, 0, 1);
534
535 this->matrix[i][j] = scores[0];
536
537 if (scores[0] > this->_max_score) {
538 this->_max_score = scores[0];
539 this->_max_score_m = i;
540 this->_max_score_n = j;
541 }
542
543 //
544 // If this is a local alignment and the score is zero or negative, mark this node
545 // as having no valid paths exiting it.
546 //
547 if (local && scores[0] <= 0) {
548 this->path[i][j].diag = false;
549 this->path[i][j].up = false;
550 this->path[i][j].left = false;
551 continue;
552 }
553
554 if (scores[0] > scores[1]) {
555 //
556 // One path is best.
557 //
558 switch (direction[0]) {
559 case dynp_diag:
560 this->path[i][j].diag = true;
561 this->path[i][j].up = false;
562 this->path[i][j].left = false;
563 break;
564 case dynp_down:
565 this->path[i][j].diag = false;
566 this->path[i][j].up = true;
567 this->path[i][j].left = false;
568 break;
569 case dynp_right:
570 default:
571 this->path[i][j].diag = false;
572 this->path[i][j].up = false;
573 this->path[i][j].left = true;
574 }
575
576 } else if (scores[0] == scores[1]) {
577 //
578 // Two of the paths are equivalent.
579 //
580 switch (direction[0]) {
581 case dynp_diag:
582 this->path[i][j].diag = true;
583
584 switch (direction[1]) {
585 case dynp_down:
586 this->path[i][j].up = true;
587 this->path[i][j].left = false;
588 break;
589 default:
590 case dynp_right:
591 this->path[i][j].up = false;
592 this->path[i][j].left = true;
593 break;
594 }
595 break;
596 case dynp_down:
597 this->path[i][j].up = true;
598
599 switch (direction[1]) {
600 case dynp_right:
601 this->path[i][j].diag = false;
602 this->path[i][j].left = true;
603 break;
604 default:
605 case dynp_diag:
606 this->path[i][j].diag = true;
607 this->path[i][j].left = false;
608 break;
609 }
610 break;
611 default:
612 case dynp_right:
613 this->path[i][j].left = true;
614
615 switch (direction[1]) {
616 case dynp_diag:
617 this->path[i][j].diag = true;
618 this->path[i][j].up = false;
619 break;
620 default:
621 case dynp_down:
622 this->path[i][j].diag = false;
623 this->path[i][j].up = true;
624 break;
625 }
626 break;
627 }
628
629 } else {
630 //
631 // All paths equivalent.
632 //
633 this->path[i][j].diag = true;
634 this->path[i][j].up = true;
635 this->path[i][j].left = true;
636 }
637 }
638 }
639
640 return 0;
641 }
642
643 inline int
swap(double * scores,dynprog * direction,int index_1,int index_2)644 GappedAln::swap(double *scores, dynprog *direction, int index_1, int index_2)
645 {
646 double swap = scores[index_1];
647 scores[index_1] = scores[index_2];
648 scores[index_2] = swap;
649 dynprog swapdir = direction[index_1];
650 direction[index_1] = direction[index_2];
651 direction[index_2] = swapdir;
652
653 return 0;
654 }
655
656 bool
compare_alignres(const AlignRes & a,const AlignRes & b)657 compare_alignres(const AlignRes& a, const AlignRes& b)
658 {
659 if (a.gap_cnt == b.gap_cnt) {
660
661 if (a.pct_id == b.pct_id)
662 return (a.contiguity > b.contiguity);
663 else
664 return (a.pct_id > b.pct_id);
665
666 } else {
667 return (a.gap_cnt < b.gap_cnt);
668 }
669 }
670
671 int
trace_global_alignment(const string & tag_1,const string & tag_2)672 GappedAln::trace_global_alignment(const string& tag_1, const string& tag_2)
673 {
674 // j----> tag_2
675 // [0][1][2][3]...[n-1]
676 // +--------------------
677 // i [0] | [i][j]
678 // | [1] |
679 // | [2] |
680 // v [3] |
681 // ... |
682 // [m-1] |
683 // tag_1
684 //
685 int i, j, cnt, len, gaps, contiguity;
686 double ident;
687 string cigar;
688 char buf[id_len];
689
690 vector<AlignRes> alns;
691 bool more_paths = true;
692 bool seq_break = false;
693
694 do {
695 more_paths = false;
696
697 i = this->_m - 1;
698 j = this->_n - 1;
699
700 string aln_1, aln_2;
701
702 while (i > 0 || j > 0) {
703 cnt = this->path[i][j].count();
704
705 if (cnt > 1) more_paths = true;
706
707 if (this->path[i][j].diag) {
708 aln_1 += tag_1[i - 1];
709 aln_2 += tag_2[j - 1];
710 if (cnt > 1) this->path[i][j].diag = false;
711 i--;
712 j--;
713 } else if (this->path[i][j].up) {
714 aln_1 += tag_1[i - 1];
715 aln_2 += "-";
716 if (cnt > 1) this->path[i][j].up = false;
717 i--;
718 } else if (this->path[i][j].left) {
719 aln_1 += "-";
720 aln_2 += tag_2[j - 1];
721 if (cnt > 1) this->path[i][j].left = false;
722 j--;
723 }
724 }
725
726 reverse(aln_1.begin(), aln_1.end());
727 reverse(aln_2.begin(), aln_2.end());
728
729 //
730 // Convert to CIGAR strings.
731 //
732 cigar = "";
733 len = aln_1.length();
734 gaps = 0;
735 contiguity = 0;
736 seq_break = false;
737 ident = 0.0;
738 i = 0;
739 while (i < len) {
740 if (aln_1[i] != '-' && aln_2[i] != '-') {
741 cnt = 0;
742 do {
743 if (aln_1[i] == aln_2[i]) ident++;
744 cnt++;
745 i++;
746 if (seq_break == false) contiguity++;
747 } while (i < len && aln_1[i] != '-' && aln_2[i] != '-');
748 sprintf(buf, "%dM", cnt);
749
750 } else if (aln_1[i] == '-') {
751 cnt = 0;
752 do {
753 cnt++;
754 i++;
755 } while (i < len && aln_1[i] == '-');
756 sprintf(buf, "%dD", cnt);
757 gaps++;
758 seq_break = true;
759
760 } else {
761 cnt = 0;
762 do {
763 cnt++;
764 i++;
765 } while (i < len && aln_2[i] == '-');
766 sprintf(buf, "%dI", cnt);
767 gaps++;
768 seq_break = true;
769 }
770
771 cigar += buf;
772 }
773
774 alns.push_back(AlignRes(move(cigar), gaps, contiguity, (ident / (double) len)));
775
776 // cerr << aln_1 << " [" << cigar << ", contiguity: " << contiguity << ", gaps: " << gaps << "]\n"
777 // << aln_2 << "\n";
778
779 } while (more_paths);
780
781 sort(alns.begin(), alns.end(), compare_alignres);
782 this->_aln = alns[0];
783 // cerr << "Final alignment: " << this->_aln.cigar << "; contiguity: " << contiguity << "; gaps: " << this->_aln.gap_cnt << "\n";
784
785 return 1;
786 }
787
788 int
trace_local_alignment(const string & query,const string & subj)789 GappedAln::trace_local_alignment(const string& query, const string& subj)
790 {
791 // j----> subject
792 // [0][1][2][3]...[n-1]
793 // +--------------------
794 // i [0] | [i][j]
795 // | [1] |
796 // | [2] |
797 // v [3] |
798 // ... |
799 // [m-1] |
800 // query
801 //
802 int i, j, cnt, len, gaps, contiguity;
803 double ident;
804 string cigar;
805 char buf[id_len];
806
807 vector<AlignRes> alns;
808 bool more_paths = true;
809 bool seq_break = false;
810 int query_start;
811
812 do {
813 query_start = 0;
814 more_paths = false;
815
816 //
817 // For a local alignment, begin the trace at the matrix cell with the highest score.
818 //
819 i = this->_max_score_m;
820 j = this->_max_score_n;
821
822 string aln_1, aln_2;
823
824 while (i > 0 && j > 0) {
825 cnt = this->path[i][j].count();
826
827 if (cnt > 1) more_paths = true;
828
829 if (this->path[i][j].diag) {
830 aln_1 += query[i - 1];
831 aln_2 += subj[j - 1];
832 if (cnt > 1) this->path[i][j].diag = false;
833 i--;
834 j--;
835 } else if (this->path[i][j].up) {
836 aln_1 += query[i - 1];
837 aln_2 += "-";
838 if (cnt > 1) this->path[i][j].up = false;
839 i--;
840 } else if (this->path[i][j].left) {
841 aln_1 += "-";
842 aln_2 += subj[j - 1];
843 if (cnt > 1) this->path[i][j].left = false;
844 j--;
845 } else {
846 //
847 // Stop traversing the matrix when we reach a node with no paths from it.
848 //
849 query_start = i;
850 break;
851 }
852 }
853
854 if (i > 0 && j == 0)
855 query_start = i;
856
857 reverse(aln_1.begin(), aln_1.end());
858 reverse(aln_2.begin(), aln_2.end());
859
860 //
861 // Convert to CIGAR strings.
862 //
863 cigar = "";
864
865 //
866 // If the local alignment didn't span to the beginning of the query, add
867 // the softmasking to the CIGAR string.
868 //
869 if (query_start > 0) {
870 sprintf(buf, "%dS", (int) query_start);
871 cigar += buf;
872 }
873
874 len = aln_1.length();
875 gaps = 0;
876 contiguity = 0;
877 seq_break = false;
878 ident = 0.0;
879 i = 0;
880 while (i < len) {
881 if (aln_1[i] != '-' && aln_2[i] != '-') {
882 cnt = 0;
883 do {
884 if (aln_1[i] == aln_2[i]) ident++;
885 cnt++;
886 i++;
887 if (seq_break == false) contiguity++;
888 } while (i < len && aln_1[i] != '-' && aln_2[i] != '-');
889 sprintf(buf, "%dM", cnt);
890
891 } else if (aln_1[i] == '-') {
892 cnt = 0;
893 do {
894 cnt++;
895 i++;
896 } while (i < len && aln_1[i] == '-');
897 sprintf(buf, "%dD", cnt);
898 gaps++;
899 seq_break = true;
900
901 } else {
902 cnt = 0;
903 do {
904 cnt++;
905 i++;
906 } while (i < len && aln_2[i] == '-');
907 sprintf(buf, "%dI", cnt);
908 gaps++;
909 seq_break = true;
910 }
911
912 cigar += buf;
913 }
914
915 //
916 // If the entire query was not aligned, add the softmasked bases to the cigar.
917 //
918 if (this->_max_score_m < query.length()) {
919 sprintf(buf, "%dS", (int) query.length() - this->_max_score_m);
920 cigar += buf;
921 }
922
923 alns.push_back(AlignRes(move(cigar), gaps, contiguity, (ident / (double) len), (uint) j));
924
925 } while (more_paths);
926
927 sort(alns.begin(), alns.end(), compare_alignres);
928 this->_aln = alns[0];
929
930 return 1;
931 }
932
933 const AlignRes&
result()934 GappedAln::result()
935 {
936 return this->_aln;
937 }
938
939 int
parse_cigar(vector<pair<char,uint>> & cigar)940 GappedAln::parse_cigar(vector<pair<char, uint> > &cigar)
941 {
942 char buf[id_len];
943 int dist;
944 const char *p, *q;
945
946 p = this->_aln.cigar.c_str();
947
948 cigar.clear();
949
950 while (*p != '\0') {
951 q = p + 1;
952
953 while (*q != '\0' && isdigit(*q))
954 q++;
955 strncpy(buf, p, q - p);
956 buf[q-p] = '\0';
957 dist = atoi(buf);
958
959 cigar.push_back(make_pair(*q, dist));
960
961 p = q + 1;
962 }
963
964 return 0;
965 }
966
967 int
dump_alignment(const string & tag_1,const string & tag_2)968 GappedAln::dump_alignment(const string& tag_1, const string& tag_2)
969 {
970 // j----> tag_2
971 // [0][1][2][3]...[n-1]
972 // +--------------------
973 // i [0] | [i][j]
974 // | [1] |
975 // | [2] |
976 // v [3] |
977 // ... |
978 // [m-1] |
979 // tag_1
980 //
981
982 //
983 // Output the score matrix.
984 //
985 cout << " ";
986 for (uint j = 0; j < this->_n - 1; j++)
987 cout << " " << tag_2[j] << " |";
988 cout << "\n";
989
990 cout << " ";
991 for (uint j = 0; j < this->_n; j++)
992 printf("% 6.1f|", this->matrix[0][j]);
993 cout << "\n";
994
995 for (uint i = 1; i < this->_m; i++) {
996 cout << tag_1[i - 1] << " ";
997 for (uint j = 0; j < this->_n; j++)
998 printf("% 6.1f|", this->matrix[i][j]);
999 cout << "\n";
1000 }
1001
1002 cout << "\n";
1003
1004 //
1005 // Output the path matrix.
1006 //
1007 cout << " ";
1008 for (uint j = 0; j < this->_n - 1; j++)
1009 cout << " " << tag_2[j] << " |";
1010 cout << "\n";
1011
1012 cout << " ";
1013 for (uint j = 0; j < this->_n; j++) {
1014 this->path[0][j].diag ? cout << "d" : cout << " ";
1015 this->path[0][j].up ? cout << "u" : cout << " ";
1016 this->path[0][j].left ? cout << "l" : cout << " ";
1017 cout << "|";
1018 }
1019 cout << "\n";
1020
1021 for (uint i = 1; i < this->_m; i++) {
1022 cout << tag_1[i - 1] << " ";
1023 for (uint j = 0; j < this->_n; j++) {
1024 this->path[i][j].diag ? cout << "d" : cout << " ";
1025 this->path[i][j].up ? cout << "u" : cout << " ";
1026 this->path[i][j].left ? cout << "l" : cout << " ";
1027 cout << "|";
1028 }
1029 cout << "\n";
1030 }
1031
1032 cout << "\n";
1033
1034 return 0;
1035 }
1036
1037 #endif // __GAPPEDALN_H__
1038