1 /*
2
3 VSEARCH: a versatile open source tool for metagenomics
4
5 Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6 All rights reserved.
7
8 Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9 Department of Informatics, University of Oslo,
10 PO Box 1080 Blindern, NO-0316 Oslo, Norway
11
12 This software is dual-licensed and available under a choice
13 of one of two licenses, either under the terms of the GNU
14 General Public License version 3 or the BSD 2-Clause License.
15
16
17 GNU General Public License version 3
18
19 This program is free software: you can redistribute it and/or modify
20 it under the terms of the GNU General Public License as published by
21 the Free Software Foundation, either version 3 of the License, or
22 (at your option) any later version.
23
24 This program is distributed in the hope that it will be useful,
25 but WITHOUT ANY WARRANTY; without even the implied warranty of
26 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 GNU General Public License for more details.
28
29 You should have received a copy of the GNU General Public License
30 along with this program. If not, see <http://www.gnu.org/licenses/>.
31
32
33 The BSD 2-Clause License
34
35 Redistribution and use in source and binary forms, with or without
36 modification, are permitted provided that the following conditions
37 are met:
38
39 1. Redistributions of source code must retain the above copyright
40 notice, this list of conditions and the following disclaimer.
41
42 2. Redistributions in binary form must reproduce the above copyright
43 notice, this list of conditions and the following disclaimer in the
44 documentation and/or other materials provided with the distribution.
45
46 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50 COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57 POSSIBILITY OF SUCH DAMAGE.
58
59 */
60
61 #include "vsearch.h"
62
63 /*
64
65 Compute the optimal global alignment of two sequences
66 in linear space using the divide and conquer method.
67
68 These functions are based on the following articles:
69 - Hirschberg (1975) Comm ACM 18:341-343
70 - Myers & Miller (1988) CABIOS 4:11-17
71
72 The method has been adapted for the use of different
73 gap penalties for query/target/left/interior/right gaps.
74
75 scorematrix consists of 16x16 int64_t integers
76
77 Sequences and alignment matrix:
78 A/a/i/query/q/downwards/vertical/top/bottom
79 B/b/j/target/t/rightwards/horizontal/left/right
80
81 f corresponds to score ending with gap in A/query
82 EE corresponds to score ending with gap in B/target
83
84 */
85
LinearMemoryAligner()86 LinearMemoryAligner::LinearMemoryAligner()
87 {
88 scorematrix = nullptr;
89
90 cigar_alloc = 0;
91 cigar_string = nullptr;
92
93 vector_alloc = 0;
94 HH = nullptr;
95 EE = nullptr;
96 XX = nullptr;
97 YY = nullptr;
98 }
99
~LinearMemoryAligner()100 LinearMemoryAligner::~LinearMemoryAligner()
101 {
102 if (cigar_string) {
103 xfree(cigar_string);
104
105 }
106 if (HH) {
107 xfree(HH);
108
109 }
110 if (EE) {
111 xfree(EE);
112
113 }
114 if (XX) {
115 xfree(XX);
116
117 }
118 if (YY) {
119 xfree(YY);
120
121 }
122 }
123
scorematrix_create(int64_t match,int64_t mismatch)124 int64_t * LinearMemoryAligner::scorematrix_create(int64_t match, int64_t mismatch)
125 {
126 auto * newscorematrix = (int64_t*) xmalloc(16*16*sizeof(int64_t));
127
128 for(int i=0; i<16; i++) {
129 for(int j=0; j<16; j++)
130 {
131 int64_t value;
132 if (ambiguous_4bit[i] || ambiguous_4bit[j]) {
133 value = 0;
134 } else if (i == j) {
135 value = match;
136 } else {
137 value = mismatch;
138
139 }
140 newscorematrix[16*i+j] = value;
141 }
142
143 }
144 return newscorematrix;
145 }
146
alloc_vectors(size_t x)147 void LinearMemoryAligner::alloc_vectors(size_t x)
148 {
149 if (vector_alloc < x)
150 {
151 vector_alloc = x;
152
153 if (HH) {
154 xfree(HH);
155
156 }
157 if (EE) {
158 xfree(EE);
159
160 }
161 if (XX) {
162 xfree(XX);
163
164 }
165 if (YY) {
166 xfree(YY);
167
168 }
169
170 HH = (int64_t*) xmalloc(vector_alloc * (sizeof(int64_t)));
171 EE = (int64_t*) xmalloc(vector_alloc * (sizeof(int64_t)));
172 XX = (int64_t*) xmalloc(vector_alloc * (sizeof(int64_t)));
173 YY = (int64_t*) xmalloc(vector_alloc * (sizeof(int64_t)));
174 }
175 }
176
cigar_reset()177 void LinearMemoryAligner::cigar_reset()
178 {
179 if (cigar_alloc < 1)
180 {
181 cigar_alloc = 64;
182 cigar_string = (char*) xrealloc(cigar_string, cigar_alloc);
183 }
184 cigar_string[0] = 0;
185 cigar_length = 0;
186 op = 0;
187 op_run = 0;
188 }
189
cigar_flush()190 void LinearMemoryAligner::cigar_flush()
191 {
192 if (op_run > 0)
193 {
194 while (true)
195 {
196 /* try writing string until enough memory has been allocated */
197
198 int64_t rest = cigar_alloc - cigar_length;
199 int n;
200 if (op_run > 1) {
201 n = snprintf(cigar_string + cigar_length,
202 rest,
203 "%" PRId64 "%c", op_run, op);
204 } else {
205 n = snprintf(cigar_string + cigar_length,
206 rest,
207 "%c", op);
208
209 }
210 if (n < 0)
211 {
212 fatal("snprintf returned a negative number.\n");
213 }
214 else if (n >= rest)
215 {
216 cigar_alloc += MAX(n - rest + 1, 64);
217 cigar_string = (char*) xrealloc(cigar_string, cigar_alloc);
218 }
219 else
220 {
221 cigar_length += n;
222 break;
223 }
224 }
225 }
226 }
227
cigar_add(char _op,int64_t run)228 void LinearMemoryAligner::cigar_add(char _op, int64_t run)
229 {
230 if (op == _op) {
231 op_run += run;
232 } else
233 {
234 cigar_flush();
235 op = _op;
236 op_run = run;
237 }
238 }
239
show_matrix()240 void LinearMemoryAligner::show_matrix()
241 {
242 for(int i=0; i<16; i++)
243 {
244 printf("%2d:", i);
245 for(int j=0; j<16; j++) {
246 printf(" %2" PRId64, scorematrix[16*i+j]);
247
248 }
249 printf("\n");
250 }
251 }
252
diff(int64_t a_start,int64_t b_start,int64_t a_len,int64_t b_len,bool gap_b_left,bool gap_b_right,bool a_left,bool a_right,bool b_left,bool b_right)253 void LinearMemoryAligner::diff(int64_t a_start,
254 int64_t b_start,
255 int64_t a_len,
256 int64_t b_len,
257 bool gap_b_left, /* gap open left of b */
258 bool gap_b_right, /* gap open right of b */
259 bool a_left, /* includes left end of a */
260 bool a_right, /* includes right end of a */
261 bool b_left, /* includes left end of b */
262 bool b_right) /* includes right end of b */
263 {
264 if (b_len == 0)
265 {
266 /* B and possibly A is empty */
267 if (a_len > 0)
268 {
269 // Delete a_len from A
270 // AAA
271 // ---
272
273 cigar_add('D', a_len);
274 }
275 }
276 else if (a_len == 0)
277 {
278 /* A is empty, B is not */
279
280 // Delete b_len from B
281 // ---
282 // BBB
283
284 cigar_add('I', b_len);
285 }
286 else if (a_len == 1)
287 {
288 /*
289 Convert 1 symbol from A to b_len symbols from B
290 b_len >= 1
291 */
292
293
294 int64_t MaxScore;
295 int64_t best;
296
297 int64_t Score = 0;
298
299 /* First possibility */
300
301 // Delete 1 from A, Insert b_len from B
302 // A----
303 // -BBBB
304
305 /* gap penalty for gap in B of length 1 */
306
307 if (! gap_b_left) {
308 Score -= b_left ? go_t_l : go_t_i;
309
310 }
311
312 Score -= b_left ? ge_t_l : ge_t_i;
313
314 /* gap penalty for gap in A of length b_len */
315
316 Score -= a_right ? go_q_r + b_len * ge_q_r : go_q_i + b_len * ge_q_i;
317
318 MaxScore = Score;
319 best = -1;
320
321
322 /* Second possibility */
323
324 // Insert b_len from B, Delete 1 from A
325 // ----A
326 // BBBB-
327
328 /* gap penalty for gap in A of length b_len */
329
330 Score -= a_left ? go_q_l + b_len * ge_q_l : go_q_i + b_len * ge_q_i;
331
332 /* gap penalty for gap in B of length 1 */
333
334 if (! gap_b_right) {
335 Score -= b_right ? go_t_r : go_t_i;
336
337 }
338
339 Score -= b_right ? ge_t_r : ge_t_i;
340
341 if (Score > MaxScore)
342 {
343 MaxScore = Score;
344 best = b_len;
345 }
346
347
348 /* Third possibility */
349
350 for (int64_t j = 0; j < b_len; j++)
351 {
352 // Insert zero or more from B, replace 1, insert rest of B
353 // -A--
354 // BBBB
355
356 Score = 0;
357
358 if (j > 0) {
359 Score -= a_left ? go_q_l + j * ge_q_l : go_q_i + j * ge_q_i;
360
361 }
362
363 Score += subst_score(a_start, b_start + j);
364
365 if (j < b_len - 1) {
366 Score -= a_right ?
367 go_q_r + (b_len-1-j) * ge_q_r :
368 go_q_i + (b_len-1-j) * ge_q_i;
369
370 }
371
372 if (Score > MaxScore)
373 {
374 MaxScore = Score;
375 best = j;
376 }
377 }
378
379 if (best == -1)
380 {
381 cigar_add('D', 1);
382 cigar_add('I', b_len);
383 }
384 else if (best == b_len)
385 {
386 cigar_add('I', b_len);
387 cigar_add('D', 1);
388 }
389 else
390 {
391 if (best > 0) {
392 cigar_add('I', best);
393
394 }
395 cigar_add('M', 1);
396 if (best < b_len - 1) {
397 cigar_add('I', b_len - 1 - best);
398
399 }
400 }
401 }
402 else
403 {
404 /* a_len >= 2, b_len >= 1 */
405
406 int64_t I = a_len / 2;
407 int64_t i, j;
408
409 // Compute HH & EE in forward phase
410 // Upper part
411
412 /* initialize HH and EE for values corresponding to
413 empty seq A vs B of j symbols,
414 i.e. a gap of length j in A */
415
416 HH[0] = 0;
417 EE[0] = 0;
418
419 for (j = 1; j <= b_len; j++)
420 {
421 HH[j] = - (a_left ? go_q_l + j * ge_q_l : go_q_i + j * ge_q_i);
422 EE[j] = LONG_MIN;
423 }
424
425 /* compute matrix */
426
427 for (i = 1; i <= I; i++)
428 {
429 int64_t p = HH[0];
430
431 int64_t h = - (b_left ?
432 (gap_b_left ? 0 : go_t_l) + i * ge_t_l :
433 (gap_b_left ? 0 : go_t_i) + i * ge_t_i);
434
435 HH[0] = h;
436 int64_t f = LONG_MIN;
437
438 for (j = 1; j <= b_len; j++)
439 {
440 f = MAX(f, h - go_q_i) - ge_q_i;
441 if (b_right && (j==b_len)) {
442 EE[j] = MAX(EE[j], HH[j] - go_t_r) - ge_t_r;
443 } else {
444 EE[j] = MAX(EE[j], HH[j] - go_t_i) - ge_t_i;
445
446 }
447
448 h = p + subst_score(a_start + i - 1, b_start + j - 1);
449
450 if (f > h) {
451 h = f;
452
453 }
454 if (EE[j] > h) {
455 h = EE[j];
456
457 }
458 p = HH[j];
459 HH[j] = h;
460 }
461 }
462
463 EE[0] = HH[0];
464
465 // Compute XX & YY in reverse phase
466 // Lower part
467
468 /* initialize XX and YY */
469
470 XX[0] = 0;
471 YY[0] = 0;
472
473 for (j = 1; j <= b_len; j++)
474 {
475 XX[j] = - (a_right ? go_q_r + j * ge_q_r : go_q_i + j * ge_q_i);
476 YY[j] = LONG_MIN;
477 }
478
479 /* compute matrix */
480
481 for (i = 1; i <= a_len - I; i++)
482 {
483 int64_t p = XX[0];
484
485 int64_t h = - (b_right ?
486 (gap_b_right ? 0 : go_t_r) + i * ge_t_r :
487 (gap_b_right ? 0 : go_t_i) + i * ge_t_i);
488 XX[0] = h;
489 int64_t f = LONG_MIN;
490
491 for (j = 1; j <= b_len; j++)
492 {
493 f = MAX(f, h - go_q_i) - ge_q_i;
494 if (b_left && (j==b_len)) {
495 YY[j] = MAX(YY[j], XX[j] - go_t_l) - ge_t_l;
496 } else {
497 YY[j] = MAX(YY[j], XX[j] - go_t_i) - ge_t_i;
498
499 }
500
501 h = p + subst_score(a_start + a_len - i, b_start + b_len - j);
502
503 if (f > h) {
504 h = f;
505
506 }
507 if (YY[j] > h) {
508 h = YY[j];
509
510 }
511 p = XX[j];
512 XX[j] = h;
513 }
514 }
515
516 YY[0] = XX[0];
517
518
519 /* find maximum score along division line */
520
521 int64_t MaxScore0 = LONG_MIN;
522 int64_t best0 = -1;
523
524 /* solutions with diagonal at break */
525
526 for (j=0; j <= b_len; j++)
527 {
528 int64_t Score = HH[j] + XX[b_len - j];
529
530 if (Score > MaxScore0)
531 {
532 MaxScore0 = Score;
533 best0 = j;
534 }
535 }
536
537 int64_t MaxScore1 = LONG_MIN;
538 int64_t best1 = -1;
539
540 /* solutions that end with a gap in b from both ends at break */
541
542 for (j=0; j <= b_len; j++)
543 {
544 int64_t g;
545 if (b_left && (j==0)) {
546 g = go_t_l;
547 } else if (b_right && (j==b_len)) {
548 g = go_t_r;
549 } else {
550 g = go_t_i;
551
552 }
553
554 int64_t Score = EE[j] + YY[b_len - j] + g;
555
556 if (Score > MaxScore1)
557 {
558 MaxScore1 = Score;
559 best1 = j;
560 }
561 }
562
563 int64_t P;
564 int64_t best;
565
566 if (MaxScore0 > MaxScore1)
567 {
568 P = 0;
569 best = best0;
570 }
571 else if (MaxScore1 > MaxScore0)
572 {
573 P = 1;
574 best = best1;
575 }
576 else
577 {
578 if (best0 <= best1)
579 {
580 P = 0;
581 best = best0;
582 }
583 else
584 {
585 P = 1;
586 best = best1;
587 }
588 }
589
590 /* recursively compute upper left and lower right parts */
591
592 if (P == 0)
593 {
594 diff(a_start, b_start,
595 I, best,
596 gap_b_left, false,
597 a_left, false,
598 b_left, b_right && (best == b_len));
599
600 diff(a_start + I, b_start + best,
601 a_len - I, b_len - best,
602 false, gap_b_right,
603 false, a_right,
604 b_left && (best == 0), b_right);
605 }
606 else if (P == 1)
607 {
608 diff(a_start, b_start,
609 I - 1, best,
610 gap_b_left, true,
611 a_left, false,
612 b_left, b_right && (best == b_len));
613
614 cigar_add('D', 2);
615
616 diff(a_start + I + 1, b_start + best,
617 a_len - I - 1, b_len - best,
618 true, gap_b_right,
619 false, a_right,
620 b_left && (best == 0), b_right);
621 }
622 }
623 }
624
set_parameters(int64_t * _scorematrix,int64_t _gap_open_query_left,int64_t _gap_open_target_left,int64_t _gap_open_query_interior,int64_t _gap_open_target_interior,int64_t _gap_open_query_right,int64_t _gap_open_target_right,int64_t _gap_extension_query_left,int64_t _gap_extension_target_left,int64_t _gap_extension_query_interior,int64_t _gap_extension_target_interior,int64_t _gap_extension_query_right,int64_t _gap_extension_target_right)625 void LinearMemoryAligner::set_parameters(int64_t * _scorematrix,
626 int64_t _gap_open_query_left,
627 int64_t _gap_open_target_left,
628 int64_t _gap_open_query_interior,
629 int64_t _gap_open_target_interior,
630 int64_t _gap_open_query_right,
631 int64_t _gap_open_target_right,
632 int64_t _gap_extension_query_left,
633 int64_t _gap_extension_target_left,
634 int64_t _gap_extension_query_interior,
635 int64_t _gap_extension_target_interior,
636 int64_t _gap_extension_query_right,
637 int64_t _gap_extension_target_right)
638 {
639 scorematrix = _scorematrix;
640
641 /* a = query/q b = t/target */
642
643 go_q_l = _gap_open_query_left;
644 go_t_l = _gap_open_target_left;
645 go_q_i = _gap_open_query_interior;
646 go_t_i = _gap_open_target_interior;
647 go_q_r = _gap_open_query_right;
648 go_t_r = _gap_open_target_right;
649 ge_q_l = _gap_extension_query_left;
650 ge_t_l = _gap_extension_target_left;
651 ge_q_i = _gap_extension_query_interior;
652 ge_t_i = _gap_extension_target_interior;
653 ge_q_r = _gap_extension_query_right;
654 ge_t_r = _gap_extension_target_right;
655
656 q = _gap_open_query_interior;
657 r = _gap_extension_query_interior;
658 }
659
660
661
align(char * _a_seq,char * _b_seq,int64_t a_len,int64_t b_len)662 char * LinearMemoryAligner::align(char * _a_seq,
663 char * _b_seq,
664 int64_t a_len,
665 int64_t b_len)
666 {
667 /* copy parameters */
668 a_seq = _a_seq;
669 b_seq = _b_seq;
670
671 /* init cigar operations */
672 cigar_reset();
673
674 /* allocate enough memory for vectors */
675 alloc_vectors(b_len+1);
676
677 /* perform alignment */
678 diff(0, 0, a_len, b_len, false, false, true, true, true, true);
679
680 /* ensure entire cigar has been written */
681 cigar_flush();
682
683 /* return cigar */
684 return cigar_string;
685 }
686
alignstats(char * cigar,char * _a_seq,char * _b_seq,int64_t * _nwscore,int64_t * _nwalignmentlength,int64_t * _nwmatches,int64_t * _nwmismatches,int64_t * _nwgaps)687 void LinearMemoryAligner::alignstats(char * cigar,
688 char * _a_seq,
689 char * _b_seq,
690 int64_t * _nwscore,
691 int64_t * _nwalignmentlength,
692 int64_t * _nwmatches,
693 int64_t * _nwmismatches,
694 int64_t * _nwgaps)
695 {
696 a_seq = _a_seq;
697 b_seq = _b_seq;
698
699 int64_t nwscore = 0;
700 int64_t nwalignmentlength = 0;
701 int64_t nwmatches = 0;
702 int64_t nwmismatches = 0;
703 int64_t nwgaps = 0;
704
705 int64_t a_pos = 0;
706 int64_t b_pos = 0;
707
708 char * p = cigar;
709
710 int64_t g;
711
712 while (*p)
713 {
714 int64_t run = 1;
715 int scanlength = 0;
716 sscanf(p, "%" PRId64 "%n", &run, &scanlength);
717 p += scanlength;
718 switch (*p++)
719 {
720 case 'M':
721 nwalignmentlength += run;
722 for(int64_t k=0; k<run; k++)
723 {
724 nwscore += subst_score(a_pos, b_pos);
725
726 if (chrmap_4bit[(int)(a_seq[a_pos])] &
727 chrmap_4bit[(int)(b_seq[b_pos])]) {
728 nwmatches++;
729 } else {
730 nwmismatches++;
731
732 }
733
734 a_pos++;
735 b_pos++;
736 }
737 break;
738
739 case 'I':
740 if ((a_pos == 0) && (b_pos == 0)) {
741 g = go_q_l + run * ge_q_l;
742 } else if (*p == 0) {
743 g = go_q_r + run * ge_q_r;
744 } else {
745 g = go_q_i + run * ge_q_i;
746
747 }
748 nwscore -= g;
749 nwgaps++;
750 nwalignmentlength += run;
751 b_pos += run;
752 break;
753
754 case 'D':
755 if ((a_pos == 0) && (b_pos == 0)) {
756 g = go_t_l + run * ge_t_l;
757 } else if (*p == 0) {
758 g = go_t_r + run * ge_t_r;
759 } else {
760 g = go_t_i + run * ge_t_i;
761
762 }
763 nwscore -= g;
764 nwgaps++;
765 nwalignmentlength += run;
766 a_pos += run;
767 break;
768 }
769 }
770
771 *_nwscore = nwscore;
772 *_nwalignmentlength = nwalignmentlength;
773 *_nwmatches = nwmatches;
774 *_nwmismatches = nwmismatches;
775 *_nwgaps = nwgaps;
776 }
777
778