1 /*
2 
3   VSEARCH: a versatile open source tool for metagenomics
4 
5   Copyright (C) 2014-2021, Torbjorn Rognes, Frederic Mahe and Tomas Flouri
6   All rights reserved.
7 
8   Contact: Torbjorn Rognes <torognes@ifi.uio.no>,
9   Department of Informatics, University of Oslo,
10   PO Box 1080 Blindern, NO-0316 Oslo, Norway
11 
12   This software is dual-licensed and available under a choice
13   of one of two licenses, either under the terms of the GNU
14   General Public License version 3 or the BSD 2-Clause License.
15 
16 
17   GNU General Public License version 3
18 
19   This program is free software: you can redistribute it and/or modify
20   it under the terms of the GNU General Public License as published by
21   the Free Software Foundation, either version 3 of the License, or
22   (at your option) any later version.
23 
24   This program is distributed in the hope that it will be useful,
25   but WITHOUT ANY WARRANTY; without even the implied warranty of
26   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27   GNU General Public License for more details.
28 
29   You should have received a copy of the GNU General Public License
30   along with this program.  If not, see <http://www.gnu.org/licenses/>.
31 
32 
33   The BSD 2-Clause License
34 
35   Redistribution and use in source and binary forms, with or without
36   modification, are permitted provided that the following conditions
37   are met:
38 
39   1. Redistributions of source code must retain the above copyright
40   notice, this list of conditions and the following disclaimer.
41 
42   2. Redistributions in binary form must reproduce the above copyright
43   notice, this list of conditions and the following disclaimer in the
44   documentation and/or other materials provided with the distribution.
45 
46   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
47   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
48   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
49   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
50   COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
51   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
52   BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
53   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
54   CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
56   ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
57   POSSIBILITY OF SUCH DAMAGE.
58 
59 */
60 
61 #include "vsearch.h"
62 
63 /*
64 
65   Compute the optimal global alignment of two sequences
66   in linear space using the divide and conquer method.
67 
68   These functions are based on the following articles:
69   - Hirschberg (1975) Comm ACM 18:341-343
70   - Myers & Miller (1988) CABIOS 4:11-17
71 
72   The method has been adapted for the use of different
73   gap penalties for query/target/left/interior/right gaps.
74 
75   scorematrix consists of 16x16 int64_t integers
76 
77   Sequences and alignment matrix:
78   A/a/i/query/q/downwards/vertical/top/bottom
79   B/b/j/target/t/rightwards/horizontal/left/right
80 
81   f corresponds to score ending with gap in A/query
82   EE corresponds to score ending with gap in B/target
83 
84 */
85 
LinearMemoryAligner()86 LinearMemoryAligner::LinearMemoryAligner()
87 {
88   scorematrix = nullptr;
89 
90   cigar_alloc = 0;
91   cigar_string = nullptr;
92 
93   vector_alloc = 0;
94   HH = nullptr;
95   EE = nullptr;
96   XX = nullptr;
97   YY = nullptr;
98 }
99 
~LinearMemoryAligner()100 LinearMemoryAligner::~LinearMemoryAligner()
101 {
102   if (cigar_string) {
103     xfree(cigar_string);
104 
105         }
106   if (HH) {
107     xfree(HH);
108 
109         }
110   if (EE) {
111     xfree(EE);
112 
113         }
114   if (XX) {
115     xfree(XX);
116 
117         }
118   if (YY) {
119     xfree(YY);
120 
121         }
122 }
123 
scorematrix_create(int64_t match,int64_t mismatch)124 int64_t * LinearMemoryAligner::scorematrix_create(int64_t match, int64_t mismatch)
125 {
126   auto * newscorematrix = (int64_t*) xmalloc(16*16*sizeof(int64_t));
127 
128   for(int i=0; i<16; i++) {
129     for(int j=0; j<16; j++)
130       {
131         int64_t value;
132         if (ambiguous_4bit[i] || ambiguous_4bit[j]) {
133           value = 0;
134         } else if (i == j) {
135           value = match;
136         } else {
137           value = mismatch;
138 
139         }
140         newscorematrix[16*i+j] = value;
141       }
142 
143         }
144   return newscorematrix;
145 }
146 
alloc_vectors(size_t x)147 void LinearMemoryAligner::alloc_vectors(size_t x)
148 {
149   if (vector_alloc < x)
150     {
151       vector_alloc = x;
152 
153       if (HH) {
154         xfree(HH);
155 
156         }
157       if (EE) {
158         xfree(EE);
159 
160         }
161       if (XX) {
162         xfree(XX);
163 
164         }
165       if (YY) {
166         xfree(YY);
167 
168         }
169 
170       HH = (int64_t*) xmalloc(vector_alloc * (sizeof(int64_t)));
171       EE = (int64_t*) xmalloc(vector_alloc * (sizeof(int64_t)));
172       XX = (int64_t*) xmalloc(vector_alloc * (sizeof(int64_t)));
173       YY = (int64_t*) xmalloc(vector_alloc * (sizeof(int64_t)));
174     }
175 }
176 
cigar_reset()177 void LinearMemoryAligner::cigar_reset()
178 {
179   if (cigar_alloc < 1)
180     {
181       cigar_alloc = 64;
182       cigar_string = (char*) xrealloc(cigar_string, cigar_alloc);
183     }
184   cigar_string[0] = 0;
185   cigar_length = 0;
186   op = 0;
187   op_run = 0;
188 }
189 
cigar_flush()190 void LinearMemoryAligner::cigar_flush()
191 {
192   if (op_run > 0)
193   {
194     while (true)
195     {
196       /* try writing string until enough memory has been allocated */
197 
198       int64_t rest = cigar_alloc - cigar_length;
199       int n;
200       if (op_run > 1) {
201         n = snprintf(cigar_string + cigar_length,
202                      rest,
203                      "%" PRId64 "%c", op_run, op);
204       } else {
205         n = snprintf(cigar_string + cigar_length,
206                      rest,
207                      "%c", op);
208 
209         }
210       if (n < 0)
211         {
212           fatal("snprintf returned a negative number.\n");
213         }
214       else if (n >= rest)
215         {
216           cigar_alloc += MAX(n - rest + 1, 64);
217           cigar_string = (char*) xrealloc(cigar_string, cigar_alloc);
218         }
219       else
220         {
221           cigar_length += n;
222           break;
223         }
224     }
225   }
226 }
227 
cigar_add(char _op,int64_t run)228 void LinearMemoryAligner::cigar_add(char _op, int64_t run)
229   {
230     if (op == _op) {
231       op_run += run;
232     } else
233       {
234         cigar_flush();
235         op = _op;
236         op_run = run;
237       }
238   }
239 
show_matrix()240 void LinearMemoryAligner::show_matrix()
241 {
242   for(int i=0; i<16; i++)
243     {
244       printf("%2d:", i);
245       for(int j=0; j<16; j++) {
246         printf(" %2" PRId64, scorematrix[16*i+j]);
247 
248         }
249       printf("\n");
250     }
251 }
252 
diff(int64_t a_start,int64_t b_start,int64_t a_len,int64_t b_len,bool gap_b_left,bool gap_b_right,bool a_left,bool a_right,bool b_left,bool b_right)253 void LinearMemoryAligner::diff(int64_t a_start,
254                                int64_t b_start,
255                                int64_t a_len,
256                                int64_t b_len,
257                                bool gap_b_left,  /* gap open left of b      */
258                                bool gap_b_right, /* gap open right of b     */
259                                bool a_left,      /* includes left end of a  */
260                                bool a_right,     /* includes right end of a */
261                                bool b_left,      /* includes left end of b  */
262                                bool b_right)     /* includes right end of b */
263 {
264   if (b_len == 0)
265     {
266       /* B and possibly A is empty */
267       if (a_len > 0)
268         {
269           // Delete a_len from A
270           // AAA
271           // ---
272 
273           cigar_add('D', a_len);
274         }
275     }
276   else if (a_len == 0)
277     {
278       /* A is empty, B is not */
279 
280       // Delete b_len from B
281       // ---
282       // BBB
283 
284       cigar_add('I', b_len);
285     }
286   else if (a_len == 1)
287     {
288       /*
289         Convert 1 symbol from A to b_len symbols from B
290         b_len >= 1
291       */
292 
293 
294       int64_t MaxScore;
295       int64_t best;
296 
297       int64_t Score = 0;
298 
299       /* First possibility */
300 
301       // Delete 1 from A, Insert b_len from B
302       // A----
303       // -BBBB
304 
305       /* gap penalty for gap in B of length 1 */
306 
307       if (! gap_b_left) {
308         Score -= b_left ? go_t_l : go_t_i;
309 
310         }
311 
312       Score -= b_left ? ge_t_l : ge_t_i;
313 
314       /* gap penalty for gap in A of length b_len */
315 
316       Score -= a_right ? go_q_r + b_len * ge_q_r : go_q_i + b_len * ge_q_i;
317 
318       MaxScore = Score;
319       best = -1;
320 
321 
322       /* Second possibility */
323 
324       // Insert b_len from B, Delete 1 from A
325       // ----A
326       // BBBB-
327 
328       /* gap penalty for gap in A of length b_len */
329 
330       Score -= a_left ? go_q_l + b_len * ge_q_l : go_q_i + b_len * ge_q_i;
331 
332       /* gap penalty for gap in B of length 1 */
333 
334       if (! gap_b_right) {
335         Score -= b_right ? go_t_r : go_t_i;
336 
337         }
338 
339       Score -= b_right ? ge_t_r : ge_t_i;
340 
341       if (Score > MaxScore)
342         {
343           MaxScore = Score;
344           best = b_len;
345         }
346 
347 
348       /* Third possibility */
349 
350       for (int64_t j = 0; j < b_len; j++)
351         {
352           // Insert zero or more from B, replace 1, insert rest of B
353           // -A--
354           // BBBB
355 
356           Score = 0;
357 
358           if (j > 0) {
359             Score -= a_left ? go_q_l + j * ge_q_l : go_q_i + j * ge_q_i;
360 
361         }
362 
363           Score += subst_score(a_start, b_start + j);
364 
365           if (j < b_len - 1) {
366             Score -= a_right ?
367               go_q_r + (b_len-1-j) * ge_q_r :
368               go_q_i + (b_len-1-j) * ge_q_i;
369 
370         }
371 
372           if (Score > MaxScore)
373             {
374               MaxScore = Score;
375               best = j;
376             }
377         }
378 
379       if (best == -1)
380         {
381           cigar_add('D', 1);
382           cigar_add('I', b_len);
383         }
384       else if (best == b_len)
385         {
386           cigar_add('I', b_len);
387           cigar_add('D', 1);
388         }
389       else
390         {
391           if (best > 0) {
392             cigar_add('I', best);
393 
394         }
395           cigar_add('M', 1);
396           if (best < b_len - 1) {
397             cigar_add('I', b_len - 1 - best);
398 
399         }
400         }
401     }
402   else
403     {
404       /* a_len >= 2, b_len >= 1 */
405 
406       int64_t I = a_len / 2;
407       int64_t i, j;
408 
409       // Compute HH & EE in forward phase
410       // Upper part
411 
412       /* initialize HH and EE for values corresponding to
413          empty seq A vs B of j symbols,
414          i.e. a gap of length j in A                 */
415 
416       HH[0] = 0;
417       EE[0] = 0;
418 
419       for (j = 1; j <= b_len; j++)
420         {
421           HH[j] = - (a_left ? go_q_l + j * ge_q_l : go_q_i + j * ge_q_i);
422           EE[j] = LONG_MIN;
423         }
424 
425       /* compute matrix */
426 
427       for (i = 1; i <= I; i++)
428         {
429           int64_t p = HH[0];
430 
431           int64_t h = - (b_left ?
432                       (gap_b_left ? 0 : go_t_l) + i * ge_t_l :
433                       (gap_b_left ? 0 : go_t_i) + i * ge_t_i);
434 
435           HH[0] = h;
436           int64_t f = LONG_MIN;
437 
438           for (j = 1; j <= b_len; j++)
439             {
440               f = MAX(f, h - go_q_i) - ge_q_i;
441               if (b_right && (j==b_len)) {
442                 EE[j] = MAX(EE[j], HH[j] - go_t_r) - ge_t_r;
443               } else {
444                 EE[j] = MAX(EE[j], HH[j] - go_t_i) - ge_t_i;
445 
446         }
447 
448               h = p + subst_score(a_start + i - 1, b_start + j - 1);
449 
450               if (f > h) {
451                 h = f;
452 
453         }
454               if (EE[j] > h) {
455                 h = EE[j];
456 
457         }
458               p = HH[j];
459               HH[j] = h;
460             }
461         }
462 
463       EE[0] = HH[0];
464 
465       // Compute XX & YY in reverse phase
466       // Lower part
467 
468       /* initialize XX and YY */
469 
470       XX[0] = 0;
471       YY[0] = 0;
472 
473       for (j = 1; j <= b_len; j++)
474         {
475           XX[j] = - (a_right ? go_q_r + j * ge_q_r : go_q_i + j * ge_q_i);
476           YY[j] = LONG_MIN;
477         }
478 
479       /* compute matrix */
480 
481       for (i = 1; i <= a_len - I; i++)
482         {
483           int64_t p = XX[0];
484 
485           int64_t h = - (b_right ?
486                       (gap_b_right ? 0 : go_t_r) + i * ge_t_r :
487                       (gap_b_right ? 0 : go_t_i) + i * ge_t_i);
488           XX[0] = h;
489           int64_t f = LONG_MIN;
490 
491           for (j = 1; j <= b_len; j++)
492             {
493               f = MAX(f, h - go_q_i) - ge_q_i;
494               if (b_left && (j==b_len)) {
495                 YY[j] = MAX(YY[j], XX[j] - go_t_l) - ge_t_l;
496               } else {
497                 YY[j] = MAX(YY[j], XX[j] - go_t_i) - ge_t_i;
498 
499         }
500 
501               h = p + subst_score(a_start + a_len - i, b_start + b_len - j);
502 
503               if (f > h) {
504                 h = f;
505 
506         }
507               if (YY[j] > h) {
508                 h = YY[j];
509 
510         }
511               p = XX[j];
512               XX[j] = h;
513             }
514         }
515 
516       YY[0] = XX[0];
517 
518 
519       /* find maximum score along division line */
520 
521       int64_t MaxScore0 = LONG_MIN;
522       int64_t best0 = -1;
523 
524       /* solutions with diagonal at break */
525 
526       for (j=0; j <= b_len; j++)
527         {
528           int64_t Score = HH[j] + XX[b_len - j];
529 
530           if (Score > MaxScore0)
531             {
532               MaxScore0 = Score;
533               best0 = j;
534             }
535         }
536 
537       int64_t MaxScore1 = LONG_MIN;
538       int64_t best1 = -1;
539 
540       /* solutions that end with a gap in b from both ends at break */
541 
542       for (j=0; j <= b_len; j++)
543         {
544           int64_t g;
545           if (b_left && (j==0)) {
546             g = go_t_l;
547           } else if (b_right && (j==b_len)) {
548             g = go_t_r;
549           } else {
550             g = go_t_i;
551 
552         }
553 
554           int64_t Score = EE[j] + YY[b_len - j] + g;
555 
556           if (Score > MaxScore1)
557             {
558               MaxScore1 = Score;
559               best1 = j;
560             }
561         }
562 
563       int64_t P;
564       int64_t best;
565 
566       if (MaxScore0 > MaxScore1)
567         {
568           P = 0;
569           best = best0;
570         }
571       else if (MaxScore1 > MaxScore0)
572         {
573           P = 1;
574           best = best1;
575         }
576       else
577         {
578           if (best0 <= best1)
579             {
580               P = 0;
581               best = best0;
582             }
583           else
584             {
585               P = 1;
586               best = best1;
587             }
588         }
589 
590       /* recursively compute upper left and lower right parts */
591 
592       if (P == 0)
593         {
594           diff(a_start,               b_start,
595                I,                     best,
596                gap_b_left,            false,
597                a_left,                false,
598                b_left,                b_right && (best == b_len));
599 
600           diff(a_start + I,           b_start + best,
601                a_len - I,             b_len - best,
602                false,                 gap_b_right,
603                false,                 a_right,
604                b_left && (best == 0), b_right);
605         }
606       else if (P == 1)
607         {
608           diff(a_start,               b_start,
609                I - 1,                 best,
610                gap_b_left,            true,
611                a_left,                false,
612                b_left,                b_right && (best == b_len));
613 
614           cigar_add('D', 2);
615 
616           diff(a_start + I + 1,       b_start + best,
617                a_len - I - 1,         b_len - best,
618                true,                  gap_b_right,
619                false,                 a_right,
620                b_left && (best == 0), b_right);
621         }
622     }
623 }
624 
set_parameters(int64_t * _scorematrix,int64_t _gap_open_query_left,int64_t _gap_open_target_left,int64_t _gap_open_query_interior,int64_t _gap_open_target_interior,int64_t _gap_open_query_right,int64_t _gap_open_target_right,int64_t _gap_extension_query_left,int64_t _gap_extension_target_left,int64_t _gap_extension_query_interior,int64_t _gap_extension_target_interior,int64_t _gap_extension_query_right,int64_t _gap_extension_target_right)625 void LinearMemoryAligner::set_parameters(int64_t * _scorematrix,
626                                          int64_t _gap_open_query_left,
627                                          int64_t _gap_open_target_left,
628                                          int64_t _gap_open_query_interior,
629                                          int64_t _gap_open_target_interior,
630                                          int64_t _gap_open_query_right,
631                                          int64_t _gap_open_target_right,
632                                          int64_t _gap_extension_query_left,
633                                          int64_t _gap_extension_target_left,
634                                          int64_t _gap_extension_query_interior,
635                                          int64_t _gap_extension_target_interior,
636                                          int64_t _gap_extension_query_right,
637                                          int64_t _gap_extension_target_right)
638 {
639   scorematrix = _scorematrix;
640 
641   /* a = query/q   b = t/target */
642 
643   go_q_l = _gap_open_query_left;
644   go_t_l = _gap_open_target_left;
645   go_q_i = _gap_open_query_interior;
646   go_t_i = _gap_open_target_interior;
647   go_q_r = _gap_open_query_right;
648   go_t_r = _gap_open_target_right;
649   ge_q_l = _gap_extension_query_left;
650   ge_t_l = _gap_extension_target_left;
651   ge_q_i = _gap_extension_query_interior;
652   ge_t_i = _gap_extension_target_interior;
653   ge_q_r = _gap_extension_query_right;
654   ge_t_r = _gap_extension_target_right;
655 
656   q = _gap_open_query_interior;
657   r = _gap_extension_query_interior;
658 }
659 
660 
661 
align(char * _a_seq,char * _b_seq,int64_t a_len,int64_t b_len)662 char * LinearMemoryAligner::align(char * _a_seq,
663                                   char * _b_seq,
664                                   int64_t a_len,
665                                   int64_t b_len)
666 {
667   /* copy parameters */
668   a_seq = _a_seq;
669   b_seq = _b_seq;
670 
671   /* init cigar operations */
672   cigar_reset();
673 
674   /* allocate enough memory for vectors */
675   alloc_vectors(b_len+1);
676 
677   /* perform alignment */
678   diff(0, 0, a_len, b_len, false, false, true, true, true, true);
679 
680   /* ensure entire cigar has been written */
681   cigar_flush();
682 
683   /* return cigar */
684   return cigar_string;
685 }
686 
alignstats(char * cigar,char * _a_seq,char * _b_seq,int64_t * _nwscore,int64_t * _nwalignmentlength,int64_t * _nwmatches,int64_t * _nwmismatches,int64_t * _nwgaps)687 void LinearMemoryAligner::alignstats(char * cigar,
688                                      char * _a_seq,
689                                      char * _b_seq,
690                                      int64_t * _nwscore,
691                                      int64_t * _nwalignmentlength,
692                                      int64_t * _nwmatches,
693                                      int64_t * _nwmismatches,
694                                      int64_t * _nwgaps)
695 {
696   a_seq = _a_seq;
697   b_seq = _b_seq;
698 
699   int64_t nwscore = 0;
700   int64_t nwalignmentlength = 0;
701   int64_t nwmatches = 0;
702   int64_t nwmismatches = 0;
703   int64_t nwgaps = 0;
704 
705   int64_t a_pos = 0;
706   int64_t b_pos = 0;
707 
708   char * p = cigar;
709 
710   int64_t g;
711 
712   while (*p)
713     {
714       int64_t run = 1;
715       int scanlength = 0;
716       sscanf(p, "%" PRId64 "%n", &run, &scanlength);
717       p += scanlength;
718       switch (*p++)
719         {
720         case 'M':
721           nwalignmentlength += run;
722           for(int64_t k=0; k<run; k++)
723             {
724               nwscore += subst_score(a_pos, b_pos);
725 
726               if (chrmap_4bit[(int)(a_seq[a_pos])] &
727                   chrmap_4bit[(int)(b_seq[b_pos])]) {
728                 nwmatches++;
729               } else {
730                 nwmismatches++;
731 
732         }
733 
734               a_pos++;
735               b_pos++;
736             }
737           break;
738 
739         case 'I':
740           if ((a_pos == 0) && (b_pos == 0)) {
741             g = go_q_l + run * ge_q_l;
742           } else if (*p == 0) {
743             g = go_q_r + run * ge_q_r;
744           } else {
745             g = go_q_i + run * ge_q_i;
746 
747         }
748           nwscore -= g;
749           nwgaps++;
750           nwalignmentlength += run;
751           b_pos += run;
752           break;
753 
754         case 'D':
755           if ((a_pos == 0) && (b_pos == 0)) {
756             g = go_t_l + run * ge_t_l;
757           } else if (*p == 0) {
758             g = go_t_r + run * ge_t_r;
759           } else {
760             g = go_t_i + run * ge_t_i;
761 
762         }
763           nwscore -= g;
764           nwgaps++;
765           nwalignmentlength += run;
766           a_pos += run;
767           break;
768         }
769     }
770 
771   *_nwscore = nwscore;
772   *_nwalignmentlength = nwalignmentlength;
773   *_nwmatches = nwmatches;
774   *_nwmismatches = nwmismatches;
775   *_nwgaps = nwgaps;
776 }
777 
778