1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include <stdio.h>
23 
24 #include <U2Core/U2Region.h>
25 
26 #include "SmithWatermanAlgorithm.h"
27 
28 namespace U2 {
29 
30 const char SmithWatermanAlgorithm::STOP = 's';
31 const char SmithWatermanAlgorithm::UP = 'u';
32 const char SmithWatermanAlgorithm::LEFT = 'l';
33 const char SmithWatermanAlgorithm::DIAG = 'd';
34 
SmithWatermanAlgorithm()35 SmithWatermanAlgorithm::SmithWatermanAlgorithm() {
36     gapOpen = 0;
37     gapExtension = 0;
38     minScore = 0;
39     matrixLength = 0;
40 }
41 
estimateNeededRamAmount(const qint32 gapOpen,const qint32 gapExtension,const quint32 minScore,const quint32 maxScore,const QByteArray & patternSeq,const QByteArray & searchSeq,const SmithWatermanSettings::SWResultView resultView)42 quint64 SmithWatermanAlgorithm::estimateNeededRamAmount(const qint32 gapOpen,
43                                                         const qint32 gapExtension,
44                                                         const quint32 minScore,
45                                                         const quint32 maxScore,
46                                                         const QByteArray &patternSeq,
47                                                         const QByteArray &searchSeq,
48                                                         const SmithWatermanSettings::SWResultView resultView) {
49     const double b_to_mb_factor = 1048576.0;
50 
51     const quint64 queryLength = patternSeq.length();
52     const quint64 searchLength = searchSeq.length();
53 
54     quint64 memToAllocInBytes = 0;
55     if (SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
56         const qint32 maxGapPenalty = (gapOpen > gapExtension) ? gapOpen : gapExtension;
57         assert(0 > maxGapPenalty);
58         quint64 matrixLength = queryLength - (maxScore - minScore) / maxGapPenalty + 1;
59         if (searchLength + 1 < matrixLength) {
60             matrixLength = searchLength + 1;
61         }
62         memToAllocInBytes = queryLength * (2 * sizeof(int) + 0x80) + matrixLength * ((4 + queryLength + 3) >> 2);
63     } else if (SmithWatermanSettings::ANNOTATIONS == resultView) {
64         memToAllocInBytes = queryLength * (3 * sizeof(int) + 0x80);
65     } else {
66         assert(false);
67     }
68 
69     return memToAllocInBytes / b_to_mb_factor;
70 }
71 
calculateMatrixLength()72 bool SmithWatermanAlgorithm::calculateMatrixLength() {
73     int maxScore = 0;
74 
75     QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
76     int nCharsInAlphabet = alphaChars.size();
77     for (int i = 0; i < patternSeq.length(); i++) {
78         int max = 0;
79         for (int j = 0; j < nCharsInAlphabet; j++) {
80             // TODO: cache pattern seq raw pointer and alphaChars row pointer out of the loop
81             int substValue = substitutionMatrix.getScore(patternSeq.at(i), alphaChars.at(j));
82             max = qMax(max, substValue);
83         }
84         maxScore += max;
85     }
86 
87     if (minScore > maxScore)
88         return 0;
89     int gap = gapOpen > gapExtension ? gapOpen : gapExtension;
90     matrixLength = patternSeq.length() + (maxScore - minScore) / gap * (-1) + 1;
91     if (searchSeq.length() + 1 < matrixLength)
92         matrixLength = searchSeq.length() + 1;
93     return 1;
94 }
95 
setValues(const SMatrix & _substitutionMatrix,const QByteArray & _patternSeq,const QByteArray & _searchSeq,int _gapOpen,int _gapExtension,int _minScore,SmithWatermanSettings::SWResultView _resultView)96 void SmithWatermanAlgorithm::setValues(const SMatrix &_substitutionMatrix,
97                                        const QByteArray &_patternSeq,
98                                        const QByteArray &_searchSeq,
99                                        int _gapOpen,
100                                        int _gapExtension,
101                                        int _minScore,
102                                        SmithWatermanSettings::SWResultView _resultView) {
103     substitutionMatrix = _substitutionMatrix;
104     patternSeq = _patternSeq;
105     searchSeq = _searchSeq;
106     gapOpen = _gapOpen;
107     gapExtension = _gapExtension;
108     minScore = _minScore;
109     resultView = _resultView;
110 }
111 
launch(const SMatrix & _substitutionMatrix,const QByteArray & _patternSeq,const QByteArray & _searchSeq,int _gapOpen,int _gapExtension,int _minScore,SmithWatermanSettings::SWResultView _resultView)112 void SmithWatermanAlgorithm::launch(const SMatrix &_substitutionMatrix,
113                                     const QByteArray &_patternSeq,
114                                     const QByteArray &_searchSeq,
115                                     int _gapOpen,
116                                     int _gapExtension,
117                                     int _minScore,
118                                     SmithWatermanSettings::SWResultView _resultView) {
119     setValues(_substitutionMatrix, _patternSeq, _searchSeq, _gapOpen, _gapExtension, _minScore, _resultView);
120     if (isValidParams() && calculateMatrixLength()) {
121         switch (resultView) {
122             case SmithWatermanSettings::MULTIPLE_ALIGNMENT:
123                 calculateMatrixForMultipleAlignmentResult();
124                 break;
125             case SmithWatermanSettings::ANNOTATIONS:
126                 calculateMatrixForAnnotationsResult();
127                 break;
128             default:
129                 assert(false);
130         }
131     }
132 }
133 
isValidParams()134 bool SmithWatermanAlgorithm::isValidParams() {
135     if (searchSeq.length() <= 0 || patternSeq.length() <= 0)
136         return false;
137     if (searchSeq.length() < patternSeq.length())
138         return false;
139     if (gapOpen >= 0 || gapExtension >= 0)
140         return false;
141     return true;
142 }
143 
144 // Get results
145 // countResults - count of results will be return
getResults()146 QList<PairAlignSequences> SmithWatermanAlgorithm::getResults() {
147     return pairAlignmentStrings;
148 }
149 
sortByScore(QList<PairAlignSequences> & res)150 void SmithWatermanAlgorithm::sortByScore(QList<PairAlignSequences> &res) {
151     QList<PairAlignSequences> buf;
152     QVector<int> pos;
153     QVector<KeyOfPairAlignSeq> sortedScores;
154 
155     for (int i = 0; i < res.size(); i++)
156         for (int j = i + 1; j < res.size(); j++) {
157             if (res.at(i).score < res.at(j).score) {
158                 KeyOfPairAlignSeq::exchange(res[i], res[j]);
159             } else if (res.at(i).score == res.at(j).score && res.at(i).refSubseqInterval.startPos > res.at(j).refSubseqInterval.startPos) {
160                 KeyOfPairAlignSeq::exchange(res[i], res[j]);
161             } else if (res.at(i).score == res.at(j).score && res.at(i).refSubseqInterval.startPos == res.at(j).refSubseqInterval.startPos && res.at(i).refSubseqInterval.length > res.at(j).refSubseqInterval.length) {
162                 KeyOfPairAlignSeq::exchange(res[i], res[j]);
163             }
164         }
165 }
166 
calculateMatrixForMultipleAlignmentResult()167 void SmithWatermanAlgorithm::calculateMatrixForMultipleAlignmentResult() {
168     int subst, max, max1;
169     int i, j, n, e1, f1, x;
170     unsigned int xpos = 0;
171     unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
172     unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
173 
174     n = pat_n * 2;
175     unsigned int dirn = (4 + pat_n + 3) >> 2;
176     unsigned int memory = n * sizeof(int) + pat_n * 0x80 + matrixLength * dirn;
177     int *buf, *matrix = (int *)malloc(memory);
178     if (matrix == nullptr) {
179         std::bad_alloc e;
180         throw e;
181     }
182     char *score, *score1 = (char *)(matrix + n);
183     unsigned char *dir, *dir2, *dir1 = (unsigned char *)score1 + pat_n * 0x80;
184     memset(matrix, 0, n * sizeof(int));
185     memset(dir1, 0, dirn);
186     dir = dir1 + dirn;
187     dir2 = dir1 + matrixLength * dirn;
188 
189     QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
190     char *alphaCharsData = alphaChars.data();
191     n = alphaChars.size();
192     for (i = 0; i < n; i++) {
193         unsigned char ch = alphaCharsData[i];
194         score = score1 + ch * pat_n;
195         j = 0;
196         do {
197             score[j] = substitutionMatrix.getScore(ch, pat[j]);
198         } while (++j < static_cast<int>(pat_n));
199     }
200 
201     PairAlignSequences p;
202     p.refSubseqInterval.startPos = 0;
203     p.score = 0;
204 
205 #define SW_LOOP(SWX, N) \
206     max = subst + *score++; \
207     x = 3 << N; \
208     f1 = buf[1]; \
209     if (max <= 0) { \
210         max = 0; \
211         x = 0; \
212     } \
213     if (max >= max1) { \
214         max1 = max; \
215         xpos = j; \
216     } \
217     if (max < f1) { \
218         max = f1; \
219         x = 2 << N; \
220     } \
221     if (max < e1) { \
222         max = e1; \
223         x = 1 << N; \
224     } \
225     subst = buf[0]; \
226     buf[0] = max; \
227     SWX; \
228 \
229     e1 += gapExtension; \
230     max += gapOpen; \
231     f1 = buf[1] + gapExtension; \
232     e1 = e1 > max ? e1 : max; \
233     f1 = f1 > max ? f1 : max; \
234     buf[1] = f1; \
235     buf += 2;
236 
237     i = 1;
238     do {
239         buf = matrix;
240         score = score1 + src[i - 1] * pat_n;
241         e1 = 0;
242         max1 = 0;
243         subst = 0;
244 
245         if (dir == dir2) {
246             dir = dir1;
247         }
248         *dir++ = 0;
249         j = pat_n;
250         do {
251             SW_LOOP(*dir++ = x, 0);
252             if (!(--j))
253                 break;
254             SW_LOOP(dir[-1] |= x, 2);
255             if (!(--j))
256                 break;
257             SW_LOOP(dir[-1] |= x, 4);
258             if (!(--j))
259                 break;
260             SW_LOOP(dir[-1] |= x, 6);
261         } while (--j);
262 
263 #undef SW_LOOP
264 
265         /*
266         for(j = 0; j < pat_n; j++)
267         printf(" %02X", matrix[j * 2]);
268         printf("\n");
269     */
270         if (max1 >= minScore) {
271             QByteArray pairAlign;
272             xpos = pat_n - xpos + 4;
273             j = i;
274             int xend = xpos - 3;
275             unsigned char *xdir = (unsigned char *)dir - dirn;
276             for (;;) {
277                 x = (xdir[xpos >> 2] >> ((xpos & 3) * 2)) & 3;
278                 if (!x)
279                     break;
280                 if (x == 1) {
281                     pairAlign.append(PairAlignSequences::LEFT);
282                     xpos--;
283                     continue;
284                 }
285                 if (x == 2) {
286                     pairAlign.append(PairAlignSequences::UP);
287                 } else if (x == 3) {
288                     pairAlign.append(PairAlignSequences::DIAG);
289                     xpos--;
290                 }
291                 if (xdir == dir1) {
292                     xdir = dir2;
293                 }
294                 if (xdir == dir) {
295                     /* printf("#error\n"); */ break;
296                 }
297                 xdir -= dirn;
298                 j--;
299             }
300             xpos -= 3;
301 
302             p.score = max1;
303             p.refSubseqInterval.startPos = j;
304             p.refSubseqInterval.length = i - j;
305             p.ptrnSubseqInterval.startPos = xpos;
306             p.ptrnSubseqInterval.length = xend - xpos;
307             p.pairAlignment = pairAlign;
308             pairAlignmentStrings.append(p);
309 
310             // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
311             // printf("#%i-%i %s\n", xpos, xend - xpos, pairAlign.data());
312         }
313     } while (++i <= static_cast<int>(src_n));
314 
315     free(matrix);
316 }
317 
calculateMatrixForAnnotationsResult()318 void SmithWatermanAlgorithm::calculateMatrixForAnnotationsResult() {
319     int subst, max, pos = 0, max1;
320     int i, j, n, e1, f1, fpos;
321     unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
322     unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
323 
324     n = pat_n * 3;
325     int *buf, *matrix = (int *)malloc(n * sizeof(int) + pat_n * 0x80);
326     if (matrix == nullptr) {
327         std::bad_alloc e;
328         throw e;
329     }
330     char *score, *score1 = (char *)(matrix + n);
331     memset(matrix, 0, n * sizeof(int));
332 
333     QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
334     char *alphaCharsData = alphaChars.data();
335     n = alphaChars.size();
336     for (i = 0; i < n; i++) {
337         unsigned char ch = alphaCharsData[i];
338         score = score1 + ch * pat_n;
339         j = 0;
340         do {
341             score[j] = substitutionMatrix.getScore(ch, pat[j]);
342         } while (++j < static_cast<int>(pat_n));
343     }
344 
345     PairAlignSequences p;
346 
347     p.refSubseqInterval.startPos = 0;
348     p.score = 0;
349 
350     i = 1;
351     do {
352         buf = matrix;
353         score = score1 + src[i - 1] * pat_n;
354         e1 = 0;
355         max1 = 0;
356         subst = 0;
357         fpos = i - 1;
358         j = pat_n;
359         do {
360             max = subst + *score++;
361             n = fpos;
362             f1 = buf[2];
363             if (max <= 0) {
364                 max = 0;
365                 n = i;
366             }
367             if (max >= max1) {
368                 max1 = max;
369                 pos = n;
370             }
371             if (max < f1) {
372                 max = f1;
373                 n = buf[1];
374             }
375             if (max < e1) {
376                 max = e1;
377                 n = buf[1 - 3];
378             }
379             subst = buf[0];
380             fpos = buf[1];
381             buf[0] = max;
382             buf[1] = n;
383 
384             e1 += gapExtension;
385             max += gapOpen;
386             f1 = buf[2] + gapExtension;
387             e1 = e1 > max ? e1 : max;
388             f1 = f1 > max ? f1 : max;
389             buf[2] = f1;
390             buf += 3;
391         } while (--j);
392 
393         // #define SW_FILT
394 
395         if (max1 >= minScore) {
396 #ifdef SW_FILT
397             if (p.refSubseqInterval.startPos != pos) {
398                 if (p.score) {
399                     pairAlignmentStrings.append(p);
400                     // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
401                 }
402                 p.refSubseqInterval.startPos = pos;
403                 p.refSubseqInterval.length = i - pos;
404                 p.score = max1;
405             } else if (p.score < max1) {
406                 p.refSubseqInterval.length = i - pos;
407                 p.score = max1;
408             }
409 #else
410             p.refSubseqInterval.startPos = pos;
411             p.refSubseqInterval.length = i - pos;
412             p.score = max1;
413             pairAlignmentStrings.append(p);
414             // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
415 #endif
416         }
417     } while (++i <= static_cast<int>(src_n));
418 
419 #ifdef SW_FILT
420     if (p.score) {
421         pairAlignmentStrings.append(p);
422         // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
423     }
424 #endif
425 
426     free(matrix);
427 }
428 
429 }  // namespace U2
430