1 /**
2 * UGENE - Integrated Bioinformatics Tools.
3 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4 * http://ugene.net
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version 2
9 * of the License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19 * MA 02110-1301, USA.
20 */
21
22 #include <stdio.h>
23
24 #include <U2Core/U2Region.h>
25
26 #include "SmithWatermanAlgorithm.h"
27
28 namespace U2 {
29
30 const char SmithWatermanAlgorithm::STOP = 's';
31 const char SmithWatermanAlgorithm::UP = 'u';
32 const char SmithWatermanAlgorithm::LEFT = 'l';
33 const char SmithWatermanAlgorithm::DIAG = 'd';
34
SmithWatermanAlgorithm()35 SmithWatermanAlgorithm::SmithWatermanAlgorithm() {
36 gapOpen = 0;
37 gapExtension = 0;
38 minScore = 0;
39 matrixLength = 0;
40 }
41
estimateNeededRamAmount(const qint32 gapOpen,const qint32 gapExtension,const quint32 minScore,const quint32 maxScore,const QByteArray & patternSeq,const QByteArray & searchSeq,const SmithWatermanSettings::SWResultView resultView)42 quint64 SmithWatermanAlgorithm::estimateNeededRamAmount(const qint32 gapOpen,
43 const qint32 gapExtension,
44 const quint32 minScore,
45 const quint32 maxScore,
46 const QByteArray &patternSeq,
47 const QByteArray &searchSeq,
48 const SmithWatermanSettings::SWResultView resultView) {
49 const double b_to_mb_factor = 1048576.0;
50
51 const quint64 queryLength = patternSeq.length();
52 const quint64 searchLength = searchSeq.length();
53
54 quint64 memToAllocInBytes = 0;
55 if (SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
56 const qint32 maxGapPenalty = (gapOpen > gapExtension) ? gapOpen : gapExtension;
57 assert(0 > maxGapPenalty);
58 quint64 matrixLength = queryLength - (maxScore - minScore) / maxGapPenalty + 1;
59 if (searchLength + 1 < matrixLength) {
60 matrixLength = searchLength + 1;
61 }
62 memToAllocInBytes = queryLength * (2 * sizeof(int) + 0x80) + matrixLength * ((4 + queryLength + 3) >> 2);
63 } else if (SmithWatermanSettings::ANNOTATIONS == resultView) {
64 memToAllocInBytes = queryLength * (3 * sizeof(int) + 0x80);
65 } else {
66 assert(false);
67 }
68
69 return memToAllocInBytes / b_to_mb_factor;
70 }
71
calculateMatrixLength()72 bool SmithWatermanAlgorithm::calculateMatrixLength() {
73 int maxScore = 0;
74
75 QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
76 int nCharsInAlphabet = alphaChars.size();
77 for (int i = 0; i < patternSeq.length(); i++) {
78 int max = 0;
79 for (int j = 0; j < nCharsInAlphabet; j++) {
80 // TODO: cache pattern seq raw pointer and alphaChars row pointer out of the loop
81 int substValue = substitutionMatrix.getScore(patternSeq.at(i), alphaChars.at(j));
82 max = qMax(max, substValue);
83 }
84 maxScore += max;
85 }
86
87 if (minScore > maxScore)
88 return 0;
89 int gap = gapOpen > gapExtension ? gapOpen : gapExtension;
90 matrixLength = patternSeq.length() + (maxScore - minScore) / gap * (-1) + 1;
91 if (searchSeq.length() + 1 < matrixLength)
92 matrixLength = searchSeq.length() + 1;
93 return 1;
94 }
95
setValues(const SMatrix & _substitutionMatrix,const QByteArray & _patternSeq,const QByteArray & _searchSeq,int _gapOpen,int _gapExtension,int _minScore,SmithWatermanSettings::SWResultView _resultView)96 void SmithWatermanAlgorithm::setValues(const SMatrix &_substitutionMatrix,
97 const QByteArray &_patternSeq,
98 const QByteArray &_searchSeq,
99 int _gapOpen,
100 int _gapExtension,
101 int _minScore,
102 SmithWatermanSettings::SWResultView _resultView) {
103 substitutionMatrix = _substitutionMatrix;
104 patternSeq = _patternSeq;
105 searchSeq = _searchSeq;
106 gapOpen = _gapOpen;
107 gapExtension = _gapExtension;
108 minScore = _minScore;
109 resultView = _resultView;
110 }
111
launch(const SMatrix & _substitutionMatrix,const QByteArray & _patternSeq,const QByteArray & _searchSeq,int _gapOpen,int _gapExtension,int _minScore,SmithWatermanSettings::SWResultView _resultView)112 void SmithWatermanAlgorithm::launch(const SMatrix &_substitutionMatrix,
113 const QByteArray &_patternSeq,
114 const QByteArray &_searchSeq,
115 int _gapOpen,
116 int _gapExtension,
117 int _minScore,
118 SmithWatermanSettings::SWResultView _resultView) {
119 setValues(_substitutionMatrix, _patternSeq, _searchSeq, _gapOpen, _gapExtension, _minScore, _resultView);
120 if (isValidParams() && calculateMatrixLength()) {
121 switch (resultView) {
122 case SmithWatermanSettings::MULTIPLE_ALIGNMENT:
123 calculateMatrixForMultipleAlignmentResult();
124 break;
125 case SmithWatermanSettings::ANNOTATIONS:
126 calculateMatrixForAnnotationsResult();
127 break;
128 default:
129 assert(false);
130 }
131 }
132 }
133
isValidParams()134 bool SmithWatermanAlgorithm::isValidParams() {
135 if (searchSeq.length() <= 0 || patternSeq.length() <= 0)
136 return false;
137 if (searchSeq.length() < patternSeq.length())
138 return false;
139 if (gapOpen >= 0 || gapExtension >= 0)
140 return false;
141 return true;
142 }
143
144 // Get results
145 // countResults - count of results will be return
getResults()146 QList<PairAlignSequences> SmithWatermanAlgorithm::getResults() {
147 return pairAlignmentStrings;
148 }
149
sortByScore(QList<PairAlignSequences> & res)150 void SmithWatermanAlgorithm::sortByScore(QList<PairAlignSequences> &res) {
151 QList<PairAlignSequences> buf;
152 QVector<int> pos;
153 QVector<KeyOfPairAlignSeq> sortedScores;
154
155 for (int i = 0; i < res.size(); i++)
156 for (int j = i + 1; j < res.size(); j++) {
157 if (res.at(i).score < res.at(j).score) {
158 KeyOfPairAlignSeq::exchange(res[i], res[j]);
159 } else if (res.at(i).score == res.at(j).score && res.at(i).refSubseqInterval.startPos > res.at(j).refSubseqInterval.startPos) {
160 KeyOfPairAlignSeq::exchange(res[i], res[j]);
161 } else if (res.at(i).score == res.at(j).score && res.at(i).refSubseqInterval.startPos == res.at(j).refSubseqInterval.startPos && res.at(i).refSubseqInterval.length > res.at(j).refSubseqInterval.length) {
162 KeyOfPairAlignSeq::exchange(res[i], res[j]);
163 }
164 }
165 }
166
calculateMatrixForMultipleAlignmentResult()167 void SmithWatermanAlgorithm::calculateMatrixForMultipleAlignmentResult() {
168 int subst, max, max1;
169 int i, j, n, e1, f1, x;
170 unsigned int xpos = 0;
171 unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
172 unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
173
174 n = pat_n * 2;
175 unsigned int dirn = (4 + pat_n + 3) >> 2;
176 unsigned int memory = n * sizeof(int) + pat_n * 0x80 + matrixLength * dirn;
177 int *buf, *matrix = (int *)malloc(memory);
178 if (matrix == nullptr) {
179 std::bad_alloc e;
180 throw e;
181 }
182 char *score, *score1 = (char *)(matrix + n);
183 unsigned char *dir, *dir2, *dir1 = (unsigned char *)score1 + pat_n * 0x80;
184 memset(matrix, 0, n * sizeof(int));
185 memset(dir1, 0, dirn);
186 dir = dir1 + dirn;
187 dir2 = dir1 + matrixLength * dirn;
188
189 QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
190 char *alphaCharsData = alphaChars.data();
191 n = alphaChars.size();
192 for (i = 0; i < n; i++) {
193 unsigned char ch = alphaCharsData[i];
194 score = score1 + ch * pat_n;
195 j = 0;
196 do {
197 score[j] = substitutionMatrix.getScore(ch, pat[j]);
198 } while (++j < static_cast<int>(pat_n));
199 }
200
201 PairAlignSequences p;
202 p.refSubseqInterval.startPos = 0;
203 p.score = 0;
204
205 #define SW_LOOP(SWX, N) \
206 max = subst + *score++; \
207 x = 3 << N; \
208 f1 = buf[1]; \
209 if (max <= 0) { \
210 max = 0; \
211 x = 0; \
212 } \
213 if (max >= max1) { \
214 max1 = max; \
215 xpos = j; \
216 } \
217 if (max < f1) { \
218 max = f1; \
219 x = 2 << N; \
220 } \
221 if (max < e1) { \
222 max = e1; \
223 x = 1 << N; \
224 } \
225 subst = buf[0]; \
226 buf[0] = max; \
227 SWX; \
228 \
229 e1 += gapExtension; \
230 max += gapOpen; \
231 f1 = buf[1] + gapExtension; \
232 e1 = e1 > max ? e1 : max; \
233 f1 = f1 > max ? f1 : max; \
234 buf[1] = f1; \
235 buf += 2;
236
237 i = 1;
238 do {
239 buf = matrix;
240 score = score1 + src[i - 1] * pat_n;
241 e1 = 0;
242 max1 = 0;
243 subst = 0;
244
245 if (dir == dir2) {
246 dir = dir1;
247 }
248 *dir++ = 0;
249 j = pat_n;
250 do {
251 SW_LOOP(*dir++ = x, 0);
252 if (!(--j))
253 break;
254 SW_LOOP(dir[-1] |= x, 2);
255 if (!(--j))
256 break;
257 SW_LOOP(dir[-1] |= x, 4);
258 if (!(--j))
259 break;
260 SW_LOOP(dir[-1] |= x, 6);
261 } while (--j);
262
263 #undef SW_LOOP
264
265 /*
266 for(j = 0; j < pat_n; j++)
267 printf(" %02X", matrix[j * 2]);
268 printf("\n");
269 */
270 if (max1 >= minScore) {
271 QByteArray pairAlign;
272 xpos = pat_n - xpos + 4;
273 j = i;
274 int xend = xpos - 3;
275 unsigned char *xdir = (unsigned char *)dir - dirn;
276 for (;;) {
277 x = (xdir[xpos >> 2] >> ((xpos & 3) * 2)) & 3;
278 if (!x)
279 break;
280 if (x == 1) {
281 pairAlign.append(PairAlignSequences::LEFT);
282 xpos--;
283 continue;
284 }
285 if (x == 2) {
286 pairAlign.append(PairAlignSequences::UP);
287 } else if (x == 3) {
288 pairAlign.append(PairAlignSequences::DIAG);
289 xpos--;
290 }
291 if (xdir == dir1) {
292 xdir = dir2;
293 }
294 if (xdir == dir) {
295 /* printf("#error\n"); */ break;
296 }
297 xdir -= dirn;
298 j--;
299 }
300 xpos -= 3;
301
302 p.score = max1;
303 p.refSubseqInterval.startPos = j;
304 p.refSubseqInterval.length = i - j;
305 p.ptrnSubseqInterval.startPos = xpos;
306 p.ptrnSubseqInterval.length = xend - xpos;
307 p.pairAlignment = pairAlign;
308 pairAlignmentStrings.append(p);
309
310 // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
311 // printf("#%i-%i %s\n", xpos, xend - xpos, pairAlign.data());
312 }
313 } while (++i <= static_cast<int>(src_n));
314
315 free(matrix);
316 }
317
calculateMatrixForAnnotationsResult()318 void SmithWatermanAlgorithm::calculateMatrixForAnnotationsResult() {
319 int subst, max, pos = 0, max1;
320 int i, j, n, e1, f1, fpos;
321 unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
322 unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
323
324 n = pat_n * 3;
325 int *buf, *matrix = (int *)malloc(n * sizeof(int) + pat_n * 0x80);
326 if (matrix == nullptr) {
327 std::bad_alloc e;
328 throw e;
329 }
330 char *score, *score1 = (char *)(matrix + n);
331 memset(matrix, 0, n * sizeof(int));
332
333 QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
334 char *alphaCharsData = alphaChars.data();
335 n = alphaChars.size();
336 for (i = 0; i < n; i++) {
337 unsigned char ch = alphaCharsData[i];
338 score = score1 + ch * pat_n;
339 j = 0;
340 do {
341 score[j] = substitutionMatrix.getScore(ch, pat[j]);
342 } while (++j < static_cast<int>(pat_n));
343 }
344
345 PairAlignSequences p;
346
347 p.refSubseqInterval.startPos = 0;
348 p.score = 0;
349
350 i = 1;
351 do {
352 buf = matrix;
353 score = score1 + src[i - 1] * pat_n;
354 e1 = 0;
355 max1 = 0;
356 subst = 0;
357 fpos = i - 1;
358 j = pat_n;
359 do {
360 max = subst + *score++;
361 n = fpos;
362 f1 = buf[2];
363 if (max <= 0) {
364 max = 0;
365 n = i;
366 }
367 if (max >= max1) {
368 max1 = max;
369 pos = n;
370 }
371 if (max < f1) {
372 max = f1;
373 n = buf[1];
374 }
375 if (max < e1) {
376 max = e1;
377 n = buf[1 - 3];
378 }
379 subst = buf[0];
380 fpos = buf[1];
381 buf[0] = max;
382 buf[1] = n;
383
384 e1 += gapExtension;
385 max += gapOpen;
386 f1 = buf[2] + gapExtension;
387 e1 = e1 > max ? e1 : max;
388 f1 = f1 > max ? f1 : max;
389 buf[2] = f1;
390 buf += 3;
391 } while (--j);
392
393 // #define SW_FILT
394
395 if (max1 >= minScore) {
396 #ifdef SW_FILT
397 if (p.refSubseqInterval.startPos != pos) {
398 if (p.score) {
399 pairAlignmentStrings.append(p);
400 // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
401 }
402 p.refSubseqInterval.startPos = pos;
403 p.refSubseqInterval.length = i - pos;
404 p.score = max1;
405 } else if (p.score < max1) {
406 p.refSubseqInterval.length = i - pos;
407 p.score = max1;
408 }
409 #else
410 p.refSubseqInterval.startPos = pos;
411 p.refSubseqInterval.length = i - pos;
412 p.score = max1;
413 pairAlignmentStrings.append(p);
414 // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
415 #endif
416 }
417 } while (++i <= static_cast<int>(src_n));
418
419 #ifdef SW_FILT
420 if (p.score) {
421 pairAlignmentStrings.append(p);
422 // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
423 }
424 #endif
425
426 free(matrix);
427 }
428
429 } // namespace U2
430