/**
 * UGENE - Integrated Bioinformatics Tools.
 * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
 * http://ugene.net
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301, USA.
 */

#include "SmithWatermanAlgorithmSSE2.h"
#include <emmintrin.h>
#include <iostream>

#ifdef _MSC_VER
#    pragma warning(disable : 4799)
#endif

extern "C" {

// http://sourceforge.net/p/predef/wiki/Compilers/

#ifdef _MSC_VER
#    define LAME_MSC
#endif

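// On MSVC and Darwin the MMX-based _mm_movepi64_pi64 / _mm_movpi64_epi64
// intrinsics are avoided (64-bit MSVC does not support MMX intrinsics at all),
// so the 8-byte store/load is emulated with two 32-bit transfers instead.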
#if defined(LAME_MSC) || defined(Q_OS_DARWIN)
inline void xmm_store8(__m64 *m, __m128i x) {
    *(int *)m = _mm_cvtsi128_si32(x);
    *((int *)m + 1) = _mm_cvtsi128_si32(_mm_srli_epi64(x, 32));
}

inline __m128i xmm_load8(__m64 *m) {
    return _mm_or_si128(_mm_cvtsi32_si128(*(int *)m), _mm_slli_si128(_mm_cvtsi32_si128(*((int *)m + 1)), 4));
}
#else
inline void xmm_store8(__m64 *m, __m128i x) {
    *m = _mm_movepi64_pi64(x);
}
inline __m128i xmm_load8(__m64 *m) {
    return _mm_movpi64_epi64(*m);
}
#endif
}

using namespace std;

#ifdef __GNUC__
#    pragma GCC diagnostic push
#    pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif

namespace U2 {

quint64 SmithWatermanAlgorithmSSE2::estimateNeededRamAmount(const QByteArray &_patternSeq,
                                                            const QByteArray &_searchSeq,
                                                            const qint32 gapOpen,
                                                            const qint32 gapExtension,
                                                            const quint32 minScore,
                                                            const quint32 maxScore,
                                                            const SmithWatermanSettings::SWResultView resultView) {
    const double b_to_mb_factor = 1048576.0;

    const quint64 queryLength = _patternSeq.length();
    const quint64 searchLength = _searchSeq.length();

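    // One SSE vector holds eight 16-bit cells, so each matrix column occupies
    // ceil(queryLength / 8) vectors in the striped layout.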
    const quint32 iter = (queryLength + 7) >> 3;

    quint64 memNeeded = 0;
    if (SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
        qint32 maxGapPenalty = (gapOpen > gapExtension) ? gapOpen : gapExtension;
        assert(0 > maxGapPenalty);

        quint64 matrixLength = queryLength - (maxScore - minScore) / maxGapPenalty + 1;
        if (searchLength + 1 < matrixLength) {
            matrixLength = searchLength + 1;
        }

        memNeeded = iter * (131 + matrixLength) * sizeof(__m128i);
    } else if (SmithWatermanSettings::ANNOTATIONS == resultView) {
        memNeeded = (5 + iter * 133) * sizeof(__m128i);
    } else {
        assert(false);
    }

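    // The estimate is returned in megabytes, not bytes.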
    return memNeeded / b_to_mb_factor;
}

void SmithWatermanAlgorithmSSE2::launch(const SMatrix &_substitutionMatrix, const QByteArray &_patternSeq, const QByteArray &_searchSeq, int _gapOpen, int _gapExtension, int _minScore, SmithWatermanSettings::SWResultView _resultView) {
    setValues(_substitutionMatrix, _patternSeq, _searchSeq, _gapOpen, _gapExtension, _minScore, _resultView);
    int maxScore = 0;
    if (isValidParams() && calculateMatrixLength()) {
        maxScore = calculateMatrixSSE2(patternSeq.length(), (unsigned char *)searchSeq.data(), searchSeq.length(), (-1) * (gapOpen + gapExtension), (-1) * (gapExtension));

        if (minScore <= maxScore) {
            if (maxScore >= 0x8000 || matrixLength >= 0x10000) {
                switch (resultView) {
                    case SmithWatermanSettings::MULTIPLE_ALIGNMENT:
                        calculateMatrixForMultipleAlignmentResultWithInt();
                        break;
                    case SmithWatermanSettings::ANNOTATIONS:
                        calculateMatrixForAnnotationsResultWithInt();
                        break;
                    default:
                        assert(false);
                }
            } else {
                switch (resultView) {
                    case SmithWatermanSettings::MULTIPLE_ALIGNMENT:
                        calculateMatrixForMultipleAlignmentResultWithShort();
                        break;
                    case SmithWatermanSettings::ANNOTATIONS:
                        calculateMatrixForAnnotationsResultWithShort();
                        break;
                    default:
                        assert(false);
                }
            }
        }
    }
}

void SmithWatermanAlgorithmSSE2::calculateMatrixForMultipleAlignmentResultWithShort() {
    int i, j, alphaCharSize, k, max1;
    __m128i f1 = _mm_setzero_si128(), f2 = _mm_setzero_si128(), f3 = _mm_setzero_si128(), f4 = _mm_setzero_si128(), e1 = _mm_setzero_si128();
    unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
    unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
    unsigned int iter = (pat_n + 7) >> 3;

    alphaCharSize = iter * 2;
    __m128i *buf, *matrix = (__m128i *)_mm_malloc((alphaCharSize + iter * 0x80) * 16 + iter * 8 * 4 + matrixLength * iter * 8, 16);
    short *score, *score1 = (short *)(matrix + alphaCharSize);
    int *map = (int *)(score1 + iter * 0x80 * 8);
    char *dir, *dir2, *dir1 = (char *)(map + iter * 8);
    memset(matrix, 0, alphaCharSize * sizeof(__m128i));
    memset(dir1, 0, iter * 8);
    dir = dir1 + iter * 8;
    dir2 = dir1 + matrixLength * iter * 8;

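    // map[] translates a pattern position in the striped (lane-interleaved)
    // layout to the byte offset of its direction flag within a traceback row.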
    for (i = 0, j = 0; j < static_cast<int>(iter); j++) {
        for (k = j, alphaCharSize = 0; alphaCharSize < 8; alphaCharSize++, k += iter) {
            map[k] = i++;
        }
    }

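    // Precompute the query profile: for every alphabet character, the
    // substitution scores against the whole pattern in striped vector order.
    // Lanes past the pattern end are padded with -0x8000 so they never win.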
    QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
    char *alphaCharsData = alphaChars.data();
    alphaCharSize = alphaChars.size();
    for (i = 0; i < alphaCharSize; i++) {
        int n;
        unsigned char ch = alphaCharsData[i];
        score = score1 + ch * iter * 8;
        for (j = 0; j < static_cast<int>(iter); j++) {
            for (k = j, n = 0; n < 8; n++, k += iter) {
                int a = -0x8000;
                if (k < static_cast<int>(pat_n)) {
                    a = substitutionMatrix.getScore(ch, pat[k]);
                }
                *score++ = a;
            }
        }
    }

    __m128i xMax = _mm_setzero_si128(), xPos = _mm_setzero_si128();
    __m128i xOpen = _mm_setzero_si128();
    xOpen = _mm_insert_epi16(xOpen, gapOpen, 0);
    __m128i xExt = _mm_setzero_si128();
    xExt = _mm_insert_epi16(xExt, gapExtension, 0);
    xOpen = _mm_shufflelo_epi16(xOpen, 0);
    xExt = _mm_shufflelo_epi16(xExt, 0);
    xOpen = _mm_unpacklo_epi32(xOpen, xOpen);
    xExt = _mm_unpacklo_epi32(xExt, xExt);
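    // gapOpen and gapExtension are now broadcast across all eight 16-bit lanes.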

    PairAlignSequences p;

    p.refSubseqInterval.startPos = 0;
    p.score = 0;

    i = 1;
    do {
        buf = matrix;
        score = score1 + src[i - 1] * iter * 8;
        xMax = _mm_xor_si128(xMax, xMax);
        f1 = _mm_slli_si128(_mm_load_si128(buf + (iter - 1) * 2), 2);
        e1 = _mm_xor_si128(e1, e1);
        if (dir == dir2)
            dir = dir1;
        j = iter;
        do {
            f2 = _mm_adds_epi16(f1, *((__m128i *)score));
            score += 8; /* subst */

            f3 = _mm_xor_si128(f3, f3);
            f2 = _mm_max_epi16(f2, f3);
            f3 = _mm_cmpgt_epi16(f2, f3);
            f3 = _mm_slli_epi16(f3, 2);
            /* f2 f3 */
            f4 = _mm_insert_epi16(f4, j, 0);
            f4 = _mm_shufflelo_epi16(f4, 0);
            f4 = _mm_unpacklo_epi32(f4, f4);
            xMax = _mm_max_epi16(xMax, f2);
            f1 = _mm_cmpeq_epi16(f2, xMax);
            xPos = _mm_or_si128(_mm_and_si128(f4, f1), _mm_andnot_si128(f1, xPos));

            f1 = _mm_load_si128(buf + 1);
            f1 = _mm_max_epi16(f1, f2);
            f2 = _mm_cmpgt_epi16(f1, f2);
            f2 = _mm_slli_epi16(f2, 1);
            f3 = _mm_or_si128(f3, f2);
            /* f1 f3 */
            f2 = _mm_max_epi16(e1, f1);
            f1 = _mm_cmpgt_epi16(e1, f1);
            f3 = _mm_or_si128(f3, f1);
            /* f2 f3 */
            f1 = _mm_load_si128(buf);
            _mm_store_si128(buf, f2);
            f3 = _mm_packs_epi16(f3, f3);
            // *(__m64*)dir = _mm_movepi64_pi64(f3); dir += 8;
            xmm_store8((__m64 *)dir, f3);
            dir += 8;

            f2 = _mm_adds_epi16(f2, xOpen);
            e1 = _mm_max_epi16(_mm_adds_epi16(e1, xExt), f2);
            f3 = _mm_load_si128(buf + 1);
            f3 = _mm_max_epi16(_mm_adds_epi16(f3, xExt), f2);
            _mm_store_si128(buf + 1, f3);
            buf += 2;
        } while (--j);

        buf = matrix;
        j = (-1) * iter;
        e1 = _mm_slli_si128(e1, 2);
        f1 = _mm_load_si128(buf);
        f3 = _mm_max_epi16(_mm_xor_si128(f3, f3), _mm_adds_epi16(f1, xOpen));
        k = _mm_movemask_epi8(_mm_cmpgt_epi16(e1, f3));
        if (k) {
            do {
                f2 = _mm_max_epi16(e1, f1);
                _mm_store_si128(buf, f2);

                f1 = _mm_cmpgt_epi16(f2, f1);
                f2 = _mm_adds_epi16(f2, xOpen);
                f1 = _mm_packs_epi16(f1, f1);
                // f1 = _mm_or_si128(f1, _mm_movpi64_epi64(*((__m64*)dir + j)));
                f1 = _mm_or_si128(f1, xmm_load8((__m64 *)dir + j));
                f2 = _mm_max_epi16(f2, *(buf + 1));
                // *((__m64*)dir + j) = _mm_movepi64_pi64(f1);
                xmm_store8((__m64 *)dir + j, f1);
                _mm_store_si128(buf + 1, f2);

                e1 = _mm_adds_epi16(e1, xExt);
                buf += 2;
                if (!(++j)) {
                    buf = matrix;
                    j = (-1) * iter;
                    e1 = _mm_slli_si128(e1, 2);
                }
                f1 = _mm_load_si128(buf);
                f3 = _mm_max_epi16(_mm_xor_si128(f3, f3), _mm_adds_epi16(f1, xOpen));
                k = _mm_movemask_epi8(_mm_cmpgt_epi16(e1, f3));
            } while (k);
        }
        /*
        for(j = 0; j < pat_n; j++)
            printf(" %02X", *((unsigned short*)(matrix + (j % iter) * 2) + (j / iter)));
        printf("\n");
        */
        max1 = *((short *)(&xMax));
        alphaCharSize = 0;
        k = 1;
        do {
            j = ((short *)(&xMax))[k];
            if (j >= max1) {
                max1 = j;
                alphaCharSize = k;
            }
        } while (++k < 8);

        if (max1 >= minScore) {
            QByteArray pairAlign;
            int xpos = 1 + alphaCharSize * iter + iter - ((unsigned short *)(&xPos))[alphaCharSize];
            j = i;
            int xend = xpos;
            char *xdir = dir - iter * 8;
            for (;;) {
                if (!xpos)
                    break;
                k = xdir[map[xpos - 1]];
                if (!k)
                    break;
                if (k == -1) {
                    pairAlign.append(PairAlignSequences::LEFT);
                    xpos--;
                    continue;
                }
                if (k == -2) {
                    pairAlign.append(PairAlignSequences::UP);
                } else if (k == -4) {
                    pairAlign.append(PairAlignSequences::DIAG);
                    xpos--;
                }
                if (xdir == dir1) {
                    xdir = dir2;
                }
                if (xdir == dir) {
                    /* printf("#error\n"); */ break;
                }
                xdir -= iter * 8;
                j--;
            }

            p.score = max1;
            p.refSubseqInterval.startPos = j;
            p.refSubseqInterval.length = i - j;
            p.ptrnSubseqInterval.startPos = xpos;
            p.ptrnSubseqInterval.length = xend - xpos;
            p.pairAlignment = pairAlign;
            pairAlignmentStrings.append(p);

            // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
            // printf("#%i-%i %s\n", xpos, xend - xpos, pairAlign.data());
        }
    } while (++i <= static_cast<int>(src_n));

    _mm_free(matrix);
}

void SmithWatermanAlgorithmSSE2::calculateMatrixForAnnotationsResultWithShort() {
    int i, j, n, k, max1;
    __m128i f1 = _mm_setzero_si128(), f2 = _mm_setzero_si128(), f3 = _mm_setzero_si128(), f4 = _mm_setzero_si128(), e1 = _mm_setzero_si128();
    unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
    unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
    unsigned int iter = (pat_n + 7) >> 3;

    n = (iter + 1) * 5;
    __m128i *buf, *matrix = (__m128i *)_mm_malloc((n + iter * 0x80) * sizeof(__m128i), 16);
    short *score, *score1 = (short *)(matrix + n);
    memset(matrix, 0, n * sizeof(__m128i));

    QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
    char *alphaCharsData = alphaChars.data();
    n = alphaChars.size();
    for (i = 0; i < n; i++) {
        int n2;
        unsigned char ch = alphaCharsData[i];
        score = score1 + ch * iter * 8;
        for (j = 0; j < static_cast<int>(iter); j++) {
            for (k = j, n2 = 0; n2 < 8; n2++, k += iter) {
                int a = -0x8000;
                if (k < static_cast<int>(pat_n)) {
                    a = substitutionMatrix.getScore(ch, pat[k]);
                }
                *score++ = a;
            }
        }
    }

    __m128i xMax = _mm_setzero_si128(), xPos = _mm_setzero_si128();
    __m128i xOpen = _mm_setzero_si128();
    xOpen = _mm_insert_epi16(xOpen, gapOpen, 0);
    __m128i xExt = _mm_setzero_si128();
    xExt = _mm_insert_epi16(xExt, gapExtension, 0);
    xOpen = _mm_shufflelo_epi16(xOpen, 0);
    xExt = _mm_shufflelo_epi16(xExt, 0);
    xOpen = _mm_unpacklo_epi32(xOpen, xOpen);
    xExt = _mm_unpacklo_epi32(xExt, xExt);

    PairAlignSequences p;

    p.refSubseqInterval.startPos = 0;
    p.score = 0;

#define SW_LOOP(SWA, SWB) \
    buf = matrix + 5; \
    score = score1 + src[i - 1] * iter * 8; \
    xMax = _mm_xor_si128(xMax, xMax); \
    f4 = _mm_insert_epi16(f4, i, 0); \
    f4 = _mm_shufflelo_epi16(f4, 0); \
    f4 = _mm_unpacklo_epi32(f4, f4); \
    f2 = _mm_slli_si128(_mm_load_si128(SWB + (iter - 1) * 5), 2); \
    f1 = _mm_slli_si128(_mm_load_si128(SWB + 1 + (iter - 1) * 5), 2); \
    f1 = _mm_insert_epi16(f1, i - 1, 0); \
    e1 = _mm_xor_si128(e1, e1); \
    j = iter; \
    do { \
        f2 = _mm_adds_epi16(f2, *((__m128i *)score)); \
        score += 8; /* subst */ \
        /* f2 f1 */ \
        f3 = _mm_xor_si128(f3, f3); \
        f2 = _mm_max_epi16(f2, f3); \
        f3 = _mm_cmpeq_epi16(f3, f2); \
        f3 = _mm_or_si128(_mm_and_si128(f3, f4), _mm_andnot_si128(f3, f1)); \
        /* f2 f3 */ \
        xMax = _mm_max_epi16(xMax, f2); \
        f1 = _mm_cmpeq_epi16(f2, xMax); \
        xPos = _mm_or_si128(_mm_and_si128(f1, f3), _mm_andnot_si128(f1, xPos)); \
        \
        f1 = _mm_load_si128(buf + 4); \
        f1 = _mm_max_epi16(f1, f2); \
        f2 = _mm_cmpeq_epi16(f2, f1); \
        f3 = _mm_or_si128(_mm_and_si128(f3, f2), _mm_andnot_si128(f2, _mm_load_si128(SWB + 1))); \
        /* f1 f3 */ \
        f2 = _mm_max_epi16(e1, f1); \
        f1 = _mm_cmpeq_epi16(f1, f2); \
        f3 = _mm_or_si128(_mm_and_si128(f3, f1), _mm_andnot_si128(f1, _mm_load_si128(SWA - 5 + 1))); \
        /* f2 f3 */ \
        _mm_store_si128(SWA, f2); \
        _mm_store_si128(SWA + 1, f3); \
        f2 = _mm_adds_epi16(f2, xOpen); \
        e1 = _mm_max_epi16(_mm_adds_epi16(e1, xExt), f2); \
        f1 = _mm_load_si128(buf + 4); \
        f1 = _mm_max_epi16(_mm_adds_epi16(f1, xExt), f2); \
        _mm_store_si128(buf + 4, f1); \
        \
        f2 = _mm_load_si128(SWB); \
        f1 = _mm_load_si128(SWB + 1); \
        buf += 5; \
    } while (--j); \
    \
    f4 = _mm_slli_si128(_mm_load_si128(SWA - 5 + 1), 2); \
    buf = matrix + 5; \
    j = 0; \
    e1 = _mm_slli_si128(e1, 2); \
    f2 = _mm_load_si128(SWA); \
    f3 = _mm_max_epi16(_mm_xor_si128(f3, f3), _mm_adds_epi16(f2, xOpen)); \
    k = _mm_movemask_epi8(_mm_cmpgt_epi16(e1, f3)); \
    if (k) \
        do { \
            f1 = _mm_max_epi16(e1, f2); \
            f2 = _mm_cmpeq_epi16(f2, f1); \
            f2 = _mm_or_si128(_mm_and_si128(f2, *(SWA + 1)), _mm_andnot_si128(f2, f4)); \
            _mm_store_si128(SWA, f1); \
            _mm_store_si128(SWA + 1, f2); \
            \
            f1 = _mm_adds_epi16(f1, xOpen); \
            f1 = _mm_max_epi16(f1, *(buf + 4)); \
            _mm_store_si128(buf + 4, f1); \
            \
            e1 = _mm_adds_epi16(e1, xExt); \
            buf += 5; \
            if (++j >= static_cast<int>(iter)) { \
                buf = matrix + 5; \
                j = 0; \
                e1 = _mm_slli_si128(e1, 2); \
                f4 = _mm_slli_si128(f4, 2); \
            } \
            f2 = _mm_load_si128(SWA); \
            f3 = _mm_max_epi16(_mm_xor_si128(f3, f3), _mm_adds_epi16(f2, xOpen)); \
            k = _mm_movemask_epi8(_mm_cmpgt_epi16(e1, f3)); \
        } while (k); \
    \
    max1 = *((short *)(&xMax)); \
    n = 0; \
    k = 1; \
    do { \
        j = ((short *)(&xMax))[k]; \
        if (j >= max1) { \
            max1 = j; \
            n = k; \
        } \
    } while (++k < 8); \
    \
    if (max1 >= minScore) { \
        j = ((((short *)(&xPos))[n] - i - 1) | -0x10000) + i + 1; \
        SW_FILT_MACRO; \
    }

// #define SW_FILT

#ifdef SW_FILT
#    define SW_FILT_MACRO \
        if (p.refSubseqInterval.startPos != j) { \
            if (p.score) { \
                pairAlignmentStrings.append(p); \
                /* printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score); */ \
            } \
            p.refSubseqInterval.startPos = j; \
            p.refSubseqInterval.length = i - j; \
            p.score = max1; \
        } else if (p.score < max1) { \
            p.refSubseqInterval.length = i - j; \
            p.score = max1; \
        }
#else
#    define SW_FILT_MACRO \
        p.refSubseqInterval.startPos = j; \
        p.refSubseqInterval.length = i - j; \
        p.score = max1; \
        pairAlignmentStrings.append(p); \
        /* printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score); */
#endif

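    // The two most recent matrix rows are double-buffered inside each 5-vector
    // group: SW_LOOP(buf, buf + 2) and SW_LOOP(buf + 2, buf) alternate which
    // pair of vectors is the current row (SWA) and which is the previous (SWB).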
    i = 1;
    do {
        SW_LOOP(buf, buf + 2);
        if (++i > static_cast<int>(src_n)) {
            break;
        }
        SW_LOOP(buf + 2, buf);
    } while (++i <= static_cast<int>(src_n));

#undef SW_LOOP
#undef SW_FILT_MACRO

#ifdef SW_FILT
    if (p.score) {
        pairAlignmentStrings.append(p);
        // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
    }
#endif
    _mm_free(matrix);
}

void SmithWatermanAlgorithmSSE2::calculateMatrixForMultipleAlignmentResultWithInt() {
    int i, j, n, k, max1;
    __m128i f1 = _mm_setzero_si128(), f2 = _mm_setzero_si128(), f3 = _mm_setzero_si128(), f4 = _mm_setzero_si128(), e1 = _mm_setzero_si128();
    unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
    unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
    unsigned int iter = (pat_n + 3) >> 2;

    n = iter * 2;
    __m128i *buf, *matrix = (__m128i *)_mm_malloc((n + iter * 0x80 + iter) * 16 + matrixLength * iter * 4, 16);
    int *score, *score1 = (int *)(matrix + n);
    int *map = score1 + iter * 0x80 * 4;
    char *dir, *dir2, *dir1 = (char *)(map + iter * 4);
    memset(matrix, 0, n * sizeof(__m128i));
    memset(dir1, 0, iter * 4);
    dir = dir1 + iter * 4;
    dir2 = dir1 + matrixLength * iter * 4;

    for (i = 0, j = 0; j < static_cast<int>(iter); j++) {
        for (k = j, n = 0; n < 4; n++, k += iter) {
            map[k] = i++;
        }
    }

    QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
    char *alphaCharsData = alphaChars.data();
    n = alphaChars.size();
    for (i = 0; i < n; i++) {
        int n2;
        unsigned char ch = alphaCharsData[i];
        score = score1 + ch * iter * 4;
        for (j = 0; j < static_cast<int>(iter); j++) {
            for (k = j, n2 = 0; n2 < 4; n2++, k += iter) {
                int a = -0x8000;
                if (k < static_cast<int>(pat_n)) {
                    a = substitutionMatrix.getScore(ch, pat[k]);
                }
                *score++ = a;
            }
        }
    }

    __m128i xMax = _mm_setzero_si128(), xPos = _mm_setzero_si128();
    __m128i xOpen = _mm_cvtsi32_si128(gapOpen);
    __m128i xExt = _mm_cvtsi32_si128(gapExtension);
    xOpen = _mm_shuffle_epi32(xOpen, 0);
    xExt = _mm_shuffle_epi32(xExt, 0);

    PairAlignSequences p;

    p.refSubseqInterval.startPos = 0;
    p.score = 0;

    i = 1;
    do {
        buf = matrix;
        score = score1 + src[i - 1] * iter * 4;
        xMax = _mm_xor_si128(xMax, xMax);
        f1 = _mm_slli_si128(_mm_load_si128(buf + (iter - 1) * 2), 4);
        e1 = _mm_xor_si128(e1, e1);
        if (dir == dir2)
            dir = dir1;
        j = iter;
        do {
            f2 = _mm_add_epi32(f1, *((__m128i *)score));
            score += 4; /* subst */

            f3 = _mm_xor_si128(f3, f3);
            f3 = _mm_cmpgt_epi32(f2, f3);
            f2 = _mm_and_si128(f2, f3);
            f3 = _mm_slli_epi32(f3, 2);
            /* f2 f3 */
            f4 = _mm_cvtsi32_si128(j);
            f4 = _mm_shuffle_epi32(f4, 0);
            f1 = _mm_cmpgt_epi32(xMax, f2);
            xMax = _mm_xor_si128(xMax, f2);
            xMax = _mm_and_si128(xMax, f1);
            xMax = _mm_xor_si128(xMax, f2);
            xPos = _mm_or_si128(_mm_and_si128(xPos, f1), _mm_andnot_si128(f1, f4));

            f1 = _mm_load_si128(buf + 1);
            f4 = _mm_cmpgt_epi32(f1, f2);
            f1 = _mm_xor_si128(f1, f2);
            f1 = _mm_and_si128(f1, f4);
            f1 = _mm_xor_si128(f1, f2);
            f4 = _mm_slli_epi32(f4, 1);
            f3 = _mm_or_si128(f3, f4);
            /* f1 f3 */
            f4 = _mm_cmpgt_epi32(e1, f1);
            f2 = _mm_xor_si128(e1, f1);
            f2 = _mm_and_si128(f2, f4);
            f2 = _mm_xor_si128(f2, f1);
            f3 = _mm_or_si128(f3, f4);
            /* f2 f3 */
            f1 = _mm_load_si128(buf);
            _mm_store_si128(buf, f2);
            f3 = _mm_packs_epi32(f3, f3);
            f3 = _mm_packs_epi16(f3, f3);
            *(int *)dir = _mm_cvtsi128_si32(f3);
            dir += 4;
            f2 = _mm_add_epi32(f2, xOpen);

            e1 = _mm_add_epi32(e1, xExt);
            f4 = _mm_cmpgt_epi32(e1, f2);
            e1 = _mm_xor_si128(e1, f2);
            e1 = _mm_and_si128(e1, f4);
            e1 = _mm_xor_si128(e1, f2);

            f3 = _mm_load_si128(buf + 1);
            f3 = _mm_add_epi32(f3, xExt);
            f4 = _mm_cmpgt_epi32(f3, f2);
            f3 = _mm_xor_si128(f3, f2);
            f3 = _mm_and_si128(f3, f4);
            f3 = _mm_xor_si128(f3, f2);
            _mm_store_si128(buf + 1, f3);
            buf += 2;
        } while (--j);

        buf = matrix;
        j = (-1) * iter;
        e1 = _mm_slli_si128(e1, 4);
        f1 = _mm_load_si128(buf);

        f2 = _mm_add_epi32(f1, xOpen);
        f3 = _mm_xor_si128(f3, f3);
        f3 = _mm_cmpgt_epi32(f2, f3);
        f2 = _mm_and_si128(f2, f3);
        k = _mm_movemask_epi8(_mm_cmpgt_epi32(e1, f2));
        if (k) {
            do {
                f4 = _mm_cmpgt_epi32(e1, f1);
                f2 = _mm_xor_si128(e1, f1);
                f2 = _mm_and_si128(f2, f4);
                f2 = _mm_xor_si128(f2, f1);
                _mm_store_si128(buf, f2);

                f2 = _mm_add_epi32(f2, xOpen);
                f4 = _mm_packs_epi32(f4, f4);
                f4 = _mm_packs_epi16(f4, f4);
                f4 = _mm_or_si128(f4, _mm_cvtsi32_si128(*((int *)dir + j)));
                f1 = _mm_load_si128(buf + 1);
                *((int *)dir + j) = _mm_cvtsi128_si32(f4);
                f3 = _mm_cmpgt_epi32(f1, f2);
                f1 = _mm_xor_si128(f1, f2);
                f1 = _mm_and_si128(f1, f3);
                f1 = _mm_xor_si128(f1, f2);
                _mm_store_si128(buf + 1, f1);

                e1 = _mm_add_epi32(e1, xExt);
                buf += 2;
                if (!(++j)) {
                    buf = matrix;
                    j = (-1) * iter;
                    e1 = _mm_slli_si128(e1, 4);
                }
                f1 = _mm_load_si128(buf);
                f2 = _mm_add_epi32(f1, xOpen);
                f3 = _mm_xor_si128(f3, f3);
                f3 = _mm_cmpgt_epi32(f2, f3);
                f2 = _mm_and_si128(f2, f3);
                k = _mm_movemask_epi8(_mm_cmpgt_epi32(e1, f2));
            } while (k);
        }
        /*
        for(j = 0; j < pat_n; j++)
            printf(" %02X", *((int*)(matrix + (j % iter) * 2) + (j / iter)));
        printf("\n");
        */
        max1 = *((int *)(&xMax));
        n = 0;
        k = 1;
        do {
            j = ((int *)(&xMax))[k];
            if (j >= max1) {
                max1 = j;
                n = k;
            }
        } while (++k < 4);

        if (max1 >= minScore) {
            QByteArray pairAlign;
            int xpos = 1 + n * iter + iter - ((int *)(&xPos))[n];
            j = i;
            int xend = xpos;
            char *xdir = dir - iter * 4;
            for (;;) {
                if (!xpos)
                    break;
                k = xdir[map[xpos - 1]];
                if (!k)
                    break;
                if (k == -1) {
                    pairAlign.append(PairAlignSequences::LEFT);
                    xpos--;
                    continue;
                }
                if (k == -2) {
                    pairAlign.append(PairAlignSequences::UP);
                } else if (k == -4) {
                    pairAlign.append(PairAlignSequences::DIAG);
                    xpos--;
                }
                if (xdir == dir1)
                    xdir = dir2;
                if (xdir == dir) { /* printf("#error\n"); */
                    break;
                }
                xdir -= iter * 4;
                j--;
            }

            p.score = max1;
            p.refSubseqInterval.startPos = j;
            p.refSubseqInterval.length = i - j;
            p.ptrnSubseqInterval.startPos = xpos;
            p.ptrnSubseqInterval.length = xend - xpos;
            p.pairAlignment = pairAlign;
            pairAlignmentStrings.append(p);

            // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
            // printf("#%i-%i %s\n", xpos, xend - xpos, pairAlign.data());
        }
    } while (++i <= static_cast<int>(src_n));

    _mm_free(matrix);
}

void SmithWatermanAlgorithmSSE2::calculateMatrixForAnnotationsResultWithInt() {
    int i, j, n, k, max1;
    __m128i f1 = _mm_setzero_si128(), f2 = _mm_setzero_si128(), f3 = _mm_setzero_si128(), f4 = _mm_setzero_si128(), e1 = _mm_setzero_si128();
    unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
    unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
    unsigned int iter = (pat_n + 3) >> 2;

    n = (iter + 1) * 5;
    __m128i *buf, *matrix = (__m128i *)_mm_malloc((n + iter * 0x80) * sizeof(__m128i), 16);
    int *score, *score1 = (int *)(matrix + n);
    memset(matrix, 0, n * sizeof(__m128i));

    QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
    char *alphaCharsData = alphaChars.data();
    n = alphaChars.size();
    for (i = 0; i < n; i++) {
        int n2;
        unsigned char ch = alphaCharsData[i];
        score = score1 + ch * iter * 4;
        for (j = 0; j < static_cast<int>(iter); j++) {
            for (k = j, n2 = 0; n2 < 4; n2++, k += iter) {
                int a = -0x8000;
                if (k < static_cast<int>(pat_n)) {
                    a = substitutionMatrix.getScore(ch, pat[k]);
                }
                *score++ = a;
            }
        }
    }

    __m128i xMax = _mm_setzero_si128(), xPos = _mm_setzero_si128();
    __m128i xOpen = _mm_cvtsi32_si128(gapOpen);
    __m128i xExt = _mm_cvtsi32_si128(gapExtension);
    xOpen = _mm_shuffle_epi32(xOpen, 0);
    xExt = _mm_shuffle_epi32(xExt, 0);

    PairAlignSequences p;

    p.refSubseqInterval.startPos = 0;
    p.score = 0;

#define SW_LOOP(SWA, SWB) \
    buf = matrix + 5; \
    score = score1 + src[i - 1] * iter * 4; \
    xMax = _mm_xor_si128(xMax, xMax); \
    f2 = _mm_slli_si128(_mm_load_si128(SWB + (iter - 1) * 5), 4); \
    f1 = _mm_slli_si128(_mm_load_si128(SWB + 1 + (iter - 1) * 5), 4); \
    f1 = _mm_or_si128(f1, _mm_cvtsi32_si128(i - 1)); \
    e1 = _mm_xor_si128(e1, e1); \
    j = iter; \
    do { \
        f2 = _mm_add_epi32(f2, *((__m128i *)score)); \
        score += 4; /* subst */ \
        /* f2 f1 */ \
        f4 = _mm_cvtsi32_si128(i); \
        f4 = _mm_shuffle_epi32(f4, 0); \
        f3 = _mm_xor_si128(f3, f3); \
        f3 = _mm_cmpgt_epi32(f2, f3); \
        f2 = _mm_and_si128(f2, f3); \
        f3 = _mm_or_si128(_mm_and_si128(f1, f3), _mm_andnot_si128(f3, f4)); \
        /* f2 f3 */ \
        f1 = _mm_cmpgt_epi32(xMax, f2); \
        xMax = _mm_xor_si128(xMax, f2); \
        xMax = _mm_and_si128(xMax, f1); \
        xMax = _mm_xor_si128(xMax, f2); \
        xPos = _mm_or_si128(_mm_and_si128(xPos, f1), _mm_andnot_si128(f1, f3)); \
        \
        f1 = _mm_load_si128(buf + 4); \
        f4 = _mm_cmpgt_epi32(f1, f2); \
        f1 = _mm_xor_si128(f1, f2); \
        f1 = _mm_and_si128(f1, f4); \
        f1 = _mm_xor_si128(f1, f2); \
        f3 = _mm_or_si128(_mm_and_si128(f4, *(SWB + 1)), _mm_andnot_si128(f4, f3)); \
        /* f1 f3 */ \
        f4 = _mm_cmpgt_epi32(e1, f1); \
        f2 = _mm_xor_si128(e1, f1); \
        f2 = _mm_and_si128(f2, f4); \
        f2 = _mm_xor_si128(f2, f1); \
        f3 = _mm_or_si128(_mm_and_si128(f4, *(SWA - 5 + 1)), _mm_andnot_si128(f4, f3)); \
        /* f2 f3 */ \
        _mm_store_si128(SWA, f2); \
        _mm_store_si128(SWA + 1, f3); \
        f2 = _mm_add_epi32(f2, xOpen); \
        e1 = _mm_add_epi32(e1, xExt); \
        f1 = _mm_cmpgt_epi32(e1, f2); \
        e1 = _mm_xor_si128(e1, f2); \
        e1 = _mm_and_si128(e1, f1); \
        e1 = _mm_xor_si128(e1, f2); \
        f3 = _mm_load_si128(buf + 4); \
        f3 = _mm_add_epi32(f3, xExt); \
        f1 = _mm_cmpgt_epi32(f3, f2); \
        f3 = _mm_xor_si128(f3, f2); \
        f3 = _mm_and_si128(f3, f1); \
        f3 = _mm_xor_si128(f3, f2); \
        _mm_store_si128(buf + 4, f3); \
        \
        f2 = _mm_load_si128(SWB); \
        f1 = _mm_load_si128(SWB + 1); \
        buf += 5; \
    } while (--j); \
    \
    f4 = _mm_slli_si128(_mm_load_si128(SWA - 5 + 1), 4); \
    buf = matrix + 5; \
    j = 0; \
    e1 = _mm_slli_si128(e1, 4); \
    f2 = _mm_load_si128(SWA); \
    f1 = _mm_add_epi32(f2, xOpen); \
    f3 = _mm_xor_si128(f3, f3); \
    f3 = _mm_cmpgt_epi32(f1, f3); \
    f1 = _mm_and_si128(f1, f3); \
    k = _mm_movemask_epi8(_mm_cmpgt_epi32(e1, f1)); \
    if (k) \
        do { \
            f3 = _mm_cmpgt_epi32(e1, f2); \
            f1 = _mm_xor_si128(e1, f2); \
            f1 = _mm_and_si128(f1, f3); \
            f1 = _mm_xor_si128(f1, f2); \
            f2 = _mm_or_si128(_mm_and_si128(f3, f4), _mm_andnot_si128(f3, *(SWA + 1))); \
            _mm_store_si128(SWA, f1); \
            _mm_store_si128(SWA + 1, f2); \
            \
            f1 = _mm_add_epi32(f1, xOpen); \
            f2 = _mm_load_si128(buf + 4); \
            f3 = _mm_cmpgt_epi32(f1, f2); \
            f1 = _mm_xor_si128(f1, f2); \
            f1 = _mm_and_si128(f1, f3); \
            f1 = _mm_xor_si128(f1, f2); \
            _mm_store_si128(buf + 4, f1); \
            \
            e1 = _mm_add_epi32(e1, xExt); \
            buf += 5; \
            if (++j >= static_cast<int>(iter)) { \
                buf = matrix + 5; \
                j = 0; \
                e1 = _mm_slli_si128(e1, 4); \
                f4 = _mm_slli_si128(f4, 4); \
            } \
            f2 = _mm_load_si128(SWA); \
            f1 = _mm_add_epi32(f2, xOpen); \
            f3 = _mm_xor_si128(f3, f3); \
            f3 = _mm_cmpgt_epi32(f1, f3); \
            f1 = _mm_and_si128(f1, f3); \
            k = _mm_movemask_epi8(_mm_cmpgt_epi32(e1, f1)); \
        } while (k); \
    \
    max1 = *((int *)(&xMax)); \
    n = 0; \
    k = 1; \
    do { \
        j = ((int *)(&xMax))[k]; \
        if (j >= max1) { \
            max1 = j; \
            n = k; \
        } \
    } while (++k < 4); \
    \
    if (max1 >= minScore) { \
        j = ((int *)(&xPos))[n]; \
        SW_FILT_MACRO; \
    }

// #define SW_FILT

#ifdef SW_FILT
#    define SW_FILT_MACRO \
        if (p.refSubseqInterval.startPos != j) { \
            if (p.score) { \
                pairAlignmentStrings.append(p); \
                /* printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score); */ \
            } \
            p.refSubseqInterval.startPos = j; \
            p.refSubseqInterval.length = i - j; \
            p.score = max1; \
        } else if (p.score < max1) { \
            p.refSubseqInterval.length = i - j; \
            p.score = max1; \
        }
#else
#    define SW_FILT_MACRO \
        p.refSubseqInterval.startPos = j; \
        p.refSubseqInterval.length = i - j; \
        p.score = max1; \
        pairAlignmentStrings.append(p); \
        /* printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score); */
#endif

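    // Same row double buffering as in the 16-bit variant above.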
    i = 1;
    do {
        SW_LOOP(buf, buf + 2);
        if (++i > static_cast<int>(src_n)) {
            break;
        }
        SW_LOOP(buf + 2, buf);
    } while (++i <= static_cast<int>(src_n));

#undef SW_LOOP
#undef SW_FILT_MACRO

#ifdef SW_FILT
    if (p.score) {
        pairAlignmentStrings.append(p);
        // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
    }
#endif
    _mm_free(matrix);
}

inline void SmithWatermanAlgorithmSSE2::printVector(__m128i &toprint, int add) {
    ScoreType *tmpArray = (ScoreType *)_mm_malloc(nElementsInVec * sizeof(ScoreType), 16);

    _mm_store_si128((__m128i *)&tmpArray[0], toprint);
    cout << "printVector" << endl;
    for (int i = 0; i < nElementsInVec; i++)
        cout << tmpArray[i] + add << " ";

    cout << endl;
    _mm_free(tmpArray);  // release the aligned scratch buffer
}

int SmithWatermanAlgorithmSSE2::calculateMatrixSSE2(unsigned queryLength, unsigned char *dbSeq, unsigned dbLength, unsigned short gapOpenOrig, unsigned short gapExtend) {
    unsigned iter = (queryLength + nElementsInVec - 1) / nElementsInVec;

    int ALPHA_SIZE = substitutionMatrix.getAlphabet()->getNumAlphabetChars();

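    // Note: the profile is addressed by raw character code (curChar * nCount
    // below), so the buffer is allocated generously ('Z' slots per alphabet
    // entry) rather than packed by alphabet index.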
    __m128i *pvQueryProf = (__m128i *)_mm_malloc('Z' * ALPHA_SIZE * iter * sizeof(__m128i), 16);

    int weight = 0;
    unsigned short *queryProfile = (unsigned short *)pvQueryProf;
    int segSize = (queryLength + 7) / 8;  // iter
    int nCount = segSize * 8;
    char curChar = ' ';

    QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
    for (int i = 0; i < ALPHA_SIZE; i++) {
        curChar = alphaChars.at(i);
        int h = 0;
        for (int j = 0; j < segSize; j++) {
            unsigned int k = j;
            for (unsigned int kk = 0; kk < 8; kk++) {
                if (k >= queryLength) {
                    weight = 0;
                } else {
                    weight = substitutionMatrix.getScore(curChar, patternSeq.at(k));
                }
                queryProfile[curChar * nCount + h] = (unsigned short)weight;
                k += segSize;
                h++;
            }
        }
    }

    __m128i *pvHLoad = (__m128i *)_mm_malloc(iter * sizeof(__m128i), 16);
    __m128i *pvHStore = (__m128i *)_mm_malloc(iter * sizeof(__m128i), 16);
    __m128i *pvE = (__m128i *)_mm_malloc(iter * sizeof(__m128i), 16);

    unsigned i = 0, j = 0;
    int score = 0;

    int cmp = 0;

    unsigned short gapOpenFarrar = gapOpenOrig - gapExtend;

    __m128i *pv = 0;

    __m128i vE = _mm_set1_epi32(0), vF = _mm_set1_epi32(0), vH = _mm_set1_epi32(0);

    __m128i vMaxScore = _mm_set1_epi32(0);
    __m128i vGapOpen = _mm_set1_epi32(0);
    __m128i vGapExtend = _mm_set1_epi32(0);

    __m128i vMin = _mm_set1_epi32(0);
    __m128i vMinimums = _mm_set1_epi32(0);
    __m128i vTemp = _mm_set1_epi32(0);

    __m128i *pvScore = 0;

    /* Load gap opening penalty to all elements of a constant */
    vGapOpen = _mm_insert_epi16(vGapOpen, gapOpenFarrar, 0);
    vGapOpen = _mm_shufflelo_epi16(vGapOpen, 0);
    vGapOpen = _mm_shuffle_epi32(vGapOpen, 0);

    /* Load gap extension penalty to all elements of a constant */
    vGapExtend = _mm_insert_epi16(vGapExtend, gapExtend, 0);
    vGapExtend = _mm_shufflelo_epi16(vGapExtend, 0);
    vGapExtend = _mm_shuffle_epi32(vGapExtend, 0);

    /* load vMaxScore with the zeros. since we are using signed */
    /* math, we will bias the maxscore to -32768 so we have the */
    /* full range of the short. */
    vMaxScore = _mm_cmpeq_epi16(vMaxScore, vMaxScore);
    vMaxScore = _mm_slli_epi16(vMaxScore, 15);
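    // Every 16-bit lane now holds 0x8000 (-32768); the "+ 32768" in the return
    // statement below removes this bias from the final score.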

    vMinimums = _mm_shuffle_epi32(vMaxScore, 0);
    Q_UNUSED(vMinimums);

    vMin = _mm_shuffle_epi32(vMaxScore, 0);
    vMin = _mm_srli_si128(vMin, 14);

    /* Zero out the storage vector */
    for (i = 0; i < iter; ++i) {
        _mm_store_si128(pvE + i, vMaxScore);
        _mm_store_si128(pvHStore + i, vMaxScore);
    }

    PairAlignSequences p;
    U2Region sReg;

    for (i = 0; i < dbLength; ++i) {
        /* fetch first data asap. */
        pvScore = pvQueryProf + dbSeq[i] * iter;

        /* zero out F. */
        vF = _mm_cmpeq_epi16(vF, vF);
        vF = _mm_slli_epi16(vF, 15);

        /* load the next h value */
        vH = _mm_load_si128(pvHStore + iter - 1);
        vH = _mm_slli_si128(vH, 2);
        vH = _mm_or_si128(vH, vMin);

        pv = pvHLoad;
        pvHLoad = pvHStore;
        pvHStore = pv;

        for (j = 0; j < iter; ++j) {
            /* load values of vF and vH from previous row (one unit up) */
            vE = _mm_load_si128(pvE + j);

            /* add score to vH */
            vH = _mm_adds_epi16(vH, *pvScore++);

            /* Update highest score encountered this far */
            vMaxScore = _mm_max_epi16(vMaxScore, vH);

            /* get max from vH, vE and vF */
            vH = _mm_max_epi16(vH, vE);
            vH = _mm_max_epi16(vH, vF);

            /* save vH values */
            _mm_store_si128(pvHStore + j, vH);

            /* update vE value */
            vH = _mm_subs_epi16(vH, vGapOpen);
            vE = _mm_subs_epi16(vE, vGapExtend);
            vE = _mm_max_epi16(vE, vH);

            /* update vF value */
            vF = _mm_subs_epi16(vF, vGapExtend);
            vF = _mm_max_epi16(vF, vH);

            /* save vE values */
            _mm_store_si128(pvE + j, vE);

            /* load the next h value */
            vH = _mm_load_si128(pvHLoad + j);
        }

        /* reset pointers to the start of the saved data */
        j = 0;
        vH = _mm_load_si128(pvHStore + j);

        /* the computed vF value is for the given column. since */
        /* we are at the end, we need to shift the vF value over */
        /* to the next column. */

        vF = _mm_slli_si128(vF, 2);
        vF = _mm_or_si128(vF, vMin);
        vTemp = _mm_subs_epi16(vH, vGapOpen);

        vTemp = _mm_cmpgt_epi16(vF, vTemp);

        cmp = _mm_movemask_epi8(vTemp);

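        /* Farrar's "lazy F" correction loop: F is propagated through the */
        /* column only while it can still raise some H value. */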
        while (cmp != 0x0000)
        // for (unsigned cnt=0; cnt<iter; ++cnt)
        {
            vE = _mm_load_si128(pvE + j);

            vH = _mm_max_epi16(vH, vF);

            /* save vH values */
            _mm_store_si128(pvHStore + j, vH);

            /* update vE in case the new vH value would change it */
            vH = _mm_subs_epi16(vH, vGapOpen);
            vE = _mm_max_epi16(vE, vH);

            /* update vF value */
            vF = _mm_subs_epi16(vF, vGapExtend);

            j++;
            if (j >= iter) {
                j = 0;
                vF = _mm_slli_si128(vF, 2);
                vF = _mm_or_si128(vF, vMin);
            }

            vH = _mm_load_si128(pvHStore + j);

            vTemp = _mm_subs_epi16(vH, vGapOpen);
            vTemp = _mm_cmpgt_epi16(vF, vTemp);
            cmp = _mm_movemask_epi8(vTemp);
        }
    }

    /* find largest score in the vMaxScore vector */
    vTemp = _mm_srli_si128(vMaxScore, 8);
    vMaxScore = _mm_max_epi16(vMaxScore, vTemp);
    vTemp = _mm_srli_si128(vMaxScore, 4);
    vMaxScore = _mm_max_epi16(vMaxScore, vTemp);
    vTemp = _mm_srli_si128(vMaxScore, 2);
    vMaxScore = _mm_max_epi16(vMaxScore, vTemp);

    /* store in temporary variable */
    score = (short)_mm_extract_epi16(vMaxScore, 0);

    _mm_free(pvHLoad);
    _mm_free(pvHStore);
    _mm_free(pvE);
    _mm_free(pvQueryProf);

    /* return largest score */
    return score + 32768;
}

}  // namespace U2