1 /**
2  * UGENE - Integrated Bioinformatics Tools.
3  * Copyright (C) 2008-2021 UniPro <ugene@unipro.ru>
4  * http://ugene.net
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
19  * MA 02110-1301, USA.
20  */
21 
22 #include "SmithWatermanAlgorithmSSE2.h"
23 #include <emmintrin.h>
24 #include <iostream>
25 
26 #ifdef _MSC_VER
27 #    pragma warning(disable : 4799)
28 #endif
29 
extern "C" {

// http://sourceforge.net/p/predef/wiki/Compilers/

#ifdef _MSC_VER
#        define LAME_MSC
#endif

#if defined(LAME_MSC) || defined(Q_OS_DARWIN)
// Store the low 64 bits of an XMM register to memory as two 32-bit halves.
// On MSVC and Darwin builds the direct MMX<->SSE move intrinsics are avoided,
// so the transfer goes through 32-bit lanes instead.
inline void xmm_store8(__m64 *m, __m128i x) {
    *(int *)m = _mm_cvtsi128_si32(x);
    *((int *)m + 1) = _mm_cvtsi128_si32(_mm_srli_epi64(x, 32));
}

// Load 64 bits from memory into the low half of an XMM register, assembled
// from two 32-bit loads (counterpart of xmm_store8 above).
inline __m128i xmm_load8(__m64 *m) {
    return _mm_or_si128(_mm_cvtsi32_si128(*(int *)m), _mm_slli_si128(_mm_cvtsi32_si128(*((int *)m + 1)), 4));
}
#else
// Other compilers: use the single-instruction MMX/SSE register moves.
inline void xmm_store8(__m64 *m, __m128i x) {
    *m = _mm_movepi64_pi64(x);
}
inline __m128i xmm_load8(__m64 *m) {
    return _mm_movpi64_epi64(*m);
}
#endif
}
56 
57 using namespace std;
58 
59 #ifdef __GNUC__
60 #    pragma GCC diagnostic push
61 #    pragma GCC diagnostic ignored "-Wstrict-aliasing"
62 #endif
63 
64 namespace U2 {
65 
estimateNeededRamAmount(const QByteArray & _patternSeq,const QByteArray & _searchSeq,const qint32 gapOpen,const qint32 gapExtension,const quint32 minScore,const quint32 maxScore,const SmithWatermanSettings::SWResultView resultView)66 quint64 SmithWatermanAlgorithmSSE2::estimateNeededRamAmount(const QByteArray
67                                                                 &_patternSeq,
68                                                             const QByteArray &_searchSeq,
69                                                             const qint32 gapOpen,
70                                                             const qint32 gapExtension,
71                                                             const quint32 minScore,
72                                                             const quint32 maxScore,
73                                                             const SmithWatermanSettings::SWResultView resultView) {
74     const double b_to_mb_factor = 1048576.0;
75 
76     const quint64 queryLength = _patternSeq.length();
77     const quint64 searchLength = _searchSeq.length();
78 
79     const quint32 iter = (queryLength + 7) >> 3;
80 
81     quint64 memNeeded = 0;
82     if (SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
83         qint32 maxGapPenalty = (gapOpen > gapExtension) ? gapOpen : gapExtension;
84         assert(0 > maxGapPenalty);
85 
86         quint64 matrixLength = queryLength - (maxScore - minScore) / maxGapPenalty + 1;
87         if (searchLength + 1 < matrixLength) {
88             matrixLength = searchLength + 1;
89         }
90 
91         memNeeded = iter * (131 + matrixLength) * sizeof(__m128i);
92     } else if (SmithWatermanSettings::ANNOTATIONS == resultView) {
93         memNeeded = (5 + iter * 133) * sizeof(__m128i);
94     } else {
95         assert(false);
96     }
97 
98     return memNeeded / b_to_mb_factor;
99 }
100 
launch(const SMatrix & _substitutionMatrix,const QByteArray & _patternSeq,const QByteArray & _searchSeq,int _gapOpen,int _gapExtension,int _minScore,SmithWatermanSettings::SWResultView _resultView)101 void SmithWatermanAlgorithmSSE2::launch(const SMatrix &_substitutionMatrix, const QByteArray &_patternSeq, const QByteArray &_searchSeq, int _gapOpen, int _gapExtension, int _minScore, SmithWatermanSettings::SWResultView _resultView) {
102     setValues(_substitutionMatrix, _patternSeq, _searchSeq, _gapOpen, _gapExtension, _minScore, _resultView);
103     int maxScore = 0;
104     if (isValidParams() && calculateMatrixLength()) {
105         maxScore = calculateMatrixSSE2(patternSeq.length(), (unsigned char *)searchSeq.data(), searchSeq.length(), (-1) * (gapOpen + gapExtension), (-1) * (gapExtension));
106 
107         if (minScore <= maxScore) {
108             if (maxScore >= 0x8000 || matrixLength >= 0x10000) {
109                 switch (resultView) {
110                     case SmithWatermanSettings::MULTIPLE_ALIGNMENT:
111                         calculateMatrixForMultipleAlignmentResultWithInt();
112                         break;
113                     case SmithWatermanSettings::ANNOTATIONS:
114                         calculateMatrixForAnnotationsResultWithInt();
115                         break;
116                     default:
117                         assert(false);
118                 }
119             } else {
120                 switch (resultView) {
121                     case SmithWatermanSettings::MULTIPLE_ALIGNMENT:
122                         calculateMatrixForMultipleAlignmentResultWithShort();
123                         break;
124                     case SmithWatermanSettings::ANNOTATIONS:
125                         calculateMatrixForAnnotationsResultWithShort();
126                         break;
127                     default:
128                         assert(false);
129                 }
130             }
131         }
132     }
133 }
134 
// 16-bit SSE2 Smith-Waterman kernel that also reconstructs pairwise
// alignments (MULTIPLE_ALIGNMENT result view).  The pattern column is
// processed eight 16-bit cells per XMM vector in a striped layout: lane l
// of vector j covers pattern position l * iter + j.  A ring buffer of
// matrixLength direction rows (dir1 .. dir2) records backtrace moves.
void SmithWatermanAlgorithmSSE2::calculateMatrixForMultipleAlignmentResultWithShort() {
    int i, j, alphaCharSize, k, max1;
    __m128i f1 = _mm_setzero_si128(), f2 = _mm_setzero_si128(), f3 = _mm_setzero_si128(), f4 = _mm_setzero_si128(), e1 = _mm_setzero_si128();
    unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
    unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
    unsigned int iter = (pat_n + 7) >> 3;  // vectors per pattern column (8 shorts each)

    // Single aligned allocation holding: the H/F column vectors, the
    // per-character striped score profile, the stripe-to-offset map and the
    // direction rows used for the backtrace.
    alphaCharSize = iter * 2;
    __m128i *buf, *matrix = (__m128i *)_mm_malloc((alphaCharSize + iter * 0x80) * 16 + iter * 8 * 4 + matrixLength * iter * 8, 16);
    short *score, *score1 = (short *)(matrix + alphaCharSize);
    int *map = (int *)(score1 + iter * 0x80 * 8);
    char *dir, *dir2, *dir1 = (char *)(map + iter * 8);
    memset(matrix, 0, alphaCharSize * sizeof(__m128i));
    memset(dir1, 0, iter * 8);
    dir = dir1 + iter * 8;
    dir2 = dir1 + matrixLength * iter * 8;

    // map[p] = byte offset of pattern position p's direction flag inside a
    // direction row (undoes the striped write order of the inner loop).
    for (i = 0, j = 0; j < static_cast<int>(iter); j++) {
        for (k = j, alphaCharSize = 0; alphaCharSize < 8; alphaCharSize++, k += iter) {
            map[k] = i++;
        }
    }

    // Build the striped score profile: for each alphabet character, the
    // substitution score against every pattern position; positions past the
    // pattern end are padded with -0x8000.
    QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
    char *alphaCharsData = alphaChars.data();
    alphaCharSize = alphaChars.size();
    for (i = 0; i < alphaCharSize; i++) {
        int n;
        unsigned char ch = alphaCharsData[i];
        score = score1 + ch * iter * 8;
        for (j = 0; j < static_cast<int>(iter); j++) {
            for (k = j, n = 0; n < 8; n++, k += iter) {
                int a = -0x8000;
                if (k < static_cast<int>(pat_n)) {
                    a = substitutionMatrix.getScore(ch, pat[k]);
                }
                *score++ = a;
            }
        }
    }

    // Broadcast gapOpen/gapExtension into all eight 16-bit lanes.
    __m128i xMax = _mm_setzero_si128(), xPos = _mm_setzero_si128();
    __m128i xOpen = _mm_setzero_si128();
    xOpen = _mm_insert_epi16(xOpen, gapOpen, 0);
    __m128i xExt = _mm_setzero_si128();
    xExt = _mm_insert_epi16(xExt, gapExtension, 0);
    xOpen = _mm_shufflelo_epi16(xOpen, 0);
    xExt = _mm_shufflelo_epi16(xExt, 0);
    xOpen = _mm_unpacklo_epi32(xOpen, xOpen);
    xExt = _mm_unpacklo_epi32(xExt, xExt);

    PairAlignSequences p;

    p.refSubseqInterval.startPos = 0;
    p.score = 0;

    // Outer loop: one DP column per search-sequence character i (1-based).
    i = 1;
    do {
        buf = matrix;
        score = score1 + src[i - 1] * iter * 8;  // profile row for this search char
        xMax = _mm_xor_si128(xMax, xMax);
        // Last H vector of the previous column, shifted one lane: the
        // diagonal (match/mismatch) input for the first stripe.
        f1 = _mm_slli_si128(_mm_load_si128(buf + (iter - 1) * 2), 2);
        e1 = _mm_xor_si128(e1, e1);
        if (dir == dir2)  // direction rows form a ring buffer
            dir = dir1;
        j = iter;
        do {
            f2 = _mm_adds_epi16(f1, *((__m128i *)score));
            score += 8; /* subst */

            // Clamp H at zero (local alignment) and start the direction
            // mask: bit for a diagonal move when H stayed positive.
            f3 = _mm_xor_si128(f3, f3);
            f2 = _mm_max_epi16(f2, f3);
            f3 = _mm_cmpgt_epi16(f2, f3);
            f3 = _mm_slli_epi16(f3, 2);
            /* f2 f3 */
            // Track the running column maximum and remember (in xPos) the
            // stripe counter j at which each lane's maximum was reached.
            f4 = _mm_insert_epi16(f4, j, 0);
            f4 = _mm_shufflelo_epi16(f4, 0);
            f4 = _mm_unpacklo_epi32(f4, f4);
            xMax = _mm_max_epi16(xMax, f2);
            f1 = _mm_cmpeq_epi16(f2, xMax);
            xPos = _mm_or_si128(_mm_and_si128(f4, f1), _mm_andnot_si128(f1, xPos));

            // Merge in the vertical-gap score F (stored at buf + 1).
            f1 = _mm_load_si128(buf + 1);
            f1 = _mm_max_epi16(f1, f2);
            f2 = _mm_cmpgt_epi16(f1, f2);
            f2 = _mm_slli_epi16(f2, 1);
            f3 = _mm_or_si128(f3, f2);
            /* f1 f3 */
            // Merge in the horizontal-gap score E (e1).
            f2 = _mm_max_epi16(e1, f1);
            f1 = _mm_cmpgt_epi16(e1, f1);
            f3 = _mm_or_si128(f3, f1);
            /* f2 f3 */
            f1 = _mm_load_si128(buf);
            _mm_store_si128(buf, f2);
            // Pack the per-lane direction masks to bytes and store them in
            // the current direction row.
            f3 = _mm_packs_epi16(f3, f3);
            // *(__m64*)dir = _mm_movepi64_pi64(f3); dir += 8;
            xmm_store8((__m64 *)dir, f3);
            dir += 8;

            // Extend E and F for the next cell/column (saturating adds of
            // the negative penalties).
            f2 = _mm_adds_epi16(f2, xOpen);
            e1 = _mm_max_epi16(_mm_adds_epi16(e1, xExt), f2);
            f3 = _mm_load_si128(buf + 1);
            f3 = _mm_max_epi16(_mm_adds_epi16(f3, xExt), f2);
            _mm_store_si128(buf + 1, f3);
            buf += 2;
        } while (--j);

        // Lazy-F correction: re-sweep the column, propagating vertical gap
        // scores across stripe boundaries until no cell can improve.
        buf = matrix;
        j = (-1) * iter;
        e1 = _mm_slli_si128(e1, 2);
        f1 = _mm_load_si128(buf);
        f3 = _mm_max_epi16(_mm_xor_si128(f3, f3), _mm_adds_epi16(f1, xOpen));
        k = _mm_movemask_epi8(_mm_cmpgt_epi16(e1, f3));
        if (k) {
            do {
                f2 = _mm_max_epi16(e1, f1);
                _mm_store_si128(buf, f2);

                // Update the direction bytes for cells the correction changed.
                f1 = _mm_cmpgt_epi16(f2, f1);
                f2 = _mm_adds_epi16(f2, xOpen);
                f1 = _mm_packs_epi16(f1, f1);
                // f1 = _mm_or_si128(f1, _mm_movpi64_epi64(*((__m64*)dir + j)));
                f1 = _mm_or_si128(f1, xmm_load8((__m64 *)dir + j));
                f2 = _mm_max_epi16(f2, *(buf + 1));
                // *((__m64*)dir + j) = _mm_movepi64_pi64(f1);
                xmm_store8((__m64 *)dir + j, f1);
                _mm_store_si128(buf + 1, f2);

                e1 = _mm_adds_epi16(e1, xExt);
                buf += 2;
                if (!(++j)) {  // wrapped past the last stripe: shift and restart
                    buf = matrix;
                    j = (-1) * iter;
                    e1 = _mm_slli_si128(e1, 2);
                }
                f1 = _mm_load_si128(buf);
                f3 = _mm_max_epi16(_mm_xor_si128(f3, f3), _mm_adds_epi16(f1, xOpen));
                k = _mm_movemask_epi8(_mm_cmpgt_epi16(e1, f3));
            } while (k);
        }
        /*
        for(j = 0; j < pat_n; j++)
        printf(" %02X", *((unsigned short*)(matrix + (j % iter) * 2) + (j / iter)));
        printf("\n");
    */
        // Horizontal reduction: find the best lane (max1) and its index.
        max1 = *((short *)(&xMax));
        alphaCharSize = 0;
        k = 1;
        do {
            j = ((short *)(&xMax))[k];
            if (j >= max1) {
                max1 = j;
                alphaCharSize = k;
            }
        } while (++k < 8);

        if (max1 >= minScore) {
            QByteArray pairAlign;
            // Convert the winning lane + stored stripe counter back to a
            // 1-based pattern position.
            int xpos = 1 + alphaCharSize * iter + iter - ((unsigned short *)(&xPos))[alphaCharSize];
            j = i;
            int xend = xpos;
            // Backtrace through the direction rows; codes: -1 = LEFT,
            // -2 = UP, -4 = DIAG, 0 terminates the local alignment.
            char *xdir = dir - iter * 8;
            for (;;) {
                if (!xpos)
                    break;
                k = xdir[map[xpos - 1]];
                if (!k)
                    break;
                if (k == -1) {
                    pairAlign.append(PairAlignSequences::LEFT);
                    xpos--;
                    continue;
                }
                if (k == -2) {
                    pairAlign.append(PairAlignSequences::UP);
                } else if (k == -4) {
                    pairAlign.append(PairAlignSequences::DIAG);
                    xpos--;
                }
                if (xdir == dir1) {  // wrap around the ring of direction rows
                    xdir = dir2;
                }
                if (xdir == dir) {
                    /* printf("#error\n"); */ break;  // ran out of retained rows
                }
                xdir -= iter * 8;
                j--;
            }

            p.score = max1;
            p.refSubseqInterval.startPos = j;
            p.refSubseqInterval.length = i - j;
            p.ptrnSubseqInterval.startPos = xpos;
            p.ptrnSubseqInterval.length = xend - xpos;
            p.pairAlignment = pairAlign;
            pairAlignmentStrings.append(p);

            // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
            // printf("#%i-%i %s\n", xpos, xend - xpos, pairAlign.data());
        }
    } while (++i <= static_cast<int>(src_n));

    _mm_free(matrix);
}
339 
// 16-bit SSE2 Smith-Waterman kernel for the ANNOTATIONS result view: only
// score and region boundaries are reported, so no direction matrix is kept.
// Instead each cell carries the row where its local alignment started,
// propagated alongside the scores.  Rows alternate between two interleaved
// vector groups (see the SW_LOOP(SWA, SWB) invocations at the bottom).
void SmithWatermanAlgorithmSSE2::calculateMatrixForAnnotationsResultWithShort() {
    int i, j, n, k, max1;
    __m128i f1 = _mm_setzero_si128(), f2 = _mm_setzero_si128(), f3 = _mm_setzero_si128(), f4 = _mm_setzero_si128(), e1 = _mm_setzero_si128();
    unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
    unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
    unsigned int iter = (pat_n + 7) >> 3;  // vectors per pattern column (8 shorts each)

    // 5 vectors per stripe (double-buffered score/start-row pairs plus F),
    // followed by the striped score profile.
    n = (iter + 1) * 5;
    __m128i *buf, *matrix = (__m128i *)_mm_malloc((n + iter * 0x80) * sizeof(__m128i), 16);
    short *score, *score1 = (short *)(matrix + n);
    memset(matrix, 0, n * sizeof(__m128i));

    // Striped score profile: substitution score of every alphabet character
    // against each pattern position; -0x8000 pads past the pattern end.
    QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
    char *alphaCharsData = alphaChars.data();
    n = alphaChars.size();
    for (i = 0; i < n; i++) {
        int n2;
        unsigned char ch = alphaCharsData[i];
        score = score1 + ch * iter * 8;
        for (j = 0; j < static_cast<int>(iter); j++) {
            for (k = j, n2 = 0; n2 < 8; n2++, k += iter) {
                int a = -0x8000;
                if (k < static_cast<int>(pat_n)) {
                    a = substitutionMatrix.getScore(ch, pat[k]);
                }
                *score++ = a;
            }
        }
    }

    // Broadcast gapOpen/gapExtension into all eight 16-bit lanes.
    __m128i xMax = _mm_setzero_si128(), xPos = _mm_setzero_si128();
    __m128i xOpen = _mm_setzero_si128();
    xOpen = _mm_insert_epi16(xOpen, gapOpen, 0);
    __m128i xExt = _mm_setzero_si128();
    xExt = _mm_insert_epi16(xExt, gapExtension, 0);
    xOpen = _mm_shufflelo_epi16(xOpen, 0);
    xExt = _mm_shufflelo_epi16(xExt, 0);
    xOpen = _mm_unpacklo_epi32(xOpen, xOpen);
    xExt = _mm_unpacklo_epi32(xExt, xExt);

    PairAlignSequences p;

    p.refSubseqInterval.startPos = 0;
    p.score = 0;

// One DP row for search char i.  SWA = vector group receiving the new row,
// SWB = group holding the previous row; scores live at SWA/SWB, the
// start-row tags at SWA+1/SWB+1, and the F (vertical gap) values at buf+4.
// The second half is the lazy-F correction sweep; the tail reduces xMax
// across lanes and reports a hit via SW_FILT_MACRO.
#define SW_LOOP(SWA, SWB) \
    buf = matrix + 5; \
    score = score1 + src[i - 1] * iter * 8; \
    xMax = _mm_xor_si128(xMax, xMax); \
    f4 = _mm_insert_epi16(f4, i, 0); \
    f4 = _mm_shufflelo_epi16(f4, 0); \
    f4 = _mm_unpacklo_epi32(f4, f4); \
    f2 = _mm_slli_si128(_mm_load_si128(SWB + (iter - 1) * 5), 2); \
    f1 = _mm_slli_si128(_mm_load_si128(SWB + 1 + (iter - 1) * 5), 2); \
    f1 = _mm_insert_epi16(f1, i - 1, 0); \
    e1 = _mm_xor_si128(e1, e1); \
    j = iter; \
    do { \
        f2 = _mm_adds_epi16(f2, *((__m128i *)score)); \
        score += 8; /* subst */ \
        /* f2 f1 */ \
        f3 = _mm_xor_si128(f3, f3); \
        f2 = _mm_max_epi16(f2, f3); \
        f3 = _mm_cmpeq_epi16(f3, f2); \
        f3 = _mm_or_si128(_mm_and_si128(f3, f4), _mm_andnot_si128(f3, f1)); \
        /* f2 f3 */ \
        xMax = _mm_max_epi16(xMax, f2); \
        f1 = _mm_cmpeq_epi16(f2, xMax); \
        xPos = _mm_or_si128(_mm_and_si128(f1, f3), _mm_andnot_si128(f1, xPos)); \
\
        f1 = _mm_load_si128(buf + 4); \
        f1 = _mm_max_epi16(f1, f2); \
        f2 = _mm_cmpeq_epi16(f2, f1); \
        f3 = _mm_or_si128(_mm_and_si128(f3, f2), _mm_andnot_si128(f2, _mm_load_si128(SWB + 1))); \
        /* f1 f3 */ \
        f2 = _mm_max_epi16(e1, f1); \
        f1 = _mm_cmpeq_epi16(f1, f2); \
        f3 = _mm_or_si128(_mm_and_si128(f3, f1), _mm_andnot_si128(f1, _mm_load_si128(SWA - 5 + 1))); \
        /* f2 f3 */ \
        _mm_store_si128(SWA, f2); \
        _mm_store_si128(SWA + 1, f3); \
        f2 = _mm_adds_epi16(f2, xOpen); \
        e1 = _mm_max_epi16(_mm_adds_epi16(e1, xExt), f2); \
        f1 = _mm_load_si128(buf + 4); \
        f1 = _mm_max_epi16(_mm_adds_epi16(f1, xExt), f2); \
        _mm_store_si128(buf + 4, f1); \
\
        f2 = _mm_load_si128(SWB); \
        f1 = _mm_load_si128(SWB + 1); \
        buf += 5; \
    } while (--j); \
\
    f4 = _mm_slli_si128(_mm_load_si128(SWA - 5 + 1), 2); \
    buf = matrix + 5; \
    j = 0; \
    e1 = _mm_slli_si128(e1, 2); \
    f2 = _mm_load_si128(SWA); \
    f3 = _mm_max_epi16(_mm_xor_si128(f3, f3), _mm_adds_epi16(f2, xOpen)); \
    k = _mm_movemask_epi8(_mm_cmpgt_epi16(e1, f3)); \
    if (k) \
        do { \
            f1 = _mm_max_epi16(e1, f2); \
            f2 = _mm_cmpeq_epi16(f2, f1); \
            f2 = _mm_or_si128(_mm_and_si128(f2, *(SWA + 1)), _mm_andnot_si128(f2, f4)); \
            _mm_store_si128(SWA, f1); \
            _mm_store_si128(SWA + 1, f2); \
\
            f1 = _mm_adds_epi16(f1, xOpen); \
            f1 = _mm_max_epi16(f1, *(buf + 4)); \
            _mm_store_si128(buf + 4, f1); \
\
            e1 = _mm_adds_epi16(e1, xExt); \
            buf += 5; \
            if (++j >= static_cast<int>(iter)) { \
                buf = matrix + 5; \
                j = 0; \
                e1 = _mm_slli_si128(e1, 2); \
                f4 = _mm_slli_si128(f4, 2); \
            } \
            f2 = _mm_load_si128(SWA); \
            f3 = _mm_max_epi16(_mm_xor_si128(f3, f3), _mm_adds_epi16(f2, xOpen)); \
            k = _mm_movemask_epi8(_mm_cmpgt_epi16(e1, f3)); \
        } while (k); \
\
    max1 = *((short *)(&xMax)); \
    n = 0; \
    k = 1; \
    do { \
        j = ((short *)(&xMax))[k]; \
        if (j >= max1) { \
            max1 = j; \
            n = k; \
        } \
    } while (++k < 8); \
\
    if (max1 >= minScore) { \
        j = ((((short *)(&xPos))[n] - i - 1) | -0x10000) + i + 1; \
        SW_FILT_MACRO; \
    }

    // #define SW_FILT

// SW_FILT (when enabled) coalesces hits sharing a start row into one best
// result; otherwise every qualifying row is appended immediately.
#ifdef SW_FILT
#    define SW_FILT_MACRO \
        if (p.refSubseqInterval.startPos != j) { \
            if (p.score) { \
                pairAlignmentStrings.append(p); \
                /* printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score); */ \
            } \
            p.refSubseqInterval.startPos = j; \
            p.refSubseqInterval.length = i - j; \
            p.score = max1; \
        } else if (p.score < max1) { \
            p.refSubseqInterval.length = i - j; \
            p.score = max1; \
        }
#else
#    define SW_FILT_MACRO \
        p.refSubseqInterval.startPos = j; \
        p.refSubseqInterval.length = i - j; \
        p.score = max1; \
        pairAlignmentStrings.append(p); \
        /* printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score); */
#endif

    // Process rows in pairs, swapping which vector group is current.
    i = 1;
    do {
        SW_LOOP(buf, buf + 2);
        if (++i > static_cast<int>(src_n)) {
            break;
        }
        SW_LOOP(buf + 2, buf);
    } while (++i <= static_cast<int>(src_n));

#undef SW_LOOP
#undef SW_FILT_MACRO

#ifdef SW_FILT
    // Flush the last pending (coalesced) result.
    if (p.score) {
        pairAlignmentStrings.append(p);
        // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
    }
#endif
    _mm_free(matrix);
}
525 
calculateMatrixForMultipleAlignmentResultWithInt()526 void SmithWatermanAlgorithmSSE2::calculateMatrixForMultipleAlignmentResultWithInt() {
527     int i, j, n, k, max1;
528     __m128i f1 = _mm_setzero_si128(), f2 = _mm_setzero_si128(), f3 = _mm_setzero_si128(), f4 = _mm_setzero_si128(), e1 = _mm_setzero_si128();
529     unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
530     unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
531     unsigned int iter = (pat_n + 3) >> 2;
532 
533     n = iter * 2;
534     __m128i *buf, *matrix = (__m128i *)_mm_malloc((n + iter * 0x80 + iter) * 16 + matrixLength * iter * 4, 16);
535     int *score, *score1 = (int *)(matrix + n);
536     int *map = score1 + iter * 0x80 * 4;
537     char *dir, *dir2, *dir1 = (char *)(map + iter * 4);
538     memset(matrix, 0, n * sizeof(__m128i));
539     memset(dir1, 0, iter * 4);
540     dir = dir1 + iter * 4;
541     dir2 = dir1 + matrixLength * iter * 4;
542 
543     for (i = 0, j = 0; j < static_cast<int>(iter); j++) {
544         for (k = j, n = 0; n < 4; n++, k += iter) {
545             map[k] = i++;
546         }
547     }
548 
549     QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
550     char *alphaCharsData = alphaChars.data();
551     n = alphaChars.size();
552     for (i = 0; i < n; i++) {
553         int n2;
554         unsigned char ch = alphaCharsData[i];
555         score = score1 + ch * iter * 4;
556         for (j = 0; j < static_cast<int>(iter); j++) {
557             for (k = j, n2 = 0; n2 < 4; n2++, k += iter) {
558                 int a = -0x8000;
559                 if (k < static_cast<int>(pat_n)) {
560                     a = substitutionMatrix.getScore(ch, pat[k]);
561                 }
562                 *score++ = a;
563             }
564         }
565     }
566 
567     __m128i xMax = _mm_setzero_si128(), xPos = _mm_setzero_si128();
568     __m128i xOpen = _mm_cvtsi32_si128(gapOpen);
569     __m128i xExt = _mm_cvtsi32_si128(gapExtension);
570     xOpen = _mm_shuffle_epi32(xOpen, 0);
571     xExt = _mm_shuffle_epi32(xExt, 0);
572 
573     PairAlignSequences p;
574 
575     p.refSubseqInterval.startPos = 0;
576     p.score = 0;
577 
578     i = 1;
579     do {
580         buf = matrix;
581         score = score1 + src[i - 1] * iter * 4;
582         xMax = _mm_xor_si128(xMax, xMax);
583         f1 = _mm_slli_si128(_mm_load_si128(buf + (iter - 1) * 2), 4);
584         e1 = _mm_xor_si128(e1, e1);
585         if (dir == dir2)
586             dir = dir1;
587         j = iter;
588         do {
589             f2 = _mm_add_epi32(f1, *((__m128i *)score));
590             score += 4; /* subst */
591 
592             n = alphaChars.size();
593             for (i = 0; i < n; i++) {
594                 int n2;
595                 unsigned char ch = alphaCharsData[i];
596                 score = score1 + ch * iter * 4;
597                 for (j = 0; j < static_cast<int>(iter); j++) {
598                     for (k = j, n2 = 0; n2 < 4; n2++, k += iter) {
599                         int a = -0x8000;
600                         if (k < static_cast<int>(pat_n)) {
601                             a = substitutionMatrix.getScore(ch, pat[k]);
602                         }
603                         *score++ = a;
604                     }
605                 }
606             }
607 
608             f3 = _mm_xor_si128(f3, f3);
609             f3 = _mm_cmpgt_epi32(f2, f3);
610             f2 = _mm_and_si128(f2, f3);
611             f3 = _mm_slli_epi32(f3, 2);
612             /* f2 f3 */
613             f4 = _mm_cvtsi32_si128(j);
614             f4 = _mm_shuffle_epi32(f4, 0);
615             f1 = _mm_cmpgt_epi32(xMax, f2);
616             xMax = _mm_xor_si128(xMax, f2);
617             xMax = _mm_and_si128(xMax, f1);
618             xMax = _mm_xor_si128(xMax, f2);
619             xPos = _mm_or_si128(_mm_and_si128(xPos, f1), _mm_andnot_si128(f1, f4));
620 
621             f1 = _mm_load_si128(buf + 1);
622             f4 = _mm_cmpgt_epi32(f1, f2);
623             f1 = _mm_xor_si128(f1, f2);
624             f1 = _mm_and_si128(f1, f4);
625             f1 = _mm_xor_si128(f1, f2);
626             f4 = _mm_slli_epi32(f4, 1);
627             f3 = _mm_or_si128(f3, f4);
628             /* f1 f3 */
629             f4 = _mm_cmpgt_epi32(e1, f1);
630             f2 = _mm_xor_si128(e1, f1);
631             f2 = _mm_and_si128(f2, f4);
632             f2 = _mm_xor_si128(f2, f1);
633             f3 = _mm_or_si128(f3, f4);
634             /* f2 f3 */
635             f1 = _mm_load_si128(buf);
636             _mm_store_si128(buf, f2);
637             f3 = _mm_packs_epi32(f3, f3);
638             f3 = _mm_packs_epi16(f3, f3);
639             *(int *)dir = _mm_cvtsi128_si32(f3);
640             dir += 4;
641             f2 = _mm_add_epi32(f2, xOpen);
642 
643             e1 = _mm_add_epi32(e1, xExt);
644             f4 = _mm_cmpgt_epi32(e1, f2);
645             e1 = _mm_xor_si128(e1, f2);
646             e1 = _mm_and_si128(e1, f4);
647             e1 = _mm_xor_si128(e1, f2);
648 
649             f3 = _mm_load_si128(buf + 1);
650             f3 = _mm_add_epi32(f3, xExt);
651             f4 = _mm_cmpgt_epi32(f3, f2);
652             f3 = _mm_xor_si128(f3, f2);
653             f3 = _mm_and_si128(f3, f4);
654             f3 = _mm_xor_si128(f3, f2);
655             _mm_store_si128(buf + 1, f3);
656             buf += 2;
657         } while (--j);
658 
659         buf = matrix;
660         j = (-1) * iter;
661         e1 = _mm_slli_si128(e1, 4);
662         f1 = _mm_load_si128(buf);
663 
664         f2 = _mm_add_epi32(f1, xOpen);
665         f3 = _mm_xor_si128(f3, f3);
666         f3 = _mm_cmpgt_epi32(f2, f3);
667         f2 = _mm_and_si128(f2, f3);
668         k = _mm_movemask_epi8(_mm_cmpgt_epi32(e1, f2));
669         if (k) {
670             do {
671                 f4 = _mm_cmpgt_epi32(e1, f1);
672                 f2 = _mm_xor_si128(e1, f1);
673                 f2 = _mm_and_si128(f2, f4);
674                 f2 = _mm_xor_si128(f2, f1);
675                 _mm_store_si128(buf, f2);
676 
677                 f2 = _mm_add_epi32(f2, xOpen);
678                 f4 = _mm_packs_epi32(f4, f4);
679                 f4 = _mm_packs_epi16(f4, f4);
680                 f4 = _mm_or_si128(f4, _mm_cvtsi32_si128(*((int *)dir + j)));
681                 f1 = _mm_load_si128(buf + 1);
682                 *((int *)dir + j) = _mm_cvtsi128_si32(f4);
683                 f3 = _mm_cmpgt_epi32(f1, f2);
684                 f1 = _mm_xor_si128(f1, f2);
685                 f1 = _mm_and_si128(f1, f3);
686                 f1 = _mm_xor_si128(f1, f2);
687                 _mm_store_si128(buf + 1, f1);
688 
689                 e1 = _mm_add_epi32(e1, xExt);
690                 buf += 2;
691                 if (!(++j)) {
692                     buf = matrix;
693                     j = (-1) * iter;
694                     e1 = _mm_slli_si128(e1, 4);
695                 }
696                 f1 = _mm_load_si128(buf);
697                 f2 = _mm_add_epi32(f1, xOpen);
698                 f3 = _mm_xor_si128(f3, f3);
699                 f3 = _mm_cmpgt_epi32(f2, f3);
700                 f2 = _mm_and_si128(f2, f3);
701                 k = _mm_movemask_epi8(_mm_cmpgt_epi32(e1, f2));
702             } while (k);
703         }
704         /*
705         for(j = 0; j < pat_n; j++)
706         printf(" %02X", *((int*)(matrix + (j % iter) * 2) + (j / iter)));
707         printf("\n");
708     */
709         max1 = *((int *)(&xMax));
710         n = 0;
711         k = 1;
712         do {
713             j = ((int *)(&xMax))[k];
714             if (j >= max1) {
715                 max1 = j;
716                 n = k;
717             }
718         } while (++k < 4);
719 
720         if (max1 >= minScore) {
721             QByteArray pairAlign;
722             int xpos = 1 + n * iter + iter - ((int *)(&xPos))[n];
723             j = i;
724             int xend = xpos;
725             char *xdir = dir - iter * 4;
726             for (;;) {
727                 if (!xpos)
728                     break;
729                 k = xdir[map[xpos - 1]];
730                 if (!k)
731                     break;
732                 if (k == -1) {
733                     pairAlign.append(PairAlignSequences::LEFT);
734                     xpos--;
735                     continue;
736                 }
737                 if (k == -2) {
738                     pairAlign.append(PairAlignSequences::UP);
739                 } else if (k == -4) {
740                     pairAlign.append(PairAlignSequences::DIAG);
741                     xpos--;
742                 }
743                 if (xdir == dir1)
744                     xdir = dir2;
745                 if (xdir == dir) { /* printf("#error\n"); */
746                     break;
747                 }
748                 xdir -= iter * 4;
749                 j--;
750             }
751 
752             p.score = max1;
753             p.refSubseqInterval.startPos = j;
754             p.refSubseqInterval.length = i - j;
755             p.ptrnSubseqInterval.startPos = xpos;
756             p.ptrnSubseqInterval.length = xend - xpos;
757             p.pairAlignment = pairAlign;
758             pairAlignmentStrings.append(p);
759 
760             // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
761             // printf("#%i-%i %s\n", xpos, xend - xpos, pairAlign.data());
762         }
763     } while (++i <= static_cast<int>(src_n));
764 
765     _mm_free(matrix);
766 }
767 
/*
 * Forward-only Smith-Waterman scan of searchSeq against patternSeq using SSE2
 * with 4 lanes of signed 32-bit scores per __m128i (striped query layout).
 * For every database row whose best score reaches minScore, a result carrying
 * the score and the ref-interval coordinates is appended to
 * pairAlignmentStrings; no traceback is produced here (the pairAlignment
 * field of the appended results is left unset).
 */
void SmithWatermanAlgorithmSSE2::calculateMatrixForAnnotationsResultWithInt() {
    int i, j, n, k, max1;
    __m128i f1 = _mm_setzero_si128(), f2 = _mm_setzero_si128(), f3 = _mm_setzero_si128(), f4 = _mm_setzero_si128(), e1 = _mm_setzero_si128();
    unsigned int src_n = searchSeq.length(), pat_n = patternSeq.length();
    unsigned char *src = (unsigned char *)searchSeq.data(), *pat = (unsigned char *)patternSeq.data();
    unsigned int iter = (pat_n + 3) >> 2;  // stripes per column: 4 pattern positions per vector

    // Working storage: (iter + 1) groups of 5 vectors (two interleaved row
    // buffers plus the vertical-gap F vector at offset +4), followed by the
    // 32-bit query profile: iter * 0x80 vectors = iter * 4 ints for each of
    // 0x80 character codes (covers 7-bit codes; profile is indexed by raw
    // char value below).
    n = (iter + 1) * 5;
    __m128i *buf, *matrix = (__m128i *)_mm_malloc((n + iter * 0x80) * sizeof(__m128i), 16);
    int *score, *score1 = (int *)(matrix + n);
    memset(matrix, 0, n * sizeof(__m128i));

    // Build the query profile: substitution scores of every alphabet char
    // against the pattern, laid out in striped order (lane n2 of stripe j
    // holds pattern position j + n2 * iter). Padding positions past pat_n
    // score -0x8000 so they can never win.
    QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
    char *alphaCharsData = alphaChars.data();
    n = alphaChars.size();
    for (i = 0; i < n; i++) {
        int n2;
        unsigned char ch = alphaCharsData[i];
        score = score1 + ch * iter * 4;
        for (j = 0; j < static_cast<int>(iter); j++) {
            for (k = j, n2 = 0; n2 < 4; n2++, k += iter) {
                int a = -0x8000;
                if (k < static_cast<int>(pat_n)) {
                    a = substitutionMatrix.getScore(ch, pat[k]);
                }
                *score++ = a;
            }
        }
    }

    // Broadcast the gap-open / gap-extend penalties to all four lanes.
    __m128i xMax = _mm_setzero_si128(), xPos = _mm_setzero_si128();
    __m128i xOpen = _mm_setzero_si128();
    xOpen = _mm_cvtsi32_si128(gapOpen);
    __m128i xExt = _mm_setzero_si128();
    xExt = _mm_cvtsi32_si128(gapExtension);
    xOpen = _mm_shuffle_epi32(xOpen, 0);
    xExt = _mm_shuffle_epi32(xExt, 0);

    PairAlignSequences p;

    p.refSubseqInterval.startPos = 0;
    p.score = 0;

    // One database row of the DP. SWA names the stripe buffer written this
    // row and SWB the previous row's buffer; both expand against the mutable
    // cursor `buf`, which walks `matrix` with stride 5 (H at +0, alignment
    // start row at +1, F at +4), so SWA/SWB track the current stripe as the
    // loop advances. After the main pass, a lazy-F style loop (see Farrar,
    // 2007) re-propagates vertical gaps while they can still improve H; the
    // max-min blends throughout are done with cmpgt/xor/and mask tricks.
    // Finally the 4-lane row maximum is reduced and, when it reaches
    // minScore, reported via SW_FILT_MACRO. NOTE: only /* */ comments are
    // legal inside the macro because of the line continuations.
#define SW_LOOP(SWA, SWB) \
    buf = matrix + 5; \
    score = score1 + src[i - 1] * iter * 4; /* profile row for this db char */ \
    xMax = _mm_xor_si128(xMax, xMax); /* per-row running maximum */ \
    f2 = _mm_slli_si128(_mm_load_si128(SWB + (iter - 1) * 5), 4); \
    f1 = _mm_slli_si128(_mm_load_si128(SWB + 1 + (iter - 1) * 5), 4); \
    f1 = _mm_or_si128(f1, _mm_cvtsi32_si128(i - 1)); /* fresh lane starts at row i-1 */ \
    e1 = _mm_xor_si128(e1, e1); \
    j = iter; \
    do { \
        f2 = _mm_add_epi32(f2, *((__m128i *)score)); \
        score += 4; /* subst */ \
        /* f2 f1 */ \
        f4 = _mm_cvtsi32_si128(i); \
        f4 = _mm_shuffle_epi32(f4, 0); \
        f3 = _mm_xor_si128(f3, f3); \
        f3 = _mm_cmpgt_epi32(f2, f3); \
        f2 = _mm_and_si128(f2, f3); /* clamp negative H to 0 (local alignment) */ \
        f3 = _mm_or_si128(_mm_and_si128(f1, f3), _mm_andnot_si128(f3, f4)); \
        /* f2 f3 */ \
        f1 = _mm_cmpgt_epi32(xMax, f2); \
        xMax = _mm_xor_si128(xMax, f2); \
        xMax = _mm_and_si128(xMax, f1); \
        xMax = _mm_xor_si128(xMax, f2); /* xMax = max(xMax, f2) via mask blend */ \
        xPos = _mm_or_si128(_mm_and_si128(xPos, f1), _mm_andnot_si128(f1, f3)); \
\
        f1 = _mm_load_si128(buf + 4); /* F (vertical gap) for this stripe */ \
        f4 = _mm_cmpgt_epi32(f1, f2); \
        f1 = _mm_xor_si128(f1, f2); \
        f1 = _mm_and_si128(f1, f4); \
        f1 = _mm_xor_si128(f1, f2); /* f1 = max(F, H) */ \
        f3 = _mm_or_si128(_mm_and_si128(f4, *(SWB + 1)), _mm_andnot_si128(f4, f3)); \
        /* f1 f3 */ \
        f4 = _mm_cmpgt_epi32(e1, f1); \
        f2 = _mm_xor_si128(e1, f1); \
        f2 = _mm_and_si128(f2, f4); \
        f2 = _mm_xor_si128(f2, f1); /* f2 = max(E, f1) */ \
        f3 = _mm_or_si128(_mm_and_si128(f4, *(SWA - 5 + 1)), _mm_andnot_si128(f4, f3)); \
        /* f2 f3 */ \
        _mm_store_si128(SWA, f2); /* store H and its alignment start row */ \
        _mm_store_si128(SWA + 1, f3); \
        f2 = _mm_add_epi32(f2, xOpen); \
        e1 = _mm_add_epi32(e1, xExt); \
        f1 = _mm_cmpgt_epi32(e1, f2); \
        e1 = _mm_xor_si128(e1, f2); \
        e1 = _mm_and_si128(e1, f1); \
        e1 = _mm_xor_si128(e1, f2); /* E = max(E + ext, H + open) */ \
        f3 = _mm_load_si128(buf + 4); \
        f3 = _mm_add_epi32(f3, xExt); \
        f1 = _mm_cmpgt_epi32(f3, f2); \
        f3 = _mm_xor_si128(f3, f2); \
        f3 = _mm_and_si128(f3, f1); \
        f3 = _mm_xor_si128(f3, f2); /* F = max(F + ext, H + open) */ \
        _mm_store_si128(buf + 4, f3); \
\
        f2 = _mm_load_si128(SWB); \
        f1 = _mm_load_si128(SWB + 1); \
        buf += 5; \
    } while (--j); \
\
    f4 = _mm_slli_si128(_mm_load_si128(SWA - 5 + 1), 4); \
    buf = matrix + 5; \
    j = 0; \
    e1 = _mm_slli_si128(e1, 4); \
    f2 = _mm_load_si128(SWA); \
    f1 = _mm_add_epi32(f2, xOpen); \
    f3 = _mm_xor_si128(f3, f3); \
    f3 = _mm_cmpgt_epi32(f1, f3); \
    f1 = _mm_and_si128(f1, f3); \
    k = _mm_movemask_epi8(_mm_cmpgt_epi32(e1, f1)); \
    if (k) /* lazy-F correction: loop while a pending gap can still raise H */ \
        do { \
            f3 = _mm_cmpgt_epi32(e1, f2); \
            f1 = _mm_xor_si128(e1, f2); \
            f1 = _mm_and_si128(f1, f3); \
            f1 = _mm_xor_si128(f1, f2); /* f1 = max(E, H) */ \
            f2 = _mm_or_si128(_mm_and_si128(f3, f4), _mm_andnot_si128(f3, *(SWA + 1))); \
            _mm_store_si128(SWA, f1); \
            _mm_store_si128(SWA + 1, f2); \
\
            f1 = _mm_add_epi32(f1, xOpen); \
            f2 = _mm_load_si128(buf + 4); \
            f3 = _mm_cmpgt_epi32(f1, f2); \
            f1 = _mm_xor_si128(f1, f2); \
            f1 = _mm_and_si128(f1, f3); \
            f1 = _mm_xor_si128(f1, f2); \
            _mm_store_si128(buf + 4, f2); \
\
            e1 = _mm_add_epi32(e1, xExt); \
            buf += 5; \
            if (++j >= static_cast<int>(iter)) { /* wrapped: shift into next lane */ \
                buf = matrix + 5; \
                j = 0; \
                e1 = _mm_slli_si128(e1, 4); \
                f4 = _mm_slli_si128(f4, 4); \
            } \
            f2 = _mm_load_si128(SWA); \
            f1 = _mm_add_epi32(f2, xOpen); \
            f3 = _mm_xor_si128(f3, f3); \
            f3 = _mm_cmpgt_epi32(f1, f3); \
            f1 = _mm_and_si128(f1, f3); \
            k = _mm_movemask_epi8(_mm_cmpgt_epi32(e1, f1)); \
        } while (k); \
\
    max1 = *((int *)(&xMax)); /* horizontal max over the 4 lanes */ \
    n = 0; \
    k = 1; \
    do { \
        j = ((int *)(&xMax))[k]; \
        if (j >= max1) { \
            max1 = j; \
            n = k; \
        } \
    } while (++k < 4); \
\
    if (max1 >= minScore) { /* report row i through the filter macro */ \
        j = ((int *)(&xPos))[n]; \
        SW_FILT_MACRO; \
    }

    // #define SW_FILT

    // With SW_FILT defined, consecutive rows sharing the same start position
    // are merged, keeping only the best-scoring one; otherwise every
    // qualifying row is appended as its own result.
#ifdef SW_FILT
#    define SW_FILT_MACRO \
        if (p.refSubseqInterval.startPos != j) { \
            if (p.score) { \
                pairAlignmentStrings.append(p); \
                /* printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score); */ \
            } \
            p.refSubseqInterval.startPos = j; \
            p.refSubseqInterval.length = i - j; \
            p.score = max1; \
        } else if (p.score < max1) { \
            p.refSubseqInterval.length = i - j; \
            p.score = max1; \
        }
#else
#    define SW_FILT_MACRO \
        p.refSubseqInterval.startPos = j; \
        p.refSubseqInterval.length = i - j; \
        p.score = max1; \
        pairAlignmentStrings.append(p); \
        /* printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score); */
#endif

    // Driver: rows alternate between the two interleaved stripe buffers
    // (even rows write buf, read buf + 2; odd rows the reverse). `i` is the
    // 1-based database row index.
    i = 1;
    do {
        SW_LOOP(buf, buf + 2);
        if (++i > static_cast<int>(src_n)) {
            break;
        }
        SW_LOOP(buf + 2, buf);
    } while (++i <= static_cast<int>(src_n));

#undef SW_LOOP
#undef SW_FILT_MACRO

#ifdef SW_FILT
    // Flush the last merged result that the filter was still accumulating.
    if (p.score) {
        pairAlignmentStrings.append(p);
        // printf("#%i-%i %i\n", (int)p.refSubseqInterval.startPos, (int)p.refSubseqInterval.length, (int)p.score);
    }
#endif
    _mm_free(matrix);
}
976 
printVector(__m128i & toprint,int add)977 inline void SmithWatermanAlgorithmSSE2::printVector(__m128i &toprint, int add) {
978     ScoreType *tmpArray = (ScoreType *)_mm_malloc(nElementsInVec * sizeof(ScoreType), 16);
979 
980     _mm_store_si128((__m128i *)&tmpArray[0], toprint);
981     cout << "printVector" << endl;
982     for (int i = 0; i < nElementsInVec; i++)
983         cout << tmpArray[i] + add << " ";
984 
985     cout << endl;
986 }
987 
calculateMatrixSSE2(unsigned queryLength,unsigned char * dbSeq,unsigned dbLength,unsigned short gapOpenOrig,unsigned short gapExtend)988 int SmithWatermanAlgorithmSSE2::calculateMatrixSSE2(unsigned queryLength, unsigned char *dbSeq, unsigned dbLength, unsigned short gapOpenOrig, unsigned short gapExtend) {
989     unsigned iter = (queryLength + nElementsInVec - 1) / nElementsInVec;
990 
991     int ALPHA_SIZE = substitutionMatrix.getAlphabet()->getNumAlphabetChars();
992 
993     __m128i *pvQueryProf = (__m128i *)_mm_malloc('Z' * ALPHA_SIZE * iter * sizeof(__m128i), 16);
994 
995     int weight = 0;
996     unsigned short *queryProfile = (unsigned short *)pvQueryProf;
997     int segSize = (queryLength + 7) / 8;  // iter
998     int nCount = segSize * 8;
999     char curChar = ' ';
1000 
1001     QByteArray alphaChars = substitutionMatrix.getAlphabet()->getAlphabetChars();
1002     for (int i = 0; i < ALPHA_SIZE; i++) {
1003         curChar = alphaChars.at(i);
1004         int h = 0;
1005         for (int j = 0; j < segSize; j++) {
1006             unsigned int k = j;
1007             for (unsigned int kk = 0; kk < 8; kk++) {
1008                 if (k >= queryLength) {
1009                     weight = 0;
1010                 } else {
1011                     weight = substitutionMatrix.getScore(curChar, patternSeq.at(k));
1012                 }
1013                 queryProfile[curChar * nCount + h] = (unsigned short)weight;
1014                 k += segSize;
1015                 h++;
1016             }
1017         }
1018     }
1019 
1020     __m128i *pvHLoad = (__m128i *)_mm_malloc(iter * sizeof(__m128i), 16);
1021     __m128i *pvHStore = (__m128i *)_mm_malloc(iter * sizeof(__m128i), 16);
1022     __m128i *pvE = (__m128i *)_mm_malloc(iter * sizeof(__m128i), 16);
1023 
1024     unsigned i = 0, j = 0;
1025     int score = 0;
1026 
1027     int cmp = 0;
1028 
1029     unsigned short gapOpenFarrar = gapOpenOrig - gapExtend;
1030 
1031     __m128i *pv = 0;
1032 
1033     __m128i vE = _mm_set1_epi32(0), vF = _mm_set1_epi32(0), vH = _mm_set1_epi32(0);
1034 
1035     __m128i vMaxScore = _mm_set1_epi32(0);
1036     __m128i vGapOpen = _mm_set1_epi32(0);
1037     __m128i vGapExtend = _mm_set1_epi32(0);
1038 
1039     __m128i vMin = _mm_set1_epi32(0);
1040     __m128i vMinimums = _mm_set1_epi32(0);
1041     __m128i vTemp = _mm_set1_epi32(0);
1042 
1043     __m128i *pvScore = 0;
1044 
1045     /* Load gap opening penalty to all elements of a constant */
1046     vGapOpen = _mm_insert_epi16(vGapOpen, gapOpenFarrar, 0);
1047     vGapOpen = _mm_shufflelo_epi16(vGapOpen, 0);
1048     vGapOpen = _mm_shuffle_epi32(vGapOpen, 0);
1049 
1050     /* Load gap extension penalty to all elements of a constant */
1051     vGapExtend = _mm_insert_epi16(vGapExtend, gapExtend, 0);
1052     vGapExtend = _mm_shufflelo_epi16(vGapExtend, 0);
1053     vGapExtend = _mm_shuffle_epi32(vGapExtend, 0);
1054 
1055     /*  load vMaxScore with the zeros.  since we are using signed */
1056     /*  math, we will bias the maxscore to -32768 so we have the */
1057     /*  full range of the short. */
1058     vMaxScore = _mm_cmpeq_epi16(vMaxScore, vMaxScore);
1059     vMaxScore = _mm_slli_epi16(vMaxScore, 15);
1060 
1061     vMinimums = _mm_shuffle_epi32(vMaxScore, 0);
1062     Q_UNUSED(vMinimums);
1063 
1064     vMin = _mm_shuffle_epi32(vMaxScore, 0);
1065     vMin = _mm_srli_si128(vMin, 14);
1066 
1067     /* Zero out the storage vector */
1068     for (i = 0; i < iter; ++i) {
1069         _mm_store_si128(pvE + i, vMaxScore);
1070         _mm_store_si128(pvHStore + i, vMaxScore);
1071     }
1072 
1073     PairAlignSequences p;
1074     U2Region sReg;
1075 
1076     for (i = 0; i < dbLength; ++i) {
1077         /* fetch first data asap. */
1078         pvScore = pvQueryProf + dbSeq[i] * iter;
1079 
1080         /* zero out F. */
1081         vF = _mm_cmpeq_epi16(vF, vF);
1082         vF = _mm_slli_epi16(vF, 15);
1083 
1084         /* load the next h value */
1085         vH = _mm_load_si128(pvHStore + iter - 1);
1086         vH = _mm_slli_si128(vH, 2);
1087         vH = _mm_or_si128(vH, vMin);
1088 
1089         pv = pvHLoad;
1090         pvHLoad = pvHStore;
1091         pvHStore = pv;
1092 
1093         for (j = 0; j < iter; ++j) {
1094             /* load values of vF and vH from previous row (one unit up) */
1095             vE = _mm_load_si128(pvE + j);
1096 
1097             /* add score to vH */
1098             vH = _mm_adds_epi16(vH, *pvScore++);
1099 
1100             /* Update highest score encountered this far */
1101             vMaxScore = _mm_max_epi16(vMaxScore, vH);
1102 
1103             /* get max from vH, vE and vF */
1104             vH = _mm_max_epi16(vH, vE);
1105             vH = _mm_max_epi16(vH, vF);
1106 
1107             /* save vH values */
1108             _mm_store_si128(pvHStore + j, vH);
1109 
1110             /* update vE value */
1111             vH = _mm_subs_epi16(vH, vGapOpen);
1112             vE = _mm_subs_epi16(vE, vGapExtend);
1113             vE = _mm_max_epi16(vE, vH);
1114 
1115             /* update vF value */
1116             vF = _mm_subs_epi16(vF, vGapExtend);
1117             vF = _mm_max_epi16(vF, vH);
1118 
1119             /* save vE values */
1120             _mm_store_si128(pvE + j, vE);
1121 
1122             /* load the next h value */
1123             vH = _mm_load_si128(pvHLoad + j);
1124         }
1125 
1126         /* reset pointers to the start of the saved data */
1127         j = 0;
1128         vH = _mm_load_si128(pvHStore + j);
1129 
1130         /*  the computed vF value is for the given column.  since */
1131         /*  we are at the end, we need to shift the vF value over */
1132         /*  to the next column. */
1133 
1134         vF = _mm_slli_si128(vF, 2);
1135         vF = _mm_or_si128(vF, vMin);
1136         vTemp = _mm_subs_epi16(vH, vGapOpen);
1137 
1138         vTemp = _mm_cmpgt_epi16(vF, vTemp);
1139 
1140         cmp = _mm_movemask_epi8(vTemp);
1141 
1142         while (cmp != 0x0000)
1143         // for (unsigned cnt=0; cnt<iter; ++cnt)
1144         {
1145             vE = _mm_load_si128(pvE + j);
1146 
1147             vH = _mm_max_epi16(vH, vF);
1148 
1149             /* save vH values */
1150             _mm_store_si128(pvHStore + j, vH);
1151 
1152             /*  update vE incase the new vH value would change it */
1153             vH = _mm_subs_epi16(vH, vGapOpen);
1154             vE = _mm_max_epi16(vE, vH);
1155 
1156             /* update vF value */
1157             vF = _mm_subs_epi16(vF, vGapExtend);
1158 
1159             j++;
1160             if (j >= iter) {
1161                 j = 0;
1162                 vF = _mm_slli_si128(vF, 2);
1163                 vF = _mm_or_si128(vF, vMin);
1164             }
1165 
1166             vH = _mm_load_si128(pvHStore + j);
1167 
1168             vTemp = _mm_subs_epi16(vH, vGapOpen);
1169             vTemp = _mm_cmpgt_epi16(vF, vTemp);
1170             cmp = _mm_movemask_epi8(vTemp);
1171         }
1172     }
1173 
1174     /* find largest score in the vMaxScore vector */
1175     vTemp = _mm_srli_si128(vMaxScore, 8);
1176     vMaxScore = _mm_max_epi16(vMaxScore, vTemp);
1177     vTemp = _mm_srli_si128(vMaxScore, 4);
1178     vMaxScore = _mm_max_epi16(vMaxScore, vTemp);
1179     vTemp = _mm_srli_si128(vMaxScore, 2);
1180     vMaxScore = _mm_max_epi16(vMaxScore, vTemp);
1181 
1182     /* store in temporary variable */
1183     score = (short)_mm_extract_epi16(vMaxScore, 0);
1184 
1185     _mm_free(pvHLoad);
1186     _mm_free(pvHStore);
1187     _mm_free(pvE);
1188     _mm_free(pvQueryProf);
1189 
1190     /* return largest score */
1191     return score + 32768;
1192 }
1193 
1194 }  // namespace U2
1195