#include "uhmmsearch_opt.h"

#include <U2Core/SequenceWalkerTask.h>
#include <U2Core/U2Region.h>
#include <U2Core/Timer.h>
#include <hmmer2/funcs.h>

#include <assert.h>
#include <emmintrin.h>

using namespace U2;

namespace {

#define CHUNK_SIZE (10 * 1024)
const int ALIGNMENT_MEM = 128;

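// Rounds ptr up to the next multiple of base (base must be a power of two).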
#define ALIGNED(ptr, base) ( (unsigned char*) ( (((quintptr)(ptr))+((base)-1)) &~((base-1)) ) )

struct hmm_opt {
    static const int ALIGNMENT_MEM = 128;   // alignment of mem
    static const int ALIGNMENT_SCORES = 16; // alignment of tsc, msc, etc.
    int   M;                         // Length of this HMM in states
    int * mem;                       // honestly allocated memory. All pointers below are mem + some offset.

    int * tsc[7];                    // transition scores [0..6][1..M-1]
    int * msc[MAXCODE];              // M state emission probs. [0..MAXCODE-1][1..M]
    int * isc[MAXCODE];              // I state emission probs. [0..MAXCODE-1][1..M-1]
    int * xsc[4];                    // N,E,C,J transition scores [0..3][0..1]
    int * bsc;                       // B state transition probs. [1..M]
    int * esc;                       // E state transition probs. [1..M]

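    // Copies the score tables of a plan7 HMM into a single allocation.
    // Rows are padded to 16-byte boundaries, and several pointers are biased
    // by 3 ints so that element [1] (the first real model state) starts a
    // 16-byte-aligned __m128i lane, matching the casts in viterbiSSE.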
    hmm_opt( plan7_s * hmm ) {
        assert( NULL != hmm );
        M = hmm->M;

        qint64 tsc_sz = 7 * ( M + 16 );
        qint64 msc_sz = MAXCODE * ( M + 1 + 16 );
        qint64 isc_sz = MAXCODE * ( M + 16 );
        qint64 xsc_sz = 50;
        qint64 bsc_sz = M + 1 + 12;
        qint64 esc_sz = M + 1 + 12;

        qint64 bsc_real_sz = M + 1;
        qint64 esc_real_sz = M + 1;

        qint64 total_sz = ( tsc_sz + msc_sz + isc_sz + xsc_sz + bsc_sz + esc_sz ) * sizeof( int ) +
            ALIGNMENT_MEM + 0x10 * ( 15 + 2 * MAXCODE ) * sizeof(int);

        mem = (int*)malloc( total_sz );
        assert( NULL != mem );
        memset( mem, 0, total_sz );
        int* mem_aligned = (int*)ALIGNED( mem, ALIGNMENT_MEM );

        bsc = mem_aligned;
        bsc += 3;
        memcpy( bsc, hmm->bsc_mem, bsc_real_sz * sizeof(int) );

        esc = (int*)ALIGNED( bsc + 4 + bsc_sz, ALIGNMENT_SCORES ) + 0x10;
        esc += 3;
        memcpy( esc, hmm->esc_mem, esc_real_sz * sizeof(int) );
        for (int i = 1; i < 8; i++) {
            esc[M+i] = -INFTY;
        }

        int* tsc_tmp = (int*)ALIGNED( esc + esc_sz, ALIGNMENT_SCORES ) + 0x10;
        for(int i = 0; i < 7; ++i ) {
            tsc[i] = tsc_tmp;
            tsc_tmp = (int*)ALIGNED( tsc_tmp + M, ALIGNMENT_SCORES ) + 0x10;
        }
        tsc[1] += 3;
        tsc[4] += 3;
        for(int i = 0; i < 7; ++i ) {
            memcpy( tsc[i], hmm->tsc[i], ( M ) * sizeof(int) );
        }

        int* msc_tmp = (int*)ALIGNED( tsc[0] + tsc_sz, ALIGNMENT_SCORES );
        for(int i = 0; i < MAXCODE; ++i ) {
            msc[i] = msc_tmp;
            msc[i] += 3;
            msc_tmp = (int*)ALIGNED( msc_tmp + M + 1, ALIGNMENT_SCORES ) + 0x10;
            memcpy( msc[i], hmm->msc[i], ( M + 1 ) * sizeof(int) );
        }

        int* isc_tmp = (int*)ALIGNED( msc[0] + msc_sz, ALIGNMENT_SCORES ) + 0x10;
        for(int i = 0; i < MAXCODE; ++i ) {
            isc[i] = isc_tmp;
            isc[i] += 3;
            isc_tmp = (int*)ALIGNED( isc_tmp + M, ALIGNMENT_SCORES ) + 0x10;
            memcpy( isc[i], hmm->isc[i], ( M ) * sizeof(int) );
        }

        int* xsc_tmp = (int*)ALIGNED( isc[0] + isc_sz, ALIGNMENT_SCORES ) + 0x10;
        for(int i = 0; i < 4; ++i ) {
            xsc[i] = xsc_tmp;
            xsc_tmp = (int*)ALIGNED( xsc_tmp + 2, ALIGNMENT_SCORES ) + 0x10;
            for( int j = 0; j < 2; ++j ) {
                xsc[i][j] = hmm->xsc[i][j];
            }
        }
    }

    ~hmm_opt() {
        free(mem);
    }

    // Debug helper: prints every repacked score table.
    void dump() {
        int i = 0;
        int j = 0;
        printf("--START HMM OPT DUMP (M=%d)\n", M);
        for (i=0;i<M;i++){
            printf("HMM->ESC[%d]=%d\n",i, esc[i]);
        }
        for (i=0;i<M;i++){
            printf("HMM->BSC[%d]=%d\n",i, bsc[i]);
        }
        for (i=0;i<MAXCODE;i++){
            for (j=0;j<=M;j++){
                printf("HMM->MSC[%d][%d]=%d\n",i,j, msc[i][j]);
            }
        }
        for (i=0;i<4;i++){
            for (j=0;j<2;j++){
                printf("HMM->XSC[%d][%d]=%d\n",i,j, xsc[i][j]);
            }
        }
        for (i=0;i<7;i++){
            for (j=0;j<M;j++){
                printf("HMM->TSC[%d][%d]=%d\n",i,j, tsc[i][j]);
            }
        }
        for (i=0;i<MAXCODE;i++){
            for (j=0;j<M;j++){
                printf("HMM->ISC[%d][%d]=%d\n",i,j, isc[i][j]);
            }
        }
        printf("--END HMM OPT DUMP (M=%d)\n", M);
    }
};

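// at0..at3 read lane 0..3 of a vector as a 32-bit int. Each lane is rebuilt
// from two 16-bit extracts because the 32-bit extract is an SSE4.1 intrinsic
// and this file only includes emmintrin.h (SSE2).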
inline int at0( __m128i from ) {
    int a = _mm_extract_epi16( from, 0 ) & 0xFFFF;
    int b = _mm_extract_epi16( from, 1 ) & 0xFFFF;
    return (a | (b << 16));
}

inline int at1( __m128i from ) {
    int a = _mm_extract_epi16( from, 2 ) & 0xFFFF;
    int b = _mm_extract_epi16( from, 3 ) & 0xFFFF;
    return (a | (b << 16));
}

inline int at2( __m128i from ) {
    int a = _mm_extract_epi16( from, 4 ) & 0xFFFF;
    int b = _mm_extract_epi16( from, 5 ) & 0xFFFF;
    return (a | (b << 16));
}

inline int at3( __m128i from ) {
    int a = _mm_extract_epi16( from, 6 ) & 0xFFFF;
    int b = _mm_extract_epi16( from, 7 ) & 0xFFFF;
    return (a | (b << 16));
}

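// shufflePMask(a, b) shifts a up by one lane and pulls the top lane of b into
// lane 0, producing the vector of "state k-1" values from two adjacent 4-state
// blocks. The shuffleD*Mask variants build the staggered vectors used by the
// serial delete-state recursion, and insert0I32/insert3I32 overwrite lane 0 or
// lane 3 of a vector with a scalar.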
inline __m128i shufflePMask( __m128i a, __m128i b ) {
    return _mm_set_epi32( at2(a), at1(a), at0(a), at3(b) );
}

inline __m128i shuffleD0Mask( const __m128i& a, const __m128i& b ) {
    return _mm_set_epi32( at3(b), at2(b), at1(a), at0(a) );
}

inline __m128i shuffleD1Mask( const __m128i& a, const __m128i& b ) {
    return _mm_set_epi32( at0(b), at0(b), at0(b), at0(a) );
}

inline __m128i shuffleD2Mask( const __m128i& a, const __m128i& b ) {
    return _mm_set_epi32( at1(b), at1(b), at1(a), at0(a) );
}

inline __m128i shuffleD3Mask( const __m128i& a, const __m128i& b ) {
    return _mm_set_epi32( at2(b), at2(a), at1(a), at0(a) );
}

inline __m128i insert0I32( const __m128i& a, int b ) {
    __m128i mask_a = _mm_set_epi32( 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0 );
    __m128i mask_b = _mm_set_epi32( 0, 0, 0, b );
    __m128i res = _mm_and_si128( a, mask_a );
    res = _mm_or_si128( res, mask_b );
    return res;
}

inline __m128i insert3I32( const __m128i& a, int b ) {
    __m128i mask_a = _mm_set_epi32( 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF );
    __m128i mask_b = _mm_set_epi32( b, 0, 0, 0 );
    __m128i res = _mm_and_si128( a, mask_a );
    res = _mm_or_si128( res, mask_b );
    return res;
}

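// Branchless select: returns b where pattern bits are set, a elsewhere.
// Combined with _mm_cmpgt_epi32 masks it yields a per-lane maximum.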
inline __m128i selectBits( const __m128i& a, const __m128i& b, const __m128i& pattern ) {
    __m128i tmpB = _mm_and_si128( pattern, b );
    __m128i tmpA = _mm_andnot_si128( pattern, a );
    return _mm_or_si128( tmpA, tmpB );
}

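// Horizontal maximum over the four 32-bit lanes of a vector.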
inline int maxComponent( const __m128i& a ) {
    int max = at0(a);
    int e1 = at1(a);
    int e2 = at2(a);
    int e3 = at3(a);

    max = max > e1 ? max : e1;
    max = max > e2 ? max : e2;
    max = max > e3 ? max : e3;
    return max;
}

//void p ( const char * name, __m128i arg ) {
//    printf( "%s: {%d, %d, %d, %d}\n", name,
//        at0(arg),
//        at1(arg),
//        at2(arg),
//        at3(arg) );
//}

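// Vectorized Viterbi recursion over a digitized sequence. Match and insert
// scores are updated four model states per __m128i; the delete-state
// recursion, which carries a dependency from state k-1 to state k, is
// unrolled through the shuffleD*Mask helpers. Only the current and previous
// DP rows are kept (the caller supplies both score buffers), so the function
// returns a score but no traceback.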
float viterbiSSE( hmm_opt* hmm, const unsigned char * seq, int seqLength, int* mscoreC, int* mscoreP, int* iscoreC, int *iscoreP, int* dscoreC, int *dscoreP ) {
    assert( NULL != hmm );
    assert( NULL != seq && 0 < seqLength );
    assert( NULL != mscoreC && NULL != mscoreP );
    assert( NULL != iscoreC && NULL != iscoreP );
    assert( NULL != dscoreC && NULL != dscoreP );

    __m128i itV1, itV2, itV3, itV4, itV5, itV6, itV7, itV8;
    __m128i ipV, ipV1, dpV, dpV1, mpV, mpV1;
    __m128i dt1, dt2, dt3, dt4, dt, mt1, mt2, mt3, mt4;
    __m128i xscXTN        = _mm_set1_epi32( hmm->xsc[XTN][MOVE] );
    __m128i xscPXMB       = _mm_set1_epi32( hmm->xsc[XTN][MOVE] );
    __m128i hmmxscXTNLOOP = _mm_set1_epi32( hmm->xsc[XTN][LOOP] );
    __m128i hmmxscXTCMOVE = _mm_set1_epi32( hmm->xsc[XTC][MOVE] );
    __m128i hmmxscXTCLOOP = _mm_set1_epi32( hmm->xsc[XTC][LOOP] );
    __m128i hmmxscXTJMOVE = _mm_set1_epi32( hmm->xsc[XTJ][MOVE] );
    __m128i hmmxscXTJLOOP = _mm_set1_epi32( hmm->xsc[XTJ][LOOP] );
    __m128i hmmxscXTEMOVE = _mm_set1_epi32( hmm->xsc[XTE][MOVE] );
    __m128i hmmxscXTELOOP = _mm_set1_epi32( hmm->xsc[XTE][LOOP] );
    __m128i zeroVEC       = _mm_set1_epi32( 0 );
    __m128i INFTY_VEC     = _mm_set1_epi32( -INFTY );
    __m128i xscCXME       = _mm_set1_epi32( -INFTY );
    __m128i xscCXMC       = _mm_set1_epi32( -INFTY );
    __m128i xscCXMJ       = _mm_set1_epi32( -INFTY );

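    // Reinterpret the aligned score rows as vectors of four consecutive
    // states. The +1 offsets skip unused state 0 and land on a 16-byte
    // boundary thanks to the pointer biasing in hmm_opt and sseScoring.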
    __m128i *hmmtscTII    = (__m128i*)( hmm->tsc[TII] + 1 );
    __m128i *hmmtscTMM    = (__m128i*)  hmm->tsc[TMM];
    __m128i *hmmtscTIM    = (__m128i*)  hmm->tsc[TIM];
    __m128i *hmmtscTDM    = (__m128i*)  hmm->tsc[TDM];
    __m128i *hmmtscTMI    = (__m128i*)( hmm->tsc[TMI] + 1 );
    __m128i *hmmbsc       = (__m128i*)( hmm->bsc + 1 );
    __m128i *hmmesc       = (__m128i*)( hmm->esc + 1 );

    __m128i *mscCV        = (__m128i*)( mscoreC + 1 );
    __m128i *iscCV        = (__m128i*)( iscoreC + 1 );
    __m128i *dscCV        = (__m128i*)( dscoreC + 1 );
    __m128i *mscPV        = (__m128i*)( mscoreP + 1 );
    __m128i *iscPV        = (__m128i*)( iscoreP + 1 );
    __m128i *dscPV        = (__m128i*)( dscoreP + 1 );
    __m128i *Mdt          = (__m128i*)  hmm->tsc[TMD] - 1;

    __m128i *mscDSQ, *iscDSQ, *tmpMV, *tmpIV, *tmpDV;
    __m128i cmp1, cmp2, cmp3;

    float result = 0;
    int   i = 0, j = 0, k = 0, sc = 0, M4th = 0;
    int   *tmpM = NULL, *tmpD = NULL, *hmmtscTDDip = hmm->tsc[TDD];

    M4th = ( hmm->M + 3 ) >> 2;
    for( k = 0; k <= M4th; ++k ) {
        mscPV[k] = INFTY_VEC;
        iscPV[k] = INFTY_VEC;
        dscPV[k] = INFTY_VEC;
    }

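    // Main DP loop: one iteration per sequence residue. The previous row was
    // initialized to -INFTY above; the inner loop below advances two __m128i
    // blocks (eight model states) per iteration, with j tracking the scalar
    // state index into the TDD row and the delete-score buffer.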
    for( i = 1; i <= seqLength; ++i ) {
        xscCXME = INFTY_VEC;

        mscDSQ   = (__m128i*)( hmm->msc[(int)seq[i]] + 1 );
        iscDSQ   = (__m128i*)( hmm->isc[(int)seq[i]] + 1 );
        for( k = 1, j = 1; k <= M4th; j += 8, k += 2 ) {
            ipV        = shufflePMask( iscPV[k-1], iscPV[k-2] );
            ipV1       = shufflePMask( iscPV[k],   iscPV[k-1] );
            mpV        = shufflePMask( mscPV[k-1], mscPV[k-2] );
            mpV1       = shufflePMask( mscPV[k],   mscPV[k-1] );
            dpV        = shufflePMask( dscPV[k-1], dscPV[k-2] );
            dpV1       = shufflePMask( dscPV[k],   dscPV[k-1] );

            itV2       = _mm_add_epi32( ipV,  hmmtscTIM[k-1] );
            itV6       = _mm_add_epi32( ipV1, hmmtscTIM[k] );
            itV1       = _mm_add_epi32( mpV,  hmmtscTMM[k-1] );
            itV5       = _mm_add_epi32( mpV1, hmmtscTMM[k] );
            itV4       = _mm_add_epi32( dpV,  hmmtscTDM[k-1] );
            itV8       = _mm_add_epi32( dpV1, hmmtscTDM[k] );
            itV3       = _mm_add_epi32( xscPXMB, hmmbsc[k-1] );
            itV7       = _mm_add_epi32( xscPXMB, hmmbsc[k] );

            cmp1       = _mm_cmpgt_epi32( itV1, itV2 );
            itV1       = selectBits( itV2, itV1, cmp1 );

            cmp2       = _mm_cmpgt_epi32( itV3, itV4 );
            dt4        = insert3I32( zeroVEC, hmmtscTDDip[j+2] );
            itV2       = selectBits( itV4, itV3, cmp2 );
            dt3        = _mm_set1_epi32( hmmtscTDDip[j+1] );
            cmp1       = _mm_cmpgt_epi32( itV1, itV2 );
            dt3        = _mm_add_epi32( dt3, dt4 );
            itV3       = selectBits( itV2, itV1, cmp1 );
            cmp2       = _mm_cmpgt_epi32( itV5, itV6 );
            dt3        = shuffleD0Mask( zeroVEC, dt3 );
            itV1       = selectBits( itV6, itV5, cmp2 );

            cmp1       = _mm_cmpgt_epi32( itV7, itV8 );
            dt2        = _mm_set1_epi32( hmmtscTDDip[j] );
            itV2       = selectBits( itV8, itV7, cmp1 );
            dt2        = _mm_add_epi32( dt2, dt3 );
            cmp2       = _mm_cmpgt_epi32( itV1, itV2 );
            mscCV[k-1] = _mm_add_epi32( itV3, mscDSQ[k-1] );
            itV4       = selectBits( itV2, itV1, cmp2 );
            dt2        = insert0I32( dt2, 0 );
            mscCV[k]   = _mm_add_epi32( itV4, mscDSQ[k] );
            dt1        = _mm_set1_epi32( hmmtscTDDip[j-1] );
            mpV        = shufflePMask( mscCV[k-1], mscCV[k-2] );
            dt1        = _mm_add_epi32( dt1, dt2 );
            mt1        = _mm_add_epi32( mpV, Mdt[k] );
            dt         = _mm_set1_epi32( dscoreC[j-1] );
            mt2        = shuffleD1Mask( INFTY_VEC, mt1 );
            dt1        = _mm_add_epi32( dt, dt1 );
            mt3        = shuffleD2Mask( INFTY_VEC, mt1 );
            dt2        = _mm_add_epi32( mt2, dt2 );
            mt4        = shuffleD3Mask( INFTY_VEC, mt1 );
            dt3        = _mm_add_epi32( mt3, dt3 );
            cmp1       = _mm_cmpgt_epi32( dt1, mt1 );
            dt4        = _mm_add_epi32( mt4, dt4 );
            cmp2       = _mm_cmpgt_epi32( dt2, dt3 );
            dt1        = selectBits( mt1, dt1, cmp1 );
            cmp3       = _mm_cmpgt_epi32( dt1, dt4 );
            dt2        = selectBits( dt3, dt2, cmp2 );
            dt1        = selectBits( dt4, dt1, cmp3 );
            cmp2       = _mm_cmpgt_epi32( dt1, dt2 );
            dt4        = insert3I32( zeroVEC, hmmtscTDDip[j+6] );
            dscCV[k-1] = selectBits( dt2, dt1, cmp2 );
            dt3        = _mm_set1_epi32( hmmtscTDDip[j+5] );
            itV1       = _mm_add_epi32( mscPV[k-1], hmmtscTMI[k-1] );
            dt3        = _mm_add_epi32( dt3, dt4 );
            dt3        = shuffleD0Mask( zeroVEC, dt3 );
            dt2        = _mm_set1_epi32( hmmtscTDDip[j+4] );
            itV2       = _mm_add_epi32( iscPV[k-1], hmmtscTII[k-1] );
            dt2        = _mm_add_epi32( dt2, dt3 );
            dt1        = _mm_set1_epi32( hmmtscTDDip[j+3] );
            dt2        = insert0I32( dt2, 0 );
            cmp1       = _mm_cmpgt_epi32( itV1, itV2 );
            dt1        = _mm_add_epi32( dt1, dt2 );
            mpV        = shufflePMask( mscCV[k], mscCV[k-1] );
            mt1        = _mm_add_epi32( mpV, Mdt[k+1] );
            dt         = _mm_set1_epi32( dscoreC[j+3] );
            itV3       = selectBits( itV2, itV1, cmp1 );
            dt1        = _mm_add_epi32( dt, dt1 );
            mt2        = shuffleD1Mask( INFTY_VEC, mt1 );
            itV1       = _mm_add_epi32( mscPV[k], hmmtscTMI[k] );
            dt2        = _mm_add_epi32( mt2, dt2 );
            mt3        = shuffleD2Mask( INFTY_VEC, mt1 );
            itV2       = _mm_add_epi32( iscPV[k], hmmtscTII[k] );
            dt3        = _mm_add_epi32( mt3, dt3 );
            mt4        = shuffleD3Mask( INFTY_VEC, mt1 );
            cmp3       = _mm_cmpgt_epi32( itV1, itV2 );
            dt4        = _mm_add_epi32( mt4, dt4 );
            cmp1       = _mm_cmpgt_epi32( dt1, mt1 );
            itV4       = selectBits( itV2, itV1, cmp3 );
            cmp2       = _mm_cmpgt_epi32( dt2, dt3 );
            dt1        = selectBits( mt1, dt1, cmp1 );
            dt2        = selectBits( dt3, dt2, cmp2 );
            cmp1       = _mm_cmpgt_epi32( dt1, dt4 );
            dt1        = selectBits( dt4, dt1, cmp1 );
            cmp2       = _mm_cmpgt_epi32( dt1, dt2 );
            iscCV[k-1] = _mm_add_epi32( itV3, iscDSQ[k-1] );
            dscCV[k]   = selectBits( dt2, dt1, cmp2 );
            itV1       = _mm_add_epi32( mscCV[k-1], hmmesc[k-1] );
            itV2       = _mm_add_epi32( mscCV[k], hmmesc[k] );
            cmp1       = _mm_cmpgt_epi32( itV1, xscCXME );
            cmp2       = _mm_cmpgt_epi32( itV2, xscCXME );
            xscCXME    = selectBits( xscCXME, itV1, cmp1 );
            iscCV[k]   = _mm_add_epi32( itV4, iscDSQ[k] );
            xscCXME    = selectBits( xscCXME, itV2, cmp2 );
        }
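        // Fold the per-lane E-state scores into a scalar maximum, update the
        // special states (N, B, C, J), then swap the current and previous rows.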
        xscCXME = _mm_set1_epi32( maxComponent( xscCXME ) );
        itV2    = _mm_add_epi32( xscCXMJ, hmmxscXTJLOOP );
        itV3    = _mm_add_epi32( xscCXME, hmmxscXTELOOP );
        itV1    = _mm_add_epi32( xscCXMJ, hmmxscXTJMOVE );

        cmp1    = _mm_cmpgt_epi32( itV2, itV3 );
        xscXTN  = _mm_add_epi32( xscXTN, hmmxscXTNLOOP );
        xscCXMJ = selectBits( itV3, itV2, cmp1 );
        itV2    = _mm_add_epi32( xscCXMC, hmmxscXTCLOOP );
        cmp1    = _mm_cmpgt_epi32( xscXTN, itV1 );
        itV3    = _mm_add_epi32( xscCXME, hmmxscXTEMOVE );
        xscPXMB = selectBits( itV1, xscXTN, cmp1 );
        cmp2    = _mm_cmpgt_epi32( itV2, itV3 );
        xscCXMC = selectBits( itV3, itV2, cmp2 );
        tmpMV   = mscCV;
        tmpIV   = iscCV;
        tmpDV   = dscCV;
        mscCV   = mscPV;
        iscCV   = iscPV;
        dscCV   = dscPV;
        mscPV   = tmpMV;
        iscPV   = tmpIV;
        dscPV   = tmpDV;
        tmpM    = mscoreC;
        tmpD    = dscoreC;
        mscoreC = mscoreP;
        dscoreC = dscoreP;
        mscoreP = tmpM;
        dscoreP = tmpD;
    }
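    // Final C->T transition: lane 0 holds the best score, which is converted
    // back from HMMER's scaled-integer log-odds representation via INTSCALE.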
    itV1 = _mm_add_epi32( xscCXMC, hmmxscXTCMOVE );
    sc = at0(itV1);
    result = (float) sc / INTSCALE;
    return result;
}

} //anonymous namespace

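// Scores a whole sequence against hmm by splitting it into overlapping chunks
// and running viterbiSSE on each; returns one score per chunk.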
//assuming dsq is aligned
QList<float> sseScoring( unsigned char * dsq, int seqlen, plan7_s* hmm, HMMSeqGranulation * gr, TaskStateInfo& ti ) {

    assert( dsq );
    assert( gr );
    assert( seqlen > 0 );

    const int M4th = ( hmm->M + 3 ) >> 2;
    const int memSz = 4 + (M4th+1) * 16 + 12 + ALIGNMENT_MEM;
    char* mscCurMem = (char*)malloc( memSz );
    char* iscCurMem = (char*)malloc( memSz );
    char* dscCurMem = (char*)malloc( memSz );
    char* mscPrevMem = (char*)malloc( memSz );
    char* iscPrevMem = (char*)malloc( memSz );
    char* dscPrevMem = (char*)malloc( memSz );

    memset( mscCurMem,  0, memSz );
    memset( iscCurMem,  0, memSz );
    memset( dscCurMem,  0, memSz );
    memset( mscPrevMem, 0, memSz );
    memset( iscPrevMem, 0, memSz );
    memset( dscPrevMem, 0, memSz );

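    // Bias each DP row pointer 0xC bytes past a 128-byte boundary so that
    // element [1] is 16-byte aligned, as required by the (__m128i*)( ptr + 1 )
    // casts in viterbiSSE.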
    int* mscCur  = (int*)( ALIGNED( mscCurMem,  ALIGNMENT_MEM ) + 0xC );
    int* iscCur  = (int*)( ALIGNED( iscCurMem,  ALIGNMENT_MEM ) + 0xC );
    int* dscCur  = (int*)( ALIGNED( dscCurMem,  ALIGNMENT_MEM ) + 0xC );
    int* mscPrev = (int*)( ALIGNED( mscPrevMem, ALIGNMENT_MEM ) + 0xC );
    int* iscPrev = (int*)( ALIGNED( iscPrevMem, ALIGNMENT_MEM ) + 0xC );
    int* dscPrev = (int*)( ALIGNED( dscPrevMem, ALIGNMENT_MEM ) + 0xC );

    QList<float> results;
    U2Region range( 0, seqlen );
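    // Adjacent chunks overlap by two model lengths, presumably so that any
    // match crossing a chunk boundary is still fully contained in one chunk.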
    gr->overlap = 2 * hmm->M;
    gr->exOverlap = 0;
    gr->chunksize = qBound( gr->overlap + 1, CHUNK_SIZE, seqlen );
    gr->regions = SequenceWalkerTask::splitRange( range, gr->chunksize, gr->overlap, gr->exOverlap, false );
    const QVector<U2Region> & regions = gr->regions;

    int regionsSz = regions.size();
    int regionsPassed = 0;

    hmm_opt hmmOpt( hmm );
    foreach( U2Region chunk, regions ) {
        unsigned char * curSeqAddr = dsq + chunk.startPos;
        int curSeqLen = chunk.length;
        float sc = viterbiSSE( &hmmOpt, curSeqAddr, curSeqLen, mscCur, mscPrev, iscCur, iscPrev, dscCur, dscPrev );
        results.push_back( sc );

        ti.progress = (int)( ( 100.0 * ( regionsPassed++ ) ) / regionsSz );
        if( ti.cancelFlag ) {
            break;
        }
    }

    free( mscCurMem );
    free( iscCurMem );
    free( dscCurMem );
    free( mscPrevMem );
    free( iscPrevMem );
    free( dscPrevMem );

    return results;
}