1 //
2 //  hhviterbialgorithm.cpp
3 //
4 //  Created by Martin Steinegger on 19.11.12.
5 //  Copyright (c) 2012 -. All rights reserved.
6 //
7 #include "hhviterbi.h"
8 #include "hhviterbimatrix.h"
9 
10 
11 #define MAX2_SET_MASK(vec1, vec2, vec3, res)        \
12 res_gt_vec = (simd_int)simdf32_gt(vec1,vec2);      \
13 index_vec  = simdi_and(res_gt_vec,vec3);          \
14 res        = simdi_xor(res,index_vec);
15 
16 
17 #define MAX2(vec1, vec2, vec3, res)           \
18 res_gt_vec = (simd_int)simdf32_gt(vec1,vec2);          \
19 index_vec  = simdi_and(res_gt_vec,vec3);          \
20 res        = simdui8_max(res,index_vec);
21 
22 
23 
24 
25 /////////////////////////////////////////////////////////////////////////////////////
26 // Compare HMMs with one another and look for sub-optimal alignments that share no pair with previous ones
27 // The function is called with q and t
28 /////////////////////////////////////////////////////////////////////////////////////
29 #ifdef VITERBI_SS_SCORE
30 #ifdef VITERBI_CELLOFF
AlignWithCellOffAndSS(HMMSimd * q,HMMSimd * t,ViterbiMatrix * viterbiMatrix,int maxres,ViterbiResult * result,int ss_hmm_mode)31 void Viterbi::AlignWithCellOffAndSS(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix,
32                                     int maxres, ViterbiResult* result, int ss_hmm_mode)
33 #else
34 void Viterbi::AlignWithOutCellOffAndSS(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix,
35                                     int maxres, ViterbiResult* result, int ss_hmm_mode)
36 #endif
37 #else
38 #ifdef VITERBI_CELLOFF
39 void Viterbi::AlignWithCellOff(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix, int maxres, ViterbiResult* result)
40 #else
41 void Viterbi::AlignWithOutCellOff(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix,
42                                   int maxres, ViterbiResult* result)
43 #endif
44 #endif
45 {
46 
47     // Linear topology of query (and template) HMM:
48     // 1. The HMM HMM has L+2 columns. Columns 1 to L contain
49     //    a match state, a delete state and an insert state each.
50     // 2. The Start state is M0, the virtual match state in column i=0 (j=0). (Therefore X[k][0]=ANY)
51     //    This column has only a match state and it has only a transitions to the next match state.
52     // 3. The End state is M(L+1), the virtual match state in column i=L+1.(j=L+1) (Therefore X[k][L+1]=ANY)
53     //    Column L has no transitions to the delete state: tr[L][M2D]=tr[L][D2D]=0.
54     // 4. Transitions I->D and D->I are ignored, since they do not appear in PsiBlast alignments
55     //    (as long as the gap opening penalty d is higher than the best match score S(a,b)).
56 
57     // Pairwise alignment of two HMMs:
58     // 1. Pair-states for the alignment of two HMMs are
59     //    MM (Q:Match T:Match) , GD (Q:Gap T:Delete), IM (Q:Insert T:Match),  DG (Q:Delelte, T:Match) , MI (Q:Match T:Insert)
60     // 2. Transitions are allowed only between the MM-state and each of the four other states.
61 
62     // Saving space:
63     // The best score ending in pair state XY sXY[i][j] is calculated from left to right (j=1->t->L)
64     // and top to bottom (i=1->q->L). To save space, only the last row of scores calculated is kept in memory.
65     // (The backtracing matrices are kept entirely in memory [O(t->L*q->L)]).
66     // When the calculation has proceeded up to the point where the scores for cell (i,j) are caculated,
67     //    sXY[i-1][j'] = sXY[j']   for j'>=j (A below)
68     //    sXY[i][j']   = sXY[j']   for j'<j  (B below)
69     //    sXY[i-1][j-1]= sXY_i_1_j_1         (C below)
70     //    sXY[i][j]    = sXY_i_j             (D below)
71     //                   j-1
72     //                     j
73     // i-1:               CAAAAAAAAAAAAAAAAAA
74     //  i :   BBBBBBBBBBBBBD
75     // Variable declarations
76 
77     const float smin = (this->local ? 0 : -FLT_MAX);  //used to distinguish between SW and NW algorithms in maximization
78     const simd_float smin_vec    = simdf32_set(smin);
79     const simd_float shift_vec   = simdf32_set(shift);
80 //    const simd_float one_vec     = simdf32_set(1); //   00000001
81     const simd_int mm_vec        = simdi32_set(2); //MM 00000010
82     const simd_int gd_vec        = simdi32_set(3); //GD 00000011
83     const simd_int im_vec        = simdi32_set(4); //IM 00000100
84     const simd_int dg_vec        = simdi32_set(5); //DG 00000101
85     const simd_int mi_vec        = simdi32_set(6); //MI 00000110
86     const simd_int gd_mm_vec     = simdi32_set(8); //   00001000
87     const simd_int im_mm_vec     = simdi32_set(16);//   00010000
88     const simd_int dg_mm_vec     = simdi32_set(32);//   00100000
89     const simd_int mi_mm_vec     = simdi32_set(64);//   01000000
90 
91 #ifdef VITERBI_SS_SCORE
92     HMM * q_s = q->GetHMM(0);
93     const unsigned char * t_index;
94     if(ss_hmm_mode == HMM::PRED_PRED || ss_hmm_mode == HMM::DSSP_PRED  ){
95         t_index = t->pred_index;
96     }else if(ss_hmm_mode == HMM::PRED_DSSP){
97         t_index = t->dssp_index;
98     }
99     simd_float * ss_score_vec = (simd_float *) ss_score;
100 #endif
101 
102 #ifdef AVX2
103     const simd_int shuffle_mask_extract = _mm256_setr_epi8(0,  4,  8,  12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
104                                                            -1, -1, -1,  -1,  0,  4,  8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
105 #endif
106 #ifdef VITERBI_CELLOFF
107 #ifdef AVX2
108     const __m128i tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000
109     const simd_int co_vec               = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_vec), tmp_vec, 1);
110     const simd_int float_min_vec     = (simd_int) _mm256_set1_ps(-FLT_MAX);
111     const simd_int shuffle_mask_celloff = _mm256_set_epi8(
112                                                           15, 14, 13, 12,
113                                                           15, 14, 13, 12,
114                                                           15, 14, 13, 12,
115                                                           15, 14, 13, 12,
116                                                           3, 2,  1, 0,
117                                                           3, 2,  1, 0,
118                                                           3, 2,  1, 0,
119                                                           3, 2,  1, 0);
120 #else // SSE case
121     const simd_int tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);
122     const simd_int co_vec = tmp_vec;
123     const simd_int float_min_vec = (simd_int) simdf32_set(-FLT_MAX);
124 #endif
125 #endif // AVX2 end
126 
127     int i,j;      //query and template match state indices
128     simd_int i2_vec = simdi32_set(0);
129     simd_int j2_vec = simdi32_set(0);
130     simd_float sMM_i_j = simdf32_set(0);
131     simd_float sMI_i_j,sIM_i_j,sGD_i_j,sDG_i_j;
132 
133 
134     simd_float Si_vec;
135     simd_float sMM_i_1_j_1;
136     simd_float sMI_i_1_j_1;
137     simd_float sIM_i_1_j_1;
138     simd_float sGD_i_1_j_1;
139     simd_float sDG_i_1_j_1;
140 
141     simd_float score_vec     = simdf32_set(-FLT_MAX);
142     simd_int byte_result_vec = simdi32_set(0);
143 
144     // Initialization of top row, i.e. cells (0,j)
145     for (j=0; j <= t->L; ++j)
146     {
147         const unsigned int index_pos_j = j * 5;
148         sMM_DG_MI_GD_IM_vec[index_pos_j + 0] = simdf32_set(-j*penalty_gap_template);
149         sMM_DG_MI_GD_IM_vec[index_pos_j + 1] = simdf32_set(-FLT_MAX);
150         sMM_DG_MI_GD_IM_vec[index_pos_j + 2] = simdf32_set(-FLT_MAX);
151         sMM_DG_MI_GD_IM_vec[index_pos_j + 3] = simdf32_set(-FLT_MAX);
152         sMM_DG_MI_GD_IM_vec[index_pos_j + 4] = simdf32_set(-FLT_MAX);
153     }
154     // Viterbi algorithm
155     const int queryLength = q->L;
156     for (i=1; i <= queryLength; ++i) // Loop through query positions i
157     {
158 
159         // If q is compared to t, exclude regions where overlap of q with t < min_overlap residues
160         // Initialize cells
161         sMM_i_1_j_1 = simdf32_set(-(i - 1) * penalty_gap_query);  // initialize at (i-1,0)
162         sIM_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i-1,jmin-1)
163         sMI_i_1_j_1 = simdf32_set(-FLT_MAX);
164         sDG_i_1_j_1 = simdf32_set(-FLT_MAX);
165         sGD_i_1_j_1 = simdf32_set(-FLT_MAX);
166 
167         // initialize at (i,jmin-1)
168         const unsigned int index_pos_i = 0 * 5;
169         sMM_DG_MI_GD_IM_vec[index_pos_i + 0] = simdf32_set(-i * penalty_gap_query);           // initialize at (i,0)
170         sMM_DG_MI_GD_IM_vec[index_pos_i + 1] = simdf32_set(-FLT_MAX);
171         sMM_DG_MI_GD_IM_vec[index_pos_i + 2] = simdf32_set(-FLT_MAX);
172         sMM_DG_MI_GD_IM_vec[index_pos_i + 3] = simdf32_set(-FLT_MAX);
173         sMM_DG_MI_GD_IM_vec[index_pos_i + 4] = simdf32_set(-FLT_MAX);
174 #ifdef AVX2
175         unsigned long long * sCO_MI_DG_IM_GD_MM_vec = (unsigned long long *) viterbiMatrix->getRow(i);
176 #else
177         unsigned int *sCO_MI_DG_IM_GD_MM_vec = (unsigned int *) viterbiMatrix->getRow(i);
178 #endif
179 
180         const unsigned int start_pos_tr_i_1 = (i - 1) * 7;
181         const unsigned int start_pos_tr_i = (i) * 7;
182         const simd_float q_m2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 2)); // M2M
183         const simd_float q_m2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 3)); // M2D
184         const simd_float q_d2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 4)); // D2M
185         const simd_float q_d2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 5)); // D2D
186         const simd_float q_i2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 6)); // I2m
187         const simd_float q_i2i = simdf32_load((float *) (q->tr + start_pos_tr_i)); // I2I
188         const simd_float q_m2i = simdf32_load((float *) (q->tr + start_pos_tr_i + 1)); // M2I
189 
190 
191         // Find maximum score; global alignment: maxize only over last row and last column
192         const bool findMaxInnerLoop = (local || i == queryLength);
193         const int targetLength = t->L;
194 #ifdef VITERBI_SS_SCORE
195         if(ss_hmm_mode == HMM::NO_SS_INFORMATION){
196             // set all to log(1.0) = 0.0
197             memset(ss_score, 0, (targetLength+1)*VECSIZE_FLOAT*sizeof(float));
198         }else {
199             const float * score;
200             if(ss_hmm_mode == HMM::PRED_PRED){
201                 score = &S33[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0][0];
202             }else if (ss_hmm_mode == HMM::DSSP_PRED){
203                 score = &S73[ (int)q_s->ss_dssp[i]][0][0];
204             }else{
205                 score = &S37[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0];
206             }
207             // access SS scores and write them to the ss_score array
208             for (j = 0; j < targetLength*VECSIZE_FLOAT; j++) // Loop through template positions j
209             {
210                 ss_score[j + VECSIZE_FLOAT] = ssw * score[t_index[j]];
211             }
212         }
213 #endif
214         for (j=1; j <= targetLength; ++j) // Loop through template positions j
215         {
216             simd_int index_vec;
217             simd_int res_gt_vec;
218             // cache line optimized reading
219             const unsigned int start_pos_tr_j_1 = (j-1) * 7;
220             const unsigned int start_pos_tr_j = (j) * 7;
221 
222             const simd_float t_m2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+2)); // M2M
223             const simd_float t_m2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+3)); // M2D
224             const simd_float t_d2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+4)); // D2M
225             const simd_float t_d2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+5)); // D2D
226             const simd_float t_i2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+6)); // I2m
227             const simd_float t_i2i = simdf32_load((float *) (t->tr+start_pos_tr_j));   // I2i
228             const simd_float t_m2i = simdf32_load((float *) (t->tr+start_pos_tr_j+1));     // M2I
229 
230             // Find max value
231             // CALCULATE_MAX6( sMM_i_j,
232             //                 smin,
233             //                 sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M],
234             //                 sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M],
235             //                 sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M],
236             //                 sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M],
237             //                 sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M],
238             //                 bMM[i][j]
239             //                 );
240             // same as sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M]
241             simd_float mm_m2m_m2m_vec = simdf32_add( simdf32_add(sMM_i_1_j_1, q_m2m), t_m2m);
242             // if mm > min { 2 }
243             res_gt_vec       = (simd_int)simdf32_gt(mm_m2m_m2m_vec, smin_vec);
244             byte_result_vec  = simdi_and(res_gt_vec, mm_vec);
245             sMM_i_j = simdf32_max(smin_vec, mm_m2m_m2m_vec);
246 
247             // same as sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M]
248             simd_float gd_m2m_d2m_vec = simdf32_add( simdf32_add(sGD_i_1_j_1, q_m2m), t_d2m);
249             // if gd > max { 3 }
250             res_gt_vec       = (simd_int)simdf32_gt(gd_m2m_d2m_vec, sMM_i_j);
251             index_vec        = simdi_and( res_gt_vec, gd_vec);
252             byte_result_vec  = simdi_or(  index_vec,  byte_result_vec);
253 
254             sMM_i_j = simdf32_max(sMM_i_j, gd_m2m_d2m_vec);
255 
256 
257             // same as sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M]
258             simd_float im_m2m_d2m_vec = simdf32_add( simdf32_add(sIM_i_1_j_1, q_i2m), t_m2m);
259             // if im > max { 4 }
260             MAX2(im_m2m_d2m_vec, sMM_i_j, im_vec,byte_result_vec);
261             sMM_i_j = simdf32_max(sMM_i_j, im_m2m_d2m_vec);
262 
263             // same as sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M]
264             simd_float dg_m2m_d2m_vec = simdf32_add( simdf32_add(sDG_i_1_j_1, q_d2m), t_m2m);
265             // if dg > max { 5 }
266             MAX2(dg_m2m_d2m_vec, sMM_i_j, dg_vec,byte_result_vec);
267             sMM_i_j = simdf32_max(sMM_i_j, dg_m2m_d2m_vec);
268 
269             // same as sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M],
270             simd_float mi_m2m_d2m_vec = simdf32_add( simdf32_add(sMI_i_1_j_1, q_m2m), t_i2m);
271             // if mi > max { 6 }
272             MAX2(mi_m2m_d2m_vec, sMM_i_j, mi_vec, byte_result_vec);
273             sMM_i_j = simdf32_max(sMM_i_j, mi_m2m_d2m_vec);
274 
275             // TODO add secondary structure score
276             // calculate amino acid profile-profile scores
277             Si_vec = log2f4(ScalarProd20Vec((simd_float *) q->p[i],(simd_float *) t->p[j]));
278 #ifdef VITERBI_SS_SCORE
279             Si_vec = simdf32_add(ss_score_vec[j], Si_vec);
280 #endif
281             Si_vec = simdf32_add(Si_vec, shift_vec);
282 
283             sMM_i_j = simdf32_add(sMM_i_j, Si_vec);
284             //+ ScoreSS(q,t,i,j) + shift + (Sstruc==NULL? 0: Sstruc[i][j]);
285 
286             const unsigned int index_pos_j   = (j * 5);
287             const unsigned int index_pos_j_1 = (j - 1) * 5;
288             const simd_float sMM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 0));
289             const simd_float sGD_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 3));
290             const simd_float sIM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 4));
291             const simd_float sMM_j   = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 0));
292             const simd_float sDG_j   = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 1));
293             const simd_float sMI_j   = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 2));
294             sMM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 0));
295             sDG_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 1));
296             sMI_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 2));
297             sGD_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 3));
298             sIM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 4));
299 
300             //            sGD_i_j = max2
301             //            (
302             //             sMM[j-1] + t->tr[j-1][M2D], // MM->GD gap opening in query
303             //             sGD[j-1] + t->tr[j-1][D2D], // GD->GD gap extension in query
304             //             bGD[i][j]
305             //             );
306             //sMM_DG_GD_MI_IM_vec
307             simd_float mm_gd_vec = simdf32_add(sMM_j_1, t_m2d); // MM->GD gap opening in query
308             simd_float gd_gd_vec = simdf32_add(sGD_j_1, t_d2d); // GD->GD gap extension in query
309             // if mm_gd > gd_dg { 8 }
310             MAX2_SET_MASK(mm_gd_vec, gd_gd_vec,gd_mm_vec, byte_result_vec);
311 
312             sGD_i_j = simdf32_max(
313                                  mm_gd_vec,
314                                  gd_gd_vec
315                                  );
316             //            sIM_i_j = max2
317             //            (
318             //             sMM[j-1] + q->tr[i][M2I] + t->tr[j-1][M2M] ,
319             //             sIM[j-1] + q->tr[i][I2I] + t->tr[j-1][M2M], // IM->IM gap extension in query
320             //             bIM[i][j]
321             //             );
322 
323 
324             simd_float mm_mm_vec = simdf32_add(simdf32_add(sMM_j_1, q_m2i), t_m2m);
325             simd_float im_im_vec = simdf32_add(simdf32_add(sIM_j_1, q_i2i), t_m2m); // IM->IM gap extension in query
326             // if mm_mm > im_im { 16 }
327             MAX2_SET_MASK(mm_mm_vec,im_im_vec, im_mm_vec, byte_result_vec);
328 
329             sIM_i_j = simdf32_max(
330                                   mm_mm_vec,
331                                   im_im_vec
332                                   );
333 
334             //            sDG_i_j = max2
335             //            (
336             //             sMM[j] + q->tr[i-1][M2D],
337             //             sDG[j] + q->tr[i-1][D2D], //gap extension (DD) in query
338             //             bDG[i][j]
339             //             );
340             simd_float mm_dg_vec = simdf32_add(sMM_j, q_m2d);
341             simd_float dg_dg_vec = simdf32_add(sDG_j, q_d2d); //gap extension (DD) in query
342             // if mm_dg > dg_dg { 32 }
343             MAX2_SET_MASK(mm_dg_vec,dg_dg_vec, dg_mm_vec, byte_result_vec);
344 
345             sDG_i_j = simdf32_max( mm_dg_vec
346                                   ,
347                                   dg_dg_vec
348                                   );
349 
350 
351 
352             //            sMI_i_j = max2
353             //            (
354             //             sMM[j] + q->tr[i-1][M2M] + t->tr[j][M2I], // MM->MI gap opening M2I in template
355             //             sMI[j] + q->tr[i-1][M2M] + t->tr[j][I2I], // MI->MI gap extension I2I in template
356             //             bMI[i][j]
357             //             );
358             simd_float mm_mi_vec = simdf32_add( simdf32_add(sMM_j, q_m2m), t_m2i);  // MM->MI gap opening M2I in template
359             simd_float mi_mi_vec = simdf32_add( simdf32_add(sMI_j, q_m2m), t_i2i);  // MI->MI gap extension I2I in template
360             // if mm_mi > mi_mi { 64 }
361             MAX2_SET_MASK(mm_mi_vec, mi_mi_vec,mi_mm_vec, byte_result_vec);
362 
363             sMI_i_j = simdf32_max(
364                                   mm_mi_vec,
365                                   mi_mi_vec
366                                   );
367 
368 
369             // Cell of logic
370             // if (cell_off[i][j])
371             //shift   10000000100000001000000010000000 -> 01000000010000000100000001000000
372             //because 10000000000000000000000000000000 = -2147483648 kills cmplt
373 #ifdef VITERBI_CELLOFF
374 #ifdef AVX2
375             simd_int matrix_vec    = _mm256_set1_epi64x(sCO_MI_DG_IM_GD_MM_vec[j]>>1);
376             matrix_vec             = _mm256_shuffle_epi8(matrix_vec,shuffle_mask_celloff);
377 #else
378 //            if(((sCO_MI_DG_IM_GD_MM_vec[j]  >>1) & 0x40404040) > 0){
379 //                std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j]  >>1) & 0x40404040   ) << std::endl;
380 //            }
381             simd_int matrix_vec    = simdi32_set(sCO_MI_DG_IM_GD_MM_vec[j]>>1);
382 
383 #endif
384             simd_int cell_off_vec  = simdi_and(matrix_vec, co_vec);
385             simd_int res_eq_co_vec = simdi32_gt(co_vec, cell_off_vec    ); // shift is because signed can't be checked here
386             simd_float  cell_off_float_min_vec = (simd_float) simdi_andnot(res_eq_co_vec, float_min_vec); // inverse
387             sMM_i_j = simdf32_add(sMM_i_j,cell_off_float_min_vec);    // add the cell off vec to sMM_i_j. Set -FLT_MAX to cell off
388             sGD_i_j = simdf32_add(sGD_i_j,cell_off_float_min_vec);
389             sIM_i_j = simdf32_add(sIM_i_j,cell_off_float_min_vec);
390             sDG_i_j = simdf32_add(sDG_i_j,cell_off_float_min_vec);
391             sMI_i_j = simdf32_add(sMI_i_j,cell_off_float_min_vec);
392 #endif
393 
394 
395 
396             simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 0), sMM_i_j);
397             simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 1), sDG_i_j);
398             simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 2), sMI_i_j);
399             simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 3), sGD_i_j);
400             simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 4), sIM_i_j);
401 
402             // write values back to ViterbiMatrix
403 #ifdef AVX2
404             /* byte_result_vec        000H  000G  000F  000E   000D  000C  000B  000A */
405             /* abcdefgh               0000  0000  HGFE  0000   0000  0000  0000  DCBA */
406             const __m256i abcdefgh = _mm256_shuffle_epi8(byte_result_vec, shuffle_mask_extract);
407             /* abcd                                            0000  0000  0000  DCBA */
408             const __m128i abcd     = _mm256_castsi256_si128(abcdefgh);
409             /* efgh                                            0000  0000  HGFE  0000 */
410             const __m128i efgh     = _mm256_extracti128_si256(abcdefgh, 1);
411             _mm_storel_epi64((__m128i*)&sCO_MI_DG_IM_GD_MM_vec[j], _mm_or_si128(abcd, efgh));
412 #else
413             byte_result_vec = _mm_packs_epi32(byte_result_vec, byte_result_vec);
414             byte_result_vec = _mm_packus_epi16(byte_result_vec, byte_result_vec);
415             int int_result  = _mm_cvtsi128_si32(byte_result_vec);
416             sCO_MI_DG_IM_GD_MM_vec[j] = int_result;
417 #endif
418 
419 
420 
421             // Find maximum score; global alignment: maxize only over last row and last column
422             // if(sMM_i_j>score && (par.loc || i==q->L)) { i2=i; j2=j; score=sMM_i_j; }
423             if (findMaxInnerLoop){
424 
425                 // new score is higer
426                 // output
427                 //  0   0   0   MAX
428                 simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec);
429                 simd_int lookup_mask_lo = simdi_andnot(lookup_mask_hi,simdi32_set(-1));
430 
431                 //simd_int lookup_mask_lo = (simd_int) simdf32_gt(score_vec,sMM_i_j);
432 
433                 // old score is higher
434                 // output
435                 //  MAX MAX MAX 0
436                 //simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec);
437 
438 
439                 simd_int curr_pos_j   = simdi32_set(j);
440                 simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j);
441                 simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec);
442                 j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo);
443                 simd_int curr_pos_i   = simdi32_set(i);
444                 simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i);
445                 simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec);
446                 i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo);
447 
448                 score_vec=simdf32_max(sMM_i_j,score_vec);
449 //                printf("%d %d ",i, j);
450 //                for(int seq_index=0; seq_index < maxres; seq_index++){
451 //                    printf("(%d %d %d %.3f %.3f %d %d)\t",  seq_index, ((int*)&lookup_mask_hi)[seq_index], ((int*)&lookup_mask_lo)[seq_index], ((float*)&sMM_i_j)[seq_index], ((float*)&score_vec)[seq_index],
452 //                           ((int*)&i2_vec)[seq_index], ((int*)&j2_vec)[seq_index]);
453 //                }
454 //                printf("\n");
455             }
456 
457 
458 
459         } //end for j
460 
461         // if global alignment: look for best cell in last column
462         if (!local){
463 
464             // new score is higer
465             // output
466             //  0   0   0   MAX
467             simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec);
468 //            simd_int lookup_mask_lo;
469             simd_int lookup_mask_lo = simdi_andnot(lookup_mask_hi,simdi32_set(-1));
470 
471             // old score is higher
472             // output
473             //  MAX MAX MAX 0
474 
475 
476             simd_int curr_pos_j   = simdi32_set(j-1);
477             simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j);
478             simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec);
479             j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo);
480             simd_int curr_pos_i   = simdi32_set(i);
481             simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i);
482             simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec);
483             i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo);
484 
485             score_vec = simdf32_max(sMM_i_j,score_vec);
486         }    // end for j
487     }     // end for i
488 
489     for(int seq_index=0; seq_index < maxres; seq_index++){
490         result->score[seq_index]=((float*)&score_vec)[seq_index];
491         result->i[seq_index] = ((int*)&i2_vec)[seq_index];
492         result->j[seq_index] = ((int*)&j2_vec)[seq_index];
493 //        std::cout << seq_index << "\t" << result->score[seq_index] << "\t" << result->i[seq_index] <<"\t" << result->j[seq_index] << std::endl;
494     }
495 
496     //   printf("Template=%-12.12s  i=%-4i j=%-4i score=%6.3f\n",t->name,i2,j2,score);
497 }
498 
499 
500 #undef MAX2_SET_MASK
501 #undef MAX2
502 
503 
504