1 //
2 // hhviterbialgorithm.cpp
3 //
4 // Created by Martin Steinegger on 19.11.12.
5 // Copyright (c) 2012 -. All rights reserved.
6 //
7 #include "hhviterbi.h"
8 #include "hhviterbimatrix.h"
9
10
// For every SIMD lane where vec1 > vec2, merge the backtrace-state bits `vec3`
// into the bit mask `res` (lanes where the comparison fails contribute 0).
// XOR acts as a conditional bit-set here: callers only pass `vec3` bits that
// are not yet set in `res`.
// Rewritten as a single expression: the previous version spilled its
// intermediates into caller-declared temporaries (res_gt_vec, index_vec),
// a non-hygienic hidden dependency that no call site actually relied on.
#define MAX2_SET_MASK(vec1, vec2, vec3, res) \
    res = simdi_xor(res, simdi_and((simd_int)simdf32_gt(vec1, vec2), vec3));
15
16
// For every SIMD lane where vec1 > vec2, fold the backtrace-state code `vec3`
// into `res` via an unsigned byte-wise max (the per-byte state codes are
// ordered so the larger code wins; losing lanes contribute 0).
// Rewritten as a single expression: the previous version spilled its
// intermediates into caller-declared temporaries (res_gt_vec, index_vec),
// a non-hygienic hidden dependency that no call site actually relied on.
#define MAX2(vec1, vec2, vec3, res) \
    res = simdui8_max(res, simdi_and((simd_int)simdf32_gt(vec1, vec2), vec3));
21
22
23
24
25 /////////////////////////////////////////////////////////////////////////////////////
26 // Compare HMMs with one another and look for sub-optimal alignments that share no pair with previous ones
27 // The function is called with q and t
28 /////////////////////////////////////////////////////////////////////////////////////
29 #ifdef VITERBI_SS_SCORE
30 #ifdef VITERBI_CELLOFF
AlignWithCellOffAndSS(HMMSimd * q,HMMSimd * t,ViterbiMatrix * viterbiMatrix,int maxres,ViterbiResult * result,int ss_hmm_mode)31 void Viterbi::AlignWithCellOffAndSS(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix,
32 int maxres, ViterbiResult* result, int ss_hmm_mode)
33 #else
34 void Viterbi::AlignWithOutCellOffAndSS(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix,
35 int maxres, ViterbiResult* result, int ss_hmm_mode)
36 #endif
37 #else
38 #ifdef VITERBI_CELLOFF
39 void Viterbi::AlignWithCellOff(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix, int maxres, ViterbiResult* result)
40 #else
41 void Viterbi::AlignWithOutCellOff(HMMSimd* q, HMMSimd* t,ViterbiMatrix * viterbiMatrix,
42 int maxres, ViterbiResult* result)
43 #endif
44 #endif
45 {
46
47 // Linear topology of query (and template) HMM:
48 // 1. The HMM HMM has L+2 columns. Columns 1 to L contain
49 // a match state, a delete state and an insert state each.
50 // 2. The Start state is M0, the virtual match state in column i=0 (j=0). (Therefore X[k][0]=ANY)
51 // This column has only a match state and it has only a transitions to the next match state.
52 // 3. The End state is M(L+1), the virtual match state in column i=L+1.(j=L+1) (Therefore X[k][L+1]=ANY)
53 // Column L has no transitions to the delete state: tr[L][M2D]=tr[L][D2D]=0.
54 // 4. Transitions I->D and D->I are ignored, since they do not appear in PsiBlast alignments
55 // (as long as the gap opening penalty d is higher than the best match score S(a,b)).
56
57 // Pairwise alignment of two HMMs:
58 // 1. Pair-states for the alignment of two HMMs are
59 // MM (Q:Match T:Match) , GD (Q:Gap T:Delete), IM (Q:Insert T:Match), DG (Q:Delelte, T:Match) , MI (Q:Match T:Insert)
60 // 2. Transitions are allowed only between the MM-state and each of the four other states.
61
62 // Saving space:
63 // The best score ending in pair state XY sXY[i][j] is calculated from left to right (j=1->t->L)
64 // and top to bottom (i=1->q->L). To save space, only the last row of scores calculated is kept in memory.
65 // (The backtracing matrices are kept entirely in memory [O(t->L*q->L)]).
66 // When the calculation has proceeded up to the point where the scores for cell (i,j) are caculated,
67 // sXY[i-1][j'] = sXY[j'] for j'>=j (A below)
68 // sXY[i][j'] = sXY[j'] for j'<j (B below)
69 // sXY[i-1][j-1]= sXY_i_1_j_1 (C below)
70 // sXY[i][j] = sXY_i_j (D below)
71 // j-1
72 // j
73 // i-1: CAAAAAAAAAAAAAAAAAA
74 // i : BBBBBBBBBBBBBD
75 // Variable declarations
76
77 const float smin = (this->local ? 0 : -FLT_MAX); //used to distinguish between SW and NW algorithms in maximization
78 const simd_float smin_vec = simdf32_set(smin);
79 const simd_float shift_vec = simdf32_set(shift);
80 // const simd_float one_vec = simdf32_set(1); // 00000001
81 const simd_int mm_vec = simdi32_set(2); //MM 00000010
82 const simd_int gd_vec = simdi32_set(3); //GD 00000011
83 const simd_int im_vec = simdi32_set(4); //IM 00000100
84 const simd_int dg_vec = simdi32_set(5); //DG 00000101
85 const simd_int mi_vec = simdi32_set(6); //MI 00000110
86 const simd_int gd_mm_vec = simdi32_set(8); // 00001000
87 const simd_int im_mm_vec = simdi32_set(16);// 00010000
88 const simd_int dg_mm_vec = simdi32_set(32);// 00100000
89 const simd_int mi_mm_vec = simdi32_set(64);// 01000000
90
91 #ifdef VITERBI_SS_SCORE
92 HMM * q_s = q->GetHMM(0);
93 const unsigned char * t_index;
94 if(ss_hmm_mode == HMM::PRED_PRED || ss_hmm_mode == HMM::DSSP_PRED ){
95 t_index = t->pred_index;
96 }else if(ss_hmm_mode == HMM::PRED_DSSP){
97 t_index = t->dssp_index;
98 }
99 simd_float * ss_score_vec = (simd_float *) ss_score;
100 #endif
101
102 #ifdef AVX2
103 const simd_int shuffle_mask_extract = _mm256_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
104 -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1);
105 #endif
106 #ifdef VITERBI_CELLOFF
107 #ifdef AVX2
108 const __m128i tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000
109 const simd_int co_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_vec), tmp_vec, 1);
110 const simd_int float_min_vec = (simd_int) _mm256_set1_ps(-FLT_MAX);
111 const simd_int shuffle_mask_celloff = _mm256_set_epi8(
112 15, 14, 13, 12,
113 15, 14, 13, 12,
114 15, 14, 13, 12,
115 15, 14, 13, 12,
116 3, 2, 1, 0,
117 3, 2, 1, 0,
118 3, 2, 1, 0,
119 3, 2, 1, 0);
120 #else // SSE case
121 const simd_int tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);
122 const simd_int co_vec = tmp_vec;
123 const simd_int float_min_vec = (simd_int) simdf32_set(-FLT_MAX);
124 #endif
125 #endif // AVX2 end
126
127 int i,j; //query and template match state indices
128 simd_int i2_vec = simdi32_set(0);
129 simd_int j2_vec = simdi32_set(0);
130 simd_float sMM_i_j = simdf32_set(0);
131 simd_float sMI_i_j,sIM_i_j,sGD_i_j,sDG_i_j;
132
133
134 simd_float Si_vec;
135 simd_float sMM_i_1_j_1;
136 simd_float sMI_i_1_j_1;
137 simd_float sIM_i_1_j_1;
138 simd_float sGD_i_1_j_1;
139 simd_float sDG_i_1_j_1;
140
141 simd_float score_vec = simdf32_set(-FLT_MAX);
142 simd_int byte_result_vec = simdi32_set(0);
143
144 // Initialization of top row, i.e. cells (0,j)
145 for (j=0; j <= t->L; ++j)
146 {
147 const unsigned int index_pos_j = j * 5;
148 sMM_DG_MI_GD_IM_vec[index_pos_j + 0] = simdf32_set(-j*penalty_gap_template);
149 sMM_DG_MI_GD_IM_vec[index_pos_j + 1] = simdf32_set(-FLT_MAX);
150 sMM_DG_MI_GD_IM_vec[index_pos_j + 2] = simdf32_set(-FLT_MAX);
151 sMM_DG_MI_GD_IM_vec[index_pos_j + 3] = simdf32_set(-FLT_MAX);
152 sMM_DG_MI_GD_IM_vec[index_pos_j + 4] = simdf32_set(-FLT_MAX);
153 }
154 // Viterbi algorithm
155 const int queryLength = q->L;
156 for (i=1; i <= queryLength; ++i) // Loop through query positions i
157 {
158
159 // If q is compared to t, exclude regions where overlap of q with t < min_overlap residues
160 // Initialize cells
161 sMM_i_1_j_1 = simdf32_set(-(i - 1) * penalty_gap_query); // initialize at (i-1,0)
162 sIM_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i-1,jmin-1)
163 sMI_i_1_j_1 = simdf32_set(-FLT_MAX);
164 sDG_i_1_j_1 = simdf32_set(-FLT_MAX);
165 sGD_i_1_j_1 = simdf32_set(-FLT_MAX);
166
167 // initialize at (i,jmin-1)
168 const unsigned int index_pos_i = 0 * 5;
169 sMM_DG_MI_GD_IM_vec[index_pos_i + 0] = simdf32_set(-i * penalty_gap_query); // initialize at (i,0)
170 sMM_DG_MI_GD_IM_vec[index_pos_i + 1] = simdf32_set(-FLT_MAX);
171 sMM_DG_MI_GD_IM_vec[index_pos_i + 2] = simdf32_set(-FLT_MAX);
172 sMM_DG_MI_GD_IM_vec[index_pos_i + 3] = simdf32_set(-FLT_MAX);
173 sMM_DG_MI_GD_IM_vec[index_pos_i + 4] = simdf32_set(-FLT_MAX);
174 #ifdef AVX2
175 unsigned long long * sCO_MI_DG_IM_GD_MM_vec = (unsigned long long *) viterbiMatrix->getRow(i);
176 #else
177 unsigned int *sCO_MI_DG_IM_GD_MM_vec = (unsigned int *) viterbiMatrix->getRow(i);
178 #endif
179
180 const unsigned int start_pos_tr_i_1 = (i - 1) * 7;
181 const unsigned int start_pos_tr_i = (i) * 7;
182 const simd_float q_m2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 2)); // M2M
183 const simd_float q_m2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 3)); // M2D
184 const simd_float q_d2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 4)); // D2M
185 const simd_float q_d2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 5)); // D2D
186 const simd_float q_i2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 6)); // I2m
187 const simd_float q_i2i = simdf32_load((float *) (q->tr + start_pos_tr_i)); // I2I
188 const simd_float q_m2i = simdf32_load((float *) (q->tr + start_pos_tr_i + 1)); // M2I
189
190
191 // Find maximum score; global alignment: maxize only over last row and last column
192 const bool findMaxInnerLoop = (local || i == queryLength);
193 const int targetLength = t->L;
194 #ifdef VITERBI_SS_SCORE
195 if(ss_hmm_mode == HMM::NO_SS_INFORMATION){
196 // set all to log(1.0) = 0.0
197 memset(ss_score, 0, (targetLength+1)*VECSIZE_FLOAT*sizeof(float));
198 }else {
199 const float * score;
200 if(ss_hmm_mode == HMM::PRED_PRED){
201 score = &S33[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0][0];
202 }else if (ss_hmm_mode == HMM::DSSP_PRED){
203 score = &S73[ (int)q_s->ss_dssp[i]][0][0];
204 }else{
205 score = &S37[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0];
206 }
207 // access SS scores and write them to the ss_score array
208 for (j = 0; j < targetLength*VECSIZE_FLOAT; j++) // Loop through template positions j
209 {
210 ss_score[j + VECSIZE_FLOAT] = ssw * score[t_index[j]];
211 }
212 }
213 #endif
214 for (j=1; j <= targetLength; ++j) // Loop through template positions j
215 {
216 simd_int index_vec;
217 simd_int res_gt_vec;
218 // cache line optimized reading
219 const unsigned int start_pos_tr_j_1 = (j-1) * 7;
220 const unsigned int start_pos_tr_j = (j) * 7;
221
222 const simd_float t_m2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+2)); // M2M
223 const simd_float t_m2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+3)); // M2D
224 const simd_float t_d2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+4)); // D2M
225 const simd_float t_d2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+5)); // D2D
226 const simd_float t_i2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+6)); // I2m
227 const simd_float t_i2i = simdf32_load((float *) (t->tr+start_pos_tr_j)); // I2i
228 const simd_float t_m2i = simdf32_load((float *) (t->tr+start_pos_tr_j+1)); // M2I
229
230 // Find max value
231 // CALCULATE_MAX6( sMM_i_j,
232 // smin,
233 // sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M],
234 // sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M],
235 // sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M],
236 // sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M],
237 // sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M],
238 // bMM[i][j]
239 // );
240 // same as sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M]
241 simd_float mm_m2m_m2m_vec = simdf32_add( simdf32_add(sMM_i_1_j_1, q_m2m), t_m2m);
242 // if mm > min { 2 }
243 res_gt_vec = (simd_int)simdf32_gt(mm_m2m_m2m_vec, smin_vec);
244 byte_result_vec = simdi_and(res_gt_vec, mm_vec);
245 sMM_i_j = simdf32_max(smin_vec, mm_m2m_m2m_vec);
246
247 // same as sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M]
248 simd_float gd_m2m_d2m_vec = simdf32_add( simdf32_add(sGD_i_1_j_1, q_m2m), t_d2m);
249 // if gd > max { 3 }
250 res_gt_vec = (simd_int)simdf32_gt(gd_m2m_d2m_vec, sMM_i_j);
251 index_vec = simdi_and( res_gt_vec, gd_vec);
252 byte_result_vec = simdi_or( index_vec, byte_result_vec);
253
254 sMM_i_j = simdf32_max(sMM_i_j, gd_m2m_d2m_vec);
255
256
257 // same as sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M]
258 simd_float im_m2m_d2m_vec = simdf32_add( simdf32_add(sIM_i_1_j_1, q_i2m), t_m2m);
259 // if im > max { 4 }
260 MAX2(im_m2m_d2m_vec, sMM_i_j, im_vec,byte_result_vec);
261 sMM_i_j = simdf32_max(sMM_i_j, im_m2m_d2m_vec);
262
263 // same as sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M]
264 simd_float dg_m2m_d2m_vec = simdf32_add( simdf32_add(sDG_i_1_j_1, q_d2m), t_m2m);
265 // if dg > max { 5 }
266 MAX2(dg_m2m_d2m_vec, sMM_i_j, dg_vec,byte_result_vec);
267 sMM_i_j = simdf32_max(sMM_i_j, dg_m2m_d2m_vec);
268
269 // same as sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M],
270 simd_float mi_m2m_d2m_vec = simdf32_add( simdf32_add(sMI_i_1_j_1, q_m2m), t_i2m);
271 // if mi > max { 6 }
272 MAX2(mi_m2m_d2m_vec, sMM_i_j, mi_vec, byte_result_vec);
273 sMM_i_j = simdf32_max(sMM_i_j, mi_m2m_d2m_vec);
274
275 // TODO add secondary structure score
276 // calculate amino acid profile-profile scores
277 Si_vec = log2f4(ScalarProd20Vec((simd_float *) q->p[i],(simd_float *) t->p[j]));
278 #ifdef VITERBI_SS_SCORE
279 Si_vec = simdf32_add(ss_score_vec[j], Si_vec);
280 #endif
281 Si_vec = simdf32_add(Si_vec, shift_vec);
282
283 sMM_i_j = simdf32_add(sMM_i_j, Si_vec);
284 //+ ScoreSS(q,t,i,j) + shift + (Sstruc==NULL? 0: Sstruc[i][j]);
285
286 const unsigned int index_pos_j = (j * 5);
287 const unsigned int index_pos_j_1 = (j - 1) * 5;
288 const simd_float sMM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 0));
289 const simd_float sGD_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 3));
290 const simd_float sIM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 4));
291 const simd_float sMM_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 0));
292 const simd_float sDG_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 1));
293 const simd_float sMI_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 2));
294 sMM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 0));
295 sDG_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 1));
296 sMI_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 2));
297 sGD_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 3));
298 sIM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 4));
299
300 // sGD_i_j = max2
301 // (
302 // sMM[j-1] + t->tr[j-1][M2D], // MM->GD gap opening in query
303 // sGD[j-1] + t->tr[j-1][D2D], // GD->GD gap extension in query
304 // bGD[i][j]
305 // );
306 //sMM_DG_GD_MI_IM_vec
307 simd_float mm_gd_vec = simdf32_add(sMM_j_1, t_m2d); // MM->GD gap opening in query
308 simd_float gd_gd_vec = simdf32_add(sGD_j_1, t_d2d); // GD->GD gap extension in query
309 // if mm_gd > gd_dg { 8 }
310 MAX2_SET_MASK(mm_gd_vec, gd_gd_vec,gd_mm_vec, byte_result_vec);
311
312 sGD_i_j = simdf32_max(
313 mm_gd_vec,
314 gd_gd_vec
315 );
316 // sIM_i_j = max2
317 // (
318 // sMM[j-1] + q->tr[i][M2I] + t->tr[j-1][M2M] ,
319 // sIM[j-1] + q->tr[i][I2I] + t->tr[j-1][M2M], // IM->IM gap extension in query
320 // bIM[i][j]
321 // );
322
323
324 simd_float mm_mm_vec = simdf32_add(simdf32_add(sMM_j_1, q_m2i), t_m2m);
325 simd_float im_im_vec = simdf32_add(simdf32_add(sIM_j_1, q_i2i), t_m2m); // IM->IM gap extension in query
326 // if mm_mm > im_im { 16 }
327 MAX2_SET_MASK(mm_mm_vec,im_im_vec, im_mm_vec, byte_result_vec);
328
329 sIM_i_j = simdf32_max(
330 mm_mm_vec,
331 im_im_vec
332 );
333
334 // sDG_i_j = max2
335 // (
336 // sMM[j] + q->tr[i-1][M2D],
337 // sDG[j] + q->tr[i-1][D2D], //gap extension (DD) in query
338 // bDG[i][j]
339 // );
340 simd_float mm_dg_vec = simdf32_add(sMM_j, q_m2d);
341 simd_float dg_dg_vec = simdf32_add(sDG_j, q_d2d); //gap extension (DD) in query
342 // if mm_dg > dg_dg { 32 }
343 MAX2_SET_MASK(mm_dg_vec,dg_dg_vec, dg_mm_vec, byte_result_vec);
344
345 sDG_i_j = simdf32_max( mm_dg_vec
346 ,
347 dg_dg_vec
348 );
349
350
351
352 // sMI_i_j = max2
353 // (
354 // sMM[j] + q->tr[i-1][M2M] + t->tr[j][M2I], // MM->MI gap opening M2I in template
355 // sMI[j] + q->tr[i-1][M2M] + t->tr[j][I2I], // MI->MI gap extension I2I in template
356 // bMI[i][j]
357 // );
358 simd_float mm_mi_vec = simdf32_add( simdf32_add(sMM_j, q_m2m), t_m2i); // MM->MI gap opening M2I in template
359 simd_float mi_mi_vec = simdf32_add( simdf32_add(sMI_j, q_m2m), t_i2i); // MI->MI gap extension I2I in template
360 // if mm_mi > mi_mi { 64 }
361 MAX2_SET_MASK(mm_mi_vec, mi_mi_vec,mi_mm_vec, byte_result_vec);
362
363 sMI_i_j = simdf32_max(
364 mm_mi_vec,
365 mi_mi_vec
366 );
367
368
369 // Cell of logic
370 // if (cell_off[i][j])
371 //shift 10000000100000001000000010000000 -> 01000000010000000100000001000000
372 //because 10000000000000000000000000000000 = -2147483648 kills cmplt
373 #ifdef VITERBI_CELLOFF
374 #ifdef AVX2
375 simd_int matrix_vec = _mm256_set1_epi64x(sCO_MI_DG_IM_GD_MM_vec[j]>>1);
376 matrix_vec = _mm256_shuffle_epi8(matrix_vec,shuffle_mask_celloff);
377 #else
378 // if(((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040) > 0){
379 // std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040 ) << std::endl;
380 // }
381 simd_int matrix_vec = simdi32_set(sCO_MI_DG_IM_GD_MM_vec[j]>>1);
382
383 #endif
384 simd_int cell_off_vec = simdi_and(matrix_vec, co_vec);
385 simd_int res_eq_co_vec = simdi32_gt(co_vec, cell_off_vec ); // shift is because signed can't be checked here
386 simd_float cell_off_float_min_vec = (simd_float) simdi_andnot(res_eq_co_vec, float_min_vec); // inverse
387 sMM_i_j = simdf32_add(sMM_i_j,cell_off_float_min_vec); // add the cell off vec to sMM_i_j. Set -FLT_MAX to cell off
388 sGD_i_j = simdf32_add(sGD_i_j,cell_off_float_min_vec);
389 sIM_i_j = simdf32_add(sIM_i_j,cell_off_float_min_vec);
390 sDG_i_j = simdf32_add(sDG_i_j,cell_off_float_min_vec);
391 sMI_i_j = simdf32_add(sMI_i_j,cell_off_float_min_vec);
392 #endif
393
394
395
396 simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 0), sMM_i_j);
397 simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 1), sDG_i_j);
398 simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 2), sMI_i_j);
399 simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 3), sGD_i_j);
400 simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 4), sIM_i_j);
401
402 // write values back to ViterbiMatrix
403 #ifdef AVX2
404 /* byte_result_vec 000H 000G 000F 000E 000D 000C 000B 000A */
405 /* abcdefgh 0000 0000 HGFE 0000 0000 0000 0000 DCBA */
406 const __m256i abcdefgh = _mm256_shuffle_epi8(byte_result_vec, shuffle_mask_extract);
407 /* abcd 0000 0000 0000 DCBA */
408 const __m128i abcd = _mm256_castsi256_si128(abcdefgh);
409 /* efgh 0000 0000 HGFE 0000 */
410 const __m128i efgh = _mm256_extracti128_si256(abcdefgh, 1);
411 _mm_storel_epi64((__m128i*)&sCO_MI_DG_IM_GD_MM_vec[j], _mm_or_si128(abcd, efgh));
412 #else
413 byte_result_vec = _mm_packs_epi32(byte_result_vec, byte_result_vec);
414 byte_result_vec = _mm_packus_epi16(byte_result_vec, byte_result_vec);
415 int int_result = _mm_cvtsi128_si32(byte_result_vec);
416 sCO_MI_DG_IM_GD_MM_vec[j] = int_result;
417 #endif
418
419
420
421 // Find maximum score; global alignment: maxize only over last row and last column
422 // if(sMM_i_j>score && (par.loc || i==q->L)) { i2=i; j2=j; score=sMM_i_j; }
423 if (findMaxInnerLoop){
424
425 // new score is higer
426 // output
427 // 0 0 0 MAX
428 simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec);
429 simd_int lookup_mask_lo = simdi_andnot(lookup_mask_hi,simdi32_set(-1));
430
431 //simd_int lookup_mask_lo = (simd_int) simdf32_gt(score_vec,sMM_i_j);
432
433 // old score is higher
434 // output
435 // MAX MAX MAX 0
436 //simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec);
437
438
439 simd_int curr_pos_j = simdi32_set(j);
440 simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j);
441 simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec);
442 j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo);
443 simd_int curr_pos_i = simdi32_set(i);
444 simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i);
445 simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec);
446 i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo);
447
448 score_vec=simdf32_max(sMM_i_j,score_vec);
449 // printf("%d %d ",i, j);
450 // for(int seq_index=0; seq_index < maxres; seq_index++){
451 // printf("(%d %d %d %.3f %.3f %d %d)\t", seq_index, ((int*)&lookup_mask_hi)[seq_index], ((int*)&lookup_mask_lo)[seq_index], ((float*)&sMM_i_j)[seq_index], ((float*)&score_vec)[seq_index],
452 // ((int*)&i2_vec)[seq_index], ((int*)&j2_vec)[seq_index]);
453 // }
454 // printf("\n");
455 }
456
457
458
459 } //end for j
460
461 // if global alignment: look for best cell in last column
462 if (!local){
463
464 // new score is higer
465 // output
466 // 0 0 0 MAX
467 simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec);
468 // simd_int lookup_mask_lo;
469 simd_int lookup_mask_lo = simdi_andnot(lookup_mask_hi,simdi32_set(-1));
470
471 // old score is higher
472 // output
473 // MAX MAX MAX 0
474
475
476 simd_int curr_pos_j = simdi32_set(j-1);
477 simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j);
478 simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec);
479 j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo);
480 simd_int curr_pos_i = simdi32_set(i);
481 simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i);
482 simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec);
483 i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo);
484
485 score_vec = simdf32_max(sMM_i_j,score_vec);
486 } // end for j
487 } // end for i
488
489 for(int seq_index=0; seq_index < maxres; seq_index++){
490 result->score[seq_index]=((float*)&score_vec)[seq_index];
491 result->i[seq_index] = ((int*)&i2_vec)[seq_index];
492 result->j[seq_index] = ((int*)&j2_vec)[seq_index];
493 // std::cout << seq_index << "\t" << result->score[seq_index] << "\t" << result->i[seq_index] <<"\t" << result->j[seq_index] << std::endl;
494 }
495
496 // printf("Template=%-12.12s i=%-4i j=%-4i score=%6.3f\n",t->name,i2,j2,score);
497 }
498
499
500 #undef MAX2_SET_MASK
501 #undef MAX2
502
503
504