1 /**
2  * @file
3  *
4  * @author jeffrey.daily@gmail.com
5  *
6  * Copyright (c) 2015 Battelle Memorial Institute.
7  */
8 #include "config.h"
9 
10 #include <stdint.h>
11 #include <stdlib.h>
12 
13 #include <immintrin.h>
14 
15 #define SG_TRACE
16 #define SG_SUFFIX _striped_avx2_256_16
17 #define SG_SUFFIX_PROF _striped_profile_avx2_256_16
18 #include "sg_helper.h"
19 
20 #include "parasail.h"
21 #include "parasail/memory.h"
22 #include "parasail/internal_avx.h"
23 
24 #define SWAP(A,B) { __m256i* tmp = A; A = B; B = tmp; }
25 
26 #define NEG_INF (INT16_MIN/(int16_t)(2))
27 
28 #if HAVE_AVX2_MM256_INSERT_EPI16
29 #define _mm256_insert_epi16_rpl _mm256_insert_epi16
30 #else
_mm256_insert_epi16_rpl(__m256i a,int16_t i,int imm)31 static inline __m256i _mm256_insert_epi16_rpl(__m256i a, int16_t i, int imm) {
32     __m256i_16_t A;
33     A.m = a;
34     A.v[imm] = i;
35     return A.m;
36 }
37 #endif
38 
39 #if HAVE_AVX2_MM256_EXTRACT_EPI16
40 #define _mm256_extract_epi16_rpl _mm256_extract_epi16
41 #else
_mm256_extract_epi16_rpl(__m256i a,int imm)42 static inline int16_t _mm256_extract_epi16_rpl(__m256i a, int imm) {
43     __m256i_16_t A;
44     A.m = a;
45     return A.v[imm];
46 }
47 #endif
48 
49 #define _mm256_slli_si256_rpl(a,imm) _mm256_alignr_epi8(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0,0,3,0)), 16-imm)
50 
51 
arr_store(__m256i * array,__m256i vH,int32_t t,int32_t seglen,int32_t d)52 static inline void arr_store(
53         __m256i *array,
54         __m256i vH,
55         int32_t t,
56         int32_t seglen,
57         int32_t d)
58 {
59     _mm256_store_si256(array + (1LL*d*seglen+t), vH);
60 }
61 
arr_load(__m256i * array,int32_t t,int32_t seglen,int32_t d)62 static inline __m256i arr_load(
63         __m256i *array,
64         int32_t t,
65         int32_t seglen,
66         int32_t d)
67 {
68     return _mm256_load_si256(array + (1LL*d*seglen+t));
69 }
70 
71 #define FNAME parasail_sg_flags_trace_striped_avx2_256_16
72 #define PNAME parasail_sg_flags_trace_striped_profile_avx2_256_16
73 
FNAME(const char * const restrict s1,const int s1Len,const char * const restrict s2,const int s2Len,const int open,const int gap,const parasail_matrix_t * matrix,int s1_beg,int s1_end,int s2_beg,int s2_end)74 parasail_result_t* FNAME(
75         const char * const restrict s1, const int s1Len,
76         const char * const restrict s2, const int s2Len,
77         const int open, const int gap, const parasail_matrix_t *matrix,
78         int s1_beg, int s1_end, int s2_beg, int s2_end)
79 {
80     parasail_profile_t *profile = parasail_profile_create_avx_256_16(s1, s1Len, matrix);
81     parasail_result_t *result = PNAME(profile, s2, s2Len, open, gap, s1_beg, s1_end, s2_beg, s2_end);
82     parasail_profile_free(profile);
83     return result;
84 }
85 
PNAME(const parasail_profile_t * const restrict profile,const char * const restrict s2,const int s2Len,const int open,const int gap,int s1_beg,int s1_end,int s2_beg,int s2_end)86 parasail_result_t* PNAME(
87         const parasail_profile_t * const restrict profile,
88         const char * const restrict s2, const int s2Len,
89         const int open, const int gap,
90         int s1_beg, int s1_end, int s2_beg, int s2_end)
91 {
92     int32_t i = 0;
93     int32_t j = 0;
94     int32_t k = 0;
95     const int s1Len = profile->s1Len;
96     int32_t end_query = s1Len-1;
97     int32_t end_ref = s2Len-1;
98     const parasail_matrix_t *matrix = profile->matrix;
99     const int32_t segWidth = 16; /* number of values in vector unit */
100     const int32_t segLen = (s1Len + segWidth - 1) / segWidth;
101     const int32_t offset = (s1Len - 1) % segLen;
102     const int32_t position = (segWidth - 1) - (s1Len - 1) / segLen;
103     __m256i* const restrict vProfile = (__m256i*)profile->profile16.score;
104     __m256i* restrict pvHStore = parasail_memalign___m256i(32, segLen);
105     __m256i* restrict pvHLoad = parasail_memalign___m256i(32, segLen);
106     __m256i* const restrict pvE = parasail_memalign___m256i(32, segLen);
107     __m256i* restrict pvEaStore = parasail_memalign___m256i(32, segLen);
108     __m256i* restrict pvEaLoad = parasail_memalign___m256i(32, segLen);
109     __m256i* const restrict pvHT = parasail_memalign___m256i(32, segLen);
110     int16_t* const restrict boundary = parasail_memalign_int16_t(32, s2Len+1);
111     __m256i vGapO = _mm256_set1_epi16(open);
112     __m256i vGapE = _mm256_set1_epi16(gap);
113     __m256i vNegInf = _mm256_set1_epi16(NEG_INF);
114     int16_t score = NEG_INF;
115     __m256i vMaxH = vNegInf;
116     __m256i vPosMask = _mm256_cmpeq_epi16(_mm256_set1_epi16(position),
117             _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15));
118 
119     parasail_result_t *result = parasail_result_new_trace(segLen, s2Len, 32, sizeof(__m256i));
120     __m256i vTIns  = _mm256_set1_epi16(PARASAIL_INS);
121     __m256i vTDel  = _mm256_set1_epi16(PARASAIL_DEL);
122     __m256i vTDiag = _mm256_set1_epi16(PARASAIL_DIAG);
123     __m256i vTDiagE = _mm256_set1_epi16(PARASAIL_DIAG_E);
124     __m256i vTInsE = _mm256_set1_epi16(PARASAIL_INS_E);
125     __m256i vTDiagF = _mm256_set1_epi16(PARASAIL_DIAG_F);
126     __m256i vTDelF = _mm256_set1_epi16(PARASAIL_DEL_F);
127     __m256i vTMask = _mm256_set1_epi16(PARASAIL_ZERO_MASK);
128     __m256i vFTMask = _mm256_set1_epi16(PARASAIL_F_MASK);
129 
130     /* initialize H and E */
131     {
132         int32_t index = 0;
133         for (i=0; i<segLen; ++i) {
134             int32_t segNum = 0;
135             __m256i_16_t h;
136             __m256i_16_t e;
137             for (segNum=0; segNum<segWidth; ++segNum) {
138                 int64_t tmp = s1_beg ? 0 : (-open-gap*(segNum*segLen+i));
139                 h.v[segNum] = tmp < INT16_MIN ? INT16_MIN : tmp;
140                 tmp = tmp - open;
141                 e.v[segNum] = tmp < INT16_MIN ? INT16_MIN : tmp;
142             }
143             _mm256_store_si256(&pvHStore[index], h.m);
144             _mm256_store_si256(&pvE[index], e.m);
145             _mm256_store_si256(&pvEaStore[index], e.m);
146             ++index;
147         }
148     }
149 
150     /* initialize uppder boundary */
151     {
152         boundary[0] = 0;
153         for (i=1; i<=s2Len; ++i) {
154             int64_t tmp = s2_beg ? 0 : (-open-gap*(i-1));
155             boundary[i] = tmp < INT16_MIN ? INT16_MIN : tmp;
156         }
157     }
158 
159     for (i=0; i<segLen; ++i) {
160         arr_store(result->trace->trace_table, vTDiagE, i, segLen, 0);
161     }
162 
163     /* outer loop over database sequence */
164     for (j=0; j<s2Len; ++j) {
165         __m256i vEF_opn;
166         __m256i vE;
167         __m256i vE_ext;
168         __m256i vF;
169         __m256i vF_ext;
170         __m256i vFa;
171         __m256i vFa_ext;
172         __m256i vH;
173         __m256i vH_dag;
174         const __m256i* vP = NULL;
175 
176         /* Initialize F value to -inf.  Any errors to vH values will be
177          * corrected in the Lazy_F loop. */
178         vF = vNegInf;
179 
180         /* load final segment of pvHStore and shift left by 2 bytes */
181         vH = _mm256_load_si256(&pvHStore[segLen - 1]);
182         vH = _mm256_slli_si256_rpl(vH, 2);
183 
184         /* insert upper boundary condition */
185         vH = _mm256_insert_epi16_rpl(vH, boundary[j], 0);
186 
187         /* Correct part of the vProfile */
188         vP = vProfile + matrix->mapper[(unsigned char)s2[j]] * segLen;
189 
190         /* Swap the 2 H buffers. */
191         SWAP(pvHLoad, pvHStore)
192         SWAP(pvEaLoad, pvEaStore)
193 
194         /* inner loop to process the query sequence */
195         for (i=0; i<segLen; ++i) {
196             vE = _mm256_load_si256(pvE + i);
197 
198             /* Get max from vH, vE and vF. */
199             vH_dag = _mm256_add_epi16(vH, _mm256_load_si256(vP + i));
200             vH = _mm256_max_epi16(vH_dag, vE);
201             vH = _mm256_max_epi16(vH, vF);
202             /* Save vH values. */
203             _mm256_store_si256(pvHStore + i, vH);
204 
205 
206             {
207                 __m256i vTAll = arr_load(result->trace->trace_table, i, segLen, j);
208                 __m256i case1 = _mm256_cmpeq_epi16(vH, vH_dag);
209                 __m256i case2 = _mm256_cmpeq_epi16(vH, vF);
210                 __m256i vT = _mm256_blendv_epi8(
211                         _mm256_blendv_epi8(vTIns, vTDel, case2),
212                         vTDiag, case1);
213                 _mm256_store_si256(pvHT + i, vT);
214                 vT = _mm256_or_si256(vT, vTAll);
215                 arr_store(result->trace->trace_table, vT, i, segLen, j);
216             }
217 
218             vEF_opn = _mm256_sub_epi16(vH, vGapO);
219 
220             /* Update vE value. */
221             vE_ext = _mm256_sub_epi16(vE, vGapE);
222             vE = _mm256_max_epi16(vEF_opn, vE_ext);
223             _mm256_store_si256(pvE + i, vE);
224             {
225                 __m256i vEa = _mm256_load_si256(pvEaLoad + i);
226                 __m256i vEa_ext = _mm256_sub_epi16(vEa, vGapE);
227                 vEa = _mm256_max_epi16(vEF_opn, vEa_ext);
228                 _mm256_store_si256(pvEaStore + i, vEa);
229                 if (j+1<s2Len) {
230                     __m256i cond = _mm256_cmpgt_epi16(vEF_opn, vEa_ext);
231                     __m256i vT = _mm256_blendv_epi8(vTInsE, vTDiagE, cond);
232                     arr_store(result->trace->trace_table, vT, i, segLen, j+1);
233                 }
234             }
235 
236             /* Update vF value. */
237             vF_ext = _mm256_sub_epi16(vF, vGapE);
238             vF = _mm256_max_epi16(vEF_opn, vF_ext);
239             if (i+1<segLen) {
240                 __m256i vTAll = arr_load(result->trace->trace_table, i+1, segLen, j);
241                 __m256i cond = _mm256_cmpgt_epi16(vEF_opn, vF_ext);
242                 __m256i vT = _mm256_blendv_epi8(vTDelF, vTDiagF, cond);
243                 vT = _mm256_or_si256(vT, vTAll);
244                 arr_store(result->trace->trace_table, vT, i+1, segLen, j);
245             }
246 
247             /* Load the next vH. */
248             vH = _mm256_load_si256(pvHLoad + i);
249         }
250 
251         /* Lazy_F loop: has been revised to disallow adjecent insertion and
252          * then deletion, so don't update E(i, i), learn from SWPS3 */
253         vFa_ext = vF_ext;
254         vFa = vF;
255         for (k=0; k<segWidth; ++k) {
256             int64_t tmp = s2_beg ? -open : (boundary[j+1]-open);
257             int16_t tmp2 = tmp < INT16_MIN ? INT16_MIN : tmp;
258             __m256i vHp = _mm256_load_si256(&pvHLoad[segLen - 1]);
259             vHp = _mm256_slli_si256_rpl(vHp, 2);
260             vHp = _mm256_insert_epi16_rpl(vHp, boundary[j], 0);
261             vEF_opn = _mm256_slli_si256_rpl(vEF_opn, 2);
262             vEF_opn = _mm256_insert_epi16_rpl(vEF_opn, tmp2, 0);
263             vF_ext = _mm256_slli_si256_rpl(vF_ext, 2);
264             vF_ext = _mm256_insert_epi16_rpl(vF_ext, NEG_INF, 0);
265             vF = _mm256_slli_si256_rpl(vF, 2);
266             vF = _mm256_insert_epi16_rpl(vF, tmp2, 0);
267             vFa_ext = _mm256_slli_si256_rpl(vFa_ext, 2);
268             vFa_ext = _mm256_insert_epi16_rpl(vFa_ext, NEG_INF, 0);
269             vFa = _mm256_slli_si256_rpl(vFa, 2);
270             vFa = _mm256_insert_epi16_rpl(vFa, tmp2, 0);
271             for (i=0; i<segLen; ++i) {
272                 vH = _mm256_load_si256(pvHStore + i);
273                 vH = _mm256_max_epi16(vH,vF);
274                 _mm256_store_si256(pvHStore + i, vH);
275 
276                 {
277                     __m256i vTAll;
278                     __m256i vT;
279                     __m256i case1;
280                     __m256i case2;
281                     __m256i cond;
282                     vHp = _mm256_add_epi16(vHp, _mm256_load_si256(vP + i));
283                     case1 = _mm256_cmpeq_epi16(vH, vHp);
284                     case2 = _mm256_cmpeq_epi16(vH, vF);
285                     cond = _mm256_andnot_si256(case1,case2);
286                     vTAll = arr_load(result->trace->trace_table, i, segLen, j);
287                     vT = _mm256_load_si256(pvHT + i);
288                     vT = _mm256_blendv_epi8(vT, vTDel, cond);
289                     _mm256_store_si256(pvHT + i, vT);
290                     vTAll = _mm256_and_si256(vTAll, vTMask);
291                     vTAll = _mm256_or_si256(vTAll, vT);
292                     arr_store(result->trace->trace_table, vTAll, i, segLen, j);
293                 }
294                 /* Update vF value. */
295                 {
296                     __m256i vTAll = arr_load(result->trace->trace_table, i, segLen, j);
297                     __m256i cond = _mm256_cmpgt_epi16(vEF_opn, vFa_ext);
298                     __m256i vT = _mm256_blendv_epi8(vTDelF, vTDiagF, cond);
299                     vTAll = _mm256_and_si256(vTAll, vFTMask);
300                     vTAll = _mm256_or_si256(vTAll, vT);
301                     arr_store(result->trace->trace_table, vTAll, i, segLen, j);
302                 }
303                 vEF_opn = _mm256_sub_epi16(vH, vGapO);
304                 vF_ext = _mm256_sub_epi16(vF, vGapE);
305                 {
306                     __m256i vEa = _mm256_load_si256(pvEaLoad + i);
307                     __m256i vEa_ext = _mm256_sub_epi16(vEa, vGapE);
308                     vEa = _mm256_max_epi16(vEF_opn, vEa_ext);
309                     _mm256_store_si256(pvEaStore + i, vEa);
310                     if (j+1<s2Len) {
311                         __m256i cond = _mm256_cmpgt_epi16(vEF_opn, vEa_ext);
312                         __m256i vT = _mm256_blendv_epi8(vTInsE, vTDiagE, cond);
313                         arr_store(result->trace->trace_table, vT, i, segLen, j+1);
314                     }
315                 }
316                 if (! _mm256_movemask_epi8(
317                             _mm256_or_si256(
318                                 _mm256_cmpgt_epi16(vF_ext, vEF_opn),
319                                 _mm256_cmpeq_epi16(vF_ext, vEF_opn))))
320                     goto end;
321                 /*vF = _mm256_max_epi16(vEF_opn, vF_ext);*/
322                 vF = vF_ext;
323                 vFa_ext = _mm256_sub_epi16(vFa, vGapE);
324                 vFa = _mm256_max_epi16(vEF_opn, vFa_ext);
325                 vHp = _mm256_load_si256(pvHLoad + i);
326             }
327         }
328 end:
329         {
330             /* extract vector containing last value from the column */
331             __m256i vCompare;
332             vH = _mm256_load_si256(pvHStore + offset);
333             vCompare = _mm256_and_si256(vPosMask, _mm256_cmpgt_epi16(vH, vMaxH));
334             vMaxH = _mm256_max_epi16(vH, vMaxH);
335             if (_mm256_movemask_epi8(vCompare)) {
336                 end_ref = j;
337             }
338         }
339     }
340 
341     /* max last value from all columns */
342     if (s2_end)
343     {
344         for (k=0; k<position; ++k) {
345             vMaxH = _mm256_slli_si256_rpl(vMaxH, 2);
346         }
347         score = (int16_t) _mm256_extract_epi16_rpl(vMaxH, 15);
348         end_query = s1Len-1;
349     }
350 
351     /* max of last column */
352     if (s1_end)
353     {
354         /* Trace the alignment ending position on read. */
355         int16_t *t = (int16_t*)pvHStore;
356         int32_t column_len = segLen * segWidth;
357         for (i = 0; i<column_len; ++i, ++t) {
358             int32_t temp = i / segWidth + i % segWidth * segLen;
359             if (temp >= s1Len) continue;
360             if (*t > score) {
361                 score = *t;
362                 end_query = temp;
363                 end_ref = s2Len-1;
364             }
365             else if (*t == score && end_ref == s2Len-1 && temp < end_query) {
366                 end_query = temp;
367             }
368         }
369     }
370 
371     if (!s1_end && !s2_end) {
372         /* extract last value from the last column */
373         {
374             __m256i vH = _mm256_load_si256(pvHStore + offset);
375             for (k=0; k<position; ++k) {
376                 vH = _mm256_slli_si256_rpl(vH, 2);
377             }
378             score = (int16_t) _mm256_extract_epi16_rpl (vH, 15);
379             end_ref = s2Len - 1;
380             end_query = s1Len - 1;
381         }
382     }
383 
384 
385 
386     result->score = score;
387     result->end_query = end_query;
388     result->end_ref = end_ref;
389     result->flag |= PARASAIL_FLAG_SG | PARASAIL_FLAG_STRIPED
390         | PARASAIL_FLAG_TRACE
391         | PARASAIL_FLAG_BITS_16 | PARASAIL_FLAG_LANES_16;
392     result->flag |= s1_beg ? PARASAIL_FLAG_SG_S1_BEG : 0;
393     result->flag |= s1_end ? PARASAIL_FLAG_SG_S1_END : 0;
394     result->flag |= s2_beg ? PARASAIL_FLAG_SG_S2_BEG : 0;
395     result->flag |= s2_end ? PARASAIL_FLAG_SG_S2_END : 0;
396 
397     parasail_free(boundary);
398     parasail_free(pvHT);
399     parasail_free(pvEaLoad);
400     parasail_free(pvEaStore);
401     parasail_free(pvE);
402     parasail_free(pvHLoad);
403     parasail_free(pvHStore);
404 
405     return result;
406 }
407 
408 SG_IMPL_ALL
409 SG_IMPL_PROF_ALL
410 
411 
412