1 /**
2 * @file
3 *
4 * @author jeffrey.daily@gmail.com
5 *
6 * Copyright (c) 2015 Battelle Memorial Institute.
7 */
8 #include "config.h"
9
10 #include <stdint.h>
11 #include <stdlib.h>
12
13 #include <immintrin.h>
14
15 #define SG_TRACE
16 #define SG_SUFFIX _striped_avx2_256_16
17 #define SG_SUFFIX_PROF _striped_profile_avx2_256_16
18 #include "sg_helper.h"
19
20 #include "parasail.h"
21 #include "parasail/memory.h"
22 #include "parasail/internal_avx.h"
23
24 #define SWAP(A,B) { __m256i* tmp = A; A = B; B = tmp; }
25
26 #define NEG_INF (INT16_MIN/(int16_t)(2))
27
28 #if HAVE_AVX2_MM256_INSERT_EPI16
29 #define _mm256_insert_epi16_rpl _mm256_insert_epi16
30 #else
_mm256_insert_epi16_rpl(__m256i a,int16_t i,int imm)31 static inline __m256i _mm256_insert_epi16_rpl(__m256i a, int16_t i, int imm) {
32 __m256i_16_t A;
33 A.m = a;
34 A.v[imm] = i;
35 return A.m;
36 }
37 #endif
38
39 #if HAVE_AVX2_MM256_EXTRACT_EPI16
40 #define _mm256_extract_epi16_rpl _mm256_extract_epi16
41 #else
_mm256_extract_epi16_rpl(__m256i a,int imm)42 static inline int16_t _mm256_extract_epi16_rpl(__m256i a, int imm) {
43 __m256i_16_t A;
44 A.m = a;
45 return A.v[imm];
46 }
47 #endif
48
49 #define _mm256_slli_si256_rpl(a,imm) _mm256_alignr_epi8(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0,0,3,0)), 16-imm)
50
51
/* Store one trace-table vector: segment t of column d, where each
 * column holds seglen vectors.  Index math is done in 64 bits so a
 * large table (d*seglen) cannot overflow 32-bit arithmetic. */
static inline void arr_store(
        __m256i *array,
        __m256i vH,
        int32_t t,
        int32_t seglen,
        int32_t d)
{
    const int64_t idx = (int64_t)d * seglen + t;
    _mm256_store_si256(array + idx, vH);
}
61
/* Load one trace-table vector: segment t of column d (inverse of
 * arr_store).  64-bit index math guards against 32-bit overflow. */
static inline __m256i arr_load(
        __m256i *array,
        int32_t t,
        int32_t seglen,
        int32_t d)
{
    const int64_t idx = (int64_t)d * seglen + t;
    return _mm256_load_si256(array + idx);
}
70
71 #define FNAME parasail_sg_flags_trace_striped_avx2_256_16
72 #define PNAME parasail_sg_flags_trace_striped_profile_avx2_256_16
73
FNAME(const char * const restrict s1,const int s1Len,const char * const restrict s2,const int s2Len,const int open,const int gap,const parasail_matrix_t * matrix,int s1_beg,int s1_end,int s2_beg,int s2_end)74 parasail_result_t* FNAME(
75 const char * const restrict s1, const int s1Len,
76 const char * const restrict s2, const int s2Len,
77 const int open, const int gap, const parasail_matrix_t *matrix,
78 int s1_beg, int s1_end, int s2_beg, int s2_end)
79 {
80 parasail_profile_t *profile = parasail_profile_create_avx_256_16(s1, s1Len, matrix);
81 parasail_result_t *result = PNAME(profile, s2, s2Len, open, gap, s1_beg, s1_end, s2_beg, s2_end);
82 parasail_profile_free(profile);
83 return result;
84 }
85
/**
 * Semi-global (sg) alignment with full traceback, striped AVX2 vectors,
 * 16 lanes of 16-bit scores (Farrar's striped layout: query cell i lives
 * in lane i/segLen at segment i%segLen).
 *
 * @param profile pre-computed 16-bit query profile (query s1 baked in)
 * @param s2      database sequence
 * @param s2Len   length of s2
 * @param open    gap-open penalty (positive; subtracted)
 * @param gap     gap-extension penalty (positive; subtracted)
 * @param s1_beg/s1_end/s2_beg/s2_end  nonzero makes the corresponding
 *        sequence end free of gap penalty (sg flag variants)
 * @return newly allocated result holding score, end coordinates, and the
 *         per-cell trace table used later to rebuild the alignment
 */
parasail_result_t* PNAME(
        const parasail_profile_t * const restrict profile,
        const char * const restrict s2, const int s2Len,
        const int open, const int gap,
        int s1_beg, int s1_end, int s2_beg, int s2_end)
{
    int32_t i = 0;
    int32_t j = 0;
    int32_t k = 0;
    const int s1Len = profile->s1Len;
    int32_t end_query = s1Len-1;
    int32_t end_ref = s2Len-1;
    const parasail_matrix_t *matrix = profile->matrix;
    const int32_t segWidth = 16; /* number of values in vector unit */
    const int32_t segLen = (s1Len + segWidth - 1) / segWidth;
    /* offset/position locate the last query cell (s1Len-1) within the
     * striped layout: segment index and lane index respectively */
    const int32_t offset = (s1Len - 1) % segLen;
    const int32_t position = (segWidth - 1) - (s1Len - 1) / segLen;
    __m256i* const restrict vProfile = (__m256i*)profile->profile16.score;
    /* NOTE(review): the parasail_memalign_* results below are used
     * unchecked; an allocation failure would crash here — confirm
     * against the project's error-handling policy. */
    __m256i* restrict pvHStore = parasail_memalign___m256i(32, segLen);
    __m256i* restrict pvHLoad = parasail_memalign___m256i(32, segLen);
    __m256i* const restrict pvE = parasail_memalign___m256i(32, segLen);
    __m256i* restrict pvEaStore = parasail_memalign___m256i(32, segLen);
    __m256i* restrict pvEaLoad = parasail_memalign___m256i(32, segLen);
    __m256i* const restrict pvHT = parasail_memalign___m256i(32, segLen);
    int16_t* const restrict boundary = parasail_memalign_int16_t(32, s2Len+1);
    __m256i vGapO = _mm256_set1_epi16(open);
    __m256i vGapE = _mm256_set1_epi16(gap);
    __m256i vNegInf = _mm256_set1_epi16(NEG_INF);
    int16_t score = NEG_INF;
    __m256i vMaxH = vNegInf;
    /* mask selecting only the lane that holds the last query row */
    __m256i vPosMask = _mm256_cmpeq_epi16(_mm256_set1_epi16(position),
            _mm256_set_epi16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15));

    parasail_result_t *result = parasail_result_new_trace(segLen, s2Len, 32, sizeof(__m256i));
    /* per-cell trace codes; H-origin and E/F-origin bits are OR-ed into
     * the same table entry */
    __m256i vTIns = _mm256_set1_epi16(PARASAIL_INS);
    __m256i vTDel = _mm256_set1_epi16(PARASAIL_DEL);
    __m256i vTDiag = _mm256_set1_epi16(PARASAIL_DIAG);
    __m256i vTDiagE = _mm256_set1_epi16(PARASAIL_DIAG_E);
    __m256i vTInsE = _mm256_set1_epi16(PARASAIL_INS_E);
    __m256i vTDiagF = _mm256_set1_epi16(PARASAIL_DIAG_F);
    __m256i vTDelF = _mm256_set1_epi16(PARASAIL_DEL_F);
    __m256i vTMask = _mm256_set1_epi16(PARASAIL_ZERO_MASK);
    __m256i vFTMask = _mm256_set1_epi16(PARASAIL_F_MASK);

    /* initialize H and E */
    {
        int32_t index = 0;
        for (i=0; i<segLen; ++i) {
            int32_t segNum = 0;
            __m256i_16_t h;
            __m256i_16_t e;
            for (segNum=0; segNum<segWidth; ++segNum) {
                /* first-column boundary: free (0) if s1_beg, else the
                 * affine gap cost to reach query row segNum*segLen+i;
                 * computed in 64 bits then clamped to INT16_MIN */
                int64_t tmp = s1_beg ? 0 : (-open-gap*(segNum*segLen+i));
                h.v[segNum] = tmp < INT16_MIN ? INT16_MIN : tmp;
                tmp = tmp - open;
                e.v[segNum] = tmp < INT16_MIN ? INT16_MIN : tmp;
            }
            _mm256_store_si256(&pvHStore[index], h.m);
            _mm256_store_si256(&pvE[index], e.m);
            _mm256_store_si256(&pvEaStore[index], e.m);
            ++index;
        }
    }

    /* initialize upper boundary (row 0): free if s2_beg, else affine
     * gap cost along the database, clamped to INT16_MIN */
    {
        boundary[0] = 0;
        for (i=1; i<=s2Len; ++i) {
            int64_t tmp = s2_beg ? 0 : (-open-gap*(i-1));
            boundary[i] = tmp < INT16_MIN ? INT16_MIN : tmp;
        }
    }

    /* seed column 0 of the trace table with E-origin codes */
    for (i=0; i<segLen; ++i) {
        arr_store(result->trace->trace_table, vTDiagE, i, segLen, 0);
    }

    /* outer loop over database sequence */
    for (j=0; j<s2Len; ++j) {
        __m256i vEF_opn;
        __m256i vE;
        __m256i vE_ext;
        __m256i vF;
        __m256i vF_ext;
        __m256i vFa;
        __m256i vFa_ext;
        __m256i vH;
        __m256i vH_dag;
        const __m256i* vP = NULL;

        /* Initialize F value to -inf.  Any errors to vH values will be
         * corrected in the Lazy_F loop. */
        vF = vNegInf;

        /* load final segment of pvHStore and shift left by 2 bytes
         * (one 16-bit lane) to realize the striped wrap-around */
        vH = _mm256_load_si256(&pvHStore[segLen - 1]);
        vH = _mm256_slli_si256_rpl(vH, 2);

        /* insert upper boundary condition */
        vH = _mm256_insert_epi16_rpl(vH, boundary[j], 0);

        /* Correct part of the vProfile */
        vP = vProfile + matrix->mapper[(unsigned char)s2[j]] * segLen;

        /* Swap the 2 H buffers. */
        SWAP(pvHLoad, pvHStore)
        SWAP(pvEaLoad, pvEaStore)

        /* inner loop to process the query sequence */
        for (i=0; i<segLen; ++i) {
            vE = _mm256_load_si256(pvE + i);

            /* Get max from vH, vE and vF. */
            vH_dag = _mm256_add_epi16(vH, _mm256_load_si256(vP + i));
            vH = _mm256_max_epi16(vH_dag, vE);
            vH = _mm256_max_epi16(vH, vF);
            /* Save vH values. */
            _mm256_store_si256(pvHStore + i, vH);


            /* record where H came from (diag beats del beats ins, via
             * nested blends) and OR it into the trace table */
            {
                __m256i vTAll = arr_load(result->trace->trace_table, i, segLen, j);
                __m256i case1 = _mm256_cmpeq_epi16(vH, vH_dag);
                __m256i case2 = _mm256_cmpeq_epi16(vH, vF);
                __m256i vT = _mm256_blendv_epi8(
                        _mm256_blendv_epi8(vTIns, vTDel, case2),
                        vTDiag, case1);
                _mm256_store_si256(pvHT + i, vT);
                vT = _mm256_or_si256(vT, vTAll);
                arr_store(result->trace->trace_table, vT, i, segLen, j);
            }

            vEF_opn = _mm256_sub_epi16(vH, vGapO);

            /* Update vE value. */
            vE_ext = _mm256_sub_epi16(vE, vGapE);
            vE = _mm256_max_epi16(vEF_opn, vE_ext);
            _mm256_store_si256(pvE + i, vE);
            /* Ea is the E used for trace recording; its origin (open vs
             * extend) is written into the NEXT column's trace entry */
            {
                __m256i vEa = _mm256_load_si256(pvEaLoad + i);
                __m256i vEa_ext = _mm256_sub_epi16(vEa, vGapE);
                vEa = _mm256_max_epi16(vEF_opn, vEa_ext);
                _mm256_store_si256(pvEaStore + i, vEa);
                if (j+1<s2Len) {
                    __m256i cond = _mm256_cmpgt_epi16(vEF_opn, vEa_ext);
                    __m256i vT = _mm256_blendv_epi8(vTInsE, vTDiagE, cond);
                    arr_store(result->trace->trace_table, vT, i, segLen, j+1);
                }
            }

            /* Update vF value. */
            vF_ext = _mm256_sub_epi16(vF, vGapE);
            vF = _mm256_max_epi16(vEF_opn, vF_ext);
            /* F-origin (open vs extend) goes into the NEXT segment's
             * trace entry of this same column */
            if (i+1<segLen) {
                __m256i vTAll = arr_load(result->trace->trace_table, i+1, segLen, j);
                __m256i cond = _mm256_cmpgt_epi16(vEF_opn, vF_ext);
                __m256i vT = _mm256_blendv_epi8(vTDelF, vTDiagF, cond);
                vT = _mm256_or_si256(vT, vTAll);
                arr_store(result->trace->trace_table, vT, i+1, segLen, j);
            }

            /* Load the next vH. */
            vH = _mm256_load_si256(pvHLoad + i);
        }

        /* Lazy_F loop: has been revised to disallow adjecent insertion and
         * then deletion, so don't update E(i, i), learn from SWPS3 */
        vFa_ext = vF_ext;
        vFa = vF;
        for (k=0; k<segWidth; ++k) {
            /* boundary-derived F seed for lane 0 of this pass,
             * clamped to INT16_MIN like the other boundary values */
            int64_t tmp = s2_beg ? -open : (boundary[j+1]-open);
            int16_t tmp2 = tmp < INT16_MIN ? INT16_MIN : tmp;
            __m256i vHp = _mm256_load_si256(&pvHLoad[segLen - 1]);
            vHp = _mm256_slli_si256_rpl(vHp, 2);
            vHp = _mm256_insert_epi16_rpl(vHp, boundary[j], 0);
            vEF_opn = _mm256_slli_si256_rpl(vEF_opn, 2);
            vEF_opn = _mm256_insert_epi16_rpl(vEF_opn, tmp2, 0);
            vF_ext = _mm256_slli_si256_rpl(vF_ext, 2);
            vF_ext = _mm256_insert_epi16_rpl(vF_ext, NEG_INF, 0);
            vF = _mm256_slli_si256_rpl(vF, 2);
            vF = _mm256_insert_epi16_rpl(vF, tmp2, 0);
            vFa_ext = _mm256_slli_si256_rpl(vFa_ext, 2);
            vFa_ext = _mm256_insert_epi16_rpl(vFa_ext, NEG_INF, 0);
            vFa = _mm256_slli_si256_rpl(vFa, 2);
            vFa = _mm256_insert_epi16_rpl(vFa, tmp2, 0);
            for (i=0; i<segLen; ++i) {
                vH = _mm256_load_si256(pvHStore + i);
                vH = _mm256_max_epi16(vH,vF);
                _mm256_store_si256(pvHStore + i, vH);

                /* re-derive the H-origin code where the corrected F
                 * now wins; clear old H bits (vTMask) before OR-ing */
                {
                    __m256i vTAll;
                    __m256i vT;
                    __m256i case1;
                    __m256i case2;
                    __m256i cond;
                    vHp = _mm256_add_epi16(vHp, _mm256_load_si256(vP + i));
                    case1 = _mm256_cmpeq_epi16(vH, vHp);
                    case2 = _mm256_cmpeq_epi16(vH, vF);
                    cond = _mm256_andnot_si256(case1,case2);
                    vTAll = arr_load(result->trace->trace_table, i, segLen, j);
                    vT = _mm256_load_si256(pvHT + i);
                    vT = _mm256_blendv_epi8(vT, vTDel, cond);
                    _mm256_store_si256(pvHT + i, vT);
                    vTAll = _mm256_and_si256(vTAll, vTMask);
                    vTAll = _mm256_or_si256(vTAll, vT);
                    arr_store(result->trace->trace_table, vTAll, i, segLen, j);
                }
                /* Update vF value. */
                {
                    __m256i vTAll = arr_load(result->trace->trace_table, i, segLen, j);
                    __m256i cond = _mm256_cmpgt_epi16(vEF_opn, vFa_ext);
                    __m256i vT = _mm256_blendv_epi8(vTDelF, vTDiagF, cond);
                    vTAll = _mm256_and_si256(vTAll, vFTMask);
                    vTAll = _mm256_or_si256(vTAll, vT);
                    arr_store(result->trace->trace_table, vTAll, i, segLen, j);
                }
                vEF_opn = _mm256_sub_epi16(vH, vGapO);
                vF_ext = _mm256_sub_epi16(vF, vGapE);
                /* re-record Ea origin for the next column with the
                 * corrected H values */
                {
                    __m256i vEa = _mm256_load_si256(pvEaLoad + i);
                    __m256i vEa_ext = _mm256_sub_epi16(vEa, vGapE);
                    vEa = _mm256_max_epi16(vEF_opn, vEa_ext);
                    _mm256_store_si256(pvEaStore + i, vEa);
                    if (j+1<s2Len) {
                        __m256i cond = _mm256_cmpgt_epi16(vEF_opn, vEa_ext);
                        __m256i vT = _mm256_blendv_epi8(vTInsE, vTDiagE, cond);
                        arr_store(result->trace->trace_table, vT, i, segLen, j+1);
                    }
                }
                /* early exit: once F can no longer improve any H, the
                 * lazy-F correction is complete */
                if (! _mm256_movemask_epi8(
                            _mm256_or_si256(
                                _mm256_cmpgt_epi16(vF_ext, vEF_opn),
                                _mm256_cmpeq_epi16(vF_ext, vEF_opn))))
                    goto end;
                /*vF = _mm256_max_epi16(vEF_opn, vF_ext);*/
                vF = vF_ext;
                vFa_ext = _mm256_sub_epi16(vFa, vGapE);
                vFa = _mm256_max_epi16(vEF_opn, vFa_ext);
                vHp = _mm256_load_si256(pvHLoad + i);
            }
        }
end:
        {
            /* extract vector containing last value from the column */
            __m256i vCompare;
            vH = _mm256_load_si256(pvHStore + offset);
            /* track the best last-row value and the column where it
             * (strictly) improved, for the s2_end-free variants */
            vCompare = _mm256_and_si256(vPosMask, _mm256_cmpgt_epi16(vH, vMaxH));
            vMaxH = _mm256_max_epi16(vH, vMaxH);
            if (_mm256_movemask_epi8(vCompare)) {
                end_ref = j;
            }
        }
    }

    /* max last value from all columns */
    if (s2_end)
    {
        /* shift so the last-query-row lane ends up in lane 15 */
        for (k=0; k<position; ++k) {
            vMaxH = _mm256_slli_si256_rpl(vMaxH, 2);
        }
        score = (int16_t) _mm256_extract_epi16_rpl(vMaxH, 15);
        end_query = s1Len-1;
    }

    /* max of last column */
    if (s1_end)
    {
        /* Trace the alignment ending position on read. */
        int16_t *t = (int16_t*)pvHStore;
        int32_t column_len = segLen * segWidth;
        for (i = 0; i<column_len; ++i, ++t) {
            /* un-stripe: map flat index i back to query position */
            int32_t temp = i / segWidth + i % segWidth * segLen;
            if (temp >= s1Len) continue;
            if (*t > score) {
                score = *t;
                end_query = temp;
                end_ref = s2Len-1;
            }
            else if (*t == score && end_ref == s2Len-1 && temp < end_query) {
                /* tie-break: prefer the earliest query position */
                end_query = temp;
            }
        }
    }

    if (!s1_end && !s2_end) {
        /* extract last value from the last column */
        {
            __m256i vH = _mm256_load_si256(pvHStore + offset);
            for (k=0; k<position; ++k) {
                vH = _mm256_slli_si256_rpl(vH, 2);
            }
            score = (int16_t) _mm256_extract_epi16_rpl (vH, 15);
            end_ref = s2Len - 1;
            end_query = s1Len - 1;
        }
    }



    result->score = score;
    result->end_query = end_query;
    result->end_ref = end_ref;
    result->flag |= PARASAIL_FLAG_SG | PARASAIL_FLAG_STRIPED
        | PARASAIL_FLAG_TRACE
        | PARASAIL_FLAG_BITS_16 | PARASAIL_FLAG_LANES_16;
    result->flag |= s1_beg ? PARASAIL_FLAG_SG_S1_BEG : 0;
    result->flag |= s1_end ? PARASAIL_FLAG_SG_S1_END : 0;
    result->flag |= s2_beg ? PARASAIL_FLAG_SG_S2_BEG : 0;
    result->flag |= s2_end ? PARASAIL_FLAG_SG_S2_END : 0;

    parasail_free(boundary);
    parasail_free(pvHT);
    parasail_free(pvEaLoad);
    parasail_free(pvEaStore);
    parasail_free(pvE);
    parasail_free(pvHLoad);
    parasail_free(pvHStore);

    return result;
}
407
408 SG_IMPL_ALL
409 SG_IMPL_PROF_ALL
410
411
412