1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // René Rahn <rene.rahn@fu-berlin.de>
34 // Stefan Budach <stefan.budach@fu-berlin.de>
35 // ==========================================================================
36 // generic SIMD interface for SSE3 / AVX2
37 // ==========================================================================
38
39 #ifndef SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_AVX2_H_
40 #define SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_AVX2_H_
41
42 namespace seqan {
43
// SIMD vector type definitions for 256-bit AVX2 registers.
// SEQAN_DEFINE_SIMD_VECTOR_(name, scalar type, total width in bytes) declares
// a vector type holding (32 / sizeof(scalar)) elements of the given scalar.

// SimdParams_<32, 32>: 256bit = 32 elements * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector32Char, char, 32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector32SChar, signed char, 32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector32UChar, unsigned char, 32)

// SimdParams_<32, 16>: 256bit = 16 elements * 2 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16Short, short, 32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16UShort, unsigned short, 32)

// SimdParams_<32, 8>: 256bit = 8 elements * 4 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8Int, int, 32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8UInt, unsigned int, 32)

// SimdParams_<32, 4>: 256bit = 4 elements * 8 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4Int64, int64_t, 32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4UInt64, uint64_t, 32)
60
61 // ============================================================================
62 // Functions
63 // ============================================================================
64
65 // ============================================================================
66 // AVX/AVX2 wrappers (256bit vectors)
67 // ============================================================================
68
69 // --------------------------------------------------------------------------
70 // _fillVector (256bit)
71 // --------------------------------------------------------------------------
72
// _fillVector: assigns values to all lanes of a 256-bit vector.
// The std::index_sequence<0> overloads broadcast a single scalar into every
// lane (_mm256_set1_*); the variadic INDICES overloads set each lane from
// its own tuple element (_mm256_setr_*). The SimdParams_<32, N> tag selects
// the lane count/width: 32x8bit, 16x16bit, 8x32bit, 4x64bit.

template <typename TSimdVector, typename ...TValue>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & x,
            std::index_sequence<0> const &, SimdParams_<32, 32>)
{
    // Broadcast x[0] into all 32 8-bit lanes.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi8(std::get<0>(x)));
}

template <typename TSimdVector, typename ...TValue>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & x,
            std::index_sequence<0> const &, SimdParams_<32, 16>)
{
    // Broadcast x[0] into all 16 16-bit lanes.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi16(std::get<0>(x)));
}

template <typename TSimdVector, typename ...TValue>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & x,
            std::index_sequence<0> const &, SimdParams_<32, 8>)
{
    // Broadcast x[0] into all 8 32-bit lanes.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi32(std::get<0>(x)));
}

template <typename TSimdVector, typename ...TValue>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & x,
            std::index_sequence<0> const &, SimdParams_<32, 4>)
{
    // Broadcast x[0] into all 4 64-bit lanes.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi64x(std::get<0>(x)));
}

template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args, std::index_sequence<INDICES...> const &, SimdParams_<32, 32>)
{
    // Lane i <- args[i] for 32 8-bit lanes ("setr" = set in ascending order).
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_setr_epi8(std::get<INDICES>(args)...));
}

template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args, std::index_sequence<INDICES...> const &, SimdParams_<32, 16>)
{
    // Lane i <- args[i] for 16 16-bit lanes.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_setr_epi16(std::get<INDICES>(args)...));
}
template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args, std::index_sequence<INDICES...> const &, SimdParams_<32, 8>)
{
    // Lane i <- args[i] for 8 32-bit lanes.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_setr_epi32(std::get<INDICES>(args)...));
}

template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args, std::index_sequence<INDICES...> const &, SimdParams_<32, 4>)
{
    // reverse argument list 0, 1, 2, 3 -> 3, 2, 1, 0
    // NOTE(marehr): Intel linux fails to reverse argument list and only
    // _mm256_set_epi64x has no reverse equivalent
    // (set_epi64x takes arguments high-lane first, so reversing the index
    // pack here restores ascending lane order, matching the setr overloads).
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set_epi64x(std::get<sizeof...(INDICES) - 1 - INDICES>(args)...));
}
142
143 // --------------------------------------------------------------------------
144 // _clearVector (256bit)
145 // --------------------------------------------------------------------------
146
147 template <typename TSimdVector, int L>
_clearVector(TSimdVector & vector,SimdParams_<32,L>)148 inline void _clearVector(TSimdVector & vector, SimdParams_<32, L>)
149 {
150 vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_setzero_si256());
151 }
152
153 // --------------------------------------------------------------------------
154 // _createVector (256bit)
155 // --------------------------------------------------------------------------
156
157 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<32,32>)158 inline TSimdVector _createVector(TValue const x, SimdParams_<32, 32>)
159 {
160 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi8(x));
161 }
162
163 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<32,16>)164 inline TSimdVector _createVector(TValue const x, SimdParams_<32, 16>)
165 {
166 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi16(x));
167 }
168
169 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<32,8>)170 inline TSimdVector _createVector(TValue const x, SimdParams_<32, 8>)
171 {
172 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi32(x));
173 }
174
175 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<32,4>)176 inline TSimdVector _createVector(TValue const x, SimdParams_< 32, 4>)
177 {
178 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi64x(x));
179 }
180
181 // --------------------------------------------------------------------------
182 // _cmpEq (256bit)
183 // --------------------------------------------------------------------------
184
185 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,32>)186 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32>)
187 {
188 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpeq_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
189 SEQAN_VECTOR_CAST_(const __m256i&, b)));
190 }
191
192 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,16>)193 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16>)
194 {
195 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpeq_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
196 SEQAN_VECTOR_CAST_(const __m256i&, b)));
197 }
198
199 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,8>)200 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8>)
201 {
202 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpeq_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
203 SEQAN_VECTOR_CAST_(const __m256i&, b)));
204 }
205
206 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,4>)207 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4>)
208 {
209 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpeq_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
210 SEQAN_VECTOR_CAST_(const __m256i&, b)));
211 }
212
213 // --------------------------------------------------------------------------
214 // _cmpGt (256bit)
215 // --------------------------------------------------------------------------
216
// _cmpGt: element-wise greater-than. Each result lane is all-ones where
// a > b, all-zeroes otherwise. AVX2 only provides signed comparisons, so
// the unsigned overloads bias both operands by XOR-ing the sign bit
// (x ^ 0x80..0), which maps unsigned order onto signed order.

template <typename TSimdVector>
inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, int8_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpgt_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                             SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, uint8_t>)
{
    // There is no unsigned cmpgt, we reduce it to the signed case.
    // Note that 0x80 = ~0x7F (prevent overflow messages).
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpgt_epi8(
                                  _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_set1_epi8(~0x7F)),
                                  _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, b), _mm256_set1_epi8(~0x7F))));
}

template <typename TSimdVector>
inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, int16_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpgt_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, uint16_t>)
{
    // There is no unsigned cmpgt, we reduce it to the signed case.
    // Note that 0x8000 = ~0x7FFF (prevent overflow messages).
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpgt_epi16(
                                  _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_set1_epi16(~0x7FFF)),
                                  _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, b), _mm256_set1_epi16(~0x7FFF))));
}

template <typename TSimdVector>
inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, int32_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpgt_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, uint32_t>)
{
    // There is no unsigned cmpgt, we reduce it to the signed case.
    // Note that 0x80000000 = ~0x7FFFFFFF (prevent overflow messages).
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpgt_epi32(
                                  _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_set1_epi32(~0x7FFFFFFF)),
                                  _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, b), _mm256_set1_epi32(~0x7FFFFFFF))));
}

template <typename TSimdVector>
inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, int64_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpgt_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, uint64_t>)
{
    // There is no unsigned cmpgt, we reduce it to the signed case.
    // Note that 0x8000000000000000ul = ~0x7FFFFFFFFFFFFFFFul (prevent overflow messages).
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpgt_epi64(
                                  _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, a) ,_mm256_set1_epi64x(~0x7FFFFFFFFFFFFFFFul)),
                                  _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, b), _mm256_set1_epi64x(~0x7FFFFFFFFFFFFFFFul))));
}
288
289 // --------------------------------------------------------------------------
290 // _bitwiseOr (256bit)
291 // --------------------------------------------------------------------------
292
293 template <typename TSimdVector, int L>
_bitwiseOr(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,L>)294 inline TSimdVector _bitwiseOr(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, L>)
295 {
296 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_or_si256(SEQAN_VECTOR_CAST_(const __m256i&, a),
297 SEQAN_VECTOR_CAST_(const __m256i&, b)));
298 }
299
300 // --------------------------------------------------------------------------
301 // _bitwiseAnd (256bit)
302 // --------------------------------------------------------------------------
303
304 template <typename TSimdVector, int L>
_bitwiseAnd(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,L>)305 inline TSimdVector _bitwiseAnd(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, L>)
306 {
307 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_and_si256(SEQAN_VECTOR_CAST_(const __m256i&, a),
308 SEQAN_VECTOR_CAST_(const __m256i&, b)));
309 }
310
311 // --------------------------------------------------------------------------
312 // _bitwiseAndNot (256bit)
313 // --------------------------------------------------------------------------
314
315 template <typename TSimdVector, int L>
_bitwiseAndNot(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,L>)316 inline TSimdVector _bitwiseAndNot(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, L>)
317 {
318 return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_andnot_si256(SEQAN_VECTOR_CAST_(const __m256i&, a),
319 SEQAN_VECTOR_CAST_(const __m256i&, b)));
320 }
321
322 // --------------------------------------------------------------------------
323 // _bitwiseNot (256bit)
324 // --------------------------------------------------------------------------
325
// _bitwiseNot: per-element "not".
// NOTE(review): despite the name, these overloads compare each lane against
// zero (cmpeq with a zero vector) rather than flipping every bit. For mask
// vectors whose lanes are all-ones/all-zero this is identical to a bitwise
// complement; for arbitrary lane values it is not — confirm intended usage
// against the other SeqAn SIMD backends.

template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<32, 32>)
{
    // 32 x 8-bit lanes: lane -> (lane == 0) ? ~0 : 0.
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpeq_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_setzero_si256()));
}

template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<32, 16>)
{
    // 16 x 16-bit lanes.
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpeq_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_setzero_si256()));
}

template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<32, 8>)
{
    // 8 x 32-bit lanes.
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpeq_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_setzero_si256()));

}
template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<32, 4>)
{
    // 4 x 64-bit lanes.
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpeq_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_setzero_si256()));
}
353
354 // --------------------------------------------------------------------------
355 // _divide (256bit)
356 // --------------------------------------------------------------------------
357
// _divide: element-wise integer division of vector a by broadcast scalar b.
// NOTE(review): _mm256_div_epi8/16/32/64 are SVML routines, not plain AVX2
// instructions — they require a toolchain providing SVML (ICC, MSVC, or GCC
// with libsvml); verify availability before relying on these overloads.
// NOTE(review): unlike the other wrappers in this file, `a` is passed to the
// intrinsic without SEQAN_VECTOR_CAST_ and relies on implicit conversion of
// the vector-extension type — confirm this compiles for all TSimdVector
// instantiations.

template <typename TSimdVector>
inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<32, 32>)
{
    // 32 x 8-bit lanes.
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_div_epi8(a, _mm256_set1_epi8(b)));
}

template <typename TSimdVector>
inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<32, 16>)
{
    // 16 x 16-bit lanes.
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_div_epi16(a, _mm256_set1_epi16(b)));
}

template <typename TSimdVector>
inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<32, 8>)
{
    // 8 x 32-bit lanes.
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_div_epi32(a, _mm256_set1_epi32(b)));
}

template <typename TSimdVector>
inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<32, 4>)
{
    // 4 x 64-bit lanes.
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_div_epi64(a, _mm256_set1_epi64x(b)));
}
381
382 // --------------------------------------------------------------------------
383 // _add (256bit)
384 // --------------------------------------------------------------------------
385
386 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,32>)387 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32>)
388 {
389 return SEQAN_VECTOR_CAST_(TSimdVector,
390 _mm256_add_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
391 SEQAN_VECTOR_CAST_(const __m256i&, b)));
392 }
393
394 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,16>)395 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16>)
396 {
397 return SEQAN_VECTOR_CAST_(TSimdVector,
398 _mm256_add_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
399 SEQAN_VECTOR_CAST_(const __m256i&, b)));
400 }
401
402 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,8>)403 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8>)
404 {
405 return SEQAN_VECTOR_CAST_(TSimdVector,
406 _mm256_add_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
407 SEQAN_VECTOR_CAST_(const __m256i&, b)));
408 }
409
410 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,4>)411 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4>)
412 {
413 return SEQAN_VECTOR_CAST_(TSimdVector,
414 _mm256_add_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
415 SEQAN_VECTOR_CAST_(const __m256i&, b)));
416 }
417
418 // --------------------------------------------------------------------------
419 // _sub (256bit)
420 // --------------------------------------------------------------------------
421
422 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,32>)423 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32>)
424 {
425 return SEQAN_VECTOR_CAST_(TSimdVector,
426 _mm256_sub_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
427 SEQAN_VECTOR_CAST_(const __m256i&, b)));
428 }
429
430 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,16>)431 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16>)
432 {
433 return SEQAN_VECTOR_CAST_(TSimdVector,
434 _mm256_sub_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
435 SEQAN_VECTOR_CAST_(const __m256i&, b)));
436 }
437
438 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,8>)439 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8>)
440 {
441 return SEQAN_VECTOR_CAST_(TSimdVector,
442 _mm256_sub_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
443 SEQAN_VECTOR_CAST_(const __m256i&, b)));
444 }
445
446 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,4>)447 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4>)
448 {
449 return SEQAN_VECTOR_CAST_(TSimdVector,
450 _mm256_sub_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
451 SEQAN_VECTOR_CAST_(const __m256i&, b)));
452 }
453
454 // --------------------------------------------------------------------------
455 // _mult (256bit)
456 // --------------------------------------------------------------------------
457
458 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const &,SimdParams_<32,32>)459 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const &/*b*/, SimdParams_<32, 32>)
460 {
461 SEQAN_SKIP_TEST;
462 SEQAN_ASSERT_FAIL("AVX2 intrinsics for multiplying 8 bit values not implemented!");
463 return a;
464 }
465
466 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,16>)467 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16>)
468 {
469 return SEQAN_VECTOR_CAST_(TSimdVector,
470 _mm256_mullo_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
471 SEQAN_VECTOR_CAST_(const __m256i&, b)));
472 }
473
474 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,8>)475 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8>)
476 {
477 return SEQAN_VECTOR_CAST_(TSimdVector,
478 _mm256_mullo_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
479 SEQAN_VECTOR_CAST_(const __m256i&, b)));
480 }
481
482 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const &,SimdParams_<32,4>)483 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const &/*b*/, SimdParams_<32, 4>)
484 {
485 SEQAN_SKIP_TEST;
486 SEQAN_ASSERT_FAIL("AVX2 intrinsics for multiplying 64 bit values not implemented!");
487 return a;
488 }
489
490 // --------------------------------------------------------------------------
491 // _max (256bit)
492 // --------------------------------------------------------------------------
493
// _max: element-wise maximum; the third tag argument carries the scalar
// type so the correct signed/unsigned intrinsic is chosen. AVX2 has no
// 64-bit max, so those overloads use the AVX-512VL instruction when
// available and otherwise fall back to blend(b, a, cmpGt(a, b)), i.e.
// select a where a > b (blend/cmpGt are the SeqAn wrappers from this
// backend's public interface).

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, int8_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, uint8_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epu8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, int16_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, uint16_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epu16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, int32_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, uint32_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epu32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, int64_t>)
{
#if defined(__AVX512VL__)
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
#else // defined(__AVX512VL__)
    // Plain AVX2 fallback: take a where a > b, else b.
    return blend(b, a, cmpGt(a, b));
#endif // defined(__AVX512VL__)
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, uint64_t>)
{
#if defined(__AVX512VL__)
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epu64(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
#else // defined(__AVX512VL__)
    // Plain AVX2 fallback: take a where a > b, else b.
    return blend(b, a, cmpGt(a, b));
#endif // defined(__AVX512VL__)
}
565
566
567 // --------------------------------------------------------------------------
568 // _min (256bit)
569 // --------------------------------------------------------------------------
570
// _min: element-wise minimum; the third tag argument carries the scalar
// type so the correct signed/unsigned intrinsic is chosen. AVX2 has no
// 64-bit min, so those overloads use the AVX-512VL instruction when
// available and otherwise fall back to blend(a, b, cmpGt(a, b)), i.e.
// select b where a > b.

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, int8_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, uint8_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epu8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, int16_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, uint16_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epu16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, int32_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, uint32_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epu32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, int64_t>)
{
#if defined(__AVX512VL__)
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
#else // defined(__AVX512VL__)
    // Plain AVX2 fallback: take b where a > b, else a.
    return blend(a, b, cmpGt(a, b));
#endif // defined(__AVX512VL__)
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, uint64_t>)
{
#if defined(__AVX512VL__)
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epu64(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
#else // defined(__AVX512VL__)
    // Plain AVX2 fallback: take b where a > b, else a.
    return blend(a, b, cmpGt(a, b));
#endif // defined(__AVX512VL__)
}
642
643 // --------------------------------------------------------------------------
644 // _blend (256bit)
645 // --------------------------------------------------------------------------
646
647 template <typename TSimdVector, typename TSimdVectorMask, int L>
_blend(TSimdVector const & a,TSimdVector const & b,TSimdVectorMask const & mask,SimdParams_<32,L>)648 inline TSimdVector _blend(TSimdVector const & a, TSimdVector const & b, TSimdVectorMask const & mask, SimdParams_<32, L>)
649 {
650 return SEQAN_VECTOR_CAST_(TSimdVector,
651 _mm256_blendv_epi8(SEQAN_VECTOR_CAST_(const __m256i &, a),
652 SEQAN_VECTOR_CAST_(const __m256i &, b),
653 SEQAN_VECTOR_CAST_(const __m256i &, mask)));
654 }
655
656 // --------------------------------------------------------------------------
657 // _storeu (256bit)
658 // --------------------------------------------------------------------------
659
660 template <typename T, typename TSimdVector, int L>
_storeu(T * memAddr,TSimdVector const & vec,SimdParams_<32,L>)661 inline void _storeu(T * memAddr, TSimdVector const & vec, SimdParams_<32, L>)
662 {
663 _mm256_storeu_si256((__m256i*)memAddr, SEQAN_VECTOR_CAST_(const __m256i&, vec));
664 }
665
666 // ----------------------------------------------------------------------------
667 // Function _load() 256bit
668 // ----------------------------------------------------------------------------
669
template <typename TSimdVector, typename T, int L>
inline TSimdVector _load(T const * memAddr, SimdParams_<32, L>)
{
    // Aligned 256-bit load: _mm256_load_si256 requires memAddr to be 32-byte
    // aligned (unlike _storeu above, which is unaligned) — callers must
    // guarantee the alignment.
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_load_si256((__m256i const *) memAddr));
}
675
676 // --------------------------------------------------------------------------
677 // _shiftRightLogical (256bit)
678 // --------------------------------------------------------------------------
679
template <typename TSimdVector>
inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<32, 32>)
{
    // 8-bit logical right shift. AVX2 has no byte-granularity shift, so shift
    // 16-bit lanes and mask out the bits that leaked in from each lane's high
    // byte (0xff >> imm keeps only the valid low bits per byte).
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_srli_epi16(SEQAN_VECTOR_CAST_(const __m256i &, vector), imm) & _mm256_set1_epi8(0xff >> imm));
}
template <typename TSimdVector>
inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<32, 16>)
{
    // 16-bit logical right shift (native).
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_srli_epi16(SEQAN_VECTOR_CAST_(const __m256i &, vector), imm));
}
template <typename TSimdVector>
inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<32, 8>)
{
    // 32-bit logical right shift (native).
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_srli_epi32(SEQAN_VECTOR_CAST_(const __m256i &, vector), imm));
}
template <typename TSimdVector>
inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<32, 4>)
{
    // 64-bit logical right shift (native).
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_srli_epi64(SEQAN_VECTOR_CAST_(const __m256i &, vector), imm));
}
700
701 // --------------------------------------------------------------------------
702 // Extend sign from integer types 256bit
703 // --------------------------------------------------------------------------
704
// Sign-extend an 8-bit value stored in the low byte of each 16-bit lane to the
// full lane width. If bit 7 of the lane is set, bits 15..8 become all-ones;
// otherwise they stay zero. Trick: msb = ~v & 0x80 is 0x80 for non-negative
// values and 0 for negative ones, so msb - 1 is 0x7f resp. 0xffff; masking
// that with 0xff00 yields exactly the fill bits.
inline __m256i
seqan_mm256_i16sign_extend_epis8(__m256i const & v)
{
    return _mm256_or_si256( // extend sign (v | hi-bits)
        v,
        _mm256_and_si256( // select hi-bits (hi-bits = msk & 0xff00)
            _mm256_sub_epi16( // msk = msb - 1
                _mm256_andnot_si256( // msb = ~v & 0x80 (select msb)
                    v,
                    _mm256_set1_epi16(0x80)
                ),
                _mm256_set1_epi16(1)
            ),
            _mm256_set1_epi16(static_cast<uint16_t>(0xff00u))
        )
    );
}

// Sign-extend an 8-bit value stored in the low byte of each 32-bit lane
// (same trick as above, with 32-bit lanes and fill mask 0xffffff00).
inline __m256i
seqan_mm256_i32sign_extend_epis8(__m256i const & v)
{
    return _mm256_or_si256( // extend sign (v | hi-bits)
        v,
        _mm256_and_si256( // select hi-bits (hi-bits = msk & 0xffffff00u)
            _mm256_sub_epi32( // msk = msb - 1
                _mm256_andnot_si256( // msb = ~v & 0x80 (select msb)
                    v,
                    _mm256_set1_epi32(0x80)
                ),
                _mm256_set1_epi32(1)
            ),
            _mm256_set1_epi32(static_cast<uint32_t>(0xffffff00u))
        )
    );
}

// Sign-extend a 16-bit value stored in the low half of each 32-bit lane
// (msb probe 0x8000, fill mask 0xffff0000).
inline __m256i
seqan_mm256_i32sign_extend_epis16(__m256i const & v)
{
    return _mm256_or_si256( // extend sign (v | hi-bits)
        v,
        _mm256_and_si256( // select hi-bits (hi-bits = msk & 0xffff0000u)
            _mm256_sub_epi32( // msk = msb - 1
                _mm256_andnot_si256( // msb = ~v & 0x8000 (select msb)
                    v,
                    _mm256_set1_epi32(0x8000)
                ),
                _mm256_set1_epi32(1)
            ),
            _mm256_set1_epi32(static_cast<uint32_t>(0xffff0000u))
        )
    );
}
758
// Sign-extend an 8-bit value stored in the low byte of each 64-bit lane.
// Same trick as the 16/32-bit variants: msb = ~v & 0x80 is 0x80 for
// non-negative and 0 for negative values, so msb - 1 masked with the
// fill pattern produces the high bits.
inline __m256i
seqan_mm256_i64sign_extend_epis8(__m256i const & v)
{
    return _mm256_or_si256( // extend sign (v | hi-bits)
        v,
        _mm256_and_si256( // select hi-bits (hi-bits = msk & 0xffffffffffffff00ul)
            _mm256_sub_epi64( // msk = msb - 1
                _mm256_andnot_si256( // msb = ~v & 0x80 (select msb)
                    v,
                    _mm256_set1_epi64x(0x80)
                ),
                _mm256_set1_epi64x(1)
            ),
            _mm256_set1_epi64x(static_cast<uint64_t>(0xffffffffffffff00ul))
        )
    );
}

// Sign-extend a 16-bit value stored in the low bytes of each 64-bit lane.
inline __m256i
seqan_mm256_i64sign_extend_epis16(__m256i const & v)
{
    return _mm256_or_si256( // extend sign (v | hi-bits)
        v,
        _mm256_and_si256( // select hi-bits (hi-bits = msk & 0xffffffffffff0000ul)
            _mm256_sub_epi64( // msk = msb - 1
                _mm256_andnot_si256( // msb = ~v & 0x8000 (select msb)
                    v,
                    _mm256_set1_epi64x(0x8000)
                ),
                _mm256_set1_epi64x(1)
            ),
            _mm256_set1_epi64x(static_cast<uint64_t>(0xffffffffffff0000ul))
        )
    );
}

// Sign-extend a 32-bit value stored in the low half of each 64-bit lane.
inline __m256i
seqan_mm256_i64sign_extend_epis32(__m256i const & v)
{
    return _mm256_or_si256( // extend sign (v | hi-bits)
        v,
        _mm256_and_si256( // select hi-bits (hi-bits = msk & 0xffffffff00000000ul)
            _mm256_sub_epi64( // msk = msb - 1
                _mm256_andnot_si256( // msb = ~v & 0x80000000 (select msb)
                    v,
                    _mm256_set1_epi64x(0x80000000)
                ),
                _mm256_set1_epi64x(1)
            ),
            _mm256_set1_epi64x(static_cast<uint64_t>(0xffffffff00000000ul))
        )
    );
}
812
813 // --------------------------------------------------------------------------
814 // _gather (256bit)
815 // --------------------------------------------------------------------------
816
// Gather 32 8-bit elements from memAddr at the 32 byte indices in idx.
// AVX2 has no byte gather, so the 32 indices are split into four groups of
// eight; each group is expanded to dword indices via byte shuffles, gathered
// with a 32-bit gather, masked down to the low byte, and the four partial
// results are merged back with two rounds of unsigned-saturating packs.
// The pack instructions operate per 128-bit lane, which the index-shuffle
// constants below already account for (see the value traces in the comments).
// The `scale` tag carries the element stride in bytes as a compile-time value.
template <typename TValue, typename TSize, TSize SCALE>
inline __m256i
seqan_mm256_i8gather_epi(TValue const * memAddr,
                         __m256i const & idx,
                         std::integral_constant<TSize, SCALE> const & /*scale*/)
{
    // Example trace with SCALE = 3 and descending indices:
    // mem: ( 0,  3,  6,  9 | 12, 15, 18, 21 | 24, 27, 30, 33 | 36, 39, 42, 45 || 48, 51, 54, 57 | 60, 63, 66, 69 | 72, 75, 78, 81 | 84, 87, 90, 93)
    // idx: (31, 30, 29, 28 | 27, 26, 25, 24 | 23, 22, 21, 20 | 19, 18, 17, 16 || 15, 14, 13, 12 | 11, 10,  9,  8 |  7,  6,  5,  4 |  3,  2,  1,  0)
    // pack: (93, 90, 87, 84 | 81, 78, 75, 72 | 69, 66, 63, 60 | 57, 54, 51, 48 || 45, 42, 39, 36 | 33, 30, 27, 24 | 21, 18, 15, 12 |  9,  6,  3,  0)
    return _mm256_packus_epi16(
        // pckLow: (93, 0, 90, 0 | 87, 0, 84, 0 | 81, 0, 78, 0 | 75, 0, 72, 0 || 45, 0, 42, 0 | 39, 0, 36, 0 | 33, 0, 30, 0 | 27, 0, 24, 0)
        _mm256_packus_epi16(
            // mskLL: (93, 0, 0, 0 | 90, 0, 0, 0 | 87, 0, 0, 0 | 84, 0, 0, 0 || 45, 0, 0, 0 | 42, 0, 0, 0 | 39, 0, 0, 0 | 36, 0, 0, 0)
            _mm256_and_si256(
                // gtrLL: (93, 31, 30, 29 | 90, 93, 31, 30 | 87, 90, 93, 31 | 84, 87, 90, 93 || 45, 48, 51, 54 | 42, 45, 48, 51 | 39, 42, 45, 48 | 36, 39, 42, 45)
                _mm256_i32gather_epi32(
                    (const int *) memAddr,
                    // lowlow: (31, 0, 0, 0 | 30, 0, 0, 0 | 29, 0, 0, 0 | 28, 0, 0, 0 || 15, 0, 0, 0 | 14, 0, 0, 0 | 13, 0, 0, 0 | 12, 0, 0, 0)
                    // shuffle constant places idx bytes 0..3 (per lane) into
                    // the low byte of each dword; 0xFF entries zero the rest.
                    _mm256_shuffle_epi8(idx, __m256i {
                        ~0xFF000000FFl | 0x0100000000, ~0xFF000000FFl | 0x0300000002,
                        ~0xFF000000FFl | 0x0100000000, ~0xFF000000FFl | 0x0300000002
                    }),
                    SCALE
                ),
                _mm256_set1_epi32(0xFF)
            ),
            // mskLH: (81, 0, 0, 0 | 78, 0, 0, 0 | 75, 0, 0, 0 | 72, 0, 0, 0 || 33, 0, 0, 0 | 30, 0, 0, 0 | 27, 0, 0, 0 | 24, 0, 0, 0)
            _mm256_and_si256(
                // gtrLH: (81, 84, 87, 90 | 78, 81, 84, 87 | 75, 78, 81, 84 | 72, 75, 78, 81 || 33, 36, 39, 42 | 30, 33, 36, 39 | 27, 30, 33, 36 | 24, 27, 30, 33)
                _mm256_i32gather_epi32(
                    (const int *) memAddr,
                    // lowhig: (27, 0, 0, 0 | 26, 0, 0, 0 | 25, 0, 0, 0 | 24, 0, 0, 0 || 11, 0, 0, 0 | 10, 0, 0, 0 | 9, 0, 0, 0 | 8, 0, 0, 0)
                    _mm256_shuffle_epi8(idx, __m256i {
                        ~0xFF000000FFl | 0x0500000004, ~0xFF000000FFl | 0x0700000006,
                        ~0xFF000000FFl | 0x0500000004, ~0xFF000000FFl | 0x0700000006
                    }),
                    SCALE
                ),
                _mm256_set1_epi32(0xFF)
            )
        ),
        // pckHih: (69, 0, 66, 0 | 63, 0, 60, 0 | 57, 0, 54, 0 | 51, 0, 48, 0 || 21, 0, 18, 0 | 15, 0, 12, 0 | 9, 0, 6, 0 | 3, 0, 0, 0)
        _mm256_packus_epi16(
            // mskHL: (69, 0, 0, 0 | 66, 0, 0, 0 | 63, 0, 0, 0 | 60, 0, 0, 0 || 21, 0, 0, 0 | 18, 0, 0, 0 | 15, 0, 0, 0 | 12, 0, 0, 0)
            _mm256_and_si256(
                // gtrHL: (69, 72, 75, 78 | 66, 69, 72, 75 | 63, 66, 69, 72 | 60, 63, 66, 69 || 21, 24, 27, 30 | 18, 21, 24, 27 | 15, 18, 21, 24 | 12, 15, 18, 21)
                _mm256_i32gather_epi32(
                    (const int *) memAddr,
                    // higlow: (23, 0, 0, 0 | 22, 0, 0, 0 | 21, 0, 0, 0 | 20, 0, 0, 0 || 7, 0, 0, 0 | 6, 0, 0, 0 | 5, 0, 0, 0 | 4, 0, 0, 0)
                    _mm256_shuffle_epi8(idx, __m256i {
                        ~0xFF000000FFl | 0x0900000008, ~0xFF000000FFl | 0x0B0000000A,
                        ~0xFF000000FFl | 0x0900000008, ~0xFF000000FFl | 0x0B0000000A
                    }),
                    SCALE
                ),
                _mm256_set1_epi32(0xFF)
            ),
            // mskHH: (57, 0, 0, 0 | 54, 0, 0, 0 | 51, 0, 0, 0 | 48, 0, 0, 0 || 9, 0, 0, 0 | 6, 0, 0, 0 | 3, 0, 0, 0 | 0, 0, 0, 0)
            _mm256_and_si256(
                // gtrHH: (57, 60, 63, 66 | 54, 57, 60, 63 | 51, 54, 57, 60 | 48, 51, 54, 57 || 9, 12, 15, 18 | 6, 9, 12, 15 | 3, 6, 9, 12 | 0, 3, 6, 9)
                _mm256_i32gather_epi32(
                    (const int *) memAddr,
                    // highig: (19, 0, 0, 0 | 18, 0, 0, 0 | 17, 0, 0, 0 | 16, 0, 0, 0 || 3, 0, 0, 0 | 2, 0, 0, 0 | 1, 0, 0, 0 | 0, 0, 0, 0)
                    _mm256_shuffle_epi8(idx, __m256i {
                        ~0xFF000000FFl | 0x0D0000000C, ~0xFF000000FFl | 0x0F0000000E,
                        ~0xFF000000FFl | 0x0D0000000C, ~0xFF000000FFl | 0x0F0000000E
                    }),
                    SCALE
                ),
                _mm256_set1_epi32(0xFF)
            )
        )
    );
}
891
// Gather 16 16-bit elements from memAddr at the 16 word indices in idx.
// AVX2 has no 16-bit gather, so the indices are widened to dwords (low and
// high halves separately), gathered with a 32-bit gather, masked down to the
// element width of TValue, and re-packed to words with unsigned saturation.
// The `scale` tag carries the element stride in bytes as a compile-time value.
template <typename TValue, typename TSize, TSize SCALE>
inline __m256i
seqan_mm256_i16gather_epi(TValue const * memAddr,
                          __m256i const & idx,
                          std::integral_constant<TSize, SCALE> const & /*scale*/)
{
    using TUnsignedValue = typename MakeUnsigned<TValue>::Type;

    // The cast makes sure that the max value of TValue = (u)int64_t and
    // (u)int32_t will be max value of int16_t (i.e. `~0` in int16_t), because
    // the resulting __m256i can only hold int16_t values.
    //
    // NOTE(marehr): the masking is only needed for TValue = (u)int8_t and
    // (u)int16_t. It could be omitted if _mm256_packus_epi32 would be exchanged
    // by _mm256_packs_epi32, because for (u)int32_t and (u)int64_t the masking
    // operations are basically the identity function.
    constexpr int const mask = static_cast<uint16_t>(MaxValue<TUnsignedValue>::VALUE);

    // 1. Unpack low idx values and interleave with 0 and gather from memAddr.
    // 2. Unpack high idx values and interleave with 0, than gather from memAddr.
    // 3. Merge 2 8x32 vectors into 1x16 vector by signed saturation. This operation reverts the interleave by the unpack operations above.
    //
    // The following is an example for SimdVector<uint16_t, 16> idx and uint16_t
    // const * memAddr:
    // mem: ( 0, 0,  3, 0 |  6, 0,  9, 0 | 12, 0, 15, 0 | 18, 0, 21, 0 || 24, 0, 27, 0 | 30, 0, 33, 0 | 36, 0, 39, 0 | 42, 0, 45, 0)
    // idx: (15, 0, 14, 0 | 13, 0, 12, 0 | 11, 0, 10, 0 |  9, 0,  8, 0 ||  7, 0,  6, 0 |  5, 0,  4, 0 |  3, 0,  2, 0 |  1, 0,  0, 0)
    // pack: (45, 0, 42, 0 | 39, 0, 36, 0 | 33, 0, 30, 0 | 27, 0, 24, 0 || 21, 0, 18, 0 | 15, 0, 12, 0 |  9, 0,  6, 0 |  3, 0,  0, 0)
    return _mm256_packus_epi32(
        // mskLow: (45, 0, 0, 0 | 42, 0, 0, 0 | 39, 0, 0, 0 | 36, 0, 0, 0 || 21, 0, 0, 0 | 18, 0, 0, 0 | 15, 0, 0, 0 | 12, 0, 0, 0)
        _mm256_and_si256(
            // gtrLow: (45, 0, 15, 0 | 42, 0, 45, 0 | 39, 0, 42, 0 | 36, 0, 39, 0 || 21, 0, 24, 0 | 18, 0, 21, 0 | 15, 0, 18, 0 | 12, 0, 15, 0)
            _mm256_i32gather_epi32(
                (const int *) memAddr,
                // low: (15, 0, 0, 0 | 14, 0, 0, 0 | 13, 0, 0, 0 | 12, 0, 0, 0 || 7, 0, 0, 0 | 6, 0, 0, 0 | 5, 0, 0, 0 | 4, 0, 0, 0)
                _mm256_unpacklo_epi16(
                    idx, _mm256_set1_epi16(0)
                ),
                SCALE
            ),
            _mm256_set1_epi32(mask)
        ),
        // mskHih: (33, 0, 0, 0 | 30, 0, 0, 0 | 27, 0, 0, 0 | 24, 0, 0, 0 || 9, 0, 0, 0 | 6, 0, 0, 0 | 3, 0, 0, 0 | 0, 0, 0, 0)
        _mm256_and_si256(
            // gtrHih: (33, 0, 36, 0 | 30, 0, 33, 0 | 27, 0, 30, 0 | 24, 0, 27, 0 || 9, 0, 12, 0 | 6, 0, 9, 0 | 3, 0, 6, 0 | 0, 0, 3, 0)
            _mm256_i32gather_epi32(
                (const int *) memAddr,
                // high: (11, 0, 0, 0 | 10, 0, 0, 0 | 9, 0, 0, 0 | 8, 0, 0, 0 || 3, 0, 0, 0 | 2, 0, 0, 0 | 1, 0, 0, 0 | 0, 0, 0, 0)
                _mm256_unpackhi_epi16(
                    idx, _mm256_set1_epi16(0)
                ),
                SCALE
            ),
            _mm256_set1_epi32(mask)
        )
    );
}
948
949 template <typename TValue, typename TSize, TSize SCALE>
950 inline __m256i
seqan_mm256_i32gather_epi(TValue const * memAddr,__m256i const & idx,std::integral_constant<TSize,SCALE> const &)951 seqan_mm256_i32gather_epi(TValue const * memAddr,
952 __m256i const & idx,
953 std::integral_constant<TSize, SCALE> const & /*scale*/)
954 {
955 using TUnsignedValue = typename MakeUnsigned<TValue>::Type;
956 constexpr auto const mask = static_cast<uint32_t>(MaxValue<TUnsignedValue>::VALUE);
957
958 return _mm256_and_si256(
959 _mm256_i32gather_epi32((const int *) memAddr, idx, SCALE),
960 _mm256_set1_epi32(mask)
961 );
962 }
963
964 template <typename TValue, typename TSize, TSize SCALE>
965 inline __m256i
seqan_mm256_i64gather_epi(TValue const * memAddr,__m256i const & idx,std::integral_constant<TSize,SCALE> const &)966 seqan_mm256_i64gather_epi(TValue const * memAddr,
967 __m256i const & idx,
968 std::integral_constant<TSize, SCALE> const & /*scale*/)
969 {
970 using TUnsignedValue = typename MakeUnsigned<TValue>::Type;
971 constexpr auto const mask = static_cast<uint64_t>(MaxValue<TUnsignedValue>::VALUE);
972
973 return _mm256_and_si256(
974 _mm256_i64gather_epi64((const long long *) memAddr, idx, SCALE),
975 _mm256_set1_epi64x(mask)
976 );
977 }
978
// _gather dispatcher for 32 8-bit lanes: delegate to the emulated byte gather.
template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(TValue const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 32>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              seqan_mm256_i8gather_epi(
                                  memAddr,
                                  SEQAN_VECTOR_CAST_(__m256i const &, idx),
                                  scale
                              )
    );
}
994
// _gather dispatcher for 16 16-bit lanes, int8_t source: gather as unsigned
// bytes, then restore the sign in each 16-bit lane.
template <typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(int8_t const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 16>)
{
    // Note that memAddr is a signed integer type, thus a cast would extend the
    // sign. E.g., -3 = 253 in 8 bit, but would be 65533 in 16 bit.
    // Use _gather(uint8_t) and extend the sign to [u]int16_t.
    return SEQAN_VECTOR_CAST_(
        TSimdVector,
        seqan_mm256_i16sign_extend_epis8(
            seqan_mm256_i16gather_epi(
                memAddr,
                SEQAN_VECTOR_CAST_(__m256i const &, idx),
                scale
            )
        )
    );
}

// _gather dispatcher for 16 16-bit lanes, all other source types: no sign
// extension needed (the gather already masks to the element width).
template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(TValue const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 16>)
{
    return SEQAN_VECTOR_CAST_(
        TSimdVector,
        seqan_mm256_i16gather_epi(
            memAddr,
            SEQAN_VECTOR_CAST_(__m256i const &, idx),
            scale
        )
    );
}
1033
// _gather dispatcher for 8 32-bit lanes, int8_t source: gather unsigned,
// then sign-extend the 8-bit value in each 32-bit lane.
template <typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(int8_t const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 8>)
{
    // Note that memAddr is a signed integer type, thus a cast would extend the
    // sign.
    return SEQAN_VECTOR_CAST_(
        TSimdVector,
        seqan_mm256_i32sign_extend_epis8(
            seqan_mm256_i32gather_epi(
                memAddr,
                SEQAN_VECTOR_CAST_(__m256i const &, idx),
                scale
            )
        )
    );
}

// _gather dispatcher for 8 32-bit lanes, int16_t source: gather unsigned,
// then sign-extend the 16-bit value in each 32-bit lane.
template <typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(int16_t const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 8>)
{
    // Note that memAddr is a signed integer type, thus a cast would extend the
    // sign.
    return SEQAN_VECTOR_CAST_(
        TSimdVector,
        seqan_mm256_i32sign_extend_epis16(
            seqan_mm256_i32gather_epi(
                memAddr,
                SEQAN_VECTOR_CAST_(__m256i const &, idx),
                scale
            )
        )
    );
}

// _gather dispatcher for 8 32-bit lanes, all other source types.
template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(TValue const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 8>)
{
    return SEQAN_VECTOR_CAST_(
        TSimdVector,
        seqan_mm256_i32gather_epi(
            memAddr,
            SEQAN_VECTOR_CAST_(__m256i const &, idx),
            scale
        )
    );
}
1092
// _gather dispatcher for 4 64-bit lanes, int8_t source: gather unsigned,
// then sign-extend the 8-bit value in each 64-bit lane.
template <typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(int8_t const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 4>)
{
    return SEQAN_VECTOR_CAST_(
        TSimdVector,
        seqan_mm256_i64sign_extend_epis8(
            seqan_mm256_i64gather_epi(
                memAddr,
                SEQAN_VECTOR_CAST_(__m256i const &, idx),
                scale
            )
        )
    );
}

// _gather dispatcher for 4 64-bit lanes, int16_t source: gather unsigned,
// then sign-extend the 16-bit value in each 64-bit lane.
template <typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(int16_t const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 4>)
{
    return SEQAN_VECTOR_CAST_(
        TSimdVector,
        seqan_mm256_i64sign_extend_epis16(
            seqan_mm256_i64gather_epi(
                memAddr,
                SEQAN_VECTOR_CAST_(__m256i const &, idx),
                scale
            )
        )
    );
}

// _gather dispatcher for 4 64-bit lanes, int32_t source: gather unsigned,
// then sign-extend the 32-bit value in each 64-bit lane.
template <typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(int32_t const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 4>)
{
    return SEQAN_VECTOR_CAST_(
        TSimdVector,
        seqan_mm256_i64sign_extend_epis32(
            seqan_mm256_i64gather_epi(
                memAddr,
                SEQAN_VECTOR_CAST_(__m256i const &, idx),
                scale
            )
        )
    );
}

// _gather dispatcher for 4 64-bit lanes, all other source types.
template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE>
inline TSimdVector
_gather(TValue const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & scale,
        SimdParams_<32, 4>)
{
    return SEQAN_VECTOR_CAST_(
        TSimdVector,
        seqan_mm256_i64gather_epi(
            memAddr,
            SEQAN_VECTOR_CAST_(__m256i const &, idx),
            scale
        )
    );
}
1166
1167 // --------------------------------------------------------------------------
1168 // _shuffleVector (256bit)
1169 // --------------------------------------------------------------------------
1170
// Full 32-byte shuffle: r[i] = vector[indices[i]] for byte indices 0..31.
// _mm256_shuffle_epi8 only shuffles within each 128-bit lane, so the low and
// high 16 bytes of `vector` are shuffled separately (each broadcast to both
// lanes) and the two partial results are combined with XOR — every output
// byte is non-zero in at most one of the two partial results.
inline __m256i
seqan_m256_shuffle_epi8(__m256i const & vector, __m256i const & indices)
{
    return _mm256_xor_si256(
        // shuffle bytes from the lower bytes of vector
        _mm256_shuffle_epi8(
            // repeat twice the low bytes of vector in a new __m256i vector i.e.
            // vh[127:0]   = v[127:0]
            // vh[255:128] = v[127:0]
            _mm256_broadcastsi128_si256(
                _mm256_extracti128_si256(vector, 0)
            ),
            // ((indices[i] << 3) & 0b1000 0000) ^ indices[i]:
            // Adds the 5th bit of indices[i] as most significant bit. If the
            // 5th bit is set, that means that indices[i] >= 16.
            // r = _mm256_shuffle_epi8(vl, indices) will set r[i] = 0 if the
            // most significant bit of indices[i] is 1. Since this bit is the
            // 5th bit, r[i] = 0 if indices[i] >= 16 and r[i] = vl[indices[i]]
            // if indices[i] < 16.
            _mm256_xor_si256(
                _mm256_and_si256(
                    _mm256_slli_epi16(indices, 3),
                    // 0b1000 0000: keep only the MSB of the shifted index.
                    // FIX: was _mm256_set1_epi8(-127), i.e. 0x81, which also
                    // kept bit 0 and did not match the intended 0x80 mask.
                    _mm256_set1_epi8(static_cast<char>(0x80))
                ),
                indices
            )
        ),
        // shuffle bytes from the higher bytes of vector
        _mm256_shuffle_epi8(
            // repeat twice the higher bytes of vector in a new __m256i vector
            // i.e.
            // vh[127:0]   = v[255:128]
            // vh[255:128] = v[255:128]
            _mm256_broadcastsi128_si256(
                _mm256_extracti128_si256(vector, 1)
            ),
            // indices[i] - 16:
            // r = _mm256_shuffle_epi8(vh, indices)
            // will return r[i] = 0 if the most significant bit of the byte
            // indices[i] is 1. Thus, indices[i] - 16 will select all high
            // bytes in vh, i.e. r[i] = vh[indices[i] - 16], if indices[i] >=
            // 16 and r[i] = 0 if indices[i] < 16.
            _mm256_sub_epi8(
                indices,
                _mm256_set1_epi8(16)
            )
        )
    );
}
1220
// Shuffle 16 16-bit words of `a` by the word indices packed in the low bytes
// of `b`: each word index i is expanded to the byte-index pair (2i, 2i+1) and
// the byte shuffle above does the rest.
inline __m256i
seqan_m256_shuffle_epi16(const __m256i a, const __m256i b)
{
    // 0b01010000 duplicates qword0 of b into lanes 0-1 and qword1 into
    // lanes 2-3, so both 128-bit lanes see the needed index bytes; the
    // epi16 shift by 1 multiplies each index by 2.
    __m256i idx = _mm256_slli_epi16(
        _mm256_permute4x64_epi64(b, 0b01010000),
        1
    );
    return seqan_m256_shuffle_epi8(
        a,
        // interleave idx[15:0]   = 2*indices[15],   ..., 2*indices[0]
        // with       idx[15:0]+1 = 2*indices[15]+1, ..., 2*indices[0]+1
        // => 2*indices[15]+1, 2*indices[15], ..., 2*indices[0]+1, 2*indices[0]
        _mm256_unpacklo_epi8(
            idx,
            _mm256_add_epi8(idx, _mm256_set1_epi8(1))
        )
    );
}
1245
// Shuffle 8 32-bit dwords of `a` by the dword indices packed in the low bytes
// of `b`: each index i is expanded to the byte quadruple (4i, 4i+1, 4i+2,
// 4i+3) and delegated to the byte shuffle.
inline __m256i
seqan_m256_shuffle_epi32(const __m256i a, const __m256i b)
{
    // broadcast the index bytes of b into both 128-bit lanes, then
    // multiply by 4 via an epi16 shift by 2
    __m256i idx = _mm256_slli_epi16(
        _mm256_permutevar8x32_epi32(b, __m256i {0x0, 0x0, 0x1, 0x0}),
        2
    );
    return seqan_m256_shuffle_epi8(
        a,
        // interleave 4*indices[7]+1, 4*indices[7]+0; ..., 4*indices[0]+1, 4*indices[0]+0
        // with       4*indices[7]+3, 4*indices[7]+2; ..., 4*indices[0]+3, 4*indices[0]+2
        // => 4*indices[7]+3, 4*indices[7]+2; 4*indices[7]+1, 4*indices[7]+0;
        //    ...
        //    4*indices[0]+3, 4*indices[0]+2; 4*indices[0]+1, 4*indices[0]+0
        _mm256_unpacklo_epi16(
            // interleave idx[7:0]+0 = 4*indices[7]+0; ...; 4*indices[0]+0
            // with       idx[7:0]+1 = 4*indices[7]+1; ...; 4*indices[0]+1
            // => 4*indices[7]+1; 4*indices[7]+0; ...; 4*indices[0]+1; 4*indices[0]+0
            _mm256_unpacklo_epi8(
                idx,
                _mm256_add_epi8(idx, _mm256_set1_epi8(1))
            ),
            // interleave idx[7:0]+2 = 4*indices[7]+2; ...; 4*indices[0]+2
            // with       idx[7:0]+3 = 4*indices[7]+3; ...; 4*indices[0]+3
            // => 4*indices[7]+3; 4*indices[7]+2; ...; 4*indices[0]+3; 4*indices[0]+2
            _mm256_unpacklo_epi8(
                _mm256_add_epi8(idx, _mm256_set1_epi8(2)),
                _mm256_add_epi8(idx, _mm256_set1_epi8(3))
            )
        ));
}
1278
// Build a __m256i from two __m128i halves: v1 becomes the low lane, v0 the
// high lane (emulates _mm256_set_m128i, missing from some older compilers).
#define seqan_mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)

// Shuffle 4 64-bit qwords of `a` by the qword indices packed in the low bytes
// of `b`: each index i is expanded to the eight byte indices 8i..8i+7 and
// delegated to the byte shuffle.
inline __m256i
seqan_m256_shuffle_epi64(const __m256i a, const __m256i b)
{
    __m128i lowidx = _mm256_extracti128_si256(
        // multiply by 8
        _mm256_slli_epi16(b, 3),
        0
    );

    // low lane sees index bytes 0-1, high lane the bytes shifted down by 2,
    // so each lane finds the two indices it has to expand
    __m256i idx = seqan_mm256_set_m128i(
        _mm_srli_si128(lowidx, 2),
        lowidx
    );

    return seqan_m256_shuffle_epi8(
        a,
        _mm256_unpacklo_epi32(
            // interleave 8*indices[3]+1, 8*indices[3]+0; ..., 8*indices[0]+1, 8*indices[0]+0
            // with       8*indices[3]+3, 8*indices[3]+2; ..., 8*indices[0]+3, 8*indices[0]+2
            // => 8*indices[3]+3, 8*indices[3]+2; 8*indices[3]+1, 8*indices[3]+0;
            //    ...
            //    8*indices[0]+3, 8*indices[0]+2; 8*indices[0]+1, 8*indices[0]+0
            _mm256_unpacklo_epi16(
                // interleave idx[3:0]+0 = 8*indices[3]+0; ...; 8*indices[0]+0
                // with       idx[3:0]+1 = 8*indices[3]+1; ...; 8*indices[0]+1
                // => 8*indices[3]+1; 8*indices[3]+0; ...; 8*indices[0]+1; 8*indices[0]+0
                _mm256_unpacklo_epi8(
                    idx,
                    _mm256_add_epi8(idx, _mm256_set1_epi8(1))
                ),
                // interleave idx[3:0]+2 = 8*indices[3]+2; ...; 8*indices[0]+2
                // with       idx[3:0]+3 = 8*indices[3]+3; ...; 8*indices[0]+3
                // => 8*indices[3]+3; 8*indices[3]+2; ...; 8*indices[0]+3; 8*indices[0]+2
                _mm256_unpacklo_epi8(
                    _mm256_add_epi8(idx, _mm256_set1_epi8(2)),
                    _mm256_add_epi8(idx, _mm256_set1_epi8(3))
                )
            ),
            // interleave 8*indices[3]+5, 8*indices[3]+4; ..., 8*indices[0]+5, 8*indices[0]+4
            // with       8*indices[3]+7, 8*indices[3]+6; ..., 8*indices[0]+7, 8*indices[0]+6
            // => 8*indices[3]+7, 8*indices[3]+6; 8*indices[3]+5, 8*indices[3]+4;
            //    ...
            //    8*indices[0]+7, 8*indices[0]+6; 8*indices[0]+5, 8*indices[0]+4
            _mm256_unpacklo_epi16(
                // interleave idx[3:0]+4 = 8*indices[3]+4; ...; 8*indices[0]+4
                // with       idx[3:0]+5 = 8*indices[3]+5; ...; 8*indices[0]+5
                // => 8*indices[3]+5; 8*indices[3]+4; ...; 8*indices[0]+5; 8*indices[0]+4
                _mm256_unpacklo_epi8(
                    _mm256_add_epi8(idx, _mm256_set1_epi8(4)),
                    _mm256_add_epi8(idx, _mm256_set1_epi8(5))
                ),
                // interleave idx[3:0]+6 = 8*indices[3]+6; ...; 8*indices[0]+6
                // with       idx[3:0]+7 = 8*indices[3]+7; ...; 8*indices[0]+7
                // => 8*indices[3]+7; 8*indices[3]+6; ...; 8*indices[0]+7; 8*indices[0]+6
                _mm256_unpacklo_epi8(
                    _mm256_add_epi8(idx, _mm256_set1_epi8(6)),
                    _mm256_add_epi8(idx, _mm256_set1_epi8(7))
                )
            )
        )
    );
}
1343
// Shuffle a 16x16-bit vector by a 16x8-bit index vector. All indices live in
// one 128-bit register here, so a plain per-lane _mm256_shuffle_epi8 suffices
// once the index bytes are duplicated into both lanes.
template <typename TSimdVector1, typename TSimdVector2>
inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 16>, SimdParams_<16, 16>)
{
    // copy 2nd 64bit word to 3rd, compute 2*idx
    __m256i idx = _mm256_slli_epi16(_mm256_permute4x64_epi64(_mm256_castsi128_si256(SEQAN_VECTOR_CAST_(const __m128i &, indices)), 0x50), 1);

    // interleave with 2*idx+1 and call shuffle
    return SEQAN_VECTOR_CAST_(TSimdVector1,
                              _mm256_shuffle_epi8(
                                  SEQAN_VECTOR_CAST_(const __m256i &, vector),
                                  _mm256_unpacklo_epi8(
                                      idx,
                                      _mm256_add_epi8(
                                          idx, _mm256_set1_epi8(1)
                                      )
                                  )
                              )
    );
}
1364
// _shuffleVector dispatchers: select the lane-width-specific shuffle helper
// based on the element count encoded in SimdParams_.

// 32 x 8-bit elements.
template <typename TSimdVector1, typename TSimdVector2>
inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 32>, SimdParams_<32, 32>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector1, seqan_m256_shuffle_epi8(
        SEQAN_VECTOR_CAST_(const __m256i &, vector),
        SEQAN_VECTOR_CAST_(const __m256i &, indices)
    ));
}

// 16 x 16-bit elements.
template <typename TSimdVector1, typename TSimdVector2>
inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 16>, SimdParams_<32, 32>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector1, seqan_m256_shuffle_epi16(
        SEQAN_VECTOR_CAST_(const __m256i &, vector),
        SEQAN_VECTOR_CAST_(const __m256i &, indices)
    ));
}

// 8 x 32-bit elements.
template <typename TSimdVector1, typename TSimdVector2>
inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 8>, SimdParams_<32, 32>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector1, seqan_m256_shuffle_epi32(
        SEQAN_VECTOR_CAST_(const __m256i &, vector),
        SEQAN_VECTOR_CAST_(const __m256i &, indices)
    ));
}

// 4 x 64-bit elements.
template <typename TSimdVector1, typename TSimdVector2>
inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 4>, SimdParams_<32, 32>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector1, seqan_m256_shuffle_epi64(
        SEQAN_VECTOR_CAST_(const __m256i &, vector),
        SEQAN_VECTOR_CAST_(const __m256i &, indices)
    ));
}
1404
1405 // --------------------------------------------------------------------------
1406 // _transposeMatrix (256bit)
1407 // --------------------------------------------------------------------------
1408
// emulate missing _mm256_unpacklo_epi128/_mm256_unpackhi_epi128 instructions

// result = (a[127:0], b[127:0]); permute control 0x20 selects lane 0 of a and
// lane 0 of b.
inline __m256i _mm256_unpacklo_epi128(__m256i const & a, __m256i const & b)
{
    return _mm256_permute2x128_si256(a, b, 0x20);
    // return _mm256_inserti128_si256(a, _mm256_extracti128_si256(b, 0), 1);
}

// result = (a[255:128], b[255:128]); permute control 0x31 selects lane 1 of a
// and lane 1 of b.
inline __m256i _mm256_unpackhi_epi128(__m256i const & a, __m256i const & b)
{
    return _mm256_permute2x128_si256(a, b, 0x31);
    // return _mm256_inserti128_si256(b, _mm256_extracti128_si256(a, 1), 0);
}
1421
// Transpose a 32x32 byte matrix in place, one row per 256-bit vector.
// Classic log-step transpose: five rounds of pairwise interleaving at
// doubling granularity (8 -> 16 -> 32 -> 64 -> 128 bits); the final row
// permutation introduced by the interleave order is undone via a
// bit-reversal lookup table.
template <typename TSimdVector>
inline void
_transposeMatrix(TSimdVector matrix[], SimdMatrixParams_<32, 32, 8>)
{
    // we need a look-up table to reverse the lowest 4 bits
    // in order to place the permute the transposed rows
    static const unsigned char bitRev[] = { 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15,
                                           16,24,20,28,18,26,22,30,17,25,21,29,19,27,23,31};

    // transpose a 32x32 byte matrix
    __m256i tmp1[32];
    // round 1: interleave adjacent rows at byte granularity
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i] = _mm256_unpacklo_epi8(
            SEQAN_VECTOR_CAST_(const __m256i &, matrix[2*i]),
            SEQAN_VECTOR_CAST_(const __m256i &, matrix[2*i+1])
        );
        tmp1[i+16] = _mm256_unpackhi_epi8(
            SEQAN_VECTOR_CAST_(const __m256i &, matrix[2*i]),
            SEQAN_VECTOR_CAST_(const __m256i &, matrix[2*i+1])
        );
    }
    __m256i tmp2[32];
    // round 2: 16-bit granularity
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i]    = _mm256_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+16] = _mm256_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
    }
    // round 3: 32-bit granularity
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i]    = _mm256_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
        tmp1[i+16] = _mm256_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
    }
    // round 4: 64-bit granularity
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i]    = _mm256_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+16] = _mm256_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]);
    }
    // round 5: 128-bit granularity, writing rows to their bit-reversed slots
    for (int i = 0; i < 16; ++i)
    {
        matrix[bitRev[i]]    = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_unpacklo_epi128(tmp2[2*i],tmp2[2*i+1]));
        matrix[bitRev[i+16]] = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_unpackhi_epi128(tmp2[2*i],tmp2[2*i+1]));
    }
}

// --------------------------------------------------------------------------
// Function _testAllZeros (256bit)
// --------------------------------------------------------------------------

1471 template <typename TSimdVector>
SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector>>,int)1472 SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector> >, int)
1473 inline _testAllZeros(TSimdVector const & vector, TSimdVector const & mask, SimdParams_<32>)
1474 {
1475 return _mm256_testz_si256(SEQAN_VECTOR_CAST_(const __m256i &, vector),
1476 SEQAN_VECTOR_CAST_(const __m256i &, mask));
1477 }

// --------------------------------------------------------------------------
// Function _testAllOnes (256bit)
// --------------------------------------------------------------------------

1483 template <typename TSimdVector>
_testAllOnes(TSimdVector const & vector,SimdParams_<32>)1484 inline int _testAllOnes(TSimdVector const & vector, SimdParams_<32>)
1485 {
1486 __m256i vec = SEQAN_VECTOR_CAST_(const __m256i &, vector);
1487 return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec));
1488 }
1489
1490 } // namespace seqan
1491
1492 #endif // SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_AVX2_H_
1493