1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 //         René Rahn <rene.rahn@fu-berlin.de>
34 //         Stefan Budach <stefan.budach@fu-berlin.de>
35 // ==========================================================================
36 // generic SIMD interface for SSE3 / AVX2
37 // ==========================================================================
38 
39 #ifndef SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_AVX2_H_
40 #define SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_AVX2_H_
41 
42 namespace seqan {
43 
// Instantiations of SeqAn's generic SIMD vector wrapper for every element
// type that fits a 256bit AVX2 register. The trailing argument is the total
// vector size in bytes (32); the element count follows from the element type.

// SimdParams_<32, 32>: 256bit = 32 elements * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector32Char,     char,           32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector32SChar,    signed char,    32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector32UChar,    unsigned char,  32)

// SimdParams_<32, 16>: 256bit = 16 elements * 2 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16Short,    short,          32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16UShort,   unsigned short, 32)

// SimdParams_<32, 8>: 256bit = 8 elements * 4 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8Int,       int,            32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8UInt,      unsigned int,   32)

// SimdParams_<32, 4>: 256bit = 4 elements * 8 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4Int64,     int64_t,        32)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4UInt64,    uint64_t,       32)
60 
61 // ============================================================================
62 // Functions
63 // ============================================================================
64 
65 // ============================================================================
66 // AVX/AVX2 wrappers (256bit vectors)
67 // ============================================================================
68 
69 // --------------------------------------------------------------------------
70 // _fillVector (256bit)
71 // --------------------------------------------------------------------------
72 
// Broadcast overloads of _fillVector: selected when the caller supplied a
// single scalar (index_sequence<0>), which is replicated into every lane of
// the 256bit vector via the matching _mm256_set1_* intrinsic.

template <typename TSimdVector, typename ...TValue>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & x,
            std::index_sequence<0> const &, SimdParams_<32, 32>)
{
    // 32 lanes of 8bit: splat the scalar into every byte.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi8(std::get<0>(x)));
}

template <typename TSimdVector, typename ...TValue>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & x,
            std::index_sequence<0> const &, SimdParams_<32, 16>)
{
    // 16 lanes of 16bit.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi16(std::get<0>(x)));
}

template <typename TSimdVector, typename ...TValue>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & x,
            std::index_sequence<0> const &, SimdParams_<32, 8>)
{
    // 8 lanes of 32bit.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi32(std::get<0>(x)));
}

template <typename TSimdVector, typename ...TValue>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & x,
            std::index_sequence<0> const &, SimdParams_<32, 4>)
{
    // 4 lanes of 64bit.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi64x(std::get<0>(x)));
}
108 
// Element-wise overloads of _fillVector: one tuple element per lane, expanded
// through the INDICES parameter pack into the matching _mm256_setr_* intrinsic
// (setr = "set reversed", i.e. argument order equals lane order).

template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args, std::index_sequence<INDICES...> const &, SimdParams_<32, 32>)
{
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_setr_epi8(std::get<INDICES>(args)...));
}

template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args, std::index_sequence<INDICES...> const &, SimdParams_<32, 16>)
{
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_setr_epi16(std::get<INDICES>(args)...));
}
template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args, std::index_sequence<INDICES...> const &, SimdParams_<32, 8>)
{
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_setr_epi32(std::get<INDICES>(args)...));
}

template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args, std::index_sequence<INDICES...> const &, SimdParams_<32, 4>)
{
    // reverse argument list 0, 1, 2, 3 -> 3, 2, 1, 0
    // NOTE(marehr): Intel linux fails to reverse argument list and only
    // _mm256_set_epi64x has no reverse equivalent
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set_epi64x(std::get<sizeof...(INDICES) - 1 - INDICES>(args)...));
}
142 
143 // --------------------------------------------------------------------------
144 // _clearVector (256bit)
145 // --------------------------------------------------------------------------
146 
// Sets all bits of the vector to zero; one overload serves every lane count L.
template <typename TSimdVector, int L>
inline void _clearVector(TSimdVector & vector, SimdParams_<32, L>)
{
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_setzero_si256());
}
152 
153 // --------------------------------------------------------------------------
154 // _createVector (256bit)
155 // --------------------------------------------------------------------------
156 
// Returns a new vector with the scalar x broadcast into every lane
// (value-returning counterpart of the broadcast _fillVector overloads).

template <typename TSimdVector, typename TValue>
inline TSimdVector _createVector(TValue const x, SimdParams_<32, 32>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi8(x));
}

template <typename TSimdVector, typename TValue>
inline TSimdVector _createVector(TValue const x, SimdParams_<32, 16>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi16(x));
}

template <typename TSimdVector, typename TValue>
inline TSimdVector _createVector(TValue const x, SimdParams_<32, 8>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi32(x));
}

template <typename TSimdVector, typename TValue>
inline TSimdVector _createVector(TValue const x, SimdParams_< 32, 4>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_set1_epi64x(x));
}
180 
181 // --------------------------------------------------------------------------
182 // _cmpEq (256bit)
183 // --------------------------------------------------------------------------
184 
// Lane-wise equality comparison; each lane of the result is all-ones where
// the lanes of a and b are equal and all-zeros otherwise (SIMD mask).

template <typename TSimdVector>
inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpeq_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                             SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpeq_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpeq_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpeq_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}
212 
213 // --------------------------------------------------------------------------
214 // _cmpGt (256bit)
215 // --------------------------------------------------------------------------
216 
217 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,32,int8_t>)218 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, int8_t>)
219 {
220     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpgt_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
221                                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
222 }
223 
224 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,32,uint8_t>)225 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, uint8_t>)
226 {
227     // There is no unsigned cmpgt, we reduce it to the signed case.
228     // Note that 0x80 = ~0x7F (prevent overflow messages).
229     return SEQAN_VECTOR_CAST_(TSimdVector,
230                               _mm256_cmpgt_epi8(
231                                   _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_set1_epi8(~0x7F)),
232                                   _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, b), _mm256_set1_epi8(~0x7F))));
233 }
234 
235 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,16,int16_t>)236 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, int16_t>)
237 {
238     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpgt_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
239                                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
240 }
241 
242 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,16,uint16_t>)243 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, uint16_t>)
244 {
245     // There is no unsigned cmpgt, we reduce it to the signed case.
246     // Note that 0x8000 = ~0x7FFF (prevent overflow messages).
247     return SEQAN_VECTOR_CAST_(TSimdVector,
248                               _mm256_cmpgt_epi16(
249                                   _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_set1_epi16(~0x7FFF)),
250                                   _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, b), _mm256_set1_epi16(~0x7FFF))));
251 }
252 
253 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,8,int32_t>)254 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, int32_t>)
255 {
256     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpgt_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
257                                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
258 }
259 
260 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,8,uint32_t>)261 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, uint32_t>)
262 {
263     // There is no unsigned cmpgt, we reduce it to the signed case.
264     // Note that 0x80000000 = ~0x7FFFFFFF (prevent overflow messages).
265     return SEQAN_VECTOR_CAST_(TSimdVector,
266                               _mm256_cmpgt_epi32(
267                                   _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_set1_epi32(~0x7FFFFFFF)),
268                                   _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, b), _mm256_set1_epi32(~0x7FFFFFFF))));
269 }
270 
271 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,4,int64_t>)272 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, int64_t>)
273 {
274     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_cmpgt_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
275                                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
276 }
277 
278 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,4,uint64_t>)279 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, uint64_t>)
280 {
281     // There is no unsigned cmpgt, we reduce it to the signed case.
282     // Note that 0x8000000000000000ul = ~0x7FFFFFFFFFFFFFFFul (prevent overflow messages).
283     return SEQAN_VECTOR_CAST_(TSimdVector,
284                               _mm256_cmpgt_epi64(
285                                   _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, a) ,_mm256_set1_epi64x(~0x7FFFFFFFFFFFFFFFul)),
286                                   _mm256_xor_si256(SEQAN_VECTOR_CAST_(const __m256i&, b), _mm256_set1_epi64x(~0x7FFFFFFFFFFFFFFFul))));
287 }
288 
289 // --------------------------------------------------------------------------
290 // _bitwiseOr (256bit)
291 // --------------------------------------------------------------------------
292 
// Bitwise OR of the two 256bit registers; lane count L is irrelevant.
template <typename TSimdVector, int L>
inline TSimdVector _bitwiseOr(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, L>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_or_si256(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                           SEQAN_VECTOR_CAST_(const __m256i&, b)));
}
299 
300 // --------------------------------------------------------------------------
301 // _bitwiseAnd (256bit)
302 // --------------------------------------------------------------------------
303 
// Bitwise AND of the two 256bit registers; lane count L is irrelevant.
template <typename TSimdVector, int L>
inline TSimdVector _bitwiseAnd(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, L>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_and_si256(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                            SEQAN_VECTOR_CAST_(const __m256i&, b)));
}
310 
311 // --------------------------------------------------------------------------
312 // _bitwiseAndNot (256bit)
313 // --------------------------------------------------------------------------
314 
// Computes (~a) & b -- note the operand order: _mm256_andnot_si256 negates
// its FIRST argument, so the first parameter is the one being complemented.
template <typename TSimdVector, int L>
inline TSimdVector _bitwiseAndNot(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, L>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_andnot_si256(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}
321 
322 // --------------------------------------------------------------------------
323 // _bitwiseNot (256bit)
324 // --------------------------------------------------------------------------
325 
// Despite the name, these overloads compute a lane-wise compare-equal-to-zero:
// each lane becomes all-ones iff the input lane is 0, and all-zeros otherwise.
// For SIMD mask vectors (lanes already all-ones/all-zeros) this equals the
// bitwise complement; for arbitrary data it is a logical NOT per lane.
// NOTE(review): presumably only mask vectors are passed here -- verify at the
// call sites before using this on general data.

template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<32, 32>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpeq_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_setzero_si256()));
}

template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<32, 16>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpeq_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_setzero_si256()));
}

template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<32, 8>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpeq_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_setzero_si256()));

}
template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<32, 4>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_cmpeq_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a), _mm256_setzero_si256()));
}
353 
354 // --------------------------------------------------------------------------
355 // _divide (256bit)
356 // --------------------------------------------------------------------------
357 
358 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<32,32>)359 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<32, 32>)
360 {
361     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_div_epi8(a, _mm256_set1_epi8(b)));
362 }
363 
364 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<32,16>)365 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<32, 16>)
366 {
367     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_div_epi16(a, _mm256_set1_epi16(b)));
368 }
369 
370 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<32,8>)371 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<32, 8>)
372 {
373     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_div_epi32(a, _mm256_set1_epi32(b)));
374 }
375 
376 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<32,4>)377 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<32, 4>)
378 {
379     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_div_epi64(a, _mm256_set1_epi64x(b)));
380 }
381 
382 // --------------------------------------------------------------------------
383 // _add (256bit)
384 // --------------------------------------------------------------------------
385 
386 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,32>)387 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32>)
388 {
389     return SEQAN_VECTOR_CAST_(TSimdVector,
390                               _mm256_add_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
391                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
392 }
393 
394 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,16>)395 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16>)
396 {
397     return SEQAN_VECTOR_CAST_(TSimdVector,
398                               _mm256_add_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
399                                                SEQAN_VECTOR_CAST_(const __m256i&, b)));
400 }
401 
402 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,8>)403 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8>)
404 {
405     return SEQAN_VECTOR_CAST_(TSimdVector,
406                               _mm256_add_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
407                                                SEQAN_VECTOR_CAST_(const __m256i&, b)));
408 }
409 
410 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,4>)411 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4>)
412 {
413     return SEQAN_VECTOR_CAST_(TSimdVector,
414                               _mm256_add_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
415                                                SEQAN_VECTOR_CAST_(const __m256i&, b)));
416 }
417 
418 // --------------------------------------------------------------------------
419 // _sub (256bit)
420 // --------------------------------------------------------------------------
421 
// Lane-wise wrapping integer subtraction (a - b), one overload per lane width.

template <typename TSimdVector>
inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_sub_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_sub_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_sub_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_sub_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}
453 
454 // --------------------------------------------------------------------------
455 // _mult (256bit)
456 // --------------------------------------------------------------------------
457 
// Lane-wise multiplication keeping the low half of each product. AVX2 has no
// 8bit or 64bit low-multiply, so those overloads deliberately fail with an
// assertion (and SEQAN_SKIP_TEST) instead of computing a wrong result.

template <typename TSimdVector>
inline TSimdVector _mult(TSimdVector const & a, TSimdVector const &/*b*/, SimdParams_<32, 32>)
{
    SEQAN_SKIP_TEST;
    SEQAN_ASSERT_FAIL("AVX2 intrinsics for multiplying 8 bit values not implemented!");
    return a;
}

template <typename TSimdVector>
inline TSimdVector _mult(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_mullo_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                 SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _mult(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_mullo_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                 SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _mult(TSimdVector const & a, TSimdVector const &/*b*/, SimdParams_<32, 4>)
{
    SEQAN_SKIP_TEST;
    SEQAN_ASSERT_FAIL("AVX2 intrinsics for multiplying 64 bit values not implemented!");
    return a;
}
489 
490 // --------------------------------------------------------------------------
491 // _max (256bit)
492 // --------------------------------------------------------------------------
493 
// Lane-wise maximum; separate overloads select the signed (epi) or unsigned
// (epu) intrinsic based on the element type in SimdParams_.

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, int8_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, uint8_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epu8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, int16_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, uint16_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epu16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, int32_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, uint32_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_max_epu32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}
541 
// 64bit lane-wise maximum: AVX2 itself has no 64bit max, so the native
// _mm256_max_ep{i,u}64 is used only when AVX512VL is available; otherwise the
// result is composed from cmpGt + blend (select a where a > b, else b).

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, int64_t>)
{
    #if defined(__AVX512VL__)
        return SEQAN_VECTOR_CAST_(TSimdVector,
                                  _mm256_max_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                   SEQAN_VECTOR_CAST_(const __m256i&, b)));
    #else // defined(__AVX512VL__)
        return blend(b, a, cmpGt(a, b));
    #endif // defined(__AVX512VL__)
}

template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, uint64_t>)
{
    #if defined(__AVX512VL__)
        return SEQAN_VECTOR_CAST_(TSimdVector,
                                  _mm256_max_epu64(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                                   SEQAN_VECTOR_CAST_(const __m256i&, b)));
    #else // defined(__AVX512VL__)
        return blend(b, a, cmpGt(a, b));
    #endif // defined(__AVX512VL__)
}
565 
566 
567 // --------------------------------------------------------------------------
568 // _min (256bit)
569 // --------------------------------------------------------------------------
570 
// Lane-wise minimum; separate overloads select the signed (epi) or unsigned
// (epu) intrinsic based on the element type in SimdParams_.

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, int8_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epi8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 32, uint8_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epu8(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                              SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, int16_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epi16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 16, uint16_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epu16(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}

template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, int32_t>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm256_min_epi32(SEQAN_VECTOR_CAST_(const __m256i&, a),
                                               SEQAN_VECTOR_CAST_(const __m256i&, b)));
}
610 
611 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,8,uint32_t>)612 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 8, uint32_t>)
613 {
614     return SEQAN_VECTOR_CAST_(TSimdVector,
615                               _mm256_min_epu32(SEQAN_VECTOR_CAST_(const __m256i&, a),
616                                                SEQAN_VECTOR_CAST_(const __m256i&, b)));
617 }
618 
619 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,4,int64_t>)620 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, int64_t>)
621 {
622     #if defined(__AVX512VL__)
623         return SEQAN_VECTOR_CAST_(TSimdVector,
624                                   _mm256_min_epi64(SEQAN_VECTOR_CAST_(const __m256i&, a),
625                                                    SEQAN_VECTOR_CAST_(const __m256i&, b)));
626     #else // defined(__AVX512VL__)
627         return blend(a, b, cmpGt(a, b));
628     #endif // defined(__AVX512VL__)
629 }
630 
631 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<32,4,uint64_t>)632 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<32, 4, uint64_t>)
633 {
634     #if defined(__AVX512VL__)
635         return SEQAN_VECTOR_CAST_(TSimdVector,
636                                   _mm256_min_epu64(SEQAN_VECTOR_CAST_(const __m256i&, a),
637                                                    SEQAN_VECTOR_CAST_(const __m256i&, b)));
638     #else // defined(__AVX512VL__)
639         return blend(a, b, cmpGt(a, b));
640     #endif // defined(__AVX512VL__)
641 }
642 
643 // --------------------------------------------------------------------------
644 // _blend (256bit)
645 // --------------------------------------------------------------------------
646 
647 template <typename TSimdVector, typename TSimdVectorMask, int L>
_blend(TSimdVector const & a,TSimdVector const & b,TSimdVectorMask const & mask,SimdParams_<32,L>)648 inline TSimdVector _blend(TSimdVector const & a, TSimdVector const & b, TSimdVectorMask const & mask, SimdParams_<32, L>)
649 {
650     return SEQAN_VECTOR_CAST_(TSimdVector,
651                               _mm256_blendv_epi8(SEQAN_VECTOR_CAST_(const __m256i &, a),
652                                                  SEQAN_VECTOR_CAST_(const __m256i &, b),
653                                                  SEQAN_VECTOR_CAST_(const __m256i &, mask)));
654 }
655 
656 // --------------------------------------------------------------------------
657 // _storeu (256bit)
658 // --------------------------------------------------------------------------
659 
660 template <typename T, typename TSimdVector, int L>
_storeu(T * memAddr,TSimdVector const & vec,SimdParams_<32,L>)661 inline void _storeu(T * memAddr, TSimdVector const & vec, SimdParams_<32, L>)
662 {
663     _mm256_storeu_si256((__m256i*)memAddr, SEQAN_VECTOR_CAST_(const __m256i&, vec));
664 }
665 
666 // ----------------------------------------------------------------------------
667 // Function _load() 256bit
668 // ----------------------------------------------------------------------------
669 
670 template <typename TSimdVector, typename T, int L>
_load(T const * memAddr,SimdParams_<32,L>)671 inline TSimdVector _load(T const * memAddr, SimdParams_<32, L>)
672 {
673     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_load_si256((__m256i const *) memAddr));
674 }
675 
676 // --------------------------------------------------------------------------
677 // _shiftRightLogical (256bit)
678 // --------------------------------------------------------------------------
679 
680 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<32,32>)681 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<32, 32>)
682 {
683     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_srli_epi16(SEQAN_VECTOR_CAST_(const __m256i &, vector), imm) & _mm256_set1_epi8(0xff >> imm));
684 }
685 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<32,16>)686 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<32, 16>)
687 {
688     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_srli_epi16(SEQAN_VECTOR_CAST_(const __m256i &, vector), imm));
689 }
690 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<32,8>)691 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<32, 8>)
692 {
693     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_srli_epi32(SEQAN_VECTOR_CAST_(const __m256i &, vector), imm));
694 }
695 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<32,4>)696 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<32, 4>)
697 {
698     return SEQAN_VECTOR_CAST_(TSimdVector, _mm256_srli_epi64(SEQAN_VECTOR_CAST_(const __m256i &, vector), imm));
699 }
700 
701 // --------------------------------------------------------------------------
702 // Extend sign from integer types 256bit
703 // --------------------------------------------------------------------------
704 
inline __m256i
seqan_mm256_i16sign_extend_epis8(__m256i const & v)
{
    // Sign-extend the low byte of each 16-bit lane to the whole lane.
    // msb = ~v & 0x80: 0x80 in lanes whose source byte is non-negative, 0 otherwise.
    __m256i msb = _mm256_andnot_si256(v, _mm256_set1_epi16(0x80));
    // msk = msb - 1: 0x007f for non-negative lanes, 0xffff for negative ones.
    __m256i msk = _mm256_sub_epi16(msb, _mm256_set1_epi16(1));
    // Keep only the extension bits above the source byte (all ones iff negative).
    __m256i hi = _mm256_and_si256(msk, _mm256_set1_epi16(static_cast<uint16_t>(0xff00u)));
    // OR the extension bits back onto the original value.
    return _mm256_or_si256(v, hi);
}
722 
inline __m256i
seqan_mm256_i32sign_extend_epis8(__m256i const & v)
{
    // Sign-extend the low byte of each 32-bit lane to the whole lane.
    // msb = ~v & 0x80: 0x80 in lanes whose source byte is non-negative, 0 otherwise.
    __m256i msb = _mm256_andnot_si256(v, _mm256_set1_epi32(0x80));
    // msk = msb - 1: 0x0000007f for non-negative lanes, all ones for negative ones.
    __m256i msk = _mm256_sub_epi32(msb, _mm256_set1_epi32(1));
    // Keep only the extension bits above the source byte.
    __m256i hi = _mm256_and_si256(msk, _mm256_set1_epi32(static_cast<uint32_t>(0xffffff00u)));
    // OR the extension bits back onto the original value.
    return _mm256_or_si256(v, hi);
}
740 
inline __m256i
seqan_mm256_i32sign_extend_epis16(__m256i const & v)
{
    // Sign-extend the low 16 bits of each 32-bit lane to the whole lane.
    // msb = ~v & 0x8000: 0x8000 in lanes whose source value is non-negative.
    __m256i msb = _mm256_andnot_si256(v, _mm256_set1_epi32(0x8000));
    // msk = msb - 1: 0x00007fff for non-negative lanes, all ones for negative ones.
    __m256i msk = _mm256_sub_epi32(msb, _mm256_set1_epi32(1));
    // Keep only the extension bits above the source 16 bits.
    __m256i hi = _mm256_and_si256(msk, _mm256_set1_epi32(static_cast<uint32_t>(0xffff0000u)));
    // OR the extension bits back onto the original value.
    return _mm256_or_si256(v, hi);
}
758 
inline __m256i
seqan_mm256_i64sign_extend_epis8(__m256i const & v)
{
    // Sign-extend the low byte of each 64-bit lane to the whole lane.
    // msb = ~v & 0x80: 0x80 in lanes whose source byte is non-negative.
    __m256i msb = _mm256_andnot_si256(v, _mm256_set1_epi64x(0x80));
    // msk = msb - 1: 0x7f for non-negative lanes, all ones for negative ones.
    __m256i msk = _mm256_sub_epi64(msb, _mm256_set1_epi64x(1));
    // Keep only the extension bits above the source byte.
    __m256i hi = _mm256_and_si256(msk, _mm256_set1_epi64x(static_cast<uint64_t>(0xffffffffffffff00ul)));
    // OR the extension bits back onto the original value.
    return _mm256_or_si256(v, hi);
}
776 
inline __m256i
seqan_mm256_i64sign_extend_epis16(__m256i const & v)
{
    // Sign-extend the low 16 bits of each 64-bit lane to the whole lane.
    // msb = ~v & 0x8000: 0x8000 in lanes whose source value is non-negative.
    __m256i msb = _mm256_andnot_si256(v, _mm256_set1_epi64x(0x8000));
    // msk = msb - 1: 0x7fff for non-negative lanes, all ones for negative ones.
    __m256i msk = _mm256_sub_epi64(msb, _mm256_set1_epi64x(1));
    // Keep only the extension bits above the source 16 bits.
    __m256i hi = _mm256_and_si256(msk, _mm256_set1_epi64x(static_cast<uint64_t>(0xffffffffffff0000ul)));
    // OR the extension bits back onto the original value.
    return _mm256_or_si256(v, hi);
}
794 
inline __m256i
seqan_mm256_i64sign_extend_epis32(__m256i const & v)
{
    // Sign-extend the low 32 bits of each 64-bit lane to the whole lane.
    // (The original comment claimed a 0xffffffffffff0000 mask; the actual
    // extension mask for 32 -> 64 bit is 0xffffffff00000000.)
    // msb = ~v & 0x80000000: set in lanes whose source value is non-negative.
    __m256i msb = _mm256_andnot_si256(v, _mm256_set1_epi64x(0x80000000));
    // msk = msb - 1: 0x7fffffff for non-negative lanes, all ones for negative ones.
    __m256i msk = _mm256_sub_epi64(msb, _mm256_set1_epi64x(1));
    // Keep only the extension bits above the source 32 bits.
    __m256i hi = _mm256_and_si256(msk, _mm256_set1_epi64x(static_cast<uint64_t>(0xffffffff00000000ul)));
    // OR the extension bits back onto the original value.
    return _mm256_or_si256(v, hi);
}
812 
813 // --------------------------------------------------------------------------
814 // _gather (256bit)
815 // --------------------------------------------------------------------------
816 
// Emulates an 8-bit gather, which AVX2 does not provide, by four 32-bit
// hardware gathers: each _mm256_i32gather_epi32 loads a full 32-bit word per
// index, the word is masked down to its low byte, and two rounds of
// _mm256_packus_epi16 re-interleave the four partial results into a single
// vector of 32 bytes. The _mm256_shuffle_epi8 constants spread groups of four
// byte indices of idx into the 32-bit slots each gather consumes. The
// diagrams below trace an example where mem[i] = 3 * i.
template <typename TValue, typename TSize, TSize SCALE>
inline __m256i
seqan_mm256_i8gather_epi(TValue const * memAddr,
                         __m256i const & idx,
                         std::integral_constant<TSize, SCALE> const & /*scale*/)
{
    // mem:    ( 0,  3,  6,  9 | 12, 15, 18, 21 | 24, 27, 30, 33 | 36, 39, 42, 45 || 48, 51, 54, 57 | 60, 63, 66, 69 | 72, 75, 78, 81 | 84, 87, 90, 93)
    // idx:    (31, 30, 29, 28 | 27, 26, 25, 24 | 23, 22, 21, 20 | 19, 18, 17, 16 || 15, 14, 13, 12 | 11, 10,  9,  8 |  7,  6,  5,  4 |  3,  2,  1,  0)
    // pack:   (93, 90, 87, 84 | 81, 78, 75, 72 | 69, 66, 63, 60 | 57, 54, 51, 48 || 45, 42, 39, 36 | 33, 30, 27, 24 | 21, 18, 15, 12 |  9,  6,  3,  0)
    return _mm256_packus_epi16(
        // pckLow: (93,  0, 90,  0 | 87,  0, 84,  0 | 81,  0, 78,  0 | 75,  0, 72,  0 || 45,  0, 42,  0 | 39,  0, 36,  0 | 33,  0, 30,  0 | 27,  0, 24,  0)
        _mm256_packus_epi16(
            // mskLL:  (93,  0,  0,  0 | 90,  0,  0,  0 | 87,  0,  0,  0 | 84,  0,  0,  0 || 45,  0,  0,  0 | 42,  0,  0,  0 | 39,  0,  0,  0 | 36,  0,  0,  0)
            _mm256_and_si256(
                // gtrLL:  (93, 31, 30, 29 | 90, 93, 31, 30 | 87, 90, 93, 31 | 84, 87, 90, 93 || 45, 48, 51, 54 | 42, 45, 48, 51 | 39, 42, 45, 48 | 36, 39, 42, 45)
                _mm256_i32gather_epi32(
                    (const int *) memAddr,
                    // lowlow: (31,  0,  0,  0 | 30,  0,  0,  0 | 29,  0,  0,  0 | 28,  0,  0,  0 || 15,  0,  0,  0 | 14,  0,  0,  0 | 13,  0,  0,  0 | 12,  0,  0,  0)
                    _mm256_shuffle_epi8(idx, __m256i {
                        ~0xFF000000FFl | 0x0100000000, ~0xFF000000FFl | 0x0300000002,
                        ~0xFF000000FFl | 0x0100000000, ~0xFF000000FFl | 0x0300000002
                    }),
                    SCALE
                ),
                _mm256_set1_epi32(0xFF)
            ),
            // mskLH:  (81,  0,  0,  0 | 78,  0,  0,  0 | 75,  0,  0,  0 | 72,  0,  0,  0 || 33,  0,  0,  0 | 30,  0,  0,  0 | 27,  0,  0,  0 | 24,  0,  0,  0)
            _mm256_and_si256(
                // gtrLH:  (81, 84, 87, 90 | 78, 81, 84, 87 | 75, 78, 81, 84 | 72, 75, 78, 81 || 33, 36, 39, 42 | 30, 33, 36, 39 | 27, 30, 33, 36 | 24, 27, 30, 33)
                _mm256_i32gather_epi32(
                    (const int *) memAddr,
                    // lowhig: (27,  0,  0,  0 | 26,  0,  0,  0 | 25,  0,  0,  0 | 24,  0,  0,  0 || 11,  0,  0,  0 | 10,  0,  0,  0 |  9,  0,  0,  0 |  8,  0,  0,  0)
                    _mm256_shuffle_epi8(idx, __m256i {
                        ~0xFF000000FFl | 0x0500000004, ~0xFF000000FFl | 0x0700000006,
                        ~0xFF000000FFl | 0x0500000004, ~0xFF000000FFl | 0x0700000006
                    }),
                    SCALE
                ),
                _mm256_set1_epi32(0xFF)
            )
        ),
        // pckHih: (69,  0, 66,  0 | 63,  0, 60,  0 | 57,  0, 54,  0 | 51,  0, 48,  0 || 21,  0, 18,  0 | 15,  0, 12,  0 |  9,  0,  6,  0 |  3,  0,  0,  0)
        _mm256_packus_epi16(
            // mskHL:  (69,  0,  0,  0 | 66,  0,  0,  0 | 63,  0,  0,  0 | 60,  0,  0,  0 || 21,  0,  0,  0 | 18,  0,  0,  0 | 15,  0,  0,  0 | 12,  0,  0,  0)
            _mm256_and_si256(
                // gtrHL:  (69, 72, 75, 78 | 66, 69, 72, 75 | 63, 66, 69, 72 | 60, 63, 66, 69 || 21, 24, 27, 30 | 18, 21, 24, 27 | 15, 18, 21, 24 | 12, 15, 18, 21)
                _mm256_i32gather_epi32(
                    (const int *) memAddr,
                    // higlow: (23,  0,  0,  0 | 22,  0,  0,  0 | 21,  0,  0,  0 | 20,  0,  0,  0 ||  7,  0,  0,  0 |  6,  0,  0,  0 |  5,  0,  0,  0 |  4,  0,  0,  0)
                    _mm256_shuffle_epi8(idx, __m256i {
                        ~0xFF000000FFl | 0x0900000008, ~0xFF000000FFl | 0x0B0000000A,
                        ~0xFF000000FFl | 0x0900000008, ~0xFF000000FFl | 0x0B0000000A
                    }),
                    SCALE
                ),
                _mm256_set1_epi32(0xFF)
            ),
            // mskHH:  (57,  0,  0,  0 | 54,  0,  0,  0 | 51,  0,  0,  0 | 48,  0,  0,  0 ||  9,  0,  0,  0 |  6,  0,  0,  0 |  3,  0,  0,  0 |  0,  0,  0,  0)
            _mm256_and_si256(
                // gtrHH:  (57, 60, 63, 66 | 54, 57, 60, 63 | 51, 54, 57, 60 | 48, 51, 54, 57 ||  9, 12, 15, 18 |  6,  9, 12, 15 |  3,  6,  9, 12 |  0,  3,  6,  9)
                _mm256_i32gather_epi32(
                    (const int *) memAddr,
                    // highig: (19,  0,  0,  0 | 18,  0,  0,  0 | 17,  0,  0,  0 | 16,  0,  0,  0 ||  3,  0,  0,  0 |  2,  0,  0,  0 |  1,  0,  0,  0 |  0,  0,  0,  0)
                    _mm256_shuffle_epi8(idx, __m256i {
                        ~0xFF000000FFl | 0x0D0000000C, ~0xFF000000FFl | 0x0F0000000E,
                        ~0xFF000000FFl | 0x0D0000000C, ~0xFF000000FFl | 0x0F0000000E
                    }),
                    SCALE
                ),
                _mm256_set1_epi32(0xFF)
            )
        )
    );
}
891 
// Emulates a 16-bit gather via two 32-bit hardware gathers: idx lanes are
// zero-extended to 32 bit (unpacklo/unpackhi against zero), gathered as
// 32-bit words, masked down to the value width, and finally re-packed into
// 16-bit lanes with unsigned saturation.
template <typename TValue, typename TSize, TSize SCALE>
inline __m256i
seqan_mm256_i16gather_epi(TValue const * memAddr,
                          __m256i const & idx,
                          std::integral_constant<TSize, SCALE> const & /*scale*/)
{
    using TUnsignedValue = typename MakeUnsigned<TValue>::Type;

    // The cast makes sure that the max value of TValue = (u)int64_t and
    // (u)int32_t will be max value of int16_t (i.e. `~0` in int16_t), because
    // the resulting __m256i can only hold int16_t values.
    //
    // NOTE(marehr): the masking is only needed for TValue = (u)int8_t and
    // (u)int16_t. It could be omitted if _mm256_packus_epi32 would be exchanged
    // by _mm256_packs_epi32, because for (u)int32_t and (u)int64_t the masking
    // operations are basically the identity function.
    constexpr int const mask = static_cast<uint16_t>(MaxValue<TUnsignedValue>::VALUE);

    // 1. Unpack low idx values and interleave with 0 and gather from memAddr.
    // 2. Unpack high idx values and interleave with 0, than gather from memAddr.
    // 3. Merge 2 8x32 vectors into 1x16 vector by signed saturation. This operation reverts the interleave by the unpack operations above.
    //
    // The following is an example for SimdVector<uint16_t, 16> idx and uint16_t
    // const * memAddr:
    // mem:    ( 0,  0,  3,  0 |  6,  0,  9,  0 | 12,  0, 15,  0 | 18,  0, 21,  0 || 24,  0, 27,  0 | 30,  0, 33,  0 | 36,  0, 39,  0 | 42,  0, 45,  0)
    // idx:    (15,  0, 14,  0 | 13,  0, 12,  0 | 11,  0, 10,  0 |  9,  0,  8,  0 ||  7,  0,  6,  0 |  5,  0,  4,  0 |  3,  0,  2,  0 |  1,  0,  0,  0)
    // pack:   (45,  0, 42,  0 | 39,  0, 36,  0 | 33,  0, 30,  0 | 27,  0, 24,  0 || 21,  0, 18,  0 | 15,  0, 12,  0 |  9,  0,  6,  0 |  3,  0,  0,  0)
    return _mm256_packus_epi32(
        // mskLow: (45,  0,  0,  0 | 42,  0,  0,  0 | 39,  0,  0,  0 | 36,  0,  0,  0 || 21,  0,  0,  0 | 18,  0,  0,  0 | 15,  0,  0,  0 | 12,  0,  0,  0)
        _mm256_and_si256(
            // gtrLow: (45,  0, 15,  0 | 42,  0, 45,  0 | 39,  0, 42,  0 | 36,  0, 39,  0 || 21,  0, 24,  0 | 18,  0, 21,  0 | 15,  0, 18,  0 | 12,  0, 15,  0)
            _mm256_i32gather_epi32(
                (const int *) memAddr,
                // low:    (15,  0,  0,  0 | 14,  0,  0,  0 | 13,  0,  0,  0 | 12,  0,  0,  0 ||  7,  0,  0,  0 |  6,  0,  0,  0 |  5,  0,  0,  0 |  4,  0,  0,  0)
                _mm256_unpacklo_epi16(
                    idx, _mm256_set1_epi16(0)
                ),
                SCALE
            ),
            _mm256_set1_epi32(mask)
        ),
        // mskHih: (33,  0,  0,  0 | 30,  0,  0,  0 | 27,  0,  0,  0 | 24,  0,  0,  0 ||  9,  0,  0,  0 |  6,  0,  0,  0 |  3,  0,  0,  0 |  0,  0,  0,  0)
        _mm256_and_si256(
            // gtrHih: (33,  0, 36,  0 | 30,  0, 33,  0 | 27,  0, 30,  0 | 24,  0, 27,  0 ||  9,  0, 12,  0 |  6,  0,  9,  0 |  3,  0,  6,  0 |  0,  0,  3,  0)
            _mm256_i32gather_epi32(
                (const int *) memAddr,
                // high:   (11,  0,  0,  0 | 10,  0,  0,  0 |  9,  0,  0,  0 |  8,  0,  0,  0 ||  3,  0,  0,  0 |  2,  0,  0,  0 |  1,  0,  0,  0 |  0,  0,  0,  0)
                _mm256_unpackhi_epi16(
                    idx, _mm256_set1_epi16(0)
                ),
                SCALE
            ),
            _mm256_set1_epi32(mask)
        )
    );
}
948 
949 template <typename TValue, typename TSize, TSize SCALE>
950 inline __m256i
seqan_mm256_i32gather_epi(TValue const * memAddr,__m256i const & idx,std::integral_constant<TSize,SCALE> const &)951 seqan_mm256_i32gather_epi(TValue const * memAddr,
952                           __m256i const & idx,
953                           std::integral_constant<TSize, SCALE> const & /*scale*/)
954 {
955     using TUnsignedValue = typename MakeUnsigned<TValue>::Type;
956     constexpr auto const mask = static_cast<uint32_t>(MaxValue<TUnsignedValue>::VALUE);
957 
958     return _mm256_and_si256(
959         _mm256_i32gather_epi32((const int *) memAddr, idx, SCALE),
960         _mm256_set1_epi32(mask)
961     );
962 }
963 
964 template <typename TValue, typename TSize, TSize SCALE>
965 inline __m256i
seqan_mm256_i64gather_epi(TValue const * memAddr,__m256i const & idx,std::integral_constant<TSize,SCALE> const &)966 seqan_mm256_i64gather_epi(TValue const * memAddr,
967                           __m256i const & idx,
968                           std::integral_constant<TSize, SCALE> const & /*scale*/)
969 {
970     using TUnsignedValue = typename MakeUnsigned<TValue>::Type;
971     constexpr auto const mask = static_cast<uint64_t>(MaxValue<TUnsignedValue>::VALUE);
972 
973     return _mm256_and_si256(
974         _mm256_i64gather_epi64((const long long *) memAddr, idx, SCALE),
975         _mm256_set1_epi64x(mask)
976     );
977 }
978 
979 template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE>
980 inline TSimdVector
_gather(TValue const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,32>)981 _gather(TValue const * memAddr,
982         TSimdVector const & idx,
983         std::integral_constant<TSize, SCALE> const & scale,
984         SimdParams_<32, 32>)
985 {
986     return SEQAN_VECTOR_CAST_(TSimdVector,
987         seqan_mm256_i8gather_epi(
988             memAddr,
989             SEQAN_VECTOR_CAST_(__m256i const &, idx),
990             scale
991         )
992     );
993 }
994 
995 template <typename TSimdVector, typename TSize, TSize SCALE>
996 inline TSimdVector
_gather(int8_t const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,16>)997 _gather(int8_t const * memAddr,
998         TSimdVector const & idx,
999         std::integral_constant<TSize, SCALE> const & scale,
1000         SimdParams_<32, 16>)
1001 {
1002     // Note that memAddr is a signed integer type, thus a cast would extend the
1003     // sign. E.g., -3 = 253 in 8 bit, but would be 65533 in 16 bit.
1004     // Use _gather(uint8_t) and extend the sign to [u]int16_t.
1005     return SEQAN_VECTOR_CAST_(
1006         TSimdVector,
1007         seqan_mm256_i16sign_extend_epis8(
1008             seqan_mm256_i16gather_epi(
1009                 memAddr,
1010                 SEQAN_VECTOR_CAST_(__m256i const &, idx),
1011                 scale
1012             )
1013         )
1014     );
1015 }
1016 
1017 template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE>
1018 inline TSimdVector
_gather(TValue const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,16>)1019 _gather(TValue const * memAddr,
1020         TSimdVector const & idx,
1021         std::integral_constant<TSize, SCALE> const & scale,
1022         SimdParams_<32, 16>)
1023 {
1024     return SEQAN_VECTOR_CAST_(
1025         TSimdVector,
1026         seqan_mm256_i16gather_epi(
1027             memAddr,
1028             SEQAN_VECTOR_CAST_(__m256i const &, idx),
1029             scale
1030         )
1031     );
1032 }
1033 
1034 template <typename TSimdVector, typename TSize, TSize SCALE>
1035 inline TSimdVector
_gather(int8_t const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,8>)1036 _gather(int8_t const * memAddr,
1037         TSimdVector const & idx,
1038         std::integral_constant<TSize, SCALE> const & scale,
1039         SimdParams_<32, 8>)
1040 {
1041     // Note that memAddr is a signed integer type, thus a cast would extend the
1042     // sign.
1043     return SEQAN_VECTOR_CAST_(
1044         TSimdVector,
1045         seqan_mm256_i32sign_extend_epis8(
1046             seqan_mm256_i32gather_epi(
1047                 memAddr,
1048                 SEQAN_VECTOR_CAST_(__m256i const &, idx),
1049                 scale
1050             )
1051         )
1052     );
1053 }
1054 
1055 template <typename TSimdVector, typename TSize, TSize SCALE>
1056 inline TSimdVector
_gather(int16_t const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,8>)1057 _gather(int16_t const * memAddr,
1058         TSimdVector const & idx,
1059         std::integral_constant<TSize, SCALE> const & scale,
1060         SimdParams_<32, 8>)
1061 {
1062     // Note that memAddr is a signed integer type, thus a cast would extend the
1063     // sign.
1064     return SEQAN_VECTOR_CAST_(
1065         TSimdVector,
1066         seqan_mm256_i32sign_extend_epis16(
1067             seqan_mm256_i32gather_epi(
1068                 memAddr,
1069                 SEQAN_VECTOR_CAST_(__m256i const &, idx),
1070                 scale
1071             )
1072         )
1073     );
1074 }
1075 
1076 template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE>
1077 inline TSimdVector
_gather(TValue const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,8>)1078 _gather(TValue const * memAddr,
1079         TSimdVector const & idx,
1080         std::integral_constant<TSize, SCALE> const & scale,
1081         SimdParams_<32, 8>)
1082 {
1083     return SEQAN_VECTOR_CAST_(
1084         TSimdVector,
1085         seqan_mm256_i32gather_epi(
1086             memAddr,
1087             SEQAN_VECTOR_CAST_(__m256i const &, idx),
1088             scale
1089         )
1090     );
1091 }
1092 
1093 template <typename TSimdVector, typename TSize, TSize SCALE>
1094 inline TSimdVector
_gather(int8_t const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,4>)1095 _gather(int8_t const * memAddr,
1096         TSimdVector const & idx,
1097         std::integral_constant<TSize, SCALE> const & scale,
1098         SimdParams_<32, 4>)
1099 {
1100     return SEQAN_VECTOR_CAST_(
1101         TSimdVector,
1102         seqan_mm256_i64sign_extend_epis8(
1103             seqan_mm256_i64gather_epi(
1104                 memAddr,
1105                 SEQAN_VECTOR_CAST_(__m256i const &, idx),
1106                 scale
1107             )
1108         )
1109     );
1110 }
1111 
1112 template <typename TSimdVector, typename TSize, TSize SCALE>
1113 inline TSimdVector
_gather(int16_t const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,4>)1114 _gather(int16_t const * memAddr,
1115         TSimdVector const & idx,
1116         std::integral_constant<TSize, SCALE> const & scale,
1117         SimdParams_<32, 4>)
1118 {
1119     return SEQAN_VECTOR_CAST_(
1120         TSimdVector,
1121         seqan_mm256_i64sign_extend_epis16(
1122             seqan_mm256_i64gather_epi(
1123                 memAddr,
1124                 SEQAN_VECTOR_CAST_(__m256i const &, idx),
1125                 scale
1126             )
1127         )
1128     );
1129 }
1130 
1131 template <typename TSimdVector, typename TSize, TSize SCALE>
1132 inline TSimdVector
_gather(int32_t const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,4>)1133 _gather(int32_t const * memAddr,
1134         TSimdVector const & idx,
1135         std::integral_constant<TSize, SCALE> const & scale,
1136         SimdParams_<32, 4>)
1137 {
1138     return SEQAN_VECTOR_CAST_(
1139         TSimdVector,
1140         seqan_mm256_i64sign_extend_epis32(
1141             seqan_mm256_i64gather_epi(
1142                 memAddr,
1143                 SEQAN_VECTOR_CAST_(__m256i const &, idx),
1144                 scale
1145             )
1146         )
1147     );
1148 }
1149 
1150 template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE>
1151 inline TSimdVector
_gather(TValue const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const & scale,SimdParams_<32,4>)1152 _gather(TValue const * memAddr,
1153         TSimdVector const & idx,
1154         std::integral_constant<TSize, SCALE> const & scale,
1155         SimdParams_<32, 4>)
1156 {
1157     return SEQAN_VECTOR_CAST_(
1158         TSimdVector,
1159         seqan_mm256_i64gather_epi(
1160             memAddr,
1161             SEQAN_VECTOR_CAST_(__m256i const &, idx),
1162             scale
1163         )
1164     );
1165 }
1166 
1167 // --------------------------------------------------------------------------
1168 // _shuffleVector (256bit)
1169 // --------------------------------------------------------------------------
1170 
// Emulates a full 32-byte cross-lane shuffle (which _mm256_shuffle_epi8 alone
// cannot do, as it only shuffles within 128-bit lanes): shuffle the low and
// the high 16 bytes of vector separately, with index transforms arranged so
// that each partial result is zero exactly where the other is valid, then
// merge the two disjoint results with XOR (equivalent to OR here).
inline __m256i
seqan_m256_shuffle_epi8(__m256i const & vector, __m256i const & indices)
{
    return _mm256_xor_si256(
        // shuffle bytes from the lower bytes of vector
        _mm256_shuffle_epi8(
            // repeat twice the low bytes of vector in a new __m256i vector i.e.
            //   vh[127:0] = v[127:0]
            //   vh[255:128] = v[127:0]
            _mm256_broadcastsi128_si256(
                 _mm256_extracti128_si256(vector, 0)
            ),
            // ((indices[i] << 3) & 0b1000 0000) ^ indices[i]:
            //   Adds the 5th bit of indices[i] as most significant bit. If the
            //   5th bit is set, that means that indices[i] >= 16.
            //   r = _mm256_shuffle_epi8(vl, indices) will set r[i] = 0 if the
            //   most significant bit of indices[i] is 1. Since this bit is the
            //   5th bit, r[i] = 0 if indices[i] >= 16 and r[i] = vl[indices[i]]
            //   if indices[i] < 16.
            _mm256_xor_si256(
                _mm256_and_si256(
                    _mm256_slli_epi16(indices, 3),
                    // NOTE(review): -127 is 0x81, not the 0b1000'0000 (-128)
                    // the comment above describes. The stray low bit is
                    // harmless because bit 0 of (indices << 3) is always clear
                    // for indices < 32, but -128 would match the stated intent
                    // exactly — confirm before changing.
                    _mm256_set1_epi8(-127) // 0b1000 0000
                ),
                indices
            )
        ),
        // shuffle bytes from the higher bytes of vector
        _mm256_shuffle_epi8(
            // repeat twice the higher bytes of vector in a new __m256i vector
            // i.e.
            //   vh[127:0] = v[255:128]
            //   vh[255:128] = v[255:128]
            _mm256_broadcastsi128_si256(
                 _mm256_extracti128_si256(vector, 1)
            ),
            // indices[i] - 16:
            //   r = _mm256_shuffle_epi8(vh, indices)
            //   will return r[i] = 0 if the most significant bit of the byte
            //   indices[i] is 1. Thus, indices[i] - 16 will select all high
            //   bytes in vh, i.e. r[i] = vh[indices[i] - 16], if indices[i] >=
            //   16 and r[i] = 0 if indices[i] < 16.
            _mm256_sub_epi8(
                indices,
                _mm256_set1_epi8(16)
            )
        )
    );
}
1220 
// Shuffles 16-bit lanes of a by byte-sized indices taken from b: every index
// i is expanded to the byte-index pair (2*i, 2*i + 1) and the byte-wise
// shuffle above does the rest.
inline __m256i
seqan_m256_shuffle_epi16(const __m256i a, const __m256i b)
{
    // multiply by 2
    // _mm256_permute4x64_epi64(b, 0b01010000) places b's qword 0 into both
    // halves of the low 128-bit lane and qword 1 into both halves of the high
    // lane, so the lane-local unpacklo below sees b's first 16 index bytes.
    // NOTE(review): this assumes only the low 16 bytes of b carry indices —
    // confirm against callers.
    __m256i idx = _mm256_slli_epi16(
        _mm256_permute4x64_epi64(b, 0b01010000),
        1
    );
    return seqan_m256_shuffle_epi8(
        a,
        // interleave idx[15:0]   = 2*indices[15],   ..., 2*indices[0]
        // with       idx[15:0]+1 = 2*indices[15]+1, ..., 2*indices[0]+1
        // => 2*indices[15]+1, 2*indices[15], ..., 2*indices[0]+1, 2*indices[0]
        _mm256_unpacklo_epi8(
            idx,
            _mm256_add_epi8(idx, _mm256_set1_epi8(1))
        )
    );
}
1245 
// Emulated 32-bit element shuffle for 256-bit vectors:
// result dword i = a's dword indices[i], with the 8 byte-sized indices taken
// from the low 8 bytes of b. Builds byte-granular offsets
// (4*idx+3 .. 4*idx+0) for each dword and delegates to the byte shuffle.
inline __m256i
seqan_m256_shuffle_epi32(const __m256i a, const __m256i b)
{
    // Distribute the index bytes so each 128-bit lane sees the 4 it needs:
    // the permute control selects source dwords {0,0,0,0, 1,0,0,0}, placing
    // index bytes 0..3 (dword 0) in lane 0 and bytes 4..7 (dword 1) at the
    // bottom of lane 1. The 16-bit shift by 2 then multiplies each index
    // by 4 (dword index -> byte offset).
    // NOTE(review): the braced __m256i initializer (four 64-bit elements) is
    // a GCC/Clang vector extension; MSVC would need _mm256_setr_epi32 here.
    __m256i idx = _mm256_slli_epi16(
        _mm256_permutevar8x32_epi32(b, __m256i {0x0, 0x0, 0x1, 0x0}),
        2
    );
    return seqan_m256_shuffle_epi8(
        a,
        // interleave 4*indices[7]+1, 4*indices[7]+0; ..., 4*indices[0]+1, 4*indices[0]+0
        // with       4*indices[7]+3, 4*indices[7]+2; ..., 4*indices[0]+3, 4*indices[0]+2
        // => 4*indices[7]+3, 4*indices[7]+2; 4*indices[7]+1, 4*indices[7]+0;
        //    ...
        //    4*indices[0]+3, 4*indices[0]+2; 4*indices[0]+1, 4*indices[0]+0
        _mm256_unpacklo_epi16(
            // interleave idx[7:0]+0 = 4*indices[7]+0; ...; 4*indices[0]+0
            // with       idx[7:0]+1 = 4*indices[7]+1; ...; 4*indices[0]+1
            // => 4*indices[7]+1; 4*indices[7]+0; ...; 4*indices[0]+1; 4*indices[0]+0
            _mm256_unpacklo_epi8(
                idx,
                _mm256_add_epi8(idx, _mm256_set1_epi8(1))
            ),
            // interleave idx[7:0]+2 = 4*indices[7]+2; ...; 4*indices[0]+2
            // with       idx[7:0]+3 = 4*indices[7]+3; ...; 4*indices[0]+3
            // => 4*indices[7]+3; 4*indices[7]+2; ...; 4*indices[0]+3; 4*indices[0]+2
            _mm256_unpacklo_epi8(
                _mm256_add_epi8(idx, _mm256_set1_epi8(2)),
                _mm256_add_epi8(idx, _mm256_set1_epi8(3))
            )
    ));
}
1278 
// Emulates _mm256_set_m128i(v0, v1) (missing in older compilers): builds a
// 256-bit vector whose low 128 bits are v1 and whose high 128 bits are v0.
#define seqan_mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
1280 
// Emulated 64-bit element shuffle for 256-bit vectors:
// result qword i = a's qword indices[i], with the 4 byte-sized indices taken
// from the low bytes of b. Builds byte-granular offsets (8*idx+7 .. 8*idx+0)
// for each qword and delegates to the byte shuffle.
inline __m256i
seqan_m256_shuffle_epi64(const __m256i a, const __m256i b)
{
    // multiply by 8 (qword index -> byte offset); the low 128 bits hold all
    // index bytes that are needed below
    __m128i lowidx = _mm256_extracti128_si256(
        // multiply by 8
        _mm256_slli_epi16(b, 3),
        0
    );

    // Place the index bytes for result qwords 0..1 in lane 0 and (via the
    // 2-byte right shift) the bytes for qwords 2..3 at the bottom of lane 1,
    // where the per-lane unpacks below pick them up.
    __m256i idx = seqan_mm256_set_m128i(
        _mm_srli_si128(lowidx, 2),
        lowidx
    );

    return seqan_m256_shuffle_epi8(
        a,
        _mm256_unpacklo_epi32(
            // interleave 8*indices[3]+1, 8*indices[3]+0; ..., 8*indices[0]+1, 8*indices[0]+0
            // with       8*indices[3]+3, 8*indices[3]+2; ..., 8*indices[0]+3, 8*indices[0]+2
            // => 8*indices[3]+3, 8*indices[3]+2; 8*indices[3]+1, 8*indices[3]+0;
            //    ...
            //    8*indices[0]+3, 8*indices[0]+2; 8*indices[0]+1, 8*indices[0]+0
            _mm256_unpacklo_epi16(
                // interleave idx[3:0]+0 = 8*indices[3]+0; ...; 8*indices[0]+0
                // with       idx[3:0]+1 = 8*indices[3]+1; ...; 8*indices[0]+1
                // => 8*indices[3]+1; 8*indices[3]+0; ...; 8*indices[0]+1; 8*indices[0]+0
               _mm256_unpacklo_epi8(
                   idx,
                   _mm256_add_epi8(idx, _mm256_set1_epi8(1))
               ),
               // interleave idx[3:0]+2 = 8*indices[3]+2; ...; 8*indices[0]+2
               // with       idx[3:0]+3 = 8*indices[3]+3; ...; 8*indices[0]+3
               // => 8*indices[3]+3; 8*indices[3]+2; ...; 8*indices[0]+3; 8*indices[0]+2
               _mm256_unpacklo_epi8(
                   _mm256_add_epi8(idx, _mm256_set1_epi8(2)),
                   _mm256_add_epi8(idx, _mm256_set1_epi8(3))
               )
           ),
           // interleave 8*indices[3]+5, 8*indices[3]+4; ..., 8*indices[0]+5, 8*indices[0]+4
           // with       8*indices[3]+7, 8*indices[3]+6; ..., 8*indices[0]+7, 8*indices[0]+6
           // => 8*indices[3]+7, 8*indices[3]+6; 8*indices[3]+5, 8*indices[3]+4;
           //    ...
           //    8*indices[0]+7, 8*indices[0]+6; 8*indices[0]+5, 8*indices[0]+4
            _mm256_unpacklo_epi16(
                // interleave idx[3:0]+4 = 8*indices[3]+4; ...; 8*indices[0]+4
                // with       idx[3:0]+5 = 8*indices[3]+5; ...; 8*indices[0]+5
                // => 8*indices[3]+5; 8*indices[3]+4; ...; 8*indices[0]+5; 8*indices[0]+4
                _mm256_unpacklo_epi8(
                    _mm256_add_epi8(idx, _mm256_set1_epi8(4)),
                    _mm256_add_epi8(idx, _mm256_set1_epi8(5))
                ),
                // interleave idx[3:0]+6 = 8*indices[3]+6; ...; 8*indices[0]+6
                // with       idx[3:0]+7 = 8*indices[3]+7; ...; 8*indices[0]+7
                // => 8*indices[3]+7; 8*indices[3]+6; ...; 8*indices[0]+7; 8*indices[0]+6
                _mm256_unpacklo_epi8(
                    _mm256_add_epi8(idx, _mm256_set1_epi8(6)),
                    _mm256_add_epi8(idx, _mm256_set1_epi8(7))
                )
            )
        )
    );
}
1343 
1344 template <typename TSimdVector1, typename TSimdVector2>
1345 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<32,16>,SimdParams_<16,16>)1346 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 16>, SimdParams_<16, 16>)
1347 {
1348     // copy 2nd 64bit word to 3rd, compute 2*idx
1349     __m256i idx = _mm256_slli_epi16(_mm256_permute4x64_epi64(_mm256_castsi128_si256(SEQAN_VECTOR_CAST_(const __m128i &, indices)), 0x50), 1);
1350 
1351     // interleave with 2*idx+1 and call shuffle
1352     return SEQAN_VECTOR_CAST_(TSimdVector1,
1353         _mm256_shuffle_epi8(
1354             SEQAN_VECTOR_CAST_(const __m256i &, vector),
1355             _mm256_unpacklo_epi8(
1356                 idx,
1357                 _mm256_add_epi8(
1358                     idx, _mm256_set1_epi8(1)
1359                 )
1360             )
1361         )
1362     );
1363 }
1364 
1365 template <typename TSimdVector1, typename TSimdVector2>
1366 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<32,32>,SimdParams_<32,32>)1367 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 32>, SimdParams_<32, 32>)
1368 {
1369     return SEQAN_VECTOR_CAST_(TSimdVector1, seqan_m256_shuffle_epi8(
1370         SEQAN_VECTOR_CAST_(const __m256i &, vector),
1371         SEQAN_VECTOR_CAST_(const __m256i &, indices)
1372     ));
1373 }
1374 
1375 template <typename TSimdVector1, typename TSimdVector2>
1376 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<32,16>,SimdParams_<32,32>)1377 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 16>, SimdParams_<32, 32>)
1378 {
1379     return SEQAN_VECTOR_CAST_(TSimdVector1, seqan_m256_shuffle_epi16(
1380         SEQAN_VECTOR_CAST_(const __m256i &, vector),
1381         SEQAN_VECTOR_CAST_(const __m256i &, indices)
1382     ));
1383 }
1384 
1385 template <typename TSimdVector1, typename TSimdVector2>
1386 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<32,8>,SimdParams_<32,32>)1387 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 8>, SimdParams_<32, 32>)
1388 {
1389     return SEQAN_VECTOR_CAST_(TSimdVector1, seqan_m256_shuffle_epi32(
1390         SEQAN_VECTOR_CAST_(const __m256i &, vector),
1391         SEQAN_VECTOR_CAST_(const __m256i &, indices)
1392     ));
1393 }
1394 
1395 template <typename TSimdVector1, typename TSimdVector2>
1396 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<32,4>,SimdParams_<32,32>)1397 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<32, 4>, SimdParams_<32, 32>)
1398 {
1399     return SEQAN_VECTOR_CAST_(TSimdVector1, seqan_m256_shuffle_epi64(
1400         SEQAN_VECTOR_CAST_(const __m256i &, vector),
1401         SEQAN_VECTOR_CAST_(const __m256i &, indices)
1402     ));
1403 }
1404 
1405 // --------------------------------------------------------------------------
1406 // _transposeMatrix (256bit)
1407 // --------------------------------------------------------------------------
1408 
1409 // emulate missing _mm256_unpacklo_epi128/_mm256_unpackhi_epi128 instructions
inline __m256i _mm256_unpacklo_epi128(__m256i const & a, __m256i const & b)
{
    // Selector 0b0010'0000: result's low lane <- a's lane 0,
    // result's high lane <- b's lane 0.
    return _mm256_permute2x128_si256(a, b, 0b00100000);
}
1415 
inline __m256i _mm256_unpackhi_epi128(__m256i const & a, __m256i const & b)
{
    // Selector 0b0011'0001: result's low lane <- a's lane 1,
    // result's high lane <- b's lane 1.
    return _mm256_permute2x128_si256(a, b, 0b00110001);
}
1421 
// Transposes a 32x32 byte matrix (one row per 256-bit vector) in place.
// Performs 5 rounds of pairwise interleaving at growing granularity
// (8/16/32/64/128-bit) — a butterfly network; the resulting rows come out in
// bit-reversed order, which the look-up table corrects when writing back.
template <typename TSimdVector>
inline void
_transposeMatrix(TSimdVector matrix[], SimdMatrixParams_<32, 32, 8>)
{
    // we need a look-up table that reverses the lowest 4 bits of the index
    // in order to place the transposed rows at their correct positions
    static const unsigned char bitRev[] = { 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15,
                                           16,24,20,28,18,26,22,30,17,25,21,29,19,27,23,31};

    // transpose a 32x32 byte matrix
    __m256i tmp1[32];
    // round 1: interleave adjacent row pairs at byte granularity
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i]    = _mm256_unpacklo_epi8(
            SEQAN_VECTOR_CAST_(const __m256i &, matrix[2*i]),
            SEQAN_VECTOR_CAST_(const __m256i &, matrix[2*i+1])
        );
        tmp1[i+16] = _mm256_unpackhi_epi8(
            SEQAN_VECTOR_CAST_(const __m256i &, matrix[2*i]),
            SEQAN_VECTOR_CAST_(const __m256i &, matrix[2*i+1])
        );
    }
    __m256i  tmp2[32];
    // round 2: interleave at 16-bit granularity
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i]    = _mm256_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+16] = _mm256_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
    }
    // round 3: interleave at 32-bit granularity
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i]    = _mm256_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
        tmp1[i+16] = _mm256_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
    }
    // round 4: interleave at 64-bit granularity
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i]    = _mm256_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+16] = _mm256_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]);
    }
    // round 5: combine 128-bit lanes (using the emulated epi128 unpacks
    // above) and scatter the rows back to their bit-reversed positions
    for (int i = 0; i < 16; ++i)
    {
        matrix[bitRev[i]]    = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_unpacklo_epi128(tmp2[2*i],tmp2[2*i+1]));
        matrix[bitRev[i+16]] = SEQAN_VECTOR_CAST_(TSimdVector, _mm256_unpackhi_epi128(tmp2[2*i],tmp2[2*i+1]));
    }
}
1466 
1467 // --------------------------------------------------------------------------
1468 // Function _testAllZeros (256bit)
1469 // --------------------------------------------------------------------------
1470 
1471 template <typename TSimdVector>
SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector>>,int)1472 SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector> >, int)
1473 inline _testAllZeros(TSimdVector const & vector, TSimdVector const & mask, SimdParams_<32>)
1474 {
1475     return _mm256_testz_si256(SEQAN_VECTOR_CAST_(const __m256i &, vector),
1476                               SEQAN_VECTOR_CAST_(const __m256i &, mask));
1477 }
1478 
1479 // --------------------------------------------------------------------------
1480 // Function _testAllOnes (256bit)
1481 // --------------------------------------------------------------------------
1482 
1483 template <typename TSimdVector>
_testAllOnes(TSimdVector const & vector,SimdParams_<32>)1484 inline int _testAllOnes(TSimdVector const & vector, SimdParams_<32>)
1485 {
1486     __m256i vec = SEQAN_VECTOR_CAST_(const __m256i &, vector);
1487     return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec));
1488 }
1489 
1490 } // namespace seqan
1491 
1492 #endif // SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_AVX2_H_
1493