1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 //         René Rahn <rene.rahn@fu-berlin.de>
34 //         Stefan Budach <stefan.budach@fu-berlin.de>
35 // ==========================================================================
// generic SIMD interface for SSE4.2 (128-bit vectors)
37 // ==========================================================================
38 
39 #ifndef SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_SSE4_2_H_
40 #define SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_SSE4_2_H_
41 
42 namespace seqan {
43 
// SIMD vector typedefs. Each SEQAN_DEFINE_SIMD_VECTOR_ invocation declares a
// vector type of the given scalar type; the last argument is the total byte
// width of the vector (8 = 64-bit, 16 = 128-bit).

// SimdParams_<8, 8>: 64bit = 8 elements * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8Char,      char,           8)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8SChar,     signed char,    8)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8UChar,     unsigned char,  8)

// SimdParams_<8, 4>: 64bit = 4 elements * 2 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4Short,     short,          8)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4UShort,    unsigned short, 8)

// SimdParams_<8, 2>: 64bit = 2 elements * 4 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector2Int,       int,            8)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector2UInt,      unsigned int,   8)

// SimdParams_<16, 16>: 128bit = 16 elements * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16Char,     char,           16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16SChar,    signed char,    16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16UChar,    unsigned char,  16)

// SimdParams_<16, 8>: 128bit = 8 elements * 2 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8Short,     short,          16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8UShort,    unsigned short, 16)

// SimdParams_<16, 4>: 128bit = 4 elements * 4 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4Int,       int,            16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4UInt,      unsigned int,   16)

// SimdParams_<16, 2>: 128bit = 2 elements * 8 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector2Int64,     int64_t,        16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector2UInt64,    uint64_t,       16)
73 
74 // ============================================================================
75 // Functions
76 // ============================================================================
77 
78 // --------------------------------------------------------------------------
79 // _fillVector (128bit)
80 // --------------------------------------------------------------------------
81 
82 template <typename TSimdVector, typename... TValue>
83 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & x,std::index_sequence<0> const &,SimdParams_<16,16> const &)84 _fillVector(TSimdVector & vector,
85             std::tuple<TValue...> const & x,
86             std::index_sequence<0> const &,
87             SimdParams_<16, 16> const &)
88 {
89   vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi8(std::get<0>(x)));
90 }
91 
92 template <typename TSimdVector, typename... TValue>
93 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & x,std::index_sequence<0> const &,SimdParams_<16,8> const &)94 _fillVector(TSimdVector & vector,
95             std::tuple<TValue...> const & x,
96             std::index_sequence<0> const &,
97             SimdParams_<16, 8> const &)
98 {
99   vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi16(std::get<0>(x)));
100 }
101 
102 template <typename TSimdVector, typename... TValue>
103 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & x,std::index_sequence<0> const &,SimdParams_<16,4> const &)104 _fillVector(TSimdVector & vector,
105             std::tuple<TValue...> const & x,
106             std::index_sequence<0> const &,
107             SimdParams_<16, 4> const &)
108 {
109   vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi32(std::get<0>(x)));
110 }
111 
112 template <typename TSimdVector, typename... TValue>
113 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & x,std::index_sequence<0> const &,SimdParams_<16,2> const &)114 _fillVector(TSimdVector & vector,
115             std::tuple<TValue...> const & x,
116             std::index_sequence<0> const &,
117             SimdParams_<16, 2> const &)
118 {
119   vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi64x(std::get<0>(x)));
120 }
121 
122 template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
123 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & args,std::index_sequence<INDICES...> const &,SimdParams_<16,16> const &)124 _fillVector(TSimdVector & vector,
125             std::tuple<TValue...> const & args,
126             std::index_sequence<INDICES...> const &,
127             SimdParams_<16, 16> const &)
128 {
129     vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_setr_epi8(std::get<INDICES>(args)...));
130 }
131 
132 template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
133 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & args,std::index_sequence<INDICES...> const &,SimdParams_<16,8> const &)134 _fillVector(TSimdVector & vector,
135             std::tuple<TValue...> const & args,
136             std::index_sequence<INDICES...> const &,
137             SimdParams_<16, 8> const &)
138 {
139     vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_setr_epi16(std::get<INDICES>(args)...));
140 }
141 
142 template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
143 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & args,std::index_sequence<INDICES...> const &,SimdParams_<16,4> const &)144 _fillVector(TSimdVector & vector,
145             std::tuple<TValue...> const & args,
146             std::index_sequence<INDICES...> const &,
147             SimdParams_<16, 4> const &)
148 {
149     vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_setr_epi32(std::get<INDICES>(args)...));
150 }
151 
// Per-lane fill for 2 x 64-bit lanes. There is no _mm_setr_epi64x, so the
// big-endian-ordered _mm_set_epi64x is used with the argument pack reversed.
template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args,
            std::index_sequence<INDICES...> const &,
            SimdParams_<16, 2> const &)
{
    // reverse argument list 0, 1 -> 1, 0
    // NOTE(marehr): Intel linux fails to reverse argument list and only
    // _mm_set_epi64x has no reverse equivalent
    // NOTE(rrahn): For g++-4.9 the set_epi function is a macro, which does not work with parameter pack expansion.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set_epi64x(std::get<sizeof...(INDICES) - 1 - INDICES>(args)...));
}
165 
166 // --------------------------------------------------------------------------
167 // _clearVector (128bit)
168 // --------------------------------------------------------------------------
169 
170 template <typename TSimdVector, int L>
_clearVector(TSimdVector & vector,SimdParams_<16,L>)171 inline void _clearVector(TSimdVector & vector, SimdParams_<16, L>)
172 {
173     vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_setzero_si128());
174 }
175 
176 // --------------------------------------------------------------------------
177 // _createVector (128bit)
178 // --------------------------------------------------------------------------
179 
180 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<16,16>)181 inline TSimdVector _createVector(TValue const x, SimdParams_<16, 16>)
182 {
183     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi8(x));
184 }
185 
186 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<16,8>)187 inline TSimdVector _createVector(TValue const x, SimdParams_<16, 8>)
188 {
189     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi16(x));
190 }
191 
192 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<16,4>)193 inline TSimdVector _createVector(TValue const x, SimdParams_<16, 4>)
194 {
195     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi32(x));
196 }
197 
198 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<16,2>)199 inline TSimdVector _createVector(TValue const x, SimdParams_<16, 2>)
200 {
201     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi64x(x));
202 }
203 
204 // --------------------------------------------------------------------------
205 // cmpEq (128bit)
206 // --------------------------------------------------------------------------
207 
208 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16>)209 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16>)
210 {
211     return SEQAN_VECTOR_CAST_(TSimdVector,
212                               _mm_cmpeq_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
213                                              SEQAN_VECTOR_CAST_(const __m128i&, b)));
214 }
215 
216 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8>)217 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8>)
218 {
219     return SEQAN_VECTOR_CAST_(TSimdVector,
220                               _mm_cmpeq_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
221                                               SEQAN_VECTOR_CAST_(const __m128i&, b)));
222 }
223 
224 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4>)225 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4>)
226 {
227     return SEQAN_VECTOR_CAST_(TSimdVector,
228                               _mm_cmpeq_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
229                                               SEQAN_VECTOR_CAST_(const __m128i&, b)));
230 }
231 
232 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2>)233 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2>)
234 {
235     return SEQAN_VECTOR_CAST_(TSimdVector,
236                               _mm_cmpeq_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
237                                               SEQAN_VECTOR_CAST_(const __m128i&, b)));
238 }
239 
240 // --------------------------------------------------------------------------
241 // _cmpGt (128bit)
242 // --------------------------------------------------------------------------
243 
244 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,int8_t>)245 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, int8_t>)
246 {
247     return SEQAN_VECTOR_CAST_(TSimdVector,
248                               _mm_cmpgt_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
249                                              SEQAN_VECTOR_CAST_(const __m128i&, b)));
250 }
251 
252 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,uint8_t>)253 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, uint8_t>)
254 {
255     // There is no unsigned cmpgt, we reduce it to the signed case.
256     // Note that 0x80 = ~0x7F (prevent overflow messages).
257     return SEQAN_VECTOR_CAST_(TSimdVector,
258                               _mm_cmpgt_epi8(
259                                   _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, a), _mm_set1_epi8(~0x7F)),
260                                   _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, b), _mm_set1_epi8(~0x7F))));
261 }
262 
263 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,int16_t>)264 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, int16_t>)
265 {
266     return SEQAN_VECTOR_CAST_(TSimdVector,
267                               _mm_cmpgt_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
268                                               SEQAN_VECTOR_CAST_(const __m128i&, b)));
269 }
270 
271 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,uint16_t>)272 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, uint16_t>)
273 {
274     // There is no unsigned cmpgt, we reduce it to the signed case.
275     // Note that 0x8000 = ~0x7FFF (prevent overflow messages).
276     return SEQAN_VECTOR_CAST_(TSimdVector,
277                               _mm_cmpgt_epi16(
278                                   _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, a), _mm_set1_epi16(~0x7FFF)),
279                                   _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, b), _mm_set1_epi16(~0x7FFF))));
280 }
281 
282 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,int32_t>)283 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, int32_t>)
284 {
285     return SEQAN_VECTOR_CAST_(TSimdVector,
286                               _mm_cmpgt_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
287                                               SEQAN_VECTOR_CAST_(const __m128i&, b)));
288 }
289 
290 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,uint32_t>)291 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, uint32_t>)
292 {
293     // There is no unsigned cmpgt, we reduce it to the signed case.
294     // Note that 0x80000000 = ~0x7FFFFFFF (prevent overflow messages).
295     return SEQAN_VECTOR_CAST_(TSimdVector,
296                               _mm_cmpgt_epi32(
297                                   _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, a), _mm_set1_epi32(~0x7FFFFFFF)),
298                                   _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, b), _mm_set1_epi32(~0x7FFFFFFF))));
299 }
300 
301 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2,int64_t>)302 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, int64_t>)
303 {
304     return SEQAN_VECTOR_CAST_(TSimdVector,
305                               _mm_cmpgt_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
306                                               SEQAN_VECTOR_CAST_(const __m128i&, b)));
307 }
308 
309 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2,uint64_t>)310 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, uint64_t>)
311 {
312     // There is no unsigned cmpgt, we reduce it to the signed case.
313     // Note that 0x8000000000000000ul = ~0x7FFFFFFFFFFFFFFFul (prevent overflow messages).
314     return SEQAN_VECTOR_CAST_(TSimdVector,
315                               _mm_cmpgt_epi64(
316                                   _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, a) ,_mm_set1_epi64x(~0x7FFFFFFFFFFFFFFFul)),
317                                   _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, b), _mm_set1_epi64x(~0x7FFFFFFFFFFFFFFFul))));
318 }
319 
320 // --------------------------------------------------------------------------
321 // _bitwiseOr (128bit)
322 // --------------------------------------------------------------------------
323 
324 template <typename TSimdVector, int L>
_bitwiseOr(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,L>)325 inline TSimdVector _bitwiseOr(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, L>)
326 {
327     return SEQAN_VECTOR_CAST_(TSimdVector,
328                               _mm_or_si128(SEQAN_VECTOR_CAST_(const __m128i&, a),
329                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
330 }
331 
332 // --------------------------------------------------------------------------
333 // _bitwiseAnd (128bit)
334 // --------------------------------------------------------------------------
335 
336 template <typename TSimdVector, int L>
_bitwiseAnd(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,L>)337 inline TSimdVector _bitwiseAnd(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, L>)
338 {
339     return SEQAN_VECTOR_CAST_(TSimdVector,
340                               _mm_and_si128(SEQAN_VECTOR_CAST_(const __m128i&, a),
341                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
342 }
343 
344 // --------------------------------------------------------------------------
345 // _bitwiseAndNot (128bit)
346 // --------------------------------------------------------------------------
347 
348 template <typename TSimdVector, int L>
_bitwiseAndNot(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,L>)349 inline TSimdVector _bitwiseAndNot(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, L>)
350 {
351     return SEQAN_VECTOR_CAST_(TSimdVector,
352                               _mm_andnot_si128(SEQAN_VECTOR_CAST_(const __m128i&, a),
353                                                SEQAN_VECTOR_CAST_(const __m128i&, b)));
354 }
355 
356 // --------------------------------------------------------------------------
357 // _bitwiseNot (128bit)
358 // --------------------------------------------------------------------------
359 
// NOTE(review): despite the name, these overloads compute an element-wise
// "== 0" mask (a lane becomes all-ones if it was zero, and zero otherwise).
// That coincides with a bitwise NOT only for mask vectors whose lanes are
// already 0 or all-ones — presumably the intended use; confirm callers only
// pass comparison masks.
template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<16, 16>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm_cmpeq_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
                                             _mm_setzero_si128()));
}

// 8 x 16-bit variant of the "== 0" mask (see note above _bitwiseNot<16,16>).
template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<16, 8>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm_cmpeq_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
                                              _mm_setzero_si128()));
}

// 4 x 32-bit variant of the "== 0" mask.
template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<16, 4>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm_cmpeq_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
                                              _mm_setzero_si128()));
}

// 2 x 64-bit variant of the "== 0" mask.
template <typename TSimdVector>
inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<16, 2>)
{
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm_cmpeq_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
                                              _mm_setzero_si128()));
}
391 
392 // --------------------------------------------------------------------------
393 // _divide (128bit)
394 // --------------------------------------------------------------------------
395 
396 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<16,16>)397 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<16, 16>)
398 {
399     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_div_epi8(a, _mm_set1_epi8(b)));
400 }
401 
402 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<16,8>)403 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<16, 8>)
404 {
405     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_div_epi16(a, _mm_set1_epi16(b)));
406 }
407 
408 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<16,4>)409 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<16, 4>)
410 {
411     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_div_epi32(a, _mm_set1_epi32(b)));
412 }
413 
414 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<16,2>)415 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<16, 2>)
416 {
417     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_div_epi64(a, _mm_set1_epi64x(b)));
418 }
419 
420 // --------------------------------------------------------------------------
421 // _add (128bit)
422 // --------------------------------------------------------------------------
423 
424 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16>)425 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16>)
426 {
427     return SEQAN_VECTOR_CAST_(TSimdVector,
428                               _mm_add_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
429                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
430 }
431 
432 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8>)433 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8>)
434 {
435     return SEQAN_VECTOR_CAST_(TSimdVector,
436                               _mm_add_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
437                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
438 }
439 
440 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4>)441 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4>)
442 {
443     return SEQAN_VECTOR_CAST_(TSimdVector,
444                               _mm_add_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
445                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
446 }
447 
448 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2>)449 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2>)
450 {
451     return SEQAN_VECTOR_CAST_(TSimdVector,
452                               _mm_add_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
453                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
454 }
455 
456 // --------------------------------------------------------------------------
457 // _sub (128bit)
458 // --------------------------------------------------------------------------
459 
460 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16>)461 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16>)
462 {
463     return SEQAN_VECTOR_CAST_(TSimdVector,
464                               _mm_sub_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
465                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
466 }
467 
468 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8>)469 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8>)
470 {
471     return SEQAN_VECTOR_CAST_(TSimdVector,
472                               _mm_sub_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
473                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
474 }
475 
476 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4>)477 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4>)
478 {
479     return SEQAN_VECTOR_CAST_(TSimdVector,
480                               _mm_sub_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
481                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
482 }
483 
484 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2>)485 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2>)
486 {
487     return SEQAN_VECTOR_CAST_(TSimdVector,
488                               _mm_sub_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
489                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
490 }
491 
492 // --------------------------------------------------------------------------
493 // _mult (128bit)
494 // --------------------------------------------------------------------------
495 
496 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const &,SimdParams_<16,16>)497 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const &/*b*/, SimdParams_<16, 16>)
498 {
499     SEQAN_ASSERT_FAIL("SSE intrinsics for multiplying 8 bit values not implemented!");
500     return a;
501 }
502 
503 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8>)504 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8>)
505 {
506     return SEQAN_VECTOR_CAST_(TSimdVector,
507                               _mm_mullo_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
508                                               SEQAN_VECTOR_CAST_(const __m128i&, b)));
509 }
510 
511 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4>)512 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4>)
513 {
514     return SEQAN_VECTOR_CAST_(TSimdVector,
515                               _mm_mullo_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
516                                               SEQAN_VECTOR_CAST_(const __m128i&, b)));
517 }
518 
519 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const &,SimdParams_<16,2>)520 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const &/*b*/, SimdParams_<16, 2>)
521 {
522     SEQAN_ASSERT_FAIL("SSE intrinsics for multiplying 64 bit values not implemented!");
523     return a;
524 }
525 
526 // --------------------------------------------------------------------------
527 // _max (128bit)
528 // --------------------------------------------------------------------------
529 
530 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,int8_t>)531 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, int8_t>)
532 {
533     return SEQAN_VECTOR_CAST_(TSimdVector,
534                               _mm_max_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
535                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
536 }
537 
538 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,uint8_t>)539 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, uint8_t>)
540 {
541     return SEQAN_VECTOR_CAST_(TSimdVector,
542                               _mm_max_epu8(SEQAN_VECTOR_CAST_(const __m128i&, a),
543                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
544 }
545 
546 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,int16_t>)547 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, int16_t>)
548 {
549     return SEQAN_VECTOR_CAST_(TSimdVector,
550                               _mm_max_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
551                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
552 }
553 
554 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,uint16_t>)555 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, uint16_t>)
556 {
557     return SEQAN_VECTOR_CAST_(TSimdVector,
558                               _mm_max_epu16(SEQAN_VECTOR_CAST_(const __m128i&, a),
559                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
560 }
561 
562 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,int32_t>)563 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, int32_t>)
564 {
565     return SEQAN_VECTOR_CAST_(TSimdVector,
566                               _mm_max_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
567                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
568 }
569 
570 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,uint32_t>)571 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, uint32_t>)
572 {
573     return SEQAN_VECTOR_CAST_(TSimdVector,
574                               _mm_max_epu32(SEQAN_VECTOR_CAST_(const __m128i&, a),
575                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
576 }
577 
// 64-bit maximum: AVX-512VL provides a native instruction; otherwise fall
// back to a compare-and-blend built from this file's cmpGt/blend helpers.
template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, int64_t>)
{
#if defined(__AVX512VL__)
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm_max_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
#else // defined(__AVX512VL__)
    // blend picks a where cmpGt(a, b) is true, b otherwise.
    return blend(b, a, cmpGt(a, b));
#endif // defined(__AVX512VL__)
}

// Unsigned 64-bit maximum (same structure as the signed variant above).
template <typename TSimdVector>
inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, uint64_t>)
{
#if defined(__AVX512VL__)
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm_max_epu64(SEQAN_VECTOR_CAST_(const __m128i&, a),
                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
#else // defined(__AVX512VL__)
    // blend picks a where cmpGt(a, b) is true, b otherwise.
    return blend(b, a, cmpGt(a, b));
#endif // defined(__AVX512VL__)
}
601 
602 
603 // --------------------------------------------------------------------------
604 // _min (128bit)
605 // --------------------------------------------------------------------------
606 
607 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,int8_t>)608 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, int8_t>)
609 {
610     return SEQAN_VECTOR_CAST_(TSimdVector,
611                               _mm_min_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
612                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
613 }
614 
615 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,uint8_t>)616 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, uint8_t>)
617 {
618     return SEQAN_VECTOR_CAST_(TSimdVector,
619                               _mm_min_epu8(SEQAN_VECTOR_CAST_(const __m128i&, a),
620                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
621 }
622 
623 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,int16_t>)624 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, int16_t>)
625 {
626     return SEQAN_VECTOR_CAST_(TSimdVector,
627                               _mm_min_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
628                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
629 }
630 
631 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,uint16_t>)632 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, uint16_t>)
633 {
634     return SEQAN_VECTOR_CAST_(TSimdVector,
635                               _mm_min_epu16(SEQAN_VECTOR_CAST_(const __m128i&, a),
636                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
637 }
638 
639 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,int32_t>)640 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, int32_t>)
641 {
642     return SEQAN_VECTOR_CAST_(TSimdVector,
643                               _mm_min_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
644                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
645 }
646 
647 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,uint32_t>)648 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, uint32_t>)
649 {
650     return SEQAN_VECTOR_CAST_(TSimdVector,
651                               _mm_min_epu32(SEQAN_VECTOR_CAST_(const __m128i&, a),
652                                             SEQAN_VECTOR_CAST_(const __m128i&, b)));
653 }
654 
// Element-wise minimum of two 128bit vectors of 2 x int64_t.
template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, int64_t>)
{
#if defined(__AVX512VL__)
    // AVX-512VL provides a native 64-bit signed min on 128bit registers.
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm_min_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
#else // defined(__AVX512VL__)
    // No 64-bit min intrinsic pre-AVX-512VL: emulate by selecting b where a > b.
    return blend(a, b, cmpGt(a, b));
#endif // defined(__AVX512VL__)
}
666 
// Element-wise minimum of two 128bit vectors of 2 x uint64_t.
template <typename TSimdVector>
inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, uint64_t>)
{
#if defined(__AVX512VL__)
    // AVX-512VL provides a native 64-bit unsigned min on 128bit registers.
    return SEQAN_VECTOR_CAST_(TSimdVector,
                              _mm_min_epu64(SEQAN_VECTOR_CAST_(const __m128i&, a),
                                            SEQAN_VECTOR_CAST_(const __m128i&, b)));
#else // defined(__AVX512VL__)
    // No 64-bit min intrinsic pre-AVX-512VL: emulate by selecting b where a > b.
    return blend(a, b, cmpGt(a, b));
#endif // defined(__AVX512VL__)
}
678 
679 // --------------------------------------------------------------------------
680 // _blend (128bit)
681 // --------------------------------------------------------------------------
682 
683 template <typename TSimdVector, typename TSimdVectorMask, int L>
_blend(TSimdVector const & a,TSimdVector const & b,TSimdVectorMask const & mask,SimdParams_<16,L>)684 inline TSimdVector _blend(TSimdVector const & a, TSimdVector const & b, TSimdVectorMask const & mask, SimdParams_<16, L>)
685 {
686     return SEQAN_VECTOR_CAST_(TSimdVector,
687                               _mm_blendv_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
688                                               SEQAN_VECTOR_CAST_(const __m128i&, b),
689                                               SEQAN_VECTOR_CAST_(const __m128i&, mask)));
690 }
691 
692 // --------------------------------------------------------------------------
693 // _storeu (128bit)
694 // --------------------------------------------------------------------------
695 
696 template <typename T, typename TSimdVector, int L>
_storeu(T * memAddr,TSimdVector const & vec,SimdParams_<16,L>)697 inline void _storeu(T * memAddr, TSimdVector const & vec, SimdParams_<16, L>)
698 {
699     _mm_storeu_si128((__m128i*)memAddr, reinterpret_cast<const __m128i &>(vec));
700 }
701 
702 // ----------------------------------------------------------------------------
703 // Function _load() 128bit
704 // ----------------------------------------------------------------------------
705 
706 template <typename TSimdVector, typename T, int L>
_load(T const * memAddr,SimdParams_<16,L>)707 inline TSimdVector _load(T const * memAddr, SimdParams_<16, L>)
708 {
709     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_load_si128((__m128i const *) memAddr));
710 }
711 
712 // --------------------------------------------------------------------------
713 // _shiftRightLogical (128bit)
714 // --------------------------------------------------------------------------
715 
// Logical right shift of 16 x 8bit values by imm bits.
template <typename TSimdVector>
inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<16, 16>)
{
    // SSE has no 8-bit shift. Shift 16-bit lanes instead, then mask each byte
    // with (0xff >> imm) to clear the bits that leaked in from the
    // neighbouring (higher) byte of the same 16-bit lane.
    return SEQAN_VECTOR_CAST_(TSimdVector, _mm_srli_epi16(SEQAN_VECTOR_CAST_(const __m128i &, vector), imm) & _mm_set1_epi8(0xff >> imm));
}
721 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<16,8>)722 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<16, 8>)
723 {
724     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_srli_epi16(SEQAN_VECTOR_CAST_(const __m128i &, vector), imm));
725 }
726 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<16,4>)727 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<16, 4>)
728 {
729     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_srli_epi32(SEQAN_VECTOR_CAST_(const __m128i &, vector), imm));
730 }
731 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<16,2>)732 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<16, 2>)
733 {
734     return SEQAN_VECTOR_CAST_(TSimdVector, _mm_srli_epi64(SEQAN_VECTOR_CAST_(const __m128i &, vector), imm));
735 }
736 
737 // --------------------------------------------------------------------------
738 // _gather (128bit)
739 // --------------------------------------------------------------------------
740 
// Scalar fallback for gather: ret[i] = memAddr[idx[i]] for every lane.
// NOTE(review): SCALE is unused here — indexing memAddr[...] already scales by
// sizeof(TValue), so this presumably assumes SCALE == sizeof(TValue); confirm
// against the hardware-gather overloads.
template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE, typename TSimdParams>
inline TSimdVector
_gather(TValue const * memAddr,
        TSimdVector const & idx,
        std::integral_constant<TSize, SCALE> const & /*scale*/,
        TSimdParams)
{
    TSimdVector ret;
    // Fill each lane individually; no SIMD gather instruction is used.
    for (auto i = 0u; i < LENGTH<TSimdVector>::VALUE; ++i)
    {
        ret[i] = memAddr[idx[i]];
    }
    return ret;
}
755 
756 // --------------------------------------------------------------------------
757 // _shuffleVector (128bit)
758 // --------------------------------------------------------------------------
759 
// Shuffle the 8 x 16bit values of a according to the low 8 bytes of b
// (b[i] is the 16-bit element index for result lane i), built on top of the
// byte-wise _mm_shuffle_epi8 by expanding each 16-bit index into the byte
// indices of its low and high byte.
inline __m128i
seqan_mm_shuffle_epi16(const __m128i a, const __m128i b)
{
    // multiply by 2
    __m128i idx = _mm_slli_epi16(b, 1);
    return _mm_shuffle_epi8(
        a,
        // interleave idx[7:0]   = 2*indices[7],   ..., 2*indices[0]
        // with       idx[7:0]+1 = 2*indices[7]+1, ..., 2*indices[0]+1
        // => 2*indices[7]+1, 2*indices[7], ..., 2*indices[0]+1, 2*indices[0]
        _mm_unpacklo_epi8(
            idx,
            _mm_add_epi8(idx, _mm_set1_epi8(1))
        )
    );
}
776 
// Shuffle the 4 x 32bit values of a according to the low 4 bytes of b
// (b[i] is the 32-bit element index for result lane i). Each 32-bit index is
// expanded into its four byte indices for _mm_shuffle_epi8.
inline __m128i
seqan_mm_shuffle_epi32(const __m128i a, const __m128i b)
{
    // multiply by 4
    __m128i idx = _mm_slli_epi16(b, 2);
    return _mm_shuffle_epi8(
        a,
        // interleave 4*indices[3]+1, 4*indices[3]+0; ..., 4*indices[0]+1, 4*indices[0]+0
        // with       4*indices[3]+3, 4*indices[3]+2; ..., 4*indices[0]+3, 4*indices[0]+2
        // => 4*indices[3]+3, 4*indices[3]+2; 4*indices[3]+1, 4*indices[3]+0;
        //    ...
        //    4*indices[0]+3, 4*indices[0]+2; 4*indices[0]+1, 4*indices[0]+0
        _mm_unpacklo_epi16(
            // interleave idx[3:0]+0 = 4*indices[3]+0; ...; 4*indices[0]+0
            // with       idx[3:0]+1 = 4*indices[3]+1; ...; 4*indices[0]+1
            // => 4*indices[3]+1; 4*indices[3]+0; ...; 4*indices[0]+1; 4*indices[0]+0
            _mm_unpacklo_epi8(
                idx,
                _mm_add_epi8(idx, _mm_set1_epi8(1))
            ),
            // interleave idx[3:0]+2 = 4*indices[3]+2; ...; 4*indices[0]+2
            // with       idx[3:0]+3 = 4*indices[3]+3; ...; 4*indices[0]+3
            // => 4*indices[3]+3; 4*indices[3]+2; ...; 4*indices[0]+3; 4*indices[0]+2
            _mm_unpacklo_epi8(
                _mm_add_epi8(idx, _mm_set1_epi8(2)),
                _mm_add_epi8(idx, _mm_set1_epi8(3))
            )
    ));
}
806 
// Shuffle the 2 x 64bit values of a according to the low 2 bytes of b
// (b[i] is the 64-bit element index for result lane i). Each 64-bit index is
// expanded into its eight byte indices for _mm_shuffle_epi8.
inline __m128i
seqan_mm_shuffle_epi64(const __m128i a, const __m128i b)
{
    // multiply by 8
    __m128i idx = _mm_slli_epi16(b, 3);
    return _mm_shuffle_epi8(
        a,
        _mm_unpacklo_epi32(
            // interleave 8*indices[1]+1, 8*indices[1]+0; ..., 8*indices[0]+1, 8*indices[0]+0
            // with       8*indices[1]+3, 8*indices[1]+2; ..., 8*indices[0]+3, 8*indices[0]+2
            // => 8*indices[1]+3, 8*indices[1]+2; 8*indices[1]+1, 8*indices[1]+0;
            //    ...
            //    8*indices[0]+3, 8*indices[0]+2; 8*indices[0]+1, 8*indices[0]+0
            _mm_unpacklo_epi16(
                // interleave idx[1:0]+0 = 8*indices[1]+0; ...; 8*indices[0]+0
                // with       idx[1:0]+1 = 8*indices[1]+1; ...; 8*indices[0]+1
                // => 8*indices[1]+1; 8*indices[1]+0; ...; 8*indices[0]+1; 8*indices[0]+0
                _mm_unpacklo_epi8(
                    idx,
                    _mm_add_epi8(idx, _mm_set1_epi8(1))
                ),
                // interleave idx[1:0]+2 = 8*indices[1]+2; ...; 8*indices[0]+2
                // with       idx[1:0]+3 = 8*indices[1]+3; ...; 8*indices[0]+3
                // => 8*indices[1]+3; 8*indices[1]+2; ...; 8*indices[0]+3; 8*indices[0]+2
                _mm_unpacklo_epi8(
                    _mm_add_epi8(idx, _mm_set1_epi8(2)),
                    _mm_add_epi8(idx, _mm_set1_epi8(3))
                )
            ),
            // interleave 8*indices[1]+5, 8*indices[1]+4; ..., 8*indices[0]+5, 8*indices[0]+4
            // with       8*indices[1]+7, 8*indices[1]+6; ..., 8*indices[0]+7, 8*indices[0]+6
            // => 8*indices[1]+7, 8*indices[1]+6; 8*indices[1]+5, 8*indices[1]+4;
            //    ...
            //    8*indices[0]+7, 8*indices[0]+6; 8*indices[0]+5, 8*indices[0]+4
            _mm_unpacklo_epi16(
                // interleave idx[1:0]+4 = 8*indices[1]+4; ...; 8*indices[0]+4
                // with       idx[1:0]+5 = 8*indices[1]+5; ...; 8*indices[0]+5
                // => 8*indices[1]+5; 8*indices[1]+4; ...; 8*indices[0]+5; 8*indices[0]+4
                _mm_unpacklo_epi8(
                    _mm_add_epi8(idx, _mm_set1_epi8(4)),
                    _mm_add_epi8(idx, _mm_set1_epi8(5))
                ),
                // interleave idx[1:0]+6 = 8*indices[1]+6; ...; 8*indices[0]+6
                // with       idx[1:0]+7 = 8*indices[1]+7; ...; 8*indices[0]+7
                // => 8*indices[1]+7; 8*indices[1]+6; ...; 8*indices[0]+7; 8*indices[0]+6
                _mm_unpacklo_epi8(
                    _mm_add_epi8(idx, _mm_set1_epi8(6)),
                    _mm_add_epi8(idx, _mm_set1_epi8(7))
                )
            )
        )
    );
}
860 
// Shuffle 8 x 16bit values of a 128bit vector according to 8 x 8bit indices
// packed in a 64bit vector. Each index is doubled (shifted left by 1) and
// expanded to the byte pair {2*i, 2*i+1} for _mm_shuffle_epi8.
template <typename TSimdVector1, typename TSimdVector2>
[[deprecated("Here be dragons")]]
inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 8>, SimdParams_<8, 8>)
{
#if SEQAN_IS_32_BIT
    // On 32-bit targets there is no 64-bit cvtsi intrinsic: move the low and
    // high 32 bits of the 64-bit index vector separately and recombine them.
    __m128i idx = _mm_slli_epi16(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(reinterpret_cast<const uint32_t &>(indices)),
            _mm_cvtsi32_si128(reinterpret_cast<const uint64_t &>(indices) >> 32)
        ),
        1
    );
#else
    // Move all 8 index bytes into an xmm register and multiply them by 2.
    __m128i idx = _mm_slli_epi16(_mm_cvtsi64_si128(reinterpret_cast<const uint64_t &>(indices)), 1);
#endif  // SEQAN_IS_32_BIT
    // Interleave 2*i with 2*i+1 to address both bytes of each 16-bit element.
    return SEQAN_VECTOR_CAST_(TSimdVector1,
        _mm_shuffle_epi8(
            SEQAN_VECTOR_CAST_(const __m128i &, vector),
            _mm_unpacklo_epi8(idx, _mm_add_epi8(idx, _mm_set1_epi8(1)))
        ));
}
883 
884 template <typename TSimdVector1, typename TSimdVector2>
885 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<16,16>,SimdParams_<16,16>)886 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 16>, SimdParams_<16, 16>)
887 {
888     return SEQAN_VECTOR_CAST_(
889         TSimdVector1,
890         _mm_shuffle_epi8(
891             SEQAN_VECTOR_CAST_(const __m128i &, vector),
892             SEQAN_VECTOR_CAST_(const __m128i &, indices)
893     ));
894 }
895 
896 template <typename TSimdVector1, typename TSimdVector2>
897 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<16,8>,SimdParams_<16,16>)898 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 8>, SimdParams_<16, 16>)
899 {
900     return SEQAN_VECTOR_CAST_(
901         TSimdVector1,
902         seqan_mm_shuffle_epi16(
903             SEQAN_VECTOR_CAST_(const __m128i &, vector),
904             SEQAN_VECTOR_CAST_(const __m128i &, indices)
905     ));
906 }
907 
908 template <typename TSimdVector1, typename TSimdVector2>
909 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<16,4>,SimdParams_<16,16>)910 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 4>, SimdParams_<16, 16>)
911 {
912     return SEQAN_VECTOR_CAST_(
913         TSimdVector1,
914         seqan_mm_shuffle_epi32(
915             SEQAN_VECTOR_CAST_(const __m128i &, vector),
916             SEQAN_VECTOR_CAST_(const __m128i &, indices)
917     ));
918 }
919 
920 template <typename TSimdVector1, typename TSimdVector2>
921 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<16,2>,SimdParams_<16,16>)922 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 2>, SimdParams_<16, 16>)
923 {
924     return SEQAN_VECTOR_CAST_(
925         TSimdVector1,
926         seqan_mm_shuffle_epi64(
927             SEQAN_VECTOR_CAST_(const __m128i &, vector),
928             SEQAN_VECTOR_CAST_(const __m128i &, indices)
929     ));
930 }
931 
932 // --------------------------------------------------------------------------
933 // _transposeMatrix (128bit)
934 // --------------------------------------------------------------------------
935 
// Transpose an 8x8 byte matrix in place using three rounds of MMX unpack
// operations (bytes -> 16bit pairs -> 32bit quads).
// NOTE(review): uses MMX registers (__m64) without a trailing _mm_empty();
// confirm x87/MMX state handling on targets where this matters.
template <typename TSimdVector>
inline void
_transposeMatrix(TSimdVector matrix[], SimdMatrixParams_<8, 8, 8>)
{
    // we need a look-up table to reverse the lowest 4 bits
    // in order to place the transposed rows at their correct position
    static const unsigned char bitRev[] = {0,4,2,6,1,5,3,7};

    // transpose a 8x8 byte matrix
    // round 1: interleave adjacent rows byte-wise.
    __m64 tmp1[8];
    for (int i = 0; i < 4; ++i)
    {
        tmp1[i]   = _mm_unpacklo_pi8(SEQAN_VECTOR_CAST_(const __m64 &, matrix[2*i]), SEQAN_VECTOR_CAST_(const __m64 &, matrix[2*i+1]));
        tmp1[i+4] = _mm_unpackhi_pi8(SEQAN_VECTOR_CAST_(const __m64 &, matrix[2*i]), SEQAN_VECTOR_CAST_(const __m64 &, matrix[2*i+1]));
    }
    // round 2: interleave the results 16 bits at a time.
    __m64 tmp2[8];
    for (int i = 0; i < 4; ++i)
    {
        tmp2[i]   = _mm_unpacklo_pi16(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+4] = _mm_unpackhi_pi16(tmp1[2*i], tmp1[2*i+1]);
    }
    // round 3: interleave 32 bits at a time and scatter the finished rows
    // back into matrix[] via the bit-reversal table.
    for (int i = 0; i < 4; ++i)
    {
        matrix[bitRev[i]]   = SEQAN_VECTOR_CAST_(TSimdVector, _mm_unpacklo_pi32(tmp2[2*i], tmp2[2*i+1]));
        matrix[bitRev[i+4]] = SEQAN_VECTOR_CAST_(TSimdVector, _mm_unpackhi_pi32(tmp2[2*i], tmp2[2*i+1]));
    }
}
963 
// Transpose a 16x16 byte matrix in place using four rounds of SSE unpack
// operations (bytes -> 16bit -> 32bit -> 64bit), scattering the finished
// rows to their final positions via a 4-bit bit-reversal table.
template <typename TSimdVector>
inline void
_transposeMatrix(TSimdVector matrix[], SimdMatrixParams_<16, 16, 8>)
{
    // we need a look-up table to reverse the lowest 4 bits
    // in order to place the transposed rows at their correct position
    static const unsigned char bitRev[] = {0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15};

    // transpose a 16x16 byte matrix
    //
    // matrix =
    // A0 A1 A2 ... Ae Af
    // B0 B1 B2 ... Be Bf
    // ...
    // P0 P1 P2 ... Pe Pf
    __m128i tmp1[16];
    for (int i = 0; i < 8; ++i)
    {
        tmp1[i]   = _mm_unpacklo_epi8(SEQAN_VECTOR_CAST_(const __m128i &, matrix[2*i]), SEQAN_VECTOR_CAST_(const __m128i &, matrix[2*i+1]));
        tmp1[i+8] = _mm_unpackhi_epi8(SEQAN_VECTOR_CAST_(const __m128i &, matrix[2*i]), SEQAN_VECTOR_CAST_(const __m128i &, matrix[2*i+1]));
    }
    // tmp1[0]  = A0 B0 A1 B1 ... A7 B7
    // tmp1[1]  = C0 D0 C1 D1 ... C7 D7
    // ...
    // tmp1[7]  = O0 P0 O1 P1 ... O7 P7
    // tmp1[8]  = A8 B8 A9 B9 ... Af Bf
    // ...
    // tmp1[15] = O8 P8 O9 P9 ... Of Pf
    __m128i tmp2[16];
    for (int i = 0; i < 8; ++i)
    {
        tmp2[i]   = _mm_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+8] = _mm_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
    }
    // tmp2[0]  = A0 B0 C0 D0 ... A3 B3 C3 D3
    // tmp2[1]  = E0 F0 G0 H0 ... E3 F3 G3 H3
    // ...
    // tmp2[3]  = M0 N0 O0 P0 ... M3 N3 O3 P3
    // tmp2[4]  = A8 B8 C8 D8 ... Ab Bb Cb Db
    // ...
    // tmp2[7]  = M8 N8 O8 P8 ... Mb Nb Ob Pb
    // tmp2[8]  = A4 B4 C4 D4 ... A7 B7 C7 D7
    // ..
    // tmp2[12] = Ac Bc Cc Dc ... Af Bf Cf Df
    // ...
    // tmp2[15] = Mc Nc Oc Pc ... Mf Nf Of Pf
    for (int i = 0; i < 8; ++i)
    {
        tmp1[i]   = _mm_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
        tmp1[i+8] = _mm_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
    }
    // tmp1[0]  = A0 B0 .... H0 A1 B1 .... H1
    // tmp1[1]  = I0 J0 .... P0 I1 J1 .... P1
    // ...
    // tmp1[8]  = A2 B2 .... H2 A3 B3 .... H3
    // tmp1[9]  = I2 J2 .... P2 I3 J3 .... P3
    // final round: 64-bit interleave yields complete transposed rows, placed
    // at their final positions through the bit-reversal table.
    for (int i = 0; i < 8; ++i)
    {
        matrix[bitRev[i]]   = SEQAN_VECTOR_CAST_(TSimdVector, _mm_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]));
        matrix[bitRev[i+8]] = SEQAN_VECTOR_CAST_(TSimdVector, _mm_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]));
    }
}
1026 
1027 // --------------------------------------------------------------------------
1028 // Function _testAllZeros (128bit)
1029 // --------------------------------------------------------------------------
1030 
// Returns a non-zero value iff (vector & mask) == 0, i.e. all bits of the
// vector selected by the mask are zero (SSE4.1 ptest).
template <typename TSimdVector>
SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector> >, int)
inline _testAllZeros(TSimdVector const & vector, TSimdVector const & mask, SimdParams_<16>)
{
    return _mm_testz_si128(SEQAN_VECTOR_CAST_(const __m128i &, vector),
                           SEQAN_VECTOR_CAST_(const __m128i &, mask));
}
1038 
1039 // --------------------------------------------------------------------------
1040 // Function _testAllOnes (128bit)
1041 // --------------------------------------------------------------------------
1042 
// Returns a non-zero value iff every bit of the 128bit vector is set.
template <typename TSimdVector>
inline
SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector> >, int)
_testAllOnes(TSimdVector const & vector, SimdParams_<16>)
{
    return _mm_test_all_ones(SEQAN_VECTOR_CAST_(const __m128i &, vector));
}
1050 
1051 } // namespace seqan
1052 
1053 #endif // SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_SSE4_2_H_
1054