1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: David Weese <david.weese@fu-berlin.de>
33 // René Rahn <rene.rahn@fu-berlin.de>
34 // Stefan Budach <stefan.budach@fu-berlin.de>
35 // ==========================================================================
36 // generic SIMD interface for SSE3 / AVX2
37 // ==========================================================================
38
39 #ifndef SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_SSE4_2_H_
40 #define SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_SSE4_2_H_
41
42 namespace seqan {
43
// Each SEQAN_DEFINE_SIMD_VECTOR_(name, scalar type, byte width) invocation
// declares a SIMD vector type with (byte width / sizeof(scalar)) elements.

// SimdParams_<8, 8>: 64bit = 8 elements * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8Char, char, 8)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8SChar, signed char, 8)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8UChar, unsigned char, 8)

// SimdParams_<8, 4>: 64bit = 4 elements * 2 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4Short, short, 8)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4UShort, unsigned short, 8)

// SimdParams_<8, 2>: 64bit = 2 elements * 4 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector2Int, int, 8)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector2UInt, unsigned int, 8)

// SimdParams_<16, 16>: 128bit = 16 elements * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16Char, char, 16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16SChar, signed char, 16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector16UChar, unsigned char, 16)

// SimdParams_<16, 8>: 128bit = 8 elements * 2 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8Short, short, 16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector8UShort, unsigned short, 16)

// SimdParams_<16, 4>: 128bit = 4 elements * 4 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4Int, int, 16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector4UInt, unsigned int, 16)

// SimdParams_<16, 2>: 128bit = 2 elements * 8 * 8bit
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector2Int64, int64_t, 16)
SEQAN_DEFINE_SIMD_VECTOR_(SimdVector2UInt64, uint64_t, 16)
73
74 // ============================================================================
75 // Functions
76 // ============================================================================
77
78 // --------------------------------------------------------------------------
79 // _fillVector (128bit)
80 // --------------------------------------------------------------------------
81
82 template <typename TSimdVector, typename... TValue>
83 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & x,std::index_sequence<0> const &,SimdParams_<16,16> const &)84 _fillVector(TSimdVector & vector,
85 std::tuple<TValue...> const & x,
86 std::index_sequence<0> const &,
87 SimdParams_<16, 16> const &)
88 {
89 vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi8(std::get<0>(x)));
90 }
91
92 template <typename TSimdVector, typename... TValue>
93 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & x,std::index_sequence<0> const &,SimdParams_<16,8> const &)94 _fillVector(TSimdVector & vector,
95 std::tuple<TValue...> const & x,
96 std::index_sequence<0> const &,
97 SimdParams_<16, 8> const &)
98 {
99 vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi16(std::get<0>(x)));
100 }
101
102 template <typename TSimdVector, typename... TValue>
103 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & x,std::index_sequence<0> const &,SimdParams_<16,4> const &)104 _fillVector(TSimdVector & vector,
105 std::tuple<TValue...> const & x,
106 std::index_sequence<0> const &,
107 SimdParams_<16, 4> const &)
108 {
109 vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi32(std::get<0>(x)));
110 }
111
112 template <typename TSimdVector, typename... TValue>
113 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & x,std::index_sequence<0> const &,SimdParams_<16,2> const &)114 _fillVector(TSimdVector & vector,
115 std::tuple<TValue...> const & x,
116 std::index_sequence<0> const &,
117 SimdParams_<16, 2> const &)
118 {
119 vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi64x(std::get<0>(x)));
120 }
121
122 template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
123 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & args,std::index_sequence<INDICES...> const &,SimdParams_<16,16> const &)124 _fillVector(TSimdVector & vector,
125 std::tuple<TValue...> const & args,
126 std::index_sequence<INDICES...> const &,
127 SimdParams_<16, 16> const &)
128 {
129 vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_setr_epi8(std::get<INDICES>(args)...));
130 }
131
132 template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
133 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & args,std::index_sequence<INDICES...> const &,SimdParams_<16,8> const &)134 _fillVector(TSimdVector & vector,
135 std::tuple<TValue...> const & args,
136 std::index_sequence<INDICES...> const &,
137 SimdParams_<16, 8> const &)
138 {
139 vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_setr_epi16(std::get<INDICES>(args)...));
140 }
141
142 template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
143 inline void
_fillVector(TSimdVector & vector,std::tuple<TValue...> const & args,std::index_sequence<INDICES...> const &,SimdParams_<16,4> const &)144 _fillVector(TSimdVector & vector,
145 std::tuple<TValue...> const & args,
146 std::index_sequence<INDICES...> const &,
147 SimdParams_<16, 4> const &)
148 {
149 vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_setr_epi32(std::get<INDICES>(args)...));
150 }
151
template <typename TSimdVector, typename ...TValue, size_t ...INDICES>
inline void
_fillVector(TSimdVector & vector,
            std::tuple<TValue...> const & args,
            std::index_sequence<INDICES...> const &,
            SimdParams_<16, 2> const &)
{
    // Fill both 64-bit lanes from the argument tuple. _mm_set_epi64x takes its
    // arguments high-lane first, so the pack is expanded with reversed indices
    // to keep lane order consistent with the setr-based overloads above.
    // reverse argument list 0, 1 -> 1, 0
    // NOTE(marehr): Intel linux fails to reverse argument list and only
    // _mm_set_epi64x has no reverse equivalent
    // NOTE(rrahn): For g++-4.9 the set_epi function is a macro, which does not work with parameter pack expansion.
    vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_set_epi64x(std::get<sizeof...(INDICES) - 1 - INDICES>(args)...));
}
165
166 // --------------------------------------------------------------------------
167 // _clearVector (128bit)
168 // --------------------------------------------------------------------------
169
170 template <typename TSimdVector, int L>
_clearVector(TSimdVector & vector,SimdParams_<16,L>)171 inline void _clearVector(TSimdVector & vector, SimdParams_<16, L>)
172 {
173 vector = SEQAN_VECTOR_CAST_(TSimdVector, _mm_setzero_si128());
174 }
175
176 // --------------------------------------------------------------------------
177 // _createVector (128bit)
178 // --------------------------------------------------------------------------
179
180 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<16,16>)181 inline TSimdVector _createVector(TValue const x, SimdParams_<16, 16>)
182 {
183 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi8(x));
184 }
185
186 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<16,8>)187 inline TSimdVector _createVector(TValue const x, SimdParams_<16, 8>)
188 {
189 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi16(x));
190 }
191
192 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<16,4>)193 inline TSimdVector _createVector(TValue const x, SimdParams_<16, 4>)
194 {
195 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi32(x));
196 }
197
198 template <typename TSimdVector, typename TValue>
_createVector(TValue const x,SimdParams_<16,2>)199 inline TSimdVector _createVector(TValue const x, SimdParams_<16, 2>)
200 {
201 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_set1_epi64x(x));
202 }
203
204 // --------------------------------------------------------------------------
205 // cmpEq (128bit)
206 // --------------------------------------------------------------------------
207
208 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16>)209 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16>)
210 {
211 return SEQAN_VECTOR_CAST_(TSimdVector,
212 _mm_cmpeq_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
213 SEQAN_VECTOR_CAST_(const __m128i&, b)));
214 }
215
216 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8>)217 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8>)
218 {
219 return SEQAN_VECTOR_CAST_(TSimdVector,
220 _mm_cmpeq_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
221 SEQAN_VECTOR_CAST_(const __m128i&, b)));
222 }
223
224 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4>)225 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4>)
226 {
227 return SEQAN_VECTOR_CAST_(TSimdVector,
228 _mm_cmpeq_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
229 SEQAN_VECTOR_CAST_(const __m128i&, b)));
230 }
231
232 template <typename TSimdVector>
_cmpEq(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2>)233 inline TSimdVector _cmpEq(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2>)
234 {
235 return SEQAN_VECTOR_CAST_(TSimdVector,
236 _mm_cmpeq_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
237 SEQAN_VECTOR_CAST_(const __m128i&, b)));
238 }
239
240 // --------------------------------------------------------------------------
241 // _cmpGt (128bit)
242 // --------------------------------------------------------------------------
243
244 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,int8_t>)245 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, int8_t>)
246 {
247 return SEQAN_VECTOR_CAST_(TSimdVector,
248 _mm_cmpgt_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
249 SEQAN_VECTOR_CAST_(const __m128i&, b)));
250 }
251
252 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,uint8_t>)253 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, uint8_t>)
254 {
255 // There is no unsigned cmpgt, we reduce it to the signed case.
256 // Note that 0x80 = ~0x7F (prevent overflow messages).
257 return SEQAN_VECTOR_CAST_(TSimdVector,
258 _mm_cmpgt_epi8(
259 _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, a), _mm_set1_epi8(~0x7F)),
260 _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, b), _mm_set1_epi8(~0x7F))));
261 }
262
263 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,int16_t>)264 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, int16_t>)
265 {
266 return SEQAN_VECTOR_CAST_(TSimdVector,
267 _mm_cmpgt_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
268 SEQAN_VECTOR_CAST_(const __m128i&, b)));
269 }
270
271 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,uint16_t>)272 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, uint16_t>)
273 {
274 // There is no unsigned cmpgt, we reduce it to the signed case.
275 // Note that 0x8000 = ~0x7FFF (prevent overflow messages).
276 return SEQAN_VECTOR_CAST_(TSimdVector,
277 _mm_cmpgt_epi16(
278 _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, a), _mm_set1_epi16(~0x7FFF)),
279 _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, b), _mm_set1_epi16(~0x7FFF))));
280 }
281
282 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,int32_t>)283 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, int32_t>)
284 {
285 return SEQAN_VECTOR_CAST_(TSimdVector,
286 _mm_cmpgt_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
287 SEQAN_VECTOR_CAST_(const __m128i&, b)));
288 }
289
290 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,uint32_t>)291 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, uint32_t>)
292 {
293 // There is no unsigned cmpgt, we reduce it to the signed case.
294 // Note that 0x80000000 = ~0x7FFFFFFF (prevent overflow messages).
295 return SEQAN_VECTOR_CAST_(TSimdVector,
296 _mm_cmpgt_epi32(
297 _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, a), _mm_set1_epi32(~0x7FFFFFFF)),
298 _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, b), _mm_set1_epi32(~0x7FFFFFFF))));
299 }
300
301 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2,int64_t>)302 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, int64_t>)
303 {
304 return SEQAN_VECTOR_CAST_(TSimdVector,
305 _mm_cmpgt_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
306 SEQAN_VECTOR_CAST_(const __m128i&, b)));
307 }
308
309 template <typename TSimdVector>
_cmpGt(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2,uint64_t>)310 inline TSimdVector _cmpGt(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, uint64_t>)
311 {
312 // There is no unsigned cmpgt, we reduce it to the signed case.
313 // Note that 0x8000000000000000ul = ~0x7FFFFFFFFFFFFFFFul (prevent overflow messages).
314 return SEQAN_VECTOR_CAST_(TSimdVector,
315 _mm_cmpgt_epi64(
316 _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, a) ,_mm_set1_epi64x(~0x7FFFFFFFFFFFFFFFul)),
317 _mm_xor_si128(SEQAN_VECTOR_CAST_(const __m128i&, b), _mm_set1_epi64x(~0x7FFFFFFFFFFFFFFFul))));
318 }
319
320 // --------------------------------------------------------------------------
321 // _bitwiseOr (128bit)
322 // --------------------------------------------------------------------------
323
324 template <typename TSimdVector, int L>
_bitwiseOr(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,L>)325 inline TSimdVector _bitwiseOr(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, L>)
326 {
327 return SEQAN_VECTOR_CAST_(TSimdVector,
328 _mm_or_si128(SEQAN_VECTOR_CAST_(const __m128i&, a),
329 SEQAN_VECTOR_CAST_(const __m128i&, b)));
330 }
331
332 // --------------------------------------------------------------------------
333 // _bitwiseAnd (128bit)
334 // --------------------------------------------------------------------------
335
336 template <typename TSimdVector, int L>
_bitwiseAnd(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,L>)337 inline TSimdVector _bitwiseAnd(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, L>)
338 {
339 return SEQAN_VECTOR_CAST_(TSimdVector,
340 _mm_and_si128(SEQAN_VECTOR_CAST_(const __m128i&, a),
341 SEQAN_VECTOR_CAST_(const __m128i&, b)));
342 }
343
344 // --------------------------------------------------------------------------
345 // _bitwiseAndNot (128bit)
346 // --------------------------------------------------------------------------
347
348 template <typename TSimdVector, int L>
_bitwiseAndNot(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,L>)349 inline TSimdVector _bitwiseAndNot(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, L>)
350 {
351 return SEQAN_VECTOR_CAST_(TSimdVector,
352 _mm_andnot_si128(SEQAN_VECTOR_CAST_(const __m128i&, a),
353 SEQAN_VECTOR_CAST_(const __m128i&, b)));
354 }
355
356 // --------------------------------------------------------------------------
357 // _bitwiseNot (128bit)
358 // --------------------------------------------------------------------------
359
360 template <typename TSimdVector>
_bitwiseNot(TSimdVector const & a,SimdParams_<16,16>)361 inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<16, 16>)
362 {
363 return SEQAN_VECTOR_CAST_(TSimdVector,
364 _mm_cmpeq_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
365 _mm_setzero_si128()));
366 }
367
368 template <typename TSimdVector>
_bitwiseNot(TSimdVector const & a,SimdParams_<16,8>)369 inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<16, 8>)
370 {
371 return SEQAN_VECTOR_CAST_(TSimdVector,
372 _mm_cmpeq_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
373 _mm_setzero_si128()));
374 }
375
376 template <typename TSimdVector>
_bitwiseNot(TSimdVector const & a,SimdParams_<16,4>)377 inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<16, 4>)
378 {
379 return SEQAN_VECTOR_CAST_(TSimdVector,
380 _mm_cmpeq_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
381 _mm_setzero_si128()));
382 }
383
384 template <typename TSimdVector>
_bitwiseNot(TSimdVector const & a,SimdParams_<16,2>)385 inline TSimdVector _bitwiseNot(TSimdVector const & a, SimdParams_<16, 2>)
386 {
387 return SEQAN_VECTOR_CAST_(TSimdVector,
388 _mm_cmpeq_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
389 _mm_setzero_si128()));
390 }
391
392 // --------------------------------------------------------------------------
393 // _divide (128bit)
394 // --------------------------------------------------------------------------
395
396 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<16,16>)397 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<16, 16>)
398 {
399 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_div_epi8(a, _mm_set1_epi8(b)));
400 }
401
402 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<16,8>)403 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<16, 8>)
404 {
405 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_div_epi16(a, _mm_set1_epi16(b)));
406 }
407
408 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<16,4>)409 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<16, 4>)
410 {
411 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_div_epi32(a, _mm_set1_epi32(b)));
412 }
413
414 template <typename TSimdVector>
_divide(TSimdVector const & a,int b,SimdParams_<16,2>)415 inline TSimdVector _divide(TSimdVector const & a, int b, SimdParams_<16, 2>)
416 {
417 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_div_epi64(a, _mm_set1_epi64x(b)));
418 }
419
420 // --------------------------------------------------------------------------
421 // _add (128bit)
422 // --------------------------------------------------------------------------
423
424 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16>)425 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16>)
426 {
427 return SEQAN_VECTOR_CAST_(TSimdVector,
428 _mm_add_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
429 SEQAN_VECTOR_CAST_(const __m128i&, b)));
430 }
431
432 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8>)433 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8>)
434 {
435 return SEQAN_VECTOR_CAST_(TSimdVector,
436 _mm_add_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
437 SEQAN_VECTOR_CAST_(const __m128i&, b)));
438 }
439
440 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4>)441 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4>)
442 {
443 return SEQAN_VECTOR_CAST_(TSimdVector,
444 _mm_add_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
445 SEQAN_VECTOR_CAST_(const __m128i&, b)));
446 }
447
448 template <typename TSimdVector>
_add(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2>)449 inline TSimdVector _add(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2>)
450 {
451 return SEQAN_VECTOR_CAST_(TSimdVector,
452 _mm_add_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
453 SEQAN_VECTOR_CAST_(const __m128i&, b)));
454 }
455
456 // --------------------------------------------------------------------------
457 // _sub (128bit)
458 // --------------------------------------------------------------------------
459
460 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16>)461 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16>)
462 {
463 return SEQAN_VECTOR_CAST_(TSimdVector,
464 _mm_sub_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
465 SEQAN_VECTOR_CAST_(const __m128i&, b)));
466 }
467
468 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8>)469 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8>)
470 {
471 return SEQAN_VECTOR_CAST_(TSimdVector,
472 _mm_sub_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
473 SEQAN_VECTOR_CAST_(const __m128i&, b)));
474 }
475
476 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4>)477 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4>)
478 {
479 return SEQAN_VECTOR_CAST_(TSimdVector,
480 _mm_sub_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
481 SEQAN_VECTOR_CAST_(const __m128i&, b)));
482 }
483
484 template <typename TSimdVector>
_sub(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2>)485 inline TSimdVector _sub(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2>)
486 {
487 return SEQAN_VECTOR_CAST_(TSimdVector,
488 _mm_sub_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
489 SEQAN_VECTOR_CAST_(const __m128i&, b)));
490 }
491
492 // --------------------------------------------------------------------------
493 // _mult (128bit)
494 // --------------------------------------------------------------------------
495
template <typename TSimdVector>
inline TSimdVector _mult(TSimdVector const & a, TSimdVector const &/*b*/, SimdParams_<16, 16>)
{
    // 8-bit lane-wise multiplication has no SSE intrinsic; fail loudly in
    // debug builds and return the left operand unchanged otherwise.
    SEQAN_ASSERT_FAIL("SSE intrinsics for multiplying 8 bit values not implemented!");
    return a;
}
502
503 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8>)504 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8>)
505 {
506 return SEQAN_VECTOR_CAST_(TSimdVector,
507 _mm_mullo_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
508 SEQAN_VECTOR_CAST_(const __m128i&, b)));
509 }
510
511 template <typename TSimdVector>
_mult(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4>)512 inline TSimdVector _mult(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4>)
513 {
514 return SEQAN_VECTOR_CAST_(TSimdVector,
515 _mm_mullo_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
516 SEQAN_VECTOR_CAST_(const __m128i&, b)));
517 }
518
template <typename TSimdVector>
inline TSimdVector _mult(TSimdVector const & a, TSimdVector const &/*b*/, SimdParams_<16, 2>)
{
    // 64-bit lane-wise multiplication has no SSE intrinsic; fail loudly in
    // debug builds and return the left operand unchanged otherwise.
    SEQAN_ASSERT_FAIL("SSE intrinsics for multiplying 64 bit values not implemented!");
    return a;
}
525
526 // --------------------------------------------------------------------------
527 // _max (128bit)
528 // --------------------------------------------------------------------------
529
530 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,int8_t>)531 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, int8_t>)
532 {
533 return SEQAN_VECTOR_CAST_(TSimdVector,
534 _mm_max_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
535 SEQAN_VECTOR_CAST_(const __m128i&, b)));
536 }
537
538 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,uint8_t>)539 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, uint8_t>)
540 {
541 return SEQAN_VECTOR_CAST_(TSimdVector,
542 _mm_max_epu8(SEQAN_VECTOR_CAST_(const __m128i&, a),
543 SEQAN_VECTOR_CAST_(const __m128i&, b)));
544 }
545
546 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,int16_t>)547 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, int16_t>)
548 {
549 return SEQAN_VECTOR_CAST_(TSimdVector,
550 _mm_max_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
551 SEQAN_VECTOR_CAST_(const __m128i&, b)));
552 }
553
554 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,uint16_t>)555 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, uint16_t>)
556 {
557 return SEQAN_VECTOR_CAST_(TSimdVector,
558 _mm_max_epu16(SEQAN_VECTOR_CAST_(const __m128i&, a),
559 SEQAN_VECTOR_CAST_(const __m128i&, b)));
560 }
561
562 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,int32_t>)563 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, int32_t>)
564 {
565 return SEQAN_VECTOR_CAST_(TSimdVector,
566 _mm_max_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
567 SEQAN_VECTOR_CAST_(const __m128i&, b)));
568 }
569
570 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,uint32_t>)571 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, uint32_t>)
572 {
573 return SEQAN_VECTOR_CAST_(TSimdVector,
574 _mm_max_epu32(SEQAN_VECTOR_CAST_(const __m128i&, a),
575 SEQAN_VECTOR_CAST_(const __m128i&, b)));
576 }
577
578 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2,int64_t>)579 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, int64_t>)
580 {
581 #if defined(__AVX512VL__)
582 return SEQAN_VECTOR_CAST_(TSimdVector,
583 _mm_max_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
584 SEQAN_VECTOR_CAST_(const __m128i&, b)));
585 #else // defined(__AVX512VL__)
586 return blend(b, a, cmpGt(a, b));
587 #endif // defined(__AVX512VL__)
588 }
589
590 template <typename TSimdVector>
_max(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2,uint64_t>)591 inline TSimdVector _max(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, uint64_t>)
592 {
593 #if defined(__AVX512VL__)
594 return SEQAN_VECTOR_CAST_(TSimdVector,
595 _mm_max_epu64(SEQAN_VECTOR_CAST_(const __m128i&, a),
596 SEQAN_VECTOR_CAST_(const __m128i&, b)));
597 #else // defined(__AVX512VL__)
598 return blend(b, a, cmpGt(a, b));
599 #endif // defined(__AVX512VL__)
600 }
601
602
603 // --------------------------------------------------------------------------
604 // _min (128bit)
605 // --------------------------------------------------------------------------
606
607 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,int8_t>)608 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, int8_t>)
609 {
610 return SEQAN_VECTOR_CAST_(TSimdVector,
611 _mm_min_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
612 SEQAN_VECTOR_CAST_(const __m128i&, b)));
613 }
614
615 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,16,uint8_t>)616 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 16, uint8_t>)
617 {
618 return SEQAN_VECTOR_CAST_(TSimdVector,
619 _mm_min_epu8(SEQAN_VECTOR_CAST_(const __m128i&, a),
620 SEQAN_VECTOR_CAST_(const __m128i&, b)));
621 }
622
623 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,int16_t>)624 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, int16_t>)
625 {
626 return SEQAN_VECTOR_CAST_(TSimdVector,
627 _mm_min_epi16(SEQAN_VECTOR_CAST_(const __m128i&, a),
628 SEQAN_VECTOR_CAST_(const __m128i&, b)));
629 }
630
631 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,8,uint16_t>)632 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 8, uint16_t>)
633 {
634 return SEQAN_VECTOR_CAST_(TSimdVector,
635 _mm_min_epu16(SEQAN_VECTOR_CAST_(const __m128i&, a),
636 SEQAN_VECTOR_CAST_(const __m128i&, b)));
637 }
638
639 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,int32_t>)640 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, int32_t>)
641 {
642 return SEQAN_VECTOR_CAST_(TSimdVector,
643 _mm_min_epi32(SEQAN_VECTOR_CAST_(const __m128i&, a),
644 SEQAN_VECTOR_CAST_(const __m128i&, b)));
645 }
646
647 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,4,uint32_t>)648 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 4, uint32_t>)
649 {
650 return SEQAN_VECTOR_CAST_(TSimdVector,
651 _mm_min_epu32(SEQAN_VECTOR_CAST_(const __m128i&, a),
652 SEQAN_VECTOR_CAST_(const __m128i&, b)));
653 }
654
655 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2,int64_t>)656 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, int64_t>)
657 {
658 #if defined(__AVX512VL__)
659 return SEQAN_VECTOR_CAST_(TSimdVector,
660 _mm_min_epi64(SEQAN_VECTOR_CAST_(const __m128i&, a),
661 SEQAN_VECTOR_CAST_(const __m128i&, b)));
662 #else // defined(__AVX512VL__)
663 return blend(a, b, cmpGt(a, b));
664 #endif // defined(__AVX512VL__)
665 }
666
667 template <typename TSimdVector>
_min(TSimdVector const & a,TSimdVector const & b,SimdParams_<16,2,uint64_t>)668 inline TSimdVector _min(TSimdVector const & a, TSimdVector const & b, SimdParams_<16, 2, uint64_t>)
669 {
670 #if defined(__AVX512VL__)
671 return SEQAN_VECTOR_CAST_(TSimdVector,
672 _mm_min_epu64(SEQAN_VECTOR_CAST_(const __m128i&, a),
673 SEQAN_VECTOR_CAST_(const __m128i&, b)));
674 #else // defined(__AVX512VL__)
675 return blend(a, b, cmpGt(a, b));
676 #endif // defined(__AVX512VL__)
677 }
678
679 // --------------------------------------------------------------------------
680 // _blend (128bit)
681 // --------------------------------------------------------------------------
682
683 template <typename TSimdVector, typename TSimdVectorMask, int L>
_blend(TSimdVector const & a,TSimdVector const & b,TSimdVectorMask const & mask,SimdParams_<16,L>)684 inline TSimdVector _blend(TSimdVector const & a, TSimdVector const & b, TSimdVectorMask const & mask, SimdParams_<16, L>)
685 {
686 return SEQAN_VECTOR_CAST_(TSimdVector,
687 _mm_blendv_epi8(SEQAN_VECTOR_CAST_(const __m128i&, a),
688 SEQAN_VECTOR_CAST_(const __m128i&, b),
689 SEQAN_VECTOR_CAST_(const __m128i&, mask)));
690 }
691
692 // --------------------------------------------------------------------------
693 // _storeu (128bit)
694 // --------------------------------------------------------------------------
695
696 template <typename T, typename TSimdVector, int L>
_storeu(T * memAddr,TSimdVector const & vec,SimdParams_<16,L>)697 inline void _storeu(T * memAddr, TSimdVector const & vec, SimdParams_<16, L>)
698 {
699 _mm_storeu_si128((__m128i*)memAddr, reinterpret_cast<const __m128i &>(vec));
700 }
701
702 // ----------------------------------------------------------------------------
703 // Function _load() 128bit
704 // ----------------------------------------------------------------------------
705
706 template <typename TSimdVector, typename T, int L>
_load(T const * memAddr,SimdParams_<16,L>)707 inline TSimdVector _load(T const * memAddr, SimdParams_<16, L>)
708 {
709 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_load_si128((__m128i const *) memAddr));
710 }
711
712 // --------------------------------------------------------------------------
713 // _shiftRightLogical (128bit)
714 // --------------------------------------------------------------------------
715
716 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<16,16>)717 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<16, 16>)
718 {
719 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_srli_epi16(SEQAN_VECTOR_CAST_(const __m128i &, vector), imm) & _mm_set1_epi8(0xff >> imm));
720 }
721 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<16,8>)722 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<16, 8>)
723 {
724 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_srli_epi16(SEQAN_VECTOR_CAST_(const __m128i &, vector), imm));
725 }
726 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<16,4>)727 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<16, 4>)
728 {
729 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_srli_epi32(SEQAN_VECTOR_CAST_(const __m128i &, vector), imm));
730 }
731 template <typename TSimdVector>
_shiftRightLogical(TSimdVector const & vector,const int imm,SimdParams_<16,2>)732 inline TSimdVector _shiftRightLogical(TSimdVector const & vector, const int imm, SimdParams_<16, 2>)
733 {
734 return SEQAN_VECTOR_CAST_(TSimdVector, _mm_srli_epi64(SEQAN_VECTOR_CAST_(const __m128i &, vector), imm));
735 }
736
737 // --------------------------------------------------------------------------
738 // _gather (128bit)
739 // --------------------------------------------------------------------------
740
741 template <typename TValue, typename TSimdVector, typename TSize, TSize SCALE, typename TSimdParams>
742 inline TSimdVector
_gather(TValue const * memAddr,TSimdVector const & idx,std::integral_constant<TSize,SCALE> const &,TSimdParams)743 _gather(TValue const * memAddr,
744 TSimdVector const & idx,
745 std::integral_constant<TSize, SCALE> const & /*scale*/,
746 TSimdParams)
747 {
748 TSimdVector ret;
749 for (auto i = 0u; i < LENGTH<TSimdVector>::VALUE; ++i)
750 {
751 ret[i] = memAddr[idx[i]];
752 }
753 return ret;
754 }
755
756 // --------------------------------------------------------------------------
757 // _shuffleVector (128bit)
758 // --------------------------------------------------------------------------
759
760 inline __m128i
seqan_mm_shuffle_epi16(const __m128i a,const __m128i b)761 seqan_mm_shuffle_epi16(const __m128i a, const __m128i b)
762 {
763 // multiply by 2
764 __m128i idx = _mm_slli_epi16(b, 1);
765 return _mm_shuffle_epi8(
766 a,
767 // interleave idx[7:0] = 2*indices[7], ..., 2*indices[0]
768 // with idx[7:0]+1 = 2*indices[7]+1, ..., 2*indices[0]+1
769 // => 2*indices[7]+1, 2*indices[7], ..., 2*indices[0]+1, 2*indices[0]
770 _mm_unpacklo_epi8(
771 idx,
772 _mm_add_epi8(idx, _mm_set1_epi8(1))
773 )
774 );
775 }
776
777 inline __m128i
seqan_mm_shuffle_epi32(const __m128i a,const __m128i b)778 seqan_mm_shuffle_epi32(const __m128i a, const __m128i b)
779 {
780 // multiply by 4
781 __m128i idx = _mm_slli_epi16(b, 2);
782 return _mm_shuffle_epi8(
783 a,
784 // interleave 4*indices[3]+1, 4*indices[3]+0; ..., 4*indices[0]+1, 4*indices[0]+0
785 // with 4*indices[3]+3, 4*indices[3]+2; ..., 4*indices[0]+3, 4*indices[0]+2
786 // => 4*indices[3]+3, 4*indices[3]+2; 4*indices[3]+1, 4*indices[3]+0;
787 // ...
788 // 4*indices[0]+3, 4*indices[0]+2; 4*indices[0]+1, 4*indices[0]+0
789 _mm_unpacklo_epi16(
790 // interleave idx[3:0]+0 = 4*indices[3]+0; ...; 4*indices[0]+0
791 // with idx[3:0]+1 = 4*indices[3]+1; ...; 4*indices[0]+1
792 // => 4*indices[3]+1; 4*indices[3]+0; ...; 4*indices[0]+1; 4*indices[0]+0
793 _mm_unpacklo_epi8(
794 idx,
795 _mm_add_epi8(idx, _mm_set1_epi8(1))
796 ),
797 // interleave idx[3:0]+2 = 4*indices[3]+2; ...; 4*indices[0]+2
798 // with idx[3:0]+3 = 4*indices[3]+3; ...; 4*indices[0]+3
799 // => 4*indices[3]+3; 4*indices[3]+2; ...; 4*indices[0]+3; 4*indices[0]+2
800 _mm_unpacklo_epi8(
801 _mm_add_epi8(idx, _mm_set1_epi8(2)),
802 _mm_add_epi8(idx, _mm_set1_epi8(3))
803 )
804 ));
805 }
806
807 inline __m128i
seqan_mm_shuffle_epi64(const __m128i a,const __m128i b)808 seqan_mm_shuffle_epi64(const __m128i a, const __m128i b)
809 {
810 // multiply by 8
811 __m128i idx = _mm_slli_epi16(b, 3);
812 return _mm_shuffle_epi8(
813 a,
814 _mm_unpacklo_epi32(
815 // interleave 8*indices[1]+1, 8*indices[1]+0; ..., 8*indices[0]+1, 8*indices[0]+0
816 // with 8*indices[1]+3, 8*indices[1]+2; ..., 8*indices[0]+3, 8*indices[0]+2
817 // => 8*indices[1]+3, 8*indices[1]+2; 8*indices[1]+1, 8*indices[1]+0;
818 // ...
819 // 8*indices[0]+3, 8*indices[0]+2; 8*indices[0]+1, 8*indices[0]+0
820 _mm_unpacklo_epi16(
821 // interleave idx[1:0]+0 = 8*indices[1]+0; ...; 8*indices[0]+0
822 // with idx[1:0]+1 = 8*indices[1]+1; ...; 8*indices[0]+1
823 // => 8*indices[1]+1; 8*indices[1]+0; ...; 8*indices[0]+1; 8*indices[0]+0
824 _mm_unpacklo_epi8(
825 idx,
826 _mm_add_epi8(idx, _mm_set1_epi8(1))
827 ),
828 // interleave idx[1:0]+2 = 8*indices[1]+2; ...; 8*indices[0]+2
829 // with idx[1:0]+3 = 8*indices[1]+3; ...; 8*indices[0]+3
830 // => 8*indices[1]+3; 8*indices[1]+2; ...; 8*indices[0]+3; 8*indices[0]+2
831 _mm_unpacklo_epi8(
832 _mm_add_epi8(idx, _mm_set1_epi8(2)),
833 _mm_add_epi8(idx, _mm_set1_epi8(3))
834 )
835 ),
836 // interleave 8*indices[1]+5, 8*indices[1]+4; ..., 8*indices[0]+5, 8*indices[0]+4
837 // with 8*indices[1]+7, 8*indices[1]+6; ..., 8*indices[0]+7, 8*indices[0]+6
838 // => 8*indices[1]+7, 8*indices[1]+6; 8*indices[1]+5, 8*indices[1]+4;
839 // ...
840 // 8*indices[0]+7, 8*indices[0]+6; 8*indices[0]+5, 8*indices[0]+4
841 _mm_unpacklo_epi16(
842 // interleave idx[1:0]+4 = 8*indices[1]+4; ...; 8*indices[0]+4
843 // with idx[1:0]+5 = 8*indices[1]+5; ...; 8*indices[0]+5
844 // => 8*indices[1]+5; 8*indices[1]+4; ...; 8*indices[0]+5; 8*indices[0]+4
845 _mm_unpacklo_epi8(
846 _mm_add_epi8(idx, _mm_set1_epi8(4)),
847 _mm_add_epi8(idx, _mm_set1_epi8(5))
848 ),
849 // interleave idx[1:0]+6 = 8*indices[1]+6; ...; 8*indices[0]+6
850 // with idx[1:0]+7 = 8*indices[1]+7; ...; 8*indices[0]+7
851 // => 8*indices[1]+7; 8*indices[1]+6; ...; 8*indices[0]+7; 8*indices[0]+6
852 _mm_unpacklo_epi8(
853 _mm_add_epi8(idx, _mm_set1_epi8(6)),
854 _mm_add_epi8(idx, _mm_set1_epi8(7))
855 )
856 )
857 )
858 );
859 }
860
// Shuffle the eight 16-bit lanes of a 128-bit vector using eight 8-bit
// indices stored in a 64-bit (8x8) index vector.
// NOTE(review): the reinterpret_casts below read the 8-byte index vector
// through uint32_t/uint64_t references — presumably the aliasing/alignment
// hazard behind the deprecation; verify before un-deprecating.
template <typename TSimdVector1, typename TSimdVector2>
[[deprecated("Here be dragons")]]
inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 8>, SimdParams_<8, 8>)
{
#if SEQAN_IS_32_BIT
    // 32-bit mode has no _mm_cvtsi64_si128: load the 64-bit index pack as
    // two 32-bit halves and recombine them, then double the indices
    // (shift left by 1) to turn lane indices into byte indices.
    __m128i idx = _mm_slli_epi16(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(reinterpret_cast<const uint32_t &>(indices)),
            _mm_cvtsi32_si128(reinterpret_cast<const uint64_t &>(indices) >> 32)
        ),
        1
    );
#else
    // 64-bit mode: move all eight indices at once, then double them.
    __m128i idx = _mm_slli_epi16(_mm_cvtsi64_si128(reinterpret_cast<const uint64_t &>(indices)), 1);
#endif // SEQAN_IS_32_BIT
    // Expand each doubled index 2i into the byte pair (2i, 2i+1) and shuffle.
    return SEQAN_VECTOR_CAST_(TSimdVector1,
        _mm_shuffle_epi8(
            SEQAN_VECTOR_CAST_(const __m128i &, vector),
            _mm_unpacklo_epi8(idx, _mm_add_epi8(idx, _mm_set1_epi8(1)))
        ));
}
883
884 template <typename TSimdVector1, typename TSimdVector2>
885 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<16,16>,SimdParams_<16,16>)886 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 16>, SimdParams_<16, 16>)
887 {
888 return SEQAN_VECTOR_CAST_(
889 TSimdVector1,
890 _mm_shuffle_epi8(
891 SEQAN_VECTOR_CAST_(const __m128i &, vector),
892 SEQAN_VECTOR_CAST_(const __m128i &, indices)
893 ));
894 }
895
896 template <typename TSimdVector1, typename TSimdVector2>
897 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<16,8>,SimdParams_<16,16>)898 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 8>, SimdParams_<16, 16>)
899 {
900 return SEQAN_VECTOR_CAST_(
901 TSimdVector1,
902 seqan_mm_shuffle_epi16(
903 SEQAN_VECTOR_CAST_(const __m128i &, vector),
904 SEQAN_VECTOR_CAST_(const __m128i &, indices)
905 ));
906 }
907
908 template <typename TSimdVector1, typename TSimdVector2>
909 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<16,4>,SimdParams_<16,16>)910 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 4>, SimdParams_<16, 16>)
911 {
912 return SEQAN_VECTOR_CAST_(
913 TSimdVector1,
914 seqan_mm_shuffle_epi32(
915 SEQAN_VECTOR_CAST_(const __m128i &, vector),
916 SEQAN_VECTOR_CAST_(const __m128i &, indices)
917 ));
918 }
919
920 template <typename TSimdVector1, typename TSimdVector2>
921 inline TSimdVector1
_shuffleVector(TSimdVector1 const & vector,TSimdVector2 const & indices,SimdParams_<16,2>,SimdParams_<16,16>)922 _shuffleVector(TSimdVector1 const & vector, TSimdVector2 const & indices, SimdParams_<16, 2>, SimdParams_<16, 16>)
923 {
924 return SEQAN_VECTOR_CAST_(
925 TSimdVector1,
926 seqan_mm_shuffle_epi64(
927 SEQAN_VECTOR_CAST_(const __m128i &, vector),
928 SEQAN_VECTOR_CAST_(const __m128i &, indices)
929 ));
930 }
931
932 // --------------------------------------------------------------------------
933 // _transposeMatrix (128bit)
934 // --------------------------------------------------------------------------
935
936 template <typename TSimdVector>
937 inline void
_transposeMatrix(TSimdVector matrix[],SimdMatrixParams_<8,8,8>)938 _transposeMatrix(TSimdVector matrix[], SimdMatrixParams_<8, 8, 8>)
939 {
940 // we need a look-up table to reverse the lowest 4 bits
941 // in order to place the permute the transposed rows
942 static const unsigned char bitRev[] = {0,4,2,6,1,5,3,7};
943
944 // transpose a 8x8 byte matrix
945 __m64 tmp1[8];
946 for (int i = 0; i < 4; ++i)
947 {
948 tmp1[i] = _mm_unpacklo_pi8(SEQAN_VECTOR_CAST_(const __m64 &, matrix[2*i]), SEQAN_VECTOR_CAST_(const __m64 &, matrix[2*i+1]));
949 tmp1[i+4] = _mm_unpackhi_pi8(SEQAN_VECTOR_CAST_(const __m64 &, matrix[2*i]), SEQAN_VECTOR_CAST_(const __m64 &, matrix[2*i+1]));
950 }
951 __m64 tmp2[8];
952 for (int i = 0; i < 4; ++i)
953 {
954 tmp2[i] = _mm_unpacklo_pi16(tmp1[2*i], tmp1[2*i+1]);
955 tmp2[i+4] = _mm_unpackhi_pi16(tmp1[2*i], tmp1[2*i+1]);
956 }
957 for (int i = 0; i < 4; ++i)
958 {
959 matrix[bitRev[i]] = SEQAN_VECTOR_CAST_(TSimdVector, _mm_unpacklo_pi32(tmp2[2*i], tmp2[2*i+1]));
960 matrix[bitRev[i+4]] = SEQAN_VECTOR_CAST_(TSimdVector, _mm_unpackhi_pi32(tmp2[2*i], tmp2[2*i+1]));
961 }
962 }
963
// In-place transpose of a 16x16 byte matrix held in sixteen 128-bit vectors.
// Four unpack passes (8-, 16-, 32-, 64-bit granularity) interleave the rows;
// after log2(16) = 4 passes each transposed row lands at the bit-reversed
// position of its index, which bitRev undoes on the final write-back.
template <typename TSimdVector>
inline void
_transposeMatrix(TSimdVector matrix[], SimdMatrixParams_<16, 16, 8>)
{
    // we need a look-up table to reverse the lowest 4 bits
    // in order to place the transposed rows at their final positions
    static const unsigned char bitRev[] = {0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15};

    // transpose a 16x16 byte matrix
    //
    // matrix =
    // A0 A1 A2 ... Ae Af
    // B0 B1 B2 ... Be Bf
    // ...
    // P0 P1 P2 ... Pe Pf
    __m128i tmp1[16];
    for (int i = 0; i < 8; ++i)
    {
        tmp1[i] = _mm_unpacklo_epi8(SEQAN_VECTOR_CAST_(const __m128i &, matrix[2*i]), SEQAN_VECTOR_CAST_(const __m128i &, matrix[2*i+1]));
        tmp1[i+8] = _mm_unpackhi_epi8(SEQAN_VECTOR_CAST_(const __m128i &, matrix[2*i]), SEQAN_VECTOR_CAST_(const __m128i &, matrix[2*i+1]));
    }
    // tmp1[0]  = A0 B0 A1 B1 ... A7 B7
    // tmp1[1]  = C0 D0 C1 D1 ... C7 D7
    // ...
    // tmp1[7]  = O0 P0 O1 P1 ... O7 P7
    // tmp1[8]  = A8 B8 A9 B9 ... Af Bf
    // ...
    // tmp1[15] = O8 P8 O9 P9 ... Of Pf
    __m128i tmp2[16];
    for (int i = 0; i < 8; ++i)
    {
        tmp2[i] = _mm_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+8] = _mm_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
    }
    // tmp2[0]  = A0 B0 C0 D0 ... A3 B3 C3 D3
    // tmp2[1]  = E0 F0 G0 H0 ... E3 F3 G3 H3
    // ...
    // tmp2[3]  = M0 N0 O0 P0 ... M3 N3 O3 P3
    // tmp2[4]  = A8 B8 C8 D8 ... Ab Bb Cb Db
    // ...
    // tmp2[7]  = M8 N8 O8 P8 ... Mb Nb Ob Pb
    // tmp2[8]  = A4 B4 C4 D4 ... A7 B7 C7 D7
    // ..
    // tmp2[12] = Ac Bc Cc Dc ... Af Bf Cf Df
    // ...
    // tmp2[15] = Mc Nc Oc Pc ... Mf Nf Of Pf
    for (int i = 0; i < 8; ++i)
    {
        tmp1[i] = _mm_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
        tmp1[i+8] = _mm_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
    }
    // tmp1[0] = A0 B0 .... H0 A1 B1 .... H1
    // tmp1[1] = I0 J0 .... P0 I1 J1 .... P1
    // ... (the pattern continues analogously for the remaining rows;
    // NOTE(review): the original comment duplicated rows 0/1 here)
    // Final pass: 64-bit interleave and write back at bit-reversed indices.
    for (int i = 0; i < 8; ++i)
    {
        matrix[bitRev[i]] = SEQAN_VECTOR_CAST_(TSimdVector, _mm_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]));
        matrix[bitRev[i+8]] = SEQAN_VECTOR_CAST_(TSimdVector, _mm_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]));
    }
}
1026
1027 // --------------------------------------------------------------------------
1028 // Function _testAllZeros (128bit)
1029 // --------------------------------------------------------------------------
1030
1031 template <typename TSimdVector>
SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector>>,int)1032 SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector> >, int)
1033 inline _testAllZeros(TSimdVector const & vector, TSimdVector const & mask, SimdParams_<16>)
1034 {
1035 return _mm_testz_si128(SEQAN_VECTOR_CAST_(const __m128i &, vector),
1036 SEQAN_VECTOR_CAST_(const __m128i &, mask));
1037 }
1038
1039 // --------------------------------------------------------------------------
1040 // Function _testAllOnes (128bit)
1041 // --------------------------------------------------------------------------
1042
1043 template <typename TSimdVector>
1044 inline
SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector>>,int)1045 SEQAN_FUNC_ENABLE_IF(Is<SimdVectorConcept<TSimdVector> >, int)
1046 _testAllOnes(TSimdVector const & vector, SimdParams_<16>)
1047 {
1048 return _mm_test_all_ones(SEQAN_VECTOR_CAST_(const __m128i &, vector));
1049 }
1050
1051 } // namespace seqan
1052
1053 #endif // SEQAN_INCLUDE_SEQAN_SIMD_SIMD_BASE_SEQAN_IMPL_SSE4_2_H_
1054