/*  This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
/*mask_count{{{*/
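// mask_count<N>: return how many of the N mask entries are set (each entry is
// either all zeros or all ones).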
template<> Vc_INTRINSIC Vc_CONST int mask_count<2>(__m128i k)
{
    // movemask_pd collects the sign bit of each 64-bit lane into bits 0 and 1
    int mask = _mm_movemask_pd(_mm_castsi128_pd(k));
    return (mask & 1) + (mask >> 1);
}

template<> Vc_INTRINSIC Vc_CONST int mask_count<4>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(_mm_movemask_ps(_mm_castsi128_ps(k)));
#else
    // without POPCNT: reduce each 32-bit lane to 0/1 and sum the lanes horizontally
    auto x = _mm_srli_epi32(k, 31);
    x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)));
    return _mm_cvtsi128_si32(x);
#endif
}

template<> Vc_INTRINSIC Vc_CONST int mask_count<8>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    // movemask_epi8 yields two identical bits per 16-bit entry, hence the /2
    return _mm_popcnt_u32(_mm_movemask_epi8(k)) / 2;
#else
    // without POPCNT: reduce each 16-bit lane to 0/1 and sum the lanes horizontally
    auto x = _mm_srli_epi16(k, 15);
    x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_extract_epi16(x, 0);
#endif
}

template<> Vc_INTRINSIC Vc_CONST int mask_count<16>(__m128i k)
{
    return Detail::popcnt16(_mm_movemask_epi8(k));
}
/*}}}*/
// mask_to_int/*{{{*/
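// mask_to_int<N>: compress the mask to an N-bit integer, one bit (the element's
// sign bit) per entry.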
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<2>(__m128i k)
{
    return _mm_movemask_pd(_mm_castsi128_pd(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m128i k)
{
    return _mm_movemask_ps(_mm_castsi128_ps(k));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m128i k)
{
    // pack the 16-bit entries to 8 bits first so movemask_epi8 yields one bit per entry
    return _mm_movemask_epi8(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template<> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m128i k)
{
    return _mm_movemask_epi8(k);
}
/*}}}*/
// mask_store/*{{{*/
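// mask_store<N>: convert every entry to a bool byte (0 or 1) and store the N
// bytes contiguously to mem.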
template <size_t> Vc_ALWAYS_INLINE void mask_store(__m128i k, bool *mem);
template <> Vc_ALWAYS_INLINE void mask_store<16>(__m128i k, bool *mem)
{
    // masking with 1 turns 0xff entries into the canonical bool value 1
    _mm_store_si128(reinterpret_cast<__m128i *>(mem), _mm_and_si128(k, _mm_set1_epi8(1)));
}
template <> Vc_ALWAYS_INLINE void mask_store<8>(__m128i k, bool *mem)
{
    // reduce each 16-bit entry to 0/1, pack to bytes, and store the low 8 bytes
    k = _mm_srli_epi16(k, 15);
    const auto k2 = _mm_packs_epi16(k, _mm_setzero_si128());
#ifdef __x86_64__
    *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k2);
#else
    _mm_store_sd(aliasing_cast<double>(mem), _mm_castsi128_pd(k2));
#endif
}
template <> Vc_ALWAYS_INLINE void mask_store<4>(__m128i k, bool *mem)
{
    // pack the 32-bit entries down to bytes with value 0/1 and store the low 4 bytes
    *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(
        _mm_packs_epi16(_mm_srli_epi16(_mm_packs_epi32(k, _mm_setzero_si128()), 15),
                        _mm_setzero_si128()));
}
template <> Vc_ALWAYS_INLINE void mask_store<2>(__m128i k, bool *mem)
{
    // the upper 32 bits of each 64-bit entry are 0 or -1; negating yields 0 or 1
    mem[0] = -SseIntrinsics::extract_epi32<1>(k);
    mem[1] = -SseIntrinsics::extract_epi32<3>(k);
}
/*}}}*/
// mask_load/*{{{*/
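// mask_load<N>: read N contiguous bool bytes from mem and widen every nonzero
// byte to an all-ones mask entry.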
template<size_t> Vc_ALWAYS_INLINE __m128 mask_load(const bool *mem);
template<> Vc_ALWAYS_INLINE __m128 mask_load<16>(const bool *mem)
{
    // compare against zero: any nonzero bool byte becomes 0xff
    return sse_cast<__m128>(_mm_cmpgt_epi8(
        _mm_load_si128(reinterpret_cast<const __m128i *>(mem)), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<8>(const bool *mem)
{
#ifdef __x86_64__
    __m128i k = _mm_cvtsi64_si128(*reinterpret_cast<const int64_t *>(mem));
#else
    __m128i k = _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(mem)));
#endif
    // duplicate each bool byte into a 16-bit lane, then compare against zero
    return sse_cast<__m128>(_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<4>(const bool *mem)
{
    // widen the four bool bytes to 16 bits, compare, then widen the result to 32 bits
    __m128i k = _mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem));
    k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
    return sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}
template<> Vc_ALWAYS_INLINE __m128 mask_load<2>(const bool *mem)
{
    // -int(bool) is 0 or -1, replicated into both halves of each 64-bit entry
    return sse_cast<__m128>(
        _mm_set_epi32(-int(mem[1]), -int(mem[1]), -int(mem[0]), -int(mem[0])));
}
/*}}}*/
// is_equal{{{
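// is_equal<N> / is_not_equal<N>: two masks compare equal iff their movemask bit
// patterns are identical.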
template <> Vc_INTRINSIC Vc_CONST bool is_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2));
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) == _mm_movemask_ps(k2);
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) != _mm_movemask_ps(k2);
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}

// }}}
}  // namespace Detail

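// The member store()/load() functions below forward to the Detail helpers above;
// SSE::double_m has dedicated specializations for its two entries.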
template<> Vc_ALWAYS_INLINE void SSE::double_m::store(bool *mem) const
{
    // bits 0 and 8 of movemask_epi8 carry the two mask entries; the masked result
    // is two bool bytes written as one uint16_t
    *aliasing_cast<uint16_t>(mem) = _mm_movemask_epi8(dataI()) & 0x0101;
}
template<typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::store(bool *mem) const
{
    Detail::mask_store<Size>(dataI(), mem);
}
template<> Vc_ALWAYS_INLINE void SSE::double_m::load(const bool *mem)
{
    d.set(0, MaskBool(mem[0]));
    d.set(1, MaskBool(mem[1]));
}
template <typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::load(const bool *mem)
{
    d.v() = sse_cast<VectorType>(Detail::mask_load<Size>(mem));
}

// get / operator[] {{{1
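// get() for short_m/ushort_m tests the byte-wise mask bits (shiftMask()), where
// every 16-bit entry contributes two bits, hence the index scaling by 2.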
template <>
Vc_INTRINSIC Vc_PURE bool SSE::short_m::get(const SSE::short_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}
template <>
Vc_INTRINSIC Vc_PURE bool SSE::ushort_m::get(const SSE::ushort_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}

// firstOne {{{1
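// firstOne: index of the lowest set entry, found via bit-scan-forward on the
// integer mask; the result is unspecified for an empty mask.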
template<typename T> Vc_ALWAYS_INLINE Vc_PURE int Mask<T, VectorAbi::Sse>::firstOne() const
{
    const int mask = toInt();
#ifdef _MSC_VER
    unsigned long bit;
    _BitScanForward(&bit, mask);
#else
    int bit;
    __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask));
#endif
    return bit;
}

// generate {{{1
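// generate: build a mask by evaluating gen(i) for every entry index i; the entry
// count is dispatched via integral_constant<int, Size>.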
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 2>)
{
    return _mm_set_epi64x(gen(1) ? 0xffffffffffffffffull : 0,
                          gen(0) ? 0xffffffffffffffffull : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4>)
{
    return _mm_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
                          gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8>)
{
    return _mm_setr_epi16(gen(0) ? 0xffffu : 0, gen(1) ? 0xffffu : 0,
                          gen(2) ? 0xffffu : 0, gen(3) ? 0xffffu : 0,
                          gen(4) ? 0xffffu : 0, gen(5) ? 0xffffu : 0,
                          gen(6) ? 0xffffu : 0, gen(7) ? 0xffffu : 0);
}
template <typename T>
template <typename G>
Vc_INTRINSIC Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::generate(G &&gen)
{
    return generate_impl<Mask<T, VectorAbi::Sse>>(std::forward<G>(gen),
                                  std::integral_constant<int, Size>());
}
// shifted {{{1
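// shifted: shift the mask by whole entries; the element count is converted to a
// byte offset so Detail::shifted<bytes> can shift the 16-byte register.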
template <typename T> Vc_INTRINSIC Vc_PURE Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::shifted(int amount) const
{
    switch (amount * int(sizeof(VectorEntryType))) {
    case   0: return *this;
    case   1: return Detail::shifted<  1>(dataI());
    case   2: return Detail::shifted<  2>(dataI());
    case   3: return Detail::shifted<  3>(dataI());
    case   4: return Detail::shifted<  4>(dataI());
    case   5: return Detail::shifted<  5>(dataI());
    case   6: return Detail::shifted<  6>(dataI());
    case   7: return Detail::shifted<  7>(dataI());
    case   8: return Detail::shifted<  8>(dataI());
    case   9: return Detail::shifted<  9>(dataI());
    case  10: return Detail::shifted< 10>(dataI());
    case  11: return Detail::shifted< 11>(dataI());
    case  12: return Detail::shifted< 12>(dataI());
    case  13: return Detail::shifted< 13>(dataI());
    case  14: return Detail::shifted< 14>(dataI());
    case  15: return Detail::shifted< 15>(dataI());
    case  16: return Detail::shifted< 16>(dataI());
    case  -1: return Detail::shifted< -1>(dataI());
    case  -2: return Detail::shifted< -2>(dataI());
    case  -3: return Detail::shifted< -3>(dataI());
    case  -4: return Detail::shifted< -4>(dataI());
    case  -5: return Detail::shifted< -5>(dataI());
    case  -6: return Detail::shifted< -6>(dataI());
    case  -7: return Detail::shifted< -7>(dataI());
    case  -8: return Detail::shifted< -8>(dataI());
    case  -9: return Detail::shifted< -9>(dataI());
    case -10: return Detail::shifted<-10>(dataI());
    case -11: return Detail::shifted<-11>(dataI());
    case -12: return Detail::shifted<-12>(dataI());
    case -13: return Detail::shifted<-13>(dataI());
    case -14: return Detail::shifted<-14>(dataI());
    case -15: return Detail::shifted<-15>(dataI());
    case -16: return Detail::shifted<-16>(dataI());
    }
    return Zero();
}
// }}}1

}

// vim: foldmethod=marker