/* This file is part of the Vc library. {{{
Copyright © 2013-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
/*mask_count{{{*/
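// mask_count<N> returns the number of true entries in a 16-byte mask register
// holding N lanes (each lane all-ones or all-zeros). The common strategy is to
// compress the lane sign bits into an integer bitmask via movemask and count
// the set bits; where no popcount instruction is available, the lanes are
// summed in-register instead.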
template <> Vc_INTRINSIC Vc_CONST int mask_count<2>(__m128i k)
{
    int mask = _mm_movemask_pd(_mm_castsi128_pd(k));
    return (mask & 1) + (mask >> 1);
}

template <> Vc_INTRINSIC Vc_CONST int mask_count<4>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    return _mm_popcnt_u32(_mm_movemask_ps(_mm_castsi128_ps(k)));
#else
    // Reduce each 32-bit lane to 0 or 1 (its sign bit), then sum the four
    // lanes with two shuffle+add steps and return the count from lane 0.
    auto x = _mm_srli_epi32(k, 31);
    x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi32(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(1, 0, 3, 2)));
    return _mm_cvtsi128_si32(x);
#endif
}

template <> Vc_INTRINSIC Vc_CONST int mask_count<8>(__m128i k)
{
#ifdef Vc_IMPL_POPCNT
    // movemask_epi8 yields two identical bits per 16-bit lane, so halve the count.
    return _mm_popcnt_u32(_mm_movemask_epi8(k)) / 2;
#else
    // Reduce each 16-bit lane to 0 or 1, then sum the eight lanes with three
    // shuffle+add steps and extract the count from lane 0.
    auto x = _mm_srli_epi16(k, 15);
    x = _mm_add_epi16(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 1, 2, 3)));
    x = _mm_add_epi16(x, _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_extract_epi16(x, 0);
#endif
}

template <> Vc_INTRINSIC Vc_CONST int mask_count<16>(__m128i k)
{
    return Detail::popcnt16(_mm_movemask_epi8(k));
}
/*}}}*/
// mask_to_int/*{{{*/
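// mask_to_int<N> compresses an N-lane mask into an N-bit integer, one bit per
// lane. The 8-lane case first saturating-packs the 16-bit lanes down to bytes
// so that _mm_movemask_epi8 produces exactly one bit per original lane.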
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<2>(__m128i k)
{
    return _mm_movemask_pd(_mm_castsi128_pd(k));
}
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m128i k)
{
    return _mm_movemask_ps(_mm_castsi128_ps(k));
}
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m128i k)
{
    return _mm_movemask_epi8(_mm_packs_epi16(k, _mm_setzero_si128()));
}
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m128i k)
{
    return _mm_movemask_epi8(k);
}
/*}}}*/
// mask_store/*{{{*/
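// mask_store<N> writes an N-lane register mask to an array of bool, one byte
// per lane holding 0 or 1. The 16-lane case stores the full register; the
// narrower cases pack or extract the lanes down to one byte each and store
// only as many bytes as there are lanes.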
template <size_t> Vc_ALWAYS_INLINE void mask_store(__m128i k, bool *mem);
template <> Vc_ALWAYS_INLINE void mask_store<16>(__m128i k, bool *mem)
{
    _mm_store_si128(reinterpret_cast<__m128i *>(mem), _mm_and_si128(k, _mm_set1_epi8(1)));
}
template <> Vc_ALWAYS_INLINE void mask_store<8>(__m128i k, bool *mem)
{
    k = _mm_srli_epi16(k, 15);
    const auto k2 = _mm_packs_epi16(k, _mm_setzero_si128());
#ifdef __x86_64__
    *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k2);
#else
    _mm_store_sd(aliasing_cast<double>(mem), _mm_castsi128_pd(k2));
#endif
}
template <> Vc_ALWAYS_INLINE void mask_store<4>(__m128i k, bool *mem)
{
    *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(
        _mm_packs_epi16(_mm_srli_epi16(_mm_packs_epi32(k, _mm_setzero_si128()), 15),
                        _mm_setzero_si128()));
}
template <> Vc_ALWAYS_INLINE void mask_store<2>(__m128i k, bool *mem)
{
    // Each 64-bit lane is all-ones or all-zeros; its upper 32-bit half is
    // extracted as -1 or 0 and negated to the bool values 1 or 0.
    mem[0] = -SseIntrinsics::extract_epi32<1>(k);
    mem[1] = -SseIntrinsics::extract_epi32<3>(k);
}
/*}}}*/
// mask_load/*{{{*/
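// mask_load<N> is the inverse of mask_store: it reads N bool bytes and widens
// each nonzero byte to an all-ones lane (and each zero byte to an all-zeros
// lane) of the appropriate width.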
template <size_t> Vc_ALWAYS_INLINE __m128 mask_load(const bool *mem);
template <> Vc_ALWAYS_INLINE __m128 mask_load<16>(const bool *mem)
{
    return sse_cast<__m128>(_mm_cmpgt_epi8(
        _mm_load_si128(reinterpret_cast<const __m128i *>(mem)), _mm_setzero_si128()));
}
template <> Vc_ALWAYS_INLINE __m128 mask_load<8>(const bool *mem)
{
#ifdef __x86_64__
    __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
#else
    __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
#endif
    return sse_cast<__m128>(_mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
}
template <> Vc_ALWAYS_INLINE __m128 mask_load<4>(const bool *mem)
{
    __m128i k = _mm_cvtsi32_si128(*aliasing_cast<int32_t>(mem));
    k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
    return sse_cast<__m128>(_mm_unpacklo_epi16(k, k));
}
template <> Vc_ALWAYS_INLINE __m128 mask_load<2>(const bool *mem)
{
    return sse_cast<__m128>(
        _mm_set_epi32(-int(mem[1]), -int(mem[1]), -int(mem[0]), -int(mem[0])));
}
/*}}}*/
// is_equal{{{
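// is_equal/is_not_equal compare two masks by reducing both to their movemask
// bit patterns; the __m128 arguments are only a type-pun for the 128-bit
// register, chosen per lane count.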
template <> Vc_INTRINSIC Vc_CONST bool is_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) == _mm_movemask_pd(_mm_castps_pd(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<2>(__m128 k1, __m128 k2)
{
    return _mm_movemask_pd(_mm_castps_pd(k1)) != _mm_movemask_pd(_mm_castps_pd(k2));
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) == _mm_movemask_ps(k2);
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<4>(__m128 k1, __m128 k2)
{
    return _mm_movemask_ps(k1) != _mm_movemask_ps(k2);
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<8>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}

template <> Vc_INTRINSIC Vc_CONST bool is_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) ==
           _mm_movemask_epi8(_mm_castps_si128(k2));
}
template <> Vc_INTRINSIC Vc_CONST bool is_not_equal<16>(__m128 k1, __m128 k2)
{
    return _mm_movemask_epi8(_mm_castps_si128(k1)) !=
           _mm_movemask_epi8(_mm_castps_si128(k2));
}

// }}}
}  // namespace Detail

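// double_m stores its two 64-bit lanes directly: _mm_movemask_epi8 sets bits
// 0-7 for lane 0 and bits 8-15 for lane 1, so masking with 0x0101 leaves one
// bit per lane and a single 16-bit store writes both bool bytes (x86 is
// little-endian). All other mask types dispatch to Detail::mask_store /
// Detail::mask_load by lane count.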
template <> Vc_ALWAYS_INLINE void SSE::double_m::store(bool *mem) const
{
    *aliasing_cast<uint16_t>(mem) = _mm_movemask_epi8(dataI()) & 0x0101;
}
template <typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::store(bool *mem) const
{
    Detail::mask_store<Size>(dataI(), mem);
}
template <> Vc_ALWAYS_INLINE void SSE::double_m::load(const bool *mem)
{
    d.set(0, MaskBool(mem[0]));
    d.set(1, MaskBool(mem[1]));
}
template <typename T> Vc_ALWAYS_INLINE void Mask<T, VectorAbi::Sse>::load(const bool *mem)
{
    d.v() = sse_cast<VectorType>(Detail::mask_load<Size>(mem));
}

// get / operator[] {{{1
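// short_m/ushort_m have 16-bit lanes, so shiftMask() (the byte-wise movemask)
// carries two identical bits per lane; testing bit 2*index therefore selects
// entry `index`.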
template <>
Vc_INTRINSIC Vc_PURE bool SSE::short_m::get(const SSE::short_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}
template <>
Vc_INTRINSIC Vc_PURE bool SSE::ushort_m::get(const SSE::ushort_m &m, int index) noexcept
{
    return m.shiftMask() & (1 << 2 * index);
}

// firstOne {{{1
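// firstOne returns the index of the lowest true entry via bit-scan-forward on
// the integer bitmask. Note that BSF (and _BitScanForward) leaves the result
// undefined when the mask is empty, so callers must ensure at least one entry
// is set.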
template <typename T> Vc_ALWAYS_INLINE Vc_PURE int Mask<T, VectorAbi::Sse>::firstOne() const
{
    const int mask = toInt();
#ifdef _MSC_VER
    unsigned long bit;
    _BitScanForward(&bit, mask);
#else
    int bit;
    __asm__("bsf %1,%0" : "=&r"(bit) : "r"(mask));
#endif
    return bit;
}

// generate {{{1
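// generate calls gen(i) once per lane and builds the corresponding mask, with
// the lane count dispatched at compile time via an integral_constant tag.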
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 2>)
{
    return _mm_set_epi64x(gen(1) ? 0xffffffffffffffffull : 0,
                          gen(0) ? 0xffffffffffffffffull : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 4>)
{
    return _mm_setr_epi32(gen(0) ? 0xfffffffful : 0, gen(1) ? 0xfffffffful : 0,
                          gen(2) ? 0xfffffffful : 0, gen(3) ? 0xfffffffful : 0);
}
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 8>)
{
    return _mm_setr_epi16(gen(0) ? 0xffffu : 0, gen(1) ? 0xffffu : 0,
                          gen(2) ? 0xffffu : 0, gen(3) ? 0xffffu : 0,
                          gen(4) ? 0xffffu : 0, gen(5) ? 0xffffu : 0,
                          gen(6) ? 0xffffu : 0, gen(7) ? 0xffffu : 0);
}
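// Note: no 16-lane overload is defined above, although 16-lane masks are
// handled elsewhere in this file. If one were needed here, a minimal sketch
// (an assumption, not part of the original file) would follow the same
// pattern using _mm_setr_epi8:
template <typename M, typename G>
Vc_INTRINSIC M generate_impl(G &&gen, std::integral_constant<int, 16>)
{
    // -1 yields an all-ones byte lane, 0 an all-zeros lane.
    return _mm_setr_epi8(gen(0) ? -1 : 0, gen(1) ? -1 : 0, gen(2) ? -1 : 0,
                         gen(3) ? -1 : 0, gen(4) ? -1 : 0, gen(5) ? -1 : 0,
                         gen(6) ? -1 : 0, gen(7) ? -1 : 0, gen(8) ? -1 : 0,
                         gen(9) ? -1 : 0, gen(10) ? -1 : 0, gen(11) ? -1 : 0,
                         gen(12) ? -1 : 0, gen(13) ? -1 : 0, gen(14) ? -1 : 0,
                         gen(15) ? -1 : 0);
}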
template <typename T>
template <typename G>
Vc_INTRINSIC Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::generate(G &&gen)
{
    return generate_impl<Mask<T, VectorAbi::Sse>>(std::forward<G>(gen),
                                                  std::integral_constant<int, Size>());
}
// shifted {{{1
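// shifted(amount) moves the mask by `amount` entries, shifting in false
// entries. The entry count is converted to a byte count and dispatched to the
// compile-time Detail::shifted<> byte shifts; any amount beyond the 16-byte
// register falls through to an all-false mask.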
template <typename T> Vc_INTRINSIC Vc_PURE Mask<T, VectorAbi::Sse> Mask<T, VectorAbi::Sse>::shifted(int amount) const
{
    switch (amount * int(sizeof(VectorEntryType))) {
    case   0: return *this;
    case   1: return Detail::shifted<  1>(dataI());
    case   2: return Detail::shifted<  2>(dataI());
    case   3: return Detail::shifted<  3>(dataI());
    case   4: return Detail::shifted<  4>(dataI());
    case   5: return Detail::shifted<  5>(dataI());
    case   6: return Detail::shifted<  6>(dataI());
    case   7: return Detail::shifted<  7>(dataI());
    case   8: return Detail::shifted<  8>(dataI());
    case   9: return Detail::shifted<  9>(dataI());
    case  10: return Detail::shifted< 10>(dataI());
    case  11: return Detail::shifted< 11>(dataI());
    case  12: return Detail::shifted< 12>(dataI());
    case  13: return Detail::shifted< 13>(dataI());
    case  14: return Detail::shifted< 14>(dataI());
    case  15: return Detail::shifted< 15>(dataI());
    case  16: return Detail::shifted< 16>(dataI());
    case  -1: return Detail::shifted< -1>(dataI());
    case  -2: return Detail::shifted< -2>(dataI());
    case  -3: return Detail::shifted< -3>(dataI());
    case  -4: return Detail::shifted< -4>(dataI());
    case  -5: return Detail::shifted< -5>(dataI());
    case  -6: return Detail::shifted< -6>(dataI());
    case  -7: return Detail::shifted< -7>(dataI());
    case  -8: return Detail::shifted< -8>(dataI());
    case  -9: return Detail::shifted< -9>(dataI());
    case -10: return Detail::shifted<-10>(dataI());
    case -11: return Detail::shifted<-11>(dataI());
    case -12: return Detail::shifted<-12>(dataI());
    case -13: return Detail::shifted<-13>(dataI());
    case -14: return Detail::shifted<-14>(dataI());
    case -15: return Detail::shifted<-15>(dataI());
    case -16: return Detail::shifted<-16>(dataI());
    }
    return Zero();
}
// }}}1

}  // namespace Vc_VERSIONED_NAMESPACE

// vim: foldmethod=marker