/*  This file is part of the Vc library. {{{
Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#include "../common/x86_prefetches.h"
#include "limits.h"
#include "../common/bitscanintrinsics.h"
#include "../common/set.h"
#include "../common/gatherimplementation.h"
#include "../common/scatterimplementation.h"
#include "../common/transpose.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// compare operators {{{1
Vc_INTRINSIC SSE::double_m operator==(SSE::double_v a, SSE::double_v b) { return _mm_cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator==(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator==(SSE::   int_v a, SSE::   int_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE::  uint_m operator==(SSE::  uint_v a, SSE::  uint_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: short_m operator==(SSE:: short_v a, SSE:: short_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator==(SSE::ushort_v a, SSE::ushort_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }

Vc_INTRINSIC SSE::double_m operator!=(SSE::double_v a, SSE::double_v b) { return _mm_cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator!=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator!=(SSE::   int_v a, SSE::   int_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE::  uint_m operator!=(SSE::  uint_v a, SSE::  uint_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE:: short_m operator!=(SSE:: short_v a, SSE:: short_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC SSE::ushort_m operator!=(SSE::ushort_v a, SSE::ushort_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }

Vc_INTRINSIC SSE::double_m operator> (SSE::double_v a, SSE::double_v b) { return _mm_cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator> (SSE:: float_v a, SSE:: float_v b) { return _mm_cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator> (SSE::   int_v a, SSE::   int_v b) { return _mm_cmpgt_epi32(a.data(), b.data()); }
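// SSE has no unsigned integer compare instructions. SSE::cmpgt_epu32 / cmplt_epu32
// (and the epu16 variants) are assumed to emulate them, typically by flipping the
// sign bit of both operands before the signed compare. Defining
// USE_INCORRECT_UNSIGNED_COMPARE trades that correction away for the plain signed
// compare, which is faster but wrong once values exceed the signed range.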
Vc_INTRINSIC SSE::  uint_m operator> (SSE::  uint_v a, SSE::  uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmpgt_epu32(a.data(), b.data());
#else
    return _mm_cmpgt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator> (SSE:: short_v a, SSE:: short_v b) { return _mm_cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator> (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmpgt_epu16(a.data(), b.data());
#else
    return _mm_cmpgt_epi16(a.data(), b.data());
#endif
}

Vc_INTRINSIC SSE::double_m operator< (SSE::double_v a, SSE::double_v b) { return _mm_cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator< (SSE:: float_v a, SSE:: float_v b) { return _mm_cmplt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator< (SSE::   int_v a, SSE::   int_v b) { return _mm_cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE::  uint_m operator< (SSE::  uint_v a, SSE::  uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmplt_epu32(a.data(), b.data());
#else
    return _mm_cmplt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator< (SSE:: short_v a, SSE:: short_v b) { return _mm_cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator< (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmplt_epu16(a.data(), b.data());
#else
    return _mm_cmplt_epi16(a.data(), b.data());
#endif
}

Vc_INTRINSIC SSE::double_m operator>=(SSE::double_v a, SSE::double_v b) { return _mm_cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator>=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator>=(SSE::   int_v a, SSE::   int_v b) { return !(a < b); }
Vc_INTRINSIC SSE::  uint_m operator>=(SSE::  uint_v a, SSE::  uint_v b) { return !(a < b); }
Vc_INTRINSIC SSE:: short_m operator>=(SSE:: short_v a, SSE:: short_v b) { return !(a < b); }
Vc_INTRINSIC SSE::ushort_m operator>=(SSE::ushort_v a, SSE::ushort_v b) { return !(a < b); }

Vc_INTRINSIC SSE::double_m operator<=(SSE::double_v a, SSE::double_v b) { return _mm_cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator<=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator<=(SSE::   int_v a, SSE::   int_v b) { return !(a > b); }
Vc_INTRINSIC SSE::  uint_m operator<=(SSE::  uint_v a, SSE::  uint_v b) { return !(a > b); }
Vc_INTRINSIC SSE:: short_m operator<=(SSE:: short_v a, SSE:: short_v b) { return !(a > b); }
Vc_INTRINSIC SSE::ushort_m operator<=(SSE::ushort_v a, SSE::ushort_v b) { return !(a > b); }

// bitwise operators {{{1
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator^(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator&(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator|(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return or_(a.data(), b.data());
}
// arithmetic operators {{{1
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator+(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator-(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator*(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return mul(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, SSE::Vector<T>> operator/(
    SSE::Vector<T> a, SSE::Vector<T> b)
{
    return div(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC
    enable_if<std::is_same<int, T>::value || std::is_same<uint, T>::value, SSE::Vector<T>>
    operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return SSE::Vector<T>::generate([&](int i) { return a[i] / b[i]; });
}
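// There is no SSE instruction for 16-bit integer division. The specialization below
// widens both operands to two float vectors (expand0/expand1 presumably unpack the
// low and high four 16-bit lanes to 32 bits), divides in single precision, truncates
// back to integer, and re-packs the halves. float has enough mantissa bits to
// represent every 16-bit input exactly, so the truncated quotient is exact.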
template <typename T>
Vc_INTRINSIC enable_if<std::is_same<short, T>::value || std::is_same<ushort, T>::value,
                       SSE::Vector<T>>
operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
    using HT = SSE::VectorHelper<T>;
    __m128 lo = _mm_cvtepi32_ps(HT::expand0(a.data()));
    __m128 hi = _mm_cvtepi32_ps(HT::expand1(a.data()));
    lo = _mm_div_ps(lo, _mm_cvtepi32_ps(HT::expand0(b.data())));
    hi = _mm_div_ps(hi, _mm_cvtepi32_ps(HT::expand1(b.data())));
    return HT::concat(_mm_cvttps_epi32(lo), _mm_cvttps_epi32(hi));
}
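// Integral modulo via the identity a % b == a - (a / b) * b, which holds for C++'s
// truncating integer division and thus reuses the operator/ overloads above.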
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, SSE::Vector<T>> operator%(
    SSE::Vector<T> a, SSE::Vector<T> b)
{
    return a - a / b * b;
}
// }}}1
}  // namespace Detail
// constants {{{1
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerZero)
    : d(HV::zero())
{
}

template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerOne)
    : d(HT::one())
{
}

template <typename T>
Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(Detail::load16(Detail::IndexesFromZero<EntryType, Size>(), Aligned))
{
#if defined Vc_GCC && Vc_GCC < 0x40903 && defined Vc_IMPL_AVX2
    // GCC 4.9.2 (at least) miscompiles SSE::short_v::IndexesFromZero() if used implicitly
    // from SimdArray<short, 9> compiling for AVX2: the load turns into vpmovsxwd (a
    // sign-extending load from an 8x 16-bit constant to an 8x 32-bit register).
    if (std::is_same<T, short>::value) {
        asm("" ::"x"(d.v()));
    }
#endif
}

template <>
Vc_INTRINSIC Vector<float, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, float>(SSE::int_v::IndexesFromZero().data()))
{
}

template <>
Vc_INTRINSIC Vector<double, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, double>(SSE::int_v::IndexesFromZero().data()))
{
}

// load member functions {{{1
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Sse>::
#ifndef Vc_MSVC
template
#endif
load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Sse>::load(const SrcT *mem, Flags flags)
{
    Common::handleLoadPrefetches(mem, flags);
    d.v() = Detail::load<VectorType, DstT>(mem, flags);
}

// zeroing {{{1
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero()
{
    data() = HV::zero();
}

template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero(const Mask &k)
{
    data() = Detail::andnot_(k.data(), data());
}

template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZeroInverted(const Mask &k)
{
    data() = Detail::and_(k.data(), data());
}

template<> Vc_INTRINSIC void SSE::double_v::setQnan()
{
    data() = SSE::_mm_setallone_pd();
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Sse>::setQnan(const Mask &k)
{
    data() = _mm_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void SSE::float_v::setQnan()
{
    data() = SSE::_mm_setallone_ps();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Sse>::setQnan(const Mask &k)
{
    data() = _mm_or_ps(data(), k.dataF());
}

///////////////////////////////////////////////////////////////////////////////////////////
// stores {{{1
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data());
}

template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Mask mask, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data(), mask.data());
}

///////////////////////////////////////////////////////////////////////////////////////////
// operator- {{{1
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator-() const
{
    return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
#ifdef Vc_IMPL_XOP
template <> Vc_ALWAYS_INLINE    SSE::int_v    SSE::int_v::operator<<(const    SSE::int_v shift) const { return _mm_sha_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE   SSE::uint_v   SSE::uint_v::operator<<(const   SSE::uint_v shift) const { return _mm_shl_epi32(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE  SSE::short_v  SSE::short_v::operator<<(const  SSE::short_v shift) const { return _mm_sha_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator<<(const SSE::ushort_v shift) const { return _mm_shl_epi16(d.v(), shift.d.v()); }
template <> Vc_ALWAYS_INLINE    SSE::int_v    SSE::int_v::operator>>(const    SSE::int_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE   SSE::uint_v   SSE::uint_v::operator>>(const   SSE::uint_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE  SSE::short_v  SSE::short_v::operator>>(const  SSE::short_v shift) const { return operator<<(-shift); }
template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator>>(const SSE::ushort_v shift) const { return operator<<(-shift); }
#elif defined Vc_IMPL_AVX2
template <> Vc_ALWAYS_INLINE SSE::Vector<   int> Vector<   int, VectorAbi::Sse>::operator<<(const SSE::Vector<   int> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector<  uint> Vector<  uint, VectorAbi::Sse>::operator<<(const SSE::Vector<  uint> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector<   int> Vector<   int, VectorAbi::Sse>::operator>>(const SSE::Vector<   int> x) const { return _mm_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE SSE::Vector<  uint> Vector<  uint, VectorAbi::Sse>::operator>>(const SSE::Vector<  uint> x) const { return _mm_srlv_epi32(d.v(), x.d.v()); }
#endif

template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator>>=(int shift) {
    d.v() = HT::shiftRight(d.v(), shift);
    return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator>>(int shift) const {
    return HT::shiftRight(d.v(), shift);
}
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator<<=(int shift) {
    d.v() = HT::shiftLeft(d.v(), shift);
    return *this;
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator<<(int shift) const {
    return HT::shiftLeft(d.v(), shift);
}

///////////////////////////////////////////////////////////////////////////////////////////
// isnegative {{{1
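// The sign bit is isolated with a mask, then _mm_srai_epi32 (arithmetic right shift
// by 31) smears it across the whole 32-bit lane, yielding an all-ones or all-zeros
// mask per lane. For double the shift only fills the upper 32 bits of each 64-bit
// lane, so the permute below duplicates those into the lower halves. Unlike a
// comparison against zero, this also reports -0. as negative.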
Vc_INTRINSIC Vc_CONST SSE::float_m isnegative(SSE::float_v x)
{
    return sse_cast<__m128>(_mm_srai_epi32(
        sse_cast<__m128i>(_mm_and_ps(SSE::_mm_setsignmask_ps(), x.data())), 31));
}
Vc_INTRINSIC Vc_CONST SSE::double_m isnegative(SSE::double_v x)
{
    return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(_mm_srai_epi32(
        sse_cast<__m128i>(_mm_and_pd(SSE::_mm_setsignmask_pd(), x.data())), 31)));
}

// gathers {{{1
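// Vc_GATHER_IMPL stamps out the unmasked gather specialization for each vector type;
// Vc_M(i_) expands to the i-th gathered element, i.e. args.address[Scale *
// args.indexes[i_]] converted to the vector's value_type. SSE has no gather
// instruction, so the elements are loaded one by one and packed with set/setr.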
#define Vc_GATHER_IMPL(V_)                                                               \
    template <>                                                                          \
    template <class MT, class IT, int Scale>                                             \
    inline void SSE::V_::gatherImplementation(                                           \
        const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm_setr_pd(Vc_M(0), Vc_M(1)); }
Vc_GATHER_IMPL(float_v)  { d.v() = _mm_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(int_v)    { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(uint_v)   { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(short_v)
{
    d.v() =
        Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(ushort_v)
{
    d.v() =
        Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
#undef Vc_M
#undef Vc_GATHER_IMPL

template <typename T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Sse>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
    const auto *mem = args.address;
    const auto indexes = Scale * args.indexes;
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                            Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
              Common::GatherScatterImplementation::SimpleLoop
#endif
                                                > ;
    Common::executeGather(Selector(), *this, mem, indexes, mask);
}

// scatters {{{1
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes) const
{
    Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}

template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                            Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
              Common::GatherScatterImplementation::SimpleLoop
#endif
                                                > ;
    Common::executeScatter(Selector(), *this, mem, indexes, mask);
}

///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::partialSum() const
{
    //   a    b    c    d    e    f    g    h
    // +      a    b    c    d    e    f    g    -> a ab bc  cd   de    ef     fg      gh
    // +           a    ab   bc   cd   de   ef   -> a ab abc abcd bcde  cdef   defg    efgh
    // +                     a    ab   abc  abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    Vector<T, VectorAbi::Sse> tmp = *this;
    if (Size >  1) tmp += tmp.shifted(-1);
    if (Size >  2) tmp += tmp.shifted(-2);
    if (Size >  4) tmp += tmp.shifted(-4);
    if (Size >  8) tmp += tmp.shifted(-8);
    if (Size > 16) tmp += tmp.shifted(-16);
    return tmp;
}
#ifndef Vc_IMPL_SSE4_1
// Without SSE4.1, integer multiplication is slow, so we multiply the scalars instead.
template<> Vc_INTRINSIC Vc_PURE int SSE::int_v::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
template<> Vc_INTRINSIC Vc_PURE unsigned int SSE::uint_v::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
#endif
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::min(MaskArg m) const
{
    Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::max();
    tmp(m) = *this;
    return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::max(MaskArg m) const
{
    Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::min();
    tmp(m) = *this;
    return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::product(MaskArg m) const
{
    Vector<T, VectorAbi::Sse> tmp(Vc::One);
    tmp(m) = *this;
    return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::sum(MaskArg m) const
{
    Vector<T, VectorAbi::Sse> tmp(Vc::Zero);
    tmp(m) = *this;
    return tmp.sum();
}

///////////////////////////////////////////////////////////////////////////////////////////
// exponent {{{1
namespace Detail
{
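// Extract the IEEE 754 exponent field: shift the biased exponent down past the
// mantissa bits (23 for float, 52 for double), subtract the bias (0x7f / 0x3ff), and
// convert back to floating point. For double, the shuffle with pattern 0x08 compacts
// the two 64-bit results into the low two 32-bit lanes before the conversion. Valid
// only for non-negative, non-denormal inputs, as asserted by the callers below.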
Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v)
{
    __m128i tmp = _mm_srli_epi32(_mm_castps_si128(v), 23);
    tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f));
    return _mm_cvtepi32_ps(tmp);
}
Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v)
{
    __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(v), 52);
    tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff));
    return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08));
}
} // namespace Detail

Vc_INTRINSIC Vc_CONST SSE::float_v exponent(SSE::float_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST SSE::double_v exponent(SSE::double_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// }}}1
// Random {{{1
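// _doRandomStep advances the global RNG state: two SSE-wide linear congruential
// generators (multiplier 0xdeece66d, increment 11, i.e. the drand48 constants
// truncated to 32 bits) stored in Common::RandomState. The high bits of the second
// stream are xored into the first, presumably to compensate for the weak low bits
// that plain LCGs produce.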
static void _doRandomStep(SSE::uint_v &state0,
        SSE::uint_v &state1)
{
    using SSE::uint_v;
    using Detail::operator+;
    using Detail::operator*;
    state0.load(&Common::RandomState[0]);
    state1.load(&Common::RandomState[uint_v::Size]);
    (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
    uint_v(_mm_xor_si128((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                         _mm_srli_epi32(state1.data(), 16)))
        .store(&Common::RandomState[0]);
}

template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::Random()
{
    SSE::uint_v state0, state1;
    _doRandomStep(state0, state1);
    return state0.data();
}

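// The floating-point Random() specializations shift random state bits into the
// mantissa and or in the bit pattern of 1.0, yielding a uniform value in [1, 2);
// subtracting 1.0 then maps it onto [0, 1). double_v instead advances two scalar
// 64-bit LCGs (drand48's multiplier 0x5deece66d) kept in RandomState[8..11] to get
// enough random bits for the 52-bit mantissa of each lane.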
template<> Vc_ALWAYS_INLINE SSE::float_v SSE::float_v::Random()
{
    SSE::uint_v state0, state1;
    _doRandomStep(state0, state1);
    return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
}

template<> Vc_ALWAYS_INLINE SSE::double_v SSE::double_v::Random()
{
    typedef unsigned long long uint64 Vc_MAY_ALIAS;
    uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
    uint64 state1 = *reinterpret_cast<const uint64 *>(&Common::RandomState[10]);
    const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Common::RandomState[8]));
    *reinterpret_cast<uint64 *>(&Common::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
    *reinterpret_cast<uint64 *>(&Common::RandomState[10]) = (state1 * 0x5deece66dull + 11);
    return _mm_sub_pd(_mm_or_pd(_mm_castsi128_pd(_mm_srli_epi64(state, 12)), HT::one()), HT::one());
}
// shifted / rotated {{{1
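// shifted(amount) moves the elements by whole lanes, shifting in zeros; positive
// amounts move elements towards lower indices. The byte shifts _mm_srli_si128 /
// _mm_slli_si128 (and alignr below) require immediate operands, hence the switch
// over every possible amount.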
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    switch (amount) {
    case  0: return *this;
    case  1: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case  2: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case  3: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case  4: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case  5: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case  6: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case  7: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case  8: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    case -1: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
    case -2: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
    case -3: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
    case -4: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
    case -5: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
    case -6: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
    case -7: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
    case -8: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
    }
    return Zero();
}
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount, Vector shiftIn) const
{
    if (amount >= -int(size())) {
        constexpr int VectorWidth = int(size());
        constexpr int EntryTypeSizeof = sizeof(EntryType);
        const __m128i v0 = sse_cast<__m128i>(d.v());
        const __m128i v1 = sse_cast<__m128i>(shiftIn.d.v());
        auto &&fixup = sse_cast<VectorType, __m128i>;
        switch (amount) {
        case  0: return *this;
                 // alignr_epi8: [arg1 arg0] << n
        case -1: return fixup(SSE::alignr_epi8<(VectorWidth - 1) * EntryTypeSizeof>(v0, v1));
        case -2: return fixup(SSE::alignr_epi8<(VectorWidth - 2) * EntryTypeSizeof>(v0, v1));
        case -3: return fixup(SSE::alignr_epi8<(VectorWidth - 3) * EntryTypeSizeof>(v0, v1));
        case -4: return fixup(SSE::alignr_epi8<(VectorWidth - 4) * EntryTypeSizeof>(v0, v1));
        case -5: return fixup(SSE::alignr_epi8<(VectorWidth - 5) * EntryTypeSizeof>(v0, v1));
        case -6: return fixup(SSE::alignr_epi8<(VectorWidth - 6) * EntryTypeSizeof>(v0, v1));
        case -7: return fixup(SSE::alignr_epi8<(VectorWidth - 7) * EntryTypeSizeof>(v0, v1));
        case -8: return fixup(SSE::alignr_epi8<(VectorWidth - 8) * EntryTypeSizeof>(v0, v1));
        case -9: return fixup(SSE::alignr_epi8<(VectorWidth - 9) * EntryTypeSizeof>(v0, v1));
        case-10: return fixup(SSE::alignr_epi8<(VectorWidth -10) * EntryTypeSizeof>(v0, v1));
        case-11: return fixup(SSE::alignr_epi8<(VectorWidth -11) * EntryTypeSizeof>(v0, v1));
        case-12: return fixup(SSE::alignr_epi8<(VectorWidth -12) * EntryTypeSizeof>(v0, v1));
        case-13: return fixup(SSE::alignr_epi8<(VectorWidth -13) * EntryTypeSizeof>(v0, v1));
        case-14: return fixup(SSE::alignr_epi8<(VectorWidth -14) * EntryTypeSizeof>(v0, v1));
        case-15: return fixup(SSE::alignr_epi8<(VectorWidth -15) * EntryTypeSizeof>(v0, v1));
        case  1: return fixup(SSE::alignr_epi8< 1 * EntryTypeSizeof>(v1, v0));
        case  2: return fixup(SSE::alignr_epi8< 2 * EntryTypeSizeof>(v1, v0));
        case  3: return fixup(SSE::alignr_epi8< 3 * EntryTypeSizeof>(v1, v0));
        case  4: return fixup(SSE::alignr_epi8< 4 * EntryTypeSizeof>(v1, v0));
        case  5: return fixup(SSE::alignr_epi8< 5 * EntryTypeSizeof>(v1, v0));
        case  6: return fixup(SSE::alignr_epi8< 6 * EntryTypeSizeof>(v1, v0));
        case  7: return fixup(SSE::alignr_epi8< 7 * EntryTypeSizeof>(v1, v0));
        case  8: return fixup(SSE::alignr_epi8< 8 * EntryTypeSizeof>(v1, v0));
        case  9: return fixup(SSE::alignr_epi8< 9 * EntryTypeSizeof>(v1, v0));
        case 10: return fixup(SSE::alignr_epi8<10 * EntryTypeSizeof>(v1, v0));
        case 11: return fixup(SSE::alignr_epi8<11 * EntryTypeSizeof>(v1, v0));
        case 12: return fixup(SSE::alignr_epi8<12 * EntryTypeSizeof>(v1, v0));
        case 13: return fixup(SSE::alignr_epi8<13 * EntryTypeSizeof>(v1, v0));
        case 14: return fixup(SSE::alignr_epi8<14 * EntryTypeSizeof>(v1, v0));
        case 15: return fixup(SSE::alignr_epi8<15 * EntryTypeSizeof>(v1, v0));
        }
    }
    return shiftIn.shifted(int(size()) + amount);
}
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::rotated(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    const __m128i v = SSE::sse_cast<__m128i>(d.v());
    switch (static_cast<unsigned int>(amount) % Size) {
    case  0: return *this;
    case  1: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<1 * EntryTypeSizeof>(v, v));
    case  2: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<2 * EntryTypeSizeof>(v, v));
    case  3: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<3 * EntryTypeSizeof>(v, v));
             // warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake.
             // ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType))
             // disables the following four calls unless sizeof(EntryType) == 2.
    case  4: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<4 * EntryTypeSizeof>(v, v));
    case  5: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<5 * EntryTypeSizeof>(v, v));
    case  6: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<6 * EntryTypeSizeof>(v, v));
    case  7: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<7 * EntryTypeSizeof>(v, v));
    }
    return Zero();
}
// sorted {{{1
namespace Detail
{
inline Vc_CONST SSE::double_v sorted(SSE::double_v x_)
{
    const __m128d x = x_.data();
    const __m128d y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1));
    return _mm_unpacklo_pd(_mm_min_sd(x, y), _mm_max_sd(x, y));
}
}  // namespace Detail
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::sorted()
    const
{
    return Detail::sorted(*this);
}
// interleaveLow/-High {{{1
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveLow (SSE::double_v x) const { return _mm_unpacklo_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveHigh(SSE::double_v x) const { return _mm_unpackhi_pd(data(), x.data()); }
template <> Vc_INTRINSIC  SSE::float_v  SSE::float_v::interleaveLow ( SSE::float_v x) const { return _mm_unpacklo_ps(data(), x.data()); }
template <> Vc_INTRINSIC  SSE::float_v  SSE::float_v::interleaveHigh( SSE::float_v x) const { return _mm_unpackhi_ps(data(), x.data()); }
template <> Vc_INTRINSIC    SSE::int_v    SSE::int_v::interleaveLow (   SSE::int_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC    SSE::int_v    SSE::int_v::interleaveHigh(   SSE::int_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC   SSE::uint_v   SSE::uint_v::interleaveLow (  SSE::uint_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC   SSE::uint_v   SSE::uint_v::interleaveHigh(  SSE::uint_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC  SSE::short_v  SSE::short_v::interleaveLow ( SSE::short_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC  SSE::short_v  SSE::short_v::interleaveHigh( SSE::short_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveLow (SSE::ushort_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveHigh(SSE::ushort_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
// }}}1
// generate {{{1
template <> template <typename G> Vc_INTRINSIC SSE::double_v SSE::double_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    return _mm_setr_pd(tmp0, tmp1);
}
template <> template <typename G> Vc_INTRINSIC SSE::float_v SSE::float_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_ps(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::int_v SSE::int_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::uint_v SSE::uint_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC SSE::short_v SSE::short_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
// }}}1
// reversed {{{1
template <> Vc_INTRINSIC Vc_PURE SSE::double_v SSE::double_v::reversed() const
{
    return Mem::permute<X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::float_v SSE::float_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::int_v SSE::int_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::uint_v SSE::uint_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::short_v SSE::short_v::reversed() const
{
    return sse_cast<__m128i>(
        Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
                             sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
template <> Vc_INTRINSIC Vc_PURE SSE::ushort_v SSE::ushort_v::reversed() const
{
    return sse_cast<__m128i>(
        Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
                             sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
// }}}1
// permutation via operator[] {{{1
template <>
Vc_INTRINSIC SSE::float_v SSE::float_v::operator[](const SSE::int_v &
#ifdef Vc_IMPL_AVX
                                             perm
#endif
                                         ) const
{
    /*
    const int_m cross128 = concat(_mm_cmpgt_epi32(lo128(perm.data()), _mm_set1_epi32(3)),
                                  _mm_cmplt_epi32(hi128(perm.data()), _mm_set1_epi32(4)));
    if (cross128.isNotEmpty()) {
    SSE::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
        x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
        return x;
    } else {
    */
#ifdef Vc_IMPL_AVX
    return _mm_permutevar_ps(d.v(), perm.data());
#else
    return *this;  // TODO
#endif
}
// broadcast from constexpr index {{{1
template <> template <int Index> Vc_INTRINSIC SSE::float_v SSE::float_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
    return Mem::permute<Inner, Inner, Inner, Inner>(d.v());
}
template <> template <int Index> Vc_INTRINSIC SSE::double_v SSE::double_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
    return Mem::permute<Inner, Inner>(d.v());
}
// }}}1

namespace Common
{
// transpose_impl {{{1
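// 4x4 float transpose in two unpack stages: the first interleaves rows 0/2 and 1/3,
// the second interleaves those intermediates, so each output register ends up
// holding one column of the input.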
Vc_ALWAYS_INLINE void transpose_impl(
    TransposeTag<4, 4>, SSE::float_v *Vc_RESTRICT r[],
    const TransposeProxy<SSE::float_v, SSE::float_v, SSE::float_v, SSE::float_v> &proxy)
{
    const auto in0 = std::get<0>(proxy.in).data();
    const auto in1 = std::get<1>(proxy.in).data();
    const auto in2 = std::get<2>(proxy.in).data();
    const auto in3 = std::get<3>(proxy.in).data();
    const auto tmp0 = _mm_unpacklo_ps(in0, in2);
    const auto tmp1 = _mm_unpacklo_ps(in1, in3);
    const auto tmp2 = _mm_unpackhi_ps(in0, in2);
    const auto tmp3 = _mm_unpackhi_ps(in1, in3);
    *r[0] = _mm_unpacklo_ps(tmp0, tmp1);
    *r[1] = _mm_unpackhi_ps(tmp0, tmp1);
    *r[2] = _mm_unpacklo_ps(tmp2, tmp3);
    *r[3] = _mm_unpackhi_ps(tmp2, tmp3);
}
// }}}1
}  // namespace Common
}  // namespace Vc_VERSIONED_NAMESPACE

// vim: foldmethod=marker