1 /* This file is part of the Vc library. {{{
2 Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6 * Redistributions of source code must retain the above copyright
7 notice, this list of conditions and the following disclaimer.
8 * Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 * Neither the names of contributing organizations nor the
12 names of its contributors may be used to endorse or promote products
13 derived from this software without specific prior written permission.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
19 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26 }}}*/
27
28 #include "../common/x86_prefetches.h"
29 #include "limits.h"
30 #include "../common/bitscanintrinsics.h"
31 #include "../common/set.h"
32 #include "../common/gatherimplementation.h"
33 #include "../common/scatterimplementation.h"
34 #include "../common/transpose.h"
35 #include "macros.h"
36
37 namespace Vc_VERSIONED_NAMESPACE
38 {
39 namespace Detail
40 {
41 // compare operators {{{1
// Element-wise equality / inequality for all SSE vector types, returning the
// matching mask type. Floating-point types have dedicated cmpeq/cmpneq
// instructions; SSE integer compares only provide cmpeq, so operator!= for the
// integral types is the bitwise complement (not_) of the equality mask.
Vc_INTRINSIC SSE::double_m operator==(SSE::double_v a, SSE::double_v b) { return _mm_cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator==(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator==(SSE::   int_v a, SSE::   int_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE::  uint_m operator==(SSE::  uint_v a, SSE::  uint_v b) { return _mm_cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE:: short_m operator==(SSE:: short_v a, SSE:: short_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator==(SSE::ushort_v a, SSE::ushort_v b) { return _mm_cmpeq_epi16(a.data(), b.data()); }

Vc_INTRINSIC SSE::double_m operator!=(SSE::double_v a, SSE::double_v b) { return _mm_cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator!=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator!=(SSE::   int_v a, SSE::   int_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE::  uint_m operator!=(SSE::  uint_v a, SSE::  uint_v b) { return not_(_mm_cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC SSE:: short_m operator!=(SSE:: short_v a, SSE:: short_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC SSE::ushort_m operator!=(SSE::ushort_v a, SSE::ushort_v b) { return not_(_mm_cmpeq_epi16(a.data(), b.data())); }
55
// Element-wise greater-than. SSE only provides *signed* integer compares, so
// the unsigned types route through SSE::cmpgt_epu32/cmpgt_epu16 helpers.
// Defining USE_INCORRECT_UNSIGNED_COMPARE trades correctness for speed by
// using the signed compare instruction directly (wrong results when the
// operands' sign bits differ).
Vc_INTRINSIC SSE::double_m operator> (SSE::double_v a, SSE::double_v b) { return _mm_cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator> (SSE:: float_v a, SSE:: float_v b) { return _mm_cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator> (SSE::   int_v a, SSE::   int_v b) { return _mm_cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE::  uint_m operator> (SSE::  uint_v a, SSE::  uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmpgt_epu32(a.data(), b.data());
#else
    return _mm_cmpgt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator> (SSE:: short_v a, SSE:: short_v b) { return _mm_cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator> (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmpgt_epu16(a.data(), b.data());
#else
    return _mm_cmpgt_epi16(a.data(), b.data());
#endif
}
74
// Element-wise less-than; mirror image of operator> above, with the same
// unsigned-compare caveat (SSE has no unsigned integer compare instruction).
Vc_INTRINSIC SSE::double_m operator< (SSE::double_v a, SSE::double_v b) { return _mm_cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator< (SSE:: float_v a, SSE:: float_v b) { return _mm_cmplt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator< (SSE::   int_v a, SSE::   int_v b) { return _mm_cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC SSE::  uint_m operator< (SSE::  uint_v a, SSE::  uint_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmplt_epu32(a.data(), b.data());
#else
    return _mm_cmplt_epi32(a.data(), b.data());
#endif
}
Vc_INTRINSIC SSE:: short_m operator< (SSE:: short_v a, SSE:: short_v b) { return _mm_cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC SSE::ushort_m operator< (SSE::ushort_v a, SSE::ushort_v b) {
#ifndef USE_INCORRECT_UNSIGNED_COMPARE
    return SSE::cmplt_epu16(a.data(), b.data());
#else
    return _mm_cmplt_epi16(a.data(), b.data());
#endif
}
93
// Element-wise >= and <=. Floating-point uses cmpnlt/cmple directly (note:
// cmpnlt is "not less than", which is also true for unordered/NaN lanes).
// The integer types are derived by negating the strict compare, which keeps
// the unsigned-compare handling in one place (operator< / operator> above).
Vc_INTRINSIC SSE::double_m operator>=(SSE::double_v a, SSE::double_v b) { return _mm_cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator>=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator>=(SSE::   int_v a, SSE::   int_v b) { return !(a < b); }
Vc_INTRINSIC SSE::  uint_m operator>=(SSE::  uint_v a, SSE::  uint_v b) { return !(a < b); }
Vc_INTRINSIC SSE:: short_m operator>=(SSE:: short_v a, SSE:: short_v b) { return !(a < b); }
Vc_INTRINSIC SSE::ushort_m operator>=(SSE::ushort_v a, SSE::ushort_v b) { return !(a < b); }

Vc_INTRINSIC SSE::double_m operator<=(SSE::double_v a, SSE::double_v b) { return _mm_cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC SSE:: float_m operator<=(SSE:: float_v a, SSE:: float_v b) { return _mm_cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC SSE::   int_m operator<=(SSE::   int_v a, SSE::   int_v b) { return !(a > b); }
Vc_INTRINSIC SSE::  uint_m operator<=(SSE::  uint_v a, SSE::  uint_v b) { return !(a > b); }
Vc_INTRINSIC SSE:: short_m operator<=(SSE:: short_v a, SSE:: short_v b) { return !(a > b); }
Vc_INTRINSIC SSE::ushort_m operator<=(SSE::ushort_v a, SSE::ushort_v b) { return !(a > b); }
107
108 // bitwise operators {{{1
// Bitwise XOR / AND / OR for any SSE vector type. These forward to the
// type-agnostic xor_/and_/or_ helpers, which operate on the raw 128-bit
// register regardless of the element type.
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator^(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator&(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator|(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return or_(a.data(), b.data());
}
124 // arithmetic operators {{{1
// Element-wise addition, subtraction and multiplication. The trailing T()
// argument is a tag that lets the add/sub/mul helpers dispatch on the element
// type (to pick the correct epi16/epi32/ps/pd instruction).
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator+(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator-(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC SSE::Vector<T> operator*(SSE::Vector<T> a, SSE::Vector<T> b)
{
    return mul(a.data(), b.data(), T());
}
// Element-wise division for the floating-point vector types, forwarding to
// the hardware divps/divpd via the div helper (T() is a dispatch tag).
template <typename T>
Vc_INTRINSIC enable_if<std::is_floating_point<T>::value, SSE::Vector<T>> operator/(
    SSE::Vector<T> a, SSE::Vector<T> b)
{
    return div(a.data(), b.data(), T());
}
146 template <typename T>
147 Vc_INTRINSIC
148 enable_if<std::is_same<int, T>::value || std::is_same<uint, T>::value, SSE::Vector<T>>
operator /(SSE::Vector<T> a,SSE::Vector<T> b)149 operator/(SSE::Vector<T> a, SSE::Vector<T> b)
150 {
151 return SSE::Vector<T>::generate([&](int i) { return a[i] / b[i]; });
152 }
// Element-wise division for short_v / ushort_v. Each 16-bit half of the
// vector is widened to 32-bit (expand0/expand1), converted to float, divided
// with divps, and truncated back. Every 16-bit value is exactly representable
// in single precision, so the truncated float quotient equals the integer
// quotient.
template <typename T>
Vc_INTRINSIC enable_if<std::is_same<short, T>::value || std::is_same<ushort, T>::value,
                       SSE::Vector<T>>
operator/(SSE::Vector<T> a, SSE::Vector<T> b)
{
    using HT = SSE::VectorHelper<T>;
    __m128 lo = _mm_cvtepi32_ps(HT::expand0(a.data()));
    __m128 hi = _mm_cvtepi32_ps(HT::expand1(a.data()));
    lo = _mm_div_ps(lo, _mm_cvtepi32_ps(HT::expand0(b.data())));
    hi = _mm_div_ps(hi, _mm_cvtepi32_ps(HT::expand1(b.data())));
    // Truncate (round toward zero) and pack the two halves back into one vector.
    return HT::concat(_mm_cvttps_epi32(lo), _mm_cvttps_epi32(hi));
}
165 template <typename T>
operator %(SSE::Vector<T> a,SSE::Vector<T> b)166 Vc_INTRINSIC enable_if<std::is_integral<T>::value, SSE::Vector<T>> operator%(
167 SSE::Vector<T> a, SSE::Vector<T> b)
168 {
169 return a - a / b * b;
170 }
171 // }}}1
172 } // namespace Detail
173 // constants {{{1
// Special-initializer constructors: all-zero and all-one vectors, delegating
// to the VectorHelper (HV/HT) constant generators.
template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerZero)
    : d(HV::zero())
{
}

template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerOne)
    : d(HT::one())
{
}
183
184 template <typename T>
Vector(VectorSpecialInitializerIndexesFromZero)185 Vc_INTRINSIC Vector<T, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
186 : d(Detail::load16(Detail::IndexesFromZero<EntryType, Size>(), Aligned))
187 {
188 #if defined Vc_GCC && Vc_GCC < 0x40903 && defined Vc_IMPL_AVX2
189 // GCC 4.9.2 (at least) miscompiles SSE::short_v::IndexesFromZero() if used implicitly
190 // from SimdArray<short, 9> compiling for AVX2 to vpmovsxwd (sign extending load from
191 // a 8x 16-bit constant to 8x 32-bit register)
192 if (std::is_same<T, short>::value) {
193 asm("" ::"x"(d.v()));
194 }
195 #endif
196 }
197
// float/double specializations: build the integer index vector first, then
// convert to the floating-point element type.
template <>
Vc_INTRINSIC Vector<float, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, float>(SSE::int_v::IndexesFromZero().data()))
{
}

template <>
Vc_INTRINSIC Vector<double, VectorAbi::Sse>::Vector(VectorSpecialInitializerIndexesFromZero)
    : d(SSE::convert<int, double>(SSE::int_v::IndexesFromZero().data()))
{
}
209
210 // load member functions {{{1
// Load member function: handles any prefetch hints encoded in Flags, then
// loads (and, if SrcT != DstT, converts) from mem into the vector register.
// The `template` disambiguator before load_concept is required by the
// standard but rejected by MSVC, hence the #ifndef.
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Sse>::
#ifndef Vc_MSVC
    template
#endif
    load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Sse>::load(const SrcT *mem, Flags flags)
{
    Common::handleLoadPrefetches(mem, flags);
    d.v() = Detail::load<VectorType, DstT>(mem, flags);
}
222
223 // zeroing {{{1
setZero()224 template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero()
225 {
226 data() = HV::zero();
227 }
228
setZero(const Mask & k)229 template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZero(const Mask &k)
230 {
231 data() = Detail::andnot_(k.data(), data());
232 }
233
setZeroInverted(const Mask & k)234 template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::setZeroInverted(const Mask &k)
235 {
236 data() = Detail::and_(k.data(), data());
237 }
238
setQnan()239 template<> Vc_INTRINSIC void SSE::double_v::setQnan()
240 {
241 data() = SSE::_mm_setallone_pd();
242 }
setQnan(const Mask & k)243 template<> Vc_INTRINSIC void Vector<double, VectorAbi::Sse>::setQnan(const Mask &k)
244 {
245 data() = _mm_or_pd(data(), k.dataD());
246 }
setQnan()247 template<> Vc_INTRINSIC void SSE::float_v::setQnan()
248 {
249 data() = SSE::_mm_setallone_ps();
250 }
setQnan(const Mask & k)251 template<> Vc_INTRINSIC void Vector<float, VectorAbi::Sse>::setQnan(const Mask &k)
252 {
253 data() = _mm_or_ps(data(), k.dataF());
254 }
255
256 ///////////////////////////////////////////////////////////////////////////////////////////
257 // stores {{{1
// Unmasked store: handle prefetch hints from Flags, then store the full
// register to mem (aligned/streaming behavior selected via Flags).
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data());
}

// Masked store: only lanes selected by `mask` are written to memory.
template <typename T>
template <typename U, typename Flags, typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Sse>::store(U *mem, Mask mask, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data(), mask.data());
}
273
274 ///////////////////////////////////////////////////////////////////////////////////////////
275 // operator- {{{1
operator -() const276 template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator-() const
277 {
278 return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
279 }
280 ///////////////////////////////////////////////////////////////////////////////////////////
281 // integer ops {{{1
282 #ifdef Vc_IMPL_XOP
operator <<(const SSE::int_v shift) const283 template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator<<(const SSE::int_v shift) const { return _mm_sha_epi32(d.v(), shift.d.v()); }
operator <<(const SSE::uint_v shift) const284 template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator<<(const SSE::uint_v shift) const { return _mm_shl_epi32(d.v(), shift.d.v()); }
operator <<(const SSE::short_v shift) const285 template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator<<(const SSE::short_v shift) const { return _mm_sha_epi16(d.v(), shift.d.v()); }
operator <<(const SSE::ushort_v shift) const286 template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator<<(const SSE::ushort_v shift) const { return _mm_shl_epi16(d.v(), shift.d.v()); }
operator >>(const SSE::int_v shift) const287 template <> Vc_ALWAYS_INLINE SSE::int_v SSE::int_v::operator>>(const SSE::int_v shift) const { return operator<<(-shift); }
operator >>(const SSE::uint_v shift) const288 template <> Vc_ALWAYS_INLINE SSE::uint_v SSE::uint_v::operator>>(const SSE::uint_v shift) const { return operator<<(-shift); }
operator >>(const SSE::short_v shift) const289 template <> Vc_ALWAYS_INLINE SSE::short_v SSE::short_v::operator>>(const SSE::short_v shift) const { return operator<<(-shift); }
operator >>(const SSE::ushort_v shift) const290 template <> Vc_ALWAYS_INLINE SSE::ushort_v SSE::ushort_v::operator>>(const SSE::ushort_v shift) const { return operator<<(-shift); }
291 #elif defined Vc_IMPL_AVX2
operator <<(const SSE::Vector<int> x) const292 template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator<<(const SSE::Vector< int> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
operator <<(const SSE::Vector<uint> x) const293 template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator<<(const SSE::Vector< uint> x) const { return _mm_sllv_epi32(d.v(), x.d.v()); }
operator >>(const SSE::Vector<int> x) const294 template <> Vc_ALWAYS_INLINE SSE::Vector< int> Vector< int, VectorAbi::Sse>::operator>>(const SSE::Vector< int> x) const { return _mm_srav_epi32(d.v(), x.d.v()); }
operator >>(const SSE::Vector<uint> x) const295 template <> Vc_ALWAYS_INLINE SSE::Vector< uint> Vector< uint, VectorAbi::Sse>::operator>>(const SSE::Vector< uint> x) const { return _mm_srlv_epi32(d.v(), x.d.v()); }
296 #endif
297
operator >>=(int shift)298 template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator>>=(int shift) {
299 d.v() = HT::shiftRight(d.v(), shift);
300 return *this;
301 }
operator >>(int shift) const302 template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator>>(int shift) const {
303 return HT::shiftRight(d.v(), shift);
304 }
operator <<=(int shift)305 template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> &Vector<T, VectorAbi::Sse>::operator<<=(int shift) {
306 d.v() = HT::shiftLeft(d.v(), shift);
307 return *this;
308 }
operator <<(int shift) const309 template<typename T> Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::operator<<(int shift) const {
310 return HT::shiftLeft(d.v(), shift);
311 }
312
313 ///////////////////////////////////////////////////////////////////////////////////////////
314 // isnegative {{{1
isnegative(SSE::float_v x)315 Vc_INTRINSIC Vc_CONST SSE::float_m isnegative(SSE::float_v x)
316 {
317 return sse_cast<__m128>(_mm_srai_epi32(
318 sse_cast<__m128i>(_mm_and_ps(SSE::_mm_setsignmask_ps(), x.data())), 31));
319 }
isnegative(SSE::double_v x)320 Vc_INTRINSIC Vc_CONST SSE::double_m isnegative(SSE::double_v x)
321 {
322 return Mem::permute<X1, X1, X3, X3>(sse_cast<__m128>(_mm_srai_epi32(
323 sse_cast<__m128i>(_mm_and_pd(SSE::_mm_setsignmask_pd(), x.data())), 31)));
324 }
325
326 // gathers {{{1
// Unmasked gathers: read args.address[Scale * args.indexes[i]] for each lane
// and assemble the vector with a set/setr intrinsic. Vc_GATHER_IMPL expands
// to the specialization header, Vc_M(i) to one scaled indexed load.
#define Vc_GATHER_IMPL(V_)                                                               \
    template <>                                                                          \
    template <class MT, class IT, int Scale>                                             \
    inline void SSE::V_::gatherImplementation(                                           \
        const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm_setr_pd(Vc_M(0), Vc_M(1)); }
Vc_GATHER_IMPL(float_v) { d.v() = _mm_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(int_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(uint_v) { d.v() = _mm_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
Vc_GATHER_IMPL(short_v)
{
    d.v() =
        Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
Vc_GATHER_IMPL(ushort_v)
{
    d.v() =
        Vc::set(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6), Vc_M(7));
}
#undef Vc_M
#undef Vc_GATHER_IMPL
349
350 template <typename T>
351 template <class MT, class IT, int Scale>
gatherImplementation(const Common::GatherArguments<MT,IT,Scale> & args,MaskArgument mask)352 inline void Vector<T, VectorAbi::Sse>::gatherImplementation(
353 const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
354 {
355 const auto *mem = args.address;
356 const auto indexes = Scale * args.indexes;
357 using Selector = std::integral_constant < Common::GatherScatterImplementation,
358 #ifdef Vc_USE_SET_GATHERS
359 Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
360 #endif
361 #ifdef Vc_USE_BSF_GATHERS
362 Common::GatherScatterImplementation::BitScanLoop
363 #elif defined Vc_USE_POPCNT_BSF_GATHERS
364 Common::GatherScatterImplementation::PopcntSwitch
365 #else
366 Common::GatherScatterImplementation::SimpleLoop
367 #endif
368 > ;
369 Common::executeGather(Selector(), *this, mem, indexes, mask);
370 }
371
372 // scatters {{{1
// Unmasked scatter: write each lane to mem[indexes[i]] with a compile-time
// unrolled loop (no masking needed, every lane is stored).
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes) const
{
    Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}

// Masked scatter: only lanes selected by `mask` are written; the strategy is
// selected at compile time exactly like in the masked gather above.
template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Sse>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                            Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
          Common::GatherScatterImplementation::PopcntSwitch
#else
          Common::GatherScatterImplementation::SimpleLoop
#endif
          > ;
    Common::executeScatter(Selector(), *this, mem, indexes, mask);
}
398
399 ///////////////////////////////////////////////////////////////////////////////////////////
400 // horizontal ops {{{1
partialSum() const401 template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::partialSum() const
402 {
403 // a b c d e f g h
404 // + a b c d e f g -> a ab bc cd de ef fg gh
405 // + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh
406 // + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
407 Vector<T, VectorAbi::Sse> tmp = *this;
408 if (Size > 1) tmp += tmp.shifted(-1);
409 if (Size > 2) tmp += tmp.shifted(-2);
410 if (Size > 4) tmp += tmp.shifted(-4);
411 if (Size > 8) tmp += tmp.shifted(-8);
412 if (Size > 16) tmp += tmp.shifted(-16);
413 return tmp;
414 }
#ifndef Vc_IMPL_SSE4_1
// without SSE4.1 integer multiplication is slow and we rather multiply the scalars
template<> Vc_INTRINSIC Vc_PURE int SSE::int_v::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
template<> Vc_INTRINSIC Vc_PURE unsigned int SSE::uint_v::product() const
{
    return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3));
}
#endif
min(MaskArg m) const426 template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::min(MaskArg m) const
427 {
428 Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::max();
429 tmp(m) = *this;
430 return tmp.min();
431 }
max(MaskArg m) const432 template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::max(MaskArg m) const
433 {
434 Vector<T, VectorAbi::Sse> tmp = std::numeric_limits<Vector<T, VectorAbi::Sse> >::min();
435 tmp(m) = *this;
436 return tmp.max();
437 }
product(MaskArg m) const438 template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::product(MaskArg m) const
439 {
440 Vector<T, VectorAbi::Sse> tmp(Vc::One);
441 tmp(m) = *this;
442 return tmp.product();
443 }
sum(MaskArg m) const444 template<typename T> Vc_ALWAYS_INLINE Vc_PURE typename Vector<T, VectorAbi::Sse>::EntryType Vector<T, VectorAbi::Sse>::sum(MaskArg m) const
445 {
446 Vector<T, VectorAbi::Sse> tmp(Vc::Zero);
447 tmp(m) = *this;
448 return tmp.sum();
449 }
450
451 ///////////////////////////////////////////////////////////////////////////////////////////
452 // exponent {{{1
namespace Detail
{
// Extract the unbiased IEEE-754 exponent of each lane as a floating-point
// value: shift the exponent field down, subtract the bias (127 for float),
// convert to float. Valid only for positive, finite, normalized inputs (the
// callers below assert x >= 0).
Vc_INTRINSIC Vc_CONST __m128 exponent(__m128 v)
{
    __m128i tmp = _mm_srli_epi32(_mm_castps_si128(v), 23);
    tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x7f));
    return _mm_cvtepi32_ps(tmp);
}
// double variant: bias 1023; after the 64-bit shift the exponent sits in the
// low 32 bits of each 64-bit lane, so a 32-bit subtract is safe. The shuffle
// (0x08 = lanes 0,2) compacts the two low halves before the epi32->pd convert.
Vc_INTRINSIC Vc_CONST __m128d exponent(__m128d v)
{
    __m128i tmp = _mm_srli_epi64(_mm_castpd_si128(v), 52);
    tmp = _mm_sub_epi32(tmp, _mm_set1_epi32(0x3ff));
    return _mm_cvtepi32_pd(_mm_shuffle_epi32(tmp, 0x08));
}
} // namespace Detail
468
exponent(SSE::float_v x)469 Vc_INTRINSIC Vc_CONST SSE::float_v exponent(SSE::float_v x)
470 {
471 using Detail::operator>=;
472 Vc_ASSERT((x >= x.Zero()).isFull());
473 return Detail::exponent(x.data());
474 }
exponent(SSE::double_v x)475 Vc_INTRINSIC Vc_CONST SSE::double_v exponent(SSE::double_v x)
476 {
477 using Detail::operator>=;
478 Vc_ASSERT((x >= x.Zero()).isFull());
479 return Detail::exponent(x.data());
480 }
481 // }}}1
482 // Random {{{1
_doRandomStep(SSE::uint_v & state0,SSE::uint_v & state1)483 static void _doRandomStep(SSE::uint_v &state0,
484 SSE::uint_v &state1)
485 {
486 using SSE::uint_v;
487 using Detail::operator+;
488 using Detail::operator*;
489 state0.load(&Common::RandomState[0]);
490 state1.load(&Common::RandomState[uint_v::Size]);
491 (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
492 uint_v(_mm_xor_si128((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
493 _mm_srli_epi32(state1.data(), 16)))
494 .store(&Common::RandomState[0]);
495 }
496
Random()497 template<typename T> Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::Random()
498 {
499 SSE::uint_v state0, state1;
500 _doRandomStep(state0, state1);
501 return state0.data();
502 }
503
Random()504 template<> Vc_ALWAYS_INLINE SSE::float_v SSE::float_v::Random()
505 {
506 SSE::uint_v state0, state1;
507 _doRandomStep(state0, state1);
508 return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one());
509 }
510
Random()511 template<> Vc_ALWAYS_INLINE SSE::double_v SSE::double_v::Random()
512 {
513 typedef unsigned long long uint64 Vc_MAY_ALIAS;
514 uint64 state0 = *reinterpret_cast<const uint64 *>(&Common::RandomState[8]);
515 uint64 state1 = *reinterpret_cast<const uint64 *>(&Common::RandomState[10]);
516 const __m128i state = _mm_load_si128(reinterpret_cast<const __m128i *>(&Common::RandomState[8]));
517 *reinterpret_cast<uint64 *>(&Common::RandomState[ 8]) = (state0 * 0x5deece66dull + 11);
518 *reinterpret_cast<uint64 *>(&Common::RandomState[10]) = (state1 * 0x5deece66dull + 11);
519 return _mm_sub_pd(_mm_or_pd(_mm_castsi128_pd(_mm_srli_epi64(state, 12)), HT::one()), HT::one());
520 }
521 // shifted / rotated {{{1
shifted(int amount) const522 template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount) const
523 {
524 enum {
525 EntryTypeSizeof = sizeof(EntryType)
526 };
527 switch (amount) {
528 case 0: return *this;
529 case 1: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
530 case 2: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
531 case 3: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
532 case 4: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
533 case 5: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
534 case 6: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
535 case 7: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
536 case 8: return SSE::sse_cast<VectorType>(_mm_srli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
537 case -1: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 1 * EntryTypeSizeof));
538 case -2: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 2 * EntryTypeSizeof));
539 case -3: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 3 * EntryTypeSizeof));
540 case -4: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 4 * EntryTypeSizeof));
541 case -5: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 5 * EntryTypeSizeof));
542 case -6: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 6 * EntryTypeSizeof));
543 case -7: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 7 * EntryTypeSizeof));
544 case -8: return SSE::sse_cast<VectorType>(_mm_slli_si128(SSE::sse_cast<__m128i>(d.v()), 8 * EntryTypeSizeof));
545 }
546 return Zero();
547 }
shifted(int amount,Vector shiftIn) const548 template<typename T> Vc_INTRINSIC Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::shifted(int amount, Vector shiftIn) const
549 {
550 if (amount >= -int(size())) {
551 constexpr int VectorWidth = int(size());
552 constexpr int EntryTypeSizeof = sizeof(EntryType);
553 const __m128i v0 = sse_cast<__m128i>(d.v());
554 const __m128i v1 = sse_cast<__m128i>(shiftIn.d.v());
555 auto &&fixup = sse_cast<VectorType, __m128i>;
556 switch (amount) {
557 case 0: return *this;
558 // alignr_epi8: [arg1 arg0] << n
559 case -1: return fixup(SSE::alignr_epi8<(VectorWidth - 1) * EntryTypeSizeof>(v0, v1));
560 case -2: return fixup(SSE::alignr_epi8<(VectorWidth - 2) * EntryTypeSizeof>(v0, v1));
561 case -3: return fixup(SSE::alignr_epi8<(VectorWidth - 3) * EntryTypeSizeof>(v0, v1));
562 case -4: return fixup(SSE::alignr_epi8<(VectorWidth - 4) * EntryTypeSizeof>(v0, v1));
563 case -5: return fixup(SSE::alignr_epi8<(VectorWidth - 5) * EntryTypeSizeof>(v0, v1));
564 case -6: return fixup(SSE::alignr_epi8<(VectorWidth - 6) * EntryTypeSizeof>(v0, v1));
565 case -7: return fixup(SSE::alignr_epi8<(VectorWidth - 7) * EntryTypeSizeof>(v0, v1));
566 case -8: return fixup(SSE::alignr_epi8<(VectorWidth - 8) * EntryTypeSizeof>(v0, v1));
567 case -9: return fixup(SSE::alignr_epi8<(VectorWidth - 9) * EntryTypeSizeof>(v0, v1));
568 case-10: return fixup(SSE::alignr_epi8<(VectorWidth -10) * EntryTypeSizeof>(v0, v1));
569 case-11: return fixup(SSE::alignr_epi8<(VectorWidth -11) * EntryTypeSizeof>(v0, v1));
570 case-12: return fixup(SSE::alignr_epi8<(VectorWidth -12) * EntryTypeSizeof>(v0, v1));
571 case-13: return fixup(SSE::alignr_epi8<(VectorWidth -13) * EntryTypeSizeof>(v0, v1));
572 case-14: return fixup(SSE::alignr_epi8<(VectorWidth -14) * EntryTypeSizeof>(v0, v1));
573 case-15: return fixup(SSE::alignr_epi8<(VectorWidth -15) * EntryTypeSizeof>(v0, v1));
574 case 1: return fixup(SSE::alignr_epi8< 1 * EntryTypeSizeof>(v1, v0));
575 case 2: return fixup(SSE::alignr_epi8< 2 * EntryTypeSizeof>(v1, v0));
576 case 3: return fixup(SSE::alignr_epi8< 3 * EntryTypeSizeof>(v1, v0));
577 case 4: return fixup(SSE::alignr_epi8< 4 * EntryTypeSizeof>(v1, v0));
578 case 5: return fixup(SSE::alignr_epi8< 5 * EntryTypeSizeof>(v1, v0));
579 case 6: return fixup(SSE::alignr_epi8< 6 * EntryTypeSizeof>(v1, v0));
580 case 7: return fixup(SSE::alignr_epi8< 7 * EntryTypeSizeof>(v1, v0));
581 case 8: return fixup(SSE::alignr_epi8< 8 * EntryTypeSizeof>(v1, v0));
582 case 9: return fixup(SSE::alignr_epi8< 9 * EntryTypeSizeof>(v1, v0));
583 case 10: return fixup(SSE::alignr_epi8<10 * EntryTypeSizeof>(v1, v0));
584 case 11: return fixup(SSE::alignr_epi8<11 * EntryTypeSizeof>(v1, v0));
585 case 12: return fixup(SSE::alignr_epi8<12 * EntryTypeSizeof>(v1, v0));
586 case 13: return fixup(SSE::alignr_epi8<13 * EntryTypeSizeof>(v1, v0));
587 case 14: return fixup(SSE::alignr_epi8<14 * EntryTypeSizeof>(v1, v0));
588 case 15: return fixup(SSE::alignr_epi8<15 * EntryTypeSizeof>(v1, v0));
589 }
590 }
591 return shiftIn.shifted(int(size()) + amount);
592 }
// Rotate the vector entries by `amount` positions: entry i of the result is
// entry (i + amount) % Size of *this (entries shifted out at one end re-enter
// at the other).
template<typename T> Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::rotated(int amount) const
{
    enum {
        EntryTypeSizeof = sizeof(EntryType)
    };
    // Reinterpret as an integer vector so the byte-wise palignr (alignr_epi8)
    // can implement the rotation for every EntryType.
    const __m128i v = SSE::sse_cast<__m128i>(d.v());
    // The cast to unsigned maps negative amounts onto the equivalent positive
    // rotation before taking the modulus.
    switch (static_cast<unsigned int>(amount) % Size) {
    case 0: return *this;
    case 1: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<1 * EntryTypeSizeof>(v, v));
    case 2: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<2 * EntryTypeSizeof>(v, v));
    case 3: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<3 * EntryTypeSizeof>(v, v));
    // warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake.
    // ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType))
    // disables the following four calls unless sizeof(EntryType) == 2.
    case 4: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<4 * EntryTypeSizeof>(v, v));
    case 5: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<5 * EntryTypeSizeof>(v, v));
    case 6: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<6 * EntryTypeSizeof>(v, v));
    case 7: return SSE::sse_cast<VectorType>(SSE::alignr_epi8<7 * EntryTypeSizeof>(v, v));
    }
    // Unreachable: the switch covers every value of amount % Size.
    return Zero();
}
614 // sorted {{{1
615 namespace Detail
616 {
sorted(SSE::double_v x_)617 inline Vc_CONST SSE::double_v sorted(SSE::double_v x_)
618 {
619 const __m128d x = x_.data();
620 const __m128d y = _mm_shuffle_pd(x, x, _MM_SHUFFLE2(0, 1));
621 return _mm_unpacklo_pd(_mm_min_sd(x, y), _mm_max_sd(x, y));
622 }
623 } // namespace Detail
// Return a vector with the entries of *this sorted in ascending order.
// Forwards to the per-type Detail::sorted overloads above.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Sse> Vector<T, VectorAbi::Sse>::sorted()
    const
{
    return Detail::sorted(*this);
}
630 // interleaveLow/-High {{{1
// interleaveLow returns (this[0], x[0], this[1], x[1], ...) built from the low
// halves of *this and x; interleaveHigh does the same for the high halves.
// Implemented with the matching unpacklo/unpackhi intrinsic per element width.
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveLow (SSE::double_v x) const { return _mm_unpacklo_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE::double_v SSE::double_v::interleaveHigh(SSE::double_v x) const { return _mm_unpackhi_pd(data(), x.data()); }
template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveLow ( SSE::float_v x) const { return _mm_unpacklo_ps(data(), x.data()); }
template <> Vc_INTRINSIC SSE::float_v SSE::float_v::interleaveHigh( SSE::float_v x) const { return _mm_unpackhi_ps(data(), x.data()); }
template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveLow ( SSE::int_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::int_v SSE::int_v::interleaveHigh( SSE::int_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveLow ( SSE::uint_v x) const { return _mm_unpacklo_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::uint_v SSE::uint_v::interleaveHigh( SSE::uint_v x) const { return _mm_unpackhi_epi32(data(), x.data()); }
template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveLow ( SSE::short_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::short_v SSE::short_v::interleaveHigh( SSE::short_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveLow (SSE::ushort_v x) const { return _mm_unpacklo_epi16(data(), x.data()); }
template <> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::interleaveHigh(SSE::ushort_v x) const { return _mm_unpackhi_epi16(data(), x.data()); }
643 // }}}1
644 // generate {{{1
generate(G gen)645 template <> template <typename G> Vc_INTRINSIC SSE::double_v SSE::double_v::generate(G gen)
646 {
647 const auto tmp0 = gen(0);
648 const auto tmp1 = gen(1);
649 return _mm_setr_pd(tmp0, tmp1);
650 }
generate(G gen)651 template <> template <typename G> Vc_INTRINSIC SSE::float_v SSE::float_v::generate(G gen)
652 {
653 const auto tmp0 = gen(0);
654 const auto tmp1 = gen(1);
655 const auto tmp2 = gen(2);
656 const auto tmp3 = gen(3);
657 return _mm_setr_ps(tmp0, tmp1, tmp2, tmp3);
658 }
generate(G gen)659 template <> template <typename G> Vc_INTRINSIC SSE::int_v SSE::int_v::generate(G gen)
660 {
661 const auto tmp0 = gen(0);
662 const auto tmp1 = gen(1);
663 const auto tmp2 = gen(2);
664 const auto tmp3 = gen(3);
665 return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
666 }
generate(G gen)667 template <> template <typename G> Vc_INTRINSIC SSE::uint_v SSE::uint_v::generate(G gen)
668 {
669 const auto tmp0 = gen(0);
670 const auto tmp1 = gen(1);
671 const auto tmp2 = gen(2);
672 const auto tmp3 = gen(3);
673 return _mm_setr_epi32(tmp0, tmp1, tmp2, tmp3);
674 }
generate(G gen)675 template <> template <typename G> Vc_INTRINSIC SSE::short_v SSE::short_v::generate(G gen)
676 {
677 const auto tmp0 = gen(0);
678 const auto tmp1 = gen(1);
679 const auto tmp2 = gen(2);
680 const auto tmp3 = gen(3);
681 const auto tmp4 = gen(4);
682 const auto tmp5 = gen(5);
683 const auto tmp6 = gen(6);
684 const auto tmp7 = gen(7);
685 return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
686 }
generate(G gen)687 template <> template <typename G> Vc_INTRINSIC SSE::ushort_v SSE::ushort_v::generate(G gen)
688 {
689 const auto tmp0 = gen(0);
690 const auto tmp1 = gen(1);
691 const auto tmp2 = gen(2);
692 const auto tmp3 = gen(3);
693 const auto tmp4 = gen(4);
694 const auto tmp5 = gen(5);
695 const auto tmp6 = gen(6);
696 const auto tmp7 = gen(7);
697 return _mm_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
698 }
699 // }}}1
700 // reversed {{{1
// reversed() returns the entries in opposite order: result[i] = this[Size - 1 - i].
template <> Vc_INTRINSIC Vc_PURE SSE::double_v SSE::double_v::reversed() const
{
    return Mem::permute<X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::float_v SSE::float_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::int_v SSE::int_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::uint_v SSE::uint_v::reversed() const
{
    return Mem::permute<X3, X2, X1, X0>(d.v());
}
template <> Vc_INTRINSIC Vc_PURE SSE::short_v SSE::short_v::reversed() const
{
    // 16-bit elements need two steps: reverse within each 64-bit half
    // (permuteHi/permuteLo), then swap the two halves (shuffle<X1, Y0>).
    return sse_cast<__m128i>(
        Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
                             sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
template <> Vc_INTRINSIC Vc_PURE SSE::ushort_v SSE::ushort_v::reversed() const
{
    // Same two-step reversal as for short_v above.
    return sse_cast<__m128i>(
        Mem::shuffle<X1, Y0>(sse_cast<__m128d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
                             sse_cast<__m128d>(Mem::permuteLo<X3, X2, X1, X0>(d.v()))));
}
729 // }}}1
730 // permutation via operator[] {{{1
// Permutation via operator[]: returns a vector whose entry i is this->entry(perm[i]).
// Only functional when built with AVX support, because _mm_permutevar_ps
// (VPERMILPS) is an AVX instruction even though it operates on 128-bit
// registers. Without AVX the vector is returned unpermuted — marked TODO below.
template <>
Vc_INTRINSIC SSE::float_v SSE::float_v::operator[](const SSE::int_v &
#ifdef Vc_IMPL_AVX
                                                       perm
#endif
                                                   ) const
{
    /*
    const int_m cross128 = concat(_mm_cmpgt_epi32(lo128(perm.data()), _mm_set1_epi32(3)),
                                  _mm_cmplt_epi32(hi128(perm.data()), _mm_set1_epi32(4)));
    if (cross128.isNotEmpty()) {
        SSE::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
        x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
        return x;
    } else {
    */
#ifdef Vc_IMPL_AVX
    return _mm_permutevar_ps(d.v(), perm.data());
#else
    return *this;//TODO
#endif
}
753 // broadcast from constexpr index {{{1
broadcast() const754 template <> template <int Index> Vc_INTRINSIC SSE::float_v SSE::float_v::broadcast() const
755 {
756 constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
757 return Mem::permute<Inner, Inner, Inner, Inner>(d.v());
758 }
broadcast() const759 template <> template <int Index> Vc_INTRINSIC SSE::double_v SSE::double_v::broadcast() const
760 {
761 constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
762 return Mem::permute<Inner, Inner>(d.v());
763 }
764 // }}}1
765
766 namespace Common
767 {
768 // transpose_impl {{{1
transpose_impl(TransposeTag<4,4>,SSE::float_v * Vc_RESTRICT r[],const TransposeProxy<SSE::float_v,SSE::float_v,SSE::float_v,SSE::float_v> & proxy)769 Vc_ALWAYS_INLINE void transpose_impl(
770 TransposeTag<4, 4>, SSE::float_v *Vc_RESTRICT r[],
771 const TransposeProxy<SSE::float_v, SSE::float_v, SSE::float_v, SSE::float_v> &proxy)
772 {
773 const auto in0 = std::get<0>(proxy.in).data();
774 const auto in1 = std::get<1>(proxy.in).data();
775 const auto in2 = std::get<2>(proxy.in).data();
776 const auto in3 = std::get<3>(proxy.in).data();
777 const auto tmp0 = _mm_unpacklo_ps(in0, in2);
778 const auto tmp1 = _mm_unpacklo_ps(in1, in3);
779 const auto tmp2 = _mm_unpackhi_ps(in0, in2);
780 const auto tmp3 = _mm_unpackhi_ps(in1, in3);
781 *r[0] = _mm_unpacklo_ps(tmp0, tmp1);
782 *r[1] = _mm_unpackhi_ps(tmp0, tmp1);
783 *r[2] = _mm_unpacklo_ps(tmp2, tmp3);
784 *r[3] = _mm_unpackhi_ps(tmp2, tmp3);
785 }
786 // }}}1
787 } // namespace Common
788 }
789
790 // vim: foldmethod=marker
791