/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#include "../common/x86_prefetches.h"
#include "../common/gatherimplementation.h"
#include "../common/scatterimplementation.h"
#include "limits.h"
#include "const.h"
#include "../common/set.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// compare operators {{{1
Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); }
Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); }

#ifdef Vc_IMPL_AVX2
Vc_INTRINSIC AVX2::   int_m operator==(AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator==(AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator!=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator!=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator>=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator>=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator<=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator<=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator> (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator> (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator< (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator< (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }
#endif  // Vc_IMPL_AVX2
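
// Illustrative usage (a sketch, not part of the library sources): comparing
// two vectors yields a per-lane mask object, e.g.
//     AVX2::float_v a, b;
//     AVX2::float_m m = a < b;  // query via m.isFull(), m.isNotEmpty(), ...
// The unsigned variants above go through the AVX::cmpgt_epu32/cmpgt_epu16
// wrappers because AVX2 only provides signed integer compares natively.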

// bitwise operators {{{1
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator^(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return xor_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator&(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return and_(a.data(), b.data());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator|(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return or_(a.data(), b.data());
}
// }}}1
// arithmetic operators {{{1
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator+(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return add(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator-(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return sub(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator*(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return mul(a.data(), b.data(), T());
}
template <typename T>
Vc_INTRINSIC AVX2::Vector<T> operator/(AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return div(a.data(), b.data(), T());
}
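
// ushort division has no dedicated AVX instruction: both operands are widened
// to float, divided with _mm256_div_ps, and narrowed back. Quotients above
// 32767 would saturate in the cheaper float->short conversion, so that (rare)
// case falls back to the float->ushort conversion instead.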
Vc_INTRINSIC AVX2::Vector<ushort> operator/(AVX2::Vector<ushort> a,
                                            AVX2::Vector<ushort> b)
{
    using namespace AVX;
    const __m256 lo = _mm256_div_ps(convert<ushort, float>(lo128(a.data())),
                                    convert<ushort, float>(lo128(b.data())));
    const __m256 hi = _mm256_div_ps(convert<ushort, float>(hi128(a.data())),
                                    convert<ushort, float>(hi128(b.data())));
    const float_v threshold = 32767.f;
    using Detail::operator>;
    const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty()))
                                ? convert<float, ushort>(lo)
                                : convert<float, short>(lo);
    const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty()))
                                ? convert<float, ushort>(hi)
                                : convert<float, short>(hi);
    return concat(loShort, hiShort);
}
template <typename T>
Vc_INTRINSIC enable_if<std::is_integral<T>::value, AVX2::Vector<T>> operator%(
    AVX2::Vector<T> a, AVX2::Vector<T> b)
{
    return a - a / b * b;
}
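// operator% relies on the identity a % b == a - (a / b) * b, which holds for
// the truncating division implemented above; e.g. lanes holding 7 % 3 and
// -7 % 3 evaluate to 1 and -1, matching scalar C++ semantics.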
// }}}1
}  // namespace Detail
///////////////////////////////////////////////////////////////////////////////////////////
// generate {{{1
template <> template <typename G> Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3);
}
template <> template <typename G> Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
#ifdef Vc_IMPL_AVX2
template <> template <typename G> Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    const auto tmp8 = gen(8);
    const auto tmp9 = gen(9);
    const auto tmp10 = gen(10);
    const auto tmp11 = gen(11);
    const auto tmp12 = gen(12);
    const auto tmp13 = gen(13);
    const auto tmp14 = gen(14);
    const auto tmp15 = gen(15);
    return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
template <> template <typename G> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    const auto tmp8 = gen(8);
    const auto tmp9 = gen(9);
    const auto tmp10 = gen(10);
    const auto tmp11 = gen(11);
    const auto tmp12 = gen(12);
    const auto tmp13 = gen(13);
    const auto tmp14 = gen(14);
    const auto tmp15 = gen(15);
    return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
#endif
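
// Illustrative use of generate (a sketch, not part of this file): the functor
// is invoked once per lane with the lane index, so
//     auto iota = AVX2::float_v::generate([](int i) { return float(i); });
// yields {0, 1, 2, 3, 4, 5, 6, 7}. The tmp variables above pin down the
// gen(0) ... gen(N-1) evaluation order, which passing the calls directly as
// function arguments would not guarantee.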

// constants {{{1
template <typename T> Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(VectorSpecialInitializerZero) : d{} {}

template <> Vc_INTRINSIC Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {}
template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC Vector<   int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {}
template <> Vc_INTRINSIC Vector<  uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {}
template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {}
template <> Vc_INTRINSIC Vector<ushort, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {}
template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {}
template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {}
#endif

template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx>::Vector(
    VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<T>::address(), Vc::Aligned)
{
}

template <>
Vc_ALWAYS_INLINE Vector<float, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}
template <>
Vc_ALWAYS_INLINE Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
    : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
{
}

///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
// general load, implemented via LoadHelper {{{2
template <typename DstT>
template <typename SrcT, typename Flags>
Vc_INTRINSIC typename Vector<DstT, VectorAbi::Avx>::
#ifndef Vc_MSVC
template
#endif
load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Avx>::load(const SrcT *mem, Flags flags)
{
    Common::handleLoadPrefetches(mem, flags);
    d.v() = Detail::load<VectorType, DstT>(mem, flags);
}

///////////////////////////////////////////////////////////////////////////////////////////
// zeroing {{{1
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero()
{
    data() = Detail::zero<VectorType>();
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero(const Mask &k)
{
    data() = Detail::andnot_(k.data(), data());
}
template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZeroInverted(const Mask &k)
{
    data() = Detail::and_(k.data(), data());
}

template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan()
{
    data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan(MaskArgument k)
{
    data() = _mm256_or_pd(data(), k.dataD());
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan()
{
    data() = Detail::allone<VectorType>();
}
template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan(MaskArgument k)
{
    data() = _mm256_or_ps(data(), k.dataF());
}

///////////////////////////////////////////////////////////////////////////////////////////
// stores {{{1
template <typename T>
template <typename U,
          typename Flags,
          typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data());
}

template <typename T>
template <typename U,
          typename Flags,
          typename>
Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Mask mask, Flags flags) const
{
    Common::handleStorePrefetches(mem, flags);
    HV::template store<Flags>(mem, data(), mask.data());
}

///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
#ifdef Vc_IMPL_AVX2
template <> Vc_ALWAYS_INLINE AVX2::Vector<   int> Vector<   int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<  uint> Vector<  uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<   int> Vector<   int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<  uint> Vector<  uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    return *this = *this << x;
}
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    return *this = *this >> x;
}
#endif

template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(int shift) {
    d.v() = Detail::shiftRight(d.v(), shift, T());
    return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator>>(int shift) const {
    return Detail::shiftRight(d.v(), shift, T());
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(int shift) {
    d.v() = Detail::shiftLeft(d.v(), shift, T());
    return *static_cast<AVX2::Vector<T> *>(this);
}
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator<<(int shift) const {
    return Detail::shiftLeft(d.v(), shift, T());
}

// isnegative {{{1
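// Both overloads extract the IEEE-754 sign bit: mask it out with
// setsignmask_ps/pd, smear it across each 32-bit lane with the arithmetic
// shift srai_epi32<31>, and reinterpret the result as a mask. For double the
// sign bit lives in the upper 32-bit half of each 64-bit lane, hence the
// additional permute<X1, X1, X3, X3> to copy it into the lower half as well.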
Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x)
{
    return AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
        AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data()))));
}
Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
{
    return Mem::permute<X1, X1, X3, X3>(AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
        AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
}
// gathers {{{1
#define Vc_GATHER_IMPL(V_)                                                               \
    template <>                                                                          \
    template <class MT, class IT, int Scale>                                             \
    inline void AVX2::V_::gatherImplementation(                                          \
        const Common::GatherArguments<MT, IT, Scale> &args)
#define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }

Vc_GATHER_IMPL(float_v)
{
    d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
                           Vc_M(7));
}

#ifdef Vc_IMPL_AVX2
Vc_GATHER_IMPL(int_v)
{
    d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7));
}

Vc_GATHER_IMPL(uint_v)
{
    d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7));
}

Vc_GATHER_IMPL(short_v)
{
    d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
                              Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}

Vc_GATHER_IMPL(ushort_v)
{
    d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
                              Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
                              Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
}
#endif
#undef Vc_M
#undef Vc_GATHER_IMPL

template <class T>
template <class MT, class IT, int Scale>
inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
    const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
{
    const auto *mem = args.address;
    const auto indexes = Scale * args.indexes;
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                            Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
              Common::GatherScatterImplementation::SimpleLoop
#endif
                                                > ;
    Common::executeGather(Selector(), *this, mem, indexes, mask);
}

template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes) const
{
    Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
}

template <typename T>
template <typename MT, typename IT>
inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
{
    using Selector = std::integral_constant < Common::GatherScatterImplementation,
#ifdef Vc_USE_SET_GATHERS
          Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
#endif
#ifdef Vc_USE_BSF_GATHERS
                                            Common::GatherScatterImplementation::BitScanLoop
#elif defined Vc_USE_POPCNT_BSF_GATHERS
              Common::GatherScatterImplementation::PopcntSwitch
#else
              Common::GatherScatterImplementation::SimpleLoop
#endif
                                                > ;
    Common::executeScatter(Selector(), *this, mem, std::forward<IT>(indexes), mask);
}

///////////////////////////////////////////////////////////////////////////////////////////
// operator- {{{1
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
    return VectorType(-d.builtin());
}
#else
template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
{
    return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
}
#endif

///////////////////////////////////////////////////////////////////////////////////////////
// horizontal ops {{{1
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::minIndex() const
{
    AVX2::Vector<T> x = min();
    return std::make_pair(x, (*this == x).firstOne());
}
template <typename T>
Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
Vector<T, VectorAbi::Avx>::maxIndex() const
{
    AVX2::Vector<T> x = max();
    return std::make_pair(x, (*this == x).firstOne());
}
template <> Vc_INTRINSIC std::pair<AVX2::float_v, int> AVX2::float_v::minIndex() const
{
    /*
    // 28 cycles latency:
    __m256 x = _mm256_min_ps(Mem::permute128<X1, X0>(d.v()), d.v());
    x = _mm256_min_ps(x, Reg::permute<X2, X3, X0, X1>(x));
    AVX2::float_v xx = _mm256_min_ps(x, Reg::permute<X1, X0, X3, X2>(x));
    AVX2::uint_v idx = AVX2::uint_v::IndexesFromZero();
    idx = _mm256_castps_si256(
        _mm256_or_ps((*this != xx).data(), _mm256_castsi256_ps(idx.data())));
    return std::make_pair(xx, (*this == xx).firstOne());

    __m128 loData = AVX::lo128(d.v());
    __m128 hiData = AVX::hi128(d.v());
    const __m128 less2 = _mm_cmplt_ps(hiData, loData);
    loData = _mm_min_ps(loData, hiData);
    hiData = Mem::permute<X2, X3, X0, X1>(loData);
    const __m128 less1 = _mm_cmplt_ps(hiData, loData);
    loData = _mm_min_ps(loData, hiData);
    hiData = Mem::permute<X1, X0, X3, X2>(loData);
    const __m128 less0 = _mm_cmplt_ps(hiData, loData);
    unsigned bits = _mm_movemask_ps(less0) & 0x1;
    bits |= ((_mm_movemask_ps(less1) << 1) - bits) & 0x2;
    bits |= ((_mm_movemask_ps(less2) << 3) - bits) & 0x4;
    loData = _mm_min_ps(loData, hiData);
    return std::make_pair(AVX::concat(loData, loData), bits);
    */

    // 28 cycles latency:
    __m256 x = d.v();
    __m256 idx = Vector<float>::IndexesFromZero().data();
    __m256 y = Mem::permute128<X1, X0>(x);
    __m256 idy = Mem::permute128<X1, X0>(idx);
    __m256 less = AVX::cmplt_ps(x, y);

    x = _mm256_blendv_ps(y, x, less);
    idx = _mm256_blendv_ps(idy, idx, less);
    y = Reg::permute<X2, X3, X0, X1>(x);
    idy = Reg::permute<X2, X3, X0, X1>(idx);
    less = AVX::cmplt_ps(x, y);

    x = _mm256_blendv_ps(y, x, less);
    idx = _mm256_blendv_ps(idy, idx, less);
    y = Reg::permute<X1, X0, X3, X2>(x);
    idy = Reg::permute<X1, X0, X3, X2>(idx);
    less = AVX::cmplt_ps(x, y);

    idx = _mm256_blendv_ps(idy, idx, less);

    const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx));
#ifdef Vc_GNU_ASM
    __asm__ __volatile__(""); // help GCC to order the instructions better
#endif
    x = _mm256_blendv_ps(y, x, less);
    return std::make_pair(x, index);
}
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum() const
{
    //   a    b    c    d    e    f    g    h
    // +      a    b    c    d    e    f    g    -> a ab bc  cd   de    ef     fg      gh
    // +           a    ab   bc   cd   de   ef   -> a ab abc abcd bcde  cdef   defg    efgh
    // +                     a    ab   abc  abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    AVX2::Vector<T> tmp = *this;
    if (Size >  1) tmp += tmp.shifted(-1);
    if (Size >  2) tmp += tmp.shifted(-2);
    if (Size >  4) tmp += tmp.shifted(-4);
    if (Size >  8) tmp += tmp.shifted(-8);
    if (Size > 16) tmp += tmp.shifted(-16);
    return tmp;
}
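// Example (illustrative values): for a float_v holding {1, 2, 3, 4, 5, 6, 7, 8}
// the log2(Size) shifted additions above produce the inclusive prefix sums
// {1, 3, 6, 10, 15, 21, 28, 36}.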

/* This function requires correct masking because the neutral element of \p op is not necessarily 0
 *
template<typename T> template<typename BinaryOperation> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum(BinaryOperation op) const
{
    //   a    b    c    d    e    f    g    h
    // +      a    b    c    d    e    f    g    -> a ab bc  cd   de    ef     fg      gh
    // +           a    ab   bc   cd   de   ef   -> a ab abc abcd bcde  cdef   defg    efgh
    // +                     a    ab   abc  abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    AVX2::Vector<T> tmp = *this;
    Mask mask(true);
    if (Size >  1) tmp(mask) = op(tmp, tmp.shifted(-1));
    if (Size >  2) tmp(mask) = op(tmp, tmp.shifted(-2));
    if (Size >  4) tmp(mask) = op(tmp, tmp.shifted(-4));
    if (Size >  8) tmp(mask) = op(tmp, tmp.shifted(-8));
    if (Size > 16) tmp(mask) = op(tmp, tmp.shifted(-16));
    return tmp;
}
*/

template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::min(MaskArgument m) const
{
    AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::max();
    tmp(m) = *this;
    return tmp.min();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::max(MaskArgument m) const
{
    AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::min();
    tmp(m) = *this;
    return tmp.max();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::product(MaskArgument m) const
{
    AVX2::Vector<T> tmp(Vc::One);
    tmp(m) = *this;
    return tmp.product();
}
template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::sum(MaskArgument m) const
{
    AVX2::Vector<T> tmp(Vc::Zero);
    tmp(m) = *this;
    return tmp.sum();
}//}}}
// exponent {{{1
namespace Detail
{
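// The two helpers below read the unbiased exponent straight out of the
// IEEE-754 bit pattern: binary32 keeps the exponent in bits 30..23 with bias
// 0x7f (127), binary64 in bits 62..52 with bias 0x3ff (1023). Shifting the
// raw bits right and subtracting the bias yields the integral exponent, which
// is then converted to floating point; e.g. exponent(8.f) == 3.f and
// exponent(0.5) == -1.0. Only valid for positive, finite, normalized inputs
// (cf. the Vc_ASSERT in the public wrappers further down).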
Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v)
{
    using namespace AVX;
    __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23);
    __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
    return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
}
Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v)
{
    using namespace AVX;
    __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52);
    __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52);
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
    return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
}
} // namespace Detail

Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x)
{
    using Detail::operator>=;
    Vc_ASSERT((x >= x.Zero()).isFull());
    return Detail::exponent(x.data());
}
// }}}1
// Random {{{1
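// _doRandomStep advances Common::RandomState with a linear congruential
// generator, state' = state * 0xdeece66d + 11 (the multiplier/increment pair
// of the POSIX rand48 family, truncated to 32 bits here), and decorrelates
// two streams by XORing one with the high 16 bits of the other.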
static Vc_ALWAYS_INLINE __m256i _doRandomStep()
{
    using Detail::operator*;
    using Detail::operator+;
#ifdef Vc_IMPL_AVX2
    using AVX2::uint_v;
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm256_srli_epi32(state1.data(), 16)))
        .store(&Common::RandomState[0]);
    return state0.data();
#else
    using SSE::uint_v;
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    uint_v state2(&Common::RandomState[2 * uint_v::Size]);
    uint_v state3(&Common::RandomState[3 * uint_v::Size]);
    (state2 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[2 * uint_v::Size]);
    (state3 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[3 * uint_v::Size]);
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state2.data(), 16)))
        .store(&Common::RandomState[0]);
    uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state3.data(), 16)))
        .store(&Common::RandomState[uint_v::Size]);
    return AVX::concat(state0.data(), state1.data());
#endif
}

#ifdef Vc_IMPL_AVX2
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::Random()
{
    return {_doRandomStep()};
}
#endif

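// float_v::Random() maps the 32 raw bits into [0, 1): srli_epi32<2> clears
// the top two bits, or_ with HT::one() stamps in the exponent of 1.0f so the
// random bits become the mantissa of a value in [1, 2), and subtracting 1.0f
// shifts that to [0, 1). double_v::Random() below plays the same trick with
// srli_epi64<12> to fill the 52-bit double mantissa.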
template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random()
{
    return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()),
                   HT::one());
}

template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random()
{
    const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned,
                                       Detail::LoadTag<__m256i, int>());
    for (size_t k = 0; k < 8; k += 2) {
        typedef unsigned long long uint64 Vc_MAY_ALIAS;
        const uint64 stateX = *aliasing_cast<uint64>(&Common::RandomState[k]);
        *aliasing_cast<uint64>(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11);
    }
    return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one());
}
// }}}1
// shifted / rotated {{{1
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount) const
{
    return Detail::shifted<EntryType>(d.v(), amount);
}

template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>)
{
    return Mem::shuffle<X2, X3, Y0, Y1>(left, right);
}
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>)
{
    return Mem::shuffle128<X1, Y0>(left, right);
}

template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount, Vector shiftIn) const
{
#ifdef __GNUC__
    if (__builtin_constant_p(amount)) {
        const __m256i a = AVX::avx_cast<__m256i>(d.v());
        const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v());
        if (amount * 2 == int(Size)) {
            return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT());
        }
        if (amount * 2 == -int(Size)) {
            return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT());
        }
        switch (amount) {
        case 1:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   sizeof(EntryType))
#else  // Vc_IMPL_AVX2
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType)))
#endif  // Vc_IMPL_AVX2
                    );
        case 2:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   2 * sizeof(EntryType))
#else  // Vc_IMPL_AVX2
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType)))
#endif  // Vc_IMPL_AVX2
                    );
        case 3:
            if (6u < Size) {
                return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                    _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                       3 * sizeof(EntryType))
#else   // Vc_IMPL_AVX2
                    AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a),
                                                3 * sizeof(EntryType)),
                                _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a),
                                                3 * sizeof(EntryType)))
#endif  // Vc_IMPL_AVX2
                        );
            // TODO: } else {
            }
        }
    }
#endif
    using Detail::operator|;
    return shifted(amount) | (amount > 0 ?
                              shiftIn.shifted(amount - Size) :
                              shiftIn.shifted(Size + amount));
}
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::rotated(int amount) const
{
    return Detail::rotated<EntryType, size()>(d.v(), amount);
}
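// Illustrative semantics (hypothetical values): for an int_v holding
// {0, 1, 2, 3, 4, 5, 6, 7}, shifted(2) gives {2, 3, 4, 5, 6, 7, 0, 0} (zeros
// shifted in) while rotated(2) gives {2, 3, 4, 5, 6, 7, 0, 1}. The
// two-argument shifted() concatenates with shiftIn instead of zeros; the
// constant-amount fast paths above map that onto palignr/permute2x128 where
// the shift distance allows it.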
// sorted {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::sorted()
    const
{
    return Detail::sorted(*this);
}
// interleaveLow/-High {{{1
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const
{
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_pd(data(), x.data()),
                                   _mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const
{
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_pd(data(), x.data()),
                                   _mm256_unpackhi_pd(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const
{
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_ps(data(), x.data()),
                                   _mm256_unpackhi_ps(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const
{
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_ps(data(), x.data()),
                                   _mm256_unpackhi_ps(data(), x.data()));
}
#ifdef Vc_IMPL_AVX2
template <> Vc_INTRINSIC    AVX2::int_v    AVX2::int_v::interleaveLow (   AVX2::int_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC    AVX2::int_v    AVX2::int_v::interleaveHigh(   AVX2::int_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC   AVX2::uint_v   AVX2::uint_v::interleaveLow (  AVX2::uint_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC   AVX2::uint_v   AVX2::uint_v::interleaveHigh(  AVX2::uint_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
                                   _mm256_unpackhi_epi32(data(), x.data()));
}
template <> Vc_INTRINSIC  AVX2::short_v  AVX2::short_v::interleaveLow ( AVX2::short_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC  AVX2::short_v  AVX2::short_v::interleaveHigh( AVX2::short_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const {
    return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const {
    return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
                                   _mm256_unpackhi_epi16(data(), x.data()));
}
#endif
// permutation via operator[] {{{1
template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X1, X0, X3, X2>(d.v()));
}
template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
#ifdef Vc_IMPL_AVX2
template <>
Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
template <>
Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
#endif
template <> Vc_INTRINSIC AVX2::float_v Vector<float, VectorAbi::Avx>::operator[](const IndexType &/*perm*/) const
{
    // TODO
    return *this;
#ifdef Vc_IMPL_AVX2
#else
    /*
    const int_m cross128 = AVX::concat(_mm_cmpgt_epi32(AVX::lo128(perm.data()), _mm_set1_epi32(3)),
                                  _mm_cmplt_epi32(AVX::hi128(perm.data()), _mm_set1_epi32(4)));
    if (cross128.isNotEmpty()) {
    AVX2::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
        x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
        return x;
    } else {
    */
#endif
}

// reversed {{{1
template <typename T>
Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::reversed() const
{
    return (*this)[Permutation::Reversed];
}

// broadcast from constexpr index {{{1
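// broadcast<Index>() decomposes the constexpr lane index into the position
// within one 128-bit half (Inner) and the half itself (Outer), then
// replicates via a cross-lane permute128 followed by an in-lane permute.
// E.g. float_v::broadcast<5>() selects the high half (Outer == 1) and its
// second element (Inner == 1).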
template <> template <int Index> Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x4) / 4);
    return Mem::permute<Inner, Inner, Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
template <> template <int Index> Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x2) / 2);
    return Mem::permute<Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
// }}}1
}  // namespace Vc

// vim: foldmethod=marker