/* This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/
27
28 #include "../common/x86_prefetches.h"
29 #include "../common/gatherimplementation.h"
30 #include "../common/scatterimplementation.h"
31 #include "limits.h"
32 #include "const.h"
33 #include "../common/set.h"
34 #include "macros.h"
35
36 namespace Vc_VERSIONED_NAMESPACE
37 {
38 namespace Detail
39 {
40 // compare operators {{{1
operator ==(AVX2::double_v a,AVX2::double_v b)41 Vc_INTRINSIC AVX2::double_m operator==(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpeq_pd(a.data(), b.data()); }
operator ==(AVX2::float_v a,AVX2::float_v b)42 Vc_INTRINSIC AVX2:: float_m operator==(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpeq_ps(a.data(), b.data()); }
operator !=(AVX2::double_v a,AVX2::double_v b)43 Vc_INTRINSIC AVX2::double_m operator!=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpneq_pd(a.data(), b.data()); }
operator !=(AVX2::float_v a,AVX2::float_v b)44 Vc_INTRINSIC AVX2:: float_m operator!=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpneq_ps(a.data(), b.data()); }
operator >=(AVX2::double_v a,AVX2::double_v b)45 Vc_INTRINSIC AVX2::double_m operator>=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmpnlt_pd(a.data(), b.data()); }
operator >=(AVX2::float_v a,AVX2::float_v b)46 Vc_INTRINSIC AVX2:: float_m operator>=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpnlt_ps(a.data(), b.data()); }
operator <=(AVX2::double_v a,AVX2::double_v b)47 Vc_INTRINSIC AVX2::double_m operator<=(AVX2::double_v a, AVX2::double_v b) { return AVX::cmple_pd(a.data(), b.data()); }
operator <=(AVX2::float_v a,AVX2::float_v b)48 Vc_INTRINSIC AVX2:: float_m operator<=(AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmple_ps(a.data(), b.data()); }
operator >(AVX2::double_v a,AVX2::double_v b)49 Vc_INTRINSIC AVX2::double_m operator> (AVX2::double_v a, AVX2::double_v b) { return AVX::cmpgt_pd(a.data(), b.data()); }
operator >(AVX2::float_v a,AVX2::float_v b)50 Vc_INTRINSIC AVX2:: float_m operator> (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmpgt_ps(a.data(), b.data()); }
operator <(AVX2::double_v a,AVX2::double_v b)51 Vc_INTRINSIC AVX2::double_m operator< (AVX2::double_v a, AVX2::double_v b) { return AVX::cmplt_pd(a.data(), b.data()); }
operator <(AVX2::float_v a,AVX2::float_v b)52 Vc_INTRINSIC AVX2:: float_m operator< (AVX2:: float_v a, AVX2:: float_v b) { return AVX::cmplt_ps(a.data(), b.data()); }
53
#ifdef Vc_IMPL_AVX2
// Element-wise comparisons for the AVX2 integer vector types. AVX2 only
// provides equal/greater-than natively, so !=, >= and <= are derived by
// negating (not_) the complementary compare. Unsigned types go through the
// epu* wrappers, which bias the operands to reuse the signed compares.
Vc_INTRINSIC AVX2::   int_m operator==(AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator==(AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpeq_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator==(AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator==(AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpeq_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator!=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator!=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpeq_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator!=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator!=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpeq_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator>=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmplt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator>=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmplt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator>=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmplt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator>=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmplt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator<=(AVX2::   int_v a, AVX2::   int_v b) { return not_(AVX::cmpgt_epi32(a.data(), b.data())); }
Vc_INTRINSIC AVX2::  uint_m operator<=(AVX2::  uint_v a, AVX2::  uint_v b) { return not_(AVX::cmpgt_epu32(a.data(), b.data())); }
Vc_INTRINSIC AVX2:: short_m operator<=(AVX2:: short_v a, AVX2:: short_v b) { return not_(AVX::cmpgt_epi16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::ushort_m operator<=(AVX2::ushort_v a, AVX2::ushort_v b) { return not_(AVX::cmpgt_epu16(a.data(), b.data())); }
Vc_INTRINSIC AVX2::   int_m operator> (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmpgt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator> (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmpgt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator> (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmpgt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator> (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmpgt_epu16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::   int_m operator< (AVX2::   int_v a, AVX2::   int_v b) { return AVX::cmplt_epi32(a.data(), b.data()); }
Vc_INTRINSIC AVX2::  uint_m operator< (AVX2::  uint_v a, AVX2::  uint_v b) { return AVX::cmplt_epu32(a.data(), b.data()); }
Vc_INTRINSIC AVX2:: short_m operator< (AVX2:: short_v a, AVX2:: short_v b) { return AVX::cmplt_epi16(a.data(), b.data()); }
Vc_INTRINSIC AVX2::ushort_m operator< (AVX2::ushort_v a, AVX2::ushort_v b) { return AVX::cmplt_epu16(a.data(), b.data()); }
#endif  // Vc_IMPL_AVX2
80
81 // bitwise operators {{{1
82 template <typename T>
operator ^(AVX2::Vector<T> a,AVX2::Vector<T> b)83 Vc_INTRINSIC AVX2::Vector<T> operator^(AVX2::Vector<T> a, AVX2::Vector<T> b)
84 {
85 return xor_(a.data(), b.data());
86 }
87 template <typename T>
operator &(AVX2::Vector<T> a,AVX2::Vector<T> b)88 Vc_INTRINSIC AVX2::Vector<T> operator&(AVX2::Vector<T> a, AVX2::Vector<T> b)
89 {
90 return and_(a.data(), b.data());
91 }
92 template <typename T>
operator |(AVX2::Vector<T> a,AVX2::Vector<T> b)93 Vc_INTRINSIC AVX2::Vector<T> operator|(AVX2::Vector<T> a, AVX2::Vector<T> b)
94 {
95 return or_(a.data(), b.data());
96 }
97 // }}}1
98 // arithmetic operators {{{1
99 template <typename T>
operator +(AVX2::Vector<T> a,AVX2::Vector<T> b)100 Vc_INTRINSIC AVX2::Vector<T> operator+(AVX2::Vector<T> a, AVX2::Vector<T> b)
101 {
102 return add(a.data(), b.data(), T());
103 }
104 template <typename T>
operator -(AVX2::Vector<T> a,AVX2::Vector<T> b)105 Vc_INTRINSIC AVX2::Vector<T> operator-(AVX2::Vector<T> a, AVX2::Vector<T> b)
106 {
107 return sub(a.data(), b.data(), T());
108 }
109 template <typename T>
operator *(AVX2::Vector<T> a,AVX2::Vector<T> b)110 Vc_INTRINSIC AVX2::Vector<T> operator*(AVX2::Vector<T> a, AVX2::Vector<T> b)
111 {
112 return mul(a.data(), b.data(), T());
113 }
114 template <typename T>
operator /(AVX2::Vector<T> a,AVX2::Vector<T> b)115 Vc_INTRINSIC AVX2::Vector<T> operator/(AVX2::Vector<T> a, AVX2::Vector<T> b)
116 {
117 return div(a.data(), b.data(), T());
118 }
operator /(AVX2::Vector<ushort> a,AVX2::Vector<ushort> b)119 Vc_INTRINSIC AVX2::Vector<ushort> operator/(AVX2::Vector<ushort> a,
120 AVX2::Vector<ushort> b)
121 {
122 using namespace AVX;
123 const __m256 lo = _mm256_div_ps(convert<ushort, float>(lo128(a.data())),
124 convert<ushort, float>(lo128(b.data())));
125 const __m256 hi = _mm256_div_ps(convert<ushort, float>(hi128(a.data())),
126 convert<ushort, float>(hi128(b.data())));
127 const float_v threshold = 32767.f;
128 using Detail::operator>;
129 const __m128i loShort = (Vc_IS_UNLIKELY((float_v(lo) > threshold).isNotEmpty()))
130 ? convert<float, ushort>(lo)
131 : convert<float, short>(lo);
132 const __m128i hiShort = (Vc_IS_UNLIKELY((float_v(hi) > threshold).isNotEmpty()))
133 ? convert<float, ushort>(hi)
134 : convert<float, short>(hi);
135 return concat(loShort, hiShort);
136 }
137 template <typename T>
operator %(AVX2::Vector<T> a,AVX2::Vector<T> b)138 Vc_INTRINSIC enable_if<std::is_integral<T>::value, AVX2::Vector<T>> operator%(
139 AVX2::Vector<T> a, AVX2::Vector<T> b)
140 {
141 return a - a / b * b;
142 }
143 // }}}1
144 } // namespace Detail
145 ///////////////////////////////////////////////////////////////////////////////////////////
146 // generate {{{1
generate(G gen)147 template <> template <typename G> Vc_INTRINSIC AVX2::double_v AVX2::double_v::generate(G gen)
148 {
149 const auto tmp0 = gen(0);
150 const auto tmp1 = gen(1);
151 const auto tmp2 = gen(2);
152 const auto tmp3 = gen(3);
153 return _mm256_setr_pd(tmp0, tmp1, tmp2, tmp3);
154 }
generate(G gen)155 template <> template <typename G> Vc_INTRINSIC AVX2::float_v AVX2::float_v::generate(G gen)
156 {
157 const auto tmp0 = gen(0);
158 const auto tmp1 = gen(1);
159 const auto tmp2 = gen(2);
160 const auto tmp3 = gen(3);
161 const auto tmp4 = gen(4);
162 const auto tmp5 = gen(5);
163 const auto tmp6 = gen(6);
164 const auto tmp7 = gen(7);
165 return _mm256_setr_ps(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
166 }
#ifdef Vc_IMPL_AVX2
// generate() for the AVX2 integer vectors; same pattern as the floating-point
// specializations above: evaluate gen(i) in index order, then pack with setr.
template <> template <typename G> Vc_INTRINSIC AVX2::int_v AVX2::int_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    return _mm256_setr_epi32(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
}
template <> template <typename G> Vc_INTRINSIC AVX2::short_v AVX2::short_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    const auto tmp8 = gen(8);
    const auto tmp9 = gen(9);
    const auto tmp10 = gen(10);
    const auto tmp11 = gen(11);
    const auto tmp12 = gen(12);
    const auto tmp13 = gen(13);
    const auto tmp14 = gen(14);
    const auto tmp15 = gen(15);
    return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
template <> template <typename G> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::generate(G gen)
{
    const auto tmp0 = gen(0);
    const auto tmp1 = gen(1);
    const auto tmp2 = gen(2);
    const auto tmp3 = gen(3);
    const auto tmp4 = gen(4);
    const auto tmp5 = gen(5);
    const auto tmp6 = gen(6);
    const auto tmp7 = gen(7);
    const auto tmp8 = gen(8);
    const auto tmp9 = gen(9);
    const auto tmp10 = gen(10);
    const auto tmp11 = gen(11);
    const auto tmp12 = gen(12);
    const auto tmp13 = gen(13);
    const auto tmp14 = gen(14);
    const auto tmp15 = gen(15);
    return _mm256_setr_epi16(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
}
#endif
233
234 // constants {{{1
Vector(VectorSpecialInitializerZero)235 template <typename T> Vc_INTRINSIC Vector<T, VectorAbi::Avx>::Vector(VectorSpecialInitializerZero) : d{} {}
236
Vector(VectorSpecialInitializerOne)237 template <> Vc_INTRINSIC Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_pd()) {}
Vector(VectorSpecialInitializerOne)238 template <> Vc_INTRINSIC Vector< float, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_ps()) {}
239 #ifdef Vc_IMPL_AVX2
Vector(VectorSpecialInitializerOne)240 template <> Vc_INTRINSIC Vector< int, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi32()) {}
Vector(VectorSpecialInitializerOne)241 template <> Vc_INTRINSIC Vector< uint, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu32()) {}
Vector(VectorSpecialInitializerOne)242 template <> Vc_INTRINSIC Vector< short, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi16()) {}
Vector(VectorSpecialInitializerOne)243 template <> Vc_INTRINSIC Vector<ushort, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu16()) {}
Vector(VectorSpecialInitializerOne)244 template <> Vc_INTRINSIC Vector< schar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epi8()) {}
Vector(VectorSpecialInitializerOne)245 template <> Vc_INTRINSIC Vector< uchar, VectorAbi::Avx>::Vector(VectorSpecialInitializerOne) : d(AVX::setone_epu8()) {}
246 #endif
247
248 template <typename T>
Vector(VectorSpecialInitializerIndexesFromZero)249 Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx>::Vector(
250 VectorSpecialInitializerIndexesFromZero)
251 : Vector(AVX::IndexesFromZeroData<T>::address(), Vc::Aligned)
252 {
253 }
254
255 template <>
Vector(VectorSpecialInitializerIndexesFromZero)256 Vc_ALWAYS_INLINE Vector<float, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
257 : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
258 {
259 }
260 template <>
Vector(VectorSpecialInitializerIndexesFromZero)261 Vc_ALWAYS_INLINE Vector<double, VectorAbi::Avx>::Vector(VectorSpecialInitializerIndexesFromZero)
262 : Vector(AVX::IndexesFromZeroData<int>::address(), Vc::Aligned)
263 {
264 }
265
266 ///////////////////////////////////////////////////////////////////////////////////////////
267 // load member functions {{{1
268 // general load, implemented via LoadHelper {{{2
269 template <typename DstT>
270 template <typename SrcT, typename Flags>
271 Vc_INTRINSIC typename Vector<DstT, VectorAbi::Avx>::
272 #ifndef Vc_MSVC
273 template
274 #endif
load(const SrcT * mem,Flags flags)275 load_concept<SrcT, Flags>::type Vector<DstT, VectorAbi::Avx>::load(const SrcT *mem, Flags flags)
276 {
277 Common::handleLoadPrefetches(mem, flags);
278 d.v() = Detail::load<VectorType, DstT>(mem, flags);
279 }
280
281 ///////////////////////////////////////////////////////////////////////////////////////////
282 // zeroing {{{1
setZero()283 template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero()
284 {
285 data() = Detail::zero<VectorType>();
286 }
setZero(const Mask & k)287 template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZero(const Mask &k)
288 {
289 data() = Detail::andnot_(k.data(), data());
290 }
setZeroInverted(const Mask & k)291 template<typename T> Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::setZeroInverted(const Mask &k)
292 {
293 data() = Detail::and_(k.data(), data());
294 }
295
setQnan()296 template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan()
297 {
298 data() = Detail::allone<VectorType>();
299 }
setQnan(MaskArgument k)300 template<> Vc_INTRINSIC void Vector<double, VectorAbi::Avx>::setQnan(MaskArgument k)
301 {
302 data() = _mm256_or_pd(data(), k.dataD());
303 }
setQnan()304 template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan()
305 {
306 data() = Detail::allone<VectorType>();
307 }
setQnan(MaskArgument k)308 template<> Vc_INTRINSIC void Vector<float, VectorAbi::Avx>::setQnan(MaskArgument k)
309 {
310 data() = _mm256_or_ps(data(), k.dataF());
311 }
312
313 ///////////////////////////////////////////////////////////////////////////////////////////
314 // stores {{{1
315 template <typename T>
316 template <typename U,
317 typename Flags,
318 typename>
store(U * mem,Flags flags) const319 Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Flags flags) const
320 {
321 Common::handleStorePrefetches(mem, flags);
322 HV::template store<Flags>(mem, data());
323 }
324
325 template <typename T>
326 template <typename U,
327 typename Flags,
328 typename>
store(U * mem,Mask mask,Flags flags) const329 Vc_INTRINSIC void Vector<T, VectorAbi::Avx>::store(U *mem, Mask mask, Flags flags) const
330 {
331 Common::handleStorePrefetches(mem, flags);
332 HV::template store<Flags>(mem, data(), mask.data());
333 }
334
///////////////////////////////////////////////////////////////////////////////////////////
// integer ops {{{1
#ifdef Vc_IMPL_AVX2
// Per-element variable shifts. 32-bit types map onto the AVX2 variable-shift
// intrinsics (arithmetic right shift for signed int, logical for uint).
// 16-bit types have no variable-shift instruction, so they fall back to a
// scalar generate() loop.
template <> Vc_ALWAYS_INLINE AVX2::Vector<   int> Vector<   int, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<  uint> Vector<  uint, VectorAbi::Avx>::operator<<(AsArg x) const { return _mm256_sllv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<   int> Vector<   int, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srav_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<  uint> Vector<  uint, VectorAbi::Avx>::operator>>(AsArg x) const { return _mm256_srlv_epi32(d.v(), x.d.v()); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator<<(AsArg x) const { return generate([&](int i) { return get(*this, i) << get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector< short> Vector< short, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
template <> Vc_ALWAYS_INLINE AVX2::Vector<ushort> Vector<ushort, VectorAbi::Avx>::operator>>(AsArg x) const { return generate([&](int i) { return get(*this, i) >> get(x, i); }); }
// Compound-assignment forms, expressed via the operators above.
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    return *this = *this << x;
}
template <typename T>
Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(AsArg x)
{
    static_assert(std::is_integral<T>::value,
                  "bitwise-operators can only be used with Vectors of integral type");
    return *this = *this >> x;
}
#endif
361
operator >>=(int shift)362 template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator>>=(int shift) {
363 d.v() = Detail::shiftRight(d.v(), shift, T());
364 return *static_cast<AVX2::Vector<T> *>(this);
365 }
operator >>(int shift) const366 template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator>>(int shift) const {
367 return Detail::shiftRight(d.v(), shift, T());
368 }
operator <<=(int shift)369 template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> &Vector<T, VectorAbi::Avx>::operator<<=(int shift) {
370 d.v() = Detail::shiftLeft(d.v(), shift, T());
371 return *static_cast<AVX2::Vector<T> *>(this);
372 }
operator <<(int shift) const373 template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator<<(int shift) const {
374 return Detail::shiftLeft(d.v(), shift, T());
375 }
376
377 // isnegative {{{1
isnegative(AVX2::float_v x)378 Vc_INTRINSIC Vc_CONST AVX2::float_m isnegative(AVX2::float_v x)
379 {
380 return AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
381 AVX::avx_cast<__m256i>(_mm256_and_ps(AVX::setsignmask_ps(), x.data()))));
382 }
isnegative(AVX2::double_v x)383 Vc_INTRINSIC Vc_CONST AVX2::double_m isnegative(AVX2::double_v x)
384 {
385 return Mem::permute<X1, X1, X3, X3>(AVX::avx_cast<__m256>(AVX::srai_epi32<31>(
386 AVX::avx_cast<__m256i>(_mm256_and_pd(AVX::setsignmask_pd(), x.data())))));
387 }
388 // gathers {{{1
389 #define Vc_GATHER_IMPL(V_) \
390 template <> \
391 template <class MT, class IT, int Scale> \
392 inline void AVX2::V_::gatherImplementation( \
393 const Common::GatherArguments<MT, IT, Scale> &args)
394 #define Vc_M(i_) static_cast<value_type>(args.address[Scale * args.indexes[i_]])
Vc_GATHER_IMPL(double_v)395 Vc_GATHER_IMPL(double_v) { d.v() = _mm256_setr_pd(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3)); }
396
Vc_GATHER_IMPL(float_v)397 Vc_GATHER_IMPL(float_v)
398 {
399 d.v() = _mm256_setr_ps(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5), Vc_M(6),
400 Vc_M(7));
401 }
402
403 #ifdef Vc_IMPL_AVX2
Vc_GATHER_IMPL(int_v)404 Vc_GATHER_IMPL(int_v)
405 {
406 d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
407 Vc_M(6), Vc_M(7));
408 }
409
Vc_GATHER_IMPL(uint_v)410 Vc_GATHER_IMPL(uint_v)
411 {
412 d.v() = _mm256_setr_epi32(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
413 Vc_M(6), Vc_M(7));
414 }
415
Vc_GATHER_IMPL(short_v)416 Vc_GATHER_IMPL(short_v)
417 {
418 d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
419 Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
420 Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
421 }
422
Vc_GATHER_IMPL(ushort_v)423 Vc_GATHER_IMPL(ushort_v)
424 {
425 d.v() = _mm256_setr_epi16(Vc_M(0), Vc_M(1), Vc_M(2), Vc_M(3), Vc_M(4), Vc_M(5),
426 Vc_M(6), Vc_M(7), Vc_M(8), Vc_M(9), Vc_M(10), Vc_M(11),
427 Vc_M(12), Vc_M(13), Vc_M(14), Vc_M(15));
428 }
429 #endif
430 #undef Vc_M
431 #undef Vc_GATHER_IMPL
432
433 template <class T>
434 template <class MT, class IT, int Scale>
gatherImplementation(const Common::GatherArguments<MT,IT,Scale> & args,MaskArgument mask)435 inline void Vector<T, VectorAbi::Avx>::gatherImplementation(
436 const Common::GatherArguments<MT, IT, Scale> &args, MaskArgument mask)
437 {
438 const auto *mem = args.address;
439 const auto indexes = Scale * args.indexes;
440 using Selector = std::integral_constant < Common::GatherScatterImplementation,
441 #ifdef Vc_USE_SET_GATHERS
442 Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
443 #endif
444 #ifdef Vc_USE_BSF_GATHERS
445 Common::GatherScatterImplementation::BitScanLoop
446 #elif defined Vc_USE_POPCNT_BSF_GATHERS
447 Common::GatherScatterImplementation::PopcntSwitch
448 #else
449 Common::GatherScatterImplementation::SimpleLoop
450 #endif
451 > ;
452 Common::executeGather(Selector(), *this, mem, indexes, mask);
453 }
454
455 template <typename T>
456 template <typename MT, typename IT>
scatterImplementation(MT * mem,IT && indexes) const457 inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes) const
458 {
459 Common::unrolled_loop<std::size_t, 0, Size>([&](std::size_t i) { mem[indexes[i]] = d.m(i); });
460 }
461
462 template <typename T>
463 template <typename MT, typename IT>
scatterImplementation(MT * mem,IT && indexes,MaskArgument mask) const464 inline void Vector<T, VectorAbi::Avx>::scatterImplementation(MT *mem, IT &&indexes, MaskArgument mask) const
465 {
466 using Selector = std::integral_constant < Common::GatherScatterImplementation,
467 #ifdef Vc_USE_SET_GATHERS
468 Traits::is_simd_vector<IT>::value ? Common::GatherScatterImplementation::SetIndexZero :
469 #endif
470 #ifdef Vc_USE_BSF_GATHERS
471 Common::GatherScatterImplementation::BitScanLoop
472 #elif defined Vc_USE_POPCNT_BSF_GATHERS
473 Common::GatherScatterImplementation::PopcntSwitch
474 #else
475 Common::GatherScatterImplementation::SimpleLoop
476 #endif
477 > ;
478 Common::executeScatter(Selector(), *this, mem, std::forward<IT>(indexes), mask);
479 }
480
481 ///////////////////////////////////////////////////////////////////////////////////////////
482 // operator- {{{1
483 #ifdef Vc_USE_BUILTIN_VECTOR_TYPES
operator -() const484 template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
485 {
486 return VectorType(-d.builtin());
487 }
488 #else
operator -() const489 template<typename T> Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::operator-() const
490 {
491 return Detail::negate(d.v(), std::integral_constant<std::size_t, sizeof(T)>());
492 }
493 #endif
494
495 ///////////////////////////////////////////////////////////////////////////////////////////
496 // horizontal ops {{{1
497 template <typename T>
498 Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
minIndex() const499 Vector<T, VectorAbi::Avx>::minIndex() const
500 {
501 AVX2::Vector<T> x = min();
502 return std::make_pair(x, (*this == x).firstOne());
503 }
504 template <typename T>
505 Vc_INTRINSIC std::pair<Vector<T, VectorAbi::Avx>, int>
maxIndex() const506 Vector<T, VectorAbi::Avx>::maxIndex() const
507 {
508 AVX2::Vector<T> x = max();
509 return std::make_pair(x, (*this == x).firstOne());
510 }
minIndex() const511 template <> Vc_INTRINSIC std::pair<AVX2::float_v, int> AVX2::float_v::minIndex() const
512 {
513 /*
514 // 28 cycles latency:
515 __m256 x = _mm256_min_ps(Mem::permute128<X1, X0>(d.v()), d.v());
516 x = _mm256_min_ps(x, Reg::permute<X2, X3, X0, X1>(x));
517 AVX2::float_v xx = _mm256_min_ps(x, Reg::permute<X1, X0, X3, X2>(x));
518 AVX2::uint_v idx = AVX2::uint_v::IndexesFromZero();
519 idx = _mm256_castps_si256(
520 _mm256_or_ps((*this != xx).data(), _mm256_castsi256_ps(idx.data())));
521 return std::make_pair(xx, (*this == xx).firstOne());
522
523 __m128 loData = AVX::lo128(d.v());
524 __m128 hiData = AVX::hi128(d.v());
525 const __m128 less2 = _mm_cmplt_ps(hiData, loData);
526 loData = _mm_min_ps(loData, hiData);
527 hiData = Mem::permute<X2, X3, X0, X1>(loData);
528 const __m128 less1 = _mm_cmplt_ps(hiData, loData);
529 loData = _mm_min_ps(loData, hiData);
530 hiData = Mem::permute<X1, X0, X3, X2>(loData);
531 const __m128 less0 = _mm_cmplt_ps(hiData, loData);
532 unsigned bits = _mm_movemask_ps(less0) & 0x1;
533 bits |= ((_mm_movemask_ps(less1) << 1) - bits) & 0x2;
534 bits |= ((_mm_movemask_ps(less2) << 3) - bits) & 0x4;
535 loData = _mm_min_ps(loData, hiData);
536 return std::make_pair(AVX::concat(loData, loData), bits);
537 */
538
539 // 28 cycles Latency:
540 __m256 x = d.v();
541 __m256 idx = Vector<float>::IndexesFromZero().data();
542 __m256 y = Mem::permute128<X1, X0>(x);
543 __m256 idy = Mem::permute128<X1, X0>(idx);
544 __m256 less = AVX::cmplt_ps(x, y);
545
546 x = _mm256_blendv_ps(y, x, less);
547 idx = _mm256_blendv_ps(idy, idx, less);
548 y = Reg::permute<X2, X3, X0, X1>(x);
549 idy = Reg::permute<X2, X3, X0, X1>(idx);
550 less = AVX::cmplt_ps(x, y);
551
552 x = _mm256_blendv_ps(y, x, less);
553 idx = _mm256_blendv_ps(idy, idx, less);
554 y = Reg::permute<X1, X0, X3, X2>(x);
555 idy = Reg::permute<X1, X0, X3, X2>(idx);
556 less = AVX::cmplt_ps(x, y);
557
558 idx = _mm256_blendv_ps(idy, idx, less);
559
560 const auto index = _mm_cvtsi128_si32(AVX::avx_cast<__m128i>(idx));
561 #ifdef Vc_GNU_ASM
562 __asm__ __volatile__(""); // help GCC to order the instructions better
563 #endif
564 x = _mm256_blendv_ps(y, x, less);
565 return std::make_pair(x, index);
566 }
partialSum() const567 template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum() const
568 {
569 // a b c d e f g h
570 // + a b c d e f g -> a ab bc cd de ef fg gh
571 // + a ab bc cd de ef -> a ab abc abcd bcde cdef defg efgh
572 // + a ab abc abcd -> a ab abc abcd abcde abcdef abcdefg abcdefgh
573 AVX2::Vector<T> tmp = *this;
574 if (Size > 1) tmp += tmp.shifted(-1);
575 if (Size > 2) tmp += tmp.shifted(-2);
576 if (Size > 4) tmp += tmp.shifted(-4);
577 if (Size > 8) tmp += tmp.shifted(-8);
578 if (Size > 16) tmp += tmp.shifted(-16);
579 return tmp;
580 }
581
/* This function requires correct masking because the neutral element of \p op is not necessarily 0
 *
template<typename T> template<typename BinaryOperation> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::partialSum(BinaryOperation op) const
{
    //   a    b    c    d    e    f    g    h
    // + a    b    c    d    e    f    g        -> a ab bc  cd   de   ef     fg      gh
    // + a    ab   bc   cd   de   ef            -> a ab abc abcd bcde cdef   defg    efgh
    // + a    ab   abc  abcd                    -> a ab abc abcd abcde abcdef abcdefg abcdefgh
    AVX2::Vector<T> tmp = *this;
    Mask mask(true);
    if (Size >  1) tmp(mask) = op(tmp, tmp.shifted(-1));
    if (Size >  2) tmp(mask) = op(tmp, tmp.shifted(-2));
    if (Size >  4) tmp(mask) = op(tmp, tmp.shifted(-4));
    if (Size >  8) tmp(mask) = op(tmp, tmp.shifted(-8));
    if (Size > 16) tmp(mask) = op(tmp, tmp.shifted(-16));
    return tmp;
}
*/
600
min(MaskArgument m) const601 template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::min(MaskArgument m) const
602 {
603 AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::max();
604 tmp(m) = *this;
605 return tmp.min();
606 }
max(MaskArgument m) const607 template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::max(MaskArgument m) const
608 {
609 AVX2::Vector<T> tmp = std::numeric_limits<AVX2::Vector<T> >::min();
610 tmp(m) = *this;
611 return tmp.max();
612 }
product(MaskArgument m) const613 template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::product(MaskArgument m) const
614 {
615 AVX2::Vector<T> tmp(Vc::One);
616 tmp(m) = *this;
617 return tmp.product();
618 }
sum(MaskArgument m) const619 template<typename T> Vc_ALWAYS_INLINE typename Vector<T, VectorAbi::Avx>::EntryType Vector<T, VectorAbi::Avx>::sum(MaskArgument m) const
620 {
621 AVX2::Vector<T> tmp(Vc::Zero);
622 tmp(m) = *this;
623 return tmp.sum();
624 }//}}}
625 // exponent {{{1
626 namespace Detail
627 {
// Extracts the unbiased base-2 exponent of each float as a float.
// Works on the two 128-bit halves separately because the 32-bit integer
// shift/sub is done with SSE intrinsics. Assumes non-negative inputs: the
// right shift by 23 would otherwise pull the sign bit into the result.
Vc_INTRINSIC Vc_CONST __m256 exponent(__m256 v)
{
    using namespace AVX;
    __m128i tmp0 = _mm_srli_epi32(avx_cast<__m128i>(v), 23);         // biased exponent, low half
    __m128i tmp1 = _mm_srli_epi32(avx_cast<__m128i>(hi128(v)), 23);  // biased exponent, high half
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x7f));  // remove the IEEE-754 single-precision bias
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x7f));
    return _mm256_cvtepi32_ps(concat(tmp0, tmp1));
}
// Extracts the unbiased base-2 exponent of each double as a double.
// After the 64-bit shift by 52 the upper 32 bits of every lane are zero, so
// the bias can be removed with a 32-bit subtract; the garbage this produces
// in the odd 32-bit positions is discarded by the shuffle below, which packs
// only the even (low-half) elements before the int->double conversion.
Vc_INTRINSIC Vc_CONST __m256d exponent(__m256d v)
{
    using namespace AVX;
    __m128i tmp0 = _mm_srli_epi64(avx_cast<__m128i>(v), 52);         // biased exponent, low half
    __m128i tmp1 = _mm_srli_epi64(avx_cast<__m128i>(hi128(v)), 52);  // biased exponent, high half
    tmp0 = _mm_sub_epi32(tmp0, _mm_set1_epi32(0x3ff));  // remove the IEEE-754 double-precision bias
    tmp1 = _mm_sub_epi32(tmp1, _mm_set1_epi32(0x3ff));
    return _mm256_cvtepi32_pd(avx_cast<__m128i>(Mem::shuffle<X0, X2, Y0, Y2>(avx_cast<__m128>(tmp0), avx_cast<__m128>(tmp1))));
}
646 } // namespace Detail
647
exponent(AVX2::float_v x)648 Vc_INTRINSIC Vc_CONST AVX2::float_v exponent(AVX2::float_v x)
649 {
650 using Detail::operator>=;
651 Vc_ASSERT((x >= x.Zero()).isFull());
652 return Detail::exponent(x.data());
653 }
exponent(AVX2::double_v x)654 Vc_INTRINSIC Vc_CONST AVX2::double_v exponent(AVX2::double_v x)
655 {
656 using Detail::operator>=;
657 Vc_ASSERT((x >= x.Zero()).isFull());
658 return Detail::exponent(x.data());
659 }
660 // }}}1
661 // Random {{{1
// Advances the global RNG state (Common::RandomState) by one step and returns
// 256 bits of the pre-update state. Each 32-bit word is updated with the LCG
// x' = x * 0xdeece66d + 11 (the low 32 bits of the rand48 multiplier), and
// the first half of the state is additionally mixed with the high 16 bits of
// the second half. NOTE(review): not thread-safe — mutates shared global
// state without synchronization.
static Vc_ALWAYS_INLINE __m256i _doRandomStep()
{
    using Detail::operator*;
    using Detail::operator+;
#ifdef Vc_IMPL_AVX2
    using AVX2::uint_v;
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    // Advance the second half of the state with the plain LCG.
    (state1 * uint_v(0xdeece66du) + uint_v(11)).store(&Common::RandomState[uint_v::Size]);
    // Advance the first half, xor-ing in the upper bits of the (old) second
    // half to decorrelate the streams.
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm256_srli_epi32(state1.data(), 16)))
        .store(&Common::RandomState[0]);
    return state0.data();
#else
    // Without AVX2 integer ops, do the same update on four SSE-wide chunks
    // and concatenate two of them for the 256-bit return value.
    using SSE::uint_v;
    uint_v state0(&Common::RandomState[0]);
    uint_v state1(&Common::RandomState[uint_v::Size]);
    uint_v state2(&Common::RandomState[2 * uint_v::Size]);
    uint_v state3(&Common::RandomState[3 * uint_v::Size]);
    (state2 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[2 * uint_v::Size]);
    (state3 * uint_v(0xdeece66du) + uint_v(11))
        .store(&Common::RandomState[3 * uint_v::Size]);
    uint_v(Detail::xor_((state0 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state2.data(), 16)))
        .store(&Common::RandomState[0]);
    uint_v(Detail::xor_((state1 * uint_v(0xdeece66du) + uint_v(11)).data(),
                        _mm_srli_epi32(state3.data(), 16)))
        .store(&Common::RandomState[uint_v::Size]);
    return AVX::concat(state0.data(), state1.data());
#endif
}
694
695 #ifdef Vc_IMPL_AVX2
// Generic Random(): fill the vector with pseudo-random bits from the global
// RNG state (one _doRandomStep advance per call). float_v and double_v have
// dedicated specializations below that map the bits to [0, 1).
template<typename T> Vc_ALWAYS_INLINE AVX2::Vector<T> Vector<T, VectorAbi::Avx>::Random()
{
    return {_doRandomStep()};
}
700 #endif
701
// Random floats in [0, 1): shift the 32 random bits right by 2 and OR in the
// bit pattern of 1.0f. The OR cannot change the exponent field — 1.0f has
// all low exponent bits set (0x7f) and the shift guarantees the remaining
// exponent bit is 0 — so the intermediate lies in [1, 2); subtracting 1.0f
// maps it to [0, 1).
template <> Vc_ALWAYS_INLINE AVX2::float_v AVX2::float_v::Random()
{
    return HT::sub(Detail::or_(_cast(AVX::srli_epi32<2>(_doRandomStep())), HT::one()),
                   HT::one());
}
707
// Random doubles in [0, 1). The 32-bit state words are advanced pairwise as
// four 64-bit rand48-style LCGs (x' = x * 0x5deece66d + 11) to get 52 good
// mantissa bits per lane. NOTE(review): mutates shared global state without
// synchronization, like _doRandomStep.
template<> Vc_ALWAYS_INLINE AVX2::double_v AVX2::double_v::Random()
{
    // Read the pre-update state; it supplies this call's random bits.
    const __m256i state = Detail::load(&Common::RandomState[0], Vc::Aligned,
                                       Detail::LoadTag<__m256i, int>());
    for (size_t k = 0; k < 8; k += 2) {
        // Vc_MAY_ALIAS sanctions viewing two adjacent 32-bit words as one
        // 64-bit integer for the LCG update.
        typedef unsigned long long uint64 Vc_MAY_ALIAS;
        const uint64 stateX = *aliasing_cast<uint64>(&Common::RandomState[k]);
        *aliasing_cast<uint64>(&Common::RandomState[k]) = (stateX * 0x5deece66dull + 11);
    }
    // Keep the top 52 bits as the mantissa, OR in the exponent of 1.0 (0x3ff
    // is all-ones in the low exponent bits, so the OR cannot change it),
    // yielding [1, 2); subtract 1.0 to land in [0, 1).
    return HT::sub(Detail::or_(_cast(AVX::srli_epi64<12>(state)), HT::one()), HT::one());
}
719 // }}}1
720 // shifted / rotated {{{1
// Returns a copy with the entries shifted by \p amount positions; the actual
// shuffling (and the fill value for vacated entries) is implemented in
// Detail::shifted.
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount) const
{
    return Detail::shifted<EntryType>(d.v(), amount);
}
725
// Fast path for shifting by exactly half the vector width (128-bit
// vectors): concatenate the upper half of \p left with the lower half of
// \p right in one shuffle.
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m128>)
{
    return Mem::shuffle<X2, X3, Y0, Y1>(left, right);
}
// Fast path for shifting by exactly half the vector width (256-bit
// vectors): combine the upper 128-bit lane of \p left with the lower
// 128-bit lane of \p right.
template <typename VectorType>
Vc_INTRINSIC Vc_CONST VectorType shifted_shortcut(VectorType left, VectorType right, Common::WidthT<__m256>)
{
    return Mem::shuffle128<X1, Y0>(left, right);
}
736
// Shift by \p amount entries, filling the vacated entries from \p shiftIn
// instead of zeros.
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::shifted(int amount, Vector shiftIn) const
{
#ifdef __GNUC__
    // When the compiler can prove \p amount constant, replace the generic
    // two-shift fallback below with a single shuffle / palignr sequence.
    if (__builtin_constant_p(amount)) {
        const __m256i a = AVX::avx_cast<__m256i>(d.v());
        const __m256i b = AVX::avx_cast<__m256i>(shiftIn.d.v());
        // Shifts by +/- half the vector width reduce to one 128-bit-lane
        // recombination.
        if (amount * 2 == int(Size)) {
            return shifted_shortcut(d.v(), shiftIn.d.v(), WidthT());
        }
        if (amount * 2 == -int(Size)) {
            return shifted_shortcut(shiftIn.d.v(), d.v(), WidthT());
        }
        switch (amount) {
        case 1:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                // permute2x128(a, b, 0x21) forms [hi(a) lo(b)]; aligning it
                // against a shifts one entry across the lane boundary.
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   sizeof(EntryType))
#else // Vc_IMPL_AVX2
                // AVX1 has no 256-bit palignr: do each 128-bit half with SSE.
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), sizeof(EntryType)))
#endif // Vc_IMPL_AVX2
                );
        case 2:
            return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                   2 * sizeof(EntryType))
#else // Vc_IMPL_AVX2
                AVX::concat(
                    _mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a), 2 * sizeof(EntryType)),
                    _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a), 2 * sizeof(EntryType)))
#endif // Vc_IMPL_AVX2
                );
        case 3:
            // A 3-entry byte-align only stays inside the lane if the vector
            // has more than 6 entries; otherwise fall through to the generic
            // path.
            if (6u < Size) {
                return AVX::avx_cast<VectorType>(
#ifdef Vc_IMPL_AVX2
                    _mm256_alignr_epi8(_mm256_permute2x128_si256(a, b, 0x21), a,
                                       3 * sizeof(EntryType))
#else // Vc_IMPL_AVX2
                    AVX::concat(_mm_alignr_epi8(AVX::hi128(a), AVX::lo128(a),
                                                3 * sizeof(EntryType)),
                                _mm_alignr_epi8(AVX::lo128(b), AVX::hi128(a),
                                                3 * sizeof(EntryType)))
#endif // Vc_IMPL_AVX2
                    );
                // TODO: } else {
            }
        }
    }
#endif
    // Generic fallback: the zero-filled shift of *this OR-ed with the
    // complementary shift of shiftIn (the two shifts never have overlapping
    // non-zero entries).
    using Detail::operator|;
    return shifted(amount) | (amount > 0 ?
                              shiftIn.shifted(amount - Size) :
                              shiftIn.shifted(Size + amount));
}
// Returns a copy with the entries rotated by \p amount positions; the
// shuffle logic lives in Detail::rotated.
template<typename T> Vc_INTRINSIC AVX2::Vector<T> Vector<T, VectorAbi::Avx>::rotated(int amount) const
{
    return Detail::rotated<EntryType, size()>(d.v(), amount);
}
799 // sorted {{{1
// Returns a sorted copy of the vector; the sorting network is implemented in
// Detail::sorted.
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::sorted()
    const
{
    return Detail::sorted(*this);
}
806 // interleaveLow/-High {{{1
interleaveLow(AVX2::double_v x) const807 template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveLow(AVX2::double_v x) const
808 {
809 return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_pd(data(), x.data()),
810 _mm256_unpackhi_pd(data(), x.data()));
811 }
interleaveHigh(AVX2::double_v x) const812 template <> Vc_INTRINSIC AVX2::double_v AVX2::double_v::interleaveHigh(AVX2::double_v x) const
813 {
814 return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_pd(data(), x.data()),
815 _mm256_unpackhi_pd(data(), x.data()));
816 }
interleaveLow(AVX2::float_v x) const817 template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveLow(AVX2::float_v x) const
818 {
819 return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_ps(data(), x.data()),
820 _mm256_unpackhi_ps(data(), x.data()));
821 }
interleaveHigh(AVX2::float_v x) const822 template <> Vc_INTRINSIC AVX2::float_v AVX2::float_v::interleaveHigh(AVX2::float_v x) const
823 {
824 return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_ps(data(), x.data()),
825 _mm256_unpackhi_ps(data(), x.data()));
826 }
827 #ifdef Vc_IMPL_AVX2
interleaveLow(AVX2::int_v x) const828 template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveLow ( AVX2::int_v x) const {
829 return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
830 _mm256_unpackhi_epi32(data(), x.data()));
831 }
interleaveHigh(AVX2::int_v x) const832 template <> Vc_INTRINSIC AVX2::int_v AVX2::int_v::interleaveHigh( AVX2::int_v x) const {
833 return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
834 _mm256_unpackhi_epi32(data(), x.data()));
835 }
interleaveLow(AVX2::uint_v x) const836 template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveLow ( AVX2::uint_v x) const {
837 return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi32(data(), x.data()),
838 _mm256_unpackhi_epi32(data(), x.data()));
839 }
interleaveHigh(AVX2::uint_v x) const840 template <> Vc_INTRINSIC AVX2::uint_v AVX2::uint_v::interleaveHigh( AVX2::uint_v x) const {
841 return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi32(data(), x.data()),
842 _mm256_unpackhi_epi32(data(), x.data()));
843 }
interleaveLow(AVX2::short_v x) const844 template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveLow ( AVX2::short_v x) const {
845 return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
846 _mm256_unpackhi_epi16(data(), x.data()));
847 }
interleaveHigh(AVX2::short_v x) const848 template <> Vc_INTRINSIC AVX2::short_v AVX2::short_v::interleaveHigh( AVX2::short_v x) const {
849 return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
850 _mm256_unpackhi_epi16(data(), x.data()));
851 }
interleaveLow(AVX2::ushort_v x) const852 template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveLow (AVX2::ushort_v x) const {
853 return Mem::shuffle128<X0, Y0>(_mm256_unpacklo_epi16(data(), x.data()),
854 _mm256_unpackhi_epi16(data(), x.data()));
855 }
interleaveHigh(AVX2::ushort_v x) const856 template <> Vc_INTRINSIC AVX2::ushort_v AVX2::ushort_v::interleaveHigh(AVX2::ushort_v x) const {
857 return Mem::shuffle128<X1, Y1>(_mm256_unpacklo_epi16(data(), x.data()),
858 _mm256_unpackhi_epi16(data(), x.data()));
859 }
860 #endif
861 // permutation via operator[] {{{1
// Full reversal of the 4 doubles: reverse the pair within each 128-bit lane,
// then swap the two lanes.
template <> Vc_INTRINSIC Vc_PURE AVX2::double_v AVX2::double_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X1, X0, X3, X2>(d.v()));
}
// Full reversal of the 8 floats: reverse the 4 entries within each 128-bit
// lane, then swap the two lanes.
template <> Vc_INTRINSIC Vc_PURE AVX2::float_v AVX2::float_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
870 #ifdef Vc_IMPL_AVX2
// Full reversal of the 8 ints: reverse within each 128-bit lane, then swap
// the lanes.
template <>
Vc_INTRINSIC Vc_PURE AVX2::int_v AVX2::int_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
// Full reversal of the 8 unsigned ints: reverse within each 128-bit lane,
// then swap the lanes.
template <>
Vc_INTRINSIC Vc_PURE AVX2::uint_v AVX2::uint_v::operator[](Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(Mem::permute<X3, X2, X1, X0>(d.v()));
}
// Full reversal of the 16 shorts. There is no single 16-bit cross-vector
// permute: permuteHi/permuteLo reverse the 16-bit entries inside each 64-bit
// half, the 64-bit shuffle swaps those halves, and the final permute128
// swaps the two 128-bit lanes.
template <>
Vc_INTRINSIC Vc_PURE AVX2::short_v AVX2::short_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
// Full reversal of the 16 unsigned shorts; same technique as the short_v
// specialization: reverse inside the 64-bit halves, swap the halves, then
// swap the 128-bit lanes.
template <>
Vc_INTRINSIC Vc_PURE AVX2::ushort_v AVX2::ushort_v::operator[](
    Permutation::ReversedTag) const
{
    return Mem::permute128<X1, X0>(AVX::avx_cast<__m256i>(Mem::shuffle<X1, Y0, X3, Y2>(
        AVX::avx_cast<__m256d>(Mem::permuteHi<X7, X6, X5, X4>(d.v())),
        AVX::avx_cast<__m256d>(Mem::permuteLo<X3, X2, X1, X0>(d.v())))));
}
897 #endif
// Arbitrary runtime permutation — NOT implemented: currently returns the
// vector unchanged (the perm argument is ignored). The commented-out sketch
// below shows the intended AVX1 approach: permutevar within each lane, then
// a second permutevar on the lane-swapped input, blended in wherever an
// index crosses the 128-bit lane boundary.
template <> Vc_INTRINSIC AVX2::float_v Vector<float, VectorAbi::Avx>::operator[](const IndexType &/*perm*/) const
{
    // TODO
    return *this;
#ifdef Vc_IMPL_AVX2
#else
    /*
    const int_m cross128 = AVX::concat(_mm_cmpgt_epi32(AVX::lo128(perm.data()), _mm_set1_epi32(3)),
                                       _mm_cmplt_epi32(AVX::hi128(perm.data()), _mm_set1_epi32(4)));
    if (cross128.isNotEmpty()) {
        AVX2::float_v x = _mm256_permutevar_ps(d.v(), perm.data());
        x(cross128) = _mm256_permutevar_ps(Mem::permute128<X1, X0>(d.v()), perm.data());
        return x;
    } else {
    */
#endif
}
915
916 // reversed {{{1
// Returns a copy with the entry order reversed; delegates to the
// Permutation::ReversedTag subscript specializations.
template <typename T>
Vc_INTRINSIC Vc_PURE Vector<T, VectorAbi::Avx> Vector<T, VectorAbi::Avx>::reversed() const
{
    return (*this)[Permutation::Reversed];
}
922
923 // broadcast from constexpr index {{{1
// Broadcasts entry Index (0..7) to all 8 positions: bit 2 of Index selects
// the 128-bit lane, bits 0-1 the position within that lane.
template <> template <int Index> Vc_INTRINSIC AVX2::float_v AVX2::float_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x3);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x4) / 4);
    return Mem::permute<Inner, Inner, Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
// Broadcasts entry Index (0..3) to all 4 positions: bit 1 of Index selects
// the 128-bit lane, bit 0 the position within that lane.
template <> template <int Index> Vc_INTRINSIC AVX2::double_v AVX2::double_v::broadcast() const
{
    constexpr VecPos Inner = static_cast<VecPos>(Index & 0x1);
    constexpr VecPos Outer = static_cast<VecPos>((Index & 0x2) / 2);
    return Mem::permute<Inner, Inner>(Mem::permute128<Outer, Outer>(d.v()));
}
936 // }}}1
937 } // namespace Vc
938
939 // vim: foldmethod=marker
940