/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_VECTORHELPER_H_
#define VC_SSE_VECTORHELPER_H_

#include "types.h"
#include "../common/loadstoreflags.h"
#include <limits>
#include "const_data.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
#define Vc_OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
#define Vc_OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
#define Vc_OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
#define Vc_OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }

template<> struct VectorHelper<__m128>
{
    typedef __m128 VectorType;

    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_ps(x); }
    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_ps(x); }
    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }

    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_ps(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_ps(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_ps(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }

    // before AVX there was only one maskstore. load -> blend -> store would break the C++
    // memory model (read/write of memory that is actually not touched by this thread)
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); }

    Vc_OP0(allone, _mm_setallone_ps())
    Vc_OP0(zero, _mm_setzero_ps())
    Vc_OP3(blend, blendv_ps(a, b, c))
};
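// Illustrative sketch (not part of the library): how the Flags-based overloads above are
// meant to be selected. Each flags type from ../common/loadstoreflags.h exposes exactly one
// of the EnableIf* member typedefs, so overload resolution picks the matching intrinsic at
// compile time. The use of decltype(Vc::Aligned)/decltype(Vc::Unaligned) below assumes the
// tag objects the library defines in its public headers:
//
//     alignas(16) float data[4] = {1.f, 2.f, 3.f, 4.f};
//     __m128 v = VectorHelper<__m128>::load<decltype(Vc::Aligned)>(data);   // -> _mm_load_ps
//     __m128 u = VectorHelper<__m128>::load<decltype(Vc::Unaligned)>(data); // -> _mm_loadu_ps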

template<> struct VectorHelper<__m128d>
{
    typedef __m128d VectorType;

    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_pd(x); }
    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_pd(x); }
    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }

    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_pd(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_pd(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_pd(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }

    // before AVX there was only one maskstore. load -> blend -> store would break the C++
    // memory model (read/write of memory that is actually not touched by this thread)
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); }

    Vc_OP0(allone, _mm_setallone_pd())
    Vc_OP0(zero, _mm_setzero_pd())
    Vc_OP3(blend, blendv_pd(a, b, c))
};
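// Explanatory sketch of the masked-store semantics referenced in the comments above:
// _mm_maskmoveu_si128 writes only the bytes whose mask MSB is set, whereas an emulation via
// load -> blend -> store would read and rewrite the full 16 bytes, racing with other threads
// on the unselected lanes. Assuming mask lanes are all-ones or all-zeros:
//
//     double mem[2] = {1.0, 2.0};
//     __m128d x = _mm_set_pd(20.0, 10.0);
//     __m128d m = _mm_castsi128_pd(_mm_set_epi64x(0, -1)); // select lane 0 only
//     VectorHelper<__m128d>::store(mem, x, m);             // mem becomes {10.0, 2.0}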

template<> struct VectorHelper<__m128i>
{
    typedef __m128i VectorType;

    template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_si128(reinterpret_cast<const VectorType *>(x)); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x)); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }

    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_si128(reinterpret_cast<VectorType *>(mem), x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }

    // before AVX there was only one maskstore. load -> blend -> store would break the C++
    // memory model (read/write of memory that is actually not touched by this thread)
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); }

    Vc_OP0(allone, _mm_setallone_si128())
    Vc_OP0(zero, _mm_setzero_si128())
    Vc_OP3(blend, blendv_epi8(a, b, c))
};

#undef Vc_OP1
#undef Vc_OP2
#undef Vc_OP3

#define Vc_OP1(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_ , Vc_SUFFIX)(a, b); }
#define Vc_OP_(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op , Vc_SUFFIX)(a, b); }
#define Vc_OPx(op, op2) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); }
#define Vc_OP_CAST_(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \
            _mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \
                         Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \
        }
#define Vc_MINMAX \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); }
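// How the Vc_SUFFIX machinery works (illustration only): Vc_CAT2 token-pastes the intrinsic
// name prefix with the element-type suffix chosen by each specialization below. With
// Vc_SUFFIX defined as pd, for example,
//
//     Vc_OP(add)
//
// expands to
//
//     static Vc_ALWAYS_INLINE Vc_CONST VectorType add(const VectorType a, const VectorType b)
//     { return _mm_add_pd(a, b); }
//
// so each VectorHelper<T> instantiates the same operation set for its own element type.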

template<> struct VectorHelper<double> {
    typedef __m128d VectorType;
    typedef double EntryType;
#define Vc_SUFFIX pd

    Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }// set(1.); }

#ifdef Vc_IMPL_FMA4
    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
        v1 = _mm_macc_pd(v1, v2, v3);
    }
#else
    static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
        VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
        VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
        // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
        // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
        asm("":"+x"(h1), "+x"(h2));
#endif
        const VectorType l1 = _mm_sub_pd(v1, h1);
        const VectorType l2 = _mm_sub_pd(v2, h2);
        const VectorType ll = mul(l1, l2);
        const VectorType lh = add(mul(l1, h2), mul(h1, l2));
        const VectorType hh = mul(h1, h2);
        // ll < lh < hh for all entries is certain
        const VectorType lh_lt_v3 = _mm_cmplt_pd(abs(lh), abs(v3)); // |lh| < |v3|
        const VectorType b = blendv_pd(v3, lh, lh_lt_v3);
        const VectorType c = blendv_pd(lh, v3, lh_lt_v3);
        v1 = add(add(ll, b), add(c, hh));
    }
#endif

    Vc_OP(add) Vc_OP(sub) Vc_OP(mul)

    Vc_OP1(sqrt)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
        return _mm_div_pd(one(), sqrt(x));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
        return _mm_div_pd(one(), x);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
        return _mm_cmpunord_pd(x, x);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
        return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
        return _mm_castsi128_pd(cmpeq_epi64(_mm_castpd_si128(abs(x)), _mm_castpd_si128(_mm_load_pd(c_log<double>::d(1)))));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
        return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_pd());
    }

    Vc_MINMAX
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
        return _mm_cvtsd_f64(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
        return _mm_cvtsd_f64(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
        return _mm_cvtsd_f64(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
        return _mm_cvtsd_f64(a);
    }
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_round_pd(a, _MM_FROUND_NINT);
#else
        //XXX: slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
#endif
    }
};
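// Note on the double fma fallback above (explanatory, mirrors the code): each factor v is
// split into a high half h (the upper mantissa bits selected by highMaskDouble) and a low
// half l = v - h, so that v1*v2 = h1*h2 + (l1*h2 + h1*l2) + l1*l2, where each partial
// product is exact in double precision because each factor carries only about half the
// mantissa bits. The partial products are then summed from smallest to largest magnitude
// (ll first, then the smaller of lh and v3, then the larger, then hh), which keeps the
// rounding error close to that of a true fused multiply-add.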

template<> struct VectorHelper<float> {
    typedef float EntryType;
    typedef __m128 VectorType;
#define Vc_SUFFIX ps

    Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }// set(1.f); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128 concat(__m128d a, __m128d b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }

#ifdef Vc_IMPL_FMA4
    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
        v1 = _mm_macc_ps(v1, v2, v3);
    }
#else
    static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
        __m128d v1_0 = _mm_cvtps_pd(v1);
        __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
        __m128d v2_0 = _mm_cvtps_pd(v2);
        __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
        __m128d v3_0 = _mm_cvtps_pd(v3);
        __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
        v1 = _mm_movelh_ps(
                _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
                _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
    }
#endif

    Vc_OP(add) Vc_OP(sub) Vc_OP(mul)

    Vc_OP1(sqrt) Vc_OP1(rsqrt)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
        return _mm_cmpunord_ps(x, x);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
        return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
        return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log<float>::d(1)))));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
        return _mm_rcp_ps(x);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
        return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps());
    }

    Vc_MINMAX
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        a = _mm_min_ps(a, _mm_movehl_ps(a, a));   // a = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
        a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = min(a0, a1), a1, a2, a3
        return _mm_cvtss_f32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = _mm_max_ps(a, _mm_movehl_ps(a, a));   // a = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
        a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1))); // a = max(a0, a1), a1, a2, a3
        return _mm_cvtss_f32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
        a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
        return _mm_cvtss_f32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
        a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
        return _mm_cvtss_f32(a);
    }
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_round_ps(a, _MM_FROUND_NINT);
#else
        //XXX slow: _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
#endif
    }
};
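// The horizontal reductions above fold the vector onto itself (explanatory): for
// min(a0, a1, a2, a3), _mm_movehl_ps pairs lane i with lane i+2, then the shuffle pairs
// lane 0 with lane 1, so two min operations reduce four lanes to one:
//
//     a = {min(a0,a2), min(a1,a3), ...}       // after the movehl fold
//     a = {min(min(a0,a2), min(a1,a3)), ...}  // after the shuffle fold; lane 0 is the result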

template<> struct VectorHelper<int> {
    typedef int EntryType;
    typedef __m128i VectorType;
#define Vc_SUFFIX si128

    Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }

    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
        return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
        return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        // using lo_epi16 for speed here
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        // using lo_epi16 for speed here
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
#ifdef Vc_IMPL_SSE4_1
    static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
#else
    static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
        const VectorType aShift = _mm_srli_si128(a, 4);
        const VectorType ab02 = _mm_mul_epu32(a, b);            // [a0 * b0, a2 * b2]
        const VectorType bShift = _mm_srli_si128(b, 4);
        const VectorType ab13 = _mm_mul_epu32(aShift, bShift);  // [a1 * b1, a3 * b3]
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
    }
#endif

    Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
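// The SSE2 mul fallback above emulates _mm_mullo_epi32 (explanatory): _mm_mul_epu32
// multiplies only the even 32-bit lanes (yielding 64-bit products), so the odd lanes are
// shifted down by 4 bytes and multiplied separately. _mm_shuffle_epi32(x, 8) — i.e.
// _MM_SHUFFLE(0, 0, 2, 0) — compacts the low 32 bits of both 64-bit products into lanes 0
// and 1, and the final unpacklo restores the original lane order, giving the low 32 bits of
// each product: [a0*b0, a1*b1, a2*b2, a3*b3].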

template<> struct VectorHelper<unsigned int> {
    typedef unsigned int EntryType;
    typedef __m128i VectorType;
#define Vc_SUFFIX si128
    Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epu32
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        // using lo_epi16 for speed here
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        // using lo_epi16 for speed here
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        // using lo_epi16 for speed here
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        // using lo_epi16 for speed here
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }

    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
        return VectorHelper<int>::mul(a, b);
    }
//X     template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
//X         switch (b) {
//X             case    0: return zero();
//X             case    1: return a;
//X             case    2: return _mm_slli_epi32(a,  1);
//X             case    4: return _mm_slli_epi32(a,  2);
//X             case    8: return _mm_slli_epi32(a,  3);
//X             case   16: return _mm_slli_epi32(a,  4);
//X             case   32: return _mm_slli_epi32(a,  5);
//X             case   64: return _mm_slli_epi32(a,  6);
//X             case  128: return _mm_slli_epi32(a,  7);
//X             case  256: return _mm_slli_epi32(a,  8);
//X             case  512: return _mm_slli_epi32(a,  9);
//X             case 1024: return _mm_slli_epi32(a, 10);
//X             case 2048: return _mm_slli_epi32(a, 11);
//X         }
//X         return mul(a, set(b));
//X     }

#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
        return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
        return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }

    Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
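// Note the shift asymmetry between the two integer helpers (explanatory):
// VectorHelper<int>::shiftRight uses _mm_srai_epi32 (arithmetic shift, replicates the sign
// bit) while the unsigned helper uses _mm_srli_epi32 (logical shift, fills with zeros).
// For -8 = 0xFFFFFFF8 in each lane:
//
//     _mm_srai_epi32(_mm_set1_epi32(-8), 1)  ->  -4         (0xFFFFFFFC)
//     _mm_srli_epi32(_mm_set1_epi32(-8), 1)  ->  0x7FFFFFFC (a large positive value)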

template<> struct VectorHelper<signed short> {
    typedef __m128i VectorType;
    typedef signed short EntryType;
#define Vc_SUFFIX si128

    Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
        return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
        return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
                                                    const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
        return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
    }

    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); }

    Vc_OPx(mul, mullo)
    Vc_OP(min) Vc_OP(max)
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }

    Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
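// expand0/expand1 above widen eight 16-bit lanes into two vectors of four 32-bit lanes
// (explanatory): unpacking x with itself duplicates each 16-bit value into both halves of a
// 32-bit lane, and the arithmetic shift right by 16 then sign-extends it. The unsigned
// helper below instead unpacks with zero, which zero-extends.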

template<> struct VectorHelper<unsigned short> {
    typedef __m128i VectorType;
    typedef unsigned short EntryType;
#define Vc_SUFFIX si128
    Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#ifdef Vc_IMPL_SSE4_1
    static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }
#else
    // FIXME too bad, but this is broken without SSE 4.1
    static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) {
        auto tmp0 = _mm_unpacklo_epi16(a, b);       // 0 4 X X 1 5 X X
        auto tmp1 = _mm_unpackhi_epi16(a, b);       // 2 6 X X 3 7 X X
        auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // 0 2 4 6 X X X X
        auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // 1 3 5 7 X X X X
        return _mm_unpacklo_epi16(tmp2, tmp3);      // 0 1 2 3 4 5 6 7
    }
#endif
    static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }

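    // Explanatory note: on the SSE4.1 path concat uses _mm_packus_epi32, which saturates each
    // signed 32-bit input into [0, 65535]. The pre-SSE4.1 fallback above merely interleaves
    // the low 16 bits of every lane back into order (hence the FIXME): inputs outside the
    // unsigned 16-bit range are truncated instead of saturated, e.g. 0x00010000 becomes 0
    // rather than 0xFFFF.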
#undef Vc_SUFFIX
#define Vc_SUFFIX epu16
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

//X     template<unsigned int b> static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a) {
//X         switch (b) {
//X             case    0: return zero();
//X             case    1: return a;
//X             case    2: return _mm_slli_epi16(a,  1);
//X             case    4: return _mm_slli_epi16(a,  2);
//X             case    8: return _mm_slli_epi16(a,  3);
//X             case   16: return _mm_slli_epi16(a,  4);
//X             case   32: return _mm_slli_epi16(a,  5);
//X             case   64: return _mm_slli_epi16(a,  6);
//X             case  128: return _mm_slli_epi16(a,  7);
//X             case  256: return _mm_slli_epi16(a,  8);
//X             case  512: return _mm_slli_epi16(a,  9);
//X             case 1024: return _mm_slli_epi16(a, 10);
//X             case 2048: return _mm_slli_epi16(a, 11);
//X         }
//X         return mul(a, set(b));
//X     }
#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || Vc_IMPL_SSE4_1
    static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); }
#endif
#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
        return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
        return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
    }

    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

    Vc_OPx(mul, mullo) // should work correctly for all values
#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
    Vc_OP(min) Vc_OP(max) // XXX breaks for values with MSB set
#endif
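    // Explanatory note on the "breaks for values with MSB set" warning above: SSE2 only
    // provides signed 16-bit min/max, which treats 0x8000..0xFFFF as negative. E.g. for the
    // unsigned inputs 0x8000 (32768) and 0x0001, _mm_min_epi16 compares -32768 against 1 and
    // returns 0x8000, whereas the correct unsigned minimum is 0x0001 (and max is wrong the
    // same way). The SSE4.1 _mm_min_epu16/_mm_max_epu16 path avoids this.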
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        // reminder: _MM_SHUFFLE(3, 2, 1, 0) means "no change"
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a); // & 0xffff is implicit
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
                                                    const EntryType d, const EntryType e, const EntryType f,
                                                    const EntryType g, const EntryType h) {
        return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
    }

    Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx
#undef Vc_OP_CAST_
#undef Vc_MINMAX

}  // namespace SSE
}  // namespace Vc

#include "vectorhelper.tcc"

#endif // VC_SSE_VECTORHELPER_H_