1 // sse2 vector class 2 // 3 // Copyright (C) 2010 Tim Blechmann 4 // 5 // This program is free software; you can redistribute it and/or modify 6 // it under the terms of the GNU General Public License as published by 7 // the Free Software Foundation; either version 2 of the License, or 8 // (at your option) any later version. 9 // 10 // This program is distributed in the hope that it will be useful, 11 // but WITHOUT ANY WARRANTY; without even the implied warranty of 12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 // GNU General Public License for more details. 14 // 15 // You should have received a copy of the GNU General Public License 16 // along with this program; see the file COPYING. If not, write to 17 // the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, 18 // Boston, MA 02110-1301, USA. 19 20 #ifndef VEC_SSE2_HPP 21 #define VEC_SSE2_HPP 22 23 #include <algorithm> 24 25 #include <xmmintrin.h> 26 #include <emmintrin.h> 27 28 #ifdef __SSE4_1__ 29 #include <smmintrin.h> 30 #endif 31 32 #ifdef __FMA__ 33 #include <immintrin.h> 34 #endif 35 36 #include "../detail/vec_math.hpp" 37 #include "vec_base.hpp" 38 39 #if defined(__GNUC__) && defined(NDEBUG) 40 #define always_inline inline __attribute__((always_inline)) 41 #else 42 #define always_inline inline 43 #endif 44 45 46 namespace nova 47 { 48 49 template <> 50 struct vec<double>: 51 vec_base<double, __m128d, 2> 52 { 53 typedef vec_base<double, __m128d, 2> base; 54 55 typedef double float_type; 56 typedef __m128d internal_vector_type; 57 gen_sign_masknova::vec58 static inline __m128d gen_sign_mask(void) 59 { 60 __m128i x = _mm_setzero_si128(); 61 #ifdef __SSE4_1__ 62 __m128i ones = _mm_cmpeq_epi64(x, x); 63 #else 64 __m128i ones = _mm_cmpeq_epi32(x, x); 65 #endif 66 return (__m128d)_mm_slli_epi64 (_mm_srli_epi64(ones, 63), 63); 67 } 68 gen_abs_masknova::vec69 static inline __m128d gen_abs_mask(void) 70 { 71 __m128i x = _mm_setzero_si128(); 72 #ifdef __SSE4_1__ 73 __m128i ones = _mm_cmpeq_epi64(x, x); 74 #else 75 __m128i ones = _mm_cmpeq_epi32(x, x); 76 #endif 77 return (__m128d)_mm_srli_epi64 (_mm_slli_epi64(ones, 1), 1); 78 } 79 gen_onenova::vec80 static inline __m128d gen_one(void) 81 { 82 return _mm_set1_pd(1.f); 83 } 84 gen_05nova::vec85 static inline __m128d gen_05(void) 86 { 87 return _mm_set1_pd(0.5f); 88 } 89 gen_zeronova::vec90 static inline __m128d gen_zero(void) 91 { 92 return _mm_setzero_pd(); 93 } 94 gen_onesnova::vec95 static inline __m128d gen_ones(void) 96 { 97 __m128d x = gen_zero(); 98 __m128d ones = _mm_cmpeq_pd(x, x); 99 return ones; 100 } 101 102 vecnova::vec103 vec(__m128d const & arg): 104 base(arg) 105 {} 106 107 public: 108 static const int size = 2; 109 static const int objects_per_cacheline = 64/sizeof(double); 110 static const bool has_compare_bitmask = true; 111 is_alignednova::vec112 static bool is_aligned(double* ptr) 113 { 114 return ((intptr_t)(ptr) & (intptr_t)(size * sizeof(double) - 1)) == 0; 115 } 116 117 /* @{ */ 118 /** constructors */ vecnova::vec119 vec(void) 120 {} 121 vecnova::vec122 vec(double f) 123 { 124 set_vec(f); 125 } 126 vecnova::vec127 vec(float f) 128 { 129 set_vec((double)f); 130 } 131 vecnova::vec132 vec(vec const & rhs) 133 { 134 data_ = rhs.data_; 135 } 136 /* @} */ 137 138 /* @{ */ 139 /** io */ loadnova::vec140 void load(const double * data) 141 { 142 data_ = _mm_loadu_pd(data); 143 } 144 load_alignednova::vec145 void load_aligned(const double * data) 146 { 147 data_ = _mm_load_pd(data); 148 } 149 load_firstnova::vec150 void load_first(const double * data) 151 { 152 data_ = _mm_load_sd(data); 153 } 154 storenova::vec155 void store(double * dest) const 156 { 157 _mm_storeu_pd(dest, data_); 158 } 159 store_alignednova::vec160 void store_aligned(double * dest) const 161 { 162 _mm_store_pd(dest, data_); 163 } 164 store_aligned_streamnova::vec165 void store_aligned_stream(double * dest) const 166 { 167 _mm_stream_pd(dest, data_); 168 } 169 clearnova::vec170 void clear(void) 171 { 172 data_ = gen_zero(); 173 } 174 175 /* @} */ 176 177 /* @{ */ 178 /** element access */ 179 set_vecnova::vec180 void set_vec (double value) 181 { 182 data_ = _mm_set1_pd(value); 183 } 184 set_slopenova::vec185 double set_slope(double start, double slope) 186 { 187 double v1 = start + slope; 188 data_ = _mm_set_pd(v1, start); 189 return slope + slope; 190 } 191 set_expnova::vec192 double set_exp(double start, double curve) 193 { 194 double v1 = start * curve; 195 data_ = _mm_set_pd(v1, start); 196 return v1 * curve; 197 } 198 getnova::vec199 double get (std::size_t index) const 200 { 201 __m128d ret; 202 switch (index) 203 { 204 case 0: 205 ret = data_; 206 break; 207 208 case 1: 209 ret = _mm_shuffle_pd(data_, data_, _MM_SHUFFLE2(1, 1)); 210 break; 211 } 212 213 return _mm_cvtsd_f64(ret); 214 } 215 /* @} */ 216 217 /* @{ */ 218 /** arithmetic operators */ 219 #define OPERATOR_ASSIGNMENT(op, opcode) \ 220 vec & operator op(vec const & rhs) \ 221 { \ 222 data_ = opcode(data_, rhs.data_);\ 223 return *this;\ 224 } 225 226 OPERATOR_ASSIGNMENT(+=, _mm_add_pd) 227 OPERATOR_ASSIGNMENT(-=, _mm_sub_pd) 228 OPERATOR_ASSIGNMENT(*=, _mm_mul_pd) 229 OPERATOR_ASSIGNMENT(/=, _mm_div_pd) 230 231 #define ARITHMETIC_OPERATOR(op, opcode) \ 232 vec operator op(vec const & rhs) const \ 233 { \ 234 return opcode(data_, rhs.data_); \ 235 } 236 237 ARITHMETIC_OPERATOR(+, _mm_add_pd) 238 ARITHMETIC_OPERATOR(-, _mm_sub_pd) 239 ARITHMETIC_OPERATOR(*, _mm_mul_pd) 240 ARITHMETIC_OPERATOR(/, _mm_div_pd) 241 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(reciprocal)242 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(reciprocal) 243 244 #ifndef __FMA__ 245 NOVA_SIMD_DEFINE_MADD 246 #else 247 inline friend vec madd(vec const & arg1, vec const & arg2, vec const & arg3) 248 { 249 return _mm_fmadd_pd(arg1.data_, arg2.data_, arg3.data_); 250 } 251 #endif 252 253 #define RELATIONAL_OPERATOR(op, opcode) \ 254 vec operator op(vec const & rhs) const \ 255 { \ 256 const __m128d one = gen_one(); \ 257 return _mm_and_pd(opcode(data_, rhs.data_), one); \ 258 } 259 260 RELATIONAL_OPERATOR(<, _mm_cmplt_pd) 261 RELATIONAL_OPERATOR(<=, _mm_cmple_pd) 262 RELATIONAL_OPERATOR(>, _mm_cmpgt_pd) 263 RELATIONAL_OPERATOR(>=, _mm_cmpge_pd) 264 RELATIONAL_OPERATOR(==, _mm_cmpeq_pd) 265 RELATIONAL_OPERATOR(!=, _mm_cmpneq_pd) 266 267 /* @{ */ 268 #define BITWISE_OPERATOR(op, opcode) \ 269 vec operator op(vec const & rhs) const \ 270 { \ 271 return opcode(data_, rhs.data_); \ 272 } 273 274 BITWISE_OPERATOR(&, _mm_and_pd) 275 BITWISE_OPERATOR(|, _mm_or_pd) 276 BITWISE_OPERATOR(^, _mm_xor_pd) 277 278 friend vec andnot(vec const & lhs, vec const & rhs) 279 { 280 return _mm_andnot_pd(lhs.data_, rhs.data_); 281 } 282 283 #define RELATIONAL_MASK_OPERATOR(op, opcode) \ 284 friend vec mask_##op(vec const & lhs, vec const & rhs) \ 285 { \ 286 return opcode(lhs.data_, rhs.data_); \ 287 } 288 RELATIONAL_MASK_OPERATOR(lt,_mm_cmplt_pd)289 RELATIONAL_MASK_OPERATOR(lt, _mm_cmplt_pd) 290 RELATIONAL_MASK_OPERATOR(le, _mm_cmple_pd) 291 RELATIONAL_MASK_OPERATOR(gt, _mm_cmpgt_pd) 292 RELATIONAL_MASK_OPERATOR(ge, _mm_cmpge_pd) 293 RELATIONAL_MASK_OPERATOR(eq, _mm_cmpeq_pd) 294 RELATIONAL_MASK_OPERATOR(neq, _mm_cmpneq_pd) 295 296 #undef RELATIONAL_MASK_OPERATOR 297 298 friend inline vec select(vec lhs, vec rhs, vec bitmask) 299 { 300 /* if bitmask is set, return value in rhs, else value in lhs */ 301 #ifdef __SSE4_1__ 302 return _mm_blendv_pd(lhs.data_, rhs.data_, bitmask.data_); 303 #else 304 return detail::vec_select(lhs, rhs, bitmask); 305 #endif 306 } 307 308 /* @} */ 309 310 /* @{ */ 311 /** unary functions */ abs(vec const & arg)312 friend inline vec abs(vec const & arg) 313 { 314 return _mm_and_pd(gen_abs_mask(), arg.data_); 315 } 316 sign(vec const & arg)317 friend always_inline vec sign(vec const & arg) 318 { 319 return detail::vec_sign(arg); 320 } 321 square(vec const & arg)322 friend inline vec square(vec const & arg) 323 { 324 return _mm_mul_pd(arg.data_, arg.data_); 325 } 326 sqrt(vec const & arg)327 friend inline vec sqrt(vec const & arg) 328 { 329 return _mm_sqrt_pd(arg.data_); 330 } 331 cube(vec const & arg)332 friend inline vec cube(vec const & arg) 333 { 334 return _mm_mul_pd(arg.data_, _mm_mul_pd(arg.data_, arg.data_)); 335 } 336 /* @} */ 337 338 /* @{ */ 339 /** binary functions */ max_(vec const & lhs,vec const & rhs)340 friend inline vec max_(vec const & lhs, vec const & rhs) 341 { 342 return _mm_max_pd(lhs.data_, rhs.data_); 343 } 344 min_(vec const & lhs,vec const & rhs)345 friend inline vec min_(vec const & lhs, vec const & rhs) 346 { 347 return _mm_min_pd(lhs.data_, rhs.data_); 348 } 349 /* @} */ 350 351 /* @{ */ 352 /** rounding functions */ round(vec const & arg)353 friend inline vec round(vec const & arg) 354 { 355 #ifdef __SSE4_1__ 356 return _mm_round_pd(arg.data_, _MM_FROUND_TO_NEAREST_INT); 357 #else 358 return vec::round(arg); 359 #endif 360 } 361 frac(vec const & arg)362 friend inline vec frac(vec const & arg) 363 { 364 vec floor_result = floor(arg); 365 return arg - floor_result; 366 } 367 floor(vec const & arg)368 friend inline vec floor(vec const & arg) 369 { 370 #ifdef __SSE4_1__ 371 return _mm_round_pd(arg.data_, _MM_FROUND_TO_NEG_INF); 372 #else 373 return vec::floor(arg); 374 #endif 375 } 376 ceil(vec const & arg)377 friend inline vec ceil(vec const & arg) 378 { 379 #ifdef __SSE4_1__ 380 return _mm_round_pd(arg.data_, _MM_FROUND_TO_POS_INF); 381 #else 382 return vec::ceil(arg); 383 #endif 384 } 385 trunc(vec const & arg)386 friend inline vec trunc(vec const & arg) 387 { 388 #ifdef __SSE4_1__ 389 return _mm_round_pd(arg.data_, _MM_FROUND_TO_ZERO); 390 #else 391 return vec::trunc(arg); 392 #endif 393 } 394 /* @} */ 395 396 397 /* @{ */ 398 /** mathematical functions */ 399 NOVA_SIMD_DELEGATE_BINARY_TO_BASE(pow) NOVA_SIMD_DELEGATE_BINARY_TO_BASE(signed_pow)400 NOVA_SIMD_DELEGATE_BINARY_TO_BASE(signed_pow) 401 402 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(sin) 403 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(cos) 404 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(tan) 405 406 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(asin) 407 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(acos) 408 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(atan) 409 410 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(log) 411 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(log2) 412 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(log10) 413 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(exp) 414 415 NOVA_SIMD_DELEGATE_UNARY_TO_BASE(tanh) 416 417 friend inline vec signed_sqrt(vec const & arg) 418 { 419 return detail::vec_signed_sqrt(arg); 420 } 421 undenormalize(vec const & arg)422 friend inline vec undenormalize(vec const & arg) 423 { 424 return detail::vec_undenormalize(arg); 425 } 426 /* @} */ 427 428 /* @{ */ 429 /** horizontal functions */ 430 #define HORIZONTAL_OP(OP) \ 431 __m128d data = data_; /* [0, 1] */ \ 432 __m128d high = _mm_unpackhi_pd(data, data); /* [1, 1] */ \ 433 __m128d accum = OP(data, high); \ 434 return _mm_cvtsd_f64(accum); 435 436 horizontal_minnova::vec437 inline double horizontal_min(void) const 438 { 439 HORIZONTAL_OP(_mm_min_sd); 440 } 441 horizontal_maxnova::vec442 inline double horizontal_max(void) const 443 { 444 HORIZONTAL_OP(_mm_max_sd); 445 } 446 horizontal_sumnova::vec447 inline double horizontal_sum(void) const 448 { 449 HORIZONTAL_OP(_mm_add_sd); 450 } 451 /* @} */ 452 453 #undef HORIZONTAL_OP 454 455 }; 456 457 } /* namespace nova */ 458 459 460 #undef OPERATOR_ASSIGNMENT 461 #undef ARITHMETIC_OPERATOR 462 #undef RELATIONAL_OPERATOR 463 #undef always_inline 464 465 #endif /* VEC_SSE2_HPP */ 466