1 //  sse2 vector class
2 //
3 //  Copyright (C) 2010 Tim Blechmann
4 //
5 //  This program is free software; you can redistribute it and/or modify
6 //  it under the terms of the GNU General Public License as published by
7 //  the Free Software Foundation; either version 2 of the License, or
8 //  (at your option) any later version.
9 //
10 //  This program is distributed in the hope that it will be useful,
11 //  but WITHOUT ANY WARRANTY; without even the implied warranty of
12 //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 //  GNU General Public License for more details.
14 //
15 //  You should have received a copy of the GNU General Public License
16 //  along with this program; see the file COPYING.  If not, write to
17 //  the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
18 //  Boston, MA 02110-1301, USA.
19 
20 #ifndef VEC_SSE2_HPP
21 #define VEC_SSE2_HPP
22 
23 #include <algorithm>
24 
25 #include <xmmintrin.h>
26 #include <emmintrin.h>
27 
28 #ifdef __SSE4_1__
29 #include <smmintrin.h>
30 #endif
31 
32 #ifdef __FMA__
33 #include <immintrin.h>
34 #endif
35 
36 #include "../detail/vec_math.hpp"
37 #include "vec_base.hpp"
38 
39 #if defined(__GNUC__) && defined(NDEBUG)
40 #define always_inline inline  __attribute__((always_inline))
41 #else
42 #define always_inline inline
43 #endif
44 
45 
46 namespace nova
47 {
48 
49 template <>
50 struct vec<double>:
51     vec_base<double, __m128d, 2>
52 {
53     typedef vec_base<double, __m128d, 2> base;
54 
55     typedef double float_type;
56     typedef __m128d internal_vector_type;
57 
gen_sign_masknova::vec58     static inline __m128d gen_sign_mask(void)
59     {
60         __m128i x = _mm_setzero_si128();
61 #ifdef __SSE4_1__
62         __m128i ones = _mm_cmpeq_epi64(x, x);
63 #else
64         __m128i ones = _mm_cmpeq_epi32(x, x);
65 #endif
66         return (__m128d)_mm_slli_epi64 (_mm_srli_epi64(ones, 63), 63);
67     }
68 
gen_abs_masknova::vec69     static inline __m128d gen_abs_mask(void)
70     {
71         __m128i x = _mm_setzero_si128();
72 #ifdef __SSE4_1__
73         __m128i ones = _mm_cmpeq_epi64(x, x);
74 #else
75         __m128i ones = _mm_cmpeq_epi32(x, x);
76 #endif
77         return (__m128d)_mm_srli_epi64 (_mm_slli_epi64(ones, 1), 1);
78     }
79 
gen_onenova::vec80     static inline __m128d gen_one(void)
81     {
82         return _mm_set1_pd(1.f);
83     }
84 
gen_05nova::vec85     static inline __m128d gen_05(void)
86     {
87         return _mm_set1_pd(0.5f);
88     }
89 
gen_zeronova::vec90     static inline __m128d gen_zero(void)
91     {
92         return _mm_setzero_pd();
93     }
94 
gen_onesnova::vec95     static inline __m128d gen_ones(void)
96     {
97         __m128d x = gen_zero();
98         __m128d ones = _mm_cmpeq_pd(x, x);
99         return ones;
100     }
101 
102 
vecnova::vec103     vec(__m128d const & arg):
104         base(arg)
105     {}
106 
107 public:
108     static const int size = 2;
109     static const int objects_per_cacheline = 64/sizeof(double);
110     static const bool has_compare_bitmask = true;
111 
is_alignednova::vec112     static bool is_aligned(double* ptr)
113     {
114         return ((intptr_t)(ptr) & (intptr_t)(size * sizeof(double) - 1)) == 0;
115     }
116 
117     /* @{ */
118     /** constructors */
vecnova::vec119     vec(void)
120     {}
121 
vecnova::vec122     vec(double f)
123     {
124         set_vec(f);
125     }
126 
vecnova::vec127     vec(float f)
128     {
129         set_vec((double)f);
130     }
131 
vecnova::vec132     vec(vec const & rhs)
133     {
134         data_ = rhs.data_;
135     }
136     /* @} */
137 
138     /* @{ */
139     /** io */
loadnova::vec140     void load(const double * data)
141     {
142         data_ = _mm_loadu_pd(data);
143     }
144 
load_alignednova::vec145     void load_aligned(const double * data)
146     {
147         data_ = _mm_load_pd(data);
148     }
149 
load_firstnova::vec150     void load_first(const double * data)
151     {
152         data_ = _mm_load_sd(data);
153     }
154 
storenova::vec155     void store(double * dest) const
156     {
157         _mm_storeu_pd(dest, data_);
158     }
159 
store_alignednova::vec160     void store_aligned(double * dest) const
161     {
162         _mm_store_pd(dest, data_);
163     }
164 
store_aligned_streamnova::vec165     void store_aligned_stream(double * dest) const
166     {
167         _mm_stream_pd(dest, data_);
168     }
169 
clearnova::vec170     void clear(void)
171     {
172         data_ = gen_zero();
173     }
174 
175     /* @} */
176 
177     /* @{ */
178     /** element access */
179 
set_vecnova::vec180     void set_vec (double value)
181     {
182         data_ = _mm_set1_pd(value);
183     }
184 
set_slopenova::vec185     double set_slope(double start, double slope)
186     {
187         double v1 = start + slope;
188         data_ = _mm_set_pd(v1, start);
189         return slope + slope;
190     }
191 
set_expnova::vec192     double set_exp(double start, double curve)
193     {
194         double v1 = start * curve;
195         data_ = _mm_set_pd(v1, start);
196         return v1 * curve;
197     }
198 
getnova::vec199     double get (std::size_t index) const
200     {
201         __m128d ret;
202         switch (index)
203         {
204         case 0:
205             ret = data_;
206             break;
207 
208         case 1:
209             ret = _mm_shuffle_pd(data_, data_, _MM_SHUFFLE2(1, 1));
210             break;
211         }
212 
213         return _mm_cvtsd_f64(ret);
214     }
215     /* @} */
216 
217     /* @{ */
218     /** arithmetic operators */
219 #define OPERATOR_ASSIGNMENT(op, opcode) \
220     vec & operator op(vec const & rhs) \
221     { \
222         data_ = opcode(data_, rhs.data_);\
223         return *this;\
224     }
225 
226     OPERATOR_ASSIGNMENT(+=, _mm_add_pd)
227     OPERATOR_ASSIGNMENT(-=, _mm_sub_pd)
228     OPERATOR_ASSIGNMENT(*=, _mm_mul_pd)
229     OPERATOR_ASSIGNMENT(/=, _mm_div_pd)
230 
231 #define ARITHMETIC_OPERATOR(op, opcode) \
232     vec operator op(vec const & rhs) const \
233     { \
234         return opcode(data_, rhs.data_); \
235     }
236 
237     ARITHMETIC_OPERATOR(+, _mm_add_pd)
238     ARITHMETIC_OPERATOR(-, _mm_sub_pd)
239     ARITHMETIC_OPERATOR(*, _mm_mul_pd)
240     ARITHMETIC_OPERATOR(/, _mm_div_pd)
241 
NOVA_SIMD_DELEGATE_UNARY_TO_BASE(reciprocal)242     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(reciprocal)
243 
244 #ifndef __FMA__
245     NOVA_SIMD_DEFINE_MADD
246 #else
247     inline friend vec madd(vec const & arg1, vec const & arg2, vec const & arg3)
248     {
249         return _mm_fmadd_pd(arg1.data_, arg2.data_, arg3.data_);
250     }
251 #endif
252 
253 #define RELATIONAL_OPERATOR(op, opcode) \
254     vec operator op(vec const & rhs) const \
255     { \
256         const __m128d one = gen_one(); \
257         return _mm_and_pd(opcode(data_, rhs.data_), one); \
258     }
259 
260     RELATIONAL_OPERATOR(<, _mm_cmplt_pd)
261     RELATIONAL_OPERATOR(<=, _mm_cmple_pd)
262     RELATIONAL_OPERATOR(>, _mm_cmpgt_pd)
263     RELATIONAL_OPERATOR(>=, _mm_cmpge_pd)
264     RELATIONAL_OPERATOR(==, _mm_cmpeq_pd)
265     RELATIONAL_OPERATOR(!=, _mm_cmpneq_pd)
266 
267     /* @{ */
268 #define BITWISE_OPERATOR(op, opcode) \
269     vec operator op(vec const & rhs) const \
270     { \
271         return opcode(data_, rhs.data_); \
272     }
273 
274     BITWISE_OPERATOR(&, _mm_and_pd)
275     BITWISE_OPERATOR(|, _mm_or_pd)
276     BITWISE_OPERATOR(^, _mm_xor_pd)
277 
278     friend vec andnot(vec const & lhs, vec const & rhs)
279     {
280         return _mm_andnot_pd(lhs.data_, rhs.data_);
281     }
282 
283     #define RELATIONAL_MASK_OPERATOR(op, opcode) \
284     friend vec mask_##op(vec const & lhs, vec const & rhs) \
285     { \
286         return opcode(lhs.data_, rhs.data_); \
287     }
288 
RELATIONAL_MASK_OPERATOR(lt,_mm_cmplt_pd)289     RELATIONAL_MASK_OPERATOR(lt, _mm_cmplt_pd)
290     RELATIONAL_MASK_OPERATOR(le, _mm_cmple_pd)
291     RELATIONAL_MASK_OPERATOR(gt, _mm_cmpgt_pd)
292     RELATIONAL_MASK_OPERATOR(ge, _mm_cmpge_pd)
293     RELATIONAL_MASK_OPERATOR(eq, _mm_cmpeq_pd)
294     RELATIONAL_MASK_OPERATOR(neq, _mm_cmpneq_pd)
295 
296     #undef RELATIONAL_MASK_OPERATOR
297 
298     friend inline vec select(vec lhs, vec rhs, vec bitmask)
299     {
300         /* if bitmask is set, return value in rhs, else value in lhs */
301 #ifdef __SSE4_1__
302         return _mm_blendv_pd(lhs.data_, rhs.data_, bitmask.data_);
303 #else
304         return detail::vec_select(lhs, rhs, bitmask);
305 #endif
306     }
307 
308     /* @} */
309 
310     /* @{ */
311     /** unary functions */
abs(vec const & arg)312     friend inline vec abs(vec const & arg)
313     {
314         return _mm_and_pd(gen_abs_mask(), arg.data_);
315     }
316 
sign(vec const & arg)317     friend always_inline vec sign(vec const & arg)
318     {
319         return detail::vec_sign(arg);
320     }
321 
square(vec const & arg)322     friend inline vec square(vec const & arg)
323     {
324         return _mm_mul_pd(arg.data_, arg.data_);
325     }
326 
sqrt(vec const & arg)327     friend inline vec sqrt(vec const & arg)
328     {
329         return _mm_sqrt_pd(arg.data_);
330     }
331 
cube(vec const & arg)332     friend inline vec cube(vec const & arg)
333     {
334         return _mm_mul_pd(arg.data_, _mm_mul_pd(arg.data_, arg.data_));
335     }
336     /* @} */
337 
338     /* @{ */
339     /** binary functions */
max_(vec const & lhs,vec const & rhs)340     friend inline vec max_(vec const & lhs, vec const & rhs)
341     {
342         return _mm_max_pd(lhs.data_, rhs.data_);
343     }
344 
min_(vec const & lhs,vec const & rhs)345     friend inline vec min_(vec const & lhs, vec const & rhs)
346     {
347         return _mm_min_pd(lhs.data_, rhs.data_);
348     }
349     /* @} */
350 
351     /* @{ */
352     /** rounding functions */
round(vec const & arg)353     friend inline vec round(vec const & arg)
354     {
355 #ifdef __SSE4_1__
356         return _mm_round_pd(arg.data_, _MM_FROUND_TO_NEAREST_INT);
357 #else
358         return vec::round(arg);
359 #endif
360     }
361 
frac(vec const & arg)362     friend inline vec frac(vec const & arg)
363     {
364         vec floor_result = floor(arg);
365         return arg - floor_result;
366     }
367 
floor(vec const & arg)368     friend inline vec floor(vec const & arg)
369     {
370 #ifdef __SSE4_1__
371         return _mm_round_pd(arg.data_, _MM_FROUND_TO_NEG_INF);
372 #else
373         return vec::floor(arg);
374 #endif
375     }
376 
ceil(vec const & arg)377     friend inline vec ceil(vec const & arg)
378     {
379 #ifdef __SSE4_1__
380         return _mm_round_pd(arg.data_, _MM_FROUND_TO_POS_INF);
381 #else
382         return vec::ceil(arg);
383 #endif
384     }
385 
trunc(vec const & arg)386     friend inline vec trunc(vec const & arg)
387     {
388 #ifdef __SSE4_1__
389         return _mm_round_pd(arg.data_, _MM_FROUND_TO_ZERO);
390 #else
391         return vec::trunc(arg);
392 #endif
393     }
394     /* @} */
395 
396 
397     /* @{ */
398     /** mathematical functions */
399     NOVA_SIMD_DELEGATE_BINARY_TO_BASE(pow)
NOVA_SIMD_DELEGATE_BINARY_TO_BASE(signed_pow)400     NOVA_SIMD_DELEGATE_BINARY_TO_BASE(signed_pow)
401 
402     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(sin)
403     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(cos)
404     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(tan)
405 
406     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(asin)
407     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(acos)
408     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(atan)
409 
410     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(log)
411     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(log2)
412     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(log10)
413     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(exp)
414 
415     NOVA_SIMD_DELEGATE_UNARY_TO_BASE(tanh)
416 
417     friend inline vec signed_sqrt(vec const & arg)
418     {
419         return detail::vec_signed_sqrt(arg);
420     }
421 
undenormalize(vec const & arg)422     friend inline vec undenormalize(vec const & arg)
423     {
424         return detail::vec_undenormalize(arg);
425     }
426     /* @} */
427 
428     /* @{ */
429     /** horizontal functions */
430 #define HORIZONTAL_OP(OP)                                        \
431         __m128d data = data_;                       /* [0, 1] */ \
432         __m128d high = _mm_unpackhi_pd(data, data); /* [1, 1] */ \
433         __m128d accum = OP(data, high);                          \
434         return _mm_cvtsd_f64(accum);
435 
436 
horizontal_minnova::vec437     inline double horizontal_min(void) const
438     {
439         HORIZONTAL_OP(_mm_min_sd);
440     }
441 
horizontal_maxnova::vec442     inline double horizontal_max(void) const
443     {
444         HORIZONTAL_OP(_mm_max_sd);
445     }
446 
horizontal_sumnova::vec447     inline double horizontal_sum(void) const
448     {
449         HORIZONTAL_OP(_mm_add_sd);
450     }
451     /* @} */
452 
453 #undef HORIZONTAL_OP
454 
455 };
456 
457 } /* namespace nova */
458 
459 
460 #undef OPERATOR_ASSIGNMENT
461 #undef ARITHMETIC_OPERATOR
462 #undef RELATIONAL_OPERATOR
463 #undef always_inline
464 
465 #endif /* VEC_SSE2_HPP */
466