1 /*  Copyright (C) 2011-2014  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_F_FLOOR_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_F_FLOOR_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <cmath>
16 #include <simdpp/types.h>
17 #include <simdpp/core/f_abs.h>
18 #include <simdpp/core/bit_or.h>
19 #include <simdpp/core/blend.h>
20 #include <simdpp/core/cmp_eq.h>
21 #include <simdpp/core/cmp_gt.h>
22 #include <simdpp/core/i_shift_r.h>
23 #include <simdpp/core/i_sub.h>
24 #include <simdpp/core/to_float32.h>
25 #include <simdpp/core/to_int32.h>
26 #include <simdpp/detail/vector_array_macros.h>
27 
28 namespace simdpp {
29 namespace SIMDPP_ARCH_NAMESPACE {
30 namespace detail {
31 namespace insn {
32 
33 
34 static SIMDPP_INL
i_floor(const float32x4 & a)35 float32x4 i_floor(const float32x4& a)
36 {
37 #if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
38     float32x4 r;
39     for (unsigned i = 0; i < a.length; i++) {
40         r.el(i) = std::floor(a.el(i));
41     }
42     return r;
43 #elif SIMDPP_USE_SSE4_1
44     return _mm_floor_ps(a.native());
45 #elif SIMDPP_USE_NEON64
46     return vrndmq_f32(a.native());
47 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA
48     //check if the value is not too large, or is zero
49     float32x4 ba = abs(a);
50     mask_float32x4 mask_range = cmp_le(ba, 8388607.0f);
51     mask_float32x4 mask_nonzero = cmp_gt(ba, 0);
52     mask_float32x4 mask = bit_and(mask_range, mask_nonzero); // takes care of nans and zeros
53 
54     //calculate the i_floor using trunc
55     int32x4 s = shift_r((uint32x4)a, 31); //=1 if a<0
56     float32x4 at = (float32x4) sub((int32x4)a, s); //=nextafter towards +inf, if a<0
57     int32x4 ia = to_int32(at);
58             ia = sub(ia, s);
59     float32x4 fa = to_float32(ia);
60 
61     //combine the results
62     return blend(fa, a, mask);
63 #elif SIMDPP_USE_ALTIVEC
64     return vec_floor(a.native());
65 #endif
66 }
67 
#if SIMDPP_USE_AVX
/// Floor of each lane of an 8-lane single-precision vector (AVX).
static SIMDPP_INL
float32x8 i_floor(const float32x8& a)
{
    // AVX supplies a direct round-toward-minus-infinity instruction.
    const __m256 rounded = _mm256_floor_ps(a.native());
    return float32x8(rounded);
}
#endif
75 
#if SIMDPP_USE_AVX512F
/// Floor of each lane of a 16-lane single-precision vector (AVX-512F).
static SIMDPP_INL
float32<16> i_floor(const float32<16>& a)
{
    // AVX-512F supplies a direct round-toward-minus-infinity instruction.
    const __m512 rounded = _mm512_floor_ps(a.native());
    return float32<16>(rounded);
}
#endif
83 
84 // -----------------------------------------------------------------------------
85 
86 static SIMDPP_INL
i_floor(const float64x2 & a)87 float64x2 i_floor(const float64x2& a)
88 {
89 #if SIMDPP_USE_SSE4_1
90     return _mm_floor_pd(a.native());
91 #elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
92     float64x2 af = abs(a);
93     // check if the value is not too large or is a nan
94     mask_float64x2 mask_range = cmp_le(af, 4503599627370495.0);
95     // check if truncate to zero or minus one
96     mask_float64x2 mask_1to1 = cmp_lt(af, 1.0);
97 
98     /*  Emulate truncation for numbers not less than 1.0.
99         This is implemented by clearing the mantissa in the source number,
100         adding 1.0 and subtracting integer 1. The mantissa of the resulting
101         number will effectively contain a bit mask defining which bits need to
102         be cleared off the source number in order to truncate it.
103     */
104     float64x2 clearbits = bit_and(af, 0x7ff0000000000000); // clear the mantissa
105     clearbits = add(clearbits, 1.0);
106     clearbits = (float64x2) sub(uint64x2(clearbits), 1);
107     clearbits = bit_andnot(clearbits, 0xfff0000000000000); // leave only the mantissa
108 
109     float64x2 a2 = bit_andnot(a, clearbits); // truncate
110 
111     // check if we need to subtract one (truncated bits when negative)
112     mask_float64x2 mask_neg = cmp_lt(a, 0.0);
113     mask_float64x2 mask_sub1 = cmp_gt(bit_and(a, clearbits), 0.0);
114     mask_sub1 = bit_and(mask_sub1, mask_neg);
115 
116     // one special case is when 'a' is in the range of (-1.0, 0.0) in which
117     // a & clearbits may still yield to zero. Thus this additional check
118     mask_sub1 = bit_or(mask_sub1, bit_and(mask_1to1, mask_neg));
119     float64x2 sub1 = make_float(-1.0);
120     sub1 = bit_and(sub1, mask_sub1);
121 
122     a2 = bit_andnot(a, mask_1to1);
123     a2 = sub(a2, sub1);
124 
125     return blend(a2, a, mask_range);
126 #elif SIMDPP_USE_NEON64
127     return vrndnq_f64(a.native());
128 #elif SIMDPP_USE_VSX_206
129     return vec_floor(a.native());
130 #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
131     float64x2 r;
132     for (unsigned i = 0; i < r.length; ++i) {
133         r.el(i) = std::floor(a.el(i));
134     }
135     return r;
136 #endif
137 }
138 
#if SIMDPP_USE_AVX
/// Floor of each lane of a 4-lane double-precision vector (AVX).
static SIMDPP_INL
float64x4 i_floor(const float64x4& a)
{
    // AVX supplies a direct round-toward-minus-infinity instruction.
    const __m256d rounded = _mm256_floor_pd(a.native());
    return float64x4(rounded);
}
#endif
146 
#if SIMDPP_USE_AVX512F
/// Floor of each lane of an 8-lane double-precision vector (AVX-512F).
static SIMDPP_INL
float64<8> i_floor(const float64<8>& a)
{
    // AVX-512F supplies a direct round-toward-minus-infinity instruction.
    const __m512d rounded = _mm512_floor_pd(a.native());
    return float64<8>(rounded);
}
#endif
154 
// Generic fallback for wider/compound vector types: the macro splits 'a'
// into its native-width sub-vectors, applies i_floor to each, and
// reassembles the result into a V.
template<class V> SIMDPP_INL
V i_floor(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_floor, a);
}
160 
161 } // namespace insn
162 } // namespace detail
163 } // namespace SIMDPP_ARCH_NAMESPACE
164 } // namespace simdpp
165 
166 #endif
167 
168