/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_ROUNDING_HPP
#define XSIMD_ROUNDING_HPP

#include <cmath>

#include "xsimd_fp_sign.hpp"
#include "xsimd_numerical_constant.hpp"

namespace xsimd
{
    /**
     * Computes the batch of smallest integer values not less than
     * scalars in \c x.
     * @param x batch of floating point values.
     * @return the batch of smallest integer values not less than \c x.
     */
    template <class X>
    batch_type_t<X> ceil(const simd_base<X>& x);

    /**
     * Computes the batch of largest integer values not greater than
     * scalars in \c x.
     * @param x batch of floating point values.
     * @return the batch of largest integer values not greater than \c x.
     */
    template <class X>
    batch_type_t<X> floor(const simd_base<X>& x);

    /**
     * Computes the batch of nearest integer values not greater in magnitude
     * than scalars in \c x.
     * @param x batch of floating point values.
     * @return the batch of nearest integer values not greater in magnitude than \c x.
     */
    template <class X>
    batch_type_t<X> trunc(const simd_base<X>& x);

    /**
     * Computes the batch of nearest integer values to scalars in \c x (in
     * floating point format), rounding halfway cases away from zero, regardless
     * of the current rounding mode.
     * @param x batch of floating point values.
     * @return the batch of nearest integer values.
     */
    template <class X>
    batch_type_t<X> round(const simd_base<X>& x);

    // Contrary to their std counterparts, these functions
    // assume that the rounding mode is FE_TONEAREST

    /**
     * Rounds the scalars in \c x to integer values (in floating point format), using
     * the current rounding mode.
     * @param x batch of floating point values.
     * @return the batch of nearest integer values.
     */
    template <class X>
    batch_type_t<X> nearbyint(const simd_base<X>& x);

    /**
     * Rounds the scalars in \c x to integer values (in floating point format), using
     * the current rounding mode.
     * @param x batch of floating point values.
     * @return the batch of rounded values.
     */
    template <class X>
    batch_type_t<X> rint(const simd_base<X>& x);
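
    // A minimal usage sketch of the functions declared above (not part of the
    // library itself): assuming an AVX-enabled build where batch<double, 4> is
    // available, each function rounds the four lanes independently.
    //
    //     #include <xsimd/xsimd.hpp>
    //
    //     void example()
    //     {
    //         xsimd::batch<double, 4> x(-1.5, -0.5, 0.5, 2.7);
    //         auto c = xsimd::ceil(x);      // {-1.0, -0.0, 1.0, 3.0}
    //         auto f = xsimd::floor(x);     // {-2.0, -1.0, 0.0, 2.0}
    //         auto t = xsimd::trunc(x);     // {-1.0, -0.0, 0.0, 2.0}
    //         auto r = xsimd::round(x);     // {-2.0, -1.0, 1.0, 3.0}, halfway cases away from zero
    //         auto n = xsimd::nearbyint(x); // {-2.0, -0.0, 0.0, 3.0}, ties to even under FE_TONEAREST
    //     }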

    namespace impl
    {
        template <class B>
        struct rounding_kernel;

        template <class B>
        struct rounding_kernel_int
        {
            static inline B ceil(const B& x)
            {
                return x;
            }

            static inline B floor(const B& x)
            {
                return x;
            }

            static inline B trunc(const B& x)
            {
                return x;
            }

            static inline B nearbyint(const B& x)
            {
                return x;
            }
        };

        #define DEFINE_ROUNDING_KERNEL_INT(T, N)         \
            template <>                                  \
            struct rounding_kernel<batch<T, N>>          \
                : rounding_kernel_int<batch<T, N>>       \
            {                                            \
            }

        /**********************
         * SSE implementation *
         **********************/

#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
        DEFINE_ROUNDING_KERNEL_INT(uint8_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(int8_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(uint16_t, 8);
        DEFINE_ROUNDING_KERNEL_INT(int16_t, 8);
        DEFINE_ROUNDING_KERNEL_INT(uint32_t, 4);
        DEFINE_ROUNDING_KERNEL_INT(int32_t, 4);
        DEFINE_ROUNDING_KERNEL_INT(uint64_t, 2);
        DEFINE_ROUNDING_KERNEL_INT(int64_t, 2);

        template <>
        struct rounding_kernel<batch<float, 4>>
        {
            using batch_type = batch<float, 4>;

            static inline batch_type ceil(const batch_type& x)
            {
                return _mm_ceil_ps(x);
            }

            static inline batch_type floor(const batch_type& x)
            {
                return _mm_floor_ps(x);
            }

            static inline batch_type trunc(const batch_type& x)
            {
                return _mm_round_ps(x, _MM_FROUND_TO_ZERO);
            }

            static inline batch_type nearbyint(const batch_type& x)
            {
                return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
            }
        };

        template <>
        struct rounding_kernel<batch<double, 2>>
        {
            using batch_type = batch<double, 2>;

            static inline batch_type ceil(const batch_type& x)
            {
                return _mm_ceil_pd(x);
            }

            static inline batch_type floor(const batch_type& x)
            {
                return _mm_floor_pd(x);
            }

            static inline batch_type trunc(const batch_type& x)
            {
                return _mm_round_pd(x, _MM_FROUND_TO_ZERO);
            }

            static inline batch_type nearbyint(const batch_type& x)
            {
                return _mm_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
            }
        };

#elif (XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION) || (XSIMD_ARM_INSTR_SET == XSIMD_ARM7_NEON_VERSION)
        DEFINE_ROUNDING_KERNEL_INT(uint8_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(int8_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(uint16_t, 8);
        DEFINE_ROUNDING_KERNEL_INT(int16_t, 8);
        DEFINE_ROUNDING_KERNEL_INT(uint32_t, 4);
        DEFINE_ROUNDING_KERNEL_INT(int32_t, 4);
        DEFINE_ROUNDING_KERNEL_INT(uint64_t, 2);
        DEFINE_ROUNDING_KERNEL_INT(int64_t, 2);

        template <class B>
        struct rounding_kernel_base
        {
            static inline B ceil(const B& x)
            {
                B tx = trunc(x);
                return select(tx < x, tx + B(1), tx);
            }

            static inline B floor(const B& x)
            {
                B tx = trunc(x);
                return select(tx > x, tx - B(1), tx);
            }

            static inline B nearbyint(const B& x)
            {
                B s = bitofsign(x);
                B v = x ^ s;
                B t2n = twotonmb<B>();
                B d0 = v + t2n;
                return s ^ select(v < t2n, d0 - t2n, v);
            }
        };

        template <>
        struct rounding_kernel<batch<float, 4>>
            : rounding_kernel_base<batch<float, 4>>
        {
            using batch_type = batch<float, 4>;

            static inline batch_type trunc(const batch_type& x)
            {
                return select(abs(x) < maxflint<batch_type>(), to_float(to_int(x)), x);
            }
        };

#if (XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE2_VERSION)
        template <>
        struct rounding_kernel<batch<double, 2>>
            : rounding_kernel_base<batch<double, 2>>
        {
            using batch_type = batch<double, 2>;

            static inline batch_type trunc(const batch_type& x)
            {
                return batch<double, 2>(std::trunc(x[0]), std::trunc(x[1]));
            }
        };
#endif
#endif
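
        // The pre-SSE4.1 / ARMv7 NEON kernels above synthesize nearbyint by
        // adding and subtracting 2^(number of mantissa bits) (twotonmb): in that
        // range one ulp equals 1, so the addition itself rounds to the nearest
        // integer under FE_TONEAREST. A scalar sketch of the same idea, using a
        // hypothetical helper name that is not part of xsimd:
        //
        //     #include <cmath>
        //
        //     float nearbyint_sketch(float v)
        //     {
        //         const float t2n = 8388608.0f;    // 2^23, the float twotonmb constant
        //         float av = std::fabs(v);
        //         float r = (av + t2n) - t2n;      // the addition rounds to nearest integer
        //         return std::copysign(av < t2n ? r : av, v);
        //     }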

        /**********************
         * AVX implementation *
         **********************/

#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION
        DEFINE_ROUNDING_KERNEL_INT(uint8_t, 32);
        DEFINE_ROUNDING_KERNEL_INT(int8_t, 32);
        DEFINE_ROUNDING_KERNEL_INT(uint16_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(int16_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(uint32_t, 8);
        DEFINE_ROUNDING_KERNEL_INT(int32_t, 8);
        DEFINE_ROUNDING_KERNEL_INT(uint64_t, 4);
        DEFINE_ROUNDING_KERNEL_INT(int64_t, 4);

        template <>
        struct rounding_kernel<batch<float, 8>>
        {
            using batch_type = batch<float, 8>;

            static inline batch_type ceil(const batch_type& x)
            {
                return _mm256_round_ps(x, _MM_FROUND_CEIL);
            }

            static inline batch_type floor(const batch_type& x)
            {
                return _mm256_round_ps(x, _MM_FROUND_FLOOR);
            }

            static inline batch_type trunc(const batch_type& x)
            {
                return _mm256_round_ps(x, _MM_FROUND_TO_ZERO);
            }

            static inline batch_type nearbyint(const batch_type& x)
            {
                return _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT);
            }
        };

        template <>
        struct rounding_kernel<batch<double, 4>>
        {
            using batch_type = batch<double, 4>;

            static inline batch_type ceil(const batch_type& x)
            {
                return _mm256_round_pd(x, _MM_FROUND_CEIL);
            }

            static inline batch_type floor(const batch_type& x)
            {
                return _mm256_round_pd(x, _MM_FROUND_FLOOR);
            }

            static inline batch_type trunc(const batch_type& x)
            {
                return _mm256_round_pd(x, _MM_FROUND_TO_ZERO);
            }

            static inline batch_type nearbyint(const batch_type& x)
            {
                return _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);
            }
        };
#endif
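
        // The AVX512 kernels below rely on _mm512_roundscale_[ps|pd] and the
        // *_roundscale_round_* variants: the immediate combines a scale of 0
        // (round to integral values) with the requested rounding direction, so
        // no separate ceil/floor/trunc intrinsics are needed.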

#if XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION
        DEFINE_ROUNDING_KERNEL_INT(uint8_t, 64);
        DEFINE_ROUNDING_KERNEL_INT(int8_t, 64);
        DEFINE_ROUNDING_KERNEL_INT(uint16_t, 32);
        DEFINE_ROUNDING_KERNEL_INT(int16_t, 32);
        DEFINE_ROUNDING_KERNEL_INT(uint32_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(int32_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(uint64_t, 8);
        DEFINE_ROUNDING_KERNEL_INT(int64_t, 8);

        template <>
        struct rounding_kernel<batch<float, 16>>
        {
            using batch_type = batch<float, 16>;

            static inline batch_type ceil(const batch_type& x)
            {
                auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF);
                return res;
            }

            static inline batch_type floor(const batch_type& x)
            {
                auto res = _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF);
                return res;
            }

            static inline batch_type trunc(const batch_type& x)
            {
                auto res = _mm512_roundscale_round_ps(x, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
                return res;
            }

            static inline batch_type nearbyint(const batch_type& x)
            {
                auto res = _mm512_roundscale_round_ps(x, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
                return res;
            }
        };

        template <>
        struct rounding_kernel<batch<double, 8>>
        {
            using batch_type = batch<double, 8>;

            static inline batch_type ceil(const batch_type& x)
            {
                auto res = _mm512_roundscale_pd(x, _MM_FROUND_TO_POS_INF);
                return res;
            }

            static inline batch_type floor(const batch_type& x)
            {
                auto res = _mm512_roundscale_pd(x, _MM_FROUND_TO_NEG_INF);
                return res;
            }

            static inline batch_type trunc(const batch_type& x)
            {
                auto res = _mm512_roundscale_round_pd(x, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
                return res;
            }

            static inline batch_type nearbyint(const batch_type& x)
            {
                auto res = _mm512_roundscale_round_pd(x, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
                return res;
            }
        };
#endif

#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_32_NEON_VERSION
        DEFINE_ROUNDING_KERNEL_INT(uint8_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(int8_t, 16);
        DEFINE_ROUNDING_KERNEL_INT(uint16_t, 8);
        DEFINE_ROUNDING_KERNEL_INT(int16_t, 8);
        DEFINE_ROUNDING_KERNEL_INT(uint32_t, 4);
        DEFINE_ROUNDING_KERNEL_INT(int32_t, 4);
        DEFINE_ROUNDING_KERNEL_INT(uint64_t, 2);
        DEFINE_ROUNDING_KERNEL_INT(int64_t, 2);

        template <>
        struct rounding_kernel<batch<float, 4>>
        {
            using batch_type = batch<float, 4>;

            static inline batch_type ceil(const batch_type& x)
            {
                return vrndpq_f32(x);
            }

            static inline batch_type floor(const batch_type& x)
            {
                return vrndmq_f32(x);
            }

            static inline batch_type trunc(const batch_type& x)
            {
                return vrndq_f32(x);
            }

            static inline batch_type nearbyint(const batch_type& x)
            {
                return vrndxq_f32(x);
            }
        };
#endif

#if XSIMD_ARM_INSTR_SET >= XSIMD_ARM8_64_NEON_VERSION
        template <>
        struct rounding_kernel<batch<double, 2>>
        {
            using batch_type = batch<double, 2>;

            static inline batch_type ceil(const batch_type& x)
            {
                return vrndpq_f64(x);
            }

            static inline batch_type floor(const batch_type& x)
            {
                return vrndmq_f64(x);
            }

            static inline batch_type trunc(const batch_type& x)
            {
                return vrndq_f64(x);
            }

            static inline batch_type nearbyint(const batch_type& x)
            {
                return vrndxq_f64(x);
            }
        };
#endif

        /***************************
         * Fallback implementation *
         ***************************/

#if defined(XSIMD_ENABLE_FALLBACK)
        template <class T, std::size_t N>
        struct rounding_kernel<batch<T, N>>
        {
            using batch_type = batch<T, N>;

            static inline batch_type ceil(const batch_type& x)
            {
                XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::ceil, x)
            }

            static inline batch_type floor(const batch_type& x)
            {
                XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::floor, x)
            }

            static inline batch_type trunc(const batch_type& x)
            {
                XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::trunc, x)
            }

            static inline batch_type nearbyint(const batch_type& x)
            {
                XSIMD_FALLBACK_BATCH_UNARY_FUNC(std::nearbyint, x)
            }
        };
#endif

        /**************************
         * Generic implementation *
         **************************/

        template <class B, bool = std::is_integral<typename B::value_type>::value>
        struct round_impl;

        template <class T, std::size_t N>
        struct round_impl<batch<T, N>, false>
        {
            using batch_type = batch<T, N>;

            // Rounds halfway cases away from zero: round |x| via ceil, subtract 1
            // when ceil overshoots by more than 0.5, then restore the sign.
            static inline batch_type round(const batch_type& x)
            {
                batch_type v = abs(x);
                batch_type c = ceil(v);
                batch_type cp = select(c - batch_type(0.5) > v, c - batch_type(1), c);
                return select(v > maxflint<batch_type>(), x, copysign(cp, x));
            }
        };

        template <class T, std::size_t N>
        struct round_impl<batch<T, N>, true>
        {
            using batch_type = batch<T, N>;

            static inline batch_type round(const batch_type& rhs)
            {
                return rhs;
            }
        };

        template <class T, std::size_t N>
        inline batch<T, N> rint(const batch<T, N>& x)
        {
            return nearbyint(x);
        }
    }

    template <class X>
    inline batch_type_t<X> ceil(const simd_base<X>& x)
    {
        return impl::rounding_kernel<batch_type_t<X>>::ceil(x());
    }

    template <class X>
    inline batch_type_t<X> floor(const simd_base<X>& x)
    {
        return impl::rounding_kernel<batch_type_t<X>>::floor(x());
    }

    template <class X>
    inline batch_type_t<X> trunc(const simd_base<X>& x)
    {
        return impl::rounding_kernel<batch_type_t<X>>::trunc(x());
    }

    template <class X>
    inline batch_type_t<X> round(const simd_base<X>& x)
    {
        return impl::round_impl<batch_type_t<X>>::round(x());
    }

    // Contrary to their std counterparts, these functions
    // assume that the rounding mode is FE_TONEAREST
    template <class X>
    inline batch_type_t<X> nearbyint(const simd_base<X>& x)
    {
        return impl::rounding_kernel<batch_type_t<X>>::nearbyint(x());
    }

    template <class X>
    inline batch_type_t<X> rint(const simd_base<X>& x)
    {
        return impl::rint(x());
    }
}

#endif