//****************************************************************************// // Copyright (C) 2016-2018 Florent Hivert , // // // // Distributed under the terms of the GNU General Public License (GPL) // // // // This code is distributed in the hope that it will be useful, // // but WITHOUT ANY WARRANTY; without even the implied warranty of // // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU // // General Public License for more details. // // // // The full text of the GPL is available at: // // // // http://www.gnu.org/licenses/ // //****************************************************************************// #ifndef HPCOMBI_EPU_HPP_INCLUDED #define HPCOMBI_EPU_HPP_INCLUDED #include #include #include #include // less<>, equal_to<> #include #include #include #ifdef HPCOMBI_HAVE_CONFIG #include "HPCombi-config.h" #endif #if __cplusplus <= 201103L #include "fallback/seq.hpp" #endif #include "vect_generic.hpp" #ifdef HPCOMBI_CONSTEXPR_FUN_ARGS #define HPCOMBI_CONSTEXPR constexpr #define HPCOMBI_CONSTEXPR_CONSTRUCTOR constexpr #else #pragma message "Using a constexpr broken compiler ! " \ "Performance may not be optimal" #define HPCOMBI_CONSTEXPR const #define HPCOMBI_CONSTEXPR_CONSTRUCTOR #endif namespace HPCombi { /// Unsigned 8 bits int constant. inline constexpr uint8_t operator"" _u8(unsigned long long arg) noexcept { return static_cast(arg); } /// SIMD vector of 16 unsigned bytes using epu8 = uint8_t __attribute__((vector_size(16))); static_assert(alignof(epu8) == 16, "epu8 type is not properly aligned by the compiler !"); /// SIMD vector of 32 unsigned bytes using xpu8 = uint8_t __attribute__((vector_size(32))); static_assert(alignof(xpu8) == 32, "xpu8 type is not properly aligned by the compiler !"); namespace { // Implementation detail code /// A handmade C++11 constexpr lambda template struct ConstFun { HPCOMBI_CONSTEXPR_CONSTRUCTOR ConstFun(T cc) : cst(cc) {} HPCOMBI_CONSTEXPR T operator()(T) const { return cst; } /// constant value for constexpr lambda T cst; }; /// Factory object for various SIMD constants in particular constexpr template struct TPUBuild { using type_elem = typename std::remove_reference::type; static constexpr size_t size_elem = sizeof(type_elem); static constexpr size_t size = sizeof(TPU) / size_elem; using array = std::array; template static HPCOMBI_CONSTEXPR TPU make_helper(Fun f, std::index_sequence) { return TPU{f(Is)...}; } inline TPU operator()(const std::initializer_list il, type_elem def) const { assert(il.size() <= size); array res; std::copy(il.begin(), il.end(), res.begin()); std::fill(res.begin() + il.size(), res.end(), def); return reinterpret_cast(res); } template inline HPCOMBI_CONSTEXPR TPU operator()(Fun f) const { return make_helper(f, std::make_index_sequence{}); } inline HPCOMBI_CONSTEXPR TPU operator()(type_elem c) const { return make_helper(ConstFun(c), std::make_index_sequence{}); } // explicit overloading for int constants inline HPCOMBI_CONSTEXPR TPU operator()(int c) const { return operator()(uint8_t(c)); } inline HPCOMBI_CONSTEXPR TPU operator()(size_t c) const { return operator()(uint8_t(c)); } }; // The following functions should be constexpr lambdas writen directly in // their corresponding methods. However until C++17, constexpr lambda are // forbidden. So we put them here. /// The image of i by the identity function HPCOMBI_CONSTEXPR uint8_t id_fun(uint8_t i) { return i; } /// The image of i by the left cycle function HPCOMBI_CONSTEXPR uint8_t left_cycle_fun(uint8_t i) { return (i + 15) % 16; } /// The image of i by the right cycle function HPCOMBI_CONSTEXPR uint8_t right_cycle_fun(uint8_t i) { return (i + 1) % 16; } /// The image of i by a left shift duplicating the hole HPCOMBI_CONSTEXPR uint8_t left_dup_fun(uint8_t i) { return i == 15 ? 15 : i + 1; } /// The image of i by a right shift duplicating the hole HPCOMBI_CONSTEXPR uint8_t right_dup_fun(uint8_t i) { return i == 0 ? 0 : i - 1; } /// The complement of i to 15 HPCOMBI_CONSTEXPR uint8_t complement_fun(uint8_t i) { return 15 - i; } HPCOMBI_CONSTEXPR uint8_t popcount4_fun(uint8_t i) { return ((i & 1) != 0 ? 1 : 0) + ((i & 2) != 0 ? 1 : 0) + ((i & 4) != 0 ? 1 : 0) + ((i & 8) != 0 ? 1 : 0); } } // Anonymous namespace /// Factory object for various SIMD constants in particular constexpr TPUBuild Epu8; /// The indentity #HPCombi::epu8 HPCOMBI_CONSTEXPR epu8 epu8id = Epu8(id_fun); /// The reverted identity #HPCombi::epu8 HPCOMBI_CONSTEXPR epu8 epu8rev = Epu8(complement_fun); /// Left cycle #HPCombi::epu8 permutation HPCOMBI_CONSTEXPR epu8 left_cycle = Epu8(left_cycle_fun); /// Right cycle #HPCombi::epu8 permutation HPCOMBI_CONSTEXPR epu8 right_cycle = Epu8(right_cycle_fun); /// Left shift #HPCombi::epu8, duplicating the rightmost entry HPCOMBI_CONSTEXPR epu8 left_dup = Epu8(left_dup_fun); /// Right shift #HPCombi::epu8, duplicating the leftmost entry HPCOMBI_CONSTEXPR epu8 right_dup = Epu8(right_dup_fun); /// Popcount #HPCombi::epu8: the ith entry contains the number of bits set in i HPCOMBI_CONSTEXPR epu8 popcount4 = Epu8(popcount4_fun); /** Cast a #HPCombi::epu8 to a c++ \c std::array * * This is usually faster for algorithm using a lot of indexed acces. */ inline TPUBuild::array &as_array(epu8 &v) { return reinterpret_cast::array &>(v); } /** Cast a constant #HPCombi::epu8 to a C++ \c std::array * * This is usually faster for algorithm using a lot of indexed acces. */ inline const TPUBuild::array &as_array(const epu8 &v) { return reinterpret_cast::array &>(v); } /** Cast a C++ \c std::array to a #HPCombi::epu8 */ // Passing the argument by reference triggers a segfault in gcc // Since vector types doesn't belongs to the standard, I didn't manage // to know if I'm using undefined behavior here. inline epu8 from_array(TPUBuild::array a) { return reinterpret_cast(a); } /** Cast a #HPCombi::epu8 to a c++ #HPCombi::VectGeneric * * This is usually faster for algorithm using a lot of indexed acces. */ inline VectGeneric<16> &as_VectGeneric(epu8 &v) { return reinterpret_cast &>(as_array(v)); } /** Cast a #HPCombi::epu8 to a c++ #HPCombi::VectGeneric * * This is usually faster for algorithm using a lot of indexed acces. */ inline const VectGeneric<16> &as_VectGeneric(const epu8 &v) { return reinterpret_cast &>(as_array(v)); } /** Test whether all the entries of a #HPCombi::epu8 are zero */ inline bool is_all_zero(epu8 a) { return _mm_testz_si128(a, a); } /** Test whether all the entries of a #HPCombi::epu8 are one */ inline bool is_all_one(epu8 a) { return _mm_testc_si128(a, Epu8(0xFF)); } /** Equality of #HPCombi::epu8 */ inline bool equal(epu8 a, epu8 b) { return is_all_zero(_mm_xor_si128(a, b)); } /** Non equality of #HPCombi::epu8 */ inline bool not_equal(epu8 a, epu8 b) { return not equal(a, b); } /** Permuting a #HPCombi::epu8 */ inline epu8 permuted(epu8 a, epu8 b) { return _mm_shuffle_epi8(a, b); } /** Left shifted of a #HPCombi::epu8 inserting a 0 * @warning we use the convention that the 0 entry is on the left ! */ inline epu8 shifted_right(epu8 a) { return _mm_bslli_si128(a, 1); } /** Right shifted of a #HPCombi::epu8 inserting a 0 * @warning we use the convention that the 0 entry is on the left ! */ inline epu8 shifted_left(epu8 a) { return _mm_bsrli_si128(a, 1); } /** Reverting a #HPCombi::epu8 */ inline epu8 reverted(epu8 a) { return permuted(a, epu8rev); } /** Vector min between two #HPCombi::epu8 0 */ inline epu8 min(epu8 a, epu8 b) { return _mm_min_epu8(a, b); } /** Vector max between two #HPCombi::epu8 0 */ inline epu8 max(epu8 a, epu8 b) { return _mm_max_epu8(a, b); } /** Testing if a #HPCombi::epu8 is sorted */ inline bool is_sorted(epu8 a); /** Return a sorted #HPCombi::epu8 * @details * @par Algorithm: * Uses the 9 stages sorting network #sorting_rounds */ inline epu8 sorted(epu8 a); /** Return a #HPCombi::epu8 with the two half sorted * @details * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8 */ inline epu8 sorted8(epu8 a); /** Return a reverse sorted #HPCombi::epu8 * @details * @par Algorithm: * Uses the 9 stages sorting network #sorting_rounds */ inline epu8 revsorted(epu8 a); /** Return a #HPCombi::epu8 with the two half reverse sorted * @details * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8 */ inline epu8 revsorted8(epu8 a); /** Sort \c this and return the sorting permutation * @details * @par Algorithm: Uses a 9 stages sorting network #sorting_rounds8 */ inline epu8 sort_perm(epu8 & a); /** Sort \c this and return the sorting permutation * @details * @par Algorithm: Uses a 9 stages sorting network #sorting_rounds8 */ inline epu8 sort8_perm(epu8 & a); /** Find if a vector is a permutation of one other * @details * @param a, b: two #HPCombi::epu8 * @returns a #HPCombi::epu8 * For each @f$0 \leq i < 16@f$, \c res[i] is the position in \c a of \c b[i] if \c b[i] appears exactly once in \c a, or undefined if not. */ inline epu8 permutation_of(epu8 a, epu8 b); /** A prime number good for hashing */ const uint64_t prime = 0x9e3779b97f4a7bb9; /** A random #HPCombi::epu8 * @details * @param bnd : the upper bound for the value of the entries * @returns a random #HPCombi::epu8 with value in the interval * @f$[0, 1, 2, ..., bnd-1]@f$. */ inline epu8 random_epu8(uint16_t bnd); /** Remove duplicates in a sorted #HPCombi::epu8 * @details * @param a: supposed to be sorted * @param repl: the value replacing the duplicate entries (default to 0) * @return a where repeated occurences of entries are replaced by \c repl */ inline epu8 remove_dups(epu8 a, uint8_t repl = 0); /** @class common_horiz_sum * @brief Horizontal sum of a #HPCombi::epu8 * @details * @returns the horizontal sum of the input * @par Example: * @code * horiz_sum(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15}); * @endcode * Returns `110` * @warning The result is supposed to fit in a \c uint8_t */ /** @copydoc common_horiz_sum * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access */ inline uint8_t horiz_sum_ref(epu8); /** @copydoc common_horiz_sum * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access * through #HPCombi::VectGeneric */ inline uint8_t horiz_sum_gen(epu8); /** @copydoc common_horiz_sum * @par Algorithm: * 4-stages paralell algorithm */ inline uint8_t horiz_sum4(epu8); /** @copydoc common_horiz_sum * @par Algorithm: * 3-stages paralell algorithm + indexed access */ inline uint8_t horiz_sum3(epu8); /** @copydoc common_horiz_sum */ inline uint8_t horiz_sum(epu8 v) { return horiz_sum3(v); } /** @class common_partial_sums * @brief Horizontal partial sum of a #HPCombi::epu8 * @details * @returns the partials sums of the input * @par Example: * @code * partial_sums(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15}); * @endcode * Returns `{ 5,10,12,17,18,24,36,40,40,43,45,56,68,81,95,110}` */ /** @copydoc common_partial_sums * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access */ inline epu8 partial_sums_ref(epu8); /** @copydoc common_partial_sums * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access * through #HPCombi::VectGeneric */ inline epu8 partial_sums_gen(epu8); /** @copydoc common_partial_sums * @par Algorithm: * 4-stages paralell algorithm */ inline epu8 partial_sums_round(epu8); /** @copydoc common_partial_sums */ inline epu8 partial_sums(epu8 v) { return partial_sums_round(v); } /** @class common_horiz_max * @brief Horizontal sum of a #HPCombi::epu8 * @details * @returns the horizontal sum of the input * @par Example: * @code * horiz_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2, 0,12, 0, 0, 0}); * @endcode * Returns `12` */ /** @copydoc common_horiz_max * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access */ inline uint8_t horiz_max_ref(epu8); /** @copydoc common_horiz_max * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access * through #HPCombi::VectGeneric */ inline uint8_t horiz_max_gen(epu8); /** @copydoc common_horiz_max * @par Algorithm: * 4-stages paralell algorithm */ inline uint8_t horiz_max4(epu8); /** @copydoc common_horiz_max * @par Algorithm: * 3-stages paralell algorithm + indexed access */ inline uint8_t horiz_max3(epu8); /** @copydoc common_horiz_max */ inline uint8_t horiz_max(epu8 v) { return horiz_max4(v); } /** @class common_partial_max * @brief Horizontal partial sum of a #HPCombi::epu8 * @details * @returns the partials max of the input * @par Example: * @code * partial_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15}); * @endcode * Returns `{ 5, 5, 5, 5, 5, 6,12,12,12,12,12,12,12,13,14,15}` */ /** @copydoc common_partial_max * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access */ inline epu8 partial_max_ref(epu8); /** @copydoc common_partial_max * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access * through #HPCombi::VectGeneric */ inline epu8 partial_max_gen(epu8); /** @copydoc common_partial_max * @par Algorithm: * 4-stages paralell algorithm */ inline epu8 partial_max_round(epu8); /** @copydoc common_partial_max */ inline epu8 partial_max(epu8 v) { return partial_max_round(v); } /** @class common_horiz_min * @brief Horizontal sum of a #HPCombi::epu8 * @details * @returns the horizontal sum of the input * @par Example: * @code * horiz_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 1, 3, 2, 2,12, 3, 4, 4}); * @endcode * Returns `1` */ /** @copydoc common_horiz_min * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access */ inline uint8_t horiz_min_ref(epu8); /** @copydoc common_horiz_min * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access * through #HPCombi::VectGeneric */ inline uint8_t horiz_min_gen(epu8); /** @copydoc common_horiz_min * @par Algorithm: * 4-stages paralell algorithm */ inline uint8_t horiz_min4(epu8); /** @copydoc common_horiz_min * @par Algorithm: * 3-stages paralell algorithm + indexed access */ inline uint8_t horiz_min3(epu8); /** @copydoc common_horiz_min */ inline uint8_t horiz_min(epu8 v) { return horiz_min4(v); } /** @class common_partial_min * @brief Horizontal partial sum of a #HPCombi::epu8 * @details * @returns the partials min of the input * @par Example: * @code * partial_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15}); * @endcode * Returns `{ 5, 5, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}` */ /** @copydoc common_partial_min * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access */ inline epu8 partial_min_ref(epu8); /** @copydoc common_partial_min * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access * through #HPCombi::VectGeneric */ inline epu8 partial_min_gen(epu8); /** @copydoc common_partial_min * @par Algorithm: * 4-stages paralell algorithm */ inline epu8 partial_min_round(epu8); /** @copydoc common_partial_min */ inline epu8 partial_min(epu8 v) { return partial_min_round(v); } /** @class common_eval16 * @brief Evaluation of a #HPCombi::epu8 * @details * @param v : a #HPCombi::epu8 * @returns the evaluation, that is the #HPCombi::epu8 \c r such that * \c r[i] is the number of occurrence of \c i in the input \c v * @par Example: * @code * eval16(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15}); * @endcode * Returns `{ 1, 1, 2, 1, 1, 3, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1}` * @warning The entries larger than 15 are ignored */ /** @copydoc common_eval16 * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and indexed access */ inline epu8 eval16_ref(epu8 v); /** @copydoc common_eval16 * @par Algorithm: * Reference @f$O(n)@f$ algorithm using loop and cast to array */ inline epu8 eval16_arr(epu8 v); /** @copydoc common_eval16 * @par Algorithm: * Vector @f$O(n)@f$ using cyclic shifting */ inline epu8 eval16_cycle(epu8 v); /** @copydoc common_eval16 * @par Algorithm: * Vector @f$O(n)@f$ using popcount */ inline epu8 eval16_popcount(epu8 v); /** @copydoc common_eval16 */ inline epu8 eval16(epu8 v) { return eval16_cycle(v); }; /** @class common_first_diff * @brief The first difference between two #HPCombi::epu8 * @details * @param a, b : two #HPCombi::epu8 * @param bound : a \c size_t * @returns the smallest index @f$i * - std::not_equal_to * - std::hash * - std::less */ } #include "epu_impl.hpp" #endif // HPCOMBI_EPU_HPP_INCLUDED