1 //****************************************************************************//
2 //     Copyright (C) 2016-2018 Florent Hivert <Florent.Hivert@lri.fr>,        //
3 //                                                                            //
4 //  Distributed under the terms of the GNU General Public License (GPL)       //
5 //                                                                            //
6 //    This code is distributed in the hope that it will be useful,            //
7 //    but WITHOUT ANY WARRANTY; without even the implied warranty of          //
8 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU       //
9 //   General Public License for more details.                                 //
10 //                                                                            //
11 //  The full text of the GPL is available at:                                 //
12 //                                                                            //
13 //                  http://www.gnu.org/licenses/                              //
14 //****************************************************************************//
15 
16 #ifndef HPCOMBI_EPU_HPP_INCLUDED
17 #define HPCOMBI_EPU_HPP_INCLUDED
18 
19 #include <array>
20 #include <cassert>
21 #include <cstdint>
22 #include <functional>  // less<>, equal_to<>
23 #include <iomanip>
24 #include <ostream>
25 #include <x86intrin.h>
26 
27 #ifdef HPCOMBI_HAVE_CONFIG
28 #include "HPCombi-config.h"
29 #endif
30 
31 #if __cplusplus <= 201103L
32 #include "fallback/seq.hpp"
33 #endif
34 
35 #include "vect_generic.hpp"
36 
37 #ifdef HPCOMBI_CONSTEXPR_FUN_ARGS
38 #define HPCOMBI_CONSTEXPR constexpr
39 #define HPCOMBI_CONSTEXPR_CONSTRUCTOR constexpr
40 #else
41 #pragma message "Using a constexpr broken compiler ! "                         \
42                 "Performance may not be optimal"
43 #define HPCOMBI_CONSTEXPR const
44 #define HPCOMBI_CONSTEXPR_CONSTRUCTOR
45 #endif
46 
47 
48 namespace HPCombi {
49 
50 /// Unsigned 8 bits int constant.
operator ""_u8(unsigned long long arg)51 inline constexpr uint8_t operator"" _u8(unsigned long long arg) noexcept {
52     return static_cast<uint8_t>(arg);
53 }
54 
55 /// SIMD vector of 16 unsigned bytes
56 using epu8 = uint8_t __attribute__((vector_size(16)));
57 
58 static_assert(alignof(epu8) == 16,
59               "epu8 type is not properly aligned by the compiler !");
60 
61 /// SIMD vector of 32 unsigned bytes
62 using xpu8 = uint8_t __attribute__((vector_size(32)));
63 
64 static_assert(alignof(xpu8) == 32,
65               "xpu8 type is not properly aligned by the compiler !");
66 
67 
68 namespace {  // Implementation detail code
69 
70 /// A handmade C++11 constexpr lambda
71 template <typename T> struct ConstFun {
ConstFunHPCombi::__anon091622170111::ConstFun72     HPCOMBI_CONSTEXPR_CONSTRUCTOR ConstFun(T cc) : cst(cc) {}
operator ()HPCombi::__anon091622170111::ConstFun73     HPCOMBI_CONSTEXPR T operator()(T) const { return cst; }
74     /// constant value for constexpr lambda
75     T cst;
76 };
77 
78 /// Factory object for various SIMD constants in particular constexpr
79 template <class TPU> struct TPUBuild {
80 
81     using type_elem =
82         typename std::remove_reference<decltype((TPU{})[0])>::type;
83     static constexpr size_t size_elem = sizeof(type_elem);
84     static constexpr size_t size = sizeof(TPU) / size_elem;
85     using array = std::array<type_elem, size>;
86 
87     template <class Fun, std::size_t... Is>
make_helperHPCombi::__anon091622170111::TPUBuild88     static HPCOMBI_CONSTEXPR TPU make_helper(Fun f,
89                                              std::index_sequence<Is...>) {
90         return TPU{f(Is)...};
91     }
92 
operator ()HPCombi::__anon091622170111::TPUBuild93     inline TPU operator()(const std::initializer_list<type_elem> il,
94                           type_elem def) const {
95         assert(il.size() <= size);
96         array res;
97         std::copy(il.begin(), il.end(), res.begin());
98         std::fill(res.begin() + il.size(), res.end(), def);
99         return reinterpret_cast<const TPU &>(res);
100     }
101 
operator ()HPCombi::__anon091622170111::TPUBuild102     template <class Fun> inline HPCOMBI_CONSTEXPR TPU operator()(Fun f) const {
103         return make_helper(f, std::make_index_sequence<size>{});
104     }
105 
operator ()HPCombi::__anon091622170111::TPUBuild106     inline HPCOMBI_CONSTEXPR TPU operator()(type_elem c) const {
107         return make_helper(ConstFun<type_elem>(c),
108                            std::make_index_sequence<size>{});
109     }
110     // explicit overloading for int constants
operator ()HPCombi::__anon091622170111::TPUBuild111     inline HPCOMBI_CONSTEXPR TPU operator()(int c) const {
112         return operator()(uint8_t(c));
113     }
operator ()HPCombi::__anon091622170111::TPUBuild114     inline HPCOMBI_CONSTEXPR TPU operator()(size_t c) const {
115         return operator()(uint8_t(c));
116     }
117 };
118 
119 // The following functions should be constexpr lambdas writen directly in
120 // their corresponding methods. However until C++17, constexpr lambda are
121 // forbidden. So we put them here.
122 /// The image of i by the identity function
id_fun(uint8_t i)123 HPCOMBI_CONSTEXPR uint8_t id_fun(uint8_t i) { return i; }
124 /// The image of i by the left cycle function
left_cycle_fun(uint8_t i)125 HPCOMBI_CONSTEXPR uint8_t left_cycle_fun(uint8_t i) { return (i + 15) % 16; }
126 /// The image of i by the right cycle function
127 HPCOMBI_CONSTEXPR
right_cycle_fun(uint8_t i)128 uint8_t right_cycle_fun(uint8_t i) { return (i + 1) % 16; }
129 /// The image of i by a left shift duplicating the hole
130 HPCOMBI_CONSTEXPR
left_dup_fun(uint8_t i)131 uint8_t left_dup_fun(uint8_t i) { return i == 15 ? 15 : i + 1; }
132 /// The image of i by a right shift duplicating the hole
133 HPCOMBI_CONSTEXPR
right_dup_fun(uint8_t i)134 uint8_t right_dup_fun(uint8_t i) { return i == 0 ? 0 : i - 1; }
135 /// The complement of i to 15
136 HPCOMBI_CONSTEXPR
complement_fun(uint8_t i)137 uint8_t complement_fun(uint8_t i) { return 15 - i; }
popcount4_fun(uint8_t i)138 HPCOMBI_CONSTEXPR uint8_t popcount4_fun(uint8_t i) {
139     return ((i & 1) != 0 ? 1 : 0)
140         +  ((i & 2) != 0 ? 1 : 0)
141         +  ((i & 4) != 0 ? 1 : 0)
142         +  ((i & 8) != 0 ? 1 : 0);
143 }
144 
145 }  // Anonymous namespace
146 
147 
148 /// Factory object for various SIMD constants in particular constexpr
149 TPUBuild<epu8> Epu8;
150 
151 /// The indentity #HPCombi::epu8
152 HPCOMBI_CONSTEXPR epu8 epu8id = Epu8(id_fun);
153 /// The reverted identity #HPCombi::epu8
154 HPCOMBI_CONSTEXPR epu8 epu8rev = Epu8(complement_fun);
155 /// Left cycle #HPCombi::epu8 permutation
156 HPCOMBI_CONSTEXPR epu8 left_cycle = Epu8(left_cycle_fun);
157 /// Right cycle #HPCombi::epu8 permutation
158 HPCOMBI_CONSTEXPR epu8 right_cycle = Epu8(right_cycle_fun);
159 /// Left shift #HPCombi::epu8, duplicating the rightmost entry
160 HPCOMBI_CONSTEXPR epu8 left_dup = Epu8(left_dup_fun);
161 /// Right shift #HPCombi::epu8, duplicating the leftmost entry
162 HPCOMBI_CONSTEXPR epu8 right_dup = Epu8(right_dup_fun);
163 /// Popcount #HPCombi::epu8: the ith entry contains the number of bits set in i
164 HPCOMBI_CONSTEXPR epu8 popcount4 = Epu8(popcount4_fun);
165 
166 /** Cast a #HPCombi::epu8 to a c++ \c std::array
167  *
168  *  This is usually faster for algorithm using a lot of indexed acces.
169  */
as_array(epu8 & v)170 inline TPUBuild<epu8>::array &as_array(epu8 &v) {
171     return reinterpret_cast<typename TPUBuild<epu8>::array &>(v);
172 }
173 /** Cast a constant #HPCombi::epu8 to a C++ \c std::array
174  *
175  *  This is usually faster for algorithm using a lot of indexed acces.
176  */
as_array(const epu8 & v)177 inline const TPUBuild<epu8>::array &as_array(const epu8 &v) {
178     return reinterpret_cast<const typename TPUBuild<epu8>::array &>(v);
179 }
180 /** Cast a C++ \c std::array to a #HPCombi::epu8 */
181 // Passing the argument by reference triggers a segfault in gcc
182 // Since vector types doesn't belongs to the standard, I didn't manage
183 // to know if I'm using undefined behavior here.
from_array(TPUBuild<epu8>::array a)184 inline epu8 from_array(TPUBuild<epu8>::array a) {
185     return reinterpret_cast<const epu8 &>(a);
186 }
187 
188 /** Cast a #HPCombi::epu8 to a c++ #HPCombi::VectGeneric
189  *
190  *  This is usually faster for algorithm using a lot of indexed acces.
191  */
as_VectGeneric(epu8 & v)192 inline VectGeneric<16> &as_VectGeneric(epu8 &v) {
193     return reinterpret_cast<VectGeneric<16> &>(as_array(v));
194 }
195 
196 /** Cast a #HPCombi::epu8 to a c++ #HPCombi::VectGeneric
197  *
198  *  This is usually faster for algorithm using a lot of indexed acces.
199  */
as_VectGeneric(const epu8 & v)200 inline const VectGeneric<16> &as_VectGeneric(const epu8 &v) {
201     return reinterpret_cast<const VectGeneric<16> &>(as_array(v));
202 }
203 
204 /** Test whether all the entries of a #HPCombi::epu8 are zero */
is_all_zero(epu8 a)205 inline bool is_all_zero(epu8 a) { return _mm_testz_si128(a, a); }
206 /** Test whether all the entries of a #HPCombi::epu8 are one */
is_all_one(epu8 a)207 inline bool is_all_one(epu8 a) { return _mm_testc_si128(a, Epu8(0xFF)); }
208 
209 /** Equality of #HPCombi::epu8 */
equal(epu8 a,epu8 b)210 inline bool equal(epu8 a, epu8 b) { return is_all_zero(_mm_xor_si128(a, b)); }
211 /** Non equality of #HPCombi::epu8 */
not_equal(epu8 a,epu8 b)212 inline bool not_equal(epu8 a, epu8 b) { return not equal(a, b); }
213 
214 /** Permuting a #HPCombi::epu8 */
permuted(epu8 a,epu8 b)215 inline epu8 permuted(epu8 a, epu8 b) { return _mm_shuffle_epi8(a, b); }
216 /** Left shifted of a #HPCombi::epu8 inserting a 0
217  * @warning we use the convention that the 0 entry is on the left !
218  */
shifted_right(epu8 a)219 inline epu8 shifted_right(epu8 a) { return _mm_bslli_si128(a, 1); }
220 /** Right shifted of a #HPCombi::epu8 inserting a 0
221  * @warning we use the convention that the 0 entry is on the left !
222  */
shifted_left(epu8 a)223 inline epu8 shifted_left(epu8 a) { return _mm_bsrli_si128(a, 1); }
224 /** Reverting a #HPCombi::epu8 */
reverted(epu8 a)225 inline epu8 reverted(epu8 a) { return permuted(a, epu8rev); }
226 
227 /** Vector min between two #HPCombi::epu8 0 */
min(epu8 a,epu8 b)228 inline epu8 min(epu8 a, epu8 b) { return _mm_min_epu8(a, b); }
229 /** Vector max between two #HPCombi::epu8 0 */
max(epu8 a,epu8 b)230 inline epu8 max(epu8 a, epu8 b) { return _mm_max_epu8(a, b); }
231 
232 /** Testing if a #HPCombi::epu8 is sorted */
233 inline bool is_sorted(epu8 a);
234 /** Return a sorted #HPCombi::epu8
235  * @details
236  * @par Algorithm:
237  * Uses the 9 stages sorting network #sorting_rounds
238  */
239 inline epu8 sorted(epu8 a);
240 /** Return a #HPCombi::epu8 with the two half sorted
241  * @details
242  * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8
243  */
244 inline epu8 sorted8(epu8 a);
245 /** Return a reverse sorted #HPCombi::epu8
246  * @details
247  * @par Algorithm:
248  * Uses the 9 stages sorting network #sorting_rounds
249  */
250 inline epu8 revsorted(epu8 a);
251 /** Return a #HPCombi::epu8 with the two half reverse sorted
252  * @details
253  * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8
254  */
255 inline epu8 revsorted8(epu8 a);
256 
257 /** Sort \c this and return the sorting permutation
258  * @details
259  * @par Algorithm: Uses a 9 stages sorting network #sorting_rounds8
260  */
261 inline epu8 sort_perm(epu8 & a);
262 /** Sort \c this and return the sorting permutation
263  * @details
264  * @par Algorithm: Uses a 9 stages sorting network #sorting_rounds8
265  */
266 inline epu8 sort8_perm(epu8 & a);
267 
268 
269 /** Find if a vector is a permutation of one other
270  * @details
271  * @param a, b: two #HPCombi::epu8
272  * @returns a #HPCombi::epu8
273  * For each @f$0 \leq i < 16@f$, \c res[i] is the position in \c a of \c b[i]
274      if \c b[i] appears exactly once in \c a, or undefined if not.
275  */
276 inline epu8 permutation_of(epu8 a, epu8 b);
277 
278 /** A prime number good for hashing */
279 const uint64_t prime = 0x9e3779b97f4a7bb9;
280 
281 /** A random #HPCombi::epu8
282  * @details
283  * @param bnd : the upper bound for the value of the entries
284  * @returns a random #HPCombi::epu8 with value in the interval
285  *    @f$[0, 1, 2, ..., bnd-1]@f$.
286  */
287 inline epu8 random_epu8(uint16_t bnd);
288 
289 /** Remove duplicates in a sorted #HPCombi::epu8
290  * @details
291  * @param a: supposed to be sorted
292  * @param repl: the value replacing the duplicate entries (default to 0)
293  * @return a where repeated occurences of entries are replaced by \c repl
294  */
295 inline epu8 remove_dups(epu8 a, uint8_t repl = 0);
296 
297 /** @class common_horiz_sum
298  * @brief Horizontal sum of a  #HPCombi::epu8
299  * @details
300  * @returns the horizontal sum of the input
301  * @par Example:
302  * @code
303  * horiz_sum(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
304  * @endcode
305  * Returns `110`
306  * @warning The result is supposed to fit in a \c uint8_t
307  */
308 /** @copydoc common_horiz_sum
309  *  @par Algorithm:
310  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
311  */
312 inline uint8_t horiz_sum_ref(epu8);
313 /** @copydoc common_horiz_sum
314  *  @par Algorithm:
315  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
316  *  through #HPCombi::VectGeneric
317  */
318 inline uint8_t horiz_sum_gen(epu8);
319 /** @copydoc common_horiz_sum
320  *  @par Algorithm:
321  *  4-stages paralell algorithm
322  */
323 inline uint8_t horiz_sum4(epu8);
324 /** @copydoc common_horiz_sum
325  *  @par Algorithm:
326  *  3-stages paralell algorithm + indexed access
327  */
328 inline uint8_t horiz_sum3(epu8);
329 /** @copydoc common_horiz_sum */
horiz_sum(epu8 v)330 inline uint8_t horiz_sum(epu8 v) { return horiz_sum3(v); }
331 
332 /** @class common_partial_sums
333  * @brief Horizontal partial sum of a #HPCombi::epu8
334  * @details
335  * @returns the partials sums of the input
336  * @par Example:
337  * @code
338  * partial_sums(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
339  * @endcode
340  * Returns `{ 5,10,12,17,18,24,36,40,40,43,45,56,68,81,95,110}`
341  */
342 /** @copydoc common_partial_sums
343  *  @par Algorithm:
344  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
345  */
346 inline epu8 partial_sums_ref(epu8);
347 /** @copydoc common_partial_sums
348  *  @par Algorithm:
349  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
350  *  through #HPCombi::VectGeneric
351  */
352 inline epu8 partial_sums_gen(epu8);
353 /** @copydoc common_partial_sums
354  *  @par Algorithm:
355  *  4-stages paralell algorithm
356  */
357 inline epu8 partial_sums_round(epu8);
358 /** @copydoc common_partial_sums */
partial_sums(epu8 v)359 inline epu8 partial_sums(epu8 v) { return partial_sums_round(v); }
360 
361 
362 /** @class common_horiz_max
363  * @brief Horizontal sum of a  #HPCombi::epu8
364  * @details
365  * @returns the horizontal sum of the input
366  * @par Example:
367  * @code
368  * horiz_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2, 0,12, 0, 0, 0});
369  * @endcode
370  * Returns `12`
371  */
372 /** @copydoc common_horiz_max
373  *  @par Algorithm:
374  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
375  */
376 inline uint8_t horiz_max_ref(epu8);
377 /** @copydoc common_horiz_max
378  *  @par Algorithm:
379  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
380  *  through #HPCombi::VectGeneric
381  */
382 inline uint8_t horiz_max_gen(epu8);
383 /** @copydoc common_horiz_max
384  *  @par Algorithm:
385  *  4-stages paralell algorithm
386  */
387 inline uint8_t horiz_max4(epu8);
388 /** @copydoc common_horiz_max
389  *  @par Algorithm:
390  *  3-stages paralell algorithm + indexed access
391  */
392 inline uint8_t horiz_max3(epu8);
393 /** @copydoc common_horiz_max */
horiz_max(epu8 v)394 inline uint8_t horiz_max(epu8 v) { return horiz_max4(v); }
395 
396 /** @class common_partial_max
397  * @brief Horizontal partial sum of a #HPCombi::epu8
398  * @details
399  * @returns the partials max of the input
400  * @par Example:
401  * @code
402  * partial_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
403  * @endcode
404  * Returns `{ 5, 5, 5, 5, 5, 6,12,12,12,12,12,12,12,13,14,15}`
405  */
406 /** @copydoc common_partial_max
407  *  @par Algorithm:
408  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
409  */
410 inline epu8 partial_max_ref(epu8);
411 /** @copydoc common_partial_max
412  *  @par Algorithm:
413  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
414  *  through #HPCombi::VectGeneric
415  */
416 inline epu8 partial_max_gen(epu8);
417 /** @copydoc common_partial_max
418  *  @par Algorithm:
419  *  4-stages paralell algorithm
420  */
421 inline epu8 partial_max_round(epu8);
422 /** @copydoc common_partial_max */
partial_max(epu8 v)423 inline epu8 partial_max(epu8 v) { return partial_max_round(v); }
424 
425 
426 /** @class common_horiz_min
427  * @brief Horizontal sum of a  #HPCombi::epu8
428  * @details
429  * @returns the horizontal sum of the input
430  * @par Example:
431  * @code
432  * horiz_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 1, 3, 2, 2,12, 3, 4, 4});
433  * @endcode
434  * Returns `1`
435  */
436 /** @copydoc common_horiz_min
437  *  @par Algorithm:
438  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
439  */
440 inline uint8_t horiz_min_ref(epu8);
441 /** @copydoc common_horiz_min
442  *  @par Algorithm:
443  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
444  *  through #HPCombi::VectGeneric
445  */
446 inline uint8_t horiz_min_gen(epu8);
447 /** @copydoc common_horiz_min
448  *  @par Algorithm:
449  *  4-stages paralell algorithm
450  */
451 inline uint8_t horiz_min4(epu8);
452 /** @copydoc common_horiz_min
453  *  @par Algorithm:
454  *  3-stages paralell algorithm + indexed access
455  */
456 inline uint8_t horiz_min3(epu8);
457 /** @copydoc common_horiz_min */
horiz_min(epu8 v)458 inline uint8_t horiz_min(epu8 v) { return horiz_min4(v); }
459 
460 /** @class common_partial_min
461  * @brief Horizontal partial sum of a #HPCombi::epu8
462  * @details
463  * @returns the partials min of the input
464  * @par Example:
465  * @code
466  * partial_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
467  * @endcode
468  * Returns `{ 5, 5, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}`
469  */
470 /** @copydoc common_partial_min
471  *  @par Algorithm:
472  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
473  */
474 inline epu8 partial_min_ref(epu8);
475 /** @copydoc common_partial_min
476  *  @par Algorithm:
477  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
478  *  through #HPCombi::VectGeneric
479  */
480 inline epu8 partial_min_gen(epu8);
481 /** @copydoc common_partial_min
482  *  @par Algorithm:
483  *  4-stages paralell algorithm
484  */
485 inline epu8 partial_min_round(epu8);
486 /** @copydoc common_partial_min */
partial_min(epu8 v)487 inline epu8 partial_min(epu8 v) { return partial_min_round(v); }
488 
489 
490 /** @class common_eval16
491  * @brief Evaluation of a #HPCombi::epu8
492  * @details
493  * @param v : a #HPCombi::epu8
494  * @returns the evaluation, that is the #HPCombi::epu8 \c r such that
495  *     \c r[i] is the number of occurrence of \c i in the input \c v
496  * @par Example:
497  * @code
498  * eval16(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
499  * @endcode
500  * Returns `{ 1, 1, 2, 1, 1, 3, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1}`
501  * @warning The entries larger than 15 are ignored
502  */
503 /** @copydoc common_eval16
504  *  @par Algorithm:
505  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
506  */
507 inline epu8 eval16_ref(epu8 v);
508 /** @copydoc common_eval16
509  *  @par Algorithm:
510  *  Reference @f$O(n)@f$ algorithm using loop and cast to array
511  */
512 inline epu8 eval16_arr(epu8 v);
513 /** @copydoc common_eval16
514  *  @par Algorithm:
515  *  Vector @f$O(n)@f$ using cyclic shifting
516  */
517 inline epu8 eval16_cycle(epu8 v);
518 /** @copydoc common_eval16
519  *  @par Algorithm:
520  *  Vector @f$O(n)@f$ using popcount
521  */
522 inline epu8 eval16_popcount(epu8 v);
523 /** @copydoc common_eval16 */
eval16(epu8 v)524 inline epu8 eval16(epu8 v) { return eval16_cycle(v); };
525 
526 /** @class common_first_diff
527  * @brief The first difference between two #HPCombi::epu8
528  * @details
529  * @param a, b : two #HPCombi::epu8
530  * @param bound : a \c size_t
531  * @returns the smallest index @f$i<bound@f$ such that \c a[i] and \c b[i]
532  * differ, 16 if there is no differences before bound.
533  * @par Example:
534  * @code
535  * epu8 a { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15};
536  * epu8 b { 5, 5, 2, 9, 1, 6,12, 4, 0, 4, 4, 4,12,13,14,15};
537  * @endcode
538  * then `first_diff(a, b)` returns `3`,
539  * `first_diff(a, b, 3)` returns `16`,
540  * `first_diff(a, b, 4)` returns `3`,
541  * `first_diff(a, b, 7)` returns `3`.
542  * @warning `bound` is assumed to be smaller or equal than 16
543  */
544 /** @copydoc common_first_diff
545  *  @par Algorithm:
546  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
547  */
548 inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16);
549 /** @copydoc common_first_diff
550  *  @par Algorithm:
551  *  Using \c cmpestri instruction
552  */
553 inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
554 /** @copydoc common_first_diff
555  *  @par Algorithm:
556  *  Using vector comparison and mask
557  */
558 inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound = 16);
559 /** @copydoc common_first_diff */
first_diff(epu8 a,epu8 b,size_t bound=16)560 inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) {
561     return first_diff_mask(a, b, bound);
562 }
563 
564 /** @class common_last_diff
565  * @brief The last difference between two #HPCombi::epu8
566  * @details
567  * @param a, b : two #HPCombi::epu8
568  * @param bound : a \c size_t
569  * @returns the largest index @f$i<bound@f$ such that \c a[i] and \c b[i]
570  * differ, 16 if there is no differences before bound.
571  * @par Example:
572  * @code
573  * epu8 a { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15};
574  * epu8 b { 5, 5, 2, 9, 1, 6,12, 4, 0, 4, 4, 4,12,13,14,15};
575  * @endcode
576  * then `last_diff(a, b)` returns `11`,
577  * `last_diff(a, b, 3)` returns `16`,
578  * `last_diff(a, b, 4)` returns `3`,
579  * `last_diff(a, b, 7)` returns `3`.
580  * @warning `bound` is assumed to be smaller or equal than 16
581  */
582 /** @copydoc common_last_diff
583  *  @par Algorithm:
584  *  Reference @f$O(n)@f$ algorithm using loop and indexed access
585  */
586 inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16);
587 /** @copydoc common_last_diff
588  *  @par Algorithm:
589  *  Using \c cmpestri instruction
590  */
591 inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
592 /** @copydoc common_last_diff
593  *  @par Algorithm:
594  *  Using vector comparison and mask
595  */
596 inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound = 16);
597 /** @copydoc common_last_diff */
last_diff(epu8 a,epu8 b,size_t bound=16)598 inline uint64_t last_diff(epu8 a, epu8 b, size_t bound = 16) {
599     return last_diff_mask(a, b, bound);
600 }
601 
602 /** Lexicographic comparison between two #HPCombi::epu8 */
603 inline bool less(epu8 a, epu8 b);
604 /** Partial lexicographic comparison between two #HPCombi::epu8
605  * @param a, b : the vectors to compare
606  * @param k : the bound for the lexicographic comparison
607  * @return a positive, negative or zero char depending on the result
608  */
609 inline char less_partial(epu8 a, epu8 b, int k);
610 
611 /** return the index of the first zero entry or 16 if there are none
612  *  Only index smaller than bound are taken into account.
613  */
614 inline uint64_t first_zero(epu8 v, int bnd);
615 /** return the index of the last zero entry or 16 if there are none
616  *  Only index smaller than bound are taken into account.
617  */
618 inline uint64_t last_zero(epu8 v, int bnd);
619 /** return the index of the first non zero entry or 16 if there are none
620  *  Only index smaller than bound are taken into account.
621  */
622 inline uint64_t first_non_zero(epu8 v, int bnd);
623 /** return the index of the last non zero entry or 16 if there are none
624  *  Only index smaller than bound are taken into account.
625  */
626 inline uint64_t last_non_zero(epu8 v, int bnd);
627 
628 /** a vector popcount function
629  */
630 inline epu8 popcount16(epu8 v);
631 
632 /** Test for partial transformation
633  * @details
634  * @returns whether \c v is a partial transformation.
635  * @param v the vector to test
636  * @param k the size of \c *this (default 16)
637  *
638  * Points where the function is undefined are mapped to \c 0xff. If \c *this
639  * is a tranformation of @f$0\dots n-1@f$ for @f$n<16@f$, it should be completed
640  * to a transformation of @f$0\dots 15@f$ by adding fixed points. That is the
641  * values @f$i\geq n@f$ should be mapped to themself.
642  * @par Example:
643  * The partial tranformation
644  * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 . . 4 \end{matrix}@f$
645  * is encoded by the array {2,0,5,0xff,0xff,4,6,7,8,9,10,11,12,13,14,15}
646  */
647 inline bool is_partial_transformation(epu8 v, const size_t k = 16);
648 
649 /** Test for transformation
650  * @details
651  * @returns whether \c *this is a transformation.
652  * @param v the vector to test
653  * @param k the size of \c *this (default 16)
654  *
655  * If \c *this is a tranformation of @f$0\dots n-1@f$ for @f$n<16@f$,
656  * it should be completed to a transformation of @f$0\dots 15@f$
657  * by adding fixed points. That is the values @f$i\geq n@f$ should be
658  * mapped to themself.
659  * @par Example:
660  * The tranformation
661  * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 2 1 4 \end{matrix}@f$
662  * is encoded by the array {2,0,5,2,1,4,6,7,8,9,10,11,12,13,14,15}
663  */
664 inline bool is_transformation(epu8 v, const size_t k = 16);
665 
666 /** Test for partial permutations
667  * @details
668  * @returns whether \c *this is a partial permutation.
669  * @param v the vector to test
670  * @param k the size of \c *this (default 16)
671  *
672  * Points where the function is undefined are mapped to \c 0xff.
673  * If \c *this is a partial permutation of @f$0\dots n-1@f$ for @f$n<16@f$,
674  * it should be completed to a partial permutation of @f$0\dots 15@f$
675  * by adding fixed points. That is the values @f$i\geq n@f$ should be
676  * mapped to themself.
677  * @par Example:
678  * The permutation
679  * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 . . 4 \end{matrix}@f$
680  * is encoded by the array {2,0,5,0xFF,0xFF,4,6,7,8,9,10,11,12,13,14,15}
681  */
682 inline bool is_partial_permutation(epu8 v, const size_t k = 16);
683 
684 /** Test for permutations
685  * @details
686  * @returns whether \c *this is a permutation.
687  * @param v the vector to test
688  * @param k the size of \c *this (default 16)
689  *
690  * If \c *this is a permutation of @f$0\dots n-1@f$ for @f$n<16@f$,
691  * it should be completed to a permutation of @f$0\dots 15@f$
692  * by adding fixed points. That is the values @f$i\geq n@f$ should be
693  * mapped to themself.
694  * @par Example:
695  * The permutation
696  * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 3 1 4 \end{matrix}@f$
697  * is encoded by the array {2,0,5,3,1,4,6,7,8,9,10,11,12,13,14,15}
698  */
699 inline bool is_permutation(epu8 v, const size_t k = 16);
700 
701 }  // namespace HPCombi
702 
703 namespace std {
704 
705 inline std::ostream &operator<<(std::ostream &stream, HPCombi::epu8 const &a);
706 
707 /** We also specialize the struct
708  *  - std::equal_to<epu8>
709  *  - std::not_equal_to<epu8>
710  *  - std::hash<epu8>
711  *  - std::less<epu8>
712  */
713 }
714 
715 #include "epu_impl.hpp"
716 
717 #endif  // HPCOMBI_EPU_HPP_INCLUDED
718