1 //****************************************************************************//
2 // Copyright (C) 2016-2018 Florent Hivert <Florent.Hivert@lri.fr>, //
3 // //
4 // Distributed under the terms of the GNU General Public License (GPL) //
5 // //
6 // This code is distributed in the hope that it will be useful, //
7 // but WITHOUT ANY WARRANTY; without even the implied warranty of //
8 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU //
9 // General Public License for more details. //
10 // //
11 // The full text of the GPL is available at: //
12 // //
13 // http://www.gnu.org/licenses/ //
14 //****************************************************************************//
15
16 #ifndef HPCOMBI_EPU_HPP_INCLUDED
17 #define HPCOMBI_EPU_HPP_INCLUDED
18
19 #include <array>
20 #include <cassert>
21 #include <cstdint>
22 #include <functional> // less<>, equal_to<>
23 #include <iomanip>
24 #include <ostream>
25 #include <x86intrin.h>
26
27 #ifdef HPCOMBI_HAVE_CONFIG
28 #include "HPCombi-config.h"
29 #endif
30
31 #if __cplusplus <= 201103L
32 #include "fallback/seq.hpp"
33 #endif
34
35 #include "vect_generic.hpp"
36
37 #ifdef HPCOMBI_CONSTEXPR_FUN_ARGS
38 #define HPCOMBI_CONSTEXPR constexpr
39 #define HPCOMBI_CONSTEXPR_CONSTRUCTOR constexpr
40 #else
41 #pragma message "Using a constexpr broken compiler ! " \
42 "Performance may not be optimal"
43 #define HPCOMBI_CONSTEXPR const
44 #define HPCOMBI_CONSTEXPR_CONSTRUCTOR
45 #endif
46
47
48 namespace HPCombi {
49
50 /// Unsigned 8 bits int constant.
operator ""_u8(unsigned long long arg)51 inline constexpr uint8_t operator"" _u8(unsigned long long arg) noexcept {
52 return static_cast<uint8_t>(arg);
53 }
54
55 /// SIMD vector of 16 unsigned bytes
56 using epu8 = uint8_t __attribute__((vector_size(16)));
57
58 static_assert(alignof(epu8) == 16,
59 "epu8 type is not properly aligned by the compiler !");
60
61 /// SIMD vector of 32 unsigned bytes
62 using xpu8 = uint8_t __attribute__((vector_size(32)));
63
64 static_assert(alignof(xpu8) == 32,
65 "xpu8 type is not properly aligned by the compiler !");
66
67
68 namespace { // Implementation detail code
69
70 /// A handmade C++11 constexpr lambda
71 template <typename T> struct ConstFun {
ConstFunHPCombi::__anon091622170111::ConstFun72 HPCOMBI_CONSTEXPR_CONSTRUCTOR ConstFun(T cc) : cst(cc) {}
operator ()HPCombi::__anon091622170111::ConstFun73 HPCOMBI_CONSTEXPR T operator()(T) const { return cst; }
74 /// constant value for constexpr lambda
75 T cst;
76 };
77
78 /// Factory object for various SIMD constants in particular constexpr
79 template <class TPU> struct TPUBuild {
80
81 using type_elem =
82 typename std::remove_reference<decltype((TPU{})[0])>::type;
83 static constexpr size_t size_elem = sizeof(type_elem);
84 static constexpr size_t size = sizeof(TPU) / size_elem;
85 using array = std::array<type_elem, size>;
86
87 template <class Fun, std::size_t... Is>
make_helperHPCombi::__anon091622170111::TPUBuild88 static HPCOMBI_CONSTEXPR TPU make_helper(Fun f,
89 std::index_sequence<Is...>) {
90 return TPU{f(Is)...};
91 }
92
operator ()HPCombi::__anon091622170111::TPUBuild93 inline TPU operator()(const std::initializer_list<type_elem> il,
94 type_elem def) const {
95 assert(il.size() <= size);
96 array res;
97 std::copy(il.begin(), il.end(), res.begin());
98 std::fill(res.begin() + il.size(), res.end(), def);
99 return reinterpret_cast<const TPU &>(res);
100 }
101
operator ()HPCombi::__anon091622170111::TPUBuild102 template <class Fun> inline HPCOMBI_CONSTEXPR TPU operator()(Fun f) const {
103 return make_helper(f, std::make_index_sequence<size>{});
104 }
105
operator ()HPCombi::__anon091622170111::TPUBuild106 inline HPCOMBI_CONSTEXPR TPU operator()(type_elem c) const {
107 return make_helper(ConstFun<type_elem>(c),
108 std::make_index_sequence<size>{});
109 }
110 // explicit overloading for int constants
operator ()HPCombi::__anon091622170111::TPUBuild111 inline HPCOMBI_CONSTEXPR TPU operator()(int c) const {
112 return operator()(uint8_t(c));
113 }
operator ()HPCombi::__anon091622170111::TPUBuild114 inline HPCOMBI_CONSTEXPR TPU operator()(size_t c) const {
115 return operator()(uint8_t(c));
116 }
117 };
118
119 // The following functions should be constexpr lambdas writen directly in
120 // their corresponding methods. However until C++17, constexpr lambda are
121 // forbidden. So we put them here.
122 /// The image of i by the identity function
id_fun(uint8_t i)123 HPCOMBI_CONSTEXPR uint8_t id_fun(uint8_t i) { return i; }
124 /// The image of i by the left cycle function
left_cycle_fun(uint8_t i)125 HPCOMBI_CONSTEXPR uint8_t left_cycle_fun(uint8_t i) { return (i + 15) % 16; }
126 /// The image of i by the right cycle function
127 HPCOMBI_CONSTEXPR
right_cycle_fun(uint8_t i)128 uint8_t right_cycle_fun(uint8_t i) { return (i + 1) % 16; }
129 /// The image of i by a left shift duplicating the hole
130 HPCOMBI_CONSTEXPR
left_dup_fun(uint8_t i)131 uint8_t left_dup_fun(uint8_t i) { return i == 15 ? 15 : i + 1; }
132 /// The image of i by a right shift duplicating the hole
133 HPCOMBI_CONSTEXPR
right_dup_fun(uint8_t i)134 uint8_t right_dup_fun(uint8_t i) { return i == 0 ? 0 : i - 1; }
135 /// The complement of i to 15
136 HPCOMBI_CONSTEXPR
complement_fun(uint8_t i)137 uint8_t complement_fun(uint8_t i) { return 15 - i; }
popcount4_fun(uint8_t i)138 HPCOMBI_CONSTEXPR uint8_t popcount4_fun(uint8_t i) {
139 return ((i & 1) != 0 ? 1 : 0)
140 + ((i & 2) != 0 ? 1 : 0)
141 + ((i & 4) != 0 ? 1 : 0)
142 + ((i & 8) != 0 ? 1 : 0);
143 }
144
145 } // Anonymous namespace
146
147
148 /// Factory object for various SIMD constants in particular constexpr
149 TPUBuild<epu8> Epu8;
150
151 /// The indentity #HPCombi::epu8
152 HPCOMBI_CONSTEXPR epu8 epu8id = Epu8(id_fun);
153 /// The reverted identity #HPCombi::epu8
154 HPCOMBI_CONSTEXPR epu8 epu8rev = Epu8(complement_fun);
155 /// Left cycle #HPCombi::epu8 permutation
156 HPCOMBI_CONSTEXPR epu8 left_cycle = Epu8(left_cycle_fun);
157 /// Right cycle #HPCombi::epu8 permutation
158 HPCOMBI_CONSTEXPR epu8 right_cycle = Epu8(right_cycle_fun);
159 /// Left shift #HPCombi::epu8, duplicating the rightmost entry
160 HPCOMBI_CONSTEXPR epu8 left_dup = Epu8(left_dup_fun);
161 /// Right shift #HPCombi::epu8, duplicating the leftmost entry
162 HPCOMBI_CONSTEXPR epu8 right_dup = Epu8(right_dup_fun);
163 /// Popcount #HPCombi::epu8: the ith entry contains the number of bits set in i
164 HPCOMBI_CONSTEXPR epu8 popcount4 = Epu8(popcount4_fun);
165
166 /** Cast a #HPCombi::epu8 to a c++ \c std::array
167 *
168 * This is usually faster for algorithm using a lot of indexed acces.
169 */
as_array(epu8 & v)170 inline TPUBuild<epu8>::array &as_array(epu8 &v) {
171 return reinterpret_cast<typename TPUBuild<epu8>::array &>(v);
172 }
173 /** Cast a constant #HPCombi::epu8 to a C++ \c std::array
174 *
175 * This is usually faster for algorithm using a lot of indexed acces.
176 */
as_array(const epu8 & v)177 inline const TPUBuild<epu8>::array &as_array(const epu8 &v) {
178 return reinterpret_cast<const typename TPUBuild<epu8>::array &>(v);
179 }
180 /** Cast a C++ \c std::array to a #HPCombi::epu8 */
181 // Passing the argument by reference triggers a segfault in gcc
182 // Since vector types doesn't belongs to the standard, I didn't manage
183 // to know if I'm using undefined behavior here.
from_array(TPUBuild<epu8>::array a)184 inline epu8 from_array(TPUBuild<epu8>::array a) {
185 return reinterpret_cast<const epu8 &>(a);
186 }
187
188 /** Cast a #HPCombi::epu8 to a c++ #HPCombi::VectGeneric
189 *
190 * This is usually faster for algorithm using a lot of indexed acces.
191 */
as_VectGeneric(epu8 & v)192 inline VectGeneric<16> &as_VectGeneric(epu8 &v) {
193 return reinterpret_cast<VectGeneric<16> &>(as_array(v));
194 }
195
196 /** Cast a #HPCombi::epu8 to a c++ #HPCombi::VectGeneric
197 *
198 * This is usually faster for algorithm using a lot of indexed acces.
199 */
as_VectGeneric(const epu8 & v)200 inline const VectGeneric<16> &as_VectGeneric(const epu8 &v) {
201 return reinterpret_cast<const VectGeneric<16> &>(as_array(v));
202 }
203
204 /** Test whether all the entries of a #HPCombi::epu8 are zero */
is_all_zero(epu8 a)205 inline bool is_all_zero(epu8 a) { return _mm_testz_si128(a, a); }
206 /** Test whether all the entries of a #HPCombi::epu8 are one */
is_all_one(epu8 a)207 inline bool is_all_one(epu8 a) { return _mm_testc_si128(a, Epu8(0xFF)); }
208
209 /** Equality of #HPCombi::epu8 */
equal(epu8 a,epu8 b)210 inline bool equal(epu8 a, epu8 b) { return is_all_zero(_mm_xor_si128(a, b)); }
211 /** Non equality of #HPCombi::epu8 */
not_equal(epu8 a,epu8 b)212 inline bool not_equal(epu8 a, epu8 b) { return not equal(a, b); }
213
214 /** Permuting a #HPCombi::epu8 */
permuted(epu8 a,epu8 b)215 inline epu8 permuted(epu8 a, epu8 b) { return _mm_shuffle_epi8(a, b); }
216 /** Left shifted of a #HPCombi::epu8 inserting a 0
217 * @warning we use the convention that the 0 entry is on the left !
218 */
shifted_right(epu8 a)219 inline epu8 shifted_right(epu8 a) { return _mm_bslli_si128(a, 1); }
220 /** Right shifted of a #HPCombi::epu8 inserting a 0
221 * @warning we use the convention that the 0 entry is on the left !
222 */
shifted_left(epu8 a)223 inline epu8 shifted_left(epu8 a) { return _mm_bsrli_si128(a, 1); }
224 /** Reverting a #HPCombi::epu8 */
reverted(epu8 a)225 inline epu8 reverted(epu8 a) { return permuted(a, epu8rev); }
226
227 /** Vector min between two #HPCombi::epu8 0 */
min(epu8 a,epu8 b)228 inline epu8 min(epu8 a, epu8 b) { return _mm_min_epu8(a, b); }
229 /** Vector max between two #HPCombi::epu8 0 */
max(epu8 a,epu8 b)230 inline epu8 max(epu8 a, epu8 b) { return _mm_max_epu8(a, b); }
231
232 /** Testing if a #HPCombi::epu8 is sorted */
233 inline bool is_sorted(epu8 a);
234 /** Return a sorted #HPCombi::epu8
235 * @details
236 * @par Algorithm:
237 * Uses the 9 stages sorting network #sorting_rounds
238 */
239 inline epu8 sorted(epu8 a);
240 /** Return a #HPCombi::epu8 with the two half sorted
241 * @details
242 * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8
243 */
244 inline epu8 sorted8(epu8 a);
245 /** Return a reverse sorted #HPCombi::epu8
246 * @details
247 * @par Algorithm:
248 * Uses the 9 stages sorting network #sorting_rounds
249 */
250 inline epu8 revsorted(epu8 a);
251 /** Return a #HPCombi::epu8 with the two half reverse sorted
252 * @details
253 * @par Algorithm: Uses a 6 stages sorting network #sorting_rounds8
254 */
255 inline epu8 revsorted8(epu8 a);
256
257 /** Sort \c this and return the sorting permutation
258 * @details
259 * @par Algorithm: Uses a 9 stages sorting network #sorting_rounds8
260 */
261 inline epu8 sort_perm(epu8 & a);
262 /** Sort \c this and return the sorting permutation
263 * @details
264 * @par Algorithm: Uses a 9 stages sorting network #sorting_rounds8
265 */
266 inline epu8 sort8_perm(epu8 & a);
267
268
269 /** Find if a vector is a permutation of one other
270 * @details
271 * @param a, b: two #HPCombi::epu8
272 * @returns a #HPCombi::epu8
273 * For each @f$0 \leq i < 16@f$, \c res[i] is the position in \c a of \c b[i]
274 if \c b[i] appears exactly once in \c a, or undefined if not.
275 */
276 inline epu8 permutation_of(epu8 a, epu8 b);
277
278 /** A prime number good for hashing */
279 const uint64_t prime = 0x9e3779b97f4a7bb9;
280
281 /** A random #HPCombi::epu8
282 * @details
283 * @param bnd : the upper bound for the value of the entries
284 * @returns a random #HPCombi::epu8 with value in the interval
285 * @f$[0, 1, 2, ..., bnd-1]@f$.
286 */
287 inline epu8 random_epu8(uint16_t bnd);
288
289 /** Remove duplicates in a sorted #HPCombi::epu8
290 * @details
291 * @param a: supposed to be sorted
292 * @param repl: the value replacing the duplicate entries (default to 0)
293 * @return a where repeated occurences of entries are replaced by \c repl
294 */
295 inline epu8 remove_dups(epu8 a, uint8_t repl = 0);
296
297 /** @class common_horiz_sum
298 * @brief Horizontal sum of a #HPCombi::epu8
299 * @details
300 * @returns the horizontal sum of the input
301 * @par Example:
302 * @code
303 * horiz_sum(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
304 * @endcode
305 * Returns `110`
306 * @warning The result is supposed to fit in a \c uint8_t
307 */
308 /** @copydoc common_horiz_sum
309 * @par Algorithm:
310 * Reference @f$O(n)@f$ algorithm using loop and indexed access
311 */
312 inline uint8_t horiz_sum_ref(epu8);
313 /** @copydoc common_horiz_sum
314 * @par Algorithm:
315 * Reference @f$O(n)@f$ algorithm using loop and indexed access
316 * through #HPCombi::VectGeneric
317 */
318 inline uint8_t horiz_sum_gen(epu8);
319 /** @copydoc common_horiz_sum
320 * @par Algorithm:
321 * 4-stages paralell algorithm
322 */
323 inline uint8_t horiz_sum4(epu8);
324 /** @copydoc common_horiz_sum
325 * @par Algorithm:
326 * 3-stages paralell algorithm + indexed access
327 */
328 inline uint8_t horiz_sum3(epu8);
329 /** @copydoc common_horiz_sum */
horiz_sum(epu8 v)330 inline uint8_t horiz_sum(epu8 v) { return horiz_sum3(v); }
331
332 /** @class common_partial_sums
333 * @brief Horizontal partial sum of a #HPCombi::epu8
334 * @details
335 * @returns the partials sums of the input
336 * @par Example:
337 * @code
338 * partial_sums(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
339 * @endcode
340 * Returns `{ 5,10,12,17,18,24,36,40,40,43,45,56,68,81,95,110}`
341 */
342 /** @copydoc common_partial_sums
343 * @par Algorithm:
344 * Reference @f$O(n)@f$ algorithm using loop and indexed access
345 */
346 inline epu8 partial_sums_ref(epu8);
347 /** @copydoc common_partial_sums
348 * @par Algorithm:
349 * Reference @f$O(n)@f$ algorithm using loop and indexed access
350 * through #HPCombi::VectGeneric
351 */
352 inline epu8 partial_sums_gen(epu8);
353 /** @copydoc common_partial_sums
354 * @par Algorithm:
355 * 4-stages paralell algorithm
356 */
357 inline epu8 partial_sums_round(epu8);
358 /** @copydoc common_partial_sums */
partial_sums(epu8 v)359 inline epu8 partial_sums(epu8 v) { return partial_sums_round(v); }
360
361
362 /** @class common_horiz_max
363 * @brief Horizontal sum of a #HPCombi::epu8
364 * @details
365 * @returns the horizontal sum of the input
366 * @par Example:
367 * @code
368 * horiz_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2, 0,12, 0, 0, 0});
369 * @endcode
370 * Returns `12`
371 */
372 /** @copydoc common_horiz_max
373 * @par Algorithm:
374 * Reference @f$O(n)@f$ algorithm using loop and indexed access
375 */
376 inline uint8_t horiz_max_ref(epu8);
377 /** @copydoc common_horiz_max
378 * @par Algorithm:
379 * Reference @f$O(n)@f$ algorithm using loop and indexed access
380 * through #HPCombi::VectGeneric
381 */
382 inline uint8_t horiz_max_gen(epu8);
383 /** @copydoc common_horiz_max
384 * @par Algorithm:
385 * 4-stages paralell algorithm
386 */
387 inline uint8_t horiz_max4(epu8);
388 /** @copydoc common_horiz_max
389 * @par Algorithm:
390 * 3-stages paralell algorithm + indexed access
391 */
392 inline uint8_t horiz_max3(epu8);
393 /** @copydoc common_horiz_max */
horiz_max(epu8 v)394 inline uint8_t horiz_max(epu8 v) { return horiz_max4(v); }
395
396 /** @class common_partial_max
397 * @brief Horizontal partial sum of a #HPCombi::epu8
398 * @details
399 * @returns the partials max of the input
400 * @par Example:
401 * @code
402 * partial_max(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
403 * @endcode
404 * Returns `{ 5, 5, 5, 5, 5, 6,12,12,12,12,12,12,12,13,14,15}`
405 */
406 /** @copydoc common_partial_max
407 * @par Algorithm:
408 * Reference @f$O(n)@f$ algorithm using loop and indexed access
409 */
410 inline epu8 partial_max_ref(epu8);
411 /** @copydoc common_partial_max
412 * @par Algorithm:
413 * Reference @f$O(n)@f$ algorithm using loop and indexed access
414 * through #HPCombi::VectGeneric
415 */
416 inline epu8 partial_max_gen(epu8);
417 /** @copydoc common_partial_max
418 * @par Algorithm:
419 * 4-stages paralell algorithm
420 */
421 inline epu8 partial_max_round(epu8);
422 /** @copydoc common_partial_max */
partial_max(epu8 v)423 inline epu8 partial_max(epu8 v) { return partial_max_round(v); }
424
425
426 /** @class common_horiz_min
427 * @brief Horizontal sum of a #HPCombi::epu8
428 * @details
429 * @returns the horizontal sum of the input
430 * @par Example:
431 * @code
432 * horiz_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 1, 3, 2, 2,12, 3, 4, 4});
433 * @endcode
434 * Returns `1`
435 */
436 /** @copydoc common_horiz_min
437 * @par Algorithm:
438 * Reference @f$O(n)@f$ algorithm using loop and indexed access
439 */
440 inline uint8_t horiz_min_ref(epu8);
441 /** @copydoc common_horiz_min
442 * @par Algorithm:
443 * Reference @f$O(n)@f$ algorithm using loop and indexed access
444 * through #HPCombi::VectGeneric
445 */
446 inline uint8_t horiz_min_gen(epu8);
447 /** @copydoc common_horiz_min
448 * @par Algorithm:
449 * 4-stages paralell algorithm
450 */
451 inline uint8_t horiz_min4(epu8);
452 /** @copydoc common_horiz_min
453 * @par Algorithm:
454 * 3-stages paralell algorithm + indexed access
455 */
456 inline uint8_t horiz_min3(epu8);
457 /** @copydoc common_horiz_min */
horiz_min(epu8 v)458 inline uint8_t horiz_min(epu8 v) { return horiz_min4(v); }
459
460 /** @class common_partial_min
461 * @brief Horizontal partial sum of a #HPCombi::epu8
462 * @details
463 * @returns the partials min of the input
464 * @par Example:
465 * @code
466 * partial_min(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
467 * @endcode
468 * Returns `{ 5, 5, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}`
469 */
470 /** @copydoc common_partial_min
471 * @par Algorithm:
472 * Reference @f$O(n)@f$ algorithm using loop and indexed access
473 */
474 inline epu8 partial_min_ref(epu8);
475 /** @copydoc common_partial_min
476 * @par Algorithm:
477 * Reference @f$O(n)@f$ algorithm using loop and indexed access
478 * through #HPCombi::VectGeneric
479 */
480 inline epu8 partial_min_gen(epu8);
481 /** @copydoc common_partial_min
482 * @par Algorithm:
483 * 4-stages paralell algorithm
484 */
485 inline epu8 partial_min_round(epu8);
486 /** @copydoc common_partial_min */
partial_min(epu8 v)487 inline epu8 partial_min(epu8 v) { return partial_min_round(v); }
488
489
490 /** @class common_eval16
491 * @brief Evaluation of a #HPCombi::epu8
492 * @details
493 * @param v : a #HPCombi::epu8
494 * @returns the evaluation, that is the #HPCombi::epu8 \c r such that
495 * \c r[i] is the number of occurrence of \c i in the input \c v
496 * @par Example:
497 * @code
498 * eval16(epu8 { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15});
499 * @endcode
500 * Returns `{ 1, 1, 2, 1, 1, 3, 1, 0, 0, 0, 0, 1, 2, 1, 1, 1}`
501 * @warning The entries larger than 15 are ignored
502 */
503 /** @copydoc common_eval16
504 * @par Algorithm:
505 * Reference @f$O(n)@f$ algorithm using loop and indexed access
506 */
507 inline epu8 eval16_ref(epu8 v);
508 /** @copydoc common_eval16
509 * @par Algorithm:
510 * Reference @f$O(n)@f$ algorithm using loop and cast to array
511 */
512 inline epu8 eval16_arr(epu8 v);
513 /** @copydoc common_eval16
514 * @par Algorithm:
515 * Vector @f$O(n)@f$ using cyclic shifting
516 */
517 inline epu8 eval16_cycle(epu8 v);
518 /** @copydoc common_eval16
519 * @par Algorithm:
520 * Vector @f$O(n)@f$ using popcount
521 */
522 inline epu8 eval16_popcount(epu8 v);
523 /** @copydoc common_eval16 */
eval16(epu8 v)524 inline epu8 eval16(epu8 v) { return eval16_cycle(v); };
525
526 /** @class common_first_diff
527 * @brief The first difference between two #HPCombi::epu8
528 * @details
529 * @param a, b : two #HPCombi::epu8
530 * @param bound : a \c size_t
531 * @returns the smallest index @f$i<bound@f$ such that \c a[i] and \c b[i]
532 * differ, 16 if there is no differences before bound.
533 * @par Example:
534 * @code
535 * epu8 a { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15};
536 * epu8 b { 5, 5, 2, 9, 1, 6,12, 4, 0, 4, 4, 4,12,13,14,15};
537 * @endcode
538 * then `first_diff(a, b)` returns `3`,
539 * `first_diff(a, b, 3)` returns `16`,
540 * `first_diff(a, b, 4)` returns `3`,
541 * `first_diff(a, b, 7)` returns `3`.
542 * @warning `bound` is assumed to be smaller or equal than 16
543 */
544 /** @copydoc common_first_diff
545 * @par Algorithm:
546 * Reference @f$O(n)@f$ algorithm using loop and indexed access
547 */
548 inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound = 16);
549 /** @copydoc common_first_diff
550 * @par Algorithm:
551 * Using \c cmpestri instruction
552 */
553 inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
554 /** @copydoc common_first_diff
555 * @par Algorithm:
556 * Using vector comparison and mask
557 */
558 inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound = 16);
559 /** @copydoc common_first_diff */
first_diff(epu8 a,epu8 b,size_t bound=16)560 inline uint64_t first_diff(epu8 a, epu8 b, size_t bound = 16) {
561 return first_diff_mask(a, b, bound);
562 }
563
564 /** @class common_last_diff
565 * @brief The last difference between two #HPCombi::epu8
566 * @details
567 * @param a, b : two #HPCombi::epu8
568 * @param bound : a \c size_t
569 * @returns the largest index @f$i<bound@f$ such that \c a[i] and \c b[i]
570 * differ, 16 if there is no differences before bound.
571 * @par Example:
572 * @code
573 * epu8 a { 5, 5, 2, 5, 1, 6,12, 4, 0, 3, 2,11,12,13,14,15};
574 * epu8 b { 5, 5, 2, 9, 1, 6,12, 4, 0, 4, 4, 4,12,13,14,15};
575 * @endcode
576 * then `last_diff(a, b)` returns `11`,
577 * `last_diff(a, b, 3)` returns `16`,
578 * `last_diff(a, b, 4)` returns `3`,
579 * `last_diff(a, b, 7)` returns `3`.
580 * @warning `bound` is assumed to be smaller or equal than 16
581 */
582 /** @copydoc common_last_diff
583 * @par Algorithm:
584 * Reference @f$O(n)@f$ algorithm using loop and indexed access
585 */
586 inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound = 16);
587 /** @copydoc common_last_diff
588 * @par Algorithm:
589 * Using \c cmpestri instruction
590 */
591 inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound = 16);
592 /** @copydoc common_last_diff
593 * @par Algorithm:
594 * Using vector comparison and mask
595 */
596 inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound = 16);
597 /** @copydoc common_last_diff */
last_diff(epu8 a,epu8 b,size_t bound=16)598 inline uint64_t last_diff(epu8 a, epu8 b, size_t bound = 16) {
599 return last_diff_mask(a, b, bound);
600 }
601
602 /** Lexicographic comparison between two #HPCombi::epu8 */
603 inline bool less(epu8 a, epu8 b);
604 /** Partial lexicographic comparison between two #HPCombi::epu8
605 * @param a, b : the vectors to compare
606 * @param k : the bound for the lexicographic comparison
607 * @return a positive, negative or zero char depending on the result
608 */
609 inline char less_partial(epu8 a, epu8 b, int k);
610
611 /** return the index of the first zero entry or 16 if there are none
612 * Only index smaller than bound are taken into account.
613 */
614 inline uint64_t first_zero(epu8 v, int bnd);
615 /** return the index of the last zero entry or 16 if there are none
616 * Only index smaller than bound are taken into account.
617 */
618 inline uint64_t last_zero(epu8 v, int bnd);
619 /** return the index of the first non zero entry or 16 if there are none
620 * Only index smaller than bound are taken into account.
621 */
622 inline uint64_t first_non_zero(epu8 v, int bnd);
623 /** return the index of the last non zero entry or 16 if there are none
624 * Only index smaller than bound are taken into account.
625 */
626 inline uint64_t last_non_zero(epu8 v, int bnd);
627
628 /** a vector popcount function
629 */
630 inline epu8 popcount16(epu8 v);
631
632 /** Test for partial transformation
633 * @details
634 * @returns whether \c v is a partial transformation.
635 * @param v the vector to test
636 * @param k the size of \c *this (default 16)
637 *
638 * Points where the function is undefined are mapped to \c 0xff. If \c *this
639 * is a tranformation of @f$0\dots n-1@f$ for @f$n<16@f$, it should be completed
640 * to a transformation of @f$0\dots 15@f$ by adding fixed points. That is the
641 * values @f$i\geq n@f$ should be mapped to themself.
642 * @par Example:
643 * The partial tranformation
644 * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 . . 4 \end{matrix}@f$
645 * is encoded by the array {2,0,5,0xff,0xff,4,6,7,8,9,10,11,12,13,14,15}
646 */
647 inline bool is_partial_transformation(epu8 v, const size_t k = 16);
648
649 /** Test for transformation
650 * @details
651 * @returns whether \c *this is a transformation.
652 * @param v the vector to test
653 * @param k the size of \c *this (default 16)
654 *
655 * If \c *this is a tranformation of @f$0\dots n-1@f$ for @f$n<16@f$,
656 * it should be completed to a transformation of @f$0\dots 15@f$
657 * by adding fixed points. That is the values @f$i\geq n@f$ should be
658 * mapped to themself.
659 * @par Example:
660 * The tranformation
661 * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 2 1 4 \end{matrix}@f$
662 * is encoded by the array {2,0,5,2,1,4,6,7,8,9,10,11,12,13,14,15}
663 */
664 inline bool is_transformation(epu8 v, const size_t k = 16);
665
666 /** Test for partial permutations
667 * @details
668 * @returns whether \c *this is a partial permutation.
669 * @param v the vector to test
670 * @param k the size of \c *this (default 16)
671 *
672 * Points where the function is undefined are mapped to \c 0xff.
673 * If \c *this is a partial permutation of @f$0\dots n-1@f$ for @f$n<16@f$,
674 * it should be completed to a partial permutation of @f$0\dots 15@f$
675 * by adding fixed points. That is the values @f$i\geq n@f$ should be
676 * mapped to themself.
677 * @par Example:
678 * The permutation
679 * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 . . 4 \end{matrix}@f$
680 * is encoded by the array {2,0,5,0xFF,0xFF,4,6,7,8,9,10,11,12,13,14,15}
681 */
682 inline bool is_partial_permutation(epu8 v, const size_t k = 16);
683
684 /** Test for permutations
685 * @details
686 * @returns whether \c *this is a permutation.
687 * @param v the vector to test
688 * @param k the size of \c *this (default 16)
689 *
690 * If \c *this is a permutation of @f$0\dots n-1@f$ for @f$n<16@f$,
691 * it should be completed to a permutation of @f$0\dots 15@f$
692 * by adding fixed points. That is the values @f$i\geq n@f$ should be
693 * mapped to themself.
694 * @par Example:
695 * The permutation
696 * @f$\begin{matrix}0 1 2 3 4 5\\ 2 0 5 3 1 4 \end{matrix}@f$
697 * is encoded by the array {2,0,5,3,1,4,6,7,8,9,10,11,12,13,14,15}
698 */
699 inline bool is_permutation(epu8 v, const size_t k = 16);
700
701 } // namespace HPCombi
702
703 namespace std {
704
705 inline std::ostream &operator<<(std::ostream &stream, HPCombi::epu8 const &a);
706
707 /** We also specialize the struct
708 * - std::equal_to<epu8>
709 * - std::not_equal_to<epu8>
710 * - std::hash<epu8>
711 * - std::less<epu8>
712 */
713 }
714
715 #include "epu_impl.hpp"
716
717 #endif // HPCOMBI_EPU_HPP_INCLUDED
718