1 //****************************************************************************//
2 //       Copyright (C) 2016 Florent Hivert <Florent.Hivert@lri.fr>,           //
3 //                                                                            //
4 //  Distributed under the terms of the GNU General Public License (GPL)       //
5 //                                                                            //
6 //    This code is distributed in the hope that it will be useful,            //
7 //    but WITHOUT ANY WARRANTY; without even the implied warranty of          //
8 //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU       //
9 //   General Public License for more details.                                 //
10 //                                                                            //
11 //  The full text of the GPL is available at:                                 //
12 //                                                                            //
13 //                  http://www.gnu.org/licenses/                              //
14 //****************************************************************************//
15 
16 // This is the implementation par of epu.hpp this should be seen as
17 // implementation details and should not be included directly.
18 
19 #include <initializer_list>
20 #include <iostream>
21 #include <random>
22 
23 #include "vect_generic.hpp"
24 
25 // Comparison mode for _mm_cmpestri
26 #define FIRST_DIFF                                                             \
27     (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)
28 #define LAST_DIFF                                                              \
29     (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY |        \
30      _SIDD_MOST_SIGNIFICANT)
31 #define FIRST_ZERO (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY)
32 #define LAST_ZERO                                                              \
33     (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MOST_SIGNIFICANT)
34 #define FIRST_NON_ZERO                                                         \
35     (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY)
36 #define LAST_NON_ZERO                                                          \
37     (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY |  \
38      _SIDD_MOST_SIGNIFICANT)
39 
40 namespace HPCombi {
41 
42 /*****************************************************************************/
43 /** Implementation part for inline functions *********************************/
44 /*****************************************************************************/
45 
46 // Msk is supposed to be a boolean mask (i.e. each entry is either 0 or 255)
first_mask(epu8 msk,size_t bound)47 inline uint64_t first_mask(epu8 msk, size_t bound) {
48     uint64_t res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
49     return res == 0 ? 16 : _bit_scan_forward(res);
50 }
last_mask(epu8 msk,size_t bound)51 inline uint64_t last_mask(epu8 msk, size_t bound) {
52     auto res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
53     return res == 0 ? 16 : _bit_scan_reverse(res);
54 }
55 
first_diff_ref(epu8 a,epu8 b,size_t bound)56 inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound) {
57     for (size_t i = 0; i < bound; i++)
58         if (a[i] != b[i])
59             return i;
60     return 16;
61 }
first_diff_cmpstr(epu8 a,epu8 b,size_t bound)62 inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
63     return unsigned(_mm_cmpestri(a, bound, b, bound, FIRST_DIFF));
64 }
first_diff_mask(epu8 a,epu8 b,size_t bound)65 inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound) {
66     return first_mask(a != b, bound);
67 }
68 
last_diff_ref(epu8 a,epu8 b,size_t bound)69 inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound) {
70     while (bound != 0) {
71         --bound;
72         if (a[bound] != b[bound])
73             return bound;
74     }
75     return 16;
76 }
last_diff_cmpstr(epu8 a,epu8 b,size_t bound)77 inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
78     return unsigned(_mm_cmpestri(a, bound, b, bound, LAST_DIFF));
79 }
last_diff_mask(epu8 a,epu8 b,size_t bound)80 inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound) {
81     return last_mask(a != b, bound);
82 }
83 
less(epu8 a,epu8 b)84 inline bool less(epu8 a, epu8 b) {
85     uint64_t diff = first_diff(a, b);
86     return (diff < 16) && (a[diff] < b[diff]);
87 }
less_partial(epu8 a,epu8 b,int k)88 inline char less_partial(epu8 a, epu8 b, int k) {
89     uint64_t diff = first_diff(a, b, k);
90     return (diff == 16)
91                ? 0
92                : static_cast<char>(a[diff]) - static_cast<char>(b[diff]);
93 }
94 
95 
first_zero(epu8 v,int bnd)96 inline uint64_t first_zero(epu8 v, int bnd) {
97     return first_mask(v == epu8 {}, bnd);
98 }
last_zero(epu8 v,int bnd)99 inline uint64_t last_zero(epu8 v, int bnd) {
100     return last_mask(v == epu8 {}, bnd);
101 }
first_non_zero(epu8 v,int bnd)102 inline uint64_t first_non_zero(epu8 v, int bnd) {
103     return first_mask(v != epu8 {}, bnd);
104 }
last_non_zero(epu8 v,int bnd)105 inline uint64_t last_non_zero(epu8 v, int bnd) {
106     return last_mask(v != epu8 {}, bnd);
107 }
108 
109 /// Apply a sorting network
110 template <bool Increassing = true, size_t sz>
network_sort(epu8 res,std::array<epu8,sz> rounds)111 inline epu8 network_sort(epu8 res, std::array<epu8, sz> rounds) {
112     for (auto round : rounds) {
113         // This conditional should be optimized out by the compiler
114         epu8 mask = Increassing ? round < epu8id : epu8id < round;
115         epu8 b = permuted(res, round);
116         // res = mask ? min(res,b) : max(res,b); is not accepted by clang
117         res = _mm_blendv_epi8(min(res, b), max(res, b), mask);
118     }
119     return res;
120 }
121 
122 /// Apply a sorting network in place and return the permutation
123 template <bool Increassing = true, size_t sz>
network_sort_perm(epu8 & v,std::array<epu8,sz> rounds)124 inline epu8 network_sort_perm(epu8 &v, std::array<epu8, sz> rounds) {
125     epu8 res = epu8id;
126     for (auto round : rounds) {
127         // This conditional should be optimized out by the compiler
128         epu8 mask = Increassing ? round < epu8id : epu8id < round;
129         epu8 b = permuted(v, round);
130         epu8 cmp = _mm_blendv_epi8(b < v, v < b, mask);
131         v = _mm_blendv_epi8(v, b, cmp);
132         res = _mm_blendv_epi8(res, permuted(res, round), cmp);
133     }
134     return res;
135 }
136 
137 /** A 16-way sorting network
138  * @details Sorting network from Knuth [AoCP3] Fig. 51 p 229.
139  * used by the #sorted function
140  *
141  * [AoCP3]: "D. Knuth, The art of computer programming vol. 3"
142  */
143 constexpr std::array<epu8, 9> sorting_rounds
144     // clang-format off
145     //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
146 {{
147     epu8 { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14},
148     epu8 { 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13},
149     epu8 { 4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11},
150     epu8 { 8,  9, 10, 11, 12, 13, 14, 15,  0,  1,  2,  3,  4,  5,  6,  7},
151     epu8 { 0,  2,  1, 12,  8, 10,  9, 11,  4,  6,  5,  7,  3, 14, 13, 15},
152     epu8 { 0,  4,  8, 10,  1,  9, 12, 13,  2,  5,  3, 14,  6,  7, 11, 15},
153     epu8 { 0,  1,  4,  5,  2,  3,  8,  9,  6,  7, 12, 13, 10, 11, 14, 15},
154     epu8 { 0,  1,  2,  6,  4,  8,  3, 10,  5, 12,  7, 11,  9, 13, 14, 15},
155     epu8 { 0,  1,  2,  4,  3,  6,  5,  8,  7, 10,  9, 12, 11, 13, 14, 15}
156 }};
157 // clang-format on
158 
159 /** A duplicated 8-way sorting network
160  * @details [Batcher odd-Even mergesort] sorting network
161  * used by the #sorted function
162  *
163  * [Batcher odd-Even mergesort]:
164  * https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort "Batcher
165  * odd–even mergesort"
166  */
167 constexpr std::array<epu8, 6> sorting_rounds8
168     // clang-format off
169     //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
170 {{
171     epu8 { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14},
172     epu8 { 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13},
173     epu8 { 0,  2,  1,  3,  4,  6,  5,  7,  8, 10,  9, 11, 12, 14, 13, 15},
174     epu8 { 4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11},
175     epu8 { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9, 12, 13, 10, 11, 14, 15},
176     epu8 { 0,  2,  1,  4,  3,  6,  5,  7,  8, 10,  9, 12, 11, 14, 13, 15}
177 }};
178 // clang-format on
179 
is_sorted(epu8 a)180 inline bool is_sorted(epu8 a) {
181     return _mm_movemask_epi8(shifted_right(a) > a) == 0;
182 }
sorted(epu8 a)183 inline epu8 sorted(epu8 a) {
184     return network_sort<true>(a, sorting_rounds);
185 }
sorted8(epu8 a)186 inline epu8 sorted8(epu8 a) {
187     return network_sort<true>(a, sorting_rounds8);
188 }
revsorted(epu8 a)189 inline epu8 revsorted(epu8 a) {
190     return network_sort<false>(a, sorting_rounds);
191 }
revsorted8(epu8 a)192 inline epu8 revsorted8(epu8 a) {
193     return network_sort<false>(a, sorting_rounds8);
194 }
195 
sort_perm(epu8 & a)196 inline epu8 sort_perm(epu8 &a) {
197     return network_sort_perm<true>(a, sorting_rounds);
198 }
sort8_perm(epu8 & a)199 inline epu8 sort8_perm(epu8 &a) {
200     return network_sort_perm<true>(a, sorting_rounds8);
201 }
202 
203 
random_epu8(uint16_t bnd)204 inline epu8 random_epu8(uint16_t bnd) {
205     epu8 res;
206     std::random_device rd;
207 
208     std::default_random_engine e1(rd());
209     std::uniform_int_distribution<int> uniform_dist(0, bnd - 1);
210     for (size_t i = 0; i < 16; i++)
211         res[i] = uniform_dist(e1);
212     return res;
213 }
214 
remove_dups(epu8 v,uint8_t repl)215 inline epu8 remove_dups(epu8 v, uint8_t repl) {
216     // Vector ternary operator is not supported by clang.
217     // return (v != shifted_right(v) ? v : Epu8(repl);
218     return _mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
219 }
220 
221 // Gather at the front numbers with (3-i)-th bit not set.
222 constexpr std::array<epu8, 3> inverting_rounds {{
223     // clang-format off
224     //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
225     epu8 { 0,  1,  2,  3,  8,  9, 10, 11,  4,  5,  6,  7, 12, 13, 14, 15},
226     epu8 { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15},
227     epu8 { 0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15}
228     // clang-format on
229 }};
230 
231 #define FIND_IN_VECT                                                           \
232     (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK |                 \
233      _SIDD_NEGATIVE_POLARITY)
234 #define FIND_IN_VECT_COMPL                                                     \
235     (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK)
236 
permutation_of(epu8 a,epu8 b)237 inline epu8 permutation_of(epu8 a, epu8 b) {
238     epu8 res = -static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
239     for (epu8 round : inverting_rounds) {
240         a = permuted(a, round);
241         res <<= 1;
242         res -= static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
243     }
244     return res;
245 }
246 
247 
248 #if defined(FF)
249 #error FF is defined !
250 #endif /* FF */
251 #define FF 0xff
252 
253 /// Permutation Round for partial and horizontal sums
254 constexpr std::array<epu8, 4> summing_rounds {{
255     // clang-format off
256     //      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
257     epu8 { FF,  0, FF,  2, FF,  4, FF,  6, FF,  8, FF, 10, FF, 12, FF, 14},
258     epu8 { FF, FF,  1,  1, FF, FF,  5,  5, FF, FF,  9,  9, FF, FF, 13, 13},
259     epu8 { FF, FF, FF, FF,  3,  3,  3,  3, FF, FF, FF, FF, 11, 11, 11, 11},
260     epu8 { FF, FF, FF, FF, FF, FF, FF, FF,  7,  7,  7,  7,  7,  7,  7,  7}
261     // clang-format on
262 }};
263 
264 constexpr std::array<epu8, 4> mining_rounds {{
265     // clang-format off
266     //      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
267     epu8 {  0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10, 10, 12, 12, 14, 14},
268     epu8 {  0,  1,  1,  1,  4,  5,  5,  5,  8,  9,  9,  9, 12, 13, 13, 13},
269     epu8 {  0,  1,  2,  3,  3,  3,  3,  3,  8,  9, 10, 11, 11, 11, 11, 11},
270     epu8 {  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7}
271     // clang-format on
272 }};
273 
274 #undef FF
275 
horiz_sum_ref(epu8 v)276 inline uint8_t horiz_sum_ref(epu8 v) {
277     uint8_t res = 0;
278     for (size_t i = 0; i < 16; i++)
279         res += v[i];
280     return res;
281 }
horiz_sum_gen(epu8 v)282 inline uint8_t horiz_sum_gen(epu8 v) { return as_VectGeneric(v).horiz_sum(); }
horiz_sum4(epu8 v)283 inline uint8_t horiz_sum4(epu8 v) { return partial_sums_round(v)[15]; }
horiz_sum3(epu8 v)284 inline uint8_t horiz_sum3(epu8 v) {
285     auto sr = summing_rounds;
286     v += permuted(v, sr[0]);
287     v += permuted(v, sr[1]);
288     v += permuted(v, sr[2]);
289     return v[7] + v[15];
290 }
291 
partial_sums_ref(epu8 v)292 inline epu8 partial_sums_ref(epu8 v) {
293     epu8 res{};
294     res[0] = v[0];
295     for (size_t i = 1; i < 16; i++)
296         res[i] = res[i - 1] + v[i];
297     return res;
298 }
partial_sums_gen(epu8 v)299 inline epu8 partial_sums_gen(epu8 v) {
300     as_VectGeneric(v).partial_sums_inplace();
301     return v;
302 }
partial_sums_round(epu8 v)303 inline epu8 partial_sums_round(epu8 v) {
304     for (epu8 round : summing_rounds)
305         v += permuted(v, round);
306     return v;
307 }
308 
309 
horiz_max_ref(epu8 v)310 inline uint8_t horiz_max_ref(epu8 v) {
311     uint8_t res = 0;
312     for (size_t i = 0; i < 16; i++)
313         res = std::max(res, v[i]);
314     return res;
315 }
horiz_max_gen(epu8 v)316 inline uint8_t horiz_max_gen(epu8 v) { return as_VectGeneric(v).horiz_max(); }
horiz_max4(epu8 v)317 inline uint8_t horiz_max4(epu8 v) { return partial_max_round(v)[15]; }
horiz_max3(epu8 v)318 inline uint8_t horiz_max3(epu8 v) {
319     auto sr = summing_rounds;
320     v = max(v, permuted(v, sr[0]));
321     v = max(v, permuted(v, sr[1]));
322     v = max(v, permuted(v, sr[2]));
323     return std::max(v[7], v[15]);
324 }
325 
partial_max_ref(epu8 v)326 inline epu8 partial_max_ref(epu8 v) {
327     epu8 res;
328     res[0] = v[0];
329     for (size_t i = 1; i < 16; i++)
330         res[i] = std::max(res[i - 1], v[i]);
331     return res;
332 }
partial_max_gen(epu8 v)333 inline epu8 partial_max_gen(epu8 v) {
334     as_VectGeneric(v).partial_max_inplace();
335     return v;
336 }
partial_max_round(epu8 v)337 inline epu8 partial_max_round(epu8 v) {
338     for (epu8 round : summing_rounds)
339         v = max(v, permuted(v, round));
340     return v;
341 }
342 
343 
horiz_min_ref(epu8 v)344 inline uint8_t horiz_min_ref(epu8 v) {
345     uint8_t res = 255;
346     for (size_t i = 0; i < 16; i++)
347         res = std::min(res, v[i]);
348     return res;
349 }
horiz_min_gen(epu8 v)350 inline uint8_t horiz_min_gen(epu8 v) { return as_VectGeneric(v).horiz_min(); }
horiz_min4(epu8 v)351 inline uint8_t horiz_min4(epu8 v) { return partial_min_round(v)[15]; }
horiz_min3(epu8 v)352 inline uint8_t horiz_min3(epu8 v) {
353     auto sr = mining_rounds;
354     v = min(v, permuted(v, sr[0]));
355     v = min(v, permuted(v, sr[1]));
356     v = min(v, permuted(v, sr[2]));
357     return std::min(v[7], v[15]);
358 }
359 
partial_min_ref(epu8 v)360 inline epu8 partial_min_ref(epu8 v) {
361     epu8 res;
362     res[0] = v[0];
363     for (size_t i = 1; i < 16; i++)
364         res[i] = std::min(res[i - 1], v[i]) ;
365     return res;
366 }
partial_min_gen(epu8 v)367 inline epu8 partial_min_gen(epu8 v) {
368     as_VectGeneric(v).partial_min_inplace();
369     return v;
370 }
partial_min_round(epu8 v)371 inline epu8 partial_min_round(epu8 v) {
372     for (epu8 round : mining_rounds)
373         v = min(v, permuted(v, round));
374     return v;
375 }
376 
377 
eval16_ref(epu8 v)378 inline epu8 eval16_ref(epu8 v) {
379     epu8 res{};
380     for (size_t i = 0; i < 16; i++)
381         if (v[i] < 16)
382             res[v[i]]++;
383     return res;
384 }
eval16_arr(epu8 v8)385 inline epu8 eval16_arr(epu8 v8) {
386     TPUBuild<epu8>::array res{};
387     auto v = as_array(v8);
388     for (size_t i = 0; i < 16; i++)
389         if (v[i] < 16)
390             res[v[i]]++;
391     return from_array(res);
392 }
eval16_gen(epu8 v)393 inline epu8 eval16_gen(epu8 v) {
394     return from_array(as_VectGeneric(v).eval().v);
395 }
eval16_cycle(epu8 v)396 inline epu8 eval16_cycle(epu8 v) {
397     epu8 res = -(epu8id == v);
398     for (int i = 1; i < 16; i++) {
399         v = permuted(v, left_cycle);
400         res -= (epu8id == v);
401     }
402     return res;
403 }
eval16_popcount(epu8 v)404 inline epu8 eval16_popcount(epu8 v) {
405     epu8 res{};
406     for (size_t i = 0; i < 16; i++) {
407         res[i] = _mm_popcnt_u32(_mm_movemask_epi8(v == Epu8(uint8_t(i))));
408     }
409     return res;
410 }
411 
412 
popcount16(epu8 v)413 inline epu8 popcount16(epu8 v){
414     return permuted(popcount4, (v & Epu8(0x0f))) + permuted(popcount4, v >> 4);
415 }
416 
417 
is_partial_transformation(epu8 v,const size_t k)418 inline bool is_partial_transformation(epu8 v, const size_t k) {
419     uint64_t diff = last_diff(v, epu8id, 16);
420     // (forall x in v, x + 1 <= 16)  and
421     // (v = Perm16::one()   or  last diff index < 16)
422     return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
423         && (diff == 16 || diff < k);
424 }
425 
is_transformation(epu8 v,const size_t k)426 inline bool is_transformation(epu8 v, const size_t k) {
427     uint64_t diff = last_diff(v, epu8id, 16);
428     return (_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff)
429         && (diff == 16 || diff < k);
430 }
431 
is_partial_permutation(epu8 v,const size_t k)432 inline bool is_partial_permutation(epu8 v, const size_t k) {
433     uint64_t diff = last_diff(v, epu8id, 16);
434     // (forall x in v, x <= 15)  and
435     // (forall x < 15, multiplicity x v <= 1
436     // (v = Perm16::one()   or  last diff index < 16)
437     return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
438         && (_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff)
439         && (diff == 16 || diff < k);
440 }
441 
is_permutation(epu8 v,const size_t k)442 inline bool is_permutation(epu8 v, const size_t k) {
443     uint64_t diff = last_diff(v, epu8id, 16);
444     // (forall x in v, x in Perm16::one())  and
445     // (forall x in Perm16::one(), x in v)  and
446     // (v = Perm16::one()   or  last diff index < 16)
447     return _mm_cmpestri(epu8id, 16, v, 16, FIRST_NON_ZERO) == 16
448         && _mm_cmpestri(v, 16, epu8id, 16, FIRST_NON_ZERO) == 16
449         && (diff == 16 || diff < k);
450 }
451 
452 }  // namespace HPCombi
453 
454 namespace std {
455 
operator <<(std::ostream & stream,HPCombi::epu8 const & a)456 inline std::ostream &operator<<(std::ostream &stream, HPCombi::epu8 const &a) {
457     stream << "[" << std::setw(2) << unsigned(a[0]);
458     for (unsigned i = 1; i < 16; ++i)
459         stream << "," << std::setw(2) << unsigned(a[i]);
460     stream << "]";
461     return stream;
462 }
463 
464 template <> struct equal_to<HPCombi::epu8> {
operator ()std::equal_to465     bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const {
466         return HPCombi::equal(lhs, rhs);
467     }
468 };
469 
470 template <> struct not_equal_to<HPCombi::epu8> {
operator ()std::not_equal_to471     bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const {
472         return HPCombi::not_equal(lhs, rhs);
473     }
474 };
475 
476 template <> struct hash<HPCombi::epu8> {
operator ()std::hash477     inline size_t operator()(HPCombi::epu8 a) const {
478         __int128 v0 = _mm_extract_epi64(a, 0);
479         __int128 v1 = _mm_extract_epi64(a, 1);
480         return ((v1 * HPCombi::prime + v0) * HPCombi::prime) >> 64;
481 
482         /* The following is extremely slow on Renner benchmark
483            uint64_t v0 = _mm_extract_epi64(ar.v, 0);
484            uint64_t v1 = _mm_extract_epi64(ar.v, 1);
485            size_t seed = v0 + 0x9e3779b9;
486            seed ^= v1 + 0x9e3779b9 + (seed<<6) + (seed>>2);
487            return seed;
488         */
489     }
490 };
491 
492 template <> struct less<HPCombi::epu8> {
493     // WARNING: due to endianess this is not lexicographic comparison,
494     //          but we don't care when using in std::set.
495     // 10% faster than calling the lexicographic comparison operator !
operator ()std::less496     inline size_t operator()(const HPCombi::epu8 &v1,
497                              const HPCombi::epu8 &v2) const {
498         __m128 v1v = __m128(v1), v2v = __m128(v2);
499         return v1v[0] == v2v[0] ? v1v[1] < v2v[1] : v1v[0] < v2v[0];
500     }
501 };
502 
503 }  // namespace std
504