1 //****************************************************************************//
2 // Copyright (C) 2016 Florent Hivert <Florent.Hivert@lri.fr>, //
3 // //
4 // Distributed under the terms of the GNU General Public License (GPL) //
5 // //
6 // This code is distributed in the hope that it will be useful, //
7 // but WITHOUT ANY WARRANTY; without even the implied warranty of //
8 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU //
9 // General Public License for more details. //
10 // //
11 // The full text of the GPL is available at: //
12 // //
13 // http://www.gnu.org/licenses/ //
14 //****************************************************************************//
15
// This is the implementation part of epu.hpp; it should be seen as
// implementation details and should not be included directly.
18
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <initializer_list>
#include <iomanip>
#include <iostream>
#include <random>

#include "vect_generic.hpp"
24
25 // Comparison mode for _mm_cmpestri
26 #define FIRST_DIFF \
27 (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY)
28 #define LAST_DIFF \
29 (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_NEGATIVE_POLARITY | \
30 _SIDD_MOST_SIGNIFICANT)
31 #define FIRST_ZERO (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY)
32 #define LAST_ZERO \
33 (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MOST_SIGNIFICANT)
34 #define FIRST_NON_ZERO \
35 (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY)
36 #define LAST_NON_ZERO \
37 (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_MASKED_NEGATIVE_POLARITY | \
38 _SIDD_MOST_SIGNIFICANT)
39
40 namespace HPCombi {
41
42 /*****************************************************************************/
43 /** Implementation part for inline functions *********************************/
44 /*****************************************************************************/
45
// Msk is supposed to be a boolean mask (i.e. each entry is either 0 or 255)
/// Index of the first set lane of \p msk among the first \p bound ones,
/// or 16 when there is none.
inline uint64_t first_mask(epu8 msk, size_t bound) {
    // Clear the lanes at index >= bound, then gather the sign bit of each
    // byte into a 16-bit integer.
    uint64_t res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
    // Lowest set bit = first masked lane.
    return res == 0 ? 16 : _bit_scan_forward(res);
}
/// Index of the last set lane of \p msk among the first \p bound ones,
/// or 16 when there is none (see #first_mask).
inline uint64_t last_mask(epu8 msk, size_t bound) {
    auto res = _mm_movemask_epi8(msk & (epu8id < Epu8(bound)));
    // Highest set bit = last masked lane.
    return res == 0 ? 16 : _bit_scan_reverse(res);
}
55
first_diff_ref(epu8 a,epu8 b,size_t bound)56 inline uint64_t first_diff_ref(epu8 a, epu8 b, size_t bound) {
57 for (size_t i = 0; i < bound; i++)
58 if (a[i] != b[i])
59 return i;
60 return 16;
61 }
/// SSE4.2 string-comparison version of #first_diff_ref.
inline uint64_t first_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
    // _mm_cmpestri returns 16 when the bound-long prefixes are equal.
    return unsigned(_mm_cmpestri(a, bound, b, bound, FIRST_DIFF));
}
/// Movemask-based version of #first_diff_ref.
inline uint64_t first_diff_mask(epu8 a, epu8 b, size_t bound) {
    return first_mask(a != b, bound);
}
68
last_diff_ref(epu8 a,epu8 b,size_t bound)69 inline uint64_t last_diff_ref(epu8 a, epu8 b, size_t bound) {
70 while (bound != 0) {
71 --bound;
72 if (a[bound] != b[bound])
73 return bound;
74 }
75 return 16;
76 }
/// SSE4.2 string-comparison version of #last_diff_ref.
inline uint64_t last_diff_cmpstr(epu8 a, epu8 b, size_t bound) {
    // _mm_cmpestri returns 16 when the bound-long prefixes are equal.
    return unsigned(_mm_cmpestri(a, bound, b, bound, LAST_DIFF));
}
/// Movemask-based version of #last_diff_ref.
inline uint64_t last_diff_mask(epu8 a, epu8 b, size_t bound) {
    return last_mask(a != b, bound);
}
83
/// Lexicographic strict comparison of the 16 entries.
inline bool less(epu8 a, epu8 b) {
    // Decide on the first differing entry; diff == 16 means a == b.
    uint64_t diff = first_diff(a, b);
    return (diff < 16) && (a[diff] < b[diff]);
}
/// Three-way comparison of the first \p k entries: 0 when the prefixes
/// agree, otherwise the (sign-carrying, reduced to char) difference of
/// the first differing entries.
inline char less_partial(epu8 a, epu8 b, int k) {
    uint64_t diff = first_diff(a, b, k);
    return (diff == 16)
               ? 0
               : static_cast<char>(a[diff]) - static_cast<char>(b[diff]);
}
94
95
/// Index of the first zero entry among the first \p bnd ones (16 if none).
inline uint64_t first_zero(epu8 v, int bnd) {
    return first_mask(v == epu8 {}, bnd);
}
/// Index of the last zero entry among the first \p bnd ones (16 if none).
inline uint64_t last_zero(epu8 v, int bnd) {
    return last_mask(v == epu8 {}, bnd);
}
/// Index of the first non-zero entry among the first \p bnd ones (16 if none).
inline uint64_t first_non_zero(epu8 v, int bnd) {
    return first_mask(v != epu8 {}, bnd);
}
/// Index of the last non-zero entry among the first \p bnd ones (16 if none).
inline uint64_t last_non_zero(epu8 v, int bnd) {
    return last_mask(v != epu8 {}, bnd);
}
108
/// Apply a sorting network
/// @details Each round is a permutation pairing the lanes to be
/// compared/exchanged; applying all rounds of a valid network sorts \p res.
/// @tparam Increassing sort increasingly when true, decreasingly otherwise
template <bool Increassing = true, size_t sz>
inline epu8 network_sort(epu8 res, std::array<epu8, sz> rounds) {
    for (auto round : rounds) {
        // This conditional should be optimized out by the compiler
        epu8 mask = Increassing ? round < epu8id : epu8id < round;
        epu8 b = permuted(res, round);
        // res = mask ? min(res,b) : max(res,b); is not accepted by clang
        res = _mm_blendv_epi8(min(res, b), max(res, b), mask);
    }
    return res;
}
121
/// Apply a sorting network in place and return the permutation
/// @details Sorts \p v in place as #network_sort does, while accumulating
/// round after round the permutation that was actually applied.
template <bool Increassing = true, size_t sz>
inline epu8 network_sort_perm(epu8 &v, std::array<epu8, sz> rounds) {
    epu8 res = epu8id;
    for (auto round : rounds) {
        // This conditional should be optimized out by the compiler
        epu8 mask = Increassing ? round < epu8id : epu8id < round;
        epu8 b = permuted(v, round);
        // cmp selects, lane by lane, whether the compared pair is swapped.
        epu8 cmp = _mm_blendv_epi8(b < v, v < b, mask);
        v = _mm_blendv_epi8(v, b, cmp);
        // Apply the same swaps to the permutation being accumulated.
        res = _mm_blendv_epi8(res, permuted(res, round), cmp);
    }
    return res;
}
136
/** A 16-way sorting network
 * @details Sorting network from Knuth [AoCP3] Fig. 51 p 229.
 * used by the #sorted function
 * Each round lists, for every lane, the index of the lane it is
 * compared with during that round.
 *
 * [AoCP3]: "D. Knuth, The art of computer programming vol. 3"
 */
constexpr std::array<epu8, 9> sorting_rounds
    // clang-format off
    //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
{{
    epu8 { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14},
    epu8 { 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13},
    epu8 { 4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11},
    epu8 { 8,  9, 10, 11, 12, 13, 14, 15,  0,  1,  2,  3,  4,  5,  6,  7},
    epu8 { 0,  2,  1, 12,  8, 10,  9, 11,  4,  6,  5,  7,  3, 14, 13, 15},
    epu8 { 0,  4,  8, 10,  1,  9, 12, 13,  2,  5,  3, 14,  6,  7, 11, 15},
    epu8 { 0,  1,  4,  5,  2,  3,  8,  9,  6,  7, 12, 13, 10, 11, 14, 15},
    epu8 { 0,  1,  2,  6,  4,  8,  3, 10,  5, 12,  7, 11,  9, 13, 14, 15},
    epu8 { 0,  1,  2,  4,  3,  6,  5,  8,  7, 10,  9, 12, 11, 13, 14, 15}
}};
// clang-format on
157 // clang-format on
158
/** A duplicated 8-way sorting network
 * @details [Batcher odd-Even mergesort] sorting network
 * used by the #sorted function
 * The same 8-way network is applied to both halves, so the two 8-byte
 * halves of the vector are sorted independently.
 *
 * [Batcher odd-Even mergesort]:
 * https://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort "Batcher
 * odd–even mergesort"
 */
constexpr std::array<epu8, 6> sorting_rounds8
    // clang-format off
    //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
{{
    epu8 { 1,  0,  3,  2,  5,  4,  7,  6,  9,  8, 11, 10, 13, 12, 15, 14},
    epu8 { 2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13},
    epu8 { 0,  2,  1,  3,  4,  6,  5,  7,  8, 10,  9, 11, 12, 14, 13, 15},
    epu8 { 4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11},
    epu8 { 0,  1,  4,  5,  2,  3,  6,  7,  8,  9, 12, 13, 10, 11, 14, 15},
    epu8 { 0,  2,  1,  4,  3,  6,  5,  7,  8, 10,  9, 12, 11, 14, 13, 15}
}};
// clang-format on
178 // clang-format on
179
/// Whether the 16 entries of \p a are sorted increasingly.
inline bool is_sorted(epu8 a) {
    // Sorted iff no lane of shifted_right(a) is greater than the matching
    // lane of a (movemask is 0 when no comparison lane is set).
    return _mm_movemask_epi8(shifted_right(a) > a) == 0;
}
/// Return \p a sorted increasingly (16-way network #sorting_rounds).
inline epu8 sorted(epu8 a) {
    return network_sort<true>(a, sorting_rounds);
}
/// Return \p a with each 8-byte half sorted increasingly (#sorting_rounds8).
inline epu8 sorted8(epu8 a) {
    return network_sort<true>(a, sorting_rounds8);
}
/// Return \p a sorted decreasingly.
inline epu8 revsorted(epu8 a) {
    return network_sort<false>(a, sorting_rounds);
}
/// Return \p a with each 8-byte half sorted decreasingly.
inline epu8 revsorted8(epu8 a) {
    return network_sort<false>(a, sorting_rounds8);
}
195
/// Sort \p a increasingly in place and return the permutation applied.
inline epu8 sort_perm(epu8 &a) {
    return network_sort_perm<true>(a, sorting_rounds);
}
/// Sort each 8-byte half of \p a in place and return the permutation applied.
inline epu8 sort8_perm(epu8 &a) {
    return network_sort_perm<true>(a, sorting_rounds8);
}
202
203
random_epu8(uint16_t bnd)204 inline epu8 random_epu8(uint16_t bnd) {
205 epu8 res;
206 std::random_device rd;
207
208 std::default_random_engine e1(rd());
209 std::uniform_int_distribution<int> uniform_dist(0, bnd - 1);
210 for (size_t i = 0; i < 16; i++)
211 res[i] = uniform_dist(e1);
212 return res;
213 }
214
/// Replace by \p repl every entry equal to its left neighbour.
/// @note only adjacent duplicates are replaced, so \p v is presumably
///       expected to be sorted — confirm against callers.
inline epu8 remove_dups(epu8 v, uint8_t repl) {
    // Vector ternary operator is not supported by clang.
    // return (v != shifted_right(v) ? v : Epu8(repl);
    return _mm_blendv_epi8(Epu8(repl), v, v != shifted_right(v));
}
220
// Gather at the front numbers with (3-i)-th bit not set.
// Round i moves the lanes whose index has bit (3-i) clear to positions
// 0..7 and the others to positions 8..15; used by #permutation_of.
constexpr std::array<epu8, 3> inverting_rounds {{
    // clang-format off
    //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
    epu8 { 0,  1,  2,  3,  8,  9, 10, 11,  4,  5,  6,  7, 12, 13, 14, 15},
    epu8 { 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15},
    epu8 { 0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15}
    // clang-format on
}};
230
231 #define FIND_IN_VECT \
232 (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK | \
233 _SIDD_NEGATIVE_POLARITY)
234 #define FIND_IN_VECT_COMPL \
235 (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_UNIT_MASK)
236
/// Find, for each entry of \p b, its position in \p a — i.e. build the
/// permutation p such that permuted(a, p) == b (presumably assuming such
/// a permutation exists; confirm the contract in epu.hpp).
inline epu8 permutation_of(epu8 a, epu8 b) {
    // Each _mm_cmpestrm produces, per lane of b, a full-byte mask telling
    // whether that lane occurs among the first 8 entries of (shuffled) a.
    // Shuffling a by inverting_rounds[i] groups its entries by one index
    // bit, so each iteration contributes one bit of the answer, assembled
    // MSB-first by the shift-and-subtract.
    epu8 res = -static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
    for (epu8 round : inverting_rounds) {
        a = permuted(a, round);
        res <<= 1;
        res -= static_cast<epu8>(_mm_cmpestrm(a, 8, b, 16, FIND_IN_VECT));
    }
    return res;
}
246
247
248 #if defined(FF)
249 #error FF is defined !
250 #endif /* FF */
251 #define FF 0xff
252
/// Permutation Round for partial and horizontal sums
/// @details log-depth prefix-sum pattern: round r adds, into each lane,
/// the running total of the 2^r-aligned block to its left. Index FF has
/// its high bit set, which makes the byte shuffle produce 0 — the neutral
/// element for + (presumably via pshufb; confirm `permuted` in epu.hpp).
constexpr std::array<epu8, 4> summing_rounds {{
    // clang-format off
    //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
    epu8 { FF,  0, FF,  2, FF,  4, FF,  6, FF,  8, FF, 10, FF, 12, FF, 14},
    epu8 { FF, FF,  1,  1, FF, FF,  5,  5, FF, FF,  9,  9, FF, FF, 13, 13},
    epu8 { FF, FF, FF, FF,  3,  3,  3,  3, FF, FF, FF, FF, 11, 11, 11, 11},
    epu8 { FF, FF, FF, FF, FF, FF, FF, FF,  7,  7,  7,  7,  7,  7,  7,  7}
    // clang-format on
}};
263
/// Permutation rounds for partial and horizontal min/max
/// @details Same structure as #summing_rounds, but lanes with nothing to
/// combine index themselves: a lane is its own neutral element for
/// min/max, whereas 0 (the FF shuffle result) is only neutral for +.
constexpr std::array<epu8, 4> mining_rounds {{
    // clang-format off
    //     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
    epu8 {  0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10, 10, 12, 12, 14, 14},
    epu8 {  0,  1,  1,  1,  4,  5,  5,  5,  8,  9,  9,  9, 12, 13, 13, 13},
    epu8 {  0,  1,  2,  3,  3,  3,  3,  3,  8,  9, 10, 11, 11, 11, 11, 11},
    epu8 {  0,  1,  2,  3,  4,  5,  6,  7,  7,  7,  7,  7,  7,  7,  7,  7}
    // clang-format on
}};
273
274 #undef FF
275
horiz_sum_ref(epu8 v)276 inline uint8_t horiz_sum_ref(epu8 v) {
277 uint8_t res = 0;
278 for (size_t i = 0; i < 16; i++)
279 res += v[i];
280 return res;
281 }
/// Horizontal sum via the generic (non-SIMD) vector implementation.
inline uint8_t horiz_sum_gen(epu8 v) { return as_VectGeneric(v).horiz_sum(); }
/// Horizontal sum as the last entry of the 4-round prefix sum.
inline uint8_t horiz_sum4(epu8 v) { return partial_sums_round(v)[15]; }
/// Horizontal sum using only 3 of the 4 #summing_rounds; after them the
/// totals of the two halves sit in lanes 7 and 15, and the last round is
/// replaced by a single scalar addition.
inline uint8_t horiz_sum3(epu8 v) {
    auto sr = summing_rounds;
    v += permuted(v, sr[0]);
    v += permuted(v, sr[1]);
    v += permuted(v, sr[2]);
    return v[7] + v[15];
}
291
partial_sums_ref(epu8 v)292 inline epu8 partial_sums_ref(epu8 v) {
293 epu8 res{};
294 res[0] = v[0];
295 for (size_t i = 1; i < 16; i++)
296 res[i] = res[i - 1] + v[i];
297 return res;
298 }
/// Partial sums via the generic (non-SIMD) vector implementation.
inline epu8 partial_sums_gen(epu8 v) {
    as_VectGeneric(v).partial_sums_inplace();
    return v;
}
/// Partial sums in 4 permute-and-add rounds (log-depth prefix sum).
inline epu8 partial_sums_round(epu8 v) {
    for (epu8 round : summing_rounds)
        v += permuted(v, round);
    return v;
}
308
309
horiz_max_ref(epu8 v)310 inline uint8_t horiz_max_ref(epu8 v) {
311 uint8_t res = 0;
312 for (size_t i = 0; i < 16; i++)
313 res = std::max(res, v[i]);
314 return res;
315 }
/// Horizontal maximum via the generic (non-SIMD) vector implementation.
inline uint8_t horiz_max_gen(epu8 v) { return as_VectGeneric(v).horiz_max(); }
/// Horizontal maximum as the last entry of the 4-round partial maximum.
inline uint8_t horiz_max4(epu8 v) { return partial_max_round(v)[15]; }
/// Horizontal maximum using only 3 permute-and-max rounds; the maxima of
/// the two halves end up in lanes 7 and 15, combined by one scalar max.
/// @note reuses #summing_rounds: the FF lanes fetch 0, which is neutral
/// for max just as for + (0 never beats any byte).
inline uint8_t horiz_max3(epu8 v) {
    auto sr = summing_rounds;
    v = max(v, permuted(v, sr[0]));
    v = max(v, permuted(v, sr[1]));
    v = max(v, permuted(v, sr[2]));
    return std::max(v[7], v[15]);
}
325
partial_max_ref(epu8 v)326 inline epu8 partial_max_ref(epu8 v) {
327 epu8 res;
328 res[0] = v[0];
329 for (size_t i = 1; i < 16; i++)
330 res[i] = std::max(res[i - 1], v[i]);
331 return res;
332 }
/// Partial maxima via the generic (non-SIMD) vector implementation.
inline epu8 partial_max_gen(epu8 v) {
    as_VectGeneric(v).partial_max_inplace();
    return v;
}
/// Partial maxima in 4 permute-and-max rounds (log-depth prefix max).
inline epu8 partial_max_round(epu8 v) {
    for (epu8 round : summing_rounds)
        v = max(v, permuted(v, round));
    return v;
}
342
343
horiz_min_ref(epu8 v)344 inline uint8_t horiz_min_ref(epu8 v) {
345 uint8_t res = 255;
346 for (size_t i = 0; i < 16; i++)
347 res = std::min(res, v[i]);
348 return res;
349 }
/// Horizontal minimum via the generic (non-SIMD) vector implementation.
inline uint8_t horiz_min_gen(epu8 v) { return as_VectGeneric(v).horiz_min(); }
/// Horizontal minimum as the last entry of the 4-round partial minimum.
inline uint8_t horiz_min4(epu8 v) { return partial_min_round(v)[15]; }
/// Horizontal minimum using only 3 permute-and-min rounds; the minima of
/// the two halves end up in lanes 7 and 15, combined by one scalar min.
/// @note uses #mining_rounds, whose idle lanes index themselves (0 would
/// not be neutral for min).
inline uint8_t horiz_min3(epu8 v) {
    auto sr = mining_rounds;
    v = min(v, permuted(v, sr[0]));
    v = min(v, permuted(v, sr[1]));
    v = min(v, permuted(v, sr[2]));
    return std::min(v[7], v[15]);
}
359
partial_min_ref(epu8 v)360 inline epu8 partial_min_ref(epu8 v) {
361 epu8 res;
362 res[0] = v[0];
363 for (size_t i = 1; i < 16; i++)
364 res[i] = std::min(res[i - 1], v[i]) ;
365 return res;
366 }
/// Partial minima via the generic (non-SIMD) vector implementation.
inline epu8 partial_min_gen(epu8 v) {
    as_VectGeneric(v).partial_min_inplace();
    return v;
}
/// Partial minima in 4 permute-and-min rounds (log-depth prefix min).
inline epu8 partial_min_round(epu8 v) {
    for (epu8 round : mining_rounds)
        v = min(v, permuted(v, round));
    return v;
}
376
377
eval16_ref(epu8 v)378 inline epu8 eval16_ref(epu8 v) {
379 epu8 res{};
380 for (size_t i = 0; i < 16; i++)
381 if (v[i] < 16)
382 res[v[i]]++;
383 return res;
384 }
/// Evaluation (value histogram) computed through a scalar std::array copy.
inline epu8 eval16_arr(epu8 v8) {
    TPUBuild<epu8>::array res{};
    auto v = as_array(v8);
    for (size_t i = 0; i < 16; i++)
        if (v[i] < 16)  // entries >= 16 are ignored
            res[v[i]]++;
    return from_array(res);
}
/// Evaluation via the generic (non-SIMD) vector implementation.
inline epu8 eval16_gen(epu8 v) {
    return from_array(as_VectGeneric(v).eval().v);
}
/// Evaluation by cycling: compare v against the identity after each of
/// the 16 cyclic shifts; res[i] accumulates one count per entry equal
/// to i (the comparison mask is -1 per match, hence the negations).
inline epu8 eval16_cycle(epu8 v) {
    epu8 res = -(epu8id == v);
    for (int i = 1; i < 16; i++) {
        // left_cycle is declared elsewhere — presumably a 16-cycle.
        v = permuted(v, left_cycle);
        res -= (epu8id == v);
    }
    return res;
}
/// Evaluation via movemask+popcount: res[i] is the number of lanes of
/// \p v equal to i.
inline epu8 eval16_popcount(epu8 v) {
    epu8 res{};
    for (size_t i = 0; i < 16; i++) {
        res[i] = _mm_popcnt_u32(_mm_movemask_epi8(v == Epu8(uint8_t(i))));
    }
    return res;
}
411
412
/// Per-byte population count: look up the bit counts of the low and high
/// nibbles (popcount4 is declared elsewhere — presumably a 16-entry
/// nibble-popcount table) and add them.
inline epu8 popcount16(epu8 v){
    return permuted(popcount4, (v & Epu8(0x0f))) + permuted(popcount4, v >> 4);
}
416
417
/// Whether \p v encodes a partial transformation of {0, ..., k-1}.
inline bool is_partial_transformation(epu8 v, const size_t k) {
    uint64_t diff = last_diff(v, epu8id, 16);
    // (forall x in v, x + 1 <= 16) and
    // (v = Perm16::one() or last diff index < 16)
    // x + 1 <= 16 accepts 0..15 and also 0xff (which wraps to 0);
    // 0xff presumably encodes "undefined" — confirm with Perm16 docs.
    // The diff condition forces v to match the identity from index k on.
    return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
        && (diff == 16 || diff < k);
}
425
/// Whether \p v encodes a (total) transformation of {0, ..., k-1}:
/// every entry is < 16 and v matches the identity from index k on.
inline bool is_transformation(epu8 v, const size_t k) {
    uint64_t diff = last_diff(v, epu8id, 16);
    return (_mm_movemask_epi8(v < Epu8(0x10)) == 0xffff)
        && (diff == 16 || diff < k);
}
431
/// Whether \p v encodes a partial permutation of {0, ..., k-1}.
inline bool is_partial_permutation(epu8 v, const size_t k) {
    uint64_t diff = last_diff(v, epu8id, 16);
    // (forall x in v, x <= 15) and
    // (forall x < 15, multiplicity x v <= 1
    // (v = Perm16::one() or last diff index < 16)
    // First test also accepts 0xff (wraps to 0) — presumably the
    // "undefined" marker; eval16 enforces injectivity on defined values.
    return (_mm_movemask_epi8(v + Epu8(1) <= Epu8(0x10)) == 0xffff)
        && (_mm_movemask_epi8(eval16(v) <= Epu8(1)) == 0xffff)
        && (diff == 16 || diff < k);
}
441
/// Whether \p v encodes a permutation of {0, ..., k-1}.
inline bool is_permutation(epu8 v, const size_t k) {
    uint64_t diff = last_diff(v, epu8id, 16);
    // (forall x in v, x in Perm16::one()) and
    // (forall x in Perm16::one(), x in v) and
    // (v = Perm16::one() or last diff index < 16)
    // cmpestri/FIRST_NON_ZERO returns 16 iff every byte of the second
    // operand occurs in the first; both directions give set equality
    // with {0..15}, i.e. v is a bijection of the 16 indices.
    return _mm_cmpestri(epu8id, 16, v, 16, FIRST_NON_ZERO) == 16
        && _mm_cmpestri(v, 16, epu8id, 16, FIRST_NON_ZERO) == 16
        && (diff == 16 || diff < k);
}
451
452 } // namespace HPCombi
453
454 namespace std {
455
/// Print \p a as "[ 0, 1,...,15]"; setw(2) right-aligns one-digit entries.
inline std::ostream &operator<<(std::ostream &stream, HPCombi::epu8 const &a) {
    stream << "[" << std::setw(2) << unsigned(a[0]);
    for (unsigned i = 1; i < 16; ++i)
        stream << "," << std::setw(2) << unsigned(a[i]);
    stream << "]";
    return stream;
}
463
/// std::equal_to specialization dispatching to the vectorized HPCombi::equal.
template <> struct equal_to<HPCombi::epu8> {
    bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const {
        return HPCombi::equal(lhs, rhs);
    }
};

/// std::not_equal_to specialization dispatching to HPCombi::not_equal.
template <> struct not_equal_to<HPCombi::epu8> {
    bool operator()(const HPCombi::epu8 &lhs, const HPCombi::epu8 &rhs) const {
        return HPCombi::not_equal(lhs, rhs);
    }
};
475
/// std::hash specialization: multiplicative mix of the two 64-bit halves.
template <> struct hash<HPCombi::epu8> {
    inline size_t operator()(HPCombi::epu8 a) const {
        // Compute ((v1 * p + v0) * p) in 128-bit arithmetic and keep the
        // high 64 bits (p = HPCombi::prime, declared elsewhere).
        __int128 v0 = _mm_extract_epi64(a, 0);
        __int128 v1 = _mm_extract_epi64(a, 1);
        return ((v1 * HPCombi::prime + v0) * HPCombi::prime) >> 64;

        /* The following is extremely slow on Renner benchmark
        uint64_t v0 = _mm_extract_epi64(ar.v, 0);
        uint64_t v1 = _mm_extract_epi64(ar.v, 1);
        size_t seed = v0 + 0x9e3779b9;
        seed ^= v1 + 0x9e3779b9 + (seed<<6) + (seed>>2);
        return seed;
        */
    }
};
491
492 template <> struct less<HPCombi::epu8> {
493 // WARNING: due to endianess this is not lexicographic comparison,
494 // but we don't care when using in std::set.
495 // 10% faster than calling the lexicographic comparison operator !
operator ()std::less496 inline size_t operator()(const HPCombi::epu8 &v1,
497 const HPCombi::epu8 &v2) const {
498 __m128 v1v = __m128(v1), v2v = __m128(v2);
499 return v1v[0] == v2v[0] ? v1v[1] < v2v[1] : v1v[0] < v2v[0];
500 }
501 };
502
503 } // namespace std
504