1 #include "defs.h"
2 
3 #include <smmintrin.h>
4 #include <stdio.h>
5 
6 /* Straight-line branchless SSE 4.2 code for searching an array of uint16_t
7    hash codes. */
8 
mask(int32_t a,int32_t b)9 static inline int32_t mask(int32_t a, int32_t b) { return -(a == b); }
10 
11 #if defined(__GNUC__)
first_bit_set(int32_t a)12 static inline int32_t first_bit_set(int32_t a) {
13     return __builtin_ffs(a) - 1;
14 }
15 #else
16 static uint8_t de_bruijn_table[] = {
17     0,   1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
18     31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
19 };
20 
first_bit_set(int32_t a)21 static inline int32_t first_bit_set(int32_t a) {
22     int32_t zero_case = mask(0, a);
23     uint32_t x = (uint32_t) (a & -a);
24     x *= 0x077CB531;
25     x >>= 27;
26     return zero_case | de_bruijn_table[x];
27 }
28 #endif
29 
fill(small_hash_t v)30 static inline __m128i fill(small_hash_t v) {
31     int32_t v1 = (((int)v) << 16) | v;
32     __m128i x = _mm_cvtsi32_si128(0);
33     x = _mm_insert_epi32(x, v1, 0);
34     return _mm_shuffle_epi32(x, _MM_SHUFFLE(0,0,0,0));
35 }
36 
37 #ifndef SIDD_UWORD_OPS
38 #define SIDD_UWORD_OPS _SIDD_UWORD_OPS
39 #endif
40 
41 #ifndef SIDD_CMP_EQUAL_EACH
42 #define SIDD_CMP_EQUAL_EACH _SIDD_CMP_EQUAL_EACH
43 #endif
44 
45 #ifndef SIDD_BIT_MASK
46 #define SIDD_BIT_MASK _SIDD_BIT_MASK
47 #endif
48 
49 #define _MODE (SIDD_UWORD_OPS | SIDD_CMP_EQUAL_EACH)
50 
cmp_mask(__m128i a,__m128i b)51 static inline __m128i cmp_mask(__m128i a, __m128i b) {
52     const int mode = SIDD_UWORD_OPS | SIDD_CMP_EQUAL_EACH | SIDD_BIT_MASK;
53     return _mm_cmpistrm(a, b, mode);
54 }
55 
line_result(uint32_t m,int start)56 static inline int32_t line_result(uint32_t m, int start) {
57     int32_t p  = first_bit_set((int32_t) m);
58     int32_t mm = mask(p, -1);
59     return mm | (start + p);
60 }
61 
62 #define DUMP(xval) do {                                       \
63     uint16_t xval##_x0 = _mm_extract_epi16(xval, 0);          \
64     uint16_t xval##_x1 = _mm_extract_epi16(xval, 1);          \
65     uint16_t xval##_x2 = _mm_extract_epi16(xval, 2);          \
66     uint16_t xval##_x3 = _mm_extract_epi16(xval, 3);          \
67     uint16_t xval##_x4 = _mm_extract_epi16(xval, 4);          \
68     uint16_t xval##_x5 = _mm_extract_epi16(xval, 5);          \
69     uint16_t xval##_x6 = _mm_extract_epi16(xval, 6);          \
70     uint16_t xval##_x7 = _mm_extract_epi16(xval, 7);          \
71     printf("  % 10s: %04x-%04x-%04x-%04x-%04x-%04x-%04x-%04x\n", \
72            #xval, xval##_x0, xval##_x1, xval##_x2, xval##_x3, \
73            xval##_x4, xval##_x5, xval##_x6, xval##_x7);       \
74   } while(0);
75 
76 
line_search(small_hash_t * array,int start0,small_hash_t v1)77 int line_search(small_hash_t* array, int start0, small_hash_t v1) {
78     int offset = start0 & 31;
79     int start  = start0 & ~31;
80     __m128i* p = (__m128i*) &array[start];
81     __m128i x1, val1, val2, val3, val4;
82     __m128i m1, m2, m3, m4, dmask;
83 
84     x1 = fill(v1);
85 
86     val1 = *p++;
87     m1 = cmp_mask(x1, val1);
88     val2 = *p++;
89     m2 = _mm_slli_si128(cmp_mask(x1, val2), 1);
90     val3 = *p++;
91     m3 = _mm_slli_si128(cmp_mask(x1, val3), 2);
92     val4 = *p;
93     m4 = _mm_slli_si128(cmp_mask(x1, val4), 3);
94 
95     dmask = _mm_or_si128(_mm_or_si128(m1, m2),
96                          _mm_or_si128(m3, m4));
97     uint32_t imask = _mm_extract_epi32(dmask, 0);
98 
99     const uint32_t p2 = 1 << offset;
100     const uint32_t dest_mask = imask & ~(p2 - 1);
101 
102     return line_result(dest_mask, start);
103 }
104 
line_search_2(small_hash_t * array,int start0,small_hash_t v1,small_hash_t v2)105 int line_search_2(small_hash_t* array, int start0, small_hash_t v1,
106                   small_hash_t v2) {
107     int offset = start0 & 31;
108     int start  = start0 & ~31;
109     __m128i* p = (__m128i*) &array[start];
110     __m128i x1, x2, val1, val2, val3, val4;
111     __m128i m1, m2, m3, m4, dmask;
112 
113     x1 = fill(v1);
114     x2 = fill(v2);
115 
116 #define M(v) _mm_or_si128(cmp_mask(x1,(v)), \
117                           cmp_mask(x2,(v)))
118     val1 = *p++;
119     m1 = M(val1);
120     val2 = *p++;
121     m2 = _mm_slli_si128(M(val2), 1);
122     val3 = *p++;
123     m3 = _mm_slli_si128(M(val3), 2);
124     val4 = *p;
125     m4 = _mm_slli_si128(M(val4), 3);
126 #undef M
127 
128     dmask = _mm_or_si128(_mm_or_si128(m1, m2),
129                          _mm_or_si128(m3, m4));
130     uint32_t imask = _mm_extract_epi32(dmask, 0);
131 
132     const uint32_t p2 = 1 << offset;
133     const uint32_t dest_mask = imask & ~(p2 - 1);
134 
135     return line_result(dest_mask, start);
136 }
137 
line_search_3(small_hash_t * array,int start0,small_hash_t v1,small_hash_t v2,small_hash_t v3)138 int line_search_3(small_hash_t* array, int start0, small_hash_t v1,
139                   small_hash_t v2, small_hash_t v3) {
140     int offset = start0 & 31;
141     int start  = start0 & ~31;
142     __m128i* p = (__m128i*) &array[start];
143     __m128i x1, x2, x3, val1, val2, val3, val4;
144     __m128i m1, m2, m3, m4, dmask;
145 
146     x1 = fill(v1);
147     x2 = fill(v2);
148     x3 = fill(v3);
149 
150 #define M(v) _mm_or_si128(                  \
151         cmp_mask(x1,(v)),                   \
152         _mm_or_si128(cmp_mask(x2,(v)),      \
153                      cmp_mask(x3,(v))))
154     val1 = *p++;
155     m1 = M(val1);
156     val2 = *p++;
157     m2 = _mm_slli_si128(M(val2), 1);
158     val3 = *p++;
159     m3 = _mm_slli_si128(M(val3), 2);
160     val4 = *p;
161     m4 = _mm_slli_si128(M(val4), 3);
162 #undef M
163 
164     dmask = _mm_or_si128(_mm_or_si128(m1, m2),
165                          _mm_or_si128(m3, m4));
166     uint32_t imask = _mm_extract_epi32(dmask, 0);
167 
168     const uint32_t p2 = 1 << offset;
169     const uint32_t dest_mask = imask & ~(p2 - 1);
170 
171     return line_result(dest_mask, start);
172 }
173