1 #include "defs.h"
2
3 #include <smmintrin.h>
4 #include <stdio.h>
5
6 /* Straight-line branchless SSE 4.2 code for searching an array of uint16_t
7 hash codes. */
8
mask(int32_t a,int32_t b)9 static inline int32_t mask(int32_t a, int32_t b) { return -(a == b); }
10
11 #if defined(__GNUC__)
first_bit_set(int32_t a)12 static inline int32_t first_bit_set(int32_t a) {
13 return __builtin_ffs(a) - 1;
14 }
15 #else
16 static uint8_t de_bruijn_table[] = {
17 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
18 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19 };
20
first_bit_set(int32_t a)21 static inline int32_t first_bit_set(int32_t a) {
22 int32_t zero_case = mask(0, a);
23 uint32_t x = (uint32_t) (a & -a);
24 x *= 0x077CB531;
25 x >>= 27;
26 return zero_case | de_bruijn_table[x];
27 }
28 #endif
29
fill(small_hash_t v)30 static inline __m128i fill(small_hash_t v) {
31 int32_t v1 = (((int)v) << 16) | v;
32 __m128i x = _mm_cvtsi32_si128(0);
33 x = _mm_insert_epi32(x, v1, 0);
34 return _mm_shuffle_epi32(x, _MM_SHUFFLE(0,0,0,0));
35 }
36
37 #ifndef SIDD_UWORD_OPS
38 #define SIDD_UWORD_OPS _SIDD_UWORD_OPS
39 #endif
40
41 #ifndef SIDD_CMP_EQUAL_EACH
42 #define SIDD_CMP_EQUAL_EACH _SIDD_CMP_EQUAL_EACH
43 #endif
44
45 #ifndef SIDD_BIT_MASK
46 #define SIDD_BIT_MASK _SIDD_BIT_MASK
47 #endif
48
49 #define _MODE (SIDD_UWORD_OPS | SIDD_CMP_EQUAL_EACH)
50
cmp_mask(__m128i a,__m128i b)51 static inline __m128i cmp_mask(__m128i a, __m128i b) {
52 const int mode = SIDD_UWORD_OPS | SIDD_CMP_EQUAL_EACH | SIDD_BIT_MASK;
53 return _mm_cmpistrm(a, b, mode);
54 }
55
line_result(uint32_t m,int start)56 static inline int32_t line_result(uint32_t m, int start) {
57 int32_t p = first_bit_set((int32_t) m);
58 int32_t mm = mask(p, -1);
59 return mm | (start + p);
60 }
61
62 #define DUMP(xval) do { \
63 uint16_t xval##_x0 = _mm_extract_epi16(xval, 0); \
64 uint16_t xval##_x1 = _mm_extract_epi16(xval, 1); \
65 uint16_t xval##_x2 = _mm_extract_epi16(xval, 2); \
66 uint16_t xval##_x3 = _mm_extract_epi16(xval, 3); \
67 uint16_t xval##_x4 = _mm_extract_epi16(xval, 4); \
68 uint16_t xval##_x5 = _mm_extract_epi16(xval, 5); \
69 uint16_t xval##_x6 = _mm_extract_epi16(xval, 6); \
70 uint16_t xval##_x7 = _mm_extract_epi16(xval, 7); \
71 printf(" % 10s: %04x-%04x-%04x-%04x-%04x-%04x-%04x-%04x\n", \
72 #xval, xval##_x0, xval##_x1, xval##_x2, xval##_x3, \
73 xval##_x4, xval##_x5, xval##_x6, xval##_x7); \
74 } while(0);
75
76
line_search(small_hash_t * array,int start0,small_hash_t v1)77 int line_search(small_hash_t* array, int start0, small_hash_t v1) {
78 int offset = start0 & 31;
79 int start = start0 & ~31;
80 __m128i* p = (__m128i*) &array[start];
81 __m128i x1, val1, val2, val3, val4;
82 __m128i m1, m2, m3, m4, dmask;
83
84 x1 = fill(v1);
85
86 val1 = *p++;
87 m1 = cmp_mask(x1, val1);
88 val2 = *p++;
89 m2 = _mm_slli_si128(cmp_mask(x1, val2), 1);
90 val3 = *p++;
91 m3 = _mm_slli_si128(cmp_mask(x1, val3), 2);
92 val4 = *p;
93 m4 = _mm_slli_si128(cmp_mask(x1, val4), 3);
94
95 dmask = _mm_or_si128(_mm_or_si128(m1, m2),
96 _mm_or_si128(m3, m4));
97 uint32_t imask = _mm_extract_epi32(dmask, 0);
98
99 const uint32_t p2 = 1 << offset;
100 const uint32_t dest_mask = imask & ~(p2 - 1);
101
102 return line_result(dest_mask, start);
103 }
104
line_search_2(small_hash_t * array,int start0,small_hash_t v1,small_hash_t v2)105 int line_search_2(small_hash_t* array, int start0, small_hash_t v1,
106 small_hash_t v2) {
107 int offset = start0 & 31;
108 int start = start0 & ~31;
109 __m128i* p = (__m128i*) &array[start];
110 __m128i x1, x2, val1, val2, val3, val4;
111 __m128i m1, m2, m3, m4, dmask;
112
113 x1 = fill(v1);
114 x2 = fill(v2);
115
116 #define M(v) _mm_or_si128(cmp_mask(x1,(v)), \
117 cmp_mask(x2,(v)))
118 val1 = *p++;
119 m1 = M(val1);
120 val2 = *p++;
121 m2 = _mm_slli_si128(M(val2), 1);
122 val3 = *p++;
123 m3 = _mm_slli_si128(M(val3), 2);
124 val4 = *p;
125 m4 = _mm_slli_si128(M(val4), 3);
126 #undef M
127
128 dmask = _mm_or_si128(_mm_or_si128(m1, m2),
129 _mm_or_si128(m3, m4));
130 uint32_t imask = _mm_extract_epi32(dmask, 0);
131
132 const uint32_t p2 = 1 << offset;
133 const uint32_t dest_mask = imask & ~(p2 - 1);
134
135 return line_result(dest_mask, start);
136 }
137
line_search_3(small_hash_t * array,int start0,small_hash_t v1,small_hash_t v2,small_hash_t v3)138 int line_search_3(small_hash_t* array, int start0, small_hash_t v1,
139 small_hash_t v2, small_hash_t v3) {
140 int offset = start0 & 31;
141 int start = start0 & ~31;
142 __m128i* p = (__m128i*) &array[start];
143 __m128i x1, x2, x3, val1, val2, val3, val4;
144 __m128i m1, m2, m3, m4, dmask;
145
146 x1 = fill(v1);
147 x2 = fill(v2);
148 x3 = fill(v3);
149
150 #define M(v) _mm_or_si128( \
151 cmp_mask(x1,(v)), \
152 _mm_or_si128(cmp_mask(x2,(v)), \
153 cmp_mask(x3,(v))))
154 val1 = *p++;
155 m1 = M(val1);
156 val2 = *p++;
157 m2 = _mm_slli_si128(M(val2), 1);
158 val3 = *p++;
159 m3 = _mm_slli_si128(M(val3), 2);
160 val4 = *p;
161 m4 = _mm_slli_si128(M(val4), 3);
162 #undef M
163
164 dmask = _mm_or_si128(_mm_or_si128(m1, m2),
165 _mm_or_si128(m3, m4));
166 uint32_t imask = _mm_extract_epi32(dmask, 0);
167
168 const uint32_t p2 = 1 << offset;
169 const uint32_t dest_mask = imask & ~(p2 - 1);
170
171 return line_result(dest_mask, start);
172 }
173