1 /* SPDX-License-Identifier: MIT
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 * Copyright:
24 * 2020 Evan Nemerson <evan@nemerson.com>
25 */
26
27 #if !defined(SIMDE_X86_AVX512_LZCNT_H)
28 #define SIMDE_X86_AVX512_LZCNT_H
29
30 #include "types.h"
31 #include "mov.h"
32
33 HEDLEY_DIAGNOSTIC_PUSH
34 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
35 SIMDE_BEGIN_DECLS_
36
/* Prefer the compiler's count-leading-zeros builtins when they are
 * available.  For each width, the first of int / long / long long whose
 * maximum equals that of the 32-bit (resp. 64-bit) integer type is chosen,
 * so the builtin operates on exactly that many bits.  If none of the three
 * matches, the corresponding macro is simply left undefined.
 *
 * NOTE: GCC documents the result of __builtin_clz* as undefined for a zero
 * argument, so users of these macros have to handle zero themselves. */
#if \
( HEDLEY_HAS_BUILTIN(__builtin_clz) || \
HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
HEDLEY_ARM_VERSION_CHECK(4,1,0) ) && \
defined(__INT_MAX__) && defined(__LONG_MAX__) && defined(__LONG_LONG_MAX__) && \
defined(__INT32_MAX__) && defined(__INT64_MAX__)
/* simde_x_clz32: leading zeros of a 32-bit value. */
#if __INT_MAX__ == __INT32_MAX__
#define simde_x_clz32(v) __builtin_clz(HEDLEY_STATIC_CAST(unsigned int, (v)))
#elif __LONG_MAX__ == __INT32_MAX__
#define simde_x_clz32(v) __builtin_clzl(HEDLEY_STATIC_CAST(unsigned long, (v)))
#elif __LONG_LONG_MAX__ == __INT32_MAX__
#define simde_x_clz32(v) __builtin_clzll(HEDLEY_STATIC_CAST(unsigned long long, (v)))
#endif

/* simde_x_clz64: leading zeros of a 64-bit value. */
#if __INT_MAX__ == __INT64_MAX__
#define simde_x_clz64(v) __builtin_clz(HEDLEY_STATIC_CAST(unsigned int, (v)))
#elif __LONG_MAX__ == __INT64_MAX__
#define simde_x_clz64(v) __builtin_clzl(HEDLEY_STATIC_CAST(unsigned long, (v)))
#elif __LONG_LONG_MAX__ == __INT64_MAX__
#define simde_x_clz64(v) __builtin_clzll(HEDLEY_STATIC_CAST(unsigned long long, (v)))
#endif
58 #elif HEDLEY_MSVC_VERSION_CHECK(14,0,0)
59 static int simde_x_clz32(uint32_t x) {
60 unsigned long r;
61 _BitScanReverse(&r, x);
62 return 31 - HEDLEY_STATIC_CAST(int, r);
63 }
64 #define simde_x_clz32 simde_x_clz32
65
66 static int simde_x_clz64(uint64_t x) {
67 unsigned long r;
68
69 #if defined(_M_AMD64) || defined(_M_ARM64)
70 _BitScanReverse64(&r, x);
71 return 63 - HEDLEY_STATIC_CAST(int, r);
72 #else
73 uint32_t high = HEDLEY_STATIC_CAST(uint32_t, x >> 32);
74 if (high != 0)
75 return _BitScanReverse(&r, HEDLEY_STATIC_CAST(unsigned long, high));
76 else
77 return _BitScanReverse(&r, HEDLEY_STATIC_CAST(unsigned long, x & ~UINT32_C(0))) + 32;
78 #endif
79 }
80 #define simde_x_clz64 simde_x_clz64
81 #endif
82
83 #if !defined(simde_x_clz32) || !defined(simde_x_clz64)
/* Table-driven leading-zero count for a single byte.
 *
 * Returns the number of zero bits above the highest set bit of `value`
 * (0 for values >= 128, up to 7 for the value 1).  The entry for 0 is 7,
 * not 8, so a zero byte is treated like the value 1. */
static uint8_t simde_x_avx512cd_lz_lookup(const uint8_t value) {
  static const uint8_t lut[256] = {
    7, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };

  return lut[value];
}
105
106 #if !defined(simde_x_clz32)
simde_x_clz32(uint32_t x)107 static int simde_x_clz32(uint32_t x) {
108 size_t s = sizeof(x) * 8;
109 uint32_t r;
110
111 while ((s -= 8) != 0) {
112 r = x >> s;
113 if (r != 0)
114 return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, r)) +
115 (((sizeof(x) - 1) * 8) - s);
116 }
117
118 if (x == 0)
119 return (int) ((sizeof(x) * 8) - 1);
120 else
121 return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, x)) +
122 ((sizeof(x) - 1) * 8);
123 }
124 #endif
125
126 #if !defined(simde_x_clz64)
simde_x_clz64(uint64_t x)127 static int simde_x_clz64(uint64_t x) {
128 size_t s = sizeof(x) * 8;
129 uint64_t r;
130
131 while ((s -= 8) != 0) {
132 r = x >> s;
133 if (r != 0)
134 return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, r)) +
135 (((sizeof(x) - 1) * 8) - s);
136 }
137
138 if (x == 0)
139 return (int) ((sizeof(x) * 8) - 1);
140 else
141 return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, x)) +
142 ((sizeof(x) - 1) * 8);
143 }
144 #endif
145 #endif
146
147 SIMDE_FUNCTION_ATTRIBUTES
148 simde__m128i
simde_mm_lzcnt_epi32(simde__m128i a)149 simde_mm_lzcnt_epi32(simde__m128i a) {
150 #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE)
151 return _mm_lzcnt_epi32(a);
152 #elif defined(SIMDE_X86_SSE2_NATIVE)
153 /* https://stackoverflow.com/a/58827596/501126 */
154 a = _mm_andnot_si128(_mm_srli_epi32(a, 8), a);
155 a = _mm_castps_si128(_mm_cvtepi32_ps(a));
156 a = _mm_srli_epi32(a, 23);
157 a = _mm_subs_epu16(_mm_set1_epi32(158), a);
158 a = _mm_min_epi16(a, _mm_set1_epi32(32));
159 return a;
160 #else
161 simde__m128i_private
162 r_,
163 a_ = simde__m128i_to_private(a);
164
165 SIMDE_VECTORIZE
166 for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
167 r_.i32[i] = (HEDLEY_UNLIKELY(a_.i32[i] == 0) ? HEDLEY_STATIC_CAST(int32_t, sizeof(int32_t) * CHAR_BIT) : HEDLEY_STATIC_CAST(int32_t, simde_x_clz32(HEDLEY_STATIC_CAST(uint32_t, a_.i32[i]))));
168 }
169
170 return simde__m128i_from_private(r_);
171 #endif
172 }
173 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES)
174 #undef _mm_lzcnt_epi32
175 #define _mm_lzcnt_epi32(a) simde_mm_lzcnt_epi32(a)
176 #endif
177
178 SIMDE_FUNCTION_ATTRIBUTES
179 simde__m128i
simde_mm_mask_lzcnt_epi32(simde__m128i src,simde__mmask8 k,simde__m128i a)180 simde_mm_mask_lzcnt_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a) {
181 #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE)
182 return _mm_mask_lzcnt_epi32(src, k, a);
183 #else
184 return simde_mm_mask_mov_epi32(src, k, simde_mm_lzcnt_epi32(a));
185 #endif
186 }
187 #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
188 #undef _mm_mask_lzcnt_epi32
189 #define _mm_mask_lzcnt_epi32(src, k, a) simde_mm_mask_lzcnt_epi32(src, k, a)
190 #endif
191
192 SIMDE_FUNCTION_ATTRIBUTES
193 simde__m128i
simde_mm_maskz_lzcnt_epi32(simde__mmask8 k,simde__m128i a)194 simde_mm_maskz_lzcnt_epi32(simde__mmask8 k, simde__m128i a) {
195 #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE)
196 return _mm_maskz_lzcnt_epi32(k, a);
197 #else
198 return simde_mm_maskz_mov_epi32(k, simde_mm_lzcnt_epi32(a));
199 #endif
200 }
201 #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
202 #undef _mm_maskz_lzcnt_epi32
203 #define _mm_maskz_lzcnt_epi32(k, a) simde_mm_maskz_lzcnt_epi32(k, a)
204 #endif
205
206 SIMDE_END_DECLS_
207 HEDLEY_DIAGNOSTIC_POP
208
209 #endif /* !defined(SIMDE_X86_AVX512_LZCNT_H) */
210