1 /* SPDX-License-Identifier: MIT
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Copyright:
24  *   2020      Evan Nemerson <evan@nemerson.com>
25  */
26 
27 #if !defined(SIMDE_X86_AVX512_LZCNT_H)
28 #define SIMDE_X86_AVX512_LZCNT_H
29 
30 #include "types.h"
31 #include "mov.h"
32 
33 HEDLEY_DIAGNOSTIC_PUSH
34 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
35 SIMDE_BEGIN_DECLS_
36 
/* Scalar count-leading-zeros helpers, compiler-builtin flavour.
 *
 * Taken when the compiler advertises __builtin_clz (via the Hedley
 * feature/version checks) and predefines the integer-limit macros
 * tested below.  simde_x_clz32 / simde_x_clz64 are then defined as
 * function-like macros wrapping whichever of __builtin_clz,
 * __builtin_clzl or __builtin_clzll takes an operand whose type has
 * the required width.
 *
 * NOTE: GCC documents the __builtin_clz family as undefined for a zero
 * argument; simde_mm_lzcnt_epi32 in this file special-cases zero
 * before calling simde_x_clz32. */
#if \
    ( HEDLEY_HAS_BUILTIN(__builtin_clz) || \
      HEDLEY_GCC_VERSION_CHECK(3,4,0) || \
      HEDLEY_ARM_VERSION_CHECK(4,1,0) ) && \
    defined(__INT_MAX__) && defined(__LONG_MAX__) && defined(__LONG_LONG_MAX__) && \
    defined(__INT32_MAX__) && defined(__INT64_MAX__)
  /* 32-bit variant: choose the first of int / long / long long whose
   * maximum equals __INT32_MAX__.  If none matches, simde_x_clz32 is
   * left undefined here. */
  #if __INT_MAX__ == __INT32_MAX__
    #define simde_x_clz32(v) __builtin_clz(HEDLEY_STATIC_CAST(unsigned int, (v)))
  #elif __LONG_MAX__ == __INT32_MAX__
    #define simde_x_clz32(v) __builtin_clzl(HEDLEY_STATIC_CAST(unsigned long, (v)))
  #elif __LONG_LONG_MAX__ == __INT32_MAX__
    #define simde_x_clz32(v) __builtin_clzll(HEDLEY_STATIC_CAST(unsigned long long, (v)))
  #endif

  /* 64-bit variant: same selection, keyed on __INT64_MAX__.  If none
   * matches, simde_x_clz64 is left undefined here. */
  #if __INT_MAX__ == __INT64_MAX__
    #define simde_x_clz64(v) __builtin_clz(HEDLEY_STATIC_CAST(unsigned int, (v)))
  #elif __LONG_MAX__ == __INT64_MAX__
    #define simde_x_clz64(v) __builtin_clzl(HEDLEY_STATIC_CAST(unsigned long, (v)))
  #elif __LONG_LONG_MAX__ == __INT64_MAX__
    #define simde_x_clz64(v) __builtin_clzll(HEDLEY_STATIC_CAST(unsigned long long, (v)))
  #endif
58 #elif HEDLEY_MSVC_VERSION_CHECK(14,0,0)
59   static int simde_x_clz32(uint32_t x) {
60     unsigned long r;
61     _BitScanReverse(&r, x);
62     return 31 - HEDLEY_STATIC_CAST(int, r);
63   }
64   #define simde_x_clz32 simde_x_clz32
65 
66   static int simde_x_clz64(uint64_t x) {
67     unsigned long r;
68 
69     #if defined(_M_AMD64) || defined(_M_ARM64)
70       _BitScanReverse64(&r, x);
71       return 63 - HEDLEY_STATIC_CAST(int, r);
72     #else
73       uint32_t high = HEDLEY_STATIC_CAST(uint32_t, x >> 32);
74       if (high != 0)
75         return _BitScanReverse(&r, HEDLEY_STATIC_CAST(unsigned long, high));
76       else
77         return _BitScanReverse(&r, HEDLEY_STATIC_CAST(unsigned long, x & ~UINT32_C(0))) + 32;
78     #endif
79   }
80   #define simde_x_clz64 simde_x_clz64
81 #endif
82 
83 #if !defined(simde_x_clz32) || !defined(simde_x_clz64)
simde_x_avx512cd_lz_lookup(const uint8_t value)84   static uint8_t simde_x_avx512cd_lz_lookup(const uint8_t value) {
85     static const uint8_t lut[256] = {
86       7, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
87       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
88       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
89       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
90       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
91       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
93       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
94       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
98       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
99       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
102     };
103     return lut[value];
104   };
105 
106   #if !defined(simde_x_clz32)
simde_x_clz32(uint32_t x)107     static int simde_x_clz32(uint32_t x) {
108       size_t s = sizeof(x) * 8;
109       uint32_t r;
110 
111       while ((s -= 8) != 0) {
112         r = x >> s;
113         if (r != 0)
114           return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, r)) +
115             (((sizeof(x) - 1) * 8) - s);
116       }
117 
118       if (x == 0)
119         return (int) ((sizeof(x) * 8) - 1);
120       else
121         return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, x)) +
122           ((sizeof(x) - 1) * 8);
123     }
124   #endif
125 
126   #if !defined(simde_x_clz64)
simde_x_clz64(uint64_t x)127     static int simde_x_clz64(uint64_t x) {
128       size_t s = sizeof(x) * 8;
129       uint64_t r;
130 
131       while ((s -= 8) != 0) {
132         r = x >> s;
133         if (r != 0)
134           return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, r)) +
135             (((sizeof(x) - 1) * 8) - s);
136       }
137 
138       if (x == 0)
139         return (int) ((sizeof(x) * 8) - 1);
140       else
141         return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, x)) +
142           ((sizeof(x) - 1) * 8);
143     }
144   #endif
145 #endif
146 
147 SIMDE_FUNCTION_ATTRIBUTES
148 simde__m128i
simde_mm_lzcnt_epi32(simde__m128i a)149 simde_mm_lzcnt_epi32(simde__m128i a) {
150   #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE)
151     return _mm_lzcnt_epi32(a);
152   #elif defined(SIMDE_X86_SSE2_NATIVE)
153     /* https://stackoverflow.com/a/58827596/501126 */
154     a = _mm_andnot_si128(_mm_srli_epi32(a, 8), a);
155     a = _mm_castps_si128(_mm_cvtepi32_ps(a));
156     a = _mm_srli_epi32(a, 23);
157     a = _mm_subs_epu16(_mm_set1_epi32(158), a);
158     a = _mm_min_epi16(a, _mm_set1_epi32(32));
159     return a;
160   #else
161     simde__m128i_private
162       r_,
163       a_ = simde__m128i_to_private(a);
164 
165     SIMDE_VECTORIZE
166     for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
167       r_.i32[i] = (HEDLEY_UNLIKELY(a_.i32[i] == 0) ? HEDLEY_STATIC_CAST(int32_t, sizeof(int32_t) * CHAR_BIT) : HEDLEY_STATIC_CAST(int32_t, simde_x_clz32(HEDLEY_STATIC_CAST(uint32_t, a_.i32[i]))));
168     }
169 
170     return simde__m128i_from_private(r_);
171   #endif
172 }
173 #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES)
174   #undef _mm_lzcnt_epi32
175   #define _mm_lzcnt_epi32(a) simde_mm_lzcnt_epi32(a)
176 #endif
177 
178 SIMDE_FUNCTION_ATTRIBUTES
179 simde__m128i
simde_mm_mask_lzcnt_epi32(simde__m128i src,simde__mmask8 k,simde__m128i a)180 simde_mm_mask_lzcnt_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a) {
181   #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE)
182     return _mm_mask_lzcnt_epi32(src, k, a);
183   #else
184     return simde_mm_mask_mov_epi32(src, k, simde_mm_lzcnt_epi32(a));
185   #endif
186 }
187 #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
188   #undef _mm_mask_lzcnt_epi32
189   #define _mm_mask_lzcnt_epi32(src, k, a) simde_mm_mask_lzcnt_epi32(src, k, a)
190 #endif
191 
192 SIMDE_FUNCTION_ATTRIBUTES
193 simde__m128i
simde_mm_maskz_lzcnt_epi32(simde__mmask8 k,simde__m128i a)194 simde_mm_maskz_lzcnt_epi32(simde__mmask8 k, simde__m128i a) {
195   #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE)
196     return _mm_maskz_lzcnt_epi32(k, a);
197   #else
198     return simde_mm_maskz_mov_epi32(k, simde_mm_lzcnt_epi32(a));
199   #endif
200 }
201 #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
202   #undef _mm_maskz_lzcnt_epi32
203   #define _mm_maskz_lzcnt_epi32(k, a) simde_mm_maskz_lzcnt_epi32(k, a)
204 #endif
205 
206 SIMDE_END_DECLS_
207 HEDLEY_DIAGNOSTIC_POP
208 
209 #endif /* !defined(SIMDE_X86_AVX512_LZCNT_H) */
210