/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2020      Evan Nemerson <evan@nemerson.com>
 *   2020      Christopher Moore <moore@free.fr>
 */

/* The GFNI implementation is based on Wojciech Muła's work at
 * http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html#bit-shuffling via
 * https://github.com/InstLatx64/InstLatX64_Demo/blob/49c27effdfd5a45f27e0ccb6e2f3be5f27c3845d/GFNI_Demo.h#L173 */
31
32 #if !defined(SIMDE_ARM_NEON_RBIT_H)
33 #define SIMDE_ARM_NEON_RBIT_H
34
35 #include "reinterpret.h"
36 #include "types.h"
37
38 HEDLEY_DIAGNOSTIC_PUSH
39 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
40 SIMDE_BEGIN_DECLS_
41
/* Reverses the order of the bits within each 8-bit lane of `a`
 * (emulation of the AArch64 NEON `vrbit_u8` instruction). */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x8_t
simde_vrbit_u8(simde_uint8x8_t a) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vrbit_u8(a);
  #elif defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_X86_GFNI_NATIVE)
    /* GF(2^8) affine transform: 0x8040201008040201 is the bit-reversal
     * matrix (see the Muła reference at the top of this file).  The
     * 64-bit MMX value is widened to 128 bits for the GFNI op and
     * narrowed back afterwards. */
    __m128i tmp = _mm_movpi64_epi64(a);
    tmp = _mm_gf2p8affine_epi64_epi8(tmp, _mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, UINT64_C(0x8040201008040201))), 0);
    return _mm_movepi64_pi64(tmp);
  #elif defined(SIMDE_X86_MMX_NATIVE)
    /* Classic mask-and-shift reversal: swap adjacent bits (mask 0x55),
     * then bit pairs (0x33), then nibbles (0x0F).  MMX has no 8-bit
     * shifts, so 16-bit shifts are used instead; the bits that leak
     * across byte boundaries are exactly the ones the masks discard. */
    __m64 mask;
    mask = _mm_set1_pi8(0x55);
    a = _mm_or_si64(_mm_andnot_si64(mask, _mm_slli_pi16(a, 1)), _mm_and_si64(mask, _mm_srli_pi16(a, 1)));
    mask = _mm_set1_pi8(0x33);
    a = _mm_or_si64(_mm_andnot_si64(mask, _mm_slli_pi16(a, 2)), _mm_and_si64(mask, _mm_srli_pi16(a, 2)));
    mask = _mm_set1_pi8(0x0F);
    a = _mm_or_si64(_mm_andnot_si64(mask, _mm_slli_pi16(a, 4)), _mm_and_si64(mask, _mm_srli_pi16(a, 4)));
    return a;
  #else
    /* Portable fallback: reverse each byte independently. */
    simde_uint8x8_private
      r_,
      a_ = simde_uint8x8_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      #if HEDLEY_HAS_BUILTIN(__builtin_bitreverse8) && !defined(HEDLEY_IBM_VERSION)
        r_.values[i] = __builtin_bitreverse8(a_.values[i]);
      #else
        /* Multiply-based byte reversal ("Bit Twiddling Hacks"): the
         * first multiply spreads copies of the byte across a 64-bit
         * word, the AND keeps each bit at its mirrored position, and
         * the second multiply gathers them into the result byte. */
        r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, (((a_.values[i] * UINT64_C(0x80200802)) & UINT64_C(0x0884422110)) * UINT64_C(0x0101010101)) >> 32);
      #endif
    }

    return simde_uint8x8_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  #undef vrbit_u8
  #define vrbit_u8(a) simde_vrbit_u8(a)
#endif
81
82 SIMDE_FUNCTION_ATTRIBUTES
83 simde_int8x8_t
simde_vrbit_s8(simde_int8x8_t a)84 simde_vrbit_s8(simde_int8x8_t a) {
85 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
86 return vrbit_s8(a);
87 #else
88 return simde_vreinterpret_s8_u8(simde_vrbit_u8(simde_vreinterpret_u8_s8(a)));
89 #endif
90 }
91 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
92 #undef vrbit_s8
93 #define vrbit_s8(a) simde_vrbit_s8(a)
94 #endif
95
/* Reverses the order of the bits within each 8-bit lane of `a`
 * (emulation of the AArch64 NEON `vrbitq_u8` instruction, 128-bit
 * variant). */
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x16_t
simde_vrbitq_u8(simde_uint8x16_t a) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    return vrbitq_u8(a);
  #elif defined(SIMDE_X86_GFNI_NATIVE)
    /* GF(2^8) affine transform: 0x8040201008040201 is the bit-reversal
     * matrix (see the Muła reference at the top of this file). */
    return _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, UINT64_C(0x8040201008040201))), 0);
  #elif defined(SIMDE_X86_SSE2_NATIVE)
    /* Mask-and-shift reversal: swap adjacent bits (0x55), then bit
     * pairs (0x33), then nibbles (0x0F).  SSE2 has no 8-bit shifts, so
     * 16-bit shifts are used; the masks discard the bits that leak
     * across byte boundaries. */
    __m128i mask;
    mask = _mm_set1_epi8(0x55);
    a = _mm_or_si128(_mm_andnot_si128(mask, _mm_slli_epi16(a, 1)), _mm_and_si128(mask, _mm_srli_epi16(a, 1)));
    mask = _mm_set1_epi8(0x33);
    a = _mm_or_si128(_mm_andnot_si128(mask, _mm_slli_epi16(a, 2)), _mm_and_si128(mask, _mm_srli_epi16(a, 2)));
    mask = _mm_set1_epi8(0x0F);
    a = _mm_or_si128(_mm_andnot_si128(mask, _mm_slli_epi16(a, 4)), _mm_and_si128(mask, _mm_srli_epi16(a, 4)));
    return a;
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    /* Same swap ladder; vec_sel picks the right-shifted bits where the
     * mask is set and the left-shifted bits elsewhere.  AltiVec shifts
     * are per-byte, so the final nibble swap needs no mask at all. */
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) shift;
    shift = vec_splat_u8(1);
    a = vec_sel(vec_sl(a, shift), vec_sr(a, shift), vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x55)));
    shift = vec_splat_u8(2);
    a = vec_sel(vec_sl(a, shift), vec_sr(a, shift), vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x33)));
    shift = vec_splat_u8(4);
    a = vec_or(vec_sl(a, shift), vec_sr(a, shift));
    return a;
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    /* Same swap ladder via bitselect (first operand chosen where the
     * mask bit is set).  WASM shifts are per-lane, so the nibble swap
     * is a plain OR. */
    a = wasm_v128_bitselect(wasm_u8x16_shr(a, 1), wasm_i8x16_shl(a, 1), wasm_i8x16_splat(0x55));
    a = wasm_v128_bitselect(wasm_u8x16_shr(a, 2), wasm_i8x16_shl(a, 2), wasm_i8x16_splat(0x33));
    a = wasm_v128_or(wasm_u8x16_shr(a, 4), wasm_i8x16_shl(a, 4));
    return a;
  #else
    /* Portable fallback: reverse each byte independently. */
    simde_uint8x16_private
      r_,
      a_ = simde_uint8x16_to_private(a);

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
      #if HEDLEY_HAS_BUILTIN(__builtin_bitreverse8) && !defined(HEDLEY_IBM_VERSION)
        r_.values[i] = __builtin_bitreverse8(a_.values[i]);
      #else
        /* Multiply-based byte reversal ("Bit Twiddling Hacks"): spread
         * the byte across 64 bits, keep the mirrored bit positions,
         * then gather them back with a second multiply. */
        r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, (((a_.values[i] * UINT64_C(0x80200802)) & UINT64_C(0x0884422110)) * UINT64_C(0x0101010101)) >> 32);
      #endif
    }

    return simde_uint8x16_from_private(r_);
  #endif
}
#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
  #undef vrbitq_u8
  #define vrbitq_u8(a) simde_vrbitq_u8(a)
#endif
147
148 SIMDE_FUNCTION_ATTRIBUTES
149 simde_int8x16_t
simde_vrbitq_s8(simde_int8x16_t a)150 simde_vrbitq_s8(simde_int8x16_t a) {
151 #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
152 return vrbitq_s8(a);
153 #else
154 return simde_vreinterpretq_s8_u8(simde_vrbitq_u8(simde_vreinterpretq_u8_s8(a)));
155 #endif
156 }
157 #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
158 #undef vrbitq_s8
159 #define vrbitq_s8(a) simde_vrbitq_s8(a)
160 #endif
161
162 SIMDE_END_DECLS_
163 HEDLEY_DIAGNOSTIC_POP
164
165 #endif /* !defined(SIMDE_ARM_NEON_RBIT_H) */
166