//! LLVM bit manipulation intrinsics.
#[rustfmt::skip]

use crate::*;

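// Raw bindings to LLVM's generic lane-wise bit-manipulation intrinsics:
// `llvm.ctpop.*` counts set bits, `llvm.ctlz.*` counts leading zeros, and
// `llvm.cttz.*` counts trailing zeros, per lane. The `is_zero_undef` flag
// must be a compile-time constant; passing `false` keeps the result defined
// (the lane bit width) when a lane is zero.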
#[allow(improper_ctypes, dead_code)]
extern "C" {
    #[link_name = "llvm.ctlz.v2i8"]
    fn ctlz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2;
    #[link_name = "llvm.ctlz.v4i8"]
    fn ctlz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4;
    #[link_name = "llvm.ctlz.v8i8"]
    fn ctlz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8;
    #[link_name = "llvm.ctlz.v16i8"]
    fn ctlz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16;
    #[link_name = "llvm.ctlz.v32i8"]
    fn ctlz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32;
    #[link_name = "llvm.ctlz.v64i8"]
    fn ctlz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64;

    #[link_name = "llvm.ctlz.v2i16"]
    fn ctlz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2;
    #[link_name = "llvm.ctlz.v4i16"]
    fn ctlz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4;
    #[link_name = "llvm.ctlz.v8i16"]
    fn ctlz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8;
    #[link_name = "llvm.ctlz.v16i16"]
    fn ctlz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16;
    #[link_name = "llvm.ctlz.v32i16"]
    fn ctlz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32;

    #[link_name = "llvm.ctlz.v2i32"]
    fn ctlz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2;
    #[link_name = "llvm.ctlz.v4i32"]
    fn ctlz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4;
    #[link_name = "llvm.ctlz.v8i32"]
    fn ctlz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8;
    #[link_name = "llvm.ctlz.v16i32"]
    fn ctlz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16;

    #[link_name = "llvm.ctlz.v2i64"]
    fn ctlz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2;
    #[link_name = "llvm.ctlz.v4i64"]
    fn ctlz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4;
    #[link_name = "llvm.ctlz.v8i64"]
    fn ctlz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8;

    #[link_name = "llvm.ctlz.v1i128"]
    fn ctlz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1;
    #[link_name = "llvm.ctlz.v2i128"]
    fn ctlz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2;
    #[link_name = "llvm.ctlz.v4i128"]
    fn ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4;

    #[link_name = "llvm.cttz.v2i8"]
    fn cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2;
    #[link_name = "llvm.cttz.v4i8"]
    fn cttz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4;
    #[link_name = "llvm.cttz.v8i8"]
    fn cttz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8;
    #[link_name = "llvm.cttz.v16i8"]
    fn cttz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16;
    #[link_name = "llvm.cttz.v32i8"]
    fn cttz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32;
    #[link_name = "llvm.cttz.v64i8"]
    fn cttz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64;

    #[link_name = "llvm.cttz.v2i16"]
    fn cttz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2;
    #[link_name = "llvm.cttz.v4i16"]
    fn cttz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4;
    #[link_name = "llvm.cttz.v8i16"]
    fn cttz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8;
    #[link_name = "llvm.cttz.v16i16"]
    fn cttz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16;
    #[link_name = "llvm.cttz.v32i16"]
    fn cttz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32;

    #[link_name = "llvm.cttz.v2i32"]
    fn cttz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2;
    #[link_name = "llvm.cttz.v4i32"]
    fn cttz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4;
    #[link_name = "llvm.cttz.v8i32"]
    fn cttz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8;
    #[link_name = "llvm.cttz.v16i32"]
    fn cttz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16;

    #[link_name = "llvm.cttz.v2i64"]
    fn cttz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2;
    #[link_name = "llvm.cttz.v4i64"]
    fn cttz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4;
    #[link_name = "llvm.cttz.v8i64"]
    fn cttz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8;

    #[link_name = "llvm.cttz.v1i128"]
    fn cttz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1;
    #[link_name = "llvm.cttz.v2i128"]
    fn cttz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2;
    #[link_name = "llvm.cttz.v4i128"]
    fn cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4;

    #[link_name = "llvm.ctpop.v2i8"]
    fn ctpop_u8x2(x: u8x2) -> u8x2;
    #[link_name = "llvm.ctpop.v4i8"]
    fn ctpop_u8x4(x: u8x4) -> u8x4;
    #[link_name = "llvm.ctpop.v8i8"]
    fn ctpop_u8x8(x: u8x8) -> u8x8;
    #[link_name = "llvm.ctpop.v16i8"]
    fn ctpop_u8x16(x: u8x16) -> u8x16;
    #[link_name = "llvm.ctpop.v32i8"]
    fn ctpop_u8x32(x: u8x32) -> u8x32;
    #[link_name = "llvm.ctpop.v64i8"]
    fn ctpop_u8x64(x: u8x64) -> u8x64;

    #[link_name = "llvm.ctpop.v2i16"]
    fn ctpop_u16x2(x: u16x2) -> u16x2;
    #[link_name = "llvm.ctpop.v4i16"]
    fn ctpop_u16x4(x: u16x4) -> u16x4;
    #[link_name = "llvm.ctpop.v8i16"]
    fn ctpop_u16x8(x: u16x8) -> u16x8;
    #[link_name = "llvm.ctpop.v16i16"]
    fn ctpop_u16x16(x: u16x16) -> u16x16;
    #[link_name = "llvm.ctpop.v32i16"]
    fn ctpop_u16x32(x: u16x32) -> u16x32;

    #[link_name = "llvm.ctpop.v2i32"]
    fn ctpop_u32x2(x: u32x2) -> u32x2;
    #[link_name = "llvm.ctpop.v4i32"]
    fn ctpop_u32x4(x: u32x4) -> u32x4;
    #[link_name = "llvm.ctpop.v8i32"]
    fn ctpop_u32x8(x: u32x8) -> u32x8;
    #[link_name = "llvm.ctpop.v16i32"]
    fn ctpop_u32x16(x: u32x16) -> u32x16;

    #[link_name = "llvm.ctpop.v2i64"]
    fn ctpop_u64x2(x: u64x2) -> u64x2;
    #[link_name = "llvm.ctpop.v4i64"]
    fn ctpop_u64x4(x: u64x4) -> u64x4;
    #[link_name = "llvm.ctpop.v8i64"]
    fn ctpop_u64x8(x: u64x8) -> u64x8;

    #[link_name = "llvm.ctpop.v1i128"]
    fn ctpop_u128x1(x: u128x1) -> u128x1;
    #[link_name = "llvm.ctpop.v2i128"]
    fn ctpop_u128x2(x: u128x2) -> u128x2;
    #[link_name = "llvm.ctpop.v4i128"]
    fn ctpop_u128x4(x: u128x4) -> u128x4;
}

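/// Lane-wise bit-manipulation operations: population count (`ctpop`),
/// leading-zero count (`ctlz`), and trailing-zero count (`cttz`).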
pub(crate) trait BitManip {
    fn ctpop(self) -> Self;
    fn ctlz(self) -> Self;
    fn cttz(self) -> Self;
}

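// `impl_bit_manip!` has three implementation strategies:
// - `inner:` calls the LLVM intrinsics directly (cast to the unsigned
//   vector `$uty`, invoke, cast back);
// - `scalar:` is a per-lane fallback using the scalar `count_ones` /
//   `leading_zeros` / `trailing_zeros` methods, used on s390x where the
//   LLVM intrinsics are broken (see the FIXME below);
// - `sized_inner:` forwards pointer-sized vectors to a fixed-width
//   unsigned vector impl, selected at the bottom of the file.
// The last two arms just instantiate the unsigned/signed pair in one shot.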
macro_rules! impl_bit_manip {
    (inner: $ty:ident, $scalar:ty, $uty:ident,
     $ctpop:ident, $ctlz:ident, $cttz:ident) => {
        // FIXME: several LLVM intrinsics break on s390x https://github.com/rust-lang-nursery/packed_simd/issues/192
        #[cfg(target_arch = "s390x")]
        impl_bit_manip! { scalar: $ty, $scalar }
        #[cfg(not(target_arch = "s390x"))]
        impl BitManip for $ty {
            #[inline]
            fn ctpop(self) -> Self {
                let y: $uty = self.cast();
                unsafe { $ctpop(y).cast() }
            }

            #[inline]
            fn ctlz(self) -> Self {
                let y: $uty = self.cast();
                // the ctlz/cttz intrinsics need a compile-time constant
                // `is_zero_undef`
                unsafe { $ctlz(y, false).cast() }
            }

            #[inline]
            fn cttz(self) -> Self {
                let y: $uty = self.cast();
                unsafe { $cttz(y, false).cast() }
            }
        }
    };
    (sized_inner: $ty:ident, $scalar:ty, $uty:ident) => {
        #[cfg(target_arch = "s390x")]
        impl_bit_manip! { scalar: $ty, $scalar }
        #[cfg(not(target_arch = "s390x"))]
        impl BitManip for $ty {
            #[inline]
            fn ctpop(self) -> Self {
                let y: $uty = self.cast();
                $uty::ctpop(y).cast()
            }

            #[inline]
            fn ctlz(self) -> Self {
                let y: $uty = self.cast();
                $uty::ctlz(y).cast()
            }

            #[inline]
            fn cttz(self) -> Self {
                let y: $uty = self.cast();
                $uty::cttz(y).cast()
            }
        }
    };
    (scalar: $ty:ident, $scalar:ty) => {
        impl BitManip for $ty {
            #[inline]
            fn ctpop(self) -> Self {
                let mut ones = self;
                for i in 0..Self::lanes() {
                    ones = ones
                        .replace(i, self.extract(i).count_ones() as $scalar);
                }
                ones
            }

            #[inline]
            fn ctlz(self) -> Self {
                let mut lz = self;
                for i in 0..Self::lanes() {
                    lz = lz.replace(
                        i,
                        self.extract(i).leading_zeros() as $scalar,
                    );
                }
                lz
            }

            #[inline]
            fn cttz(self) -> Self {
                let mut tz = self;
                for i in 0..Self::lanes() {
                    tz = tz.replace(
                        i,
                        self.extract(i).trailing_zeros() as $scalar,
                    );
                }
                tz
            }
        }
    };
    ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty,
     $ctpop:ident, $ctlz:ident, $cttz:ident) => {
        impl_bit_manip! { inner: $uty, $uscalar, $uty, $ctpop, $ctlz, $cttz }
        impl_bit_manip! { inner: $ity, $iscalar, $uty, $ctpop, $ctlz, $cttz }
    };
    (sized: $usize:ident, $uscalar:ty, $isize:ident,
     $iscalar:ty, $ty:ident) => {
        impl_bit_manip! { sized_inner: $usize, $uscalar, $ty }
        impl_bit_manip! { sized_inner: $isize, $iscalar, $ty }
    };
}

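// Instantiate the LLVM-backed impls for every unsigned/signed vector pair
// (u8x8/i8x8 are special-cased on aarch64 below).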
impl_bit_manip! { u8x2   ,   u8, i8x2, i8,   ctpop_u8x2,   ctlz_u8x2,   cttz_u8x2   }
impl_bit_manip! { u8x4   ,   u8, i8x4, i8,   ctpop_u8x4,   ctlz_u8x4,   cttz_u8x4   }
#[cfg(not(target_arch = "aarch64"))] // see below
impl_bit_manip! { u8x8   ,   u8, i8x8, i8,   ctpop_u8x8,   ctlz_u8x8,   cttz_u8x8   }
impl_bit_manip! { u8x16  ,  u8, i8x16, i8,  ctpop_u8x16,  ctlz_u8x16,  cttz_u8x16  }
impl_bit_manip! { u8x32  ,  u8, i8x32, i8,  ctpop_u8x32,  ctlz_u8x32,  cttz_u8x32  }
impl_bit_manip! { u8x64  ,  u8, i8x64, i8,  ctpop_u8x64,  ctlz_u8x64,  cttz_u8x64  }
impl_bit_manip! { u16x2  ,  u16, i16x2, i16,  ctpop_u16x2,  ctlz_u16x2,  cttz_u16x2  }
impl_bit_manip! { u16x4  ,  u16, i16x4, i16,  ctpop_u16x4,  ctlz_u16x4,  cttz_u16x4  }
impl_bit_manip! { u16x8  ,  u16, i16x8, i16,  ctpop_u16x8,  ctlz_u16x8,  cttz_u16x8  }
impl_bit_manip! { u16x16 , u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 }
impl_bit_manip! { u16x32 , u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 }
impl_bit_manip! { u32x2  ,  u32, i32x2, i32,  ctpop_u32x2,  ctlz_u32x2,  cttz_u32x2  }
impl_bit_manip! { u32x4  ,  u32, i32x4, i32,  ctpop_u32x4,  ctlz_u32x4,  cttz_u32x4  }
impl_bit_manip! { u32x8  ,  u32, i32x8, i32,  ctpop_u32x8,  ctlz_u32x8,  cttz_u32x8  }
impl_bit_manip! { u32x16 , u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 }
impl_bit_manip! { u64x2  ,  u64, i64x2, i64,  ctpop_u64x2,  ctlz_u64x2,  cttz_u64x2  }
impl_bit_manip! { u64x4  ,  u64, i64x4, i64,  ctpop_u64x4,  ctlz_u64x4,  cttz_u64x4  }
impl_bit_manip! { u64x8  ,  u64, i64x8, i64,  ctpop_u64x8,  ctlz_u64x8,  cttz_u64x8  }
impl_bit_manip! { u128x1 , u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 }
impl_bit_manip! { u128x2 , u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 }
impl_bit_manip! { u128x4 , u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 }

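// On aarch64, `u8x8`/`i8x8` get hand-written impls: ctpop/ctlz still use the
// LLVM intrinsics, but cttz falls back to a per-lane loop because
// `llvm.cttz.v8i8` is miscompiled there (see the FIXME below).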
#[cfg(target_arch = "aarch64")]
impl BitManip for u8x8 {
    #[inline]
    fn ctpop(self) -> Self {
        let y: u8x8 = self.cast();
        unsafe { ctpop_u8x8(y).cast() }
    }

    #[inline]
    fn ctlz(self) -> Self {
        let y: u8x8 = self.cast();
        unsafe { ctlz_u8x8(y, false).cast() }
    }

    #[inline]
    fn cttz(self) -> Self {
        // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191
        // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64
        // intrinsics
        let mut tz = self;
        for i in 0..Self::lanes() {
            tz = tz.replace(i, self.extract(i).trailing_zeros() as u8);
        }
        tz
    }
}
#[cfg(target_arch = "aarch64")]
impl BitManip for i8x8 {
    #[inline]
    fn ctpop(self) -> Self {
        let y: u8x8 = self.cast();
        unsafe { ctpop_u8x8(y).cast() }
    }

    #[inline]
    fn ctlz(self) -> Self {
        let y: u8x8 = self.cast();
        unsafe { ctlz_u8x8(y, false).cast() }
    }

    #[inline]
    fn cttz(self) -> Self {
        // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191
        // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64
        // intrinsics
        let mut tz = self;
        for i in 0..Self::lanes() {
            tz = tz.replace(i, self.extract(i).trailing_zeros() as i8);
        }
        tz
    }
}

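// Pointer-sized vectors delegate to the fixed-width unsigned vector whose
// lane width matches `target_pointer_width`.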
cfg_if! {
    if #[cfg(target_pointer_width = "8")] {
        impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u8x2 }
        impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u8x4 }
        impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u8x8 }
    } else if #[cfg(target_pointer_width = "16")] {
        impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u16x2 }
        impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u16x4 }
        impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u16x8 }
    } else if #[cfg(target_pointer_width = "32")] {
        impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u32x2 }
        impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u32x4 }
        impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u32x8 }
    } else if #[cfg(target_pointer_width = "64")] {
        impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u64x2 }
        impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u64x4 }
        impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u64x8 }
    } else {
        compile_error!("unsupported target_pointer_width");
    }
}
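
// A minimal sanity-check sketch (not from the original file): assuming the
// `BitManip` trait and the vector types imported above are in scope, the
// three operations should agree lane-wise with the scalar
// `leading_zeros`/`trailing_zeros`/`count_ones` methods.
#[cfg(test)]
mod bit_manip_sketch {
    use super::*;

    #[test]
    fn lane_wise_counts() {
        let x = u32x4::new(0, 1, 0x8000_0000, u32::MAX);
        // Leading zeros: a zero lane yields the full lane width (32).
        assert_eq!(x.ctlz(), u32x4::new(32, 31, 0, 0));
        // Trailing zeros: likewise 32 for a zero lane.
        assert_eq!(x.cttz(), u32x4::new(32, 0, 31, 0));
        // Population count of each lane.
        assert_eq!(x.ctpop(), u32x4::new(0, 1, 1, 32));
    }
}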