1 //! LLVM bit manipulation intrinsics. 2 #[rustfmt::skip] 3 4 use crate::*; 5 6 #[allow(improper_ctypes, dead_code)] 7 extern "C" { 8 #[link_name = "llvm.ctlz.v2i8"] ctlz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x29 fn ctlz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; 10 #[link_name = "llvm.ctlz.v4i8"] ctlz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x411 fn ctlz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; 12 #[link_name = "llvm.ctlz.v8i8"] ctlz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x813 fn ctlz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; 14 #[link_name = "llvm.ctlz.v16i8"] ctlz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x1615 fn ctlz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; 16 #[link_name = "llvm.ctlz.v32i8"] ctlz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x3217 fn ctlz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; 18 #[link_name = "llvm.ctlz.v64i8"] ctlz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x6419 fn ctlz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; 20 21 #[link_name = "llvm.ctlz.v2i16"] ctlz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x222 fn ctlz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; 23 #[link_name = "llvm.ctlz.v4i16"] ctlz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x424 fn ctlz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; 25 #[link_name = "llvm.ctlz.v8i16"] ctlz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x826 fn ctlz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; 27 #[link_name = "llvm.ctlz.v16i16"] ctlz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x1628 fn ctlz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; 29 #[link_name = "llvm.ctlz.v32i16"] ctlz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x3230 fn ctlz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; 31 32 #[link_name = "llvm.ctlz.v2i32"] ctlz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x233 fn ctlz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; 34 #[link_name = "llvm.ctlz.v4i32"] ctlz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x435 fn ctlz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; 36 #[link_name = "llvm.ctlz.v8i32"] ctlz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x837 fn ctlz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; 38 #[link_name = "llvm.ctlz.v16i32"] ctlz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x1639 fn ctlz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; 40 41 #[link_name = "llvm.ctlz.v2i64"] ctlz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x242 fn ctlz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; 43 #[link_name = "llvm.ctlz.v4i64"] ctlz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x444 fn ctlz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; 45 #[link_name = "llvm.ctlz.v8i64"] ctlz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x846 fn ctlz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; 47 48 #[link_name = "llvm.ctlz.v1i128"] ctlz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x149 fn ctlz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; 50 #[link_name = "llvm.ctlz.v2i128"] ctlz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x251 fn ctlz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; 52 #[link_name = "llvm.ctlz.v4i128"] ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x453 fn ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; 54 55 #[link_name = "llvm.cttz.v2i8"] cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x256 fn cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2; 57 #[link_name = "llvm.cttz.v4i8"] cttz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x458 fn cttz_u8x4(x: u8x4, is_zero_undef: bool) -> u8x4; 59 #[link_name = "llvm.cttz.v8i8"] cttz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x860 fn cttz_u8x8(x: u8x8, is_zero_undef: bool) -> u8x8; 61 #[link_name = "llvm.cttz.v16i8"] cttz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x1662 fn cttz_u8x16(x: u8x16, is_zero_undef: bool) -> u8x16; 63 #[link_name = "llvm.cttz.v32i8"] cttz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x3264 fn cttz_u8x32(x: u8x32, is_zero_undef: bool) -> u8x32; 65 #[link_name = "llvm.cttz.v64i8"] cttz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x6466 fn cttz_u8x64(x: u8x64, is_zero_undef: bool) -> u8x64; 67 68 #[link_name = "llvm.cttz.v2i16"] cttz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x269 fn cttz_u16x2(x: u16x2, is_zero_undef: bool) -> u16x2; 70 #[link_name = "llvm.cttz.v4i16"] cttz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x471 fn cttz_u16x4(x: u16x4, is_zero_undef: bool) -> u16x4; 72 #[link_name = "llvm.cttz.v8i16"] cttz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x873 fn cttz_u16x8(x: u16x8, is_zero_undef: bool) -> u16x8; 74 #[link_name = "llvm.cttz.v16i16"] cttz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x1675 fn cttz_u16x16(x: u16x16, is_zero_undef: bool) -> u16x16; 76 #[link_name = "llvm.cttz.v32i16"] cttz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x3277 fn cttz_u16x32(x: u16x32, is_zero_undef: bool) -> u16x32; 78 79 #[link_name = "llvm.cttz.v2i32"] cttz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x280 fn cttz_u32x2(x: u32x2, is_zero_undef: bool) -> u32x2; 81 #[link_name = "llvm.cttz.v4i32"] cttz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x482 fn cttz_u32x4(x: u32x4, is_zero_undef: bool) -> u32x4; 83 #[link_name = "llvm.cttz.v8i32"] cttz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x884 fn cttz_u32x8(x: u32x8, is_zero_undef: bool) -> u32x8; 85 #[link_name = "llvm.cttz.v16i32"] cttz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x1686 fn cttz_u32x16(x: u32x16, is_zero_undef: bool) -> u32x16; 87 88 #[link_name = "llvm.cttz.v2i64"] cttz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x289 fn cttz_u64x2(x: u64x2, is_zero_undef: bool) -> u64x2; 90 #[link_name = "llvm.cttz.v4i64"] cttz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x491 fn cttz_u64x4(x: u64x4, is_zero_undef: bool) -> u64x4; 92 #[link_name = "llvm.cttz.v8i64"] cttz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x893 fn cttz_u64x8(x: u64x8, is_zero_undef: bool) -> u64x8; 94 95 #[link_name = "llvm.cttz.v1i128"] cttz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x196 fn cttz_u128x1(x: u128x1, is_zero_undef: bool) -> u128x1; 97 #[link_name = "llvm.cttz.v2i128"] cttz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x298 fn cttz_u128x2(x: u128x2, is_zero_undef: bool) -> u128x2; 99 #[link_name = "llvm.cttz.v4i128"] cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4100 fn cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4; 101 102 #[link_name = "llvm.ctpop.v2i8"] ctpop_u8x2(x: u8x2) -> u8x2103 fn ctpop_u8x2(x: u8x2) -> u8x2; 104 #[link_name = "llvm.ctpop.v4i8"] ctpop_u8x4(x: u8x4) -> u8x4105 fn ctpop_u8x4(x: u8x4) -> u8x4; 106 #[link_name = "llvm.ctpop.v8i8"] ctpop_u8x8(x: u8x8) -> u8x8107 fn ctpop_u8x8(x: u8x8) -> u8x8; 108 #[link_name = "llvm.ctpop.v16i8"] ctpop_u8x16(x: u8x16) -> u8x16109 fn ctpop_u8x16(x: u8x16) -> u8x16; 110 #[link_name = "llvm.ctpop.v32i8"] ctpop_u8x32(x: u8x32) -> u8x32111 fn ctpop_u8x32(x: u8x32) -> u8x32; 112 #[link_name = "llvm.ctpop.v64i8"] ctpop_u8x64(x: u8x64) -> u8x64113 fn ctpop_u8x64(x: u8x64) -> u8x64; 114 115 #[link_name = "llvm.ctpop.v2i16"] ctpop_u16x2(x: u16x2) -> u16x2116 fn ctpop_u16x2(x: u16x2) -> u16x2; 117 #[link_name = "llvm.ctpop.v4i16"] ctpop_u16x4(x: u16x4) -> u16x4118 fn ctpop_u16x4(x: u16x4) -> u16x4; 119 #[link_name = "llvm.ctpop.v8i16"] ctpop_u16x8(x: u16x8) -> u16x8120 fn ctpop_u16x8(x: u16x8) -> u16x8; 121 #[link_name = "llvm.ctpop.v16i16"] ctpop_u16x16(x: u16x16) -> u16x16122 fn ctpop_u16x16(x: u16x16) -> u16x16; 123 #[link_name = "llvm.ctpop.v32i16"] ctpop_u16x32(x: u16x32) -> u16x32124 fn ctpop_u16x32(x: u16x32) -> u16x32; 125 126 #[link_name = "llvm.ctpop.v2i32"] ctpop_u32x2(x: u32x2) -> u32x2127 fn ctpop_u32x2(x: u32x2) -> u32x2; 128 #[link_name = "llvm.ctpop.v4i32"] ctpop_u32x4(x: u32x4) -> u32x4129 fn ctpop_u32x4(x: u32x4) -> u32x4; 130 #[link_name = "llvm.ctpop.v8i32"] ctpop_u32x8(x: u32x8) -> u32x8131 fn ctpop_u32x8(x: u32x8) -> u32x8; 132 #[link_name = "llvm.ctpop.v16i32"] ctpop_u32x16(x: u32x16) -> u32x16133 fn ctpop_u32x16(x: u32x16) -> u32x16; 134 135 #[link_name = "llvm.ctpop.v2i64"] ctpop_u64x2(x: u64x2) -> u64x2136 fn ctpop_u64x2(x: u64x2) -> u64x2; 137 #[link_name = "llvm.ctpop.v4i64"] ctpop_u64x4(x: u64x4) -> u64x4138 fn ctpop_u64x4(x: u64x4) -> u64x4; 139 #[link_name = "llvm.ctpop.v8i64"] ctpop_u64x8(x: u64x8) -> u64x8140 fn ctpop_u64x8(x: u64x8) -> u64x8; 141 142 #[link_name = "llvm.ctpop.v1i128"] ctpop_u128x1(x: u128x1) -> u128x1143 fn ctpop_u128x1(x: u128x1) -> u128x1; 144 #[link_name = "llvm.ctpop.v2i128"] ctpop_u128x2(x: u128x2) -> u128x2145 fn ctpop_u128x2(x: u128x2) -> u128x2; 146 #[link_name = "llvm.ctpop.v4i128"] ctpop_u128x4(x: u128x4) -> u128x4147 fn ctpop_u128x4(x: u128x4) -> u128x4; 148 } 149 150 crate trait BitManip { ctpop(self) -> Self151 fn ctpop(self) -> Self; ctlz(self) -> Self152 fn ctlz(self) -> Self; cttz(self) -> Self153 fn cttz(self) -> Self; 154 } 155 156 macro_rules! impl_bit_manip { 157 (inner: $ty:ident, $scalar:ty, $uty:ident, 158 $ctpop:ident, $ctlz:ident, $cttz:ident) => { 159 // FIXME: several LLVM intrinsics break on s390x https://github.com/rust-lang-nursery/packed_simd/issues/192 160 #[cfg(target_arch = "s390x")] 161 impl_bit_manip! { scalar: $ty, $scalar } 162 #[cfg(not(target_arch = "s390x"))] 163 impl BitManip for $ty { 164 #[inline] 165 fn ctpop(self) -> Self { 166 let y: $uty = self.cast(); 167 unsafe { $ctpop(y).cast() } 168 } 169 170 #[inline] 171 fn ctlz(self) -> Self { 172 let y: $uty = self.cast(); 173 // the ctxx intrinsics need compile-time constant 174 // `is_zero_undef` 175 unsafe { $ctlz(y, false).cast() } 176 } 177 178 #[inline] 179 fn cttz(self) -> Self { 180 let y: $uty = self.cast(); 181 unsafe { $cttz(y, false).cast() } 182 } 183 } 184 }; 185 (sized_inner: $ty:ident, $scalar:ty, $uty:ident) => { 186 #[cfg(target_arch = "s390x")] 187 impl_bit_manip! { scalar: $ty, $scalar } 188 #[cfg(not(target_arch = "s390x"))] 189 impl BitManip for $ty { 190 #[inline] 191 fn ctpop(self) -> Self { 192 let y: $uty = self.cast(); 193 $uty::ctpop(y).cast() 194 } 195 196 #[inline] 197 fn ctlz(self) -> Self { 198 let y: $uty = self.cast(); 199 $uty::ctlz(y).cast() 200 } 201 202 #[inline] 203 fn cttz(self) -> Self { 204 let y: $uty = self.cast(); 205 $uty::cttz(y).cast() 206 } 207 } 208 }; 209 (scalar: $ty:ident, $scalar:ty) => { 210 impl BitManip for $ty { 211 #[inline] 212 fn ctpop(self) -> Self { 213 let mut ones = self; 214 for i in 0..Self::lanes() { 215 ones = ones 216 .replace(i, self.extract(i).count_ones() as $scalar); 217 } 218 ones 219 } 220 221 #[inline] 222 fn ctlz(self) -> Self { 223 let mut lz = self; 224 for i in 0..Self::lanes() { 225 lz = lz.replace( 226 i, 227 self.extract(i).leading_zeros() as $scalar, 228 ); 229 } 230 lz 231 } 232 233 #[inline] 234 fn cttz(self) -> Self { 235 let mut tz = self; 236 for i in 0..Self::lanes() { 237 tz = tz.replace( 238 i, 239 self.extract(i).trailing_zeros() as $scalar, 240 ); 241 } 242 tz 243 } 244 } 245 }; 246 ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty, 247 $ctpop:ident, $ctlz:ident, $cttz:ident) => { 248 impl_bit_manip! { inner: $uty, $uscalar, $uty, $ctpop, $ctlz, $cttz } 249 impl_bit_manip! { inner: $ity, $iscalar, $uty, $ctpop, $ctlz, $cttz } 250 }; 251 (sized: $usize:ident, $uscalar:ty, $isize:ident, 252 $iscalar:ty, $ty:ident) => { 253 impl_bit_manip! { sized_inner: $usize, $uscalar, $ty } 254 impl_bit_manip! { sized_inner: $isize, $iscalar, $ty } 255 }; 256 } 257 258 impl_bit_manip! { u8x2 , u8, i8x2, i8, ctpop_u8x2, ctlz_u8x2, cttz_u8x2 } 259 impl_bit_manip! { u8x4 , u8, i8x4, i8, ctpop_u8x4, ctlz_u8x4, cttz_u8x4 } 260 #[cfg(not(target_arch = "aarch64"))] // see below 261 impl_bit_manip! { u8x8 , u8, i8x8, i8, ctpop_u8x8, ctlz_u8x8, cttz_u8x8 } 262 impl_bit_manip! { u8x16 , u8, i8x16, i8, ctpop_u8x16, ctlz_u8x16, cttz_u8x16 } 263 impl_bit_manip! { u8x32 , u8, i8x32, i8, ctpop_u8x32, ctlz_u8x32, cttz_u8x32 } 264 impl_bit_manip! { u8x64 , u8, i8x64, i8, ctpop_u8x64, ctlz_u8x64, cttz_u8x64 } 265 impl_bit_manip! { u16x2 , u16, i16x2, i16, ctpop_u16x2, ctlz_u16x2, cttz_u16x2 } 266 impl_bit_manip! { u16x4 , u16, i16x4, i16, ctpop_u16x4, ctlz_u16x4, cttz_u16x4 } 267 impl_bit_manip! { u16x8 , u16, i16x8, i16, ctpop_u16x8, ctlz_u16x8, cttz_u16x8 } 268 impl_bit_manip! { u16x16 , u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 } 269 impl_bit_manip! { u16x32 , u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 } 270 impl_bit_manip! { u32x2 , u32, i32x2, i32, ctpop_u32x2, ctlz_u32x2, cttz_u32x2 } 271 impl_bit_manip! { u32x4 , u32, i32x4, i32, ctpop_u32x4, ctlz_u32x4, cttz_u32x4 } 272 impl_bit_manip! { u32x8 , u32, i32x8, i32, ctpop_u32x8, ctlz_u32x8, cttz_u32x8 } 273 impl_bit_manip! { u32x16 , u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 } 274 impl_bit_manip! { u64x2 , u64, i64x2, i64, ctpop_u64x2, ctlz_u64x2, cttz_u64x2 } 275 impl_bit_manip! { u64x4 , u64, i64x4, i64, ctpop_u64x4, ctlz_u64x4, cttz_u64x4 } 276 impl_bit_manip! { u64x8 , u64, i64x8, i64, ctpop_u64x8, ctlz_u64x8, cttz_u64x8 } 277 impl_bit_manip! { u128x1 , u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 } 278 impl_bit_manip! { u128x2 , u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 } 279 impl_bit_manip! { u128x4 , u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 } 280 281 #[cfg(target_arch = "aarch64")] 282 impl BitManip for u8x8 { 283 #[inline] ctpop(self) -> Self284 fn ctpop(self) -> Self { 285 let y: u8x8 = self.cast(); 286 unsafe { ctpop_u8x8(y).cast() } 287 } 288 289 #[inline] ctlz(self) -> Self290 fn ctlz(self) -> Self { 291 let y: u8x8 = self.cast(); 292 unsafe { ctlz_u8x8(y, false).cast() } 293 } 294 295 #[inline] cttz(self) -> Self296 fn cttz(self) -> Self { 297 // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 298 // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 299 // intrinsics 300 let mut tz = self; 301 for i in 0..Self::lanes() { 302 tz = tz.replace(i, self.extract(i).trailing_zeros() as u8); 303 } 304 tz 305 } 306 } 307 #[cfg(target_arch = "aarch64")] 308 impl BitManip for i8x8 { 309 #[inline] ctpop(self) -> Self310 fn ctpop(self) -> Self { 311 let y: u8x8 = self.cast(); 312 unsafe { ctpop_u8x8(y).cast() } 313 } 314 315 #[inline] ctlz(self) -> Self316 fn ctlz(self) -> Self { 317 let y: u8x8 = self.cast(); 318 unsafe { ctlz_u8x8(y, false).cast() } 319 } 320 321 #[inline] cttz(self) -> Self322 fn cttz(self) -> Self { 323 // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191 324 // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 325 // intrinsics 326 let mut tz = self; 327 for i in 0..Self::lanes() { 328 tz = tz.replace(i, self.extract(i).trailing_zeros() as i8); 329 } 330 tz 331 } 332 } 333 334 cfg_if! { 335 if #[cfg(target_pointer_width = "8")] { 336 impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u8x2 } 337 impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u8x4 } 338 impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u8x8 } 339 } else if #[cfg(target_pointer_width = "16")] { 340 impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u16x2 } 341 impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u16x4 } 342 impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u16x8 } 343 } else if #[cfg(target_pointer_width = "32")] { 344 impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u32x2 } 345 impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u32x4 } 346 impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u32x8 } 347 } else if #[cfg(target_pointer_width = "64")] { 348 impl_bit_manip! { sized: usizex2, usize, isizex2, isize, u64x2 } 349 impl_bit_manip! { sized: usizex4, usize, isizex4, isize, u64x4 } 350 impl_bit_manip! { sized: usizex8, usize, isizex8, isize, u64x8 } 351 } else { 352 compile_error!("unsupported target_pointer_width"); 353 } 354 } 355