1 extern crate packed_simd; 2 3 #[cfg(not(feature = "runtime-dispatch-simd"))] 4 use core::mem; 5 #[cfg(feature = "runtime-dispatch-simd")] 6 use std::mem; 7 8 use self::packed_simd::{u8x32, u8x64, FromCast}; 9 10 const MASK: [u8; 64] = [ 11 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 14 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 15 ]; 16 17 unsafe fn u8x64_from_offset(slice: &[u8], offset: usize) -> u8x64 { 18 u8x64::from_slice_unaligned_unchecked(slice.get_unchecked(offset..)) 19 } 20 unsafe fn u8x32_from_offset(slice: &[u8], offset: usize) -> u8x32 { 21 u8x32::from_slice_unaligned_unchecked(slice.get_unchecked(offset..)) 22 } 23 24 fn sum_x64(u8s: &u8x64) -> usize { 25 let mut store = [0; mem::size_of::<u8x64>()]; 26 u8s.write_to_slice_unaligned(&mut store); 27 store.iter().map(|&e| e as usize).sum() 28 } 29 fn sum_x32(u8s: &u8x32) -> usize { 30 let mut store = [0; mem::size_of::<u8x32>()]; 31 u8s.write_to_slice_unaligned(&mut store); 32 store.iter().map(|&e| e as usize).sum() 33 } 34 35 pub fn chunk_count(haystack: &[u8], needle: u8) -> usize { 36 assert!(haystack.len() >= 32); 37 38 unsafe { 39 let mut offset = 0; 40 let mut count = 0; 41 42 let needles_x64 = u8x64::splat(needle); 43 44 // 16320 45 while haystack.len() >= offset + 64 * 255 { 46 let mut counts = u8x64::splat(0); 47 for _ in 0..255 { 48 counts -= u8x64::from_cast(u8x64_from_offset(haystack, offset).eq(needles_x64)); 49 offset += 64; 50 } 51 count += sum_x64(&counts); 52 } 53 54 // 8192 55 if haystack.len() >= offset + 64 * 128 { 56 let mut counts = u8x64::splat(0); 57 for _ in 0..128 { 58 counts -= u8x64::from_cast(u8x64_from_offset(haystack, offset).eq(needles_x64)); 59 offset += 64; 60 } 61 count += sum_x64(&counts); count(haystack: &[u8], needle: u8) -> usize62 } 63 64 let needles_x32 = u8x32::splat(needle); 65 66 // 32 67 let mut counts = u8x32::splat(0); 68 for i in 0..(haystack.len() - offset) / 32 { 69 counts -= u8x32::from_cast(u8x32_from_offset(haystack, offset + i * 32).eq(needles_x32)); 70 } 71 count += sum_x32(&counts); 72 73 // Straggler; need to reset counts because prior loop can run 255 times 74 counts = u8x32::splat(0); 75 if haystack.len() % 32 != 0 { 76 counts -= u8x32::from_cast(u8x32_from_offset(haystack, haystack.len() - 32).eq(needles_x32)) & 77 u8x32_from_offset(&MASK, haystack.len() % 32); 78 } 79 count += sum_x32(&counts); 80 81 count 82 } 83 } 84 85 fn is_leading_utf8_byte_x64(u8s: u8x64) -> u8x64 { 86 u8x64::from_cast((u8s & u8x64::splat(0b1100_0000)).ne(u8x64::splat(0b1000_0000))) 87 } 88 89 fn is_leading_utf8_byte_x32(u8s: u8x32) -> u8x32 { 90 u8x32::from_cast((u8s & u8x32::splat(0b1100_0000)).ne(u8x32::splat(0b1000_0000))) 91 } 92 93 pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize { 94 assert!(utf8_chars.len() >= 32); 95 96 unsafe { 97 let mut offset = 0; 98 let mut count = 0; 99 100 // 16320 101 while utf8_chars.len() >= offset + 64 * 255 { 102 let mut counts = u8x64::splat(0); 103 for _ in 0..255 { 104 counts -= is_leading_utf8_byte_x64(u8x64_from_offset(utf8_chars, offset)); 105 offset += 64; 106 } num_chars(utf8_chars: &[u8]) -> usize107 count += sum_x64(&counts); 108 } 109 110 // 8192 111 if utf8_chars.len() >= offset + 64 * 128 { 112 let mut counts = u8x64::splat(0); 113 for _ in 0..128 { 114 counts -= is_leading_utf8_byte_x64(u8x64_from_offset(utf8_chars, offset)); 115 offset += 64; 116 } 117 count += sum_x64(&counts); 118 } 119 120 // 32 121 let mut counts = u8x32::splat(0); 122 for i in 0..(utf8_chars.len() - offset) / 32 { 123 counts -= is_leading_utf8_byte_x32(u8x32_from_offset(utf8_chars, offset + i * 32)); 124 } 125 count += sum_x32(&counts); 126 127 // Straggler; need to reset counts because prior loop can run 255 times 128 counts = u8x32::splat(0); 129 if utf8_chars.len() % 32 != 0 { 130 counts -= is_leading_utf8_byte_x32(u8x32_from_offset(utf8_chars, utf8_chars.len() - 32)) & 131 u8x32_from_offset(&MASK, utf8_chars.len() % 32); 132 } 133 count += sum_x32(&counts); 134 135 count 136 } 137 } 138