1 /// A few elementary UTF-8 encoding and decoding functions used by the matching
2 /// engines.
3 ///
4 /// In an ideal world, the matching engines operate on `&str` and we can just
5 /// lean on the standard library for all our UTF-8 needs. However, to support
6 /// byte based regexes (that can match on arbitrary bytes which may contain
7 /// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
8 /// The standard library doesn't really recognize this use case, so we have
9 /// to build it out ourselves.
10 ///
11 /// Should this be factored out into a separate crate? It seems independently
12 /// useful. There are other crates that already exist (e.g., `utf-8`) that have
13 /// overlapping use cases. Not sure what to do.
14 use std::char;
15 
16 const TAG_CONT: u8 = 0b1000_0000;
17 const TAG_TWO: u8 = 0b1100_0000;
18 const TAG_THREE: u8 = 0b1110_0000;
19 const TAG_FOUR: u8 = 0b1111_0000;
20 
21 /// Returns the smallest possible index of the next valid UTF-8 sequence
22 /// starting after `i`.
next_utf8(text: &[u8], i: usize) -> usize23 pub fn next_utf8(text: &[u8], i: usize) -> usize {
24     let b = match text.get(i) {
25         None => return i + 1,
26         Some(&b) => b,
27     };
28     let inc = if b <= 0x7F {
29         1
30     } else if b <= 0b110_11111 {
31         2
32     } else if b <= 0b1110_1111 {
33         3
34     } else {
35         4
36     };
37     i + inc
38 }
39 
40 /// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
41 ///
42 /// If no valid UTF-8 sequence could be found, then `None` is returned.
43 /// Otherwise, the decoded codepoint and the number of bytes read is returned.
44 /// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
45 /// 1, 2, 3 or 4.
46 ///
47 /// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
48 /// codepoint that is out of range (surrogate codepoints are out of range) or
49 /// is not the shortest possible UTF-8 sequence for that codepoint.
50 #[inline]
decode_utf8(src: &[u8]) -> Option<(char, usize)>51 pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
52     let b0 = match src.get(0) {
53         None => return None,
54         Some(&b) if b <= 0x7F => return Some((b as char, 1)),
55         Some(&b) => b,
56     };
57     match b0 {
58         0b110_00000..=0b110_11111 => {
59             if src.len() < 2 {
60                 return None;
61             }
62             let b1 = src[1];
63             if 0b11_000000 & b1 != TAG_CONT {
64                 return None;
65             }
66             let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
67             match cp {
68                 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
69                 _ => None,
70             }
71         }
72         0b1110_0000..=0b1110_1111 => {
73             if src.len() < 3 {
74                 return None;
75             }
76             let (b1, b2) = (src[1], src[2]);
77             if 0b11_000000 & b1 != TAG_CONT {
78                 return None;
79             }
80             if 0b11_000000 & b2 != TAG_CONT {
81                 return None;
82             }
83             let cp = ((b0 & !TAG_THREE) as u32) << 12
84                 | ((b1 & !TAG_CONT) as u32) << 6
85                 | ((b2 & !TAG_CONT) as u32);
86             match cp {
87                 // char::from_u32 will disallow surrogate codepoints.
88                 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
89                 _ => None,
90             }
91         }
92         0b11110_000..=0b11110_111 => {
93             if src.len() < 4 {
94                 return None;
95             }
96             let (b1, b2, b3) = (src[1], src[2], src[3]);
97             if 0b11_000000 & b1 != TAG_CONT {
98                 return None;
99             }
100             if 0b11_000000 & b2 != TAG_CONT {
101                 return None;
102             }
103             if 0b11_000000 & b3 != TAG_CONT {
104                 return None;
105             }
106             let cp = ((b0 & !TAG_FOUR) as u32) << 18
107                 | ((b1 & !TAG_CONT) as u32) << 12
108                 | ((b2 & !TAG_CONT) as u32) << 6
109                 | ((b3 & !TAG_CONT) as u32);
110             match cp {
111                 0x10000..=0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
112                 _ => None,
113             }
114         }
115         _ => None,
116     }
117 }
118 
119 /// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
120 /// of the first.
decode_last_utf8(src: &[u8]) -> Option<(char, usize)>121 pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
122     if src.is_empty() {
123         return None;
124     }
125     let mut start = src.len() - 1;
126     if src[start] <= 0x7F {
127         return Some((src[start] as char, 1));
128     }
129     while start > src.len().saturating_sub(4) {
130         start -= 1;
131         if is_start_byte(src[start]) {
132             break;
133         }
134     }
135     match decode_utf8(&src[start..]) {
136         None => None,
137         Some((_, n)) if n < src.len() - start => None,
138         Some((cp, n)) => Some((cp, n)),
139     }
140 }
141 
is_start_byte(b: u8) -> bool142 fn is_start_byte(b: u8) -> bool {
143     b & 0b11_000000 != 0b1_0000000
144 }
145 
146 #[cfg(test)]
147 mod tests {
148     use std::str;
149 
150     use quickcheck::quickcheck;
151 
152     use super::{
153         decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
154     };
155 
156     #[test]
prop_roundtrip()157     fn prop_roundtrip() {
158         fn p(given_cp: char) -> bool {
159             let mut tmp = [0; 4];
160             let encoded_len = given_cp.encode_utf8(&mut tmp).len();
161             let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
162             encoded_len == got_len && given_cp == got_cp
163         }
164         quickcheck(p as fn(char) -> bool)
165     }
166 
167     #[test]
prop_roundtrip_last()168     fn prop_roundtrip_last() {
169         fn p(given_cp: char) -> bool {
170             let mut tmp = [0; 4];
171             let encoded_len = given_cp.encode_utf8(&mut tmp).len();
172             let (got_cp, got_len) =
173                 decode_last_utf8(&tmp[..encoded_len]).unwrap();
174             encoded_len == got_len && given_cp == got_cp
175         }
176         quickcheck(p as fn(char) -> bool)
177     }
178 
179     #[test]
prop_encode_matches_std()180     fn prop_encode_matches_std() {
181         fn p(cp: char) -> bool {
182             let mut got = [0; 4];
183             let n = cp.encode_utf8(&mut got).len();
184             let expected = cp.to_string();
185             &got[..n] == expected.as_bytes()
186         }
187         quickcheck(p as fn(char) -> bool)
188     }
189 
190     #[test]
prop_decode_matches_std()191     fn prop_decode_matches_std() {
192         fn p(given_cp: char) -> bool {
193             let mut tmp = [0; 4];
194             let n = given_cp.encode_utf8(&mut tmp).len();
195             let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
196             let expected_cp =
197                 str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
198             got_cp == expected_cp
199         }
200         quickcheck(p as fn(char) -> bool)
201     }
202 
203     #[test]
prop_decode_last_matches_std()204     fn prop_decode_last_matches_std() {
205         fn p(given_cp: char) -> bool {
206             let mut tmp = [0; 4];
207             let n = given_cp.encode_utf8(&mut tmp).len();
208             let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
209             let expected_cp = str::from_utf8(&tmp[..n])
210                 .unwrap()
211                 .chars()
212                 .rev()
213                 .next()
214                 .unwrap();
215             got_cp == expected_cp
216         }
217         quickcheck(p as fn(char) -> bool)
218     }
219 
220     #[test]
reject_invalid()221     fn reject_invalid() {
222         // Invalid start byte
223         assert_eq!(decode_utf8(&[0xFF]), None);
224         // Surrogate pair
225         assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
226         // Invalid continuation byte.
227         assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
228         // Bad lengths
229         assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
230         assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
231         assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
232                                                             // Not a minimal UTF-8 sequence
233         assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
234         assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
235         assert_eq!(
236             decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]),
237             None
238         );
239     }
240 
241     #[test]
reject_invalid_last()242     fn reject_invalid_last() {
243         // Invalid start byte
244         assert_eq!(decode_last_utf8(&[0xFF]), None);
245         // Surrogate pair
246         assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
247         // Bad lengths
248         assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
249         assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
250         assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
251                                                                  // Not a minimal UTF-8 sequence
252         assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
253         assert_eq!(
254             decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]),
255             None
256         );
257         assert_eq!(
258             decode_last_utf8(
259                 &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]
260             ),
261             None
262         );
263     }
264 }
265