1 /// A few elementary UTF-8 encoding and decoding functions used by the matching
2 /// engines.
3 ///
4 /// In an ideal world, the matching engines operate on `&str` and we can just
5 /// lean on the standard library for all our UTF-8 needs. However, to support
6 /// byte based regexes (that can match on arbitrary bytes which may contain
7 /// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
8 /// The standard library doesn't really recognize this use case, so we have
9 /// to build it out ourselves.
10 ///
11 /// Should this be factored out into a separate crate? It seems independently
12 /// useful. There are other crates that already exist (e.g., `utf-8`) that have
13 /// overlapping use cases. Not sure what to do.
14 
15 use std::char;
16 
17 const TAG_CONT: u8 = 0b1000_0000;
18 const TAG_TWO: u8 = 0b1100_0000;
19 const TAG_THREE: u8 = 0b1110_0000;
20 const TAG_FOUR: u8 = 0b1111_0000;
21 
22 /// Returns the smallest possible index of the next valid UTF-8 sequence
23 /// starting after `i`.
next_utf8(text: &[u8], i: usize) -> usize24 pub fn next_utf8(text: &[u8], i: usize) -> usize {
25     let b = match text.get(i) {
26         None => return i + 1,
27         Some(&b) => b,
28     };
29     let inc = if b <= 0x7F {
30         1
31     } else if b <= 0b110_11111 {
32         2
33     } else if b <= 0b1110_1111 {
34         3
35     } else {
36         4
37     };
38     i + inc
39 }
40 
41 /// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
42 ///
43 /// If `dst` is not long enough, then `None` is returned. Otherwise, the number
44 /// of bytes written is returned.
45 #[allow(dead_code)]
46 #[inline]
encode_utf8(character: char, dst: &mut [u8]) -> Option<usize>47 pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
48     let code = character as u32;
49     if code <= 0x7F && !dst.is_empty() {
50         dst[0] = code as u8;
51         Some(1)
52     } else if code <= 0x7FF && dst.len() >= 2 {
53         dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO;
54         dst[1] = (code & 0x3F) as u8 | TAG_CONT;
55         Some(2)
56     } else if code <= 0xFFFF && dst.len() >= 3  {
57         dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE;
58         dst[1] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
59         dst[2] = (code & 0x3F) as u8 | TAG_CONT;
60         Some(3)
61     } else if dst.len() >= 4 {
62         dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR;
63         dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
64         dst[2] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
65         dst[3] = (code & 0x3F) as u8 | TAG_CONT;
66         Some(4)
67     } else {
68         None
69     }
70 }
71 
72 /// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
73 ///
74 /// If no valid UTF-8 sequence could be found, then `None` is returned.
75 /// Otherwise, the decoded codepoint and the number of bytes read is returned.
76 /// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
77 /// 1, 2, 3 or 4.
78 ///
79 /// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
80 /// codepoint that is out of range (surrogate codepoints are out of range) or
81 /// is not the shortest possible UTF-8 sequence for that codepoint.
82 #[inline]
decode_utf8(src: &[u8]) -> Option<(char, usize)>83 pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
84     let b0 = match src.get(0) {
85         None => return None,
86         Some(&b) if b <= 0x7F => return Some((b as char, 1)),
87         Some(&b) => b,
88     };
89     match b0 {
90         0b110_00000 ... 0b110_11111 => {
91             if src.len() < 2 {
92                 return None;
93             }
94             let b1 = src[1];
95             let cp = ((b0 & !TAG_TWO) as u32) << 6
96                      | ((b1 & !TAG_CONT) as u32);
97             match cp {
98                 0x80 ... 0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
99                 _ => None,
100             }
101         }
102         0b1110_0000 ... 0b1110_1111 => {
103             if src.len() < 3 {
104                 return None;
105             }
106             let (b1, b2) = (src[1], src[2]);
107             let cp = ((b0 & !TAG_THREE) as u32) << 12
108                      | ((b1 & !TAG_CONT) as u32) << 6
109                      | ((b2 & !TAG_CONT) as u32);
110             match cp {
111                 // char::from_u32 will disallow surrogate codepoints.
112                 0x800 ... 0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
113                 _ => None,
114             }
115         }
116         0b11110_000 ... 0b11110_111 => {
117             if src.len() < 4 {
118                 return None;
119             }
120             let (b1, b2, b3) = (src[1], src[2], src[3]);
121             let cp = ((b0 & !TAG_FOUR) as u32) << 18
122                      | ((b1 & !TAG_CONT) as u32) << 12
123                      | ((b2 & !TAG_CONT) as u32) << 6
124                      | ((b3 & !TAG_CONT) as u32);
125             match cp {
126                 0x10000 ... 0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
127                 _ => None,
128             }
129         }
130         _ => None,
131     }
132 }
133 
134 /// Like decode_utf8, but decodes the last UTF-8 sequence in `src` instead of
135 /// the first.
decode_last_utf8(src: &[u8]) -> Option<(char, usize)>136 pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
137     if src.is_empty() {
138         return None;
139     }
140     let mut start = src.len() - 1;
141     if src[start] <= 0x7F {
142         return Some((src[start] as char, 1));
143     }
144     while start > src.len().saturating_sub(4) {
145         start -= 1;
146         if is_start_byte(src[start]) {
147             break;
148         }
149     }
150     match decode_utf8(&src[start..]) {
151         None => None,
152         Some((_, n)) if n < src.len() - start => None,
153         Some((cp, n)) => Some((cp, n)),
154     }
155 }
156 
is_start_byte(b: u8) -> bool157 fn is_start_byte(b: u8) -> bool {
158     b & 0b11_000000 != 0b1_0000000
159 }
160 
161 #[cfg(test)]
162 mod tests {
163     use std::str;
164 
165     use quickcheck::quickcheck;
166 
167     use super::{
168         TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
169         decode_utf8, decode_last_utf8, encode_utf8,
170     };
171 
172     #[test]
prop_roundtrip()173     fn prop_roundtrip() {
174         fn p(given_cp: char) -> bool {
175             let mut tmp = [0; 4];
176             let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
177             let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
178             encoded_len == got_len && given_cp == got_cp
179         }
180         quickcheck(p as fn(char) -> bool)
181     }
182 
183     #[test]
prop_roundtrip_last()184     fn prop_roundtrip_last() {
185         fn p(given_cp: char) -> bool {
186             let mut tmp = [0; 4];
187             let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
188             let (got_cp, got_len) =
189                 decode_last_utf8(&tmp[..encoded_len]).unwrap();
190             encoded_len == got_len && given_cp == got_cp
191         }
192         quickcheck(p as fn(char) -> bool)
193     }
194 
195     #[test]
prop_encode_matches_std()196     fn prop_encode_matches_std() {
197         fn p(cp: char) -> bool {
198             let mut got = [0; 4];
199             let n = encode_utf8(cp, &mut got).unwrap();
200             let expected = cp.to_string();
201             &got[..n] == expected.as_bytes()
202         }
203         quickcheck(p as fn(char) -> bool)
204     }
205 
206     #[test]
prop_decode_matches_std()207     fn prop_decode_matches_std() {
208         fn p(given_cp: char) -> bool {
209             let mut tmp = [0; 4];
210             let n = encode_utf8(given_cp, &mut tmp).unwrap();
211             let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
212             let expected_cp =
213                 str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
214             got_cp == expected_cp
215         }
216         quickcheck(p as fn(char) -> bool)
217     }
218 
219     #[test]
prop_decode_last_matches_std()220     fn prop_decode_last_matches_std() {
221         fn p(given_cp: char) -> bool {
222             let mut tmp = [0; 4];
223             let n = encode_utf8(given_cp, &mut tmp).unwrap();
224             let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
225             let expected_cp =
226                 str::from_utf8(&tmp[..n]).unwrap()
227                     .chars().rev().next().unwrap();
228             got_cp == expected_cp
229         }
230         quickcheck(p as fn(char) -> bool)
231     }
232 
233     #[test]
reject_invalid()234     fn reject_invalid() {
235         // Invalid start byte
236         assert_eq!(decode_utf8(&[0xFF]), None);
237         // Surrogate pair
238         assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
239         // Bad lengths
240         assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
241         assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
242         assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
243         // Not a minimal UTF-8 sequence
244         assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
245         assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
246         assert_eq!(decode_utf8(&[
247             TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',
248         ]), None);
249     }
250 
251     #[test]
reject_invalid_last()252     fn reject_invalid_last() {
253         // Invalid start byte
254         assert_eq!(decode_last_utf8(&[0xFF]), None);
255         // Surrogate pair
256         assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
257         // Bad lengths
258         assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
259         assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
260         assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
261         // Not a minimal UTF-8 sequence
262         assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
263         assert_eq!(decode_last_utf8(&[
264             TAG_THREE, TAG_CONT, TAG_CONT | b'a',
265         ]), None);
266         assert_eq!(decode_last_utf8(&[
267             TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',
268         ]), None);
269     }
270 }
271