1 /// A few elementary UTF-8 encoding and decoding functions used by the matching
2 /// engines.
3 ///
4 /// In an ideal world, the matching engines operate on `&str` and we can just
5 /// lean on the standard library for all our UTF-8 needs. However, to support
6 /// byte based regexes (that can match on arbitrary bytes which may contain
7 /// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
8 /// The standard library doesn't really recognize this use case, so we have
9 /// to build it out ourselves.
10 ///
11 /// Should this be factored out into a separate crate? It seems independently
12 /// useful. There are other crates that already exist (e.g., `utf-8`) that have
13 /// overlapping use cases. Not sure what to do.
14
15 use std::char;
16
17 const TAG_CONT: u8 = 0b1000_0000;
18 const TAG_TWO: u8 = 0b1100_0000;
19 const TAG_THREE: u8 = 0b1110_0000;
20 const TAG_FOUR: u8 = 0b1111_0000;
21
22 /// Returns the smallest possible index of the next valid UTF-8 sequence
23 /// starting after `i`.
next_utf8(text: &[u8], i: usize) -> usize24 pub fn next_utf8(text: &[u8], i: usize) -> usize {
25 let b = match text.get(i) {
26 None => return i + 1,
27 Some(&b) => b,
28 };
29 let inc = if b <= 0x7F {
30 1
31 } else if b <= 0b110_11111 {
32 2
33 } else if b <= 0b1110_1111 {
34 3
35 } else {
36 4
37 };
38 i + inc
39 }
40
41 /// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
42 ///
43 /// If `dst` is not long enough, then `None` is returned. Otherwise, the number
44 /// of bytes written is returned.
45 #[allow(dead_code)]
46 #[inline]
encode_utf8(character: char, dst: &mut [u8]) -> Option<usize>47 pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
48 let code = character as u32;
49 if code <= 0x7F && !dst.is_empty() {
50 dst[0] = code as u8;
51 Some(1)
52 } else if code <= 0x7FF && dst.len() >= 2 {
53 dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO;
54 dst[1] = (code & 0x3F) as u8 | TAG_CONT;
55 Some(2)
56 } else if code <= 0xFFFF && dst.len() >= 3 {
57 dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE;
58 dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
59 dst[2] = (code & 0x3F) as u8 | TAG_CONT;
60 Some(3)
61 } else if dst.len() >= 4 {
62 dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR;
63 dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
64 dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
65 dst[3] = (code & 0x3F) as u8 | TAG_CONT;
66 Some(4)
67 } else {
68 None
69 }
70 }
71
72 /// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
73 ///
74 /// If no valid UTF-8 sequence could be found, then `None` is returned.
75 /// Otherwise, the decoded codepoint and the number of bytes read is returned.
76 /// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
77 /// 1, 2, 3 or 4.
78 ///
79 /// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
80 /// codepoint that is out of range (surrogate codepoints are out of range) or
81 /// is not the shortest possible UTF-8 sequence for that codepoint.
82 #[inline]
decode_utf8(src: &[u8]) -> Option<(char, usize)>83 pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
84 let b0 = match src.get(0) {
85 None => return None,
86 Some(&b) if b <= 0x7F => return Some((b as char, 1)),
87 Some(&b) => b,
88 };
89 match b0 {
90 0b110_00000 ... 0b110_11111 => {
91 if src.len() < 2 {
92 return None;
93 }
94 let b1 = src[1];
95 let cp = ((b0 & !TAG_TWO) as u32) << 6
96 | ((b1 & !TAG_CONT) as u32);
97 match cp {
98 0x80 ... 0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
99 _ => None,
100 }
101 }
102 0b1110_0000 ... 0b1110_1111 => {
103 if src.len() < 3 {
104 return None;
105 }
106 let (b1, b2) = (src[1], src[2]);
107 let cp = ((b0 & !TAG_THREE) as u32) << 12
108 | ((b1 & !TAG_CONT) as u32) << 6
109 | ((b2 & !TAG_CONT) as u32);
110 match cp {
111 // char::from_u32 will disallow surrogate codepoints.
112 0x800 ... 0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
113 _ => None,
114 }
115 }
116 0b11110_000 ... 0b11110_111 => {
117 if src.len() < 4 {
118 return None;
119 }
120 let (b1, b2, b3) = (src[1], src[2], src[3]);
121 let cp = ((b0 & !TAG_FOUR) as u32) << 18
122 | ((b1 & !TAG_CONT) as u32) << 12
123 | ((b2 & !TAG_CONT) as u32) << 6
124 | ((b3 & !TAG_CONT) as u32);
125 match cp {
126 0x10000 ... 0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
127 _ => None,
128 }
129 }
130 _ => None,
131 }
132 }
133
134 /// Like decode_utf8, but decodes the last UTF-8 sequence in `src` instead of
135 /// the first.
decode_last_utf8(src: &[u8]) -> Option<(char, usize)>136 pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
137 if src.is_empty() {
138 return None;
139 }
140 let mut start = src.len() - 1;
141 if src[start] <= 0x7F {
142 return Some((src[start] as char, 1));
143 }
144 while start > src.len().saturating_sub(4) {
145 start -= 1;
146 if is_start_byte(src[start]) {
147 break;
148 }
149 }
150 match decode_utf8(&src[start..]) {
151 None => None,
152 Some((_, n)) if n < src.len() - start => None,
153 Some((cp, n)) => Some((cp, n)),
154 }
155 }
156
is_start_byte(b: u8) -> bool157 fn is_start_byte(b: u8) -> bool {
158 b & 0b11_000000 != 0b1_0000000
159 }
160
161 #[cfg(test)]
162 mod tests {
163 use std::str;
164
165 use quickcheck::quickcheck;
166
167 use super::{
168 TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
169 decode_utf8, decode_last_utf8, encode_utf8,
170 };
171
172 #[test]
prop_roundtrip()173 fn prop_roundtrip() {
174 fn p(given_cp: char) -> bool {
175 let mut tmp = [0; 4];
176 let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
177 let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
178 encoded_len == got_len && given_cp == got_cp
179 }
180 quickcheck(p as fn(char) -> bool)
181 }
182
183 #[test]
prop_roundtrip_last()184 fn prop_roundtrip_last() {
185 fn p(given_cp: char) -> bool {
186 let mut tmp = [0; 4];
187 let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
188 let (got_cp, got_len) =
189 decode_last_utf8(&tmp[..encoded_len]).unwrap();
190 encoded_len == got_len && given_cp == got_cp
191 }
192 quickcheck(p as fn(char) -> bool)
193 }
194
195 #[test]
prop_encode_matches_std()196 fn prop_encode_matches_std() {
197 fn p(cp: char) -> bool {
198 let mut got = [0; 4];
199 let n = encode_utf8(cp, &mut got).unwrap();
200 let expected = cp.to_string();
201 &got[..n] == expected.as_bytes()
202 }
203 quickcheck(p as fn(char) -> bool)
204 }
205
206 #[test]
prop_decode_matches_std()207 fn prop_decode_matches_std() {
208 fn p(given_cp: char) -> bool {
209 let mut tmp = [0; 4];
210 let n = encode_utf8(given_cp, &mut tmp).unwrap();
211 let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
212 let expected_cp =
213 str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
214 got_cp == expected_cp
215 }
216 quickcheck(p as fn(char) -> bool)
217 }
218
219 #[test]
prop_decode_last_matches_std()220 fn prop_decode_last_matches_std() {
221 fn p(given_cp: char) -> bool {
222 let mut tmp = [0; 4];
223 let n = encode_utf8(given_cp, &mut tmp).unwrap();
224 let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
225 let expected_cp =
226 str::from_utf8(&tmp[..n]).unwrap()
227 .chars().rev().next().unwrap();
228 got_cp == expected_cp
229 }
230 quickcheck(p as fn(char) -> bool)
231 }
232
233 #[test]
reject_invalid()234 fn reject_invalid() {
235 // Invalid start byte
236 assert_eq!(decode_utf8(&[0xFF]), None);
237 // Surrogate pair
238 assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
239 // Bad lengths
240 assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
241 assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
242 assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
243 // Not a minimal UTF-8 sequence
244 assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
245 assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
246 assert_eq!(decode_utf8(&[
247 TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',
248 ]), None);
249 }
250
251 #[test]
reject_invalid_last()252 fn reject_invalid_last() {
253 // Invalid start byte
254 assert_eq!(decode_last_utf8(&[0xFF]), None);
255 // Surrogate pair
256 assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
257 // Bad lengths
258 assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
259 assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
260 assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
261 // Not a minimal UTF-8 sequence
262 assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
263 assert_eq!(decode_last_utf8(&[
264 TAG_THREE, TAG_CONT, TAG_CONT | b'a',
265 ]), None);
266 assert_eq!(decode_last_utf8(&[
267 TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',
268 ]), None);
269 }
270 }
271