regex-0.1.80/src/utf8.rs

/// A few elementary UTF-8 encoding and decoding functions used by the matching
/// engines.
///
/// In an ideal world, the matching engines operate on `&str` and we can just
/// lean on the standard library for all our UTF-8 needs. However, to support
/// byte based regexes (that can match on arbitrary bytes which may contain
/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
/// The standard library doesn't really recognize this use case, so we have
/// to build it out ourselves.
///
/// Should this be factored out into a separate crate? It seems independently
/// useful. There are other crates that already exist (e.g., `utf-8`) that have
/// overlapping use cases. Not sure what to do.

use std::char;

const TAG_CONT: u8 = 0b1000_0000;
const TAG_TWO: u8 = 0b1100_0000;
const TAG_THREE: u8 = 0b1110_0000;
const TAG_FOUR: u8 = 0b1111_0000;

/// Returns the smallest possible index of the next valid UTF-8 sequence
/// starting after `i`.
pub fn next_utf8(text: &[u8], i: usize) -> usize {
    let b = match text.get(i) {
        None => return i + 1,
        Some(&b) => b,
    };
    let inc = if b <= 0x7F {
        1
    } else if b <= 0b110_11111 {
        2
    } else if b <= 0b1110_1111 {
        3
    } else {
        4
    };
    i + inc
}

/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
///
/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
/// of bytes written is returned.
#[allow(dead_code)]
#[inline]
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
    let code = character as u32;
    if code <= 0x7F && !dst.is_empty() {
        dst[0] = code as u8;
        Some(1)
    } else if code <= 0x7FF && dst.len() >= 2 {
        dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO;
        dst[1] = (code & 0x3F) as u8 | TAG_CONT;
        Some(2)
    } else if code <= 0xFFFF && dst.len() >= 3  {
        dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE;
        dst[1] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
        dst[2] = (code & 0x3F) as u8 | TAG_CONT;
        Some(3)
    } else if dst.len() >= 4 {
        dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR;
        dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
        dst[2] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
        dst[3] = (code & 0x3F) as u8 | TAG_CONT;
        Some(4)
    } else {
        None
    }
}

/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
///
/// If no valid UTF-8 sequence could be found, then `None` is returned.
/// Otherwise, the decoded codepoint and the number of bytes read is returned.
/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
/// 1, 2, 3 or 4.
///
/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
/// codepoint that is out of range (surrogate codepoints are out of range) or
/// is not the shortest possible UTF-8 sequence for that codepoint.
#[inline]
pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
    let b0 = match src.get(0) {
        None => return None,
        Some(&b) if b <= 0x7F => return Some((b as char, 1)),
        Some(&b) => b,
    };
    match b0 {
        0b110_00000 ... 0b110_11111 => {
            if src.len() < 2 {
                return None;
            }
            let b1 = src[1];
            let cp = ((b0 & !TAG_TWO) as u32) << 6
                     | ((b1 & !TAG_CONT) as u32);
            match cp {
                0x80 ... 0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
                _ => None,
            }
        }
        0b1110_0000 ... 0b1110_1111 => {
            if src.len() < 3 {
                return None;
            }
            let (b1, b2) = (src[1], src[2]);
            let cp = ((b0 & !TAG_THREE) as u32) << 12
                     | ((b1 & !TAG_CONT) as u32) << 6
                     | ((b2 & !TAG_CONT) as u32);
            match cp {
                // char::from_u32 will disallow surrogate codepoints.
                0x800 ... 0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
                _ => None,
            }
        }
        0b11110_000 ... 0b11110_111 => {
            if src.len() < 4 {
                return None;
            }
            let (b1, b2, b3) = (src[1], src[2], src[3]);
            let cp = ((b0 & !TAG_FOUR) as u32) << 18
                     | ((b1 & !TAG_CONT) as u32) << 12
                     | ((b2 & !TAG_CONT) as u32) << 6
                     | ((b3 & !TAG_CONT) as u32);
            match cp {
                0x10000 ... 0x10FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
                _ => None,
            }
        }
        _ => None,
    }
}

/// Like decode_utf8, but decodes the last UTF-8 sequence in `src` instead of
/// the first.
pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
    if src.is_empty() {
        return None;
    }
    let mut start = src.len() - 1;
    if src[start] <= 0x7F {
        return Some((src[start] as char, 1));
    }
    while start > src.len().saturating_sub(4) {
        start -= 1;
        if is_start_byte(src[start]) {
            break;
        }
    }
    match decode_utf8(&src[start..]) {
        None => None,
        Some((_, n)) if n < src.len() - start => None,
        Some((cp, n)) => Some((cp, n)),
    }
}

fn is_start_byte(b: u8) -> bool {
    b & 0b11_000000 != 0b1_0000000
}

#[cfg(test)]
mod tests {
    use std::str;

    use quickcheck::quickcheck;

    use super::{
        TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
        decode_utf8, decode_last_utf8, encode_utf8,
    };

    #[test]
    fn prop_roundtrip() {
        fn p(given_cp: char) -> bool {
            let mut tmp = [0; 4];
            let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
            let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
            encoded_len == got_len && given_cp == got_cp
        }
        quickcheck(p as fn(char) -> bool)
    }

    #[test]
    fn prop_roundtrip_last() {
        fn p(given_cp: char) -> bool {
            let mut tmp = [0; 4];
            let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
            let (got_cp, got_len) =
                decode_last_utf8(&tmp[..encoded_len]).unwrap();
            encoded_len == got_len && given_cp == got_cp
        }
        quickcheck(p as fn(char) -> bool)
    }

    #[test]
    fn prop_encode_matches_std() {
        fn p(cp: char) -> bool {
            let mut got = [0; 4];
            let n = encode_utf8(cp, &mut got).unwrap();
            let expected = cp.to_string();
            &got[..n] == expected.as_bytes()
        }
        quickcheck(p as fn(char) -> bool)
    }

    #[test]
    fn prop_decode_matches_std() {
        fn p(given_cp: char) -> bool {
            let mut tmp = [0; 4];
            let n = encode_utf8(given_cp, &mut tmp).unwrap();
            let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
            let expected_cp =
                str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
            got_cp == expected_cp
        }
        quickcheck(p as fn(char) -> bool)
    }

    #[test]
    fn prop_decode_last_matches_std() {
        fn p(given_cp: char) -> bool {
            let mut tmp = [0; 4];
            let n = encode_utf8(given_cp, &mut tmp).unwrap();
            let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
            let expected_cp =
                str::from_utf8(&tmp[..n]).unwrap()
                    .chars().rev().next().unwrap();
            got_cp == expected_cp
        }
        quickcheck(p as fn(char) -> bool)
    }

    #[test]
    fn reject_invalid() {
        // Invalid start byte
        assert_eq!(decode_utf8(&[0xFF]), None);
        // Surrogate pair
        assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
        // Bad lengths
        assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
        assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
        assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
        // Not a minimal UTF-8 sequence
        assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
        assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
        assert_eq!(decode_utf8(&[
            TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',
        ]), None);
    }

    #[test]
    fn reject_invalid_last() {
        // Invalid start byte
        assert_eq!(decode_last_utf8(&[0xFF]), None);
        // Surrogate pair
        assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
        // Bad lengths
        assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
        assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
        assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
        // Not a minimal UTF-8 sequence
        assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
        assert_eq!(decode_last_utf8(&[
            TAG_THREE, TAG_CONT, TAG_CONT | b'a',
        ]), None);
        assert_eq!(decode_last_utf8(&[
            TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',
        ]), None);
    }
}