1 /// Normalize the given character name in place according to UAX44-LM2.
2 ///
3 /// See: http://unicode.org/reports/tr44/#UAX44-LM2
character_name_normalize(string: &mut String)4 pub fn character_name_normalize(string: &mut String) {
5     let bytes = unsafe {
6         // SAFETY: `character_name_normalize_bytes` guarantees that
7         // `bytes[..len]` is valid UTF-8.
8         string.as_mut_vec()
9     };
10     let len = character_name_normalize_bytes(bytes).len();
11     bytes.truncate(len);
12 }
13 
/// Normalize the given character name in place according to UAX44-LM2.
///
/// The normalized name is written to the front of `slice` and returned as a
/// sub-slice. Normalization lowercases ASCII letters, drops spaces and
/// underscores, drops every non-ASCII byte, and drops medial hyphens (with
/// the single exception of the hyphen in `HANGUL JUNGSEONG O-E`).
///
/// A hyphen is considered non-medial, and therefore kept, when it is the
/// first byte of the name or when it is directly adjacent to an ignored
/// separator. Spaces and underscores are both separators, so
/// `TIBETAN LETTER -A` and `tibetan_letter_-a` normalize to the same
/// `tibetanletter-a`.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM2
fn character_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // According to Unicode 4.8, character names consist only of Latin
    // capital letters A to Z, ASCII digits, ASCII space or ASCII hyphen.
    // Therefore, we can do very simplistic case folding and operate on the
    // raw bytes, since everything is ASCII. Note that we don't actually know
    // whether `slice` is all ASCII or not, so we drop all non-ASCII bytes.
    let mut next_write = 0;
    // True at the start of the name and whenever the byte just before the
    // current one was an ignored separator (space or underscore).
    let mut prev_separator = true;
    for i in 0..slice.len() {
        // SAFETY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        let b = slice[i];
        if b == b' ' || b == b'_' {
            // Separators are dropped, but remembered so that a hyphen that
            // immediately follows one is not mistaken for a medial hyphen.
            prev_separator = true;
            continue;
        } else if b == b'-' {
            let next_separator = slice
                .get(i + 1)
                .map_or(false, |&next| next == b' ' || next == b'_');
            let mut keep_hyphen = prev_separator || next_separator;
            // We want to keep the hyphen only if it isn't medial, which means
            // it has at least one adjacent separator. However, there is one
            // exception. We need to keep the hyphen in the character (U+1180)
            // named `HANGUL JUNGSEONG O-E`. So we check for that here: the
            // hyphen must be followed by exactly a final `E`, and everything
            // normalized so far must spell the rest of the name.
            let rest_e =
                slice[i + 1..] == b"E"[..] || slice[i + 1..] == b"e"[..];
            if !keep_hyphen && rest_e {
                keep_hyphen = slice[..next_write] == b"hanguljungseongo"[..];
            }
            if keep_hyphen {
                slice[next_write] = b;
                next_write += 1;
            }
        } else if b'A' <= b && b <= b'Z' {
            // ASCII-only case folding.
            slice[next_write] = b + (b'a' - b'A');
            next_write += 1;
        } else if b <= 0x7F {
            slice[next_write] = b;
            next_write += 1;
        }
        // Anything that reaches this point (including a dropped non-ASCII
        // byte) is not a separator.
        prev_separator = false;
    }
    &mut slice[..next_write]
}
64 
65 /// Normalize the given symbolic name in place according to UAX44-LM3.
66 ///
67 /// A "symbolic name" typically corresponds to property names and property
68 /// value aliases. Note, though, that it should not be applied to property
69 /// string values.
70 ///
71 /// See: http://unicode.org/reports/tr44/#UAX44-LM2
symbolic_name_normalize(string: &mut String)72 pub fn symbolic_name_normalize(string: &mut String) {
73     let bytes = unsafe {
74         // SAFETY: `symbolic_name_normalize_bytes` guarantees that
75         // `bytes[..len]` is valid UTF-8.
76         string.as_mut_vec()
77     };
78     let len = symbolic_name_normalize_bytes(bytes).len();
79     bytes.truncate(len);
80 }
81 
/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases. Note, though, that it should not be applied to property
/// string values.
///
/// The normalized name is compacted to the front of `slice` and returned as
/// a sub-slice: a leading "is" (in any ASCII case) is skipped, spaces,
/// underscores, hyphens and non-ASCII bytes are dropped, and ASCII capital
/// letters are lowercased.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // The standard does not seem to spell out a particular structure for
    // property names/aliases (unlike character names), so we assume ASCII
    // only and discard anything that isn't ASCII.

    // Ignore any "is" prefix, whatever its case.
    let has_is_prefix =
        slice.len() >= 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let first = if has_is_prefix { 2 } else { 0 };

    let mut written = 0;
    for read in first..slice.len() {
        // SAFETY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, only ASCII bytes are ever written back. Every non-ASCII
        // byte falls through to the final arm and is skipped.
        let folded = match slice[read] {
            b' ' | b'_' | b'-' => continue,
            upper @ b'A'..=b'Z' => upper + (b'a' - b'A'),
            ascii if ascii <= 0x7F => ascii,
            _ => continue,
        };
        slice[written] = folded;
        written += 1;
    }

    // Special case: ISO_Comment has a 'isc' abbreviation. Since 'is' prefixes
    // are generally ignored, 'isc' would otherwise collapse to 'c', which is
    // actually an alias for the 'Other' general category. Restore the full
    // abbreviation instead. The prefix plus at least one more byte were
    // consumed to get here, so the slice holds at least three bytes.
    if has_is_prefix && written == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        written = 3;
    }
    &mut slice[..written]
}
137 
#[cfg(test)]
mod tests {
    use super::{
        character_name_normalize, character_name_normalize_bytes,
        symbolic_name_normalize, symbolic_name_normalize_bytes,
    };

    /// Returns the result of `character_name_normalize` on a copy of `s`.
    fn char_norm(s: &str) -> String {
        let mut owned = String::from(s);
        character_name_normalize(&mut owned);
        owned
    }

    /// Returns the result of `symbolic_name_normalize` on a copy of `s`.
    fn sym_norm(s: &str) -> String {
        let mut owned = String::from(s);
        symbolic_name_normalize(&mut owned);
        owned
    }

    #[test]
    fn char_normalize() {
        // (input, expected normalized form)
        let cases = [
            ("HANGUL JUNGSEONG O-E", "hanguljungseongo-e"),
            ("zero-width space", "zerowidthspace"),
            ("zerowidthspace", "zerowidthspace"),
            ("ZERO WIDTH SPACE", "zerowidthspace"),
            ("TIBETAN MARK TSA -PHRU", "tibetanmarktsa-phru"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(char_norm(input), expected);
        }
    }

    #[test]
    fn sym_normalize() {
        // (input, expected normalized form)
        let cases = [
            ("Line_Break", "linebreak"),
            ("Line-break", "linebreak"),
            ("linebreak", "linebreak"),
            ("BA", "ba"),
            ("ba", "ba"),
            ("Greek", "greek"),
            ("isGreek", "greek"),
            ("IS_Greek", "greek"),
            ("isc", "isc"),
            ("is c", "isc"),
            ("is_c", "isc"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(sym_norm(input), expected);
        }
    }

    // The byte-level normalizers must drop bytes that are not ASCII so that
    // their output is always valid UTF-8.

    #[test]
    fn valid_utf8_character() {
        let mut raw = Vec::from(&b"abc\xFFxyz"[..]);
        let normalized = character_name_normalize_bytes(&mut raw);
        assert_eq!(normalized, b"abcxyz");
    }

    #[test]
    fn valid_utf8_symbolic() {
        let mut raw = Vec::from(&b"abc\xFFxyz"[..]);
        let normalized = symbolic_name_normalize_bytes(&mut raw);
        assert_eq!(normalized, b"abcxyz");
    }
}
195