1 /// Normalize the given character name in place according to UAX44-LM2.
2 ///
3 /// See: http://unicode.org/reports/tr44/#UAX44-LM2
character_name_normalize(string: &mut String)4 pub fn character_name_normalize(string: &mut String) {
5     let bytes = unsafe {
6         // SAFETY: `character_name_normalize_bytes` guarantees that
7         // `bytes[..len]` is valid UTF-8.
8         string.as_mut_vec()
9     };
10     let len = character_name_normalize_bytes(bytes).len();
11     bytes.truncate(len);
12 }
13 
/// Byte-level implementation of UAX44-LM2 character name normalization.
///
/// The normalized name is written over the front of `slice` and that prefix
/// is returned. Normalization lowercases ASCII letters and drops spaces,
/// underscores, medial hyphens and every non-ASCII byte.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`, because only ASCII bytes are ever written to it.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM2
fn character_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // According to section 4.8 of the Unicode Standard, character names
    // consist only of Latin capital letters A to Z, ASCII digits, ASCII space
    // and ASCII hyphen. Case folding can therefore be done on raw bytes. The
    // input is not known to be ASCII, though, so anything outside the ASCII
    // range is discarded.
    //
    // `kept` is the length of the normalized prefix written so far. It never
    // exceeds the read position, so look-ahead reads always see original,
    // not-yet-overwritten input.
    let mut kept = 0;
    // Whether the byte immediately before the current one (in the *input*)
    // was an ASCII letter. Needed to recognize medial hyphens.
    let mut after_letter = false;
    for pos in 0..slice.len() {
        let byte = slice[pos];
        // VALIDITY ARGUMENT: every arm below yields either nothing or an
        // ASCII byte, so the prefix `slice[..kept]` is always valid UTF-8.
        let emit = match byte {
            // Spaces and underscores are always ignored.
            b' ' | b'_' => None,
            b'-' => {
                let following = slice.get(pos + 1);
                // A hyphen is medial when it sits directly between two
                // letters of the input.
                let medial = after_letter
                    && following.map_or(false, |next| next.is_ascii_alphabetic());
                let keep = if medial {
                    // Medial hyphens are dropped, with one exception: the
                    // hyphen in U+1180 `HANGUL JUNGSEONG O-E` must survive.
                    // That is the case when the hyphen is followed by an
                    // `E`, nothing but spaces/underscores comes after that
                    // `E`, and everything normalized so far spells the rest
                    // of the name.
                    let e_follows = following
                        .map_or(false, |next| next.eq_ignore_ascii_case(&b'e'));
                    let only_separators_left =
                        slice.get(pos + 2..).map_or(true, |tail| {
                            tail.iter().all(|&t| t == b' ' || t == b'_')
                        });
                    e_follows
                        && only_separators_left
                        && slice[..kept] == b"hanguljungseongo"[..]
                } else {
                    // Non-medial hyphens are significant and always kept.
                    true
                };
                if keep {
                    Some(byte)
                } else {
                    None
                }
            }
            // Any other ASCII byte is kept, with capital letters folded to
            // lowercase (`to_ascii_lowercase` leaves everything else as is).
            _ if byte.is_ascii() => Some(byte.to_ascii_lowercase()),
            // Non-ASCII bytes are dropped.
            _ => None,
        };
        if let Some(out) = emit {
            slice[kept] = out;
            kept += 1;
        }
        after_letter = byte.is_ascii_alphabetic();
    }
    &mut slice[..kept]
}
72 
73 /// Normalize the given symbolic name in place according to UAX44-LM3.
74 ///
75 /// A "symbolic name" typically corresponds to property names and property
76 /// value aliases. Note, though, that it should not be applied to property
77 /// string values.
78 ///
79 /// See: http://unicode.org/reports/tr44/#UAX44-LM2
symbolic_name_normalize(string: &mut String)80 pub fn symbolic_name_normalize(string: &mut String) {
81     let bytes = unsafe {
82         // SAFETY: `symbolic_name_normalize_bytes` guarantees that
83         // `bytes[..len]` is valid UTF-8.
84         string.as_mut_vec()
85     };
86     let len = symbolic_name_normalize_bytes(bytes).len();
87     bytes.truncate(len);
88 }
89 
/// Byte-level implementation of UAX44-LM3 symbolic name normalization.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases. Note, though, that it should not be applied to property
/// string values.
///
/// The normalized name is written over the front of `slice` and that prefix
/// is returned. Normalization strips a leading `is` (in any case), lowercases
/// ASCII letters and drops spaces, underscores, hyphens and every non-ASCII
/// byte.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`, because only ASCII bytes are ever written to it.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // The standard does not appear to prescribe a structure for property
    // names/aliases (unlike character names), so the assumption here is that
    // they are ASCII only; anything that isn't ASCII is dropped.

    // A leading "is", in any combination of case, is ignored.
    let has_is_prefix = slice
        .get(..2)
        .map_or(false, |head| head.eq_ignore_ascii_case(b"is"));
    let first = if has_is_prefix { 2 } else { 0 };

    // `kept` is the length of the normalized prefix written so far; it never
    // runs ahead of the read position.
    let mut kept = 0;
    for pos in first..slice.len() {
        let byte = slice[pos];
        // VALIDITY ARGUMENT: only ASCII bytes are ever stored, so the prefix
        // `slice[..kept]` is always valid UTF-8.
        match byte {
            // Separators are ignored.
            b' ' | b'_' | b'-' => {}
            // Remaining ASCII is kept, with capitals folded to lowercase
            // (`to_ascii_lowercase` leaves everything else untouched).
            _ if byte.is_ascii() => {
                slice[kept] = byte.to_ascii_lowercase();
                kept += 1;
            }
            // Non-ASCII bytes are dropped.
            _ => {}
        }
    }

    // Special case: ISO_Comment has a 'isc' abbreviation. Since 'is' prefixes
    // are generally ignored, the 'isc' abbreviation gets caught in the cross
    // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
    // is actually an alias for the 'Other' general category. So when the
    // stripped result is exactly "c", the prefix is put back.
    //
    // The three-byte write is in bounds: a stripped prefix accounts for two
    // input bytes and the kept `c` for at least one more.
    if has_is_prefix && kept == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        kept = 3;
    }
    &mut slice[..kept]
}
145 
#[cfg(test)]
mod tests {
    use super::{
        character_name_normalize, character_name_normalize_bytes,
        symbolic_name_normalize, symbolic_name_normalize_bytes,
    };

    /// Runs `character_name_normalize` on an owned copy of `s`.
    fn char_norm(s: &str) -> String {
        let mut owned = String::from(s);
        character_name_normalize(&mut owned);
        owned
    }

    /// Runs `symbolic_name_normalize` on an owned copy of `s`.
    fn sym_norm(s: &str) -> String {
        let mut owned = String::from(s);
        symbolic_name_normalize(&mut owned);
        owned
    }

    #[test]
    fn char_normalize() {
        // (input, expected normalized form)
        let cases = [
            ("HANGUL JUNGSEONG O-E", "hanguljungseongo-e"),
            ("HANGUL JUNGSEONG O-E _", "hanguljungseongo-e"),
            ("zero-width space", "zerowidthspace"),
            ("zerowidthspace", "zerowidthspace"),
            ("ZERO WIDTH SPACE", "zerowidthspace"),
            ("TIBETAN MARK TSA -PHRU", "tibetanmarktsa-phru"),
            ("tibetan_letter_-a", "tibetanletter-a"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(char_norm(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn sym_normalize() {
        // (input, expected normalized form)
        let cases = [
            ("Line_Break", "linebreak"),
            ("Line-break", "linebreak"),
            ("linebreak", "linebreak"),
            ("BA", "ba"),
            ("ba", "ba"),
            ("Greek", "greek"),
            ("isGreek", "greek"),
            ("IS_Greek", "greek"),
            ("isc", "isc"),
            ("is c", "isc"),
            ("is_c", "isc"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(sym_norm(input), expected, "input: {:?}", input);
        }
    }

    // The two tests below feed a byte that can never appear in valid UTF-8
    // and check that it is removed from the normalized output.

    #[test]
    fn valid_utf8_character() {
        let mut raw = b"abc\xFFxyz".to_vec();
        let normalized = character_name_normalize_bytes(&mut raw);
        assert_eq!(normalized, b"abcxyz");
    }

    #[test]
    fn valid_utf8_symbolic() {
        let mut raw = b"abc\xFFxyz".to_vec();
        let normalized = symbolic_name_normalize_bytes(&mut raw);
        assert_eq!(normalized, b"abcxyz");
    }
}
205