1 /// Normalize the given character name in place according to UAX44-LM2.
2 ///
3 /// See: http://unicode.org/reports/tr44/#UAX44-LM2
character_name_normalize(string: &mut String)4 pub fn character_name_normalize(string: &mut String) {
5 let bytes = unsafe {
6 // SAFETY: `character_name_normalize_bytes` guarantees that
7 // `bytes[..len]` is valid UTF-8.
8 string.as_mut_vec()
9 };
10 let len = character_name_normalize_bytes(bytes).len();
11 bytes.truncate(len);
12 }
13
/// Normalize the given character name in place according to UAX44-LM2.
///
/// The normalized name is written over the front of `slice` and returned as
/// a prefix of it; bytes past the returned prefix are leftovers and carry no
/// meaning. The following transformations are applied:
///
/// * spaces and underscores are removed,
/// * ASCII uppercase letters are folded to lowercase,
/// * non-ASCII bytes are removed,
/// * a hyphen is removed unless it is the first byte of the input, is
///   immediately preceded by a space, or is immediately followed by a space.
///   The single exception is the hyphen in `HANGUL JUNGSEONG O-E` (U+1180),
///   which is always kept.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`, because only ASCII bytes are ever written to it.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM2
fn character_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // According to Unicode 4.8, character names consist only of Latin
    // capital letters A to Z, ASCII digits, ASCII space or ASCII hyphen.
    // Therefore, we can do very simplistic case folding and operate on the
    // raw bytes, since everything is ASCII. Note that we don't actually know
    // whether `slice` is all ASCII or not, so we drop all non-ASCII bytes.
    //
    // Invariant: `next_write <= i` on every iteration, so bytes at indices
    // greater than `i` are still the untouched input and `slice[..next_write]`
    // is the normalized output produced so far.
    let mut next_write = 0;
    let mut prev_space = true;
    for i in 0..slice.len() {
        // SAFETY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        let b = slice[i];
        match b {
            b' ' => {
                // Drop the space, but remember it: a hyphen directly after a
                // space is not medial.
                prev_space = true;
                continue;
            }
            b'_' => {
                // Drop the underscore.
            }
            b'-' => {
                // We want to keep the hyphen only if it isn't medial, which
                // means it has at least one adjacent space character.
                let mut keep_hyphen =
                    prev_space || slice.get(i + 1) == Some(&b' ');
                // However, there is one exception. We need to keep the hyphen
                // in the character (U+1180) named `HANGUL JUNGSEONG O-E`.
                // Trailing spaces and underscores after the final `E` are
                // ignorable under LM2, so they must not defeat the exception;
                // otherwise the name would collide with `HANGUL JUNGSEONG OE`.
                if !keep_hyphen && is_final_e(&slice[i + 1..]) {
                    keep_hyphen =
                        slice[..next_write] == b"hanguljungseongo"[..];
                }
                if keep_hyphen {
                    slice[next_write] = b;
                    next_write += 1;
                }
            }
            b'A'..=b'Z' => {
                slice[next_write] = b + (b'a' - b'A');
                next_write += 1;
            }
            _ if b <= 0x7F => {
                slice[next_write] = b;
                next_write += 1;
            }
            _ => {
                // Non-ASCII byte: drop it.
            }
        }
        prev_space = false;
    }
    &mut slice[..next_write]
}

/// Returns true if `rest` is a single `E` or `e` followed by nothing other
/// than bytes that LM2 ignores (spaces and underscores).
fn is_final_e(rest: &[u8]) -> bool {
    match rest.split_first() {
        Some((&first, tail)) => {
            (first == b'E' || first == b'e')
                && tail.iter().all(|&t| t == b' ' || t == b'_')
        }
        None => false,
    }
}
64
65 /// Normalize the given symbolic name in place according to UAX44-LM3.
66 ///
67 /// A "symbolic name" typically corresponds to property names and property
68 /// value aliases. Note, though, that it should not be applied to property
69 /// string values.
70 ///
71 /// See: http://unicode.org/reports/tr44/#UAX44-LM2
symbolic_name_normalize(string: &mut String)72 pub fn symbolic_name_normalize(string: &mut String) {
73 let bytes = unsafe {
74 // SAFETY: `symbolic_name_normalize_bytes` guarantees that
75 // `bytes[..len]` is valid UTF-8.
76 string.as_mut_vec()
77 };
78 let len = symbolic_name_normalize_bytes(bytes).len();
79 bytes.truncate(len);
80 }
81
/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases. Note, though, that it should not be applied to property
/// string values.
///
/// A leading `is` (in any ASCII case) is skipped, then spaces, underscores,
/// hyphens and non-ASCII bytes are removed and ASCII uppercase letters are
/// folded to lowercase. The result is written over the front of `slice` and
/// returned as a prefix of it.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`, because only ASCII bytes are ever written to it.
///
/// See: http://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Ignore any "is" prefix, whatever its case.
    let starts_with_is = match slice.get(..2) {
        Some(prefix) => prefix.eq_ignore_ascii_case(b"is"),
        None => false,
    };
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for i in start..slice.len() {
        // SAFETY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        let folded = match slice[i] {
            b' ' | b'_' | b'-' => continue,
            upper @ b'A'..=b'Z' => upper + (b'a' - b'A'),
            ascii if ascii <= 0x7F => ascii,
            _ => continue,
        };
        slice[next_write] = folded;
        next_write += 1;
    }

    // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally
    // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross
    // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it
    // is actually an alias for the 'Other' general category.
    //
    // The prefix occupied two bytes and one byte was emitted, so the slice is
    // at least three bytes long here.
    if starts_with_is && next_write == 1 && slice[0] == b'c' {
        slice[..3].copy_from_slice(b"isc");
        next_write = 3;
    }
    &mut slice[..next_write]
}
137
#[cfg(test)]
mod tests {
    use super::{
        character_name_normalize, character_name_normalize_bytes,
        symbolic_name_normalize, symbolic_name_normalize_bytes,
    };

    /// Runs `character_name_normalize` on an owned copy of `s`.
    fn char_norm(s: &str) -> String {
        let mut owned = String::from(s);
        character_name_normalize(&mut owned);
        owned
    }

    /// Runs `symbolic_name_normalize` on an owned copy of `s`.
    fn sym_norm(s: &str) -> String {
        let mut owned = String::from(s);
        symbolic_name_normalize(&mut owned);
        owned
    }

    #[test]
    fn char_normalize() {
        // (input, expected normalized form)
        let cases = [
            ("HANGUL JUNGSEONG O-E", "hanguljungseongo-e"),
            ("zero-width space", "zerowidthspace"),
            ("zerowidthspace", "zerowidthspace"),
            ("ZERO WIDTH SPACE", "zerowidthspace"),
            ("TIBETAN MARK TSA -PHRU", "tibetanmarktsa-phru"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(char_norm(input), expected);
        }
    }

    #[test]
    fn sym_normalize() {
        // (input, expected normalized form)
        let cases = [
            ("Line_Break", "linebreak"),
            ("Line-break", "linebreak"),
            ("linebreak", "linebreak"),
            ("BA", "ba"),
            ("ba", "ba"),
            ("Greek", "greek"),
            ("isGreek", "greek"),
            ("IS_Greek", "greek"),
            ("isc", "isc"),
            ("is c", "isc"),
            ("is_c", "isc"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(sym_norm(input), expected);
        }
    }

    #[test]
    fn valid_utf8_character() {
        // A stray non-ASCII byte must be dropped from the output.
        let mut raw = b"abc\xFFxyz".to_vec();
        let normalized = character_name_normalize_bytes(&mut raw);
        assert_eq!(normalized, b"abcxyz");
    }

    #[test]
    fn valid_utf8_symbolic() {
        // A stray non-ASCII byte must be dropped from the output.
        let mut raw = b"abc\xFFxyz".to_vec();
        let normalized = symbolic_name_normalize_bytes(&mut raw);
        assert_eq!(normalized, b"abcxyz");
    }
}
195