1 use std::cmp::Ordering;
2 use std::result;
3 
4 use ucd_util::{self, PropertyValues};
5 
6 use hir;
7 use unicode_tables::age;
8 use unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE;
9 use unicode_tables::general_category;
10 use unicode_tables::grapheme_cluster_break;
11 use unicode_tables::property_bool;
12 use unicode_tables::property_names::PROPERTY_NAMES;
13 use unicode_tables::property_values::PROPERTY_VALUES;
14 use unicode_tables::script;
15 use unicode_tables::script_extension;
16 use unicode_tables::sentence_break;
17 use unicode_tables::word_break;
18 
/// Shorthand for a `std::result::Result` whose error type is this module's
/// `Error`.
type Result<T> = result::Result<T, Error>;
20 
/// The ways in which a Unicode class lookup can fail.
///
/// This type is internal: callers convert it into one of the crate's public
/// error types, which is why it does not implement the `Error` trait.
#[derive(Debug)]
pub enum Error {
    /// The requested property name could not be resolved.
    PropertyNotFound,
    /// The property was recognized, but the requested value for it could
    /// not be resolved.
    PropertyValueNotFound,
}
30 
/// Iterates over the members of one codepoint's simple case folding
/// equivalence class.
///
/// The iterator wraps a borrowed `'static` slice of characters and yields
/// each character by value, in slice order.
#[derive(Debug)]
pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);

impl Iterator for SimpleFoldIter {
    type Item = char;

    fn next(&mut self) -> Option<char> {
        // The underlying slice iterator hands out `&char`; copy it out.
        match self.0.next() {
            Some(&ch) => Some(ch),
            None => None,
        }
    }
}
42 
43 /// Return an iterator over the equivalence class of simple case mappings
44 /// for the given codepoint. The equivalence class does not include the
45 /// given codepoint.
46 ///
47 /// If the equivalence class is empty, then this returns the next scalar
48 /// value that has a non-empty equivalence class, if it exists. If no such
49 /// scalar value exists, then `None` is returned. The point of this behavior
50 /// is to permit callers to avoid calling `simple_fold` more than they need
51 /// to, since there is some cost to fetching the equivalence class.
simple_fold(c: char) -> result::Result<SimpleFoldIter, Option<char>>52 pub fn simple_fold(c: char) -> result::Result<SimpleFoldIter, Option<char>> {
53     CASE_FOLDING_SIMPLE
54         .binary_search_by_key(&c, |&(c1, _)| c1)
55         .map(|i| SimpleFoldIter(CASE_FOLDING_SIMPLE[i].1.iter()))
56         .map_err(|i| {
57             if i >= CASE_FOLDING_SIMPLE.len() {
58                 None
59             } else {
60                 Some(CASE_FOLDING_SIMPLE[i].0)
61             }
62         })
63 }
64 
65 /// Returns true if and only if the given (inclusive) range contains at least
66 /// one Unicode scalar value that has a non-empty non-trivial simple case
67 /// mapping.
68 ///
69 /// This function panics if `end < start`.
contains_simple_case_mapping(start: char, end: char) -> bool70 pub fn contains_simple_case_mapping(start: char, end: char) -> bool {
71     assert!(start <= end);
72     CASE_FOLDING_SIMPLE
73         .binary_search_by(|&(c, _)| {
74             if start <= c && c <= end {
75                 Ordering::Equal
76             } else if c > end {
77                 Ordering::Greater
78             } else {
79                 Ordering::Less
80             }
81         }).is_ok()
82 }
83 
/// Describes which Unicode-defined character class a caller wants.
///
/// A class can be requested either by a bare name or by a
/// `property=value` pair. Bare names normally denote binary properties
/// (UTS#44, Table 8), but following UTS#18, Section 1.2, general category
/// values (an enumeration) and script values (a catalog) may also be given
/// as bare names, as though each value were its own binary property.
///
/// Names and values are always normalized and canonicalized before lookup,
/// so `GC`, `gc`, `GeneralCategory` and `general_category` are all the same
/// property.
///
/// The lifetime `'a` is the shorter of the lifetimes of the property name
/// and the property value.
#[derive(Debug)]
pub enum ClassQuery<'a> {
    /// A class named by a single letter. It is resolved exactly like
    /// `Binary`, using the one-character string as the name.
    OneLetter(char),
    /// A class named by a bare property name.
    ///
    /// By special exception (UTS#18, Section 1.2), general category values
    /// and script values are accepted here as well.
    Binary(&'a str),
    /// The class of all codepoints whose property `property_name` has the
    /// value `property_value`.
    ByValue {
        /// The name of the property.
        property_name: &'a str,
        /// The value of the property.
        property_value: &'a str,
    },
}
117 
118 impl<'a> ClassQuery<'a> {
canonicalize(&self) -> Result<CanonicalClassQuery>119     fn canonicalize(&self) -> Result<CanonicalClassQuery> {
120         match *self {
121             ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()),
122             ClassQuery::Binary(name) => self.canonical_binary(name),
123             ClassQuery::ByValue { property_name, property_value } => {
124                 let property_name = normalize(property_name);
125                 let property_value = normalize(property_value);
126 
127                 let canon_name = match canonical_prop(&property_name) {
128                     None => return Err(Error::PropertyNotFound),
129                     Some(canon_name) => canon_name,
130                 };
131                 Ok(match canon_name {
132                     "General_Category" => {
133                         let canon = match canonical_gencat(&property_value) {
134                             None => return Err(Error::PropertyValueNotFound),
135                             Some(canon) => canon,
136                         };
137                         CanonicalClassQuery::GeneralCategory(canon)
138                     }
139                     "Script" => {
140                         let canon = match canonical_script(&property_value) {
141                             None => return Err(Error::PropertyValueNotFound),
142                             Some(canon) => canon,
143                         };
144                         CanonicalClassQuery::Script(canon)
145                     }
146                     _ => {
147                         let vals = match property_values(canon_name) {
148                             None => return Err(Error::PropertyValueNotFound),
149                             Some(vals) => vals,
150                         };
151                         let canon_val = match canonical_value(
152                             vals,
153                             &property_value,
154                         ) {
155                             None => return Err(Error::PropertyValueNotFound),
156                             Some(canon_val) => canon_val,
157                         };
158                         CanonicalClassQuery::ByValue {
159                             property_name: canon_name,
160                             property_value: canon_val,
161                         }
162                     }
163                 })
164             }
165         }
166     }
167 
canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery>168     fn canonical_binary(&self, name: &str) -> Result<CanonicalClassQuery> {
169         let norm = normalize(name);
170 
171         if let Some(canon) = canonical_prop(&norm) {
172             return Ok(CanonicalClassQuery::Binary(canon));
173         }
174         if let Some(canon) = canonical_gencat(&norm) {
175             return Ok(CanonicalClassQuery::GeneralCategory(canon));
176         }
177         if let Some(canon) = canonical_script(&norm) {
178             return Ok(CanonicalClassQuery::Script(canon));
179         }
180         Err(Error::PropertyNotFound)
181     }
182 }
183 
/// The canonicalized counterpart of `ClassQuery`.
///
/// Every name and value stored here is in canonical form. Unlike
/// `ClassQuery`, this type keeps true binary properties apart from the
/// general categories and scripts that may be written as bare names.
#[derive(Debug, Eq, PartialEq)]
enum CanonicalClassQuery {
    /// A binary property, by canonical name.
    Binary(&'static str),
    /// A general category, by canonical name.
    GeneralCategory(&'static str),
    /// A script, by canonical name.
    Script(&'static str),
    /// A canonical property name paired with a canonical value of that
    /// property.
    ///
    /// By construction the property name is never `General_Category` or
    /// `Script`; those are represented by the dedicated variants above.
    ByValue {
        /// The canonical name of the property.
        property_name: &'static str,
        /// The canonical value of the property.
        property_value: &'static str,
    },
}
208 
209 /// Looks up a Unicode class given a query. If one doesn't exist, then
210 /// `None` is returned.
class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode>211 pub fn class<'a>(query: ClassQuery<'a>) -> Result<hir::ClassUnicode> {
212     use self::CanonicalClassQuery::*;
213 
214     match query.canonicalize()? {
215         Binary(name) => {
216             property_set(property_bool::BY_NAME, name)
217                 .map(hir_class)
218                 .ok_or(Error::PropertyNotFound)
219         }
220         GeneralCategory("Any") => {
221             Ok(hir_class(&[('\0', '\u{10FFFF}')]))
222         }
223         GeneralCategory("Assigned") => {
224             let mut cls =
225                 property_set(general_category::BY_NAME, "Unassigned")
226                     .map(hir_class)
227                     .ok_or(Error::PropertyNotFound)?;
228             cls.negate();
229             Ok(cls)
230         }
231         GeneralCategory("ASCII") => {
232             Ok(hir_class(&[('\0', '\x7F')]))
233         }
234         GeneralCategory(name) => {
235             property_set(general_category::BY_NAME, name)
236                 .map(hir_class)
237                 .ok_or(Error::PropertyValueNotFound)
238         }
239         Script(name) => {
240             property_set(script::BY_NAME, name)
241                 .map(hir_class)
242                 .ok_or(Error::PropertyValueNotFound)
243         }
244         ByValue { property_name: "Age", property_value } => {
245             let mut class = hir::ClassUnicode::empty();
246             for set in ages(property_value)? {
247                 class.union(&hir_class(set));
248             }
249             Ok(class)
250         }
251         ByValue { property_name: "Script_Extensions", property_value } => {
252             property_set(script_extension::BY_NAME, property_value)
253                 .map(hir_class)
254                 .ok_or(Error::PropertyValueNotFound)
255         }
256         ByValue { property_name: "Grapheme_Cluster_Break", property_value } => {
257             property_set(grapheme_cluster_break::BY_NAME, property_value)
258                 .map(hir_class)
259                 .ok_or(Error::PropertyValueNotFound)
260         }
261         ByValue { property_name: "Sentence_Break", property_value } => {
262             property_set(sentence_break::BY_NAME, property_value)
263                 .map(hir_class)
264                 .ok_or(Error::PropertyValueNotFound)
265         }
266         ByValue { property_name: "Word_Break", property_value } => {
267             property_set(word_break::BY_NAME, property_value)
268                 .map(hir_class)
269                 .ok_or(Error::PropertyValueNotFound)
270         }
271         _ => {
272             // What else should we support?
273             Err(Error::PropertyNotFound)
274         }
275     }
276 }
277 
278 /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges.
hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode279 pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode {
280     let hir_ranges: Vec<hir::ClassUnicodeRange> = ranges
281         .iter()
282         .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
283         .collect();
284     hir::ClassUnicode::new(hir_ranges)
285 }
286 
canonical_prop(normalized_name: &str) -> Option<&'static str>287 fn canonical_prop(normalized_name: &str) -> Option<&'static str> {
288     ucd_util::canonical_property_name(PROPERTY_NAMES, normalized_name)
289 }
290 
canonical_gencat(normalized_value: &str) -> Option<&'static str>291 fn canonical_gencat(normalized_value: &str) -> Option<&'static str> {
292     match normalized_value {
293         "any" => Some("Any"),
294         "assigned" => Some("Assigned"),
295         "ascii" => Some("ASCII"),
296         _ => {
297             let gencats = property_values("General_Category").unwrap();
298             canonical_value(gencats, normalized_value)
299         }
300     }
301 }
302 
canonical_script(normalized_value: &str) -> Option<&'static str>303 fn canonical_script(normalized_value: &str) -> Option<&'static str> {
304     let scripts = property_values("Script").unwrap();
305     canonical_value(scripts, normalized_value)
306 }
307 
canonical_value( vals: PropertyValues, normalized_value: &str, ) -> Option<&'static str>308 fn canonical_value(
309     vals: PropertyValues,
310     normalized_value: &str,
311 ) -> Option<&'static str> {
312     ucd_util::canonical_property_value(vals, normalized_value)
313 }
314 
normalize(x: &str) -> String315 fn normalize(x: &str) -> String {
316     let mut x = x.to_string();
317     ucd_util::symbolic_name_normalize(&mut x);
318     x
319 }
320 
property_values( canonical_property_name: &'static str, ) -> Option<PropertyValues>321 fn property_values(
322     canonical_property_name: &'static str,
323 ) -> Option<PropertyValues>
324 {
325     ucd_util::property_values(PROPERTY_VALUES, canonical_property_name)
326 }
327 
/// Finds the codepoint ranges stored under `canonical` in `name_map`.
///
/// `name_map` must be sorted by name, since the lookup is a binary search.
/// Returns `None` when no entry has exactly that name.
fn property_set(
    name_map: &'static [(&'static str, &'static [(char, char)])],
    canonical: &'static str,
) -> Option<&'static [(char, char)]> {
    match name_map.binary_search_by_key(&canonical, |&(name, _)| name) {
        Ok(found) => Some(name_map[found].1),
        Err(_) => None,
    }
}
337 
/// Iterates over Unicode Age codepoint sets.
///
/// Each yielded item is the set of codepoints that were added in one
/// particular revision of Unicode. Items come out in chronological order.
#[derive(Debug)]
struct AgeIter {
    /// The `(version name, codepoint ranges)` entries not yet yielded.
    ages: &'static [(&'static str, &'static [(char, char)])],
}
345 
ages(canonical_age: &str) -> Result<AgeIter>346 fn ages(canonical_age: &str) -> Result<AgeIter> {
347     const AGES: &'static [(&'static str, &'static [(char, char)])] = &[
348         ("V1_1", age::V1_1),
349         ("V2_0", age::V2_0),
350         ("V2_1", age::V2_1),
351         ("V3_0", age::V3_0),
352         ("V3_1", age::V3_1),
353         ("V3_2", age::V3_2),
354         ("V4_0", age::V4_0),
355         ("V4_1", age::V4_1),
356         ("V5_0", age::V5_0),
357         ("V5_1", age::V5_1),
358         ("V5_2", age::V5_2),
359         ("V6_0", age::V6_0),
360         ("V6_1", age::V6_1),
361         ("V6_2", age::V6_2),
362         ("V6_3", age::V6_3),
363         ("V7_0", age::V7_0),
364         ("V8_0", age::V8_0),
365         ("V9_0", age::V9_0),
366         ("V10_0", age::V10_0),
367         ("V11_0", age::V11_0),
368         ("V12_0", age::V12_0),
369         ("V12_1", age::V12_1),
370     ];
371     assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync");
372 
373     let pos = AGES.iter().position(|&(age, _)| canonical_age == age);
374     match pos {
375         None => Err(Error::PropertyValueNotFound),
376         Some(i) => Ok(AgeIter { ages: &AGES[..i+1] }),
377     }
378 }
379 
380 impl Iterator for AgeIter {
381     type Item = &'static [(char, char)];
382 
next(&mut self) -> Option<&'static [(char, char)]>383     fn next(&mut self) -> Option<&'static [(char, char)]> {
384         if self.ages.is_empty() {
385             None
386         } else {
387             let set = self.ages[0];
388             self.ages = &self.ages[1..];
389             Some(set.1)
390         }
391     }
392 }
393 
#[cfg(test)]
mod tests {
    use super::{contains_simple_case_mapping, simple_fold};

    /// U+212A KELVIN SIGN. It looks like an ASCII `K` but is a distinct
    /// scalar value, so it is written as an escape to keep it from being
    /// confused with (or normalized into) U+004B.
    const KELVIN_SIGN: char = '\u{212A}';

    /// `k`, `K` and the Kelvin sign form one equivalence class; each lookup
    /// must yield the other two members.
    #[test]
    fn simple_fold_k() {
        let xs: Vec<char> = simple_fold('k').unwrap().collect();
        assert_eq!(xs, vec!['K', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold('K').unwrap().collect();
        assert_eq!(xs, vec!['k', KELVIN_SIGN]);

        let xs: Vec<char> = simple_fold(KELVIN_SIGN).unwrap().collect();
        assert_eq!(xs, vec!['K', 'k']);
    }

    /// A plain two-member class.
    #[test]
    fn simple_fold_a() {
        let xs: Vec<char> = simple_fold('a').unwrap().collect();
        assert_eq!(xs, vec!['A']);

        let xs: Vec<char> = simple_fold('A').unwrap().collect();
        assert_eq!(xs, vec!['a']);
    }

    /// Codepoints without a mapping report the next codepoint that has one.
    #[test]
    fn simple_fold_empty() {
        assert_eq!(Some('A'), simple_fold('?').unwrap_err());
        assert_eq!(Some('A'), simple_fold('@').unwrap_err());
        assert_eq!(Some('a'), simple_fold('[').unwrap_err());
        assert_eq!(Some('Ⰰ'), simple_fold('☃').unwrap_err());
    }

    /// Past the last table entry there is no "next" codepoint to report.
    #[test]
    fn simple_fold_max() {
        assert_eq!(None, simple_fold('\u{10FFFE}').unwrap_err());
        assert_eq!(None, simple_fold('\u{10FFFF}').unwrap_err());
    }

    #[test]
    fn range_contains() {
        assert!(contains_simple_case_mapping('A', 'A'));
        assert!(contains_simple_case_mapping('Z', 'Z'));
        assert!(contains_simple_case_mapping('A', 'Z'));
        assert!(contains_simple_case_mapping('@', 'A'));
        assert!(contains_simple_case_mapping('Z', '['));
        assert!(contains_simple_case_mapping('☃', 'Ⰰ'));

        assert!(!contains_simple_case_mapping('[', '['));
        assert!(!contains_simple_case_mapping('[', '`'));

        assert!(!contains_simple_case_mapping('☃', '☃'));
    }

    /// The one-letter query `C` must resolve to the `Other` general
    /// category.
    #[test]
    fn regression_466() {
        use super::{CanonicalClassQuery, ClassQuery};

        let q = ClassQuery::OneLetter('C');
        assert_eq!(
            q.canonicalize().unwrap(),
            CanonicalClassQuery::GeneralCategory("Other"));
    }
}
458